diff options
Diffstat (limited to 'src/dfa.c')
-rw-r--r-- | src/dfa.c | 692 |
1 files changed, 692 insertions, 0 deletions
diff --git a/src/dfa.c b/src/dfa.c new file mode 100644 index 0000000..e3bcdf0 --- /dev/null +++ b/src/dfa.c @@ -0,0 +1,692 @@ +#include "utf8.h" +#include "str.h" +#include <stdio.h> +#include "list.h" +#include "dfa.h" + +#define INIT_TRIE(X, ID) do { \ + X->id = ID; \ + X->children = MYALLOC(int, MAX_CHAR_BYTES_NUM); \ + for (int macro_index = 0; macro_index < MAX_CHAR_BYTES_NUM;) \ + *(X->children+macro_index++) = DFA_STATE_UNKNOWN; \ + } while (0) + +/* The complete, uncompressed table is represented as an array of + arrays, each array of which has size MAX_CHAR_BYTES_NUM. The + compressed table is represented as run-lengths. The ROW_N is the + size of the total table. And the TABLE stores the data. It is an + array of arrays. The format of the array of a row is as follows: + + VALUE1 LEN1 VALUE2 LEN2 + + This means VALUE1 appears LEN1 times, and then VALUE2 appears LEN2 + times, and so on. Each array of a row ends when the lengths sum to + MAX_CHAR_BYTES_NUM. */ +struct compressed_table_s { + int row_n; + int **table; +}; + +enum DFA_TYPE { + DFA_TYPE_POSITIVE, + DFA_TYPE_NEGATIVE, + DFA_TYPE_BOTH, + DFA_TYPE_SPECIAL, +}; + +struct dfa_s { + enum DFA_TYPE type; + union { + compressed_table table; + special_dfa fn; + } data; +}; + +dfa * +new_dfa() +{ + dfa *dfap = MYALLOC(dfa, 1); + + dfap->type = 0; + + dfap->data.table.row_n = 0; + dfap->data.table.table = NULL; + + return dfap; +} + +void +destroy_dfa(dfa *table) +{ + /* function pointers need no destructors */ + if (table->type == DFA_TYPE_SPECIAL) return; + + for (int i = 0; i < table->data.table.row_n;) + free(*(table->data.table.table+i++)); + + free(table->data.table.table); + free(table); +} + +/* a trie structure */ + +typedef struct trie_s trie; + +/* REVIEW: Maybe we don't need ID? */ +struct trie_s { + /* It is guaranteed that the number of children is + MAX_CHAR_BYTES_NUM. */ + int *children; + int id; +}; + +static void +destroy_trie(void *t) +{ + trie *tp = (trie *)t; + free(tp->children); + free(tp); +} + +/* Print a list of Trie's. */ +UNUSED +static void +print_tries(const List * const restrict ls) +{ + NUM len = list_length(ls); + for (NUM i = 0; i < len; i++) { + trie *t = (trie *) list_nth(ls, i); + printf("Trie ID: %d:\n", t->id); + + BOOL has_children_p = 0; + + for (int j = 0; j < MAX_CHAR_BYTES_NUM; j++) { + if (*(t->children+j) != -1) { + has_children_p = 1; + printf(" " + "Char %d => State %d\n", j, *(t->children+j)); + } + } + + if (!has_children_p) printf(" " "No children!\n"); + + printf("\n"); + } +} + +void +print_dfa(const dfa * const restrict table) +{ + switch (table->type) { + case DFA_TYPE_POSITIVE: + printf("Printing a positive table:\n"); + break; + case DFA_TYPE_NEGATIVE: + printf("Printing a negative table:\n"); + break; + case DFA_TYPE_BOTH: + printf("Printing a mixed table:\n"); + break; + default: + printf("A special table cannot be printed\n"); + return; + break; + } + + printf("Printing a table:\n"); + + for (int i = 0; i < table->data.table.row_n; i++) { + printf("Row %d: ", i); + int size = 0; + for (int j = 0; size < MAX_CHAR_BYTES_NUM; j++) { + printf("%d %d", *(*(table->data.table.table+i)+(j<<1)), + *(*(table->data.table.table+i)+(j<<1)+1)); + size += *(*(table->data.table.table+i)+(j<<1)+1); + + if (size < MAX_CHAR_BYTES_NUM) printf(", "); + else printf("\n"); + } + } + printf("\n"); +} + +/* REVIEW: Maybe we don't want to generate the complete trie in the + process of generating the compressed table? */ +dfa * +dfa_from_bytes(int sequence_size, + const NUM * const restrict data) +{ + List *states = new_list(); + + trie *state_pointer = MYALLOC(trie, 1); + + INIT_TRIE(state_pointer, 0); + + if (add_to_list(states, state_pointer)) { + fleprintf0("Cannot add to list\n"); + destroy_trie(state_pointer); + map_list(states, destroy_trie); + destroy_list(states, 0); + return NULL; + } + + char *s = MYALLOC(char, 5); + + str *strp = new_str(s, 5); + + for (int i = 0; i < sequence_size; i++) { + trie current_state = *((trie *) list_nth(states, 0)); + NUM current_value = *(data+i); + + if (encode(current_value, strp)) { + fleprintf("Fail to encode %ld at i = %d\n", current_value, i); + /* cleanup */ + destroy_str(strp, 1); + map_list(states, destroy_trie); + destroy_list(states, 0); + return NULL; + } + + int str_len = (int) str_length(strp); + s = get_data(strp); + unsigned char c = 0; + for (int j = 0; j < (str_len-1); j++) { + c = (unsigned char) *(s+j); + if (*(current_state.children+c) == DFA_STATE_UNKNOWN) { + /* a new state should be created */ + state_pointer = MYALLOC(trie, 1); + INIT_TRIE(state_pointer, list_length(states)); + *(current_state.children+c) = list_length(states); + if (add_to_list(states, state_pointer)) { + /* cleanup */ + destroy_str(strp, 1); + destroy_trie(state_pointer); + map_list(states, destroy_trie); + destroy_list(states, 0); + return NULL; + } + } + + current_state = + *((trie *) list_nth(states, *(current_state.children+c))); + } + + if (str_len) { + /* the last byte leads to the accepting state */ + c = (unsigned char) *(s+str_len-1); + + /* In the trie's that we will be forming, there will be no leaf + that is also the parent of another node, by the design of + UTF-8. */ + + *(current_state.children+c) = DFA_STATE_ACCEPT; + } + + str_set_length(strp, 5); + } + + destroy_str(strp, 1); + + /* For debugging */ + + /* print_tries(states); */ + + /* construct DFA from the trie */ + + NUM states_len = list_length(states); + + dfa *dfap = new_dfa(); + + dfap->data.table.table = MYALLOC(int *, states_len); + dfap->data.table.row_n = states_len; + + for (NUM i = 0; i < states_len; i++) { + int size = 1, current = -1, last = -1, current_length = 0; + + for (int j = 0; j < MAX_CHAR_BYTES_NUM; j++) { + current = *(((trie *) list_nth(states, i))->children+j); + if (current != last) size++; + last = current; + } + + *(dfap->data.table.table+i) = MYALLOC(int, size<<1); + + current = (last = -1); + + size = 1; + + for (int j = 0; j < MAX_CHAR_BYTES_NUM; j++) { + current = *(((trie *) list_nth(states, i))->children+j); + if (current != last) { + *(*(dfap->data.table.table+i)+(size<<1)-2) = last; + *(*(dfap->data.table.table+i)+(size<<1)-1) = current_length; + + size++; + last = current; + current_length = 1; + + if (j == MAX_CHAR_BYTES_NUM-1) { + *(*(dfap->data.table.table+i)+(size<<1)-2) = last; + *(*(dfap->data.table.table+i)+(size<<1)-1) = 1; + } + } else if (j == MAX_CHAR_BYTES_NUM-1) { + *(*(dfap->data.table.table+i)+(size<<1)-2) = current; + *(*(dfap->data.table.table+i)+(size<<1)-1) = current_length+1; + } else { + current_length++; + } + } + } + + /* cleanup the trie */ + map_list(states, destroy_trie); + destroy_list(states, 0); + + return dfap; +} + +dfa * +dfa_from_bytes_neg(int sequence_size, + const NUM * const restrict data) +{ + List *states = new_list(); + + trie *state_pointer = MYALLOC(trie, 1); + + INIT_TRIE(state_pointer, 0); + + if (add_to_list(states, state_pointer)) { + fleprintf0("Cannot add to list\n"); + destroy_trie(state_pointer); + map_list(states, destroy_trie); + destroy_list(states, 0); + return NULL; + } + + char *s = MYALLOC(char, 5); + + str *strp = new_str(s, 5); + + for (int i = 0; i < sequence_size; i++) { + trie current_state = *((trie *) list_nth(states, 0)); + NUM current_value = *(data+i); + + if (encode(current_value, strp)) { + fleprintf("Fail to encode %ld at i = %d\n", current_value, i); + /* cleanup */ + destroy_str(strp, 1); + map_list(states, destroy_trie); + destroy_list(states, 0); + return NULL; + } + + int str_len = (int) str_length(strp); + s = get_data(strp); + unsigned char c = 0; + for (int j = 0; j < (str_len-1); j++) { + c = (unsigned char) *(s+j); + if (*(current_state.children+c) == DFA_STATE_UNKNOWN) { + /* a new state should be created */ + state_pointer = MYALLOC(trie, 1); + INIT_TRIE(state_pointer, list_length(states)); + *(current_state.children+c) = list_length(states); + if (add_to_list(states, state_pointer)) { + /* cleanup */ + destroy_str(strp, 1); + destroy_trie(state_pointer); + map_list(states, destroy_trie); + destroy_list(states, 0); + return NULL; + } + } + + current_state = + *((trie *) list_nth(states, *(current_state.children+c))); + } + + if (str_len) { + /* the last byte leads to the rejecting state */ + c = (unsigned char) *(s+str_len-1); + + /* In the trie's that we will be forming, there will be no leaf + that is also the parent of another node, by the design of + UTF-8. */ + + *(current_state.children+c) = DFA_STATE_REJECT; + } + + str_set_length(strp, 5); + } + + destroy_str(strp, 1); + + /* For debugging */ + + /* print_tries(states); */ + + /* construct DFA from the trie */ + + NUM states_len = list_length(states); + + dfa *dfap = new_dfa(); + + dfap->data.table.table = MYALLOC(int *, states_len); + dfap->data.table.row_n = states_len; + dfap->type = 1; + + for (NUM i = 0; i < states_len; i++) { + int size = 1, current = -1, last = -1, current_length = 0; + + for (int j = 0; j < MAX_CHAR_BYTES_NUM; j++) { + current = *(((trie *) list_nth(states, i))->children+j); + if (current != last) size++; + last = current; + } + + *(dfap->data.table.table+i) = MYALLOC(int, size<<1); + + current = (last = -1); + + size = 1; + + for (int j = 0; j < MAX_CHAR_BYTES_NUM; j++) { + current = *(((trie *) list_nth(states, i))->children+j); + if (current != last) { + *(*(dfap->data.table.table+i)+(size<<1)-2) = last; + *(*(dfap->data.table.table+i)+(size<<1)-1) = current_length; + + size++; + last = current; + current_length = 1; + + if (j == MAX_CHAR_BYTES_NUM-1) { + *(*(dfap->data.table.table+i)+(size<<1)-2) = last; + *(*(dfap->data.table.table+i)+(size<<1)-1) = 1; + } + } else if (j == MAX_CHAR_BYTES_NUM-1) { + *(*(dfap->data.table.table+i)+(size<<1)-2) = current; + *(*(dfap->data.table.table+i)+(size<<1)-1) = current_length+1; + } else { + current_length++; + } + } + } + + /* cleanup the trie */ + map_list(states, destroy_trie); + destroy_list(states, 0); + + return dfap; +} + +dfa *dfa_from_bytes_both(int sequence_size, + const NUM * const restrict data, + int neg_sequence_size, + const NUM * const restrict negdata) +{ + List *states = new_list(); + + trie *state_pointer = MYALLOC(trie, 1); + + INIT_TRIE(state_pointer, 0); + + if (add_to_list(states, state_pointer)) { + fleprintf0("Cannot add to list\n"); + destroy_trie(state_pointer); + map_list(states, destroy_trie); + destroy_list(states, 0); + return NULL; + } + + char *s = MYALLOC(char, 5); + + str *strp = new_str(s, 5); + + for (int i = 0; i < sequence_size; i++) { + trie current_state = *((trie *) list_nth(states, 0)); + NUM current_value = *(data+i); + + if (encode(current_value, strp)) { + fleprintf("Fail to encode %ld at i = %d\n", current_value, i); + /* cleanup */ + destroy_str(strp, 1); + map_list(states, destroy_trie); + destroy_list(states, 0); + return NULL; + } + + int str_len = (int) str_length(strp); + s = get_data(strp); + unsigned char c = 0; + for (int j = 0; j < (str_len-1); j++) { + c = (unsigned char) *(s+j); + if (*(current_state.children+c) == DFA_STATE_UNKNOWN) { + /* a new state should be created */ + state_pointer = MYALLOC(trie, 1); + INIT_TRIE(state_pointer, list_length(states)); + *(current_state.children+c) = list_length(states); + if (add_to_list(states, state_pointer)) { + /* cleanup */ + destroy_str(strp, 1); + destroy_trie(state_pointer); + map_list(states, destroy_trie); + destroy_list(states, 0); + return NULL; + } + } + + current_state = + *((trie *) list_nth(states, *(current_state.children+c))); + } + + if (str_len) { + /* the last byte leads to the accepting state */ + c = (unsigned char) *(s+str_len-1); + + /* In the trie's that we will be forming, there will be no leaf + that is also the parent of another node, by the design of + UTF-8. */ + + *(current_state.children+c) = DFA_STATE_ACCEPT; + } + + str_set_length(strp, 5); + } + + for (int i = 0; i < neg_sequence_size; i++) { + trie current_state = *((trie *) list_nth(states, 0)); + NUM current_value = *(negdata+i); + + if (encode(current_value, strp)) { + fleprintf("Fail to encode %ld at i = %d\n", current_value, i); + /* cleanup */ + destroy_str(strp, 1); + map_list(states, destroy_trie); + destroy_list(states, 0); + return NULL; + } + + int str_len = (int) str_length(strp); + s = get_data(strp); + unsigned char c = 0; + for (int j = 0; j < (str_len-1); j++) { + c = (unsigned char) *(s+j); + if (*(current_state.children+c) == DFA_STATE_UNKNOWN) { + /* a new state should be created */ + state_pointer = MYALLOC(trie, 1); + INIT_TRIE(state_pointer, list_length(states)); + *(current_state.children+c) = list_length(states); + if (add_to_list(states, state_pointer)) { + /* cleanup */ + fleprintf("Fail to add to list when i = %d, j = %d, " + "and current value = %ld\n", + i, j, current_value); + destroy_str(strp, 1); + destroy_trie(state_pointer); + map_list(states, destroy_trie); + destroy_list(states, 0); + return NULL; + } + } + + current_state = + *((trie *) list_nth(states, *(current_state.children+c))); + } + + if (str_len) { + /* the last byte leads to the rejecting state */ + c = (unsigned char) *(s+str_len-1); + + /* In the trie's that we will be forming, there will be no leaf + that is also the parent of another node, by the design of + UTF-8. */ + + *(current_state.children+c) = DFA_STATE_REJECT; + } + + str_set_length(strp, 5); + } + + destroy_str(strp, 1); + + /* For debugging */ + + /* print_tries(states); */ + + /* construct DFA from the trie */ + + NUM states_len = list_length(states); + + dfa *dfap = new_dfa(); + + dfap->data.table.table = MYALLOC(int *, states_len); + dfap->data.table.row_n = states_len; + + for (NUM i = 0; i < states_len; i++) { + int size = 1, current = -1, last = -1, current_length = 0; + + for (int j = 0; j < MAX_CHAR_BYTES_NUM; j++) { + current = *(((trie *) list_nth(states, i))->children+j); + if (current != last) size++; + last = current; + } + + *(dfap->data.table.table+i) = MYALLOC(int, size<<1); + + current = (last = -1); + + size = 1; + + for (int j = 0; j < MAX_CHAR_BYTES_NUM; j++) { + current = *(((trie *) list_nth(states, i))->children+j); + if (current != last) { + *(*(dfap->data.table.table+i)+(size<<1)-2) = last; + *(*(dfap->data.table.table+i)+(size<<1)-1) = current_length; + + size++; + last = current; + current_length = 1; + + if (j == MAX_CHAR_BYTES_NUM-1) { + *(*(dfap->data.table.table+i)+(size<<1)-2) = last; + *(*(dfap->data.table.table+i)+(size<<1)-1) = 1; + } + } else if (j == MAX_CHAR_BYTES_NUM-1) { + *(*(dfap->data.table.table+i)+(size<<1)-2) = current; + *(*(dfap->data.table.table+i)+(size<<1)-1) = current_length+1; + } else { + current_length++; + } + } + } + + /* cleanup the trie */ + map_list(states, destroy_trie); + destroy_list(states, 0); + + return dfap; +} + +BOOL +run_dfa(const dfa * const restrict table, const NUM code) +{ + switch (table->type) { + case DFA_TYPE_POSITIVE: + case DFA_TYPE_NEGATIVE: + case DFA_TYPE_BOTH: + break; + default: + return table->data.fn(code); + break; + } + + char *s = MYALLOC(char, 5); + str *strp = new_str(s, 5); + + if (encode(code, strp)) { + destroy_str(strp, 1); + fleprintf0("Fail to encode\n"); + return 0; + } + + int current_row = 0; + + for (NUM i = 0; i < str_length(strp); i++) { + unsigned char c = (unsigned char) *(s+i); + int lookup_value = 0; + + for (int j = 0, len = 0; len <= c;) { + len += *(*(table->data.table.table+current_row)+(j<<1)+1); + lookup_value = *(*(table->data.table.table+current_row) + +(j++<<1)); + } + + /* fleprintf("lookup_value = %d\n", lookup_value); */ + + if (lookup_value > 0) { + current_row = lookup_value; + } else { + switch (lookup_value) { + case DFA_STATE_REJECT: + destroy_str(strp, 1); + return 0; + break; + case DFA_STATE_UNKNOWN: + switch (table->type) { + case DFA_TYPE_BOTH: + case DFA_TYPE_POSITIVE: + destroy_str(strp, 1); + return 0; + break; + case DFA_TYPE_NEGATIVE: + destroy_str(strp, 1); + return 1; + break; + default: + /* This should not be possible, as we won't reach here if + the table is of the special type. But I am apparently + paranoia. */ + fleprintf0("Special table in the wrong place\n"); + destroy_str(strp, 1); + return 0; + break; + } + break; + case DFA_STATE_ACCEPT: + destroy_str(strp, 1); + return 1; + break; + default: + fleprintf("Unknown state number for char %d: %d\n", + c, lookup_value); + destroy_str(strp, 1); + return 0; + break; + } + } + } + + destroy_str(strp, 1); + + return 0; +} |