From 91016bd3855375796fd1eabd501ebc12491f2655 Mon Sep 17 00:00:00 2001 From: JSDurand Date: Tue, 11 Jan 2022 09:58:43 +0800 Subject: Add the framework for character classes. Now we have the potential to recognize character classes. But the most important task for us now is to experiment with ((B)RN)GLR algorithms, so we leave the character classes at the present state for a moment. --- src/dfa.c | 692 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 692 insertions(+) create mode 100644 src/dfa.c (limited to 'src/dfa.c') diff --git a/src/dfa.c b/src/dfa.c new file mode 100644 index 0000000..e3bcdf0 --- /dev/null +++ b/src/dfa.c @@ -0,0 +1,692 @@ +#include "utf8.h" +#include "str.h" +#include +#include "list.h" +#include "dfa.h" + +#define INIT_TRIE(X, ID) do { \ + X->id = ID; \ + X->children = MYALLOC(int, MAX_CHAR_BYTES_NUM); \ + for (int macro_index = 0; macro_index < MAX_CHAR_BYTES_NUM;) \ + *(X->children+macro_index++) = DFA_STATE_UNKNOWN; \ + } while (0) + +/* The complete, uncompressed table is represented as an array of + arrays, each array of which has size MAX_CHAR_BYTES_NUM. The + compressed table is represented as run-lengths. The ROW_N is the + size of the total table. And the TABLE stores the data. It is an + array of arrays. The format of the array of a row is as follows: + + VALUE1 LEN1 VALUE2 LEN2 + + This means VALUE1 appears LEN1 times, and then VALUE2 appears LEN2 + times, and so on. Each array of a row ends when the lengths sum to + MAX_CHAR_BYTES_NUM. */ +struct compressed_table_s { + int row_n; + int **table; +}; + +enum DFA_TYPE { + DFA_TYPE_POSITIVE, + DFA_TYPE_NEGATIVE, + DFA_TYPE_BOTH, + DFA_TYPE_SPECIAL, +}; + +struct dfa_s { + enum DFA_TYPE type; + union { + compressed_table table; + special_dfa fn; + } data; +}; + +dfa * +new_dfa() +{ + dfa *dfap = MYALLOC(dfa, 1); + + dfap->type = 0; + + dfap->data.table.row_n = 0; + dfap->data.table.table = NULL; + + return dfap; +} + +void +destroy_dfa(dfa *table) +{ + /* function pointers need no destructors */ + if (table->type == DFA_TYPE_SPECIAL) return; + + for (int i = 0; i < table->data.table.row_n;) + free(*(table->data.table.table+i++)); + + free(table->data.table.table); + free(table); +} + +/* a trie structure */ + +typedef struct trie_s trie; + +/* REVIEW: Maybe we don't need ID? */ +struct trie_s { + /* It is guaranteed that the number of children is + MAX_CHAR_BYTES_NUM. */ + int *children; + int id; +}; + +static void +destroy_trie(void *t) +{ + trie *tp = (trie *)t; + free(tp->children); + free(tp); +} + +/* Print a list of Trie's. */ +UNUSED +static void +print_tries(const List * const restrict ls) +{ + NUM len = list_length(ls); + for (NUM i = 0; i < len; i++) { + trie *t = (trie *) list_nth(ls, i); + printf("Trie ID: %d:\n", t->id); + + BOOL has_children_p = 0; + + for (int j = 0; j < MAX_CHAR_BYTES_NUM; j++) { + if (*(t->children+j) != -1) { + has_children_p = 1; + printf(" " + "Char %d => State %d\n", j, *(t->children+j)); + } + } + + if (!has_children_p) printf(" " "No children!\n"); + + printf("\n"); + } +} + +void +print_dfa(const dfa * const restrict table) +{ + switch (table->type) { + case DFA_TYPE_POSITIVE: + printf("Printing a positive table:\n"); + break; + case DFA_TYPE_NEGATIVE: + printf("Printing a negative table:\n"); + break; + case DFA_TYPE_BOTH: + printf("Printing a mixed table:\n"); + break; + default: + printf("A special table cannot be printed\n"); + return; + break; + } + + printf("Printing a table:\n"); + + for (int i = 0; i < table->data.table.row_n; i++) { + printf("Row %d: ", i); + int size = 0; + for (int j = 0; size < MAX_CHAR_BYTES_NUM; j++) { + printf("%d %d", *(*(table->data.table.table+i)+(j<<1)), + *(*(table->data.table.table+i)+(j<<1)+1)); + size += *(*(table->data.table.table+i)+(j<<1)+1); + + if (size < MAX_CHAR_BYTES_NUM) printf(", "); + else printf("\n"); + } + } + printf("\n"); +} + +/* REVIEW: Maybe we don't want to generate the complete trie in the + process of generating the compressed table? */ +dfa * +dfa_from_bytes(int sequence_size, + const NUM * const restrict data) +{ + List *states = new_list(); + + trie *state_pointer = MYALLOC(trie, 1); + + INIT_TRIE(state_pointer, 0); + + if (add_to_list(states, state_pointer)) { + fleprintf0("Cannot add to list\n"); + destroy_trie(state_pointer); + map_list(states, destroy_trie); + destroy_list(states, 0); + return NULL; + } + + char *s = MYALLOC(char, 5); + + str *strp = new_str(s, 5); + + for (int i = 0; i < sequence_size; i++) { + trie current_state = *((trie *) list_nth(states, 0)); + NUM current_value = *(data+i); + + if (encode(current_value, strp)) { + fleprintf("Fail to encode %ld at i = %d\n", current_value, i); + /* cleanup */ + destroy_str(strp, 1); + map_list(states, destroy_trie); + destroy_list(states, 0); + return NULL; + } + + int str_len = (int) str_length(strp); + s = get_data(strp); + unsigned char c = 0; + for (int j = 0; j < (str_len-1); j++) { + c = (unsigned char) *(s+j); + if (*(current_state.children+c) == DFA_STATE_UNKNOWN) { + /* a new state should be created */ + state_pointer = MYALLOC(trie, 1); + INIT_TRIE(state_pointer, list_length(states)); + *(current_state.children+c) = list_length(states); + if (add_to_list(states, state_pointer)) { + /* cleanup */ + destroy_str(strp, 1); + destroy_trie(state_pointer); + map_list(states, destroy_trie); + destroy_list(states, 0); + return NULL; + } + } + + current_state = + *((trie *) list_nth(states, *(current_state.children+c))); + } + + if (str_len) { + /* the last byte leads to the accepting state */ + c = (unsigned char) *(s+str_len-1); + + /* In the trie's that we will be forming, there will be no leaf + that is also the parent of another node, by the design of + UTF-8. */ + + *(current_state.children+c) = DFA_STATE_ACCEPT; + } + + str_set_length(strp, 5); + } + + destroy_str(strp, 1); + + /* For debugging */ + + /* print_tries(states); */ + + /* construct DFA from the trie */ + + NUM states_len = list_length(states); + + dfa *dfap = new_dfa(); + + dfap->data.table.table = MYALLOC(int *, states_len); + dfap->data.table.row_n = states_len; + + for (NUM i = 0; i < states_len; i++) { + int size = 1, current = -1, last = -1, current_length = 0; + + for (int j = 0; j < MAX_CHAR_BYTES_NUM; j++) { + current = *(((trie *) list_nth(states, i))->children+j); + if (current != last) size++; + last = current; + } + + *(dfap->data.table.table+i) = MYALLOC(int, size<<1); + + current = (last = -1); + + size = 1; + + for (int j = 0; j < MAX_CHAR_BYTES_NUM; j++) { + current = *(((trie *) list_nth(states, i))->children+j); + if (current != last) { + *(*(dfap->data.table.table+i)+(size<<1)-2) = last; + *(*(dfap->data.table.table+i)+(size<<1)-1) = current_length; + + size++; + last = current; + current_length = 1; + + if (j == MAX_CHAR_BYTES_NUM-1) { + *(*(dfap->data.table.table+i)+(size<<1)-2) = last; + *(*(dfap->data.table.table+i)+(size<<1)-1) = 1; + } + } else if (j == MAX_CHAR_BYTES_NUM-1) { + *(*(dfap->data.table.table+i)+(size<<1)-2) = current; + *(*(dfap->data.table.table+i)+(size<<1)-1) = current_length+1; + } else { + current_length++; + } + } + } + + /* cleanup the trie */ + map_list(states, destroy_trie); + destroy_list(states, 0); + + return dfap; +} + +dfa * +dfa_from_bytes_neg(int sequence_size, + const NUM * const restrict data) +{ + List *states = new_list(); + + trie *state_pointer = MYALLOC(trie, 1); + + INIT_TRIE(state_pointer, 0); + + if (add_to_list(states, state_pointer)) { + fleprintf0("Cannot add to list\n"); + destroy_trie(state_pointer); + map_list(states, destroy_trie); + destroy_list(states, 0); + return NULL; + } + + char *s = MYALLOC(char, 5); + + str *strp = new_str(s, 5); + + for (int i = 0; i < sequence_size; i++) { + trie current_state = *((trie *) list_nth(states, 0)); + NUM current_value = *(data+i); + + if (encode(current_value, strp)) { + fleprintf("Fail to encode %ld at i = %d\n", current_value, i); + /* cleanup */ + destroy_str(strp, 1); + map_list(states, destroy_trie); + destroy_list(states, 0); + return NULL; + } + + int str_len = (int) str_length(strp); + s = get_data(strp); + unsigned char c = 0; + for (int j = 0; j < (str_len-1); j++) { + c = (unsigned char) *(s+j); + if (*(current_state.children+c) == DFA_STATE_UNKNOWN) { + /* a new state should be created */ + state_pointer = MYALLOC(trie, 1); + INIT_TRIE(state_pointer, list_length(states)); + *(current_state.children+c) = list_length(states); + if (add_to_list(states, state_pointer)) { + /* cleanup */ + destroy_str(strp, 1); + destroy_trie(state_pointer); + map_list(states, destroy_trie); + destroy_list(states, 0); + return NULL; + } + } + + current_state = + *((trie *) list_nth(states, *(current_state.children+c))); + } + + if (str_len) { + /* the last byte leads to the rejecting state */ + c = (unsigned char) *(s+str_len-1); + + /* In the trie's that we will be forming, there will be no leaf + that is also the parent of another node, by the design of + UTF-8. */ + + *(current_state.children+c) = DFA_STATE_REJECT; + } + + str_set_length(strp, 5); + } + + destroy_str(strp, 1); + + /* For debugging */ + + /* print_tries(states); */ + + /* construct DFA from the trie */ + + NUM states_len = list_length(states); + + dfa *dfap = new_dfa(); + + dfap->data.table.table = MYALLOC(int *, states_len); + dfap->data.table.row_n = states_len; + dfap->type = 1; + + for (NUM i = 0; i < states_len; i++) { + int size = 1, current = -1, last = -1, current_length = 0; + + for (int j = 0; j < MAX_CHAR_BYTES_NUM; j++) { + current = *(((trie *) list_nth(states, i))->children+j); + if (current != last) size++; + last = current; + } + + *(dfap->data.table.table+i) = MYALLOC(int, size<<1); + + current = (last = -1); + + size = 1; + + for (int j = 0; j < MAX_CHAR_BYTES_NUM; j++) { + current = *(((trie *) list_nth(states, i))->children+j); + if (current != last) { + *(*(dfap->data.table.table+i)+(size<<1)-2) = last; + *(*(dfap->data.table.table+i)+(size<<1)-1) = current_length; + + size++; + last = current; + current_length = 1; + + if (j == MAX_CHAR_BYTES_NUM-1) { + *(*(dfap->data.table.table+i)+(size<<1)-2) = last; + *(*(dfap->data.table.table+i)+(size<<1)-1) = 1; + } + } else if (j == MAX_CHAR_BYTES_NUM-1) { + *(*(dfap->data.table.table+i)+(size<<1)-2) = current; + *(*(dfap->data.table.table+i)+(size<<1)-1) = current_length+1; + } else { + current_length++; + } + } + } + + /* cleanup the trie */ + map_list(states, destroy_trie); + destroy_list(states, 0); + + return dfap; +} + +dfa *dfa_from_bytes_both(int sequence_size, + const NUM * const restrict data, + int neg_sequence_size, + const NUM * const restrict negdata) +{ + List *states = new_list(); + + trie *state_pointer = MYALLOC(trie, 1); + + INIT_TRIE(state_pointer, 0); + + if (add_to_list(states, state_pointer)) { + fleprintf0("Cannot add to list\n"); + destroy_trie(state_pointer); + map_list(states, destroy_trie); + destroy_list(states, 0); + return NULL; + } + + char *s = MYALLOC(char, 5); + + str *strp = new_str(s, 5); + + for (int i = 0; i < sequence_size; i++) { + trie current_state = *((trie *) list_nth(states, 0)); + NUM current_value = *(data+i); + + if (encode(current_value, strp)) { + fleprintf("Fail to encode %ld at i = %d\n", current_value, i); + /* cleanup */ + destroy_str(strp, 1); + map_list(states, destroy_trie); + destroy_list(states, 0); + return NULL; + } + + int str_len = (int) str_length(strp); + s = get_data(strp); + unsigned char c = 0; + for (int j = 0; j < (str_len-1); j++) { + c = (unsigned char) *(s+j); + if (*(current_state.children+c) == DFA_STATE_UNKNOWN) { + /* a new state should be created */ + state_pointer = MYALLOC(trie, 1); + INIT_TRIE(state_pointer, list_length(states)); + *(current_state.children+c) = list_length(states); + if (add_to_list(states, state_pointer)) { + /* cleanup */ + destroy_str(strp, 1); + destroy_trie(state_pointer); + map_list(states, destroy_trie); + destroy_list(states, 0); + return NULL; + } + } + + current_state = + *((trie *) list_nth(states, *(current_state.children+c))); + } + + if (str_len) { + /* the last byte leads to the accepting state */ + c = (unsigned char) *(s+str_len-1); + + /* In the trie's that we will be forming, there will be no leaf + that is also the parent of another node, by the design of + UTF-8. */ + + *(current_state.children+c) = DFA_STATE_ACCEPT; + } + + str_set_length(strp, 5); + } + + for (int i = 0; i < neg_sequence_size; i++) { + trie current_state = *((trie *) list_nth(states, 0)); + NUM current_value = *(negdata+i); + + if (encode(current_value, strp)) { + fleprintf("Fail to encode %ld at i = %d\n", current_value, i); + /* cleanup */ + destroy_str(strp, 1); + map_list(states, destroy_trie); + destroy_list(states, 0); + return NULL; + } + + int str_len = (int) str_length(strp); + s = get_data(strp); + unsigned char c = 0; + for (int j = 0; j < (str_len-1); j++) { + c = (unsigned char) *(s+j); + if (*(current_state.children+c) == DFA_STATE_UNKNOWN) { + /* a new state should be created */ + state_pointer = MYALLOC(trie, 1); + INIT_TRIE(state_pointer, list_length(states)); + *(current_state.children+c) = list_length(states); + if (add_to_list(states, state_pointer)) { + /* cleanup */ + fleprintf("Fail to add to list when i = %d, j = %d, " + "and current value = %ld\n", + i, j, current_value); + destroy_str(strp, 1); + destroy_trie(state_pointer); + map_list(states, destroy_trie); + destroy_list(states, 0); + return NULL; + } + } + + current_state = + *((trie *) list_nth(states, *(current_state.children+c))); + } + + if (str_len) { + /* the last byte leads to the rejecting state */ + c = (unsigned char) *(s+str_len-1); + + /* In the trie's that we will be forming, there will be no leaf + that is also the parent of another node, by the design of + UTF-8. */ + + *(current_state.children+c) = DFA_STATE_REJECT; + } + + str_set_length(strp, 5); + } + + destroy_str(strp, 1); + + /* For debugging */ + + /* print_tries(states); */ + + /* construct DFA from the trie */ + + NUM states_len = list_length(states); + + dfa *dfap = new_dfa(); + + dfap->data.table.table = MYALLOC(int *, states_len); + dfap->data.table.row_n = states_len; + + for (NUM i = 0; i < states_len; i++) { + int size = 1, current = -1, last = -1, current_length = 0; + + for (int j = 0; j < MAX_CHAR_BYTES_NUM; j++) { + current = *(((trie *) list_nth(states, i))->children+j); + if (current != last) size++; + last = current; + } + + *(dfap->data.table.table+i) = MYALLOC(int, size<<1); + + current = (last = -1); + + size = 1; + + for (int j = 0; j < MAX_CHAR_BYTES_NUM; j++) { + current = *(((trie *) list_nth(states, i))->children+j); + if (current != last) { + *(*(dfap->data.table.table+i)+(size<<1)-2) = last; + *(*(dfap->data.table.table+i)+(size<<1)-1) = current_length; + + size++; + last = current; + current_length = 1; + + if (j == MAX_CHAR_BYTES_NUM-1) { + *(*(dfap->data.table.table+i)+(size<<1)-2) = last; + *(*(dfap->data.table.table+i)+(size<<1)-1) = 1; + } + } else if (j == MAX_CHAR_BYTES_NUM-1) { + *(*(dfap->data.table.table+i)+(size<<1)-2) = current; + *(*(dfap->data.table.table+i)+(size<<1)-1) = current_length+1; + } else { + current_length++; + } + } + } + + /* cleanup the trie */ + map_list(states, destroy_trie); + destroy_list(states, 0); + + return dfap; +} + +BOOL +run_dfa(const dfa * const restrict table, const NUM code) +{ + switch (table->type) { + case DFA_TYPE_POSITIVE: + case DFA_TYPE_NEGATIVE: + case DFA_TYPE_BOTH: + break; + default: + return table->data.fn(code); + break; + } + + char *s = MYALLOC(char, 5); + str *strp = new_str(s, 5); + + if (encode(code, strp)) { + destroy_str(strp, 1); + fleprintf0("Fail to encode\n"); + return 0; + } + + int current_row = 0; + + for (NUM i = 0; i < str_length(strp); i++) { + unsigned char c = (unsigned char) *(s+i); + int lookup_value = 0; + + for (int j = 0, len = 0; len <= c;) { + len += *(*(table->data.table.table+current_row)+(j<<1)+1); + lookup_value = *(*(table->data.table.table+current_row) + +(j++<<1)); + } + + /* fleprintf("lookup_value = %d\n", lookup_value); */ + + if (lookup_value > 0) { + current_row = lookup_value; + } else { + switch (lookup_value) { + case DFA_STATE_REJECT: + destroy_str(strp, 1); + return 0; + break; + case DFA_STATE_UNKNOWN: + switch (table->type) { + case DFA_TYPE_BOTH: + case DFA_TYPE_POSITIVE: + destroy_str(strp, 1); + return 0; + break; + case DFA_TYPE_NEGATIVE: + destroy_str(strp, 1); + return 1; + break; + default: + /* This should not be possible, as we won't reach here if + the table is of the special type. But I am apparently + paranoia. */ + fleprintf0("Special table in the wrong place\n"); + destroy_str(strp, 1); + return 0; + break; + } + break; + case DFA_STATE_ACCEPT: + destroy_str(strp, 1); + return 1; + break; + default: + fleprintf("Unknown state number for char %d: %d\n", + c, lookup_value); + destroy_str(strp, 1); + return 0; + break; + } + } + } + + destroy_str(strp, 1); + + return 0; +} -- cgit v1.2.3-18-g5258