#include "utf8.h" #include "str.h" #include #include "list.h" #include "dfa.h" #define INIT_TRIE(X, ID) do { \ X->id = ID; \ X->children = MYALLOC(int, MAX_CHAR_BYTES_NUM); \ for (int macro_index = 0; macro_index < MAX_CHAR_BYTES_NUM;) \ *(X->children+macro_index++) = DFA_STATE_UNKNOWN; \ } while (0) /* The complete, uncompressed table is represented as an array of arrays, each array of which has size MAX_CHAR_BYTES_NUM. The compressed table is represented as run-lengths. The ROW_N is the size of the total table. And the TABLE stores the data. It is an array of arrays. The format of the array of a row is as follows: VALUE1 LEN1 VALUE2 LEN2 This means VALUE1 appears LEN1 times, and then VALUE2 appears LEN2 times, and so on. Each array of a row ends when the lengths sum to MAX_CHAR_BYTES_NUM. */ struct compressed_table_s { int row_n; int **table; }; enum DFA_TYPE { DFA_TYPE_POSITIVE, DFA_TYPE_NEGATIVE, DFA_TYPE_BOTH, DFA_TYPE_SPECIAL, }; struct dfa_s { enum DFA_TYPE type; union { compressed_table table; special_dfa fn; } data; }; dfa * new_dfa() { dfa *dfap = MYALLOC(dfa, 1); dfap->type = 0; dfap->data.table.row_n = 0; dfap->data.table.table = NULL; return dfap; } void destroy_dfa(dfa *table) { /* function pointers need no destructors */ if (table->type == DFA_TYPE_SPECIAL) goto free_last; for (int i = 0; i < table->data.table.row_n;) free(*(table->data.table.table+i++)); free(table->data.table.table); free_last: free(table); } /* a trie structure */ typedef struct trie_s trie; /* REVIEW: Maybe we don't need ID? */ struct trie_s { /* It is guaranteed that the number of children is MAX_CHAR_BYTES_NUM. */ int *children; int id; }; static void destroy_trie(void *t) { trie *tp = (trie *)t; free(tp->children); free(tp); } /* Print a list of Trie's. */ UNUSED static void print_tries(const List * const restrict ls) { NUM len = list_length(ls); for (NUM i = 0; i < len; i++) { trie *t = (trie *) list_nth(ls, i); printf("Trie ID: %d:\n", t->id); BOOL has_children_p = 0; for (int j = 0; j < MAX_CHAR_BYTES_NUM; j++) { if (*(t->children+j) != -1) { has_children_p = 1; printf(" " "Char %d => State %d\n", j, *(t->children+j)); } } if (!has_children_p) printf(" " "No children!\n"); printf("\n"); } } void print_dfa(const dfa * const restrict table) { switch (table->type) { case DFA_TYPE_POSITIVE: printf("Printing a positive table:\n"); break; case DFA_TYPE_NEGATIVE: printf("Printing a negative table:\n"); break; case DFA_TYPE_BOTH: printf("Printing a mixed table:\n"); break; default: printf("A special table cannot be printed\n"); return; break; } printf("Printing a table:\n"); for (int i = 0; i < table->data.table.row_n; i++) { printf("Row %d: ", i); int size = 0; for (int j = 0; size < MAX_CHAR_BYTES_NUM; j++) { printf("%d %d", *(*(table->data.table.table+i)+(j<<1)), *(*(table->data.table.table+i)+(j<<1)+1)); size += *(*(table->data.table.table+i)+(j<<1)+1); if (size < MAX_CHAR_BYTES_NUM) printf(", "); else printf("\n"); } } printf("\n"); } /* REVIEW: Maybe we don't want to generate the complete trie in the process of generating the compressed table? */ dfa * dfa_from_bytes(int sequence_size, const NUM * const restrict data) { List *states = new_list(); trie *state_pointer = MYALLOC(trie, 1); INIT_TRIE(state_pointer, 0); if (add_to_list(states, state_pointer)) { fleprintf0("Cannot add to list\n"); destroy_trie(state_pointer); map_list(states, destroy_trie); destroy_list(states, 0); return NULL; } char *s = MYALLOC(char, 5); str *strp = new_str(s, 5); for (int i = 0; i < sequence_size; i++) { trie current_state = *((trie *) list_nth(states, 0)); NUM current_value = *(data+i); if (encode(current_value, strp)) { fleprintf("Fail to encode %ld at i = %d\n", current_value, i); /* cleanup */ destroy_str(strp, 1); map_list(states, destroy_trie); destroy_list(states, 0); return NULL; } int str_len = (int) str_length(strp); s = get_data(strp); unsigned char c = 0; for (int j = 0; j < (str_len-1); j++) { c = (unsigned char) *(s+j); if (*(current_state.children+c) == DFA_STATE_UNKNOWN) { /* a new state should be created */ state_pointer = MYALLOC(trie, 1); INIT_TRIE(state_pointer, list_length(states)); *(current_state.children+c) = list_length(states); if (add_to_list(states, state_pointer)) { /* cleanup */ destroy_str(strp, 1); destroy_trie(state_pointer); map_list(states, destroy_trie); destroy_list(states, 0); return NULL; } } current_state = *((trie *) list_nth(states, *(current_state.children+c))); } if (str_len) { /* the last byte leads to the accepting state */ c = (unsigned char) *(s+str_len-1); /* In the trie's that we will be forming, there will be no leaf that is also the parent of another node, by the design of UTF-8. */ *(current_state.children+c) = DFA_STATE_ACCEPT; } str_set_length(strp, 5); } destroy_str(strp, 1); /* For debugging */ /* print_tries(states); */ /* construct DFA from the trie */ NUM states_len = list_length(states); dfa *dfap = new_dfa(); dfap->data.table.table = MYALLOC(int *, states_len); dfap->data.table.row_n = states_len; for (NUM i = 0; i < states_len; i++) { int size = 1, current = -1, last = -1, current_length = 0; for (int j = 0; j < MAX_CHAR_BYTES_NUM; j++) { current = *(((trie *) list_nth(states, i))->children+j); if (current != last) size++; last = current; } *(dfap->data.table.table+i) = MYALLOC(int, size<<1); current = (last = -1); size = 1; for (int j = 0; j < MAX_CHAR_BYTES_NUM; j++) { current = *(((trie *) list_nth(states, i))->children+j); if (current != last) { *(*(dfap->data.table.table+i)+(size<<1)-2) = last; *(*(dfap->data.table.table+i)+(size<<1)-1) = current_length; size++; last = current; current_length = 1; if (j == MAX_CHAR_BYTES_NUM-1) { *(*(dfap->data.table.table+i)+(size<<1)-2) = last; *(*(dfap->data.table.table+i)+(size<<1)-1) = 1; } } else if (j == MAX_CHAR_BYTES_NUM-1) { *(*(dfap->data.table.table+i)+(size<<1)-2) = current; *(*(dfap->data.table.table+i)+(size<<1)-1) = current_length+1; } else { current_length++; } } } /* cleanup the trie */ map_list(states, destroy_trie); destroy_list(states, 0); return dfap; } dfa * dfa_from_bytes_neg(int sequence_size, const NUM * const restrict data) { List *states = new_list(); trie *state_pointer = MYALLOC(trie, 1); INIT_TRIE(state_pointer, 0); if (add_to_list(states, state_pointer)) { fleprintf0("Cannot add to list\n"); destroy_trie(state_pointer); map_list(states, destroy_trie); destroy_list(states, 0); return NULL; } char *s = MYALLOC(char, 5); str *strp = new_str(s, 5); for (int i = 0; i < sequence_size; i++) { trie current_state = *((trie *) list_nth(states, 0)); NUM current_value = *(data+i); if (encode(current_value, strp)) { fleprintf("Fail to encode %ld at i = %d\n", current_value, i); /* cleanup */ destroy_str(strp, 1); map_list(states, destroy_trie); destroy_list(states, 0); return NULL; } int str_len = (int) str_length(strp); s = get_data(strp); unsigned char c = 0; for (int j = 0; j < (str_len-1); j++) { c = (unsigned char) *(s+j); if (*(current_state.children+c) == DFA_STATE_UNKNOWN) { /* a new state should be created */ state_pointer = MYALLOC(trie, 1); INIT_TRIE(state_pointer, list_length(states)); *(current_state.children+c) = list_length(states); if (add_to_list(states, state_pointer)) { /* cleanup */ destroy_str(strp, 1); destroy_trie(state_pointer); map_list(states, destroy_trie); destroy_list(states, 0); return NULL; } } current_state = *((trie *) list_nth(states, *(current_state.children+c))); } if (str_len) { /* the last byte leads to the rejecting state */ c = (unsigned char) *(s+str_len-1); /* In the trie's that we will be forming, there will be no leaf that is also the parent of another node, by the design of UTF-8. */ *(current_state.children+c) = DFA_STATE_REJECT; } str_set_length(strp, 5); } destroy_str(strp, 1); /* For debugging */ /* print_tries(states); */ /* construct DFA from the trie */ NUM states_len = list_length(states); dfa *dfap = new_dfa(); dfap->data.table.table = MYALLOC(int *, states_len); dfap->data.table.row_n = states_len; dfap->type = 1; for (NUM i = 0; i < states_len; i++) { int size = 1, current = -1, last = -1, current_length = 0; for (int j = 0; j < MAX_CHAR_BYTES_NUM; j++) { current = *(((trie *) list_nth(states, i))->children+j); if (current != last) size++; last = current; } *(dfap->data.table.table+i) = MYALLOC(int, size<<1); current = (last = -1); size = 1; for (int j = 0; j < MAX_CHAR_BYTES_NUM; j++) { current = *(((trie *) list_nth(states, i))->children+j); if (current != last) { *(*(dfap->data.table.table+i)+(size<<1)-2) = last; *(*(dfap->data.table.table+i)+(size<<1)-1) = current_length; size++; last = current; current_length = 1; if (j == MAX_CHAR_BYTES_NUM-1) { *(*(dfap->data.table.table+i)+(size<<1)-2) = last; *(*(dfap->data.table.table+i)+(size<<1)-1) = 1; } } else if (j == MAX_CHAR_BYTES_NUM-1) { *(*(dfap->data.table.table+i)+(size<<1)-2) = current; *(*(dfap->data.table.table+i)+(size<<1)-1) = current_length+1; } else { current_length++; } } } /* cleanup the trie */ map_list(states, destroy_trie); destroy_list(states, 0); return dfap; } dfa *dfa_from_bytes_both(int sequence_size, const NUM * const restrict data, int neg_sequence_size, const NUM * const restrict negdata) { List *states = new_list(); trie *state_pointer = MYALLOC(trie, 1); INIT_TRIE(state_pointer, 0); if (add_to_list(states, state_pointer)) { fleprintf0("Cannot add to list\n"); destroy_trie(state_pointer); map_list(states, destroy_trie); destroy_list(states, 0); return NULL; } char *s = MYALLOC(char, 5); str *strp = new_str(s, 5); for (int i = 0; i < sequence_size; i++) { trie current_state = *((trie *) list_nth(states, 0)); NUM current_value = *(data+i); if (encode(current_value, strp)) { fleprintf("Fail to encode %ld at i = %d\n", current_value, i); /* cleanup */ destroy_str(strp, 1); map_list(states, destroy_trie); destroy_list(states, 0); return NULL; } int str_len = (int) str_length(strp); s = get_data(strp); unsigned char c = 0; for (int j = 0; j < (str_len-1); j++) { c = (unsigned char) *(s+j); if (*(current_state.children+c) == DFA_STATE_UNKNOWN) { /* a new state should be created */ state_pointer = MYALLOC(trie, 1); INIT_TRIE(state_pointer, list_length(states)); *(current_state.children+c) = list_length(states); if (add_to_list(states, state_pointer)) { /* cleanup */ destroy_str(strp, 1); destroy_trie(state_pointer); map_list(states, destroy_trie); destroy_list(states, 0); return NULL; } } current_state = *((trie *) list_nth(states, *(current_state.children+c))); } if (str_len) { /* the last byte leads to the accepting state */ c = (unsigned char) *(s+str_len-1); /* In the trie's that we will be forming, there will be no leaf that is also the parent of another node, by the design of UTF-8. */ *(current_state.children+c) = DFA_STATE_ACCEPT; } str_set_length(strp, 5); } for (int i = 0; i < neg_sequence_size; i++) { trie current_state = *((trie *) list_nth(states, 0)); NUM current_value = *(negdata+i); if (encode(current_value, strp)) { fleprintf("Fail to encode %ld at i = %d\n", current_value, i); /* cleanup */ destroy_str(strp, 1); map_list(states, destroy_trie); destroy_list(states, 0); return NULL; } int str_len = (int) str_length(strp); s = get_data(strp); unsigned char c = 0; for (int j = 0; j < (str_len-1); j++) { c = (unsigned char) *(s+j); if (*(current_state.children+c) == DFA_STATE_UNKNOWN) { /* a new state should be created */ state_pointer = MYALLOC(trie, 1); INIT_TRIE(state_pointer, list_length(states)); *(current_state.children+c) = list_length(states); if (add_to_list(states, state_pointer)) { /* cleanup */ fleprintf("Fail to add to list when i = %d, j = %d, " "and current value = %ld\n", i, j, current_value); destroy_str(strp, 1); destroy_trie(state_pointer); map_list(states, destroy_trie); destroy_list(states, 0); return NULL; } } current_state = *((trie *) list_nth(states, *(current_state.children+c))); } if (str_len) { /* the last byte leads to the rejecting state */ c = (unsigned char) *(s+str_len-1); /* In the trie's that we will be forming, there will be no leaf that is also the parent of another node, by the design of UTF-8. */ *(current_state.children+c) = DFA_STATE_REJECT; } str_set_length(strp, 5); } destroy_str(strp, 1); /* For debugging */ /* print_tries(states); */ /* construct DFA from the trie */ NUM states_len = list_length(states); dfa *dfap = new_dfa(); dfap->data.table.table = MYALLOC(int *, states_len); dfap->data.table.row_n = states_len; for (NUM i = 0; i < states_len; i++) { int size = 1, current = -1, last = -1, current_length = 0; for (int j = 0; j < MAX_CHAR_BYTES_NUM; j++) { current = *(((trie *) list_nth(states, i))->children+j); if (current != last) size++; last = current; } *(dfap->data.table.table+i) = MYALLOC(int, size<<1); current = (last = -1); size = 1; for (int j = 0; j < MAX_CHAR_BYTES_NUM; j++) { current = *(((trie *) list_nth(states, i))->children+j); if (current != last) { *(*(dfap->data.table.table+i)+(size<<1)-2) = last; *(*(dfap->data.table.table+i)+(size<<1)-1) = current_length; size++; last = current; current_length = 1; if (j == MAX_CHAR_BYTES_NUM-1) { *(*(dfap->data.table.table+i)+(size<<1)-2) = last; *(*(dfap->data.table.table+i)+(size<<1)-1) = 1; } } else if (j == MAX_CHAR_BYTES_NUM-1) { *(*(dfap->data.table.table+i)+(size<<1)-2) = current; *(*(dfap->data.table.table+i)+(size<<1)-1) = current_length+1; } else { current_length++; } } } /* cleanup the trie */ map_list(states, destroy_trie); destroy_list(states, 0); return dfap; } BOOL run_dfa(const dfa * const restrict table, const NUM code) { switch (table->type) { case DFA_TYPE_POSITIVE: case DFA_TYPE_NEGATIVE: case DFA_TYPE_BOTH: break; default: return table->data.fn(code); break; } char *s = MYALLOC(char, 5); str *strp = new_str(s, 5); if (encode(code, strp)) { destroy_str(strp, 1); fleprintf0("Fail to encode\n"); return 0; } int current_row = 0; for (NUM i = 0; i < str_length(strp); i++) { unsigned char c = (unsigned char) *(s+i); int lookup_value = 0; for (int j = 0, len = 0; len <= c;) { len += *(*(table->data.table.table+current_row)+(j<<1)+1); lookup_value = *(*(table->data.table.table+current_row) +(j++<<1)); } /* fleprintf("lookup_value = %d\n", lookup_value); */ if (lookup_value > 0) { current_row = lookup_value; } else { switch (lookup_value) { case DFA_STATE_REJECT: destroy_str(strp, 1); return 0; break; case DFA_STATE_UNKNOWN: switch (table->type) { case DFA_TYPE_BOTH: case DFA_TYPE_POSITIVE: destroy_str(strp, 1); return 0; break; case DFA_TYPE_NEGATIVE: destroy_str(strp, 1); return 1; break; default: /* This should not be possible, as we won't reach here if the table is of the special type. But I am apparently paranoia. */ fleprintf0("Special table in the wrong place\n"); destroy_str(strp, 1); return 0; break; } break; case DFA_STATE_ACCEPT: destroy_str(strp, 1); return 1; break; default: fleprintf("Unknown state number for char %d: %d\n", c, lookup_value); destroy_str(strp, 1); return 0; break; } } } destroy_str(strp, 1); return 0; } P_ATTR static BOOL dfa_any_fun(const NUM UNUSED code) { return 1; } dfa * dfa_from_func(special_dfa func) { dfa *result = NULL; SAFE_MALLOC(dfa, result, 1, return NULL;); result->type = DFA_TYPE_SPECIAL; result->data.fn = func; return result; } dfa * dfa_any(void) { return dfa_from_func(dfa_any_fun); }