#include "utf8.h" #include "str.h" #include #include "list.h" #include "dfa.h" #define INIT_TRIE(X, ID) do { \ X->id = ID; \ X->children = MYALLOC(int, MAX_CHAR_BYTES_NUM); \ for (int macro_index = 0; macro_index < MAX_CHAR_BYTES_NUM;) \ *(X->children+macro_index++) = DFA_STATE_UNKNOWN; \ } while (0) struct range_s { NUM beg, end; }; struct special_dfa_s { NUM len; struct range_s *ranges; /* This is only used when this is a mixed special dfa */ NUM boundary; }; /* The complete, uncompressed table is represented as an array of arrays, each array of which has size MAX_CHAR_BYTES_NUM. The compressed table is represented as run-lengths. The ROW_N is the size of the total table. And the TABLE stores the data. It is an array of arrays. The format of the array of a row is as follows: VALUE1 LEN1 VALUE2 LEN2 This means VALUE1 appears LEN1 times, and then VALUE2 appears LEN2 times, and so on. Each array of a row ends when the lengths sum to MAX_CHAR_BYTES_NUM. */ struct compressed_table_s { int row_n; int **table; }; enum DFA_TYPE { DFA_TYPE_POSITIVE, DFA_TYPE_NEGATIVE, DFA_TYPE_BOTH, DFA_TYPE_SPECIAL, DFA_TYPE_SPECIAL_NEG, DFA_TYPE_SPECIAL_BOTH, }; struct dfa_s { enum DFA_TYPE type; union { compressed_table table; special_dfa sp; } data; }; dfa * new_dfa() { dfa *dfap = NULL; SAFE_CALLOC(dfa, dfap, 1, return NULL;); return dfap; } void destroy_dfa(dfa *table) { if (table->type == DFA_TYPE_SPECIAL || table->type == DFA_TYPE_SPECIAL_NEG || table->type == DFA_TYPE_SPECIAL_BOTH) { free(table->data.sp.ranges); goto free_last; } for (int i = 0; i < table->data.table.row_n;) free(*(table->data.table.table+i++)); free(table->data.table.table); free_last: free(table); } /* a trie structure */ typedef struct trie_s trie; /* REVIEW: Maybe we don't need ID? */ struct trie_s { /* It is guaranteed that the number of children is MAX_CHAR_BYTES_NUM. */ int *children; int id; }; static void destroy_trie(void *t) { trie *tp = (trie *)t; free(tp->children); free(tp); } /* Print a list of Trie's. */ UNUSED static void print_tries(const List * const restrict ls) { NUM len = list_length(ls); for (NUM i = 0; i < len; i++) { trie *t = (trie *) list_nth(ls, i); printf("Trie ID: %d:\n", t->id); BOOL has_children_p = 0; for (int j = 0; j < MAX_CHAR_BYTES_NUM; j++) { if (*(t->children+j) != -1) { has_children_p = 1; printf(" " "Char %d => State %d\n", j, *(t->children+j)); } } if (!has_children_p) printf(" " "No children!\n"); printf("\n"); } } void print_dfa(const dfa * const restrict table) { switch (table->type) { case DFA_TYPE_POSITIVE: printf("Printing a positive table:\n"); goto print_table; break; case DFA_TYPE_NEGATIVE: printf("Printing a negative table:\n"); break; case DFA_TYPE_BOTH: printf("Printing a mixed table:\n"); goto print_table; break; case DFA_TYPE_SPECIAL: printf("Printing a table containing positive ranges:\n"); break; case DFA_TYPE_SPECIAL_NEG: printf("Printing a table containing negative ranges:\n"); break; case DFA_TYPE_SPECIAL_BOTH: printf("Printing a table containing mixed ranges:\n"); break; default: printf("Table of unknown type\n"); return; break; } for (int i = 0; i < table->data.sp.len; i++) { printf("Range %d: %ld to %ld\n", i, (table->data.sp.ranges+i)->beg, (table->data.sp.ranges+i)->end); } printf("\n"); return; print_table: for (int i = 0; i < table->data.table.row_n; i++) { printf("Row %d: ", i); int size = 0; for (int j = 0; size < MAX_CHAR_BYTES_NUM; j++) { printf("%d %d", *(*(table->data.table.table+i)+(j<<1)), *(*(table->data.table.table+i)+(j<<1)+1)); size += *(*(table->data.table.table+i)+(j<<1)+1); if (size < MAX_CHAR_BYTES_NUM) printf(", "); else printf("\n"); } } printf("\n"); } /* REVIEW: Maybe we don't want to generate the complete trie in the process of generating the compressed table? */ dfa * dfa_from_bytes(int sequence_size, const NUM * const restrict data) { List *states = new_list(); trie *state_pointer = MYALLOC(trie, 1); INIT_TRIE(state_pointer, 0); if (add_to_list(states, state_pointer)) { fleprintf0("Cannot add to list\n"); destroy_trie(state_pointer); map_list(states, destroy_trie); destroy_list(states, 0); return NULL; } char *s = MYALLOC(char, 5); str *strp = new_str(s, 5); for (int i = 0; i < sequence_size; i++) { trie current_state = *((trie *) list_nth(states, 0)); NUM current_value = *(data+i); if (encode(current_value, strp)) { fleprintf("Fail to encode %ld at i = %d\n", current_value, i); /* cleanup */ destroy_str(strp, 1); map_list(states, destroy_trie); destroy_list(states, 0); return NULL; } int str_len = (int) str_length(strp); s = get_data(strp); unsigned char c = 0; for (int j = 0; j < (str_len-1); j++) { c = (unsigned char) *(s+j); if (*(current_state.children+c) == DFA_STATE_UNKNOWN) { /* a new state should be created */ state_pointer = MYALLOC(trie, 1); INIT_TRIE(state_pointer, list_length(states)); *(current_state.children+c) = list_length(states); if (add_to_list(states, state_pointer)) { /* cleanup */ destroy_str(strp, 1); destroy_trie(state_pointer); map_list(states, destroy_trie); destroy_list(states, 0); return NULL; } } current_state = *((trie *) list_nth(states, *(current_state.children+c))); } if (str_len) { /* the last byte leads to the accepting state */ c = (unsigned char) *(s+str_len-1); /* In the trie's that we will be forming, there will be no leaf that is also the parent of another node, by the design of UTF-8. */ *(current_state.children+c) = DFA_STATE_ACCEPT; } str_set_length(strp, 5); } destroy_str(strp, 1); /* For debugging */ #ifdef DEBUG /* print_tries(states); */ #endif /* construct DFA from the trie */ NUM states_len = list_length(states); dfa *dfap = new_dfa(); dfap->data.table.table = MYALLOC(int *, states_len); dfap->data.table.row_n = states_len; for (NUM i = 0; i < states_len; i++) { int size = 1, current = -1, last = -1, current_length = 0; for (int j = 0; j < MAX_CHAR_BYTES_NUM; j++) { current = *(((trie *) list_nth(states, i))->children+j); if (current != last) size++; last = current; } *(dfap->data.table.table+i) = MYALLOC(int, size<<1); current = (last = -1); size = 1; for (int j = 0; j < MAX_CHAR_BYTES_NUM; j++) { current = *(((trie *) list_nth(states, i))->children+j); if (current != last) { *(*(dfap->data.table.table+i)+(size<<1)-2) = last; *(*(dfap->data.table.table+i)+(size<<1)-1) = current_length; size++; last = current; current_length = 1; if (j == MAX_CHAR_BYTES_NUM-1) { *(*(dfap->data.table.table+i)+(size<<1)-2) = last; *(*(dfap->data.table.table+i)+(size<<1)-1) = 1; } } else if (j == MAX_CHAR_BYTES_NUM-1) { *(*(dfap->data.table.table+i)+(size<<1)-2) = current; *(*(dfap->data.table.table+i)+(size<<1)-1) = current_length+1; } else { current_length++; } } } /* cleanup the trie */ map_list(states, destroy_trie); destroy_list(states, 0); return dfap; } dfa * dfa_from_bytes_neg(int sequence_size, const NUM * const restrict data) { List *states = new_list(); trie *state_pointer = MYALLOC(trie, 1); INIT_TRIE(state_pointer, 0); if (add_to_list(states, state_pointer)) { fleprintf0("Cannot add to list\n"); destroy_trie(state_pointer); map_list(states, destroy_trie); destroy_list(states, 0); return NULL; } char *s = MYALLOC(char, 5); str *strp = new_str(s, 5); for (int i = 0; i < sequence_size; i++) { trie current_state = *((trie *) list_nth(states, 0)); NUM current_value = *(data+i); if (encode(current_value, strp)) { fleprintf("Fail to encode %ld at i = %d\n", current_value, i); /* cleanup */ destroy_str(strp, 1); map_list(states, destroy_trie); destroy_list(states, 0); return NULL; } int str_len = (int) str_length(strp); s = get_data(strp); unsigned char c = 0; for (int j = 0; j < (str_len-1); j++) { c = (unsigned char) *(s+j); if (*(current_state.children+c) == DFA_STATE_UNKNOWN) { /* a new state should be created */ state_pointer = MYALLOC(trie, 1); INIT_TRIE(state_pointer, list_length(states)); *(current_state.children+c) = list_length(states); if (add_to_list(states, state_pointer)) { /* cleanup */ destroy_str(strp, 1); destroy_trie(state_pointer); map_list(states, destroy_trie); destroy_list(states, 0); return NULL; } } current_state = *((trie *) list_nth(states, *(current_state.children+c))); } if (str_len) { /* the last byte leads to the rejecting state */ c = (unsigned char) *(s+str_len-1); /* In the trie's that we will be forming, there will be no leaf that is also the parent of another node, by the design of UTF-8. */ *(current_state.children+c) = DFA_STATE_REJECT; } str_set_length(strp, 5); } destroy_str(strp, 1); /* For debugging */ /* print_tries(states); */ /* construct DFA from the trie */ NUM states_len = list_length(states); dfa *dfap = new_dfa(); dfap->data.table.table = MYALLOC(int *, states_len); dfap->data.table.row_n = states_len; dfap->type = 1; for (NUM i = 0; i < states_len; i++) { int size = 1, current = -1, last = -1, current_length = 0; for (int j = 0; j < MAX_CHAR_BYTES_NUM; j++) { current = *(((trie *) list_nth(states, i))->children+j); if (current != last) size++; last = current; } *(dfap->data.table.table+i) = MYALLOC(int, size<<1); current = (last = -1); size = 1; for (int j = 0; j < MAX_CHAR_BYTES_NUM; j++) { current = *(((trie *) list_nth(states, i))->children+j); if (current != last) { *(*(dfap->data.table.table+i)+(size<<1)-2) = last; *(*(dfap->data.table.table+i)+(size<<1)-1) = current_length; size++; last = current; current_length = 1; if (j == MAX_CHAR_BYTES_NUM-1) { *(*(dfap->data.table.table+i)+(size<<1)-2) = last; *(*(dfap->data.table.table+i)+(size<<1)-1) = 1; } } else if (j == MAX_CHAR_BYTES_NUM-1) { *(*(dfap->data.table.table+i)+(size<<1)-2) = current; *(*(dfap->data.table.table+i)+(size<<1)-1) = current_length+1; } else { current_length++; } } } /* cleanup the trie */ map_list(states, destroy_trie); destroy_list(states, 0); return dfap; } dfa *dfa_from_bytes_both(int sequence_size, const NUM * const restrict data, int neg_sequence_size, const NUM * const restrict negdata) { List *states = new_list(); trie *state_pointer = MYALLOC(trie, 1); INIT_TRIE(state_pointer, 0); if (add_to_list(states, state_pointer)) { fleprintf0("Cannot add to list\n"); destroy_trie(state_pointer); map_list(states, destroy_trie); destroy_list(states, 0); return NULL; } char *s = MYALLOC(char, 5); str *strp = new_str(s, 5); for (int i = 0; i < sequence_size; i++) { trie current_state = *((trie *) list_nth(states, 0)); NUM current_value = *(data+i); if (encode(current_value, strp)) { fleprintf("Fail to encode %ld at i = %d\n", current_value, i); /* cleanup */ destroy_str(strp, 1); map_list(states, destroy_trie); destroy_list(states, 0); return NULL; } int str_len = (int) str_length(strp); s = get_data(strp); unsigned char c = 0; for (int j = 0; j < (str_len-1); j++) { c = (unsigned char) *(s+j); if (*(current_state.children+c) == DFA_STATE_UNKNOWN) { /* a new state should be created */ state_pointer = MYALLOC(trie, 1); INIT_TRIE(state_pointer, list_length(states)); *(current_state.children+c) = list_length(states); if (add_to_list(states, state_pointer)) { /* cleanup */ destroy_str(strp, 1); destroy_trie(state_pointer); map_list(states, destroy_trie); destroy_list(states, 0); return NULL; } } current_state = *((trie *) list_nth(states, *(current_state.children+c))); } if (str_len) { /* the last byte leads to the accepting state */ c = (unsigned char) *(s+str_len-1); /* In the trie's that we will be forming, there will be no leaf that is also the parent of another node, by the design of UTF-8. */ *(current_state.children+c) = DFA_STATE_ACCEPT; } str_set_length(strp, 5); } for (int i = 0; i < neg_sequence_size; i++) { trie current_state = *((trie *) list_nth(states, 0)); NUM current_value = *(negdata+i); if (encode(current_value, strp)) { fleprintf("Fail to encode %ld at i = %d\n", current_value, i); /* cleanup */ destroy_str(strp, 1); map_list(states, destroy_trie); destroy_list(states, 0); return NULL; } int str_len = (int) str_length(strp); s = get_data(strp); unsigned char c = 0; for (int j = 0; j < (str_len-1); j++) { c = (unsigned char) *(s+j); if (*(current_state.children+c) == DFA_STATE_UNKNOWN) { /* a new state should be created */ state_pointer = MYALLOC(trie, 1); INIT_TRIE(state_pointer, list_length(states)); *(current_state.children+c) = list_length(states); if (add_to_list(states, state_pointer)) { /* cleanup */ fleprintf("Fail to add to list when i = %d, j = %d, " "and current value = %ld\n", i, j, current_value); destroy_str(strp, 1); destroy_trie(state_pointer); map_list(states, destroy_trie); destroy_list(states, 0); return NULL; } } current_state = *((trie *) list_nth(states, *(current_state.children+c))); } if (str_len) { /* the last byte leads to the rejecting state */ c = (unsigned char) *(s+str_len-1); /* In the trie's that we will be forming, there will be no leaf that is also the parent of another node, by the design of UTF-8. */ *(current_state.children+c) = DFA_STATE_REJECT; } str_set_length(strp, 5); } destroy_str(strp, 1); /* For debugging */ /* print_tries(states); */ /* construct DFA from the trie */ NUM states_len = list_length(states); dfa *dfap = new_dfa(); dfap->data.table.table = MYALLOC(int *, states_len); dfap->data.table.row_n = states_len; for (NUM i = 0; i < states_len; i++) { int size = 1, current = -1, last = -1, current_length = 0; for (int j = 0; j < MAX_CHAR_BYTES_NUM; j++) { current = *(((trie *) list_nth(states, i))->children+j); if (current != last) size++; last = current; } *(dfap->data.table.table+i) = MYALLOC(int, size<<1); current = (last = -1); size = 1; for (int j = 0; j < MAX_CHAR_BYTES_NUM; j++) { current = *(((trie *) list_nth(states, i))->children+j); if (current != last) { *(*(dfap->data.table.table+i)+(size<<1)-2) = last; *(*(dfap->data.table.table+i)+(size<<1)-1) = current_length; size++; last = current; current_length = 1; if (j == MAX_CHAR_BYTES_NUM-1) { *(*(dfap->data.table.table+i)+(size<<1)-2) = last; *(*(dfap->data.table.table+i)+(size<<1)-1) = 1; } } else if (j == MAX_CHAR_BYTES_NUM-1) { *(*(dfap->data.table.table+i)+(size<<1)-2) = current; *(*(dfap->data.table.table+i)+(size<<1)-1) = current_length+1; } else { current_length++; } } } /* cleanup the trie */ map_list(states, destroy_trie); destroy_list(states, 0); return dfap; } dfa * dfa_from_ranges(int len, CCR_MOD(NUM *) data) { struct range_s *rp = NULL; SAFE_MALLOC(struct range_s, rp, len, return NULL;); for (int i = 0; i < len; i++) { (rp+i)->beg = *(data+(i<<1)); (rp+i)->end = *(data+(i<<1)+1); } dfa *dfap = NULL; SAFE_MALLOC(dfa, dfap, 1, free(rp); return NULL;); dfap->type = DFA_TYPE_SPECIAL; dfap->data.sp = (special_dfa) { len, rp, 0 }; return dfap; } dfa * dfa_from_ranges_neg(int len, CCR_MOD(NUM *) data) { dfa *dfap = dfa_from_ranges(len, data); if (dfap == NULL) return NULL; dfap->type = DFA_TYPE_SPECIAL_NEG; return dfap; } dfa * dfa_from_ranges_both(int plen, CCR_MOD(NUM *) pdata, int nlen, CCR_MOD(NUM *) ndata) { int len = plen + nlen; struct range_s *rp = NULL; SAFE_MALLOC(struct range_s, rp, len, return NULL;); for (int i = 0; i < plen; i++) { (rp+i)->beg = *(pdata+(i<<1)); (rp+i)->end = *(pdata+(i<<1)+1); } for (int i = 0; i < nlen; i++) { (rp+plen+i)->beg = *(ndata+(i<<1)); (rp+plen+i)->end = *(ndata+(i<<1)+1); } dfa *dfap = NULL; SAFE_MALLOC(dfa, dfap, 1, free(rp); return NULL;); dfap->type = DFA_TYPE_SPECIAL_BOTH; dfap->data.sp = (special_dfa) { len, rp, plen }; return dfap; } BOOL run_dfa(const dfa * const restrict table, const NUM code) { /* negative codes have special purposes */ if (code < 0) return 0; switch (table->type) { case DFA_TYPE_POSITIVE: case DFA_TYPE_NEGATIVE: case DFA_TYPE_BOTH: break; case DFA_TYPE_SPECIAL: for (int i = 0; i < table->data.sp.len; i++) if (code >= (table->data.sp.ranges+i)->beg && code <= (table->data.sp.ranges+i)->end) { #ifdef DEBUG fleprintf("i = %d, code = %ld, beg = %ld, end = %ld\n", i, code, (table->data.sp.ranges+i)->beg, (table->data.sp.ranges+i)->end); #endif return 1; } return 0; break; case DFA_TYPE_SPECIAL_NEG: for (int i = 0; i < table->data.sp.len; i++) { if (code >= (table->data.sp.ranges+i)->beg && code <= (table->data.sp.ranges+i)->end) { #ifdef DEBUG fleprintf("code = %ld, i = %d, beg = %ld, end = %ld\n", code, i, (table->data.sp.ranges+i)->beg, (table->data.sp.ranges+i)->end); #endif return 0; } } return 1; break; case DFA_TYPE_SPECIAL_BOTH: for (int i = table->data.sp.boundary; i < table->data.sp.len; i++) if (code >= (table->data.sp.ranges+i)->beg && code <= (table->data.sp.ranges+i)->end) { #ifdef DEBUG fleprintf("code = %ld, beg = %ld, end = %ld\n", code, (table->data.sp.ranges+i)->beg, (table->data.sp.ranges+i)->end); #endif return 0; } for (int i = 0; i < table->data.sp.boundary; i++) if (code >= (table->data.sp.ranges+i)->beg && code <= (table->data.sp.ranges+i)->end) { #ifdef DEBUG fleprintf("code = %ld, beg = %ld, end = %ld\n", code, (table->data.sp.ranges+i)->beg, (table->data.sp.ranges+i)->end); #endif return 1; } return 0; break; default: fleprintf("Unknown type: %d\n", table->type); return 0; break; } /* below is for running the DFA */ char s[5]; str *strp = new_str(s, 5); if (encode(code, strp)) { destroy_str(strp, 0); fleprintf0("Fail to encode\n"); return 0; } int current_row = 0; for (NUM i = 0; i < str_length(strp); i++) { unsigned char c = (unsigned char) *(s+i); int lookup_value = 0; for (int j = 0, len = 0; len <= c;) { len += *(*(table->data.table.table+current_row)+(j<<1)+1); lookup_value = *(*(table->data.table.table+current_row) +(j++<<1)); } if (lookup_value > 0) { current_row = lookup_value; } else { switch (lookup_value) { case DFA_STATE_REJECT: destroy_str(strp, 0); return 0; break; case DFA_STATE_UNKNOWN: switch (table->type) { case DFA_TYPE_BOTH: case DFA_TYPE_POSITIVE: destroy_str(strp, 0); return 0; break; case DFA_TYPE_NEGATIVE: destroy_str(strp, 0); return 1; break; default: /* This should not be possible, as we won't reach here if the table is of the special type. But I am apparently paranoia. */ fleprintf0("Special table in the wrong place\n"); destroy_str(strp, 0); return 0; break; } break; case DFA_STATE_ACCEPT: destroy_str(strp, 0); return 1; break; default: fleprintf("Unknown state number for char %d: %d\n", c, lookup_value); destroy_str(strp, 0); return 0; break; } } } destroy_str(strp, 0); return 0; }