diff options
-rw-r--r-- | src/Makefile.am | 8 | ||||
-rw-r--r-- | src/dfa.c | 692 | ||||
-rw-r--r-- | src/dfa.h | 64 | ||||
-rw-r--r-- | src/grammar.c | 17 | ||||
-rw-r--r-- | src/grammar.h | 53 | ||||
-rw-r--r-- | src/list.c | 16 | ||||
-rw-r--r-- | src/list.h | 14 | ||||
-rw-r--r-- | src/reader.c | 66 | ||||
-rw-r--r-- | src/reader.h | 2 | ||||
-rw-r--r-- | src/str.c | 10 | ||||
-rw-r--r-- | src/str.h | 8 | ||||
-rw-r--r-- | src/str_partial.h | 2 | ||||
-rw-r--r-- | src/test.txt | 34 | ||||
-rw-r--r-- | src/test/check_dfa.c | 108 | ||||
-rw-r--r-- | src/utf8.c | 4 | ||||
-rw-r--r-- | src/utf8.h | 2 | ||||
-rw-r--r-- | src/util.c | 2 | ||||
-rw-r--r-- | src/util.h | 4 |
18 files changed, 1019 insertions, 87 deletions
diff --git a/src/Makefile.am b/src/Makefile.am index 4690876..f43fbee 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -2,9 +2,9 @@ AM_CFLAGS = -Wall -Wextra noinst_LIBRARIES = libeps.a libeps_a_SOURCES = grammar.c list.c util.c reader.c \ -str.c utf8.c lr_table.c \ +str.c utf8.c dfa.c \ grammar.h list.h util.h reader.h \ -str_partial.h str.h utf8.h lr_table.h +str_partial.h str.h utf8.h dfa.h libeps_a_CFLAGS = $(AM_CFLAGS) --pedantic @@ -21,7 +21,7 @@ CLEANFILES = TAGS # tests -check_PROGRAMS = check_list check_grammar check_reader +check_PROGRAMS = check_list check_grammar check_reader check_dfa check_list_SOURCES = test/check_list.c list.c @@ -31,6 +31,8 @@ utf8.c check_reader_SOURCES = test/check_reader.c list.c grammar.c reader.c \ str.c utf8.c util.c +check_dfa_SOURCES = test/check_dfa.c dfa.c list.c str.c utf8.c + TESTS = $(check_PROGRAMS) AM_COLOR_TESTS = always diff --git a/src/dfa.c b/src/dfa.c new file mode 100644 index 0000000..e3bcdf0 --- /dev/null +++ b/src/dfa.c @@ -0,0 +1,692 @@ +#include "utf8.h" +#include "str.h" +#include <stdio.h> +#include "list.h" +#include "dfa.h" + +#define INIT_TRIE(X, ID) do { \ + X->id = ID; \ + X->children = MYALLOC(int, MAX_CHAR_BYTES_NUM); \ + for (int macro_index = 0; macro_index < MAX_CHAR_BYTES_NUM;) \ + *(X->children+macro_index++) = DFA_STATE_UNKNOWN; \ + } while (0) + +/* The complete, uncompressed table is represented as an array of + arrays, each array of which has size MAX_CHAR_BYTES_NUM. The + compressed table is represented as run-lengths. The ROW_N is the + size of the total table. And the TABLE stores the data. It is an + array of arrays. The format of the array of a row is as follows: + + VALUE1 LEN1 VALUE2 LEN2 + + This means VALUE1 appears LEN1 times, and then VALUE2 appears LEN2 + times, and so on. Each array of a row ends when the lengths sum to + MAX_CHAR_BYTES_NUM. */ +struct compressed_table_s { + int row_n; + int **table; +}; + +enum DFA_TYPE { + DFA_TYPE_POSITIVE, + DFA_TYPE_NEGATIVE, + DFA_TYPE_BOTH, + DFA_TYPE_SPECIAL, +}; + +struct dfa_s { + enum DFA_TYPE type; + union { + compressed_table table; + special_dfa fn; + } data; +}; + +dfa * +new_dfa() +{ + dfa *dfap = MYALLOC(dfa, 1); + + dfap->type = 0; + + dfap->data.table.row_n = 0; + dfap->data.table.table = NULL; + + return dfap; +} + +void +destroy_dfa(dfa *table) +{ + /* function pointers need no destructors */ + if (table->type == DFA_TYPE_SPECIAL) return; + + for (int i = 0; i < table->data.table.row_n;) + free(*(table->data.table.table+i++)); + + free(table->data.table.table); + free(table); +} + +/* a trie structure */ + +typedef struct trie_s trie; + +/* REVIEW: Maybe we don't need ID? */ +struct trie_s { + /* It is guaranteed that the number of children is + MAX_CHAR_BYTES_NUM. */ + int *children; + int id; +}; + +static void +destroy_trie(void *t) +{ + trie *tp = (trie *)t; + free(tp->children); + free(tp); +} + +/* Print a list of Trie's. */ +UNUSED +static void +print_tries(const List * const restrict ls) +{ + NUM len = list_length(ls); + for (NUM i = 0; i < len; i++) { + trie *t = (trie *) list_nth(ls, i); + printf("Trie ID: %d:\n", t->id); + + BOOL has_children_p = 0; + + for (int j = 0; j < MAX_CHAR_BYTES_NUM; j++) { + if (*(t->children+j) != -1) { + has_children_p = 1; + printf(" " + "Char %d => State %d\n", j, *(t->children+j)); + } + } + + if (!has_children_p) printf(" " "No children!\n"); + + printf("\n"); + } +} + +void +print_dfa(const dfa * const restrict table) +{ + switch (table->type) { + case DFA_TYPE_POSITIVE: + printf("Printing a positive table:\n"); + break; + case DFA_TYPE_NEGATIVE: + printf("Printing a negative table:\n"); + break; + case DFA_TYPE_BOTH: + printf("Printing a mixed table:\n"); + break; + default: + printf("A special table cannot be printed\n"); + return; + break; + } + + printf("Printing a table:\n"); + + for (int i = 0; i < table->data.table.row_n; i++) { + printf("Row %d: ", i); + int size = 0; + for (int j = 0; size < MAX_CHAR_BYTES_NUM; j++) { + printf("%d %d", *(*(table->data.table.table+i)+(j<<1)), + *(*(table->data.table.table+i)+(j<<1)+1)); + size += *(*(table->data.table.table+i)+(j<<1)+1); + + if (size < MAX_CHAR_BYTES_NUM) printf(", "); + else printf("\n"); + } + } + printf("\n"); +} + +/* REVIEW: Maybe we don't want to generate the complete trie in the + process of generating the compressed table? */ +dfa * +dfa_from_bytes(int sequence_size, + const NUM * const restrict data) +{ + List *states = new_list(); + + trie *state_pointer = MYALLOC(trie, 1); + + INIT_TRIE(state_pointer, 0); + + if (add_to_list(states, state_pointer)) { + fleprintf0("Cannot add to list\n"); + destroy_trie(state_pointer); + map_list(states, destroy_trie); + destroy_list(states, 0); + return NULL; + } + + char *s = MYALLOC(char, 5); + + str *strp = new_str(s, 5); + + for (int i = 0; i < sequence_size; i++) { + trie current_state = *((trie *) list_nth(states, 0)); + NUM current_value = *(data+i); + + if (encode(current_value, strp)) { + fleprintf("Fail to encode %ld at i = %d\n", current_value, i); + /* cleanup */ + destroy_str(strp, 1); + map_list(states, destroy_trie); + destroy_list(states, 0); + return NULL; + } + + int str_len = (int) str_length(strp); + s = get_data(strp); + unsigned char c = 0; + for (int j = 0; j < (str_len-1); j++) { + c = (unsigned char) *(s+j); + if (*(current_state.children+c) == DFA_STATE_UNKNOWN) { + /* a new state should be created */ + state_pointer = MYALLOC(trie, 1); + INIT_TRIE(state_pointer, list_length(states)); + *(current_state.children+c) = list_length(states); + if (add_to_list(states, state_pointer)) { + /* cleanup */ + destroy_str(strp, 1); + destroy_trie(state_pointer); + map_list(states, destroy_trie); + destroy_list(states, 0); + return NULL; + } + } + + current_state = + *((trie *) list_nth(states, *(current_state.children+c))); + } + + if (str_len) { + /* the last byte leads to the accepting state */ + c = (unsigned char) *(s+str_len-1); + + /* In the trie's that we will be forming, there will be no leaf + that is also the parent of another node, by the design of + UTF-8. */ + + *(current_state.children+c) = DFA_STATE_ACCEPT; + } + + str_set_length(strp, 5); + } + + destroy_str(strp, 1); + + /* For debugging */ + + /* print_tries(states); */ + + /* construct DFA from the trie */ + + NUM states_len = list_length(states); + + dfa *dfap = new_dfa(); + + dfap->data.table.table = MYALLOC(int *, states_len); + dfap->data.table.row_n = states_len; + + for (NUM i = 0; i < states_len; i++) { + int size = 1, current = -1, last = -1, current_length = 0; + + for (int j = 0; j < MAX_CHAR_BYTES_NUM; j++) { + current = *(((trie *) list_nth(states, i))->children+j); + if (current != last) size++; + last = current; + } + + *(dfap->data.table.table+i) = MYALLOC(int, size<<1); + + current = (last = -1); + + size = 1; + + for (int j = 0; j < MAX_CHAR_BYTES_NUM; j++) { + current = *(((trie *) list_nth(states, i))->children+j); + if (current != last) { + *(*(dfap->data.table.table+i)+(size<<1)-2) = last; + *(*(dfap->data.table.table+i)+(size<<1)-1) = current_length; + + size++; + last = current; + current_length = 1; + + if (j == MAX_CHAR_BYTES_NUM-1) { + *(*(dfap->data.table.table+i)+(size<<1)-2) = last; + *(*(dfap->data.table.table+i)+(size<<1)-1) = 1; + } + } else if (j == MAX_CHAR_BYTES_NUM-1) { + *(*(dfap->data.table.table+i)+(size<<1)-2) = current; + *(*(dfap->data.table.table+i)+(size<<1)-1) = current_length+1; + } else { + current_length++; + } + } + } + + /* cleanup the trie */ + map_list(states, destroy_trie); + destroy_list(states, 0); + + return dfap; +} + +dfa * +dfa_from_bytes_neg(int sequence_size, + const NUM * const restrict data) +{ + List *states = new_list(); + + trie *state_pointer = MYALLOC(trie, 1); + + INIT_TRIE(state_pointer, 0); + + if (add_to_list(states, state_pointer)) { + fleprintf0("Cannot add to list\n"); + destroy_trie(state_pointer); + map_list(states, destroy_trie); + destroy_list(states, 0); + return NULL; + } + + char *s = MYALLOC(char, 5); + + str *strp = new_str(s, 5); + + for (int i = 0; i < sequence_size; i++) { + trie current_state = *((trie *) list_nth(states, 0)); + NUM current_value = *(data+i); + + if (encode(current_value, strp)) { + fleprintf("Fail to encode %ld at i = %d\n", current_value, i); + /* cleanup */ + destroy_str(strp, 1); + map_list(states, destroy_trie); + destroy_list(states, 0); + return NULL; + } + + int str_len = (int) str_length(strp); + s = get_data(strp); + unsigned char c = 0; + for (int j = 0; j < (str_len-1); j++) { + c = (unsigned char) *(s+j); + if (*(current_state.children+c) == DFA_STATE_UNKNOWN) { + /* a new state should be created */ + state_pointer = MYALLOC(trie, 1); + INIT_TRIE(state_pointer, list_length(states)); + *(current_state.children+c) = list_length(states); + if (add_to_list(states, state_pointer)) { + /* cleanup */ + destroy_str(strp, 1); + destroy_trie(state_pointer); + map_list(states, destroy_trie); + destroy_list(states, 0); + return NULL; + } + } + + current_state = + *((trie *) list_nth(states, *(current_state.children+c))); + } + + if (str_len) { + /* the last byte leads to the rejecting state */ + c = (unsigned char) *(s+str_len-1); + + /* In the trie's that we will be forming, there will be no leaf + that is also the parent of another node, by the design of + UTF-8. */ + + *(current_state.children+c) = DFA_STATE_REJECT; + } + + str_set_length(strp, 5); + } + + destroy_str(strp, 1); + + /* For debugging */ + + /* print_tries(states); */ + + /* construct DFA from the trie */ + + NUM states_len = list_length(states); + + dfa *dfap = new_dfa(); + + dfap->data.table.table = MYALLOC(int *, states_len); + dfap->data.table.row_n = states_len; + dfap->type = 1; + + for (NUM i = 0; i < states_len; i++) { + int size = 1, current = -1, last = -1, current_length = 0; + + for (int j = 0; j < MAX_CHAR_BYTES_NUM; j++) { + current = *(((trie *) list_nth(states, i))->children+j); + if (current != last) size++; + last = current; + } + + *(dfap->data.table.table+i) = MYALLOC(int, size<<1); + + current = (last = -1); + + size = 1; + + for (int j = 0; j < MAX_CHAR_BYTES_NUM; j++) { + current = *(((trie *) list_nth(states, i))->children+j); + if (current != last) { + *(*(dfap->data.table.table+i)+(size<<1)-2) = last; + *(*(dfap->data.table.table+i)+(size<<1)-1) = current_length; + + size++; + last = current; + current_length = 1; + + if (j == MAX_CHAR_BYTES_NUM-1) { + *(*(dfap->data.table.table+i)+(size<<1)-2) = last; + *(*(dfap->data.table.table+i)+(size<<1)-1) = 1; + } + } else if (j == MAX_CHAR_BYTES_NUM-1) { + *(*(dfap->data.table.table+i)+(size<<1)-2) = current; + *(*(dfap->data.table.table+i)+(size<<1)-1) = current_length+1; + } else { + current_length++; + } + } + } + + /* cleanup the trie */ + map_list(states, destroy_trie); + destroy_list(states, 0); + + return dfap; +} + +dfa *dfa_from_bytes_both(int sequence_size, + const NUM * const restrict data, + int neg_sequence_size, + const NUM * const restrict negdata) +{ + List *states = new_list(); + + trie *state_pointer = MYALLOC(trie, 1); + + INIT_TRIE(state_pointer, 0); + + if (add_to_list(states, state_pointer)) { + fleprintf0("Cannot add to list\n"); + destroy_trie(state_pointer); + map_list(states, destroy_trie); + destroy_list(states, 0); + return NULL; + } + + char *s = MYALLOC(char, 5); + + str *strp = new_str(s, 5); + + for (int i = 0; i < sequence_size; i++) { + trie current_state = *((trie *) list_nth(states, 0)); + NUM current_value = *(data+i); + + if (encode(current_value, strp)) { + fleprintf("Fail to encode %ld at i = %d\n", current_value, i); + /* cleanup */ + destroy_str(strp, 1); + map_list(states, destroy_trie); + destroy_list(states, 0); + return NULL; + } + + int str_len = (int) str_length(strp); + s = get_data(strp); + unsigned char c = 0; + for (int j = 0; j < (str_len-1); j++) { + c = (unsigned char) *(s+j); + if (*(current_state.children+c) == DFA_STATE_UNKNOWN) { + /* a new state should be created */ + state_pointer = MYALLOC(trie, 1); + INIT_TRIE(state_pointer, list_length(states)); + *(current_state.children+c) = list_length(states); + if (add_to_list(states, state_pointer)) { + /* cleanup */ + destroy_str(strp, 1); + destroy_trie(state_pointer); + map_list(states, destroy_trie); + destroy_list(states, 0); + return NULL; + } + } + + current_state = + *((trie *) list_nth(states, *(current_state.children+c))); + } + + if (str_len) { + /* the last byte leads to the accepting state */ + c = (unsigned char) *(s+str_len-1); + + /* In the trie's that we will be forming, there will be no leaf + that is also the parent of another node, by the design of + UTF-8. */ + + *(current_state.children+c) = DFA_STATE_ACCEPT; + } + + str_set_length(strp, 5); + } + + for (int i = 0; i < neg_sequence_size; i++) { + trie current_state = *((trie *) list_nth(states, 0)); + NUM current_value = *(negdata+i); + + if (encode(current_value, strp)) { + fleprintf("Fail to encode %ld at i = %d\n", current_value, i); + /* cleanup */ + destroy_str(strp, 1); + map_list(states, destroy_trie); + destroy_list(states, 0); + return NULL; + } + + int str_len = (int) str_length(strp); + s = get_data(strp); + unsigned char c = 0; + for (int j = 0; j < (str_len-1); j++) { + c = (unsigned char) *(s+j); + if (*(current_state.children+c) == DFA_STATE_UNKNOWN) { + /* a new state should be created */ + state_pointer = MYALLOC(trie, 1); + INIT_TRIE(state_pointer, list_length(states)); + *(current_state.children+c) = list_length(states); + if (add_to_list(states, state_pointer)) { + /* cleanup */ + fleprintf("Fail to add to list when i = %d, j = %d, " + "and current value = %ld\n", + i, j, current_value); + destroy_str(strp, 1); + destroy_trie(state_pointer); + map_list(states, destroy_trie); + destroy_list(states, 0); + return NULL; + } + } + + current_state = + *((trie *) list_nth(states, *(current_state.children+c))); + } + + if (str_len) { + /* the last byte leads to the rejecting state */ + c = (unsigned char) *(s+str_len-1); + + /* In the trie's that we will be forming, there will be no leaf + that is also the parent of another node, by the design of + UTF-8. */ + + *(current_state.children+c) = DFA_STATE_REJECT; + } + + str_set_length(strp, 5); + } + + destroy_str(strp, 1); + + /* For debugging */ + + /* print_tries(states); */ + + /* construct DFA from the trie */ + + NUM states_len = list_length(states); + + dfa *dfap = new_dfa(); + + dfap->data.table.table = MYALLOC(int *, states_len); + dfap->data.table.row_n = states_len; + + for (NUM i = 0; i < states_len; i++) { + int size = 1, current = -1, last = -1, current_length = 0; + + for (int j = 0; j < MAX_CHAR_BYTES_NUM; j++) { + current = *(((trie *) list_nth(states, i))->children+j); + if (current != last) size++; + last = current; + } + + *(dfap->data.table.table+i) = MYALLOC(int, size<<1); + + current = (last = -1); + + size = 1; + + for (int j = 0; j < MAX_CHAR_BYTES_NUM; j++) { + current = *(((trie *) list_nth(states, i))->children+j); + if (current != last) { + *(*(dfap->data.table.table+i)+(size<<1)-2) = last; + *(*(dfap->data.table.table+i)+(size<<1)-1) = current_length; + + size++; + last = current; + current_length = 1; + + if (j == MAX_CHAR_BYTES_NUM-1) { + *(*(dfap->data.table.table+i)+(size<<1)-2) = last; + *(*(dfap->data.table.table+i)+(size<<1)-1) = 1; + } + } else if (j == MAX_CHAR_BYTES_NUM-1) { + *(*(dfap->data.table.table+i)+(size<<1)-2) = current; + *(*(dfap->data.table.table+i)+(size<<1)-1) = current_length+1; + } else { + current_length++; + } + } + } + + /* cleanup the trie */ + map_list(states, destroy_trie); + destroy_list(states, 0); + + return dfap; +} + +BOOL +run_dfa(const dfa * const restrict table, const NUM code) +{ + switch (table->type) { + case DFA_TYPE_POSITIVE: + case DFA_TYPE_NEGATIVE: + case DFA_TYPE_BOTH: + break; + default: + return table->data.fn(code); + break; + } + + char *s = MYALLOC(char, 5); + str *strp = new_str(s, 5); + + if (encode(code, strp)) { + destroy_str(strp, 1); + fleprintf0("Fail to encode\n"); + return 0; + } + + int current_row = 0; + + for (NUM i = 0; i < str_length(strp); i++) { + unsigned char c = (unsigned char) *(s+i); + int lookup_value = 0; + + for (int j = 0, len = 0; len <= c;) { + len += *(*(table->data.table.table+current_row)+(j<<1)+1); + lookup_value = *(*(table->data.table.table+current_row) + +(j++<<1)); + } + + /* fleprintf("lookup_value = %d\n", lookup_value); */ + + if (lookup_value > 0) { + current_row = lookup_value; + } else { + switch (lookup_value) { + case DFA_STATE_REJECT: + destroy_str(strp, 1); + return 0; + break; + case DFA_STATE_UNKNOWN: + switch (table->type) { + case DFA_TYPE_BOTH: + case DFA_TYPE_POSITIVE: + destroy_str(strp, 1); + return 0; + break; + case DFA_TYPE_NEGATIVE: + destroy_str(strp, 1); + return 1; + break; + default: + /* This should not be possible, as we won't reach here if + the table is of the special type. But I am apparently + paranoia. */ + fleprintf0("Special table in the wrong place\n"); + destroy_str(strp, 1); + return 0; + break; + } + break; + case DFA_STATE_ACCEPT: + destroy_str(strp, 1); + return 1; + break; + default: + fleprintf("Unknown state number for char %d: %d\n", + c, lookup_value); + destroy_str(strp, 1); + return 0; + break; + } + } + } + + destroy_str(strp, 1); + + return 0; +} diff --git a/src/dfa.h b/src/dfa.h new file mode 100644 index 0000000..a22409f --- /dev/null +++ b/src/dfa.h @@ -0,0 +1,64 @@ +#include "util.h" +#ifndef DFA_H +#define DFA_H + +/* See the comments at the beginning of grammar.h for some backgrounds + about this file. */ + +/* See the following Wikipedia link for details on Run-Length + Encoding. <https://en.wikipedia.org/wiki/Run-length_encoding> */ + +/* Maximal character bytes value */ + +enum { MAX_CHAR_BYTES_NUM = 256 }; + +/* Hard-coded state numbers */ + +enum { + DFA_STATE_UNKNOWN = -1, + DFA_STATE_ACCEPT = -2, + DFA_STATE_REJECT = -3, +}; + +/* dfa type */ + +typedef BOOL (* special_dfa) (const NUM code); + +typedef struct dfa_s dfa; + +typedef struct compressed_table_s compressed_table; + +dfa *new_dfa(); + +void destroy_dfa(dfa *table); + +void print_dfa(const dfa * const restrict table); + +dfa *dfa_from_bytes(int sequence_size, + const NUM * const restrict data); + +dfa *dfa_from_bytes_neg(int sequence_size, + const NUM * const restrict data); + +dfa *dfa_from_bytes_both(int sequence_size, + const NUM * const restrict data, + int neg_sequence_size, + const NUM * const restrict negdata); + +/* TODO: Reject character bytes from a given DFA. */ + +/* NOTE: Add all unicode valid points to a DFA, so that we can + represent the ANY class. + + After having done so, this costs around 16K memory. This is not so + satisfactory, as all these memory are just to serve as a ANY + character class, which is too excessive. So I extend the DFA by a + special type. */ + +/* TODO: Construct some basic frequently used character classes. */ + +inline BOOL dfa_any_fun(const NUM UNUSED code) { return 1; } + +BOOL run_dfa(const dfa * const restrict table, const NUM code); + +#endif diff --git a/src/grammar.c b/src/grammar.c index 4052510..508c8d8 100644 --- a/src/grammar.c +++ b/src/grammar.c @@ -104,14 +104,14 @@ new_tnt(int type, ...) } Rule * -new_rule(NT left, List *right) +new_rule(NT left, const List * const right) { if (!right) return NULL; Rule *rule = MYALLOC(Rule, 1); rule->left = left; - rule->right = right; + rule->right = (List *) right; return rule; } @@ -124,10 +124,10 @@ new_grammar() } /* We classify the rules into different rule groups. */ -unsigned char -build_grammar(Grammar *g, List *rules, List *names) +BOOL +build_grammar(Grammar *g, const List *rules, const List * const names) { - g->names = names; + g->names = (List *) names; NUM len = list_length(names); NUM rule_len = list_length(rules); @@ -246,11 +246,12 @@ print_rule(void *r) } NUM -find_in_cpa_list(NUM *string, NUM size, List *list) +find_in_cpa_list(const NUM * const restrict string, NUM size, + const List * const restrict list) { NUM len = list_length(list), index = 0; - unsigned char foundp = 0; + BOOL foundp = 0; for (; !foundp && index < len;) { cpa *cpap = list_nth(list, index); @@ -304,7 +305,7 @@ print_name(void *element) /* REVIEW: Print the names of non-terminals out, instead of printing the numbers? */ void -print_grammar(Grammar *g) +print_grammar(const Grammar * const g) { printf("Printing a grammar:\n"); map_list_between(g->names, print_name, print_sep); diff --git a/src/grammar.h b/src/grammar.h index 05bc1fc..28f8fd8 100644 --- a/src/grammar.h +++ b/src/grammar.h @@ -8,14 +8,45 @@ /* The class file for grammars */ +/* TODO: Terminals should be extended to incorporate the "predicate + terminals", such as character classes. */ + /* A grammar is a list of rules. A rule replaces a non-terminal with a finite string of terminals and non-terminals. - For us, a terminal is a positive integer, and a non-terminal is a - negative integer. Since the signs of these two types is fixed, we - represent them as unsigned integers. Since we want to deal with - unicode points, the terminal is stored as unsigned long, so that we - can include all unicode poitns. */ + For us, a terminal is a positive integer, and a non-terminal is + also an integer. We represent them as unsigned integers. Since we + want to deal with unicode points, the terminal is stored as + unsigned long, so that we can include all unicode poitns. + + Moreover, a terminal might be a "predicate terminal", that is, it + does not refer to one character only. Instead, it refers to a set + of characters. The most efficient way of recognizing a set of + characters, of which I know, is to use a deterministic finite + automaton. So we represent a predicate terminal as a DFA. + + Whereas the DFA's are equivalent to regular expressions in the + expressive power, I do not need full support for regular + expressions for the predicate terminals. In particular, I only + want to match a character for one predicate terminal, so there are + no such things as repititive and optional constructs in the DFA. + Rather, each predicate terminal is a simple table, or a matrix, + which must have exactly 256 columns and an indefinite number of + rows. The columns represent the input character's bytes to match, + and the rows are the states of the DFA. The table must have at + least one row: the starting state is always the zeroth row and the + accepting state is any state that is out of bounds: either greater + than or equal to the number of rows or negative. + + Of course, it is inefficient to keep the whoe table within the + memory, especially since the most part of the table would be + unused. So we store a DFA in a compressed format. I have + considered whether to apply some more advanced compression + techniques, such as using BW transoform followed by the MTF + algorithm, but in the end I think that the DFA's that will be used + are oft times just small tables, at least I hope so, thus I decided + to simply use the Run-Length-Encoding. See the file dfa.h for + implementation details. */ typedef unsigned long T; typedef unsigned NT; @@ -41,7 +72,9 @@ typedef struct Grammar_s Grammar; /* G is expected to be allocated before this function. NAMES is a list of CPA pointers. */ -unsigned char build_grammar(Grammar *g, List *rules, List *names); +BOOL +build_grammar(Grammar *g, + const List *rules, const List * const names); /* This accepts one and only one optional argument, which is of type either T or NT, depending on TYPE being 0 or not. */ @@ -56,7 +89,9 @@ struct cpa_s { typedef struct cpa_s cpa; -NUM find_in_cpa_list(NUM *string, NUM size, List *list); +NUM +find_in_cpa_list(const NUM * const restrict string, NUM size, + const List * const restrict list); /* For debugging purposes, we need to print out rules to examine their values. */ @@ -69,7 +104,7 @@ void print_tnt(void *element); void print_rule(void *r); /* This will generate errors if G is not a list of rules. */ -void print_grammar(Grammar *g); +void print_grammar(const Grammar * const g); /* constructors */ @@ -85,7 +120,7 @@ TNT *new_tnt_pointer(size_t size); /* RIGHT should be created by new_tnt_string or something alike. If RIGHT is NULL, NULL is returned. */ -Rule *new_rule(NT left, List *right); +Rule *new_rule(NT left, const List * const right); Grammar *new_grammar(); @@ -32,7 +32,7 @@ new_list() } H_ATTR -unsigned char +BOOL add_to_list(List *ls, void *element) { if (!(ls->capacity)) { @@ -141,10 +141,10 @@ copy_num(void *p) return pointer; } -unsigned char +BOOL copy_list(List *dest, List *source, copyer copyf) { - unsigned char sign = 0; + BOOL sign = 0; if ((sign = list_assure_size(dest, source->len))) return sign; @@ -162,19 +162,19 @@ copy_list(List *dest, List *source, copyer copyf) H_ATTR void * -list_nth(List *ls, NUM n) +list_nth(const List * const ls, NUM n) { return *(ls->array+n); } H_ATTR NUM -list_length(List *ls) +list_length(const List * const restrict ls) { return ls->len; } -unsigned char +BOOL list_assure_size(List *ls, NUM size) { if (ls->capacity >= size) return 0; /* we are good */ @@ -203,7 +203,7 @@ list_assure_size(List *ls, NUM size) return 0; } -unsigned char +BOOL list_set_length(List *ls, NUM len) { list_assure_size(ls, len); @@ -254,7 +254,7 @@ array_to_list(void **array, NUM size) } void -destroy_list(List *ls, unsigned char all_free_p) +destroy_list(List *ls, BOOL all_free_p) { if (all_free_p == 1) for (NUM i = 0; i < ls->len; i++) @@ -17,7 +17,7 @@ List *new_list(); /* add an element to the end of the list */ /* upon failure return non-zero */ -unsigned char add_to_list(List *ls, void *element); +BOOL add_to_list(List *ls, void *element); /* pop an element from the end of the list, and return that element */ /* upon failure return NULL */ @@ -43,23 +43,23 @@ typedef void *(*copyer)(void *); void *copy_num(void *); /* upon failure return 1 */ -unsigned char copy_list(List *dest, List *source, copyer copyf); +BOOL copy_list(List *dest, List *source, copyer copyf); -void *list_nth(List *ls, NUM n); +void *list_nth(const List * const ls, NUM n); -NUM list_length(List *ls); +NUM list_length(const List * const restrict ls); /* Make sure the list has at least SIZE slots to use. This should only be used to create fixed capacity arrays, otherwise we risk frequently re-allocating and hence losing performance. */ /* Upon failure return non-zero. */ -unsigned char list_assure_size(List *ls, NUM size); +BOOL list_assure_size(List *ls, NUM size); /* This is mainly used to set the length of a sparse list, since only when dealing with sparse lists do we not need to care about the elements. */ /* Upon failure return non-zero. */ -unsigned char list_set_length(List *ls, NUM len); +BOOL list_set_length(List *ls, NUM len); /* Convert a list to an array. @@ -79,7 +79,7 @@ List *array_to_list(void ** array, NUM size); /* destroy the list: If ALL_FREE_P is 1, this frees every void pointers contained in the list; if it is 2, this frees the first pointer. In any case, the list is de-allocated. */ -void destroy_list(List *ls, unsigned char all_free_p); +void destroy_list(List *ls, BOOL all_free_p); void destroy_list_free_all(void *ls); void destroy_list_free_first(void *ls); diff --git a/src/reader.c b/src/reader.c index 18435a9..d715726 100644 --- a/src/reader.c +++ b/src/reader.c @@ -21,14 +21,15 @@ /* Read a string of TNT's into TNTS. TNTS is expected to be already allocated. */ -static unsigned char -read_tnt_string(str *s, NUM start, NUM end, TNT *tnts, List *cpa_list) +static BOOL +read_tnt_string(const str * const restrict s, NUM start, NUM end, + TNT *tnts, List *cpa_list) { str_info info; /* fleprintf("S = %lu, E = %lu\n", start, end); */ - unsigned char readingp = 0, stop = 0; + BOOL readingp = 0, stop = 0, double_quote_p = 0; NUM *string = NULL, string_size = 0, nt_index = 0, tnt_count = 0; @@ -73,14 +74,15 @@ read_tnt_string(str *s, NUM start, NUM end, TNT *tnts, List *cpa_list) } break; case 34: + double_quote_p = 1; case 39: /* read a string */ strindex = index + info.step; strinfo = get_info(s, strindex); for (;strindex < end && - strinfo.value != 34 && - strinfo.value != 39 && + ((double_quote_p && strinfo.value != 34) || + (!double_quote_p && strinfo.value != 39)) && strinfo.value != 10 && strinfo.value != 13;) { strinfo = get_info(s, strindex); @@ -88,7 +90,7 @@ read_tnt_string(str *s, NUM start, NUM end, TNT *tnts, List *cpa_list) if (strinfo.value == 92) { strindex += info.step; - unsigned char local_exit = 0; + BOOL local_exit = 0; /* The following errors should not happen. I am just paranoid. */ @@ -118,6 +120,8 @@ read_tnt_string(str *s, NUM start, NUM end, TNT *tnts, List *cpa_list) strindex -= strinfo.step; tnt_count--; + double_quote_p = 0; + index = strindex; info = get_info(s, index); break; @@ -166,6 +170,7 @@ read_tnt_string(str *s, NUM start, NUM end, TNT *tnts, List *cpa_list) return 0; } +UNUSED static void print_int(void *p) { @@ -174,7 +179,7 @@ print_int(void *p) /* Read a grammar from a string in the format of BNF notation. */ Grammar * -read_grammar_from_bnf(str *s) +read_grammar_from_bnf(const str * const restrict s) { NUM len = str_length(s); @@ -186,7 +191,7 @@ read_grammar_from_bnf(str *s) List *line_indices = new_list(); - unsigned char definitionp = 1, readingp = 0; + BOOL definitionp = 1, readingp = 0; /* last index is used to collect strings */ for (NUM index = 0, last_index = 0; index < len;) { @@ -205,7 +210,7 @@ read_grammar_from_bnf(str *s) case 9: readingp = 0; - if (definitionp) { + if (definitionp && readingp) { fleprintf("%lu, A non-terminal cannot have spaces in " "the name\n", index); /* cleanup */ @@ -257,9 +262,9 @@ read_grammar_from_bnf(str *s) index += info.step; } - printf("%s:%d, print indices\n", __FILE__, __LINE__); - map_list(line_indices, print_int); - printf("\n"); + /* printf("%s:%d, print indices\n", __FILE__, __LINE__); + * map_list(line_indices, print_int); + * printf("\n"); */ /* second round collects all the rules */ @@ -267,7 +272,7 @@ read_grammar_from_bnf(str *s) NUM line_count = 0; - unsigned char right_start_p = 0; + BOOL right_start_p = 0, double_quote_p = 0; definitionp = 1, readingp = 0; @@ -286,6 +291,7 @@ read_grammar_from_bnf(str *s) switch (info.value) { /* the quote characters */ case 34: /* double quote */ + double_quote_p = 1; case 39: /* single quote */ if (definitionp) { fleprintf("%lu, A non-terminal cannot have quotes in " @@ -330,8 +336,8 @@ read_grammar_from_bnf(str *s) for (;strindex < len && strinfo.value != 13 && strinfo.value != 10 && - strinfo.value != 34 && - strinfo.value != 39;) { + ((double_quote_p && strinfo.value != 34) || + (!double_quote_p && strinfo.value != 39));) { strinfo = get_info(s, strindex); /* escape character */ @@ -339,7 +345,7 @@ read_grammar_from_bnf(str *s) strindex += strinfo.step; if (strindex < len) strinfo = get_info(s, strindex); - unsigned char local_error_p = 0; + BOOL local_error_p = 0; if (strindex >= len) { local_error_p = 1; @@ -395,8 +401,8 @@ read_grammar_from_bnf(str *s) return NULL; } } else { - if (strinfo.value != 34 && - strinfo.value != 39) { + if ((double_quote_p && strinfo.value != 34) || + (!double_quote_p && strinfo.value != 39)) { fleprintf0("input ended before string is left\n"); /* cleanup */ map_list(names, destroy_cpa_and_free_all); @@ -408,15 +414,16 @@ read_grammar_from_bnf(str *s) destroy_list(rules, 0); return NULL; } - } } + double_quote_p = 0; + break; /* spaces and tabs are ignored */ case ' ': case 9: - if (definitionp) { + if (definitionp && readingp) { fleprintf("%lu, A non-terminal cannot have spaces or " "tabs in the name\n", index); /* cleanup */ @@ -435,6 +442,11 @@ read_grammar_from_bnf(str *s) break; case '\n': case 13: + if (definitionp && !readingp) { + /* empty line */ + break; + } + definitionp = 1; readingp = 0; line_count++; @@ -461,13 +473,13 @@ read_grammar_from_bnf(str *s) return NULL; } - printf("%s:%d:%lu, Printing a TNT string: ", - __FILE__, __LINE__, index); - for (int i = 0; i < (int) tnt_count;) { - print_tnt(current_tnt_string+i++); - if (i<tnt_count) printf(", "); - } - printf("\n"); + /* printf("%s:%d:%lu, Printing a TNT string: ", + * __FILE__, __LINE__, index); */ + /* for (int i = 0; i < (int) tnt_count;) { + * print_tnt(current_tnt_string+i++); + * if (i<tnt_count) printf(", "); + * } + * printf("\n"); */ temp_pointers = MYALLOC(void *, tnt_count); diff --git a/src/reader.h b/src/reader.h index 65ee7ef..4351055 100644 --- a/src/reader.h +++ b/src/reader.h @@ -5,6 +5,6 @@ #include "utf8.h" #include "grammar.h" -Grammar *read_grammar_from_bnf(str *s); +Grammar *read_grammar_from_bnf(const str * const restrict s); #endif @@ -4,7 +4,7 @@ H_ATTR str_info -_info_getter(str *s, UNUM n) +_info_getter(const str * const restrict s, UNUM n) { return (str_info) { (NUM) *(s->data+n), 1 }; } @@ -41,14 +41,14 @@ destroy_str(str *s, int flag) H_ATTR str_info -get_info(str *s, UNUM n) +get_info(const str * const restrict s, UNUM n) { return s->getter(s, n); } H_ATTR NUM -str_length(str *s) +str_length(const str * const restrict s) { return s->size; } @@ -60,7 +60,7 @@ change_str_content(str *s, char *str, NUM size) s->size = size; } -unsigned char +BOOL copy_str(str *dest, str *source) { if (str_length(dest) < str_length(source)) @@ -74,7 +74,7 @@ copy_str(str *dest, str *source) } char * -get_data(str *s) +get_data(const str * const restrict s) { return s->data; } @@ -20,16 +20,16 @@ typedef struct str_info_s str_info; #define EMPTY_STR_INFO ((str_info) { 0, 1 }) -str_info get_info(str *, UNUM); +str_info get_info(const str * const restrict s, UNUM n); -char *get_data(str *s); +char *get_data(const str * const restrict s); -NUM str_length(str *); +NUM str_length(const str * const restrict s); void str_set_length(str *s, UNUM size); void change_str_content(str *, char *, NUM); -unsigned char copy_str(str *dest, str *source); +BOOL copy_str(str *dest, str *source); #endif diff --git a/src/str_partial.h b/src/str_partial.h index 9d4fbd6..2373555 100644 --- a/src/str_partial.h +++ b/src/str_partial.h @@ -3,7 +3,7 @@ /* This is meant to be extended, and only has minimal necessary fields. */ -typedef str_info (*info_getter) (str *, UNUM); +typedef str_info (*info_getter) (const str * const restrict , UNUM); struct str_s { UNUM size; /* the size in bytes, not in chars */ diff --git a/src/test.txt b/src/test.txt index be9b3ba..27af3ad 100644 --- a/src/test.txt +++ b/src/test.txt @@ -1,9 +1,25 @@ -S: A A LONG C 奎佑 -A: B B B -B: A -A: -LONG: B A -C: B C A -B: B B -奎佑: "handsome" -A: A B A +S: S "+" T +S: T +T: T "*" F +T: F +F: "(" S ")" +F: "0" +F: "1" +F: "2" +F: "3" +F: "4" +F: "5" +F: "6" +F: "7" +F: "8" +F: "9" +F: "0" F +F: "1" F +F: "2" F +F: "3" F +F: "4" F +F: "5" F +F: "6" F +F: "7" F +F: "8" F +F: "9" F diff --git a/src/test/check_dfa.c b/src/test/check_dfa.c new file mode 100644 index 0000000..a728fb8 --- /dev/null +++ b/src/test/check_dfa.c @@ -0,0 +1,108 @@ +#include "../utf8.h" +#include "../str.h" +#include <stdio.h> +#include "../util.h" +#include "../dfa.h" + +#define ADD_RANGE(X, Y) do { \ + for (int i = 0; i + (X) <= (Y); i++) \ + *(v+vlen++) = (NUM) i + (X); \ + } while (0) + +int +main(int UNUSED argc, char ** UNUSED argv) +{ + UNUM vlen = 0; + NUM *v = MYALLOC(NUM, 1<<21); + + ADD_RANGE(0x0020, 0x007f); + ADD_RANGE(0x2580, 0x259f); + ADD_RANGE(0x00a0, 0x00ff); + ADD_RANGE(0x25a0, 0x25ff); + ADD_RANGE(0x0100, 0x017f); + ADD_RANGE(0x2600, 0x26ff); + ADD_RANGE(0x0180, 0x024f); + ADD_RANGE(0x2700, 0x27bf); + ADD_RANGE(0x0250, 0x02af); + ADD_RANGE(0x27c0, 0x27ef); + ADD_RANGE(0x02b0, 0x02ff); + ADD_RANGE(0x27f0, 0x27ff); + ADD_RANGE(0x0300, 0x036f); + ADD_RANGE(0x2800, 0x28ff); + ADD_RANGE(0x0370, 0x03ff); + ADD_RANGE(0x2900, 0x297f); + ADD_RANGE(0x0400, 0x04ff); + ADD_RANGE(0x2980, 0x29ff); + ADD_RANGE(0x0500, 0x052f); + ADD_RANGE(0x2a00, 0x2aff); + ADD_RANGE(0x0530, 0x058f); + ADD_RANGE(0x2b00, 0x2bff); + ADD_RANGE(0x0590, 0x05ff); + ADD_RANGE(0x2e80, 0x2eff); + ADD_RANGE(0x0600, 0x06ff); + ADD_RANGE(0x2f00, 0x2fdf); + ADD_RANGE(0x0700, 0x074f); + ADD_RANGE(0x2ff0, 0x2fff); + ADD_RANGE(0x0780, 0x07bf); + ADD_RANGE(0x3000, 0x303f); + ADD_RANGE(0x0900, 0x097f); + ADD_RANGE(0x3040, 0x309f); + ADD_RANGE(0x0980, 0x09ff); + ADD_RANGE(0x30a0, 0x30ff); + ADD_RANGE(0x0a00, 0x0a7f); + ADD_RANGE(0x3100, 0x312f); + ADD_RANGE(0x0a80, 0x0aff); + ADD_RANGE(0x3130, 0x318f); + ADD_RANGE(0x0b00, 0x0b7f); + ADD_RANGE(0x3190, 0x319f); + ADD_RANGE(0x0b80, 0x0bff); + ADD_RANGE(0x31a0, 0x31bf); + ADD_RANGE(0x0c00, 0x0c7f); + ADD_RANGE(0x31f0, 0x31ff); + ADD_RANGE(0x0c80, 0x0cff); + ADD_RANGE(0x3200, 0x32ff); + ADD_RANGE(0x0d00, 0x0d7f); + ADD_RANGE(0x3300, 0x33ff); + ADD_RANGE(0x0d80, 0x0dff); + ADD_RANGE(0x3400, 0x4dbf); + ADD_RANGE(0x0e00, 0x0e7f); + ADD_RANGE(0x4dc0, 0x4dff); + ADD_RANGE(0x0e80, 0x0eff); + ADD_RANGE(0x4e00, 0x9fff); + ADD_RANGE(0x0f00, 0x0fff); + + /* for (int i = 0; i < 190; i++) + * printf("v [%d] = %d\n", i, *(v+i)); */ + + dfa *dfap = dfa_from_bytes(vlen, v); + + free(v); + + if (dfap == NULL) + printf("returned null\n"); + else + print_dfa(dfap); + + /* char *s = MYALLOC(char, 5); + * str *strp = new_str(s, 5); + * + * for (int i = 0; i < 3; i++) { + * printf("\n\n"); + * + * encode(v[i], strp); + * + * for (int j = 0; j < str_length(strp)-1;) + * printf("%d, ", *(s+j++)+(1<<8)); + * + * printf("%d\n", *(s+str_length(strp)-1)+(1<<8)); + * } + * + * destroy_str(strp, 1); */ + + /* BOOL result = run_dfa(dfap, 32239); + * + * printf("the result = %d\n", result); */ + + if (dfap) destroy_dfa(dfap); + return 0; +} @@ -41,7 +41,7 @@ decode(UTF8_State *state, uint32_t *codep, uint8_t byte) } str_info -utf8_get_info(str *s, UNUM n) +utf8_get_info(const str * const restrict s, UNUM n) { UTF8_State state = UTF8_STATE_ACCEPT; uint32_t code = 0; @@ -85,7 +85,7 @@ new_utf8(char *string, UNUM size) } /* result should be long enough to hold the data */ -unsigned char +BOOL encode(NUM code_point, str *result) { /* calculate the number of bits of CODE_POINT */ @@ -20,6 +20,6 @@ typedef enum UTF8_State_e UTF8_State; utf8 *new_utf8(char *string, UNUM size); -unsigned char encode(NUM code_point, str *result); +BOOL encode(NUM code_point, str *result); #endif @@ -12,7 +12,7 @@ Note that the length of the string str is actually 1+length, and the last character is the null character. */ -unsigned char +BOOL read_entire_file(const char *file_name, char **str, NUM *len) { long file_size = 0; @@ -10,6 +10,8 @@ typedef long DATA; typedef long NUM; typedef unsigned long long UNUM; /* definitely bigger than size_t */ +typedef unsigned char BOOL; + #define HC_ATTR __attribute__((__hot__, __const__)) #define H_ATTR __attribute__((__hot__)) @@ -31,6 +33,6 @@ typedef unsigned long long UNUM; /* definitely bigger than size_t */ #define fleprintf0(M) eprintf("%s:%d, " M, __FILE__, __LINE__) #define fleprintf(M, ...) eprintf("%s:%d, " M, __FILE__, __LINE__, __VA_ARGS__) -unsigned char read_entire_file(const char *file_name, char **str, NUM *len); +BOOL read_entire_file(const char *file_name, char **str, NUM *len); #endif |