From 91016bd3855375796fd1eabd501ebc12491f2655 Mon Sep 17 00:00:00 2001 From: JSDurand Date: Tue, 11 Jan 2022 09:58:43 +0800 Subject: Add the framework for character classes. Now we have the potential to recognize character classes. But the most important task for us now is to experiment with ((B)RN)GLR algorithms, so we leave the character classes at the present state for a moment. --- src/Makefile.am | 8 +- src/dfa.c | 692 +++++++++++++++++++++++++++++++++++++++++++++++++++ src/dfa.h | 64 +++++ src/grammar.c | 17 +- src/grammar.h | 53 +++- src/list.c | 16 +- src/list.h | 14 +- src/reader.c | 66 +++-- src/reader.h | 2 +- src/str.c | 10 +- src/str.h | 8 +- src/str_partial.h | 2 +- src/test.txt | 34 ++- src/test/check_dfa.c | 108 ++++++++ src/utf8.c | 4 +- src/utf8.h | 2 +- src/util.c | 2 +- src/util.h | 4 +- 18 files changed, 1019 insertions(+), 87 deletions(-) create mode 100644 src/dfa.c create mode 100644 src/dfa.h create mode 100644 src/test/check_dfa.c (limited to 'src') diff --git a/src/Makefile.am b/src/Makefile.am index 4690876..f43fbee 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -2,9 +2,9 @@ AM_CFLAGS = -Wall -Wextra noinst_LIBRARIES = libeps.a libeps_a_SOURCES = grammar.c list.c util.c reader.c \ -str.c utf8.c lr_table.c \ +str.c utf8.c dfa.c \ grammar.h list.h util.h reader.h \ -str_partial.h str.h utf8.h lr_table.h +str_partial.h str.h utf8.h dfa.h libeps_a_CFLAGS = $(AM_CFLAGS) --pedantic @@ -21,7 +21,7 @@ CLEANFILES = TAGS # tests -check_PROGRAMS = check_list check_grammar check_reader +check_PROGRAMS = check_list check_grammar check_reader check_dfa check_list_SOURCES = test/check_list.c list.c @@ -31,6 +31,8 @@ utf8.c check_reader_SOURCES = test/check_reader.c list.c grammar.c reader.c \ str.c utf8.c util.c +check_dfa_SOURCES = test/check_dfa.c dfa.c list.c str.c utf8.c + TESTS = $(check_PROGRAMS) AM_COLOR_TESTS = always diff --git a/src/dfa.c b/src/dfa.c new file mode 100644 index 0000000..e3bcdf0 --- /dev/null +++ b/src/dfa.c @@ -0,0 +1,692 @@ +#include "utf8.h" +#include "str.h" +#include +#include "list.h" +#include "dfa.h" + +#define INIT_TRIE(X, ID) do { \ + X->id = ID; \ + X->children = MYALLOC(int, MAX_CHAR_BYTES_NUM); \ + for (int macro_index = 0; macro_index < MAX_CHAR_BYTES_NUM;) \ + *(X->children+macro_index++) = DFA_STATE_UNKNOWN; \ + } while (0) + +/* The complete, uncompressed table is represented as an array of + arrays, each array of which has size MAX_CHAR_BYTES_NUM. The + compressed table is represented as run-lengths. The ROW_N is the + size of the total table. And the TABLE stores the data. It is an + array of arrays. The format of the array of a row is as follows: + + VALUE1 LEN1 VALUE2 LEN2 + + This means VALUE1 appears LEN1 times, and then VALUE2 appears LEN2 + times, and so on. Each array of a row ends when the lengths sum to + MAX_CHAR_BYTES_NUM. */ +struct compressed_table_s { + int row_n; + int **table; +}; + +enum DFA_TYPE { + DFA_TYPE_POSITIVE, + DFA_TYPE_NEGATIVE, + DFA_TYPE_BOTH, + DFA_TYPE_SPECIAL, +}; + +struct dfa_s { + enum DFA_TYPE type; + union { + compressed_table table; + special_dfa fn; + } data; +}; + +dfa * +new_dfa() +{ + dfa *dfap = MYALLOC(dfa, 1); + + dfap->type = 0; + + dfap->data.table.row_n = 0; + dfap->data.table.table = NULL; + + return dfap; +} + +void +destroy_dfa(dfa *table) +{ + /* function pointers need no destructors */ + if (table->type == DFA_TYPE_SPECIAL) return; + + for (int i = 0; i < table->data.table.row_n;) + free(*(table->data.table.table+i++)); + + free(table->data.table.table); + free(table); +} + +/* a trie structure */ + +typedef struct trie_s trie; + +/* REVIEW: Maybe we don't need ID? */ +struct trie_s { + /* It is guaranteed that the number of children is + MAX_CHAR_BYTES_NUM. */ + int *children; + int id; +}; + +static void +destroy_trie(void *t) +{ + trie *tp = (trie *)t; + free(tp->children); + free(tp); +} + +/* Print a list of Trie's. */ +UNUSED +static void +print_tries(const List * const restrict ls) +{ + NUM len = list_length(ls); + for (NUM i = 0; i < len; i++) { + trie *t = (trie *) list_nth(ls, i); + printf("Trie ID: %d:\n", t->id); + + BOOL has_children_p = 0; + + for (int j = 0; j < MAX_CHAR_BYTES_NUM; j++) { + if (*(t->children+j) != -1) { + has_children_p = 1; + printf(" " + "Char %d => State %d\n", j, *(t->children+j)); + } + } + + if (!has_children_p) printf(" " "No children!\n"); + + printf("\n"); + } +} + +void +print_dfa(const dfa * const restrict table) +{ + switch (table->type) { + case DFA_TYPE_POSITIVE: + printf("Printing a positive table:\n"); + break; + case DFA_TYPE_NEGATIVE: + printf("Printing a negative table:\n"); + break; + case DFA_TYPE_BOTH: + printf("Printing a mixed table:\n"); + break; + default: + printf("A special table cannot be printed\n"); + return; + break; + } + + printf("Printing a table:\n"); + + for (int i = 0; i < table->data.table.row_n; i++) { + printf("Row %d: ", i); + int size = 0; + for (int j = 0; size < MAX_CHAR_BYTES_NUM; j++) { + printf("%d %d", *(*(table->data.table.table+i)+(j<<1)), + *(*(table->data.table.table+i)+(j<<1)+1)); + size += *(*(table->data.table.table+i)+(j<<1)+1); + + if (size < MAX_CHAR_BYTES_NUM) printf(", "); + else printf("\n"); + } + } + printf("\n"); +} + +/* REVIEW: Maybe we don't want to generate the complete trie in the + process of generating the compressed table? */ +dfa * +dfa_from_bytes(int sequence_size, + const NUM * const restrict data) +{ + List *states = new_list(); + + trie *state_pointer = MYALLOC(trie, 1); + + INIT_TRIE(state_pointer, 0); + + if (add_to_list(states, state_pointer)) { + fleprintf0("Cannot add to list\n"); + destroy_trie(state_pointer); + map_list(states, destroy_trie); + destroy_list(states, 0); + return NULL; + } + + char *s = MYALLOC(char, 5); + + str *strp = new_str(s, 5); + + for (int i = 0; i < sequence_size; i++) { + trie current_state = *((trie *) list_nth(states, 0)); + NUM current_value = *(data+i); + + if (encode(current_value, strp)) { + fleprintf("Fail to encode %ld at i = %d\n", current_value, i); + /* cleanup */ + destroy_str(strp, 1); + map_list(states, destroy_trie); + destroy_list(states, 0); + return NULL; + } + + int str_len = (int) str_length(strp); + s = get_data(strp); + unsigned char c = 0; + for (int j = 0; j < (str_len-1); j++) { + c = (unsigned char) *(s+j); + if (*(current_state.children+c) == DFA_STATE_UNKNOWN) { + /* a new state should be created */ + state_pointer = MYALLOC(trie, 1); + INIT_TRIE(state_pointer, list_length(states)); + *(current_state.children+c) = list_length(states); + if (add_to_list(states, state_pointer)) { + /* cleanup */ + destroy_str(strp, 1); + destroy_trie(state_pointer); + map_list(states, destroy_trie); + destroy_list(states, 0); + return NULL; + } + } + + current_state = + *((trie *) list_nth(states, *(current_state.children+c))); + } + + if (str_len) { + /* the last byte leads to the accepting state */ + c = (unsigned char) *(s+str_len-1); + + /* In the trie's that we will be forming, there will be no leaf + that is also the parent of another node, by the design of + UTF-8. */ + + *(current_state.children+c) = DFA_STATE_ACCEPT; + } + + str_set_length(strp, 5); + } + + destroy_str(strp, 1); + + /* For debugging */ + + /* print_tries(states); */ + + /* construct DFA from the trie */ + + NUM states_len = list_length(states); + + dfa *dfap = new_dfa(); + + dfap->data.table.table = MYALLOC(int *, states_len); + dfap->data.table.row_n = states_len; + + for (NUM i = 0; i < states_len; i++) { + int size = 1, current = -1, last = -1, current_length = 0; + + for (int j = 0; j < MAX_CHAR_BYTES_NUM; j++) { + current = *(((trie *) list_nth(states, i))->children+j); + if (current != last) size++; + last = current; + } + + *(dfap->data.table.table+i) = MYALLOC(int, size<<1); + + current = (last = -1); + + size = 1; + + for (int j = 0; j < MAX_CHAR_BYTES_NUM; j++) { + current = *(((trie *) list_nth(states, i))->children+j); + if (current != last) { + *(*(dfap->data.table.table+i)+(size<<1)-2) = last; + *(*(dfap->data.table.table+i)+(size<<1)-1) = current_length; + + size++; + last = current; + current_length = 1; + + if (j == MAX_CHAR_BYTES_NUM-1) { + *(*(dfap->data.table.table+i)+(size<<1)-2) = last; + *(*(dfap->data.table.table+i)+(size<<1)-1) = 1; + } + } else if (j == MAX_CHAR_BYTES_NUM-1) { + *(*(dfap->data.table.table+i)+(size<<1)-2) = current; + *(*(dfap->data.table.table+i)+(size<<1)-1) = current_length+1; + } else { + current_length++; + } + } + } + + /* cleanup the trie */ + map_list(states, destroy_trie); + destroy_list(states, 0); + + return dfap; +} + +dfa * +dfa_from_bytes_neg(int sequence_size, + const NUM * const restrict data) +{ + List *states = new_list(); + + trie *state_pointer = MYALLOC(trie, 1); + + INIT_TRIE(state_pointer, 0); + + if (add_to_list(states, state_pointer)) { + fleprintf0("Cannot add to list\n"); + destroy_trie(state_pointer); + map_list(states, destroy_trie); + destroy_list(states, 0); + return NULL; + } + + char *s = MYALLOC(char, 5); + + str *strp = new_str(s, 5); + + for (int i = 0; i < sequence_size; i++) { + trie current_state = *((trie *) list_nth(states, 0)); + NUM current_value = *(data+i); + + if (encode(current_value, strp)) { + fleprintf("Fail to encode %ld at i = %d\n", current_value, i); + /* cleanup */ + destroy_str(strp, 1); + map_list(states, destroy_trie); + destroy_list(states, 0); + return NULL; + } + + int str_len = (int) str_length(strp); + s = get_data(strp); + unsigned char c = 0; + for (int j = 0; j < (str_len-1); j++) { + c = (unsigned char) *(s+j); + if (*(current_state.children+c) == DFA_STATE_UNKNOWN) { + /* a new state should be created */ + state_pointer = MYALLOC(trie, 1); + INIT_TRIE(state_pointer, list_length(states)); + *(current_state.children+c) = list_length(states); + if (add_to_list(states, state_pointer)) { + /* cleanup */ + destroy_str(strp, 1); + destroy_trie(state_pointer); + map_list(states, destroy_trie); + destroy_list(states, 0); + return NULL; + } + } + + current_state = + *((trie *) list_nth(states, *(current_state.children+c))); + } + + if (str_len) { + /* the last byte leads to the rejecting state */ + c = (unsigned char) *(s+str_len-1); + + /* In the trie's that we will be forming, there will be no leaf + that is also the parent of another node, by the design of + UTF-8. */ + + *(current_state.children+c) = DFA_STATE_REJECT; + } + + str_set_length(strp, 5); + } + + destroy_str(strp, 1); + + /* For debugging */ + + /* print_tries(states); */ + + /* construct DFA from the trie */ + + NUM states_len = list_length(states); + + dfa *dfap = new_dfa(); + + dfap->data.table.table = MYALLOC(int *, states_len); + dfap->data.table.row_n = states_len; + dfap->type = 1; + + for (NUM i = 0; i < states_len; i++) { + int size = 1, current = -1, last = -1, current_length = 0; + + for (int j = 0; j < MAX_CHAR_BYTES_NUM; j++) { + current = *(((trie *) list_nth(states, i))->children+j); + if (current != last) size++; + last = current; + } + + *(dfap->data.table.table+i) = MYALLOC(int, size<<1); + + current = (last = -1); + + size = 1; + + for (int j = 0; j < MAX_CHAR_BYTES_NUM; j++) { + current = *(((trie *) list_nth(states, i))->children+j); + if (current != last) { + *(*(dfap->data.table.table+i)+(size<<1)-2) = last; + *(*(dfap->data.table.table+i)+(size<<1)-1) = current_length; + + size++; + last = current; + current_length = 1; + + if (j == MAX_CHAR_BYTES_NUM-1) { + *(*(dfap->data.table.table+i)+(size<<1)-2) = last; + *(*(dfap->data.table.table+i)+(size<<1)-1) = 1; + } + } else if (j == MAX_CHAR_BYTES_NUM-1) { + *(*(dfap->data.table.table+i)+(size<<1)-2) = current; + *(*(dfap->data.table.table+i)+(size<<1)-1) = current_length+1; + } else { + current_length++; + } + } + } + + /* cleanup the trie */ + map_list(states, destroy_trie); + destroy_list(states, 0); + + return dfap; +} + +dfa *dfa_from_bytes_both(int sequence_size, + const NUM * const restrict data, + int neg_sequence_size, + const NUM * const restrict negdata) +{ + List *states = new_list(); + + trie *state_pointer = MYALLOC(trie, 1); + + INIT_TRIE(state_pointer, 0); + + if (add_to_list(states, state_pointer)) { + fleprintf0("Cannot add to list\n"); + destroy_trie(state_pointer); + map_list(states, destroy_trie); + destroy_list(states, 0); + return NULL; + } + + char *s = MYALLOC(char, 5); + + str *strp = new_str(s, 5); + + for (int i = 0; i < sequence_size; i++) { + trie current_state = *((trie *) list_nth(states, 0)); + NUM current_value = *(data+i); + + if (encode(current_value, strp)) { + fleprintf("Fail to encode %ld at i = %d\n", current_value, i); + /* cleanup */ + destroy_str(strp, 1); + map_list(states, destroy_trie); + destroy_list(states, 0); + return NULL; + } + + int str_len = (int) str_length(strp); + s = get_data(strp); + unsigned char c = 0; + for (int j = 0; j < (str_len-1); j++) { + c = (unsigned char) *(s+j); + if (*(current_state.children+c) == DFA_STATE_UNKNOWN) { + /* a new state should be created */ + state_pointer = MYALLOC(trie, 1); + INIT_TRIE(state_pointer, list_length(states)); + *(current_state.children+c) = list_length(states); + if (add_to_list(states, state_pointer)) { + /* cleanup */ + destroy_str(strp, 1); + destroy_trie(state_pointer); + map_list(states, destroy_trie); + destroy_list(states, 0); + return NULL; + } + } + + current_state = + *((trie *) list_nth(states, *(current_state.children+c))); + } + + if (str_len) { + /* the last byte leads to the accepting state */ + c = (unsigned char) *(s+str_len-1); + + /* In the trie's that we will be forming, there will be no leaf + that is also the parent of another node, by the design of + UTF-8. */ + + *(current_state.children+c) = DFA_STATE_ACCEPT; + } + + str_set_length(strp, 5); + } + + for (int i = 0; i < neg_sequence_size; i++) { + trie current_state = *((trie *) list_nth(states, 0)); + NUM current_value = *(negdata+i); + + if (encode(current_value, strp)) { + fleprintf("Fail to encode %ld at i = %d\n", current_value, i); + /* cleanup */ + destroy_str(strp, 1); + map_list(states, destroy_trie); + destroy_list(states, 0); + return NULL; + } + + int str_len = (int) str_length(strp); + s = get_data(strp); + unsigned char c = 0; + for (int j = 0; j < (str_len-1); j++) { + c = (unsigned char) *(s+j); + if (*(current_state.children+c) == DFA_STATE_UNKNOWN) { + /* a new state should be created */ + state_pointer = MYALLOC(trie, 1); + INIT_TRIE(state_pointer, list_length(states)); + *(current_state.children+c) = list_length(states); + if (add_to_list(states, state_pointer)) { + /* cleanup */ + fleprintf("Fail to add to list when i = %d, j = %d, " + "and current value = %ld\n", + i, j, current_value); + destroy_str(strp, 1); + destroy_trie(state_pointer); + map_list(states, destroy_trie); + destroy_list(states, 0); + return NULL; + } + } + + current_state = + *((trie *) list_nth(states, *(current_state.children+c))); + } + + if (str_len) { + /* the last byte leads to the rejecting state */ + c = (unsigned char) *(s+str_len-1); + + /* In the trie's that we will be forming, there will be no leaf + that is also the parent of another node, by the design of + UTF-8. */ + + *(current_state.children+c) = DFA_STATE_REJECT; + } + + str_set_length(strp, 5); + } + + destroy_str(strp, 1); + + /* For debugging */ + + /* print_tries(states); */ + + /* construct DFA from the trie */ + + NUM states_len = list_length(states); + + dfa *dfap = new_dfa(); + + dfap->data.table.table = MYALLOC(int *, states_len); + dfap->data.table.row_n = states_len; + + for (NUM i = 0; i < states_len; i++) { + int size = 1, current = -1, last = -1, current_length = 0; + + for (int j = 0; j < MAX_CHAR_BYTES_NUM; j++) { + current = *(((trie *) list_nth(states, i))->children+j); + if (current != last) size++; + last = current; + } + + *(dfap->data.table.table+i) = MYALLOC(int, size<<1); + + current = (last = -1); + + size = 1; + + for (int j = 0; j < MAX_CHAR_BYTES_NUM; j++) { + current = *(((trie *) list_nth(states, i))->children+j); + if (current != last) { + *(*(dfap->data.table.table+i)+(size<<1)-2) = last; + *(*(dfap->data.table.table+i)+(size<<1)-1) = current_length; + + size++; + last = current; + current_length = 1; + + if (j == MAX_CHAR_BYTES_NUM-1) { + *(*(dfap->data.table.table+i)+(size<<1)-2) = last; + *(*(dfap->data.table.table+i)+(size<<1)-1) = 1; + } + } else if (j == MAX_CHAR_BYTES_NUM-1) { + *(*(dfap->data.table.table+i)+(size<<1)-2) = current; + *(*(dfap->data.table.table+i)+(size<<1)-1) = current_length+1; + } else { + current_length++; + } + } + } + + /* cleanup the trie */ + map_list(states, destroy_trie); + destroy_list(states, 0); + + return dfap; +} + +BOOL +run_dfa(const dfa * const restrict table, const NUM code) +{ + switch (table->type) { + case DFA_TYPE_POSITIVE: + case DFA_TYPE_NEGATIVE: + case DFA_TYPE_BOTH: + break; + default: + return table->data.fn(code); + break; + } + + char *s = MYALLOC(char, 5); + str *strp = new_str(s, 5); + + if (encode(code, strp)) { + destroy_str(strp, 1); + fleprintf0("Fail to encode\n"); + return 0; + } + + int current_row = 0; + + for (NUM i = 0; i < str_length(strp); i++) { + unsigned char c = (unsigned char) *(s+i); + int lookup_value = 0; + + for (int j = 0, len = 0; len <= c;) { + len += *(*(table->data.table.table+current_row)+(j<<1)+1); + lookup_value = *(*(table->data.table.table+current_row) + +(j++<<1)); + } + + /* fleprintf("lookup_value = %d\n", lookup_value); */ + + if (lookup_value > 0) { + current_row = lookup_value; + } else { + switch (lookup_value) { + case DFA_STATE_REJECT: + destroy_str(strp, 1); + return 0; + break; + case DFA_STATE_UNKNOWN: + switch (table->type) { + case DFA_TYPE_BOTH: + case DFA_TYPE_POSITIVE: + destroy_str(strp, 1); + return 0; + break; + case DFA_TYPE_NEGATIVE: + destroy_str(strp, 1); + return 1; + break; + default: + /* This should not be possible, as we won't reach here if + the table is of the special type. But I am apparently + paranoia. */ + fleprintf0("Special table in the wrong place\n"); + destroy_str(strp, 1); + return 0; + break; + } + break; + case DFA_STATE_ACCEPT: + destroy_str(strp, 1); + return 1; + break; + default: + fleprintf("Unknown state number for char %d: %d\n", + c, lookup_value); + destroy_str(strp, 1); + return 0; + break; + } + } + } + + destroy_str(strp, 1); + + return 0; +} diff --git a/src/dfa.h b/src/dfa.h new file mode 100644 index 0000000..a22409f --- /dev/null +++ b/src/dfa.h @@ -0,0 +1,64 @@ +#include "util.h" +#ifndef DFA_H +#define DFA_H + +/* See the comments at the beginning of grammar.h for some backgrounds + about this file. */ + +/* See the following Wikipedia link for details on Run-Length + Encoding. */ + +/* Maximal character bytes value */ + +enum { MAX_CHAR_BYTES_NUM = 256 }; + +/* Hard-coded state numbers */ + +enum { + DFA_STATE_UNKNOWN = -1, + DFA_STATE_ACCEPT = -2, + DFA_STATE_REJECT = -3, +}; + +/* dfa type */ + +typedef BOOL (* special_dfa) (const NUM code); + +typedef struct dfa_s dfa; + +typedef struct compressed_table_s compressed_table; + +dfa *new_dfa(); + +void destroy_dfa(dfa *table); + +void print_dfa(const dfa * const restrict table); + +dfa *dfa_from_bytes(int sequence_size, + const NUM * const restrict data); + +dfa *dfa_from_bytes_neg(int sequence_size, + const NUM * const restrict data); + +dfa *dfa_from_bytes_both(int sequence_size, + const NUM * const restrict data, + int neg_sequence_size, + const NUM * const restrict negdata); + +/* TODO: Reject character bytes from a given DFA. */ + +/* NOTE: Add all unicode valid points to a DFA, so that we can + represent the ANY class. + + After having done so, this costs around 16K memory. This is not so + satisfactory, as all these memory are just to serve as a ANY + character class, which is too excessive. So I extend the DFA by a + special type. */ + +/* TODO: Construct some basic frequently used character classes. */ + +inline BOOL dfa_any_fun(const NUM UNUSED code) { return 1; } + +BOOL run_dfa(const dfa * const restrict table, const NUM code); + +#endif diff --git a/src/grammar.c b/src/grammar.c index 4052510..508c8d8 100644 --- a/src/grammar.c +++ b/src/grammar.c @@ -104,14 +104,14 @@ new_tnt(int type, ...) } Rule * -new_rule(NT left, List *right) +new_rule(NT left, const List * const right) { if (!right) return NULL; Rule *rule = MYALLOC(Rule, 1); rule->left = left; - rule->right = right; + rule->right = (List *) right; return rule; } @@ -124,10 +124,10 @@ new_grammar() } /* We classify the rules into different rule groups. */ -unsigned char -build_grammar(Grammar *g, List *rules, List *names) +BOOL +build_grammar(Grammar *g, const List *rules, const List * const names) { - g->names = names; + g->names = (List *) names; NUM len = list_length(names); NUM rule_len = list_length(rules); @@ -246,11 +246,12 @@ print_rule(void *r) } NUM -find_in_cpa_list(NUM *string, NUM size, List *list) +find_in_cpa_list(const NUM * const restrict string, NUM size, + const List * const restrict list) { NUM len = list_length(list), index = 0; - unsigned char foundp = 0; + BOOL foundp = 0; for (; !foundp && index < len;) { cpa *cpap = list_nth(list, index); @@ -304,7 +305,7 @@ print_name(void *element) /* REVIEW: Print the names of non-terminals out, instead of printing the numbers? */ void -print_grammar(Grammar *g) +print_grammar(const Grammar * const g) { printf("Printing a grammar:\n"); map_list_between(g->names, print_name, print_sep); diff --git a/src/grammar.h b/src/grammar.h index 05bc1fc..28f8fd8 100644 --- a/src/grammar.h +++ b/src/grammar.h @@ -8,14 +8,45 @@ /* The class file for grammars */ +/* TODO: Terminals should be extended to incorporate the "predicate + terminals", such as character classes. */ + /* A grammar is a list of rules. A rule replaces a non-terminal with a finite string of terminals and non-terminals. - For us, a terminal is a positive integer, and a non-terminal is a - negative integer. Since the signs of these two types is fixed, we - represent them as unsigned integers. Since we want to deal with - unicode points, the terminal is stored as unsigned long, so that we - can include all unicode poitns. */ + For us, a terminal is a positive integer, and a non-terminal is + also an integer. We represent them as unsigned integers. Since we + want to deal with unicode points, the terminal is stored as + unsigned long, so that we can include all unicode poitns. + + Moreover, a terminal might be a "predicate terminal", that is, it + does not refer to one character only. Instead, it refers to a set + of characters. The most efficient way of recognizing a set of + characters, of which I know, is to use a deterministic finite + automaton. So we represent a predicate terminal as a DFA. + + Whereas the DFA's are equivalent to regular expressions in the + expressive power, I do not need full support for regular + expressions for the predicate terminals. In particular, I only + want to match a character for one predicate terminal, so there are + no such things as repititive and optional constructs in the DFA. + Rather, each predicate terminal is a simple table, or a matrix, + which must have exactly 256 columns and an indefinite number of + rows. The columns represent the input character's bytes to match, + and the rows are the states of the DFA. The table must have at + least one row: the starting state is always the zeroth row and the + accepting state is any state that is out of bounds: either greater + than or equal to the number of rows or negative. + + Of course, it is inefficient to keep the whoe table within the + memory, especially since the most part of the table would be + unused. So we store a DFA in a compressed format. I have + considered whether to apply some more advanced compression + techniques, such as using BW transoform followed by the MTF + algorithm, but in the end I think that the DFA's that will be used + are oft times just small tables, at least I hope so, thus I decided + to simply use the Run-Length-Encoding. See the file dfa.h for + implementation details. */ typedef unsigned long T; typedef unsigned NT; @@ -41,7 +72,9 @@ typedef struct Grammar_s Grammar; /* G is expected to be allocated before this function. NAMES is a list of CPA pointers. */ -unsigned char build_grammar(Grammar *g, List *rules, List *names); +BOOL +build_grammar(Grammar *g, + const List *rules, const List * const names); /* This accepts one and only one optional argument, which is of type either T or NT, depending on TYPE being 0 or not. */ @@ -56,7 +89,9 @@ struct cpa_s { typedef struct cpa_s cpa; -NUM find_in_cpa_list(NUM *string, NUM size, List *list); +NUM +find_in_cpa_list(const NUM * const restrict string, NUM size, + const List * const restrict list); /* For debugging purposes, we need to print out rules to examine their values. */ @@ -69,7 +104,7 @@ void print_tnt(void *element); void print_rule(void *r); /* This will generate errors if G is not a list of rules. */ -void print_grammar(Grammar *g); +void print_grammar(const Grammar * const g); /* constructors */ @@ -85,7 +120,7 @@ TNT *new_tnt_pointer(size_t size); /* RIGHT should be created by new_tnt_string or something alike. If RIGHT is NULL, NULL is returned. */ -Rule *new_rule(NT left, List *right); +Rule *new_rule(NT left, const List * const right); Grammar *new_grammar(); diff --git a/src/list.c b/src/list.c index 143c693..3c430f6 100644 --- a/src/list.c +++ b/src/list.c @@ -32,7 +32,7 @@ new_list() } H_ATTR -unsigned char +BOOL add_to_list(List *ls, void *element) { if (!(ls->capacity)) { @@ -141,10 +141,10 @@ copy_num(void *p) return pointer; } -unsigned char +BOOL copy_list(List *dest, List *source, copyer copyf) { - unsigned char sign = 0; + BOOL sign = 0; if ((sign = list_assure_size(dest, source->len))) return sign; @@ -162,19 +162,19 @@ copy_list(List *dest, List *source, copyer copyf) H_ATTR void * -list_nth(List *ls, NUM n) +list_nth(const List * const ls, NUM n) { return *(ls->array+n); } H_ATTR NUM -list_length(List *ls) +list_length(const List * const restrict ls) { return ls->len; } -unsigned char +BOOL list_assure_size(List *ls, NUM size) { if (ls->capacity >= size) return 0; /* we are good */ @@ -203,7 +203,7 @@ list_assure_size(List *ls, NUM size) return 0; } -unsigned char +BOOL list_set_length(List *ls, NUM len) { list_assure_size(ls, len); @@ -254,7 +254,7 @@ array_to_list(void **array, NUM size) } void -destroy_list(List *ls, unsigned char all_free_p) +destroy_list(List *ls, BOOL all_free_p) { if (all_free_p == 1) for (NUM i = 0; i < ls->len; i++) diff --git a/src/list.h b/src/list.h index ff401f4..6ace213 100644 --- a/src/list.h +++ b/src/list.h @@ -17,7 +17,7 @@ List *new_list(); /* add an element to the end of the list */ /* upon failure return non-zero */ -unsigned char add_to_list(List *ls, void *element); +BOOL add_to_list(List *ls, void *element); /* pop an element from the end of the list, and return that element */ /* upon failure return NULL */ @@ -43,23 +43,23 @@ typedef void *(*copyer)(void *); void *copy_num(void *); /* upon failure return 1 */ -unsigned char copy_list(List *dest, List *source, copyer copyf); +BOOL copy_list(List *dest, List *source, copyer copyf); -void *list_nth(List *ls, NUM n); +void *list_nth(const List * const ls, NUM n); -NUM list_length(List *ls); +NUM list_length(const List * const restrict ls); /* Make sure the list has at least SIZE slots to use. This should only be used to create fixed capacity arrays, otherwise we risk frequently re-allocating and hence losing performance. */ /* Upon failure return non-zero. */ -unsigned char list_assure_size(List *ls, NUM size); +BOOL list_assure_size(List *ls, NUM size); /* This is mainly used to set the length of a sparse list, since only when dealing with sparse lists do we not need to care about the elements. */ /* Upon failure return non-zero. */ -unsigned char list_set_length(List *ls, NUM len); +BOOL list_set_length(List *ls, NUM len); /* Convert a list to an array. @@ -79,7 +79,7 @@ List *array_to_list(void ** array, NUM size); /* destroy the list: If ALL_FREE_P is 1, this frees every void pointers contained in the list; if it is 2, this frees the first pointer. In any case, the list is de-allocated. */ -void destroy_list(List *ls, unsigned char all_free_p); +void destroy_list(List *ls, BOOL all_free_p); void destroy_list_free_all(void *ls); void destroy_list_free_first(void *ls); diff --git a/src/reader.c b/src/reader.c index 18435a9..d715726 100644 --- a/src/reader.c +++ b/src/reader.c @@ -21,14 +21,15 @@ /* Read a string of TNT's into TNTS. TNTS is expected to be already allocated. */ -static unsigned char -read_tnt_string(str *s, NUM start, NUM end, TNT *tnts, List *cpa_list) +static BOOL +read_tnt_string(const str * const restrict s, NUM start, NUM end, + TNT *tnts, List *cpa_list) { str_info info; /* fleprintf("S = %lu, E = %lu\n", start, end); */ - unsigned char readingp = 0, stop = 0; + BOOL readingp = 0, stop = 0, double_quote_p = 0; NUM *string = NULL, string_size = 0, nt_index = 0, tnt_count = 0; @@ -73,14 +74,15 @@ read_tnt_string(str *s, NUM start, NUM end, TNT *tnts, List *cpa_list) } break; case 34: + double_quote_p = 1; case 39: /* read a string */ strindex = index + info.step; strinfo = get_info(s, strindex); for (;strindex < end && - strinfo.value != 34 && - strinfo.value != 39 && + ((double_quote_p && strinfo.value != 34) || + (!double_quote_p && strinfo.value != 39)) && strinfo.value != 10 && strinfo.value != 13;) { strinfo = get_info(s, strindex); @@ -88,7 +90,7 @@ read_tnt_string(str *s, NUM start, NUM end, TNT *tnts, List *cpa_list) if (strinfo.value == 92) { strindex += info.step; - unsigned char local_exit = 0; + BOOL local_exit = 0; /* The following errors should not happen. I am just paranoid. */ @@ -118,6 +120,8 @@ read_tnt_string(str *s, NUM start, NUM end, TNT *tnts, List *cpa_list) strindex -= strinfo.step; tnt_count--; + double_quote_p = 0; + index = strindex; info = get_info(s, index); break; @@ -166,6 +170,7 @@ read_tnt_string(str *s, NUM start, NUM end, TNT *tnts, List *cpa_list) return 0; } +UNUSED static void print_int(void *p) { @@ -174,7 +179,7 @@ print_int(void *p) /* Read a grammar from a string in the format of BNF notation. */ Grammar * -read_grammar_from_bnf(str *s) +read_grammar_from_bnf(const str * const restrict s) { NUM len = str_length(s); @@ -186,7 +191,7 @@ read_grammar_from_bnf(str *s) List *line_indices = new_list(); - unsigned char definitionp = 1, readingp = 0; + BOOL definitionp = 1, readingp = 0; /* last index is used to collect strings */ for (NUM index = 0, last_index = 0; index < len;) { @@ -205,7 +210,7 @@ read_grammar_from_bnf(str *s) case 9: readingp = 0; - if (definitionp) { + if (definitionp && readingp) { fleprintf("%lu, A non-terminal cannot have spaces in " "the name\n", index); /* cleanup */ @@ -257,9 +262,9 @@ read_grammar_from_bnf(str *s) index += info.step; } - printf("%s:%d, print indices\n", __FILE__, __LINE__); - map_list(line_indices, print_int); - printf("\n"); + /* printf("%s:%d, print indices\n", __FILE__, __LINE__); + * map_list(line_indices, print_int); + * printf("\n"); */ /* second round collects all the rules */ @@ -267,7 +272,7 @@ read_grammar_from_bnf(str *s) NUM line_count = 0; - unsigned char right_start_p = 0; + BOOL right_start_p = 0, double_quote_p = 0; definitionp = 1, readingp = 0; @@ -286,6 +291,7 @@ read_grammar_from_bnf(str *s) switch (info.value) { /* the quote characters */ case 34: /* double quote */ + double_quote_p = 1; case 39: /* single quote */ if (definitionp) { fleprintf("%lu, A non-terminal cannot have quotes in " @@ -330,8 +336,8 @@ read_grammar_from_bnf(str *s) for (;strindex < len && strinfo.value != 13 && strinfo.value != 10 && - strinfo.value != 34 && - strinfo.value != 39;) { + ((double_quote_p && strinfo.value != 34) || + (!double_quote_p && strinfo.value != 39));) { strinfo = get_info(s, strindex); /* escape character */ @@ -339,7 +345,7 @@ read_grammar_from_bnf(str *s) strindex += strinfo.step; if (strindex < len) strinfo = get_info(s, strindex); - unsigned char local_error_p = 0; + BOOL local_error_p = 0; if (strindex >= len) { local_error_p = 1; @@ -395,8 +401,8 @@ read_grammar_from_bnf(str *s) return NULL; } } else { - if (strinfo.value != 34 && - strinfo.value != 39) { + if ((double_quote_p && strinfo.value != 34) || + (!double_quote_p && strinfo.value != 39)) { fleprintf0("input ended before string is left\n"); /* cleanup */ map_list(names, destroy_cpa_and_free_all); @@ -408,15 +414,16 @@ read_grammar_from_bnf(str *s) destroy_list(rules, 0); return NULL; } - } } + double_quote_p = 0; + break; /* spaces and tabs are ignored */ case ' ': case 9: - if (definitionp) { + if (definitionp && readingp) { fleprintf("%lu, A non-terminal cannot have spaces or " "tabs in the name\n", index); /* cleanup */ @@ -435,6 +442,11 @@ read_grammar_from_bnf(str *s) break; case '\n': case 13: + if (definitionp && !readingp) { + /* empty line */ + break; + } + definitionp = 1; readingp = 0; line_count++; @@ -461,13 +473,13 @@ read_grammar_from_bnf(str *s) return NULL; } - printf("%s:%d:%lu, Printing a TNT string: ", - __FILE__, __LINE__, index); - for (int i = 0; i < (int) tnt_count;) { - print_tnt(current_tnt_string+i++); - if (idata+n), 1 }; } @@ -41,14 +41,14 @@ destroy_str(str *s, int flag) H_ATTR str_info -get_info(str *s, UNUM n) +get_info(const str * const restrict s, UNUM n) { return s->getter(s, n); } H_ATTR NUM -str_length(str *s) +str_length(const str * const restrict s) { return s->size; } @@ -60,7 +60,7 @@ change_str_content(str *s, char *str, NUM size) s->size = size; } -unsigned char +BOOL copy_str(str *dest, str *source) { if (str_length(dest) < str_length(source)) @@ -74,7 +74,7 @@ copy_str(str *dest, str *source) } char * -get_data(str *s) +get_data(const str * const restrict s) { return s->data; } diff --git a/src/str.h b/src/str.h index 39bddf6..bfe5867 100644 --- a/src/str.h +++ b/src/str.h @@ -20,16 +20,16 @@ typedef struct str_info_s str_info; #define EMPTY_STR_INFO ((str_info) { 0, 1 }) -str_info get_info(str *, UNUM); +str_info get_info(const str * const restrict s, UNUM n); -char *get_data(str *s); +char *get_data(const str * const restrict s); -NUM str_length(str *); +NUM str_length(const str * const restrict s); void str_set_length(str *s, UNUM size); void change_str_content(str *, char *, NUM); -unsigned char copy_str(str *dest, str *source); +BOOL copy_str(str *dest, str *source); #endif diff --git a/src/str_partial.h b/src/str_partial.h index 9d4fbd6..2373555 100644 --- a/src/str_partial.h +++ b/src/str_partial.h @@ -3,7 +3,7 @@ /* This is meant to be extended, and only has minimal necessary fields. */ -typedef str_info (*info_getter) (str *, UNUM); +typedef str_info (*info_getter) (const str * const restrict , UNUM); struct str_s { UNUM size; /* the size in bytes, not in chars */ diff --git a/src/test.txt b/src/test.txt index be9b3ba..27af3ad 100644 --- a/src/test.txt +++ b/src/test.txt @@ -1,9 +1,25 @@ -S: A A LONG C 奎佑 -A: B B B -B: A -A: -LONG: B A -C: B C A -B: B B -奎佑: "handsome" -A: A B A +S: S "+" T +S: T +T: T "*" F +T: F +F: "(" S ")" +F: "0" +F: "1" +F: "2" +F: "3" +F: "4" +F: "5" +F: "6" +F: "7" +F: "8" +F: "9" +F: "0" F +F: "1" F +F: "2" F +F: "3" F +F: "4" F +F: "5" F +F: "6" F +F: "7" F +F: "8" F +F: "9" F diff --git a/src/test/check_dfa.c b/src/test/check_dfa.c new file mode 100644 index 0000000..a728fb8 --- /dev/null +++ b/src/test/check_dfa.c @@ -0,0 +1,108 @@ +#include "../utf8.h" +#include "../str.h" +#include +#include "../util.h" +#include "../dfa.h" + +#define ADD_RANGE(X, Y) do { \ + for (int i = 0; i + (X) <= (Y); i++) \ + *(v+vlen++) = (NUM) i + (X); \ + } while (0) + +int +main(int UNUSED argc, char ** UNUSED argv) +{ + UNUM vlen = 0; + NUM *v = MYALLOC(NUM, 1<<21); + + ADD_RANGE(0x0020, 0x007f); + ADD_RANGE(0x2580, 0x259f); + ADD_RANGE(0x00a0, 0x00ff); + ADD_RANGE(0x25a0, 0x25ff); + ADD_RANGE(0x0100, 0x017f); + ADD_RANGE(0x2600, 0x26ff); + ADD_RANGE(0x0180, 0x024f); + ADD_RANGE(0x2700, 0x27bf); + ADD_RANGE(0x0250, 0x02af); + ADD_RANGE(0x27c0, 0x27ef); + ADD_RANGE(0x02b0, 0x02ff); + ADD_RANGE(0x27f0, 0x27ff); + ADD_RANGE(0x0300, 0x036f); + ADD_RANGE(0x2800, 0x28ff); + ADD_RANGE(0x0370, 0x03ff); + ADD_RANGE(0x2900, 0x297f); + ADD_RANGE(0x0400, 0x04ff); + ADD_RANGE(0x2980, 0x29ff); + ADD_RANGE(0x0500, 0x052f); + ADD_RANGE(0x2a00, 0x2aff); + ADD_RANGE(0x0530, 0x058f); + ADD_RANGE(0x2b00, 0x2bff); + ADD_RANGE(0x0590, 0x05ff); + ADD_RANGE(0x2e80, 0x2eff); + ADD_RANGE(0x0600, 0x06ff); + ADD_RANGE(0x2f00, 0x2fdf); + ADD_RANGE(0x0700, 0x074f); + ADD_RANGE(0x2ff0, 0x2fff); + ADD_RANGE(0x0780, 0x07bf); + ADD_RANGE(0x3000, 0x303f); + ADD_RANGE(0x0900, 0x097f); + ADD_RANGE(0x3040, 0x309f); + ADD_RANGE(0x0980, 0x09ff); + ADD_RANGE(0x30a0, 0x30ff); + ADD_RANGE(0x0a00, 0x0a7f); + ADD_RANGE(0x3100, 0x312f); + ADD_RANGE(0x0a80, 0x0aff); + ADD_RANGE(0x3130, 0x318f); + ADD_RANGE(0x0b00, 0x0b7f); + ADD_RANGE(0x3190, 0x319f); + ADD_RANGE(0x0b80, 0x0bff); + ADD_RANGE(0x31a0, 0x31bf); + ADD_RANGE(0x0c00, 0x0c7f); + ADD_RANGE(0x31f0, 0x31ff); + ADD_RANGE(0x0c80, 0x0cff); + ADD_RANGE(0x3200, 0x32ff); + ADD_RANGE(0x0d00, 0x0d7f); + ADD_RANGE(0x3300, 0x33ff); + ADD_RANGE(0x0d80, 0x0dff); + ADD_RANGE(0x3400, 0x4dbf); + ADD_RANGE(0x0e00, 0x0e7f); + ADD_RANGE(0x4dc0, 0x4dff); + ADD_RANGE(0x0e80, 0x0eff); + ADD_RANGE(0x4e00, 0x9fff); + ADD_RANGE(0x0f00, 0x0fff); + + /* for (int i = 0; i < 190; i++) + * printf("v [%d] = %d\n", i, *(v+i)); */ + + dfa *dfap = dfa_from_bytes(vlen, v); + + free(v); + + if (dfap == NULL) + printf("returned null\n"); + else + print_dfa(dfap); + + /* char *s = MYALLOC(char, 5); + * str *strp = new_str(s, 5); + * + * for (int i = 0; i < 3; i++) { + * printf("\n\n"); + * + * encode(v[i], strp); + * + * for (int j = 0; j < str_length(strp)-1;) + * printf("%d, ", *(s+j++)+(1<<8)); + * + * printf("%d\n", *(s+str_length(strp)-1)+(1<<8)); + * } + * + * destroy_str(strp, 1); */ + + /* BOOL result = run_dfa(dfap, 32239); + * + * printf("the result = %d\n", result); */ + + if (dfap) destroy_dfa(dfap); + return 0; +} diff --git a/src/utf8.c b/src/utf8.c index f840600..ce47faf 100644 --- a/src/utf8.c +++ b/src/utf8.c @@ -41,7 +41,7 @@ decode(UTF8_State *state, uint32_t *codep, uint8_t byte) } str_info -utf8_get_info(str *s, UNUM n) +utf8_get_info(const str * const restrict s, UNUM n) { UTF8_State state = UTF8_STATE_ACCEPT; uint32_t code = 0; @@ -85,7 +85,7 @@ new_utf8(char *string, UNUM size) } /* result should be long enough to hold the data */ -unsigned char +BOOL encode(NUM code_point, str *result) { /* calculate the number of bits of CODE_POINT */ diff --git a/src/utf8.h b/src/utf8.h index 5b3c9f2..f5e7be1 100644 --- a/src/utf8.h +++ b/src/utf8.h @@ -20,6 +20,6 @@ typedef enum UTF8_State_e UTF8_State; utf8 *new_utf8(char *string, UNUM size); -unsigned char encode(NUM code_point, str *result); +BOOL encode(NUM code_point, str *result); #endif diff --git a/src/util.c b/src/util.c index a04ef77..1af8e5e 100644 --- a/src/util.c +++ b/src/util.c @@ -12,7 +12,7 @@ Note that the length of the string str is actually 1+length, and the last character is the null character. */ -unsigned char +BOOL read_entire_file(const char *file_name, char **str, NUM *len) { long file_size = 0; diff --git a/src/util.h b/src/util.h index 940b7e4..6d6072a 100644 --- a/src/util.h +++ b/src/util.h @@ -10,6 +10,8 @@ typedef long DATA; typedef long NUM; typedef unsigned long long UNUM; /* definitely bigger than size_t */ +typedef unsigned char BOOL; + #define HC_ATTR __attribute__((__hot__, __const__)) #define H_ATTR __attribute__((__hot__)) @@ -31,6 +33,6 @@ typedef unsigned long long UNUM; /* definitely bigger than size_t */ #define fleprintf0(M) eprintf("%s:%d, " M, __FILE__, __LINE__) #define fleprintf(M, ...) eprintf("%s:%d, " M, __FILE__, __LINE__, __VA_ARGS__) -unsigned char read_entire_file(const char *file_name, char **str, NUM *len); +BOOL read_entire_file(const char *file_name, char **str, NUM *len); #endif -- cgit v1.2.3-18-g5258