diff options
Diffstat (limited to 'src/grammar.h')
-rw-r--r-- | src/grammar.h | 53 |
1 files changed, 44 insertions, 9 deletions
diff --git a/src/grammar.h b/src/grammar.h index 05bc1fc..28f8fd8 100644 --- a/src/grammar.h +++ b/src/grammar.h @@ -8,14 +8,45 @@ /* The class file for grammars */ +/* TODO: Terminals should be extended to incorporate the "predicate + terminals", such as character classes. */ + /* A grammar is a list of rules. A rule replaces a non-terminal with a finite string of terminals and non-terminals. - For us, a terminal is a positive integer, and a non-terminal is a - negative integer. Since the signs of these two types is fixed, we - represent them as unsigned integers. Since we want to deal with - unicode points, the terminal is stored as unsigned long, so that we - can include all unicode poitns. */ + For us, a terminal is a positive integer, and a non-terminal is + also an integer. We represent them as unsigned integers. Since we + want to deal with unicode points, the terminal is stored as + unsigned long, so that we can include all unicode poitns. + + Moreover, a terminal might be a "predicate terminal", that is, it + does not refer to one character only. Instead, it refers to a set + of characters. The most efficient way of recognizing a set of + characters, of which I know, is to use a deterministic finite + automaton. So we represent a predicate terminal as a DFA. + + Whereas the DFA's are equivalent to regular expressions in the + expressive power, I do not need full support for regular + expressions for the predicate terminals. In particular, I only + want to match a character for one predicate terminal, so there are + no such things as repititive and optional constructs in the DFA. + Rather, each predicate terminal is a simple table, or a matrix, + which must have exactly 256 columns and an indefinite number of + rows. The columns represent the input character's bytes to match, + and the rows are the states of the DFA. The table must have at + least one row: the starting state is always the zeroth row and the + accepting state is any state that is out of bounds: either greater + than or equal to the number of rows or negative. + + Of course, it is inefficient to keep the whoe table within the + memory, especially since the most part of the table would be + unused. So we store a DFA in a compressed format. I have + considered whether to apply some more advanced compression + techniques, such as using BW transoform followed by the MTF + algorithm, but in the end I think that the DFA's that will be used + are oft times just small tables, at least I hope so, thus I decided + to simply use the Run-Length-Encoding. See the file dfa.h for + implementation details. */ typedef unsigned long T; typedef unsigned NT; @@ -41,7 +72,9 @@ typedef struct Grammar_s Grammar; /* G is expected to be allocated before this function. NAMES is a list of CPA pointers. */ -unsigned char build_grammar(Grammar *g, List *rules, List *names); +BOOL +build_grammar(Grammar *g, + const List *rules, const List * const names); /* This accepts one and only one optional argument, which is of type either T or NT, depending on TYPE being 0 or not. */ @@ -56,7 +89,9 @@ struct cpa_s { typedef struct cpa_s cpa; -NUM find_in_cpa_list(NUM *string, NUM size, List *list); +NUM +find_in_cpa_list(const NUM * const restrict string, NUM size, + const List * const restrict list); /* For debugging purposes, we need to print out rules to examine their values. */ @@ -69,7 +104,7 @@ void print_tnt(void *element); void print_rule(void *r); /* This will generate errors if G is not a list of rules. */ -void print_grammar(Grammar *g); +void print_grammar(const Grammar * const g); /* constructors */ @@ -85,7 +120,7 @@ TNT *new_tnt_pointer(size_t size); /* RIGHT should be created by new_tnt_string or something alike. If RIGHT is NULL, NULL is returned. */ -Rule *new_rule(NT left, List *right); +Rule *new_rule(NT left, const List * const right); Grammar *new_grammar(); |