summaryrefslogtreecommitdiff
path: root/src/grammar.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/grammar.h')
-rw-r--r--src/grammar.h53
1 files changed, 44 insertions, 9 deletions
diff --git a/src/grammar.h b/src/grammar.h
index 05bc1fc..28f8fd8 100644
--- a/src/grammar.h
+++ b/src/grammar.h
@@ -8,14 +8,45 @@
/* The class file for grammars */
+/* TODO: Terminals should be extended to incorporate the "predicate
+ terminals", such as character classes. */
+
/* A grammar is a list of rules. A rule replaces a non-terminal with
a finite string of terminals and non-terminals.
- For us, a terminal is a positive integer, and a non-terminal is a
- negative integer. Since the signs of these two types is fixed, we
- represent them as unsigned integers. Since we want to deal with
- unicode points, the terminal is stored as unsigned long, so that we
- can include all unicode poitns. */
+ For us, a terminal is a positive integer, and a non-terminal is
+ also an integer. We represent them as unsigned integers. Since we
+ want to deal with unicode points, the terminal is stored as
+ unsigned long, so that we can include all unicode poitns.
+
+ Moreover, a terminal might be a "predicate terminal", that is, it
+ does not refer to one character only. Instead, it refers to a set
+ of characters. The most efficient way of recognizing a set of
+ characters, of which I know, is to use a deterministic finite
+ automaton. So we represent a predicate terminal as a DFA.
+
+ Whereas the DFA's are equivalent to regular expressions in the
+ expressive power, I do not need full support for regular
+ expressions for the predicate terminals. In particular, I only
+ want to match a character for one predicate terminal, so there are
+ no such things as repititive and optional constructs in the DFA.
+ Rather, each predicate terminal is a simple table, or a matrix,
+ which must have exactly 256 columns and an indefinite number of
+ rows. The columns represent the input character's bytes to match,
+ and the rows are the states of the DFA. The table must have at
+ least one row: the starting state is always the zeroth row and the
+ accepting state is any state that is out of bounds: either greater
+ than or equal to the number of rows or negative.
+
+ Of course, it is inefficient to keep the whoe table within the
+ memory, especially since the most part of the table would be
+ unused. So we store a DFA in a compressed format. I have
+ considered whether to apply some more advanced compression
+ techniques, such as using BW transoform followed by the MTF
+ algorithm, but in the end I think that the DFA's that will be used
+ are oft times just small tables, at least I hope so, thus I decided
+ to simply use the Run-Length-Encoding. See the file dfa.h for
+ implementation details. */
typedef unsigned long T;
typedef unsigned NT;
@@ -41,7 +72,9 @@ typedef struct Grammar_s Grammar;
/* G is expected to be allocated before this function.
NAMES is a list of CPA pointers. */
-unsigned char build_grammar(Grammar *g, List *rules, List *names);
+BOOL
+build_grammar(Grammar *g,
+ const List *rules, const List * const names);
/* This accepts one and only one optional argument, which is of type
either T or NT, depending on TYPE being 0 or not. */
@@ -56,7 +89,9 @@ struct cpa_s {
typedef struct cpa_s cpa;
-NUM find_in_cpa_list(NUM *string, NUM size, List *list);
+NUM
+find_in_cpa_list(const NUM * const restrict string, NUM size,
+ const List * const restrict list);
/* For debugging purposes, we need to print out rules to examine their
values. */
@@ -69,7 +104,7 @@ void print_tnt(void *element);
void print_rule(void *r);
/* This will generate errors if G is not a list of rules. */
-void print_grammar(Grammar *g);
+void print_grammar(const Grammar * const g);
/* constructors */
@@ -85,7 +120,7 @@ TNT *new_tnt_pointer(size_t size);
/* RIGHT should be created by new_tnt_string or something alike.
If RIGHT is NULL, NULL is returned. */
-Rule *new_rule(NT left, List *right);
+Rule *new_rule(NT left, const List * const right);
Grammar *new_grammar();