summaryrefslogtreecommitdiff
path: root/src/grammar.h
diff options
context:
space:
mode:
authorJSDurand <mmemmew@gmail.com>2022-01-11 09:58:43 +0800
committerJSDurand <mmemmew@gmail.com>2022-01-11 09:58:43 +0800
commit91016bd3855375796fd1eabd501ebc12491f2655 (patch)
treec4a608966b686c9639a90b92c31c00632532fac8 /src/grammar.h
parent949888ad5d0cd0f9f9a87f9938a632b3e32df051 (diff)
Add the framework for character classes.
Now we have the potential to recognize character classes. But the most important task for us now is to experiment with ((B)RN)GLR algorithms, so we leave the character classes at the present state for a moment.
Diffstat (limited to 'src/grammar.h')
-rw-r--r--src/grammar.h53
1 files changed, 44 insertions, 9 deletions
diff --git a/src/grammar.h b/src/grammar.h
index 05bc1fc..28f8fd8 100644
--- a/src/grammar.h
+++ b/src/grammar.h
@@ -8,14 +8,45 @@
/* The class file for grammars */
+/* TODO: Terminals should be extended to incorporate the "predicate
+ terminals", such as character classes. */
+
/* A grammar is a list of rules. A rule replaces a non-terminal with
a finite string of terminals and non-terminals.
- For us, a terminal is a positive integer, and a non-terminal is a
- negative integer. Since the signs of these two types is fixed, we
- represent them as unsigned integers. Since we want to deal with
- unicode points, the terminal is stored as unsigned long, so that we
- can include all unicode poitns. */
+ For us, a terminal is a positive integer, and a non-terminal is
+ also an integer. We represent them as unsigned integers. Since we
+ want to deal with unicode points, the terminal is stored as
+ unsigned long, so that we can include all unicode poitns.
+
+ Moreover, a terminal might be a "predicate terminal", that is, it
+ does not refer to one character only. Instead, it refers to a set
+ of characters. The most efficient way of recognizing a set of
+ characters, of which I know, is to use a deterministic finite
+ automaton. So we represent a predicate terminal as a DFA.
+
+ Whereas the DFA's are equivalent to regular expressions in the
+ expressive power, I do not need full support for regular
+ expressions for the predicate terminals. In particular, I only
+ want to match a character for one predicate terminal, so there are
+ no such things as repititive and optional constructs in the DFA.
+ Rather, each predicate terminal is a simple table, or a matrix,
+ which must have exactly 256 columns and an indefinite number of
+ rows. The columns represent the input character's bytes to match,
+ and the rows are the states of the DFA. The table must have at
+ least one row: the starting state is always the zeroth row and the
+ accepting state is any state that is out of bounds: either greater
+ than or equal to the number of rows or negative.
+
+ Of course, it is inefficient to keep the whoe table within the
+ memory, especially since the most part of the table would be
+ unused. So we store a DFA in a compressed format. I have
+ considered whether to apply some more advanced compression
+ techniques, such as using BW transoform followed by the MTF
+ algorithm, but in the end I think that the DFA's that will be used
+ are oft times just small tables, at least I hope so, thus I decided
+ to simply use the Run-Length-Encoding. See the file dfa.h for
+ implementation details. */
typedef unsigned long T;
typedef unsigned NT;
@@ -41,7 +72,9 @@ typedef struct Grammar_s Grammar;
/* G is expected to be allocated before this function.
NAMES is a list of CPA pointers. */
-unsigned char build_grammar(Grammar *g, List *rules, List *names);
+BOOL
+build_grammar(Grammar *g,
+ const List *rules, const List * const names);
/* This accepts one and only one optional argument, which is of type
either T or NT, depending on TYPE being 0 or not. */
@@ -56,7 +89,9 @@ struct cpa_s {
typedef struct cpa_s cpa;
-NUM find_in_cpa_list(NUM *string, NUM size, List *list);
+NUM
+find_in_cpa_list(const NUM * const restrict string, NUM size,
+ const List * const restrict list);
/* For debugging purposes, we need to print out rules to examine their
values. */
@@ -69,7 +104,7 @@ void print_tnt(void *element);
void print_rule(void *r);
/* This will generate errors if G is not a list of rules. */
-void print_grammar(Grammar *g);
+void print_grammar(const Grammar * const g);
/* constructors */
@@ -85,7 +120,7 @@ TNT *new_tnt_pointer(size_t size);
/* RIGHT should be created by new_tnt_string or something alike.
If RIGHT is NULL, NULL is returned. */
-Rule *new_rule(NT left, List *right);
+Rule *new_rule(NT left, const List * const right);
Grammar *new_grammar();