1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
|
#ifndef GRAMMAR_H
#define GRAMMAR_H
#include <stdarg.h>
#include <stdio.h>
#include "list.h"
#include "utf8.h"
/* The class file for grammars */
/* TODO: Terminals should be extended to incorporate the "predicate
terminals", such as character classes. */
/* A grammar is a list of rules. A rule replaces a non-terminal with
a finite string of terminals and non-terminals.
For us, a terminal is a positive integer, and a non-terminal is
also an integer. We represent them as unsigned integers. Since we
want to deal with unicode points, the terminal is stored as
unsigned long, so that we can include all unicode poitns.
Moreover, a terminal might be a "predicate terminal", that is, it
does not refer to one character only. Instead, it refers to a set
of characters. The most efficient way of recognizing a set of
characters, of which I know, is to use a deterministic finite
automaton. So we represent a predicate terminal as a DFA.
Whereas the DFA's are equivalent to regular expressions in the
expressive power, I do not need full support for regular
expressions for the predicate terminals. In particular, I only
want to match a character for one predicate terminal, so there are
no such things as repititive and optional constructs in the DFA.
Rather, each predicate terminal is a simple table, or a matrix,
which must have exactly 256 columns and an indefinite number of
rows. The columns represent the input character's bytes to match,
and the rows are the states of the DFA. The table must have at
least one row: the starting state is always the zeroth row and the
accepting state is any state that is out of bounds: either greater
than or equal to the number of rows or negative.
Of course, it is inefficient to keep the whoe table within the
memory, especially since the most part of the table would be
unused. So we store a DFA in a compressed format. I have
considered whether to apply some more advanced compression
techniques, such as using BW transoform followed by the MTF
algorithm, but in the end I think that the DFA's that will be used
are oft times just small tables, at least I hope so, thus I decided
to simply use the Run-Length-Encoding. See the file dfa.h for
implementation details. */
typedef unsigned long T;
typedef unsigned NT;
/* T or NT
Don't worry, it is not explosive. */
struct TNT_s {
int type; /* 0 => T and NT otherwise */
union { T t; NT nt; } data;
};
typedef struct TNT_s TNT;
typedef struct Rule_s Rule;
/* A grammar is more than a list of rules: it should include a
correspondance to associate the names of non-terminals with
unsigned integers. Basically, this means it should hold a list of
strings as well. */
typedef struct Grammar_s Grammar;
/* G is expected to be allocated before this function.
NAMES is a list of CPA pointers. */
BOOL
build_grammar(Grammar *g,
const List *rules, const List * const names);
/* This accepts one and only one optional argument, which is of type
either T or NT, depending on TYPE being 0 or not. */
TNT *new_tnt(int type, ...);
/* An array of code-points. */
struct cpa_s {
NUM *array;
UNUM size;
};
typedef struct cpa_s cpa;
NUM
find_in_cpa_list(const NUM * const restrict string, NUM size,
const List * const restrict list);
/* For debugging purposes, we need to print out rules to examine their
values. */
/* assume element is a cpa pointer */
void print_name(void *element);
void print_tnt(void *element);
void print_rule(void *r);
/* This will generate errors if G is not a list of rules. */
void print_grammar(const Grammar * const g);
/* constructors */
/* FORMAT specifies the types of TNTs. It is a string of t and n's.
FORMAT_LEN is the length of FORMAT, excluding the terminating null
byte.
Upon failure NULL is returned. */
List *new_tnt_string(char *format, int format_len, ...);
TNT *new_tnt_pointer(size_t size);
/* RIGHT should be created by new_tnt_string or something alike.
If RIGHT is NULL, NULL is returned. */
Rule *new_rule(NT left, const List * const right);
Grammar *new_grammar();
/* destructors */
void destroy_rule(void *rule, int flag);
void destroy_rule_and_free_all(void *rule);
void destroy_rule_and_free_first(void *rule);
void destroy_rule_no_free(void *rule);
void destroy_cpa_and_free_all(void *element);
void destroy_grammar(void *grammar, int flag);
/* convenient macro */
#define NEW_CPA(X, ARRAY, SIZE) do { \
X = MYALLOC(cpa, 1); \
X->array = ARRAY; \
X->size = SIZE; \
} while (0)
#endif
|