src/grammar.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275

#ifndef GRAMMAR_H
#define GRAMMAR_H

#include "dfa.h"
#include "ht.h"
#include <stdarg.h>
#include <stdio.h>
#include "list.h"
#include "utf8.h"

/* The class file for grammars */

/* A grammar is a list of rules.  A rule replaces a non-terminal with
   a finite string of terminals and non-terminals.

   For us, a terminal is a positive integer, and a non-terminal is
   also an integer.  We represent them as unsigned integers.  Since we
   want to deal with unicode points, the terminal is stored as
   unsigned long, so that we can include all unicode poitns.

   Moreover, a terminal might be a "predicate terminal", that is, it
   does not refer to one character only.  Instead, it refers to a set
   of characters.  The most efficient way of recognizing a set of
   characters, of which I know, is to use a deterministic finite
   automaton.  So we represent a predicate terminal as a DFA.

   Whereas the DFA's are equivalent to regular expressions in the
   expressive power, I do not need full support for regular
   expressions for the predicate terminals.  In particular, I only
   want to match a character for one predicate terminal, so there are
   no such things as repititive and optional constructs in the DFA.
   Rather, each predicate terminal is a simple table, or a matrix,
   which must have exactly 256 columns and an indefinite number of
   rows.  The columns represent the input character's bytes to match,
   and the rows are the states of the DFA.  The table must have at
   least one row: the starting state is always the zeroth row and the
   accepting state is any state that is out of bounds: either greater
   than or equal to the number of rows or negative.

   Of course, it is inefficient to keep the whoe table within the
   memory, especially since the most part of the table would be
   unused.  So we store a DFA in a compressed format.  I have
   considered whether to apply some more advanced compression
   techniques, such as using BW transoform followed by the MTF
   algorithm, but in the end I think that the DFA's that will be used
   are oft times just small tables, at least I hope so, thus I decided
   to simply use the Run-Length-Encoding.  See the file dfa.h for
   implementation details. */

typedef long T;
typedef unsigned long NT;
typedef unsigned long PT;

/* enum PT_TYPE_e {
 *   PT_DFA,
 *   PT_SPECIAL
 * };
 *
 * typedef enum PT_TYPE_e PT_TYPE; */

/* T or NT

   Don't worry, it is not explosive. */

enum TNT_TYPE_e {
  TERMINAL,
  PREDICATE,
  NONTERMINAL
};

typedef enum TNT_TYPE_e TNT_TYPE;

typedef struct PT_DATA_s PTD;

/* On error return NULL */
PTD *new_ptd(str *user_name, str *raw_name, dfa *dfap);

/* FLAG is used to destroy the strings contained within */
void destroy_ptd(PTD *p, int flag);

BOOL ptd_run(PTD *p, NUM code);

/* If a TNT of type TERMINAL has value END_OF_INPUT, then it means,
   surprisingly, the end of input. */
enum { END_OF_INPUT = -1 };

struct TNT_s {
  TNT_TYPE type;
  union { T t; NT nt; PT pt; } data;
};

typedef struct TNT_s TNT;
typedef struct Rule_s Rule;
typedef struct Rule_group_s Rule_group;

NT rg_left(CCR_MOD(Rule_group *) rg);
NUM rg_len(CCR_MOD(Rule_group *) rg);
List *rg_right(CCR_MOD(Rule_group *) rg);
List *rg_nth(CCR_MOD(Rule_group *) rg, NUM n);

/* A grammar is more than a list of rules: it should include a
   correspondance to associate the names of non-terminals with
   unsigned integers.  Basically, this means it should hold a list of
   strings as well. */
typedef struct Grammar_s Grammar;

/* G is expected to be allocated before this function.

   NAMES is a list of CPA pointers. */
BOOL
build_grammar(Grammar *g, const List *rules,
              CC_MOD(List *) names, CC_MOD(List *) predicates);

/* This accepts one and only one optional argument, which is of type
   either T, NT, or PT, depending on TYPE. */
TNT *new_tnt(TNT_TYPE type, ...);

/* An array of code-points. */

struct cpa_s {
  NUM *array;
  UNUM size;
};

typedef struct cpa_s cpa;

NUM
find_in_cpa_list(CCR_MOD(NUM *) string, NUM size,
                 CCR_MOD(List *) list);

/* For debugging purposes, we need to print out rules to examine their
   values. */

/* assume element is a cpa pointer */
void print_name(void *element);

void print_tnt(void *element);

void print_rule(void *r);

/* This will generate errors if G is not a list of rules. */
void print_grammar(CC_MOD(Grammar *) g);

/* constructors */

/* FORMAT specifies the types of TNTs.  It is a string of t and n's.
   FORMAT_LEN is the length of FORMAT, excluding the terminating null
   byte.

   Upon failure NULL is returned. */
List *new_tnt_string(char *format, int format_len, ...);

TNT *new_tnt_pointer(size_t size);

/* RIGHT should be created by new_tnt_string or something alike.

   If RIGHT is NULL, NULL is returned. */
Rule *new_rule(NT left, CC_MOD(List *) right);

Grammar *new_grammar();

/* destructors */
void destroy_rule(void *rule, int flag);
void destroy_rule_and_free_all(void *rule);
void destroy_rule_and_free_first(void *rule);
void destroy_rule_no_free(void *rule);
void destroy_cpa_and_free_all(void *element);
void destroy_grammar(void *grammar, int flag);

NUM grammar_left_len(CCR_MOD(Grammar *)g);

List *grammar_names(CCR_MOD(Grammar *)g);

/* look up a rule */
Rule_group *grammar_rule(CCR_MOD(Grammar *) g, NT nt);

/* lookup a predicate */
PTD *grammar_ptd(CCR_MOD(Grammar *) g, PT pt);

/* NTS should be already allocated to be an array of BOOLs.  The size
   of this array should be the same as the length of the array of
   non-terminals of G.  If any of the above two conditions is not
   satisfied, the behaviour is undefined.

   After this function returns, the element corresponding to a
   non-terminal X is non-zero if and only if X has epsilon rules; that
   is to say, X can produce the empty string. */
void epsilon_nts(CC_MOD(Grammar *) g, BOOL * const restrict nts);

/* TERMINAL_HTS and PREDICATE_HTS are supposed to be already allocated
   to have enough space to contain an array of hash tables of the same
   length as the array of non-terminals.  Each element in the array
   should be hash tables that are already created.  If any of the
   above conditions is not satisfied, the behaviour is undefined.

   TERMINAL_HTS contain mappings from non-terminals to sets of
   terminals.  A terminal belongs to the hash table corresponding to
   the non-terminal X in TERMINAL_HTS if and only if that terminal
   belongs to the first set of X.  Mutatis mutandis for PREDICATE_HTS.
   The reason for two arrays is that terminals and predicates have two
   different types in our grammar, and are stored in different arrays,
   so we cannot simply refer to them as a single integer.

   The hash tables that result will contain invalid pointers as values
   after the function returns.

   On error return non-zero. */
BOOL nt_first(CC_MOD(Grammar *) g, CCR_MOD(BOOL *) nts,
              ht *terminal_hts, ht *predicate_hts);

/* Having computed the first sets of non-terminals, we can compute the
   first sets of strings of non-terminals and terminals.
   RESULT_TERMINALS and RESULT_PREDICATES must satisfy similar
   requirements as in nt_first.

   The values are meaningless invalid pointers.  So one should not
   free those pointers afterwards.

   On error return non-zero. */
BOOL tnt_first(CC_MOD(ht *) terminal_hts, CC_MOD(ht *) predicate_hts,
               CCR_MOD(BOOL *) nts, NUM len, CCR_MOD(List *) tnts,
               ht * const restrict result_terminals,
               ht * const restrict result_predicates);

/* Mutatis mutandis to NT_FIRST, but for the follow sets of
   non-terminals.

   Note that the hash table RESULT_TERMINAL_HTS is special here.  It
   not only contains keys for the terminals in the follow set of the
   corresponding nonterminal, but it also contains END_OF_INPUT for
   the nonterminals whose follow set contains the end-of-input marker.

   On error return non-zero. */
BOOL nt_follow(CC_MOD(Grammar *) g, CCR_MOD(BOOL *) nts,
               CC_MOD(ht *)terminal_hts, CC_MOD(ht *)predicate_hts,
               ht * const result_terminals,
               ht * const result_predicates);

/* TODO: Compute which non-terminals are "LL(1)" non-terminals.  With
   this information, we can save the time to insert a process
   descriptor and pop that descriptor, if we know the non-terminal in
   question is "LL(1)".  A non-terminal A is defined as "LL(1)" if its
   different rules have disjoint FIRST sets, and if FIRST(A) is
   disjoint from FOLLOW(A), when A is nullable. */

/* struct that holds information about grammars */

typedef struct grammar_info_s grammar_info;

struct grammar_info_s {
  /* the grammar itself */
  Grammar *g;
  /* the array about epsilon non-terminals */
  BOOL *nts;
  /* the array of hash tables of first (premier) terminals */
  ht *pts;
  /* the array of hash tables of first (premier) predicate
     terminals */
  ht *pps;
  /* the array of hash tables of follow (suivant) terminals */
  ht *sts;
  /* the array of hash tables of follow (suivant) predicate
     terminals */
  ht *sps;
};

/* convenient macro */

#define NEW_CPA(X, ARRAY, SIZE) do {            \
    X = MYALLOC(cpa, 1);                        \
    X->array = ARRAY;                           \
    X->size = SIZE;                             \
  } while (0)

#endif