1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
|
#ifndef CRF_H
#define CRF_H
#include "tuple.h"
#include "grammar.h"
#include "ht.h"
/* This file implements some support functions for the Clustered
Non-terminal Parsing algorithm. See the following paper for
details:
Derivation representation using binary subtree sets
by
Elizabeth Scott, Adrian Johnstone and L.Thomas van Binsbergen. */
/* First we implement a "Call Return Forest".
A CRF has two types of nodes.
The first type is the non-leaf nodes. It is labelled as
(X, i),
where X is a non-terminal and i is an input position.
The second type is the leaf nodes. It is labelled as
(X, a, b, i),
where X is a non-terminal, a is an index of a rule in the array of
rules corresponding to X, b is the length of a prefix of the
right-hand side of the rule corresponding to a, and i is an input
position. The triple (X, a, b) is termed a "grammar slot" in the
parlance.
We need constant look-up for both types of nodes. So we implement
these using hash tables.
For the first type, we implement these as a hash table, whose keys
are those (X, i) in the set. The values are described below.
Actually we don't need those second type nodes. What we care about
are the edges between the two types of nodes. So we simply don't
store the second type nodes at all. We only store edges, or more
precisely, edge labels. Since we need to know if an edge exists
efficiently, we implement the edges as a hash table. (This seems
to be termed a "disctionary of keys" in computer science jargon.)
So the values of the hash table of the first type node are hash
tables whose keys are (X, a, b, i) that are in the edge set of the
first type node, and whose values are meaningless. */
/* Now, with the newly written library tuple.h, we implement this in a
different way. We first observe that we don't actually need those
nodes. What matters is the edges, the ability to know if an edge
exists between two given nodes, and to loop through the edges
starting from a given node. Then we record the existence of the
edge between (X, i) and (Y, a, b, j) by storing the tuple
(X, i, Y, a, b, j).
The only thing that is troublesome is to loop through the edges
from a given node. So we also observe that an edge is possible
only if j <= i. This ensures that we don't search impossible
edges. Hence the worst-case time complexity of the overall
algorithm is preserved (for every possibility, there really might
exist an edge indeed). The rest is premature optimization, I have
to proceed first. :P */
typedef struct crf_s crf;
crf *new_crf(Grammar *g, NUM input_len);
/* on error return non-zero */
BOOL crf_add_node(crf *crfp, pair2 node);
HP_ATTR BOOL crf_find_node(crf *crfp, pair2 node);
HP_ATTR BOOL crf_find_edge(crf *crfp, pair2 source, pair4 label);
BOOL crf_add_edge(crf * const restrict crfp, pair2 source,
pair4 label);
void destroy_crf(crf *crfp);
void crf_print(crf *crfp);
H_ATTR void crf_map(crf *crfp, pair2 label, map_pair6_t fn);
/* We also need to implement the set of pending actions. These are
triples of the form:
(X, k, j).
Since we need to find all j such that (X, k, j) belongs to the set,
we implement the set as a hash table whose keys are (X, k) and
whose values are those j such that (X, k, j) belongs to the set.
Precisely speaking, the values are hash tables whose keys are those
j and whose values are meaningless non-zero values. */
/* With the new library, this is now implemented as a "luple3". */
/* Set of Pending Actions */
typedef struct spa_s spa;
spa *new_spa(Grammar *g, NUM input_len);
void destroy_spa(spa *spap);
/* On error return non-zero */
BOOL spa_insert(spa *spap, pair3 label);
/* Find if a triple belongs to the set */
P_ATTR BOOL spa_belong(spa *spap, pair3 label);
H_ATTR void spa_map(spa *spap, pair2 label, map_pair3_t fn);
/* We also implement "process descriptors" for the CNP algorithm. Since
a process descriptor is of the form
(X := beta . gamma, b, j, k),
where X := beta gamma is a rule and b is the length of beta, we can
represent a process descriptor as
(X, a, b, j, k).
However, we shall notice that there is a natural "grading" on the
process descriptors. The k-th grading consists of the descriptors
with the last element in the tuple equal to k. In the clustered
nonterminal parsing algorithm, the process descriptors that are
added as a consequence of dealing with a process descriptor of
grading k must be of grade greater than or equal to k. This means
that if all process descriptors of grade k have been processed and
the descriptors awaiting to be processed all have grades > k, then
we won't see any process descriptors of grade <= k again. So we
can actually keep an array of process descriptors that consist of
descriptors of degree k. This way we don't need to store the k in
the descriptor. Therefore a process descriptor is represented as
(X, a, i, j).
Since we have no other requirements than almost constant-time
lookup, we simply represent the set of descriptors using a hash
table with keys (X, a, i, j).
The main benefit of this is that after we find that all process
descriptors in R_k have been processed, we can free those
descriptors in U_k. I think this is a good way to lower the space
requirements of this algorithm. */
/* With the new library "tuple.h", we represent a process descriptor
as a "luple5". To implement grading, we store
(X, a, b, j, k)
as
(k, X, a, b, j).
Note that the procr set is still an array of lists, since we need
to pop elements from procr efficiently. */
typedef pair4 prodecor;
/* The set named R in the paper */
typedef struct procr_s procr;
/* The set named U in the paper */
typedef struct procu_s procu;
/* constructor */
procr *new_procr(Grammar *g, NUM input_len, BOOL *errorp);
procu *new_procu(Grammar *g, NUM input_len, BOOL *errorp);
/* insert */
/* the function dscAdd in the paper */
BOOL desc_add(procr *pr, procu *pu, NUM grade, prodecor dec);
/* pop */
/* gradep will be set to the grade */
/* if all descriptors of grade k have been poped out, then the
corresponding slot in pu will be freed automatically. */
/* if no more elements are left, return NULL */
prodecor *pop_procr(procr *pr, procu *pu, NUM *gradep);
void print_procr(procr *pr);
/* destructor */
void destroy_procr(procr *pr);
void destroy_procu(procu *pu);
#endif
|