summaryrefslogtreecommitdiff
path: root/src/crf.h
blob: b9955fe2f02c93a88f9997f81e14836104c8901c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
#ifndef CRF_H
#define CRF_H
#include "tuple.h"
#include "grammar.h"
#include "ht.h"

/* This file implements some support functions for the Clustered
   Non-terminal Parsing algorithm.  See the following paper for
   details:

   Derivation representation using binary subtree sets

   by

   Elizabeth Scott, Adrian Johnstone and L. Thomas van Binsbergen. */

/* First we implement a "Call Return Forest".

   A CRF has two types of nodes.

   The first type is the non-leaf nodes.  It is labelled as

   (X, i),

   where X is a non-terminal and i is an input position.

   The second type is the leaf nodes.  It is labelled as

   (X, a, b, i),

   where X is a non-terminal, a is an index of a rule in the array of
   rules corresponding to X, b is the length of a prefix of the
   right-hand side of the rule corresponding to a, and i is an input
   position.  The triple (X, a, b) is termed a "grammar slot" in the
   parlance.

   We need constant look-up for both types of nodes.  So we implement
   these using hash tables.

   For the first type, we implement these as a hash table, whose keys
   are those (X, i) in the set.  The values are described below.

   Actually we don't need those second type nodes.  What we care about
   are the edges between the two types of nodes.  So we simply don't
   store the second type nodes at all.  We only store edges, or more
   precisely, edge labels.  Since we need to know if an edge exists
   efficiently, we implement the edges as a hash table.  (This seems
   to be termed a "dictionary of keys" in computer science jargon.)
   So the values of the hash table of the first type node are hash
   tables whose keys are (X, a, b, i) that are in the edge set of the
   first type node, and whose values are meaningless. */

/* Now, with the newly written library tuple.h, we implement this in a
   different way.  We first observe that we don't actually need those
   nodes.  What matters is the edges, the ability to know if an edge
   exists between two given nodes, and to loop through the edges
   starting from a given node.  Then we record the existence of the
   edge between (X, i) and (Y, a, b, j) by storing the tuple

   (X, i, Y, a, b, j).

   The only thing that is troublesome is to loop through the edges
   from a given node.  So we also observe that an edge is possible
   only if j <= i.  This ensures that we don't search impossible
   edges.  Hence the worst-case time complexity of the overall
   algorithm is preserved (for every possibility, there really might
   exist an edge indeed).  The rest is premature optimization, I have
   to proceed first.  :P */

/* Opaque handle for a Call Return Forest; see the comments above for
   the chosen representation (edges stored as 6-tuples). */
typedef struct crf_s crf;

/* Allocate a CRF sized for grammar G and an input of length
   INPUT_LEN.  The caller owns the result and must release it with
   destroy_crf.  (Presumably returns NULL on allocation failure --
   confirm in crf.c.) */
crf *new_crf(Grammar *g, NUM input_len);

/* Add the non-leaf node NODE = (X, i) to the forest.
   on error return non-zero */
BOOL crf_add_node(crf *crfp, pair2 node);

/* Return whether the non-leaf node NODE = (X, i) is present. */
HP_ATTR BOOL crf_find_node(crf *crfp, pair2 node);

/* Return whether an edge from SOURCE = (X, i) with leaf label
   LABEL = (Y, a, b, j) has been recorded. */
HP_ATTR BOOL crf_find_edge(crf *crfp, pair2 source, pair4 label);

/* Record an edge from SOURCE = (X, i) to the leaf node described by
   LABEL = (Y, a, b, j).  (Presumably returns non-zero on error, like
   crf_add_node -- confirm in crf.c.) */
BOOL crf_add_edge(crf * const restrict crfp, pair2 source,
                  pair4 label);

/* Free CRFP and everything it owns. */
void destroy_crf(crf *crfp);

/* Print the forest (debugging aid). */
void crf_print(crf *crfp);

/* Apply FN to every stored edge 6-tuple (X, i, Y, a, b, j) whose
   source node matches LABEL = (X, i). */
H_ATTR void crf_map(crf *crfp, pair2 label, map_pair6_t fn);

/* We also need to implement the set of pending actions.  These are
   triples of the form:

   (X, k, j).

   Since we need to find all j such that (X, k, j) belongs to the set,
   we implement the set as a hash table whose keys are (X, k) and
   whose values are those j such that (X, k, j) belongs to the set.
   Precisely speaking, the values are hash tables whose keys are those
   j and whose values are meaningless non-zero values. */

/* With the new library, this is now implemented as a "luple3". */

/* Set of Pending Actions: a set of triples (X, k, j); see the
   comments above for the representation. */
typedef struct spa_s spa;

/* Allocate an SPA sized for grammar G and an input of length
   INPUT_LEN.  The caller owns the result and must release it with
   destroy_spa.  (Presumably returns NULL on allocation failure --
   confirm in crf.c.) */
spa *new_spa(Grammar *g, NUM input_len);

/* Free SPAP and everything it owns. */
void destroy_spa(spa *spap);

/* Insert the triple LABEL = (X, k, j) into the set.
   On error return non-zero */
BOOL spa_insert(spa *spap, pair3 label);

/* Find if a triple belongs to the set */
P_ATTR BOOL spa_belong(spa *spap, pair3 label);

/* Apply FN to every triple in the set whose first two components
   match LABEL = (X, k). */
H_ATTR void spa_map(spa *spap, pair2 label, map_pair3_t fn);

/* We also implement "process descriptors" for the CNP algorithm.  Since
   a process descriptor is of the form

   (X := beta . gamma, b, j, k),

   where X := beta gamma is a rule and b is the length of beta, we can
   represent a process descriptor as

   (X, a, b, j, k).

   However, we shall notice that there is a natural "grading" on the
   process descriptors.  The k-th grading consists of the descriptors
   with the last element in the tuple equal to k.  In the clustered
   nonterminal parsing algorithm, the process descriptors that are
   added as a consequence of dealing with a process descriptor of
   grading k must be of grade greater than or equal to k.  This means
   that if all process descriptors of grade k have been processed and
   the descriptors awaiting to be processed all have grades > k, then
   we won't see any process descriptors of grade <= k again.  So we
   can actually keep an array of process descriptors that consist of
   descriptors of grade k.  This way we don't need to store the k in
   the descriptor.  Therefore a process descriptor is represented as

   (X, a, b, j).

   Since we have no other requirements than almost constant-time
   lookup, we simply represent the set of descriptors using a hash
   table with keys (X, a, b, j).

   The main benefit of this is that after we find that all process
   descriptors in R_k have been processed, we can free those
   descriptors in U_k.  I think this is a good way to lower the space
   requirements of this algorithm. */

/* With the new library "tuple.h", we represent a process descriptor
   as a "luple5".  To implement grading, we store

   (X, a, b, j, k)

   as

   (k, X, a, b, j).

   Note that the procr set is still an array of lists, since we need
   to pop elements from procr efficiently. */

/* A process descriptor (X, a, b, j); the grade k is kept implicitly
   by the containing structures (see the comments above). */
typedef pair4 prodecor;

/* The set named R in the paper: descriptors awaiting processing,
   organised by grade. */
typedef struct procr_s procr;

/* The set named U in the paper: descriptors already added, so each
   descriptor is processed at most once. */
typedef struct procu_s procu;

/* constructors: allocate sets sized for grammar G and an input of
   length INPUT_LEN; on failure *ERRORP is set (presumably non-zero
   -- confirm in crf.c).  Release with destroy_procr/destroy_procu. */
procr *new_procr(Grammar *g, NUM input_len, BOOL *errorp);
procu *new_procu(Grammar *g, NUM input_len, BOOL *errorp);

/* insert */

/* the function dscAdd in the paper: record the descriptor DEC at
   grade GRADE (presumably skipping descriptors already in PU --
   confirm in crf.c). */
BOOL desc_add(procr *pr, procu *pu, NUM grade, prodecor dec);

/* pop */

/* gradep will be set to the grade of the popped descriptor */

/* if all descriptors of grade k have been popped out, then the
   corresponding slot in pu will be freed automatically. */

/* if no more elements are left, return NULL */
prodecor *pop_procr(procr *pr, procu *pu, NUM *gradep);

/* Print PR (debugging aid). */
void print_procr(procr *pr);

/* destructors */
void destroy_procr(procr *pr);
void destroy_procu(procu *pu);

#endif