src/cnp.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144

#ifndef CNP_H
#define CNP_H
#include "ht.h"

/* This file implements the Clustered Non-terminal Parsing algorithm.
   See the following paper for details:

   Derivation representation using binary subtree sets

   by

   Elizabeth Scott, Adrian Johnstone and L.Thomas van Binsbergen. */

/* First we implement a "Call Return Forest".

   A CRF has two types of nodes.

   The first type is the non-leaf nodes.  It is labelled as

   (X, i),

   where X is a non-terminal and i is an input position.

   The second type is the leaf nodes.  It is labelled as

   (X, a, b, i),

   where X is a non-terminal, a is an index of a rule in the array of
   rules corresponding to X, b is the length of a prefix of the
   right-hand side of the rule corresponding to a, and i is an input
   position.  The triple (X, a, b) is termed a "grammar slot" in the
   parlance.

   We need constant look-up for both types of nodes.  So we implement
   these using hash tables.

   For the first type, we implement these as a hash table, whose keys
   are those (X, i) in the set.  The values are described below.

   Actually we don't need those second type nodes.  What we care about
   are the edges between the two types of nodes.  So we simply don't
   store the second type nodes at all.  We only store edges, or more
   precisely, edge labels.  Since we need to know if an edge exists
   efficiently, we implement the edges as a hash table.  (This seems
   to be termed a "disctionary of keys" in computer science jargon.)
   So the values of the hash table of the first type node are hash
   tables whose keys are (X, a, b, i) that are in the edge set of the
   first type node, and whose values are meaningless. */

typedef struct crf_s crf;

crf *new_crf();

/* on error return non-zero */
BOOL crf_add_node(crf * const restrict crfp, pair2 node);

/* if not found return NULL.

   if found but the node has no edges return an empty hash table. */
ht *crf_find_node(crf * const restrict crfp, pair2 node);

BOOL crf_add_edge(crf * const restrict crfp, pair2 source,
                  pair4 label);

void destroy_crf(crf * const restrict crfp);

/* We also need to implement the set of pending actions.  These are
   triples of the form:

   (X, k, j).

   Since we need to find all j such that (X, k, j) belongs to the set,
   we implement the set as a hash table whose keys are (X, k) and
   whose values are those j such that (X, k, j) belongs to the set. */

/* TODO: Pending actions */

/* We also implement "process descriptors" for the CNP algorithm.  Since
   a process descriptor is of the form

   (X := beta . gamma, b, j, k),

   where X := beta gamma is a rule and b is the length of beta, we can
   represent a process descriptor as

   (X, a, b, j, k).

   However, we shall notice that there is a natural "grading" on the
   process descriptors.  The k-th grading consists of the descriptors
   with the last element in the tuple equal to k.  In the clustered
   nonterminal parsing algorithm, the process descriptors that are
   added as a consequence of dealing with a process descriptor of
   grading k must be of grade greater than or equal to k.  This means
   that if all process descriptors of grade k have been processed and
   the descriptors awaiting to be processed all have grades > k, then
   we won't see any process descriptors of grade <= k again.  So we
   can actually keep an array of process descriptors that consist of
   descriptors of degree k.  This way we don't need to store the k in
   the descriptor.  Therefore a process descriptor is represented as

   (X, a, i, j).

   Since we have no other requirements than almost constant-time
   lookup, we simply represent the set of descriptors using a hash
   table with keys (X, a, i, j).

   The main benefit of this is that after we find that all process
   descriptors in R_k have been processed, we can free those
   descriptors in U_k.  I think this is a good way to lower the space
   requirements of this algorithm. */

typedef pair4 prodecor;

/* The set named R in the paper */
typedef struct procr_s procr;

/* The set named U in the paper */
typedef struct procu_s procu;

/* constructor */
procr *new_procr(NUM size, BOOL * const restrict errorp);
procu *new_procu(NUM size, BOOL * const restrict errorp);

/* insert */

/* the function dscAdd in the paper */
BOOL desc_add(procr * const restrict pr, procu * const restrict pu,
              NUM grade, prodecor dec);

/* pop */

/* gradep will be set to the grade */

/* if all descriptors of grade k have been poped out, then the
   corresponding slot in pu will be freed automatically. */

/* if no more elements are left, return NULL */
prodecor *pop_procr(procr * pr, procu * pu, NUM *gradep);

/* destructor */
void destroy_procr(procr * const restrict pr);
void destroy_procu(procu * const restrict pu);

#endif