summaryrefslogtreecommitdiff
path: root/src/cnp.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/cnp.h')
-rw-r--r--src/cnp.h144
1 files changed, 144 insertions, 0 deletions
diff --git a/src/cnp.h b/src/cnp.h
new file mode 100644
index 0000000..c7d3965
--- /dev/null
+++ b/src/cnp.h
@@ -0,0 +1,144 @@
+#ifndef CNP_H
+#define CNP_H
+#include "ht.h"
+
+/* This file implements the Clustered Non-terminal Parsing algorithm.
+ See the following paper for details:
+
+ Derivation representation using binary subtree sets
+
+ by
+
+ Elizabeth Scott, Adrian Johnstone and L.Thomas van Binsbergen. */
+
+/* First we implement a "Call Return Forest".
+
+ A CRF has two types of nodes.
+
+ The first type is the non-leaf nodes. It is labelled as
+
+ (X, i),
+
+ where X is a non-terminal and i is an input position.
+
+ The second type is the leaf nodes. It is labelled as
+
+ (X, a, b, i),
+
+ where X is a non-terminal, a is an index of a rule in the array of
+ rules corresponding to X, b is the length of a prefix of the
+ right-hand side of the rule corresponding to a, and i is an input
+ position. The triple (X, a, b) is termed a "grammar slot" in the
+ parlance.
+
+ We need constant look-up for both types of nodes. So we implement
+ these using hash tables.
+
+ For the first type, we implement these as a hash table, whose keys
+ are those (X, i) in the set. The values are described below.
+
+ Actually we don't need those second type nodes. What we care about
+ are the edges between the two types of nodes. So we simply don't
+ store the second type nodes at all. We only store edges, or more
+ precisely, edge labels. Since we need to know if an edge exists
+ efficiently, we implement the edges as a hash table. (This seems
+ to be termed a "disctionary of keys" in computer science jargon.)
+ So the values of the hash table of the first type node are hash
+ tables whose keys are (X, a, b, i) that are in the edge set of the
+ first type node, and whose values are meaningless. */
+
+typedef struct crf_s crf;
+
+crf *new_crf();
+
+/* on error return non-zero */
+BOOL crf_add_node(crf * const restrict crfp, pair2 node);
+
+/* if not found return NULL.
+
+ if found but the node has no edges return an empty hash table. */
+ht *crf_find_node(crf * const restrict crfp, pair2 node);
+
+BOOL crf_add_edge(crf * const restrict crfp, pair2 source,
+ pair4 label);
+
+void destroy_crf(crf * const restrict crfp);
+
+/* We also need to implement the set of pending actions. These are
+ triples of the form:
+
+ (X, k, j).
+
+ Since we need to find all j such that (X, k, j) belongs to the set,
+ we implement the set as a hash table whose keys are (X, k) and
+ whose values are those j such that (X, k, j) belongs to the set. */
+
+/* TODO: Pending actions */
+
+/* We also implement "process descriptors" for the CNP algorithm. Since
+ a process descriptor is of the form
+
+ (X := beta . gamma, b, j, k),
+
+ where X := beta gamma is a rule and b is the length of beta, we can
+ represent a process descriptor as
+
+ (X, a, b, j, k).
+
+ However, we shall notice that there is a natural "grading" on the
+ process descriptors. The k-th grading consists of the descriptors
+ with the last element in the tuple equal to k. In the clustered
+ nonterminal parsing algorithm, the process descriptors that are
+ added as a consequence of dealing with a process descriptor of
+ grading k must be of grade greater than or equal to k. This means
+ that if all process descriptors of grade k have been processed and
+ the descriptors awaiting to be processed all have grades > k, then
+ we won't see any process descriptors of grade <= k again. So we
+ can actually keep an array of process descriptors that consist of
+ descriptors of degree k. This way we don't need to store the k in
+ the descriptor. Therefore a process descriptor is represented as
+
+ (X, a, i, j).
+
+ Since we have no other requirements than almost constant-time
+ lookup, we simply represent the set of descriptors using a hash
+ table with keys (X, a, i, j).
+
+ The main benefit of this is that after we find that all process
+ descriptors in R_k have been processed, we can free those
+ descriptors in U_k. I think this is a good way to lower the space
+ requirements of this algorithm. */
+
+typedef pair4 prodecor;
+
+/* The set named R in the paper */
+typedef struct procr_s procr;
+
+/* The set named U in the paper */
+typedef struct procu_s procu;
+
+/* constructor */
+procr *new_procr(NUM size, BOOL * const restrict errorp);
+procu *new_procu(NUM size, BOOL * const restrict errorp);
+
+/* insert */
+
+/* the function dscAdd in the paper */
+BOOL desc_add(procr * const restrict pr, procu * const restrict pu,
+ NUM grade, prodecor dec);
+
+/* pop */
+
+/* gradep will be set to the grade */
+
+/* if all descriptors of grade k have been poped out, then the
+ corresponding slot in pu will be freed automatically. */
+
+/* if no more elements are left, return NULL */
+prodecor *pop_procr(procr * pr, procu * pu, NUM *gradep);
+
+/* destructor */
+void destroy_procr(procr * const restrict pr);
+void destroy_procu(procu * const restrict pu);
+
+#endif