1 files changed, 144 insertions, 0 deletions
diff --git a/src/cnp.h b/src/cnp.h
new file mode 100644
index 0000000..c7d3965
--- /dev/null
+++ b/src/cnp.h
@@ -0,0 +1,144 @@
+#ifndef CNP_H
+#define CNP_H
+#include "ht.h"
+
+/* This file implements the Clustered Non-terminal Parsing algorithm.
+   See the following paper for details:
+
+   Derivation representation using binary subtree sets
+
+   by
+
+   Elizabeth Scott, Adrian Johnstone and L.Thomas van Binsbergen. */
+
+/* First we implement a "Call Return Forest".
+
+   A CRF has two types of nodes.
+
+   The first type is the non-leaf nodes.  It is labelled as
+
+   (X, i),
+
+   where X is a non-terminal and i is an input position.
+
+   The second type is the leaf nodes.  It is labelled as
+
+   (X, a, b, i),
+
+   where X is a non-terminal, a is an index of a rule in the array of
+   rules corresponding to X, b is the length of a prefix of the
+   right-hand side of the rule corresponding to a, and i is an input
+   position.  The triple (X, a, b) is termed a "grammar slot" in the
+   parlance.
+
+   We need constant look-up for both types of nodes.  So we implement
+   these using hash tables.
+
+   For the first type, we implement these as a hash table, whose keys
+   are those (X, i) in the set.  The values are described below.
+
+   Actually we don't need those second type nodes.  What we care about
+   are the edges between the two types of nodes.  So we simply don't
+   store the second type nodes at all.  We only store edges, or more
+   precisely, edge labels.  Since we need to know if an edge exists
+   efficiently, we implement the edges as a hash table.  (This seems
+   to be termed a "disctionary of keys" in computer science jargon.)
+   So the values of the hash table of the first type node are hash
+   tables whose keys are (X, a, b, i) that are in the edge set of the
+   first type node, and whose values are meaningless. */
+
+typedef struct crf_s crf;
+
+crf *new_crf();
+
+/* on error return non-zero */
+BOOL crf_add_node(crf * const restrict crfp, pair2 node);
+
+/* if not found return NULL.
+
+   if found but the node has no edges return an empty hash table. */
+ht *crf_find_node(crf * const restrict crfp, pair2 node);
+
+BOOL crf_add_edge(crf * const restrict crfp, pair2 source,
+                  pair4 label);
+
+void destroy_crf(crf * const restrict crfp);
+
+/* We also need to implement the set of pending actions.  These are
+   triples of the form:
+
+   (X, k, j).
+
+   Since we need to find all j such that (X, k, j) belongs to the set,
+   we implement the set as a hash table whose keys are (X, k) and
+   whose values are those j such that (X, k, j) belongs to the set. */
+
+/* TODO: Pending actions */
+
+/* We also implement "process descriptors" for the CNP algorithm.  Since
+   a process descriptor is of the form
+
+   (X := beta . gamma, b, j, k),
+
+   where X := beta gamma is a rule and b is the length of beta, we can
+   represent a process descriptor as
+
+   (X, a, b, j, k).
+
+   However, we shall notice that there is a natural "grading" on the
+   process descriptors.  The k-th grading consists of the descriptors
+   with the last element in the tuple equal to k.  In the clustered
+   nonterminal parsing algorithm, the process descriptors that are
+   added as a consequence of dealing with a process descriptor of
+   grading k must be of grade greater than or equal to k.  This means
+   that if all process descriptors of grade k have been processed and
+   the descriptors awaiting to be processed all have grades > k, then
+   we won't see any process descriptors of grade <= k again.  So we
+   can actually keep an array of process descriptors that consist of
+   descriptors of degree k.  This way we don't need to store the k in
+   the descriptor.  Therefore a process descriptor is represented as
+
+   (X, a, i, j).
+
+   Since we have no other requirements than almost constant-time
+   lookup, we simply represent the set of descriptors using a hash
+   table with keys (X, a, i, j).
+
+   The main benefit of this is that after we find that all process
+   descriptors in R_k have been processed, we can free those
+   descriptors in U_k.  I think this is a good way to lower the space
+   requirements of this algorithm. */
+
+typedef pair4 prodecor;
+
+/* The set named R in the paper */
+typedef struct procr_s procr;
+
+/* The set named U in the paper */
+typedef struct procu_s procu;
+
+/* constructor */
+procr *new_procr(NUM size, BOOL * const restrict errorp);
+procu *new_procu(NUM size, BOOL * const restrict errorp);
+
+/* insert */
+
+/* the function dscAdd in the paper */
+BOOL desc_add(procr * const restrict pr, procu * const restrict pu,
+              NUM grade, prodecor dec);
+
+/* pop */
+
+/* gradep will be set to the grade */
+
+/* if all descriptors of grade k have been poped out, then the
+   corresponding slot in pu will be freed automatically. */
+
+/* if no more elements are left, return NULL */
+prodecor *pop_procr(procr * pr, procu * pu, NUM *gradep);
+
+/* destructor */
+void destroy_procr(procr * const restrict pr);
+void destroy_procu(procu * const restrict pu);
+
+#endif