summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/bsr.h2
-rw-r--r--src/cnp.c362
-rw-r--r--src/cnp.h144
-rw-r--r--src/crf.c3
-rw-r--r--src/crf.h43
-rw-r--r--src/test/check_cnp.c103
6 files changed, 610 insertions, 47 deletions
diff --git a/src/bsr.h b/src/bsr.h
index b92e20e..6c5e17c 100644
--- a/src/bsr.h
+++ b/src/bsr.h
@@ -83,7 +83,7 @@ bsr *new_bsr(BOOL * const restrict errorp, Grammar *g);
data is stored. */
void destroy_bsr(bsr * const restrict bsrp);
-/* the function in the paper */
+/* the function bsrAdd in the paper */
/* X = non-terminal index, a = index of alternate, c = length of
prefix */
diff --git a/src/cnp.c b/src/cnp.c
new file mode 100644
index 0000000..fe8212e
--- /dev/null
+++ b/src/cnp.c
@@ -0,0 +1,362 @@
+#include <stdio.h>
+#include <string.h>
+#include "util.h"
+#include "list.h"
+#include "cnp.h"
+
+struct crf_s {
+ ht *table;
+};
+
+crf *
+new_crf()
+{
+ crf *crfp = NULL;
+ SAFE_MALLOC(crf, crfp, 1, goto cleanup;);
+
+ crfp->table = new_ht2(HT_INIT_CAP);
+
+ if (crfp->table == NULL) goto cleanup;
+
+ return crfp;
+
+ cleanup:
+
+ if (crfp) free(crfp);
+
+ return NULL;
+}
+
+BOOL
+crf_add_node(crf * const restrict crfp, pair2 node)
+{
+ if (ht_find(crfp->table, &node)) return 0;
+
+ pair2 *p2 = NULL;
+
+ NEW_P2(p2, node.x, node.y, return 1;);
+
+ ht *value_table = new_ht4(HT_INIT_CAP);
+
+ if (value_table == NULL) {
+ free(p2);
+ return 1;
+ }
+
+ if (ht_insert(crfp->table, p2, value_table)) {
+ fleprintf0("Fail to insert\n");
+ destroy_ht(value_table, DESTROY_NONE_SELF);
+ free(p2);
+ return 1;
+ }
+
+ return 0;
+}
+
+ht *
+crf_find_node(crf * const restrict crfp, pair2 node)
+{
+ return ht_find(crfp->table, &node);
+}
+
+BOOL
+crf_add_edge(crf * const restrict crfp, pair2 source, pair4 label)
+{
+ if (ht_find(crfp->table, &source) == NULL) {
+ fleprintf0("add the node before adding its edges\n");
+ return 1;
+ }
+
+ if (ht_find(ht_find(crfp->table, &source), &label)) return 0;
+
+ pair4 *p = NULL;
+
+ NEW_P4(p, label.x, label.y, label.z, label.w, return 1;);
+
+ if (ht_insert(ht_find(crfp->table, &source), p, (void*)1)) {
+ fleprintf0("Fail to insert\n");
+ free(p);
+ return 1;
+ }
+
+ return 0;
+}
+
+void
+destroy_crf(crf * const restrict crfp)
+{
+
+#define SIZE (ht_size(crfp->table))
+#define VALUES ((ht**) ht_values(crfp->table))
+
+ for (NUM i = 0; i < SIZE;)
+ destroy_ht(*(VALUES+i++), DESTROY_KEY_SELF);
+
+ destroy_ht(crfp->table, DESTROY_KEY_SELF);
+
+ free(crfp);
+
+#undef SIZE
+#undef VALUES
+
+}
+
+typedef struct procr_array_element_s procr_array_element;
+
+/* The list at index k is the set R_k. */
+struct procr_array_element_s {
+ BOOL initialized;
+ List *ls;
+};
+
+struct procr_s {
+ /* input length */
+ NUM len;
+ /* an array of lists */
+ procr_array_element *array;
+ /* minimal grade that is non-empty */
+ NUM mg;
+};
+
+typedef struct procu_array_element_s procu_array_element;
+
+/* The table at index k represents the set U_k. Its keys are those
+ (X, a, i, j) such that (X, a, i, j, k) belongs to U_k. */
+struct procu_array_element_s {
+ BOOL initialized;
+ ht *table;
+};
+
+struct procu_s {
+ /* input length */
+ NUM len;
+ /* an array of hash tables */
+ procu_array_element *array;
+};
+
+procr *
+new_procr(NUM size, BOOL * const restrict errorp)
+{
+ *errorp = 0;
+
+ procr *pr = NULL;
+
+ SAFE_MALLOC(procr, pr, 1, goto cleanup;);
+
+ pr->len = size;
+ pr->array = NULL;
+
+ SAFE_MALLOC(procr_array_element, pr->array,
+ sizeof(procr_array_element) * size,
+ goto cleanup;);
+
+ memset(pr->array, 0, sizeof(procr_array_element) * size);
+
+ goto success;
+
+ cleanup:
+
+ *errorp = 1;
+
+ if (pr && pr->array) free(pr->array);
+
+ if (pr) free(pr);
+
+ success:
+
+ return pr;
+}
+
+procu *
+new_procu(NUM size, BOOL * const restrict errorp)
+{
+ *errorp = 0;
+
+ procu *pu = NULL;
+
+ SAFE_MALLOC(procu, pu, 1, goto cleanup;);
+
+ pu->len = size;
+ pu->array = NULL;
+
+ SAFE_MALLOC(procu_array_element, pu->array,
+ sizeof(procu_array_element) * size,
+ goto cleanup;);
+
+ memset(pu->array, 0, sizeof(procu_array_element) * size);
+
+ goto success;
+
+ cleanup:
+
+ *errorp = 1;
+
+ if (pu && pu->array) free(pu->array);
+
+ if (pu) free(pu);
+
+ success:
+
+ return pu;
+}
+
+void
+destroy_procr(procr * const restrict pr)
+{
+ for (NUM i = 0; i < pr->len; i++)
+ if ((pr->array+i)->initialized)
+ destroy_list((pr->array+i)->ls, 1);
+
+ free(pr->array);
+
+ free(pr);
+}
+
+void
+destroy_procu(procu * const restrict pu)
+{
+ for (NUM i = 0; i < pu->len; i++)
+ if ((pu->array+i)->initialized)
+ destroy_ht((pu->array+i)->table, DESTROY_KEY_SELF);
+
+ free(pu->array);
+
+ free(pu);
+}
+
+BOOL
+desc_add(procr * const restrict pr, procu * const restrict pu,
+ NUM grade, prodecor dec)
+{
+ if (grade < 0 || grade >= pu->len) {
+ fleprintf("Invalid grade: %ld\n", grade);
+ goto cleanup;
+ }
+
+ pair4 *p4 = NULL;
+
+#define HELE (pu->array+grade)
+#define LELE (pr->array+grade)
+
+ if (HELE->initialized) {
+ /* If HELE is initialized then LELE should also be initialized. */
+ if (ht_find(HELE->table, &dec) != NULL) goto success;
+
+ NEW_P4(p4, dec.x, dec.y, dec.z, dec.w, goto cleanup;);
+
+ if (ht_insert(HELE->table, p4, (void*)1)) {
+ fleprintf0("Fail to insert\n");
+ goto cleanup;
+ }
+
+ NEW_P4(p4, dec.x, dec.y, dec.z, dec.w,
+ ht_delete(HELE->table, &dec, DELETE_KEY); goto cleanup;);
+
+ if (add_to_list(LELE->ls, p4)) {
+ fleprintf0("Fail to add to list\n");
+ ht_delete(HELE->table, &dec, DELETE_KEY);
+ goto cleanup;
+ }
+
+ goto success;
+ }
+
+ HELE->table = new_ht4(HT_INIT_CAP);
+
+ if (HELE->table == NULL) {
+ fleprintf0("Fail to have new hash table\n");
+ goto cleanup;
+ }
+
+ HELE->initialized = 1;
+
+ NEW_P4(p4, dec.x, dec.y, dec.z, dec.w,
+ destroy_ht(HELE->table, DESTROY_KEY_SELF);
+ HELE->initialized = 0;
+ goto cleanup;);
+
+ if (ht_insert(HELE->table, p4, (void*)1)) {
+ fleprintf0("Fail to insert\n");
+ destroy_ht(HELE->table, DESTROY_KEY_SELF);
+ HELE->initialized = 0;
+ goto cleanup;
+ }
+
+ LELE->ls = new_list();
+
+ if (LELE->ls == NULL) {
+ fleprintf0("Fail to have new list\n");
+ destroy_ht(HELE->table, DESTROY_KEY_SELF);
+ HELE->initialized = 0;
+ goto cleanup;
+ }
+
+ LELE->initialized = 1;
+
+ NEW_P4(p4, dec.x, dec.y, dec.z, dec.w,
+ destroy_ht(HELE->table, DESTROY_KEY_SELF);
+ HELE->initialized = 0;
+ destroy_list(LELE->ls, 1);
+ LELE->initialized = 0;
+ goto cleanup;);
+
+ if (add_to_list(LELE->ls, p4)) {
+ fleprintf0("Fail to add to list\n");
+ destroy_ht(HELE->table, DESTROY_KEY_SELF);
+ HELE->initialized = 0;
+ destroy_list(LELE->ls, 1);
+ LELE->initialized = 0;
+ goto cleanup;
+ }
+
+#undef HELE
+#undef LELE
+
+ goto success;
+
+ cleanup:
+
+ return 1;
+
+ success:
+
+ return 0;
+}
+
+prodecor *
+pop_procr(procr * pr, procu * pu, NUM *gradep)
+{
+ prodecor *result = NULL;
+
+ if (pr->mg >= pr->len) goto success;
+
+#define ELEMENT (pr->array+pr->mg)
+
+ if (!(ELEMENT->initialized)) goto reset_mg;
+
+ *gradep = pr->mg;
+
+ result = pop_from_list(ELEMENT->ls);
+
+ if (list_length(ELEMENT->ls) == 0) {
+ destroy_ht((pu->array+pr->mg)->table, DESTROY_KEY_SELF);
+ (pu->array+pr->mg)->initialized = 0;
+ (pu->array+pr->mg)->table = NULL;
+
+ destroy_list(ELEMENT->ls, 1);
+ ELEMENT->initialized = 0;
+ ELEMENT->ls = NULL;
+
+ /* REVIEW: Maybe we can just increment the minimal grade, as we
+ won't jump over two steps? */
+ reset_mg:
+ while (++(pr->mg) < pr->len) {
+ if (ELEMENT->initialized) break;
+ }
+ }
+
+#undef ELEMENT
+
+ success:
+ return result;
+}
diff --git a/src/cnp.h b/src/cnp.h
new file mode 100644
index 0000000..c7d3965
--- /dev/null
+++ b/src/cnp.h
@@ -0,0 +1,144 @@
+#ifndef CNP_H
+#define CNP_H
+#include "ht.h"
+
+/* This file implements the Clustered Non-terminal Parsing algorithm.
+ See the following paper for details:
+
+ Derivation representation using binary subtree sets
+
+ by
+
+ Elizabeth Scott, Adrian Johnstone and L.Thomas van Binsbergen. */
+
+/* First we implement a "Call Return Forest".
+
+ A CRF has two types of nodes.
+
+ The first type is the non-leaf nodes. It is labelled as
+
+ (X, i),
+
+ where X is a non-terminal and i is an input position.
+
+ The second type is the leaf nodes. It is labelled as
+
+ (X, a, b, i),
+
+ where X is a non-terminal, a is an index of a rule in the array of
+ rules corresponding to X, b is the length of a prefix of the
+ right-hand side of the rule corresponding to a, and i is an input
+ position. The triple (X, a, b) is termed a "grammar slot" in the
+ parlance.
+
+ We need constant look-up for both types of nodes. So we implement
+ these using hash tables.
+
+ For the first type, we implement these as a hash table, whose keys
+ are those (X, i) in the set. The values are described below.
+
+ Actually we don't need those second type nodes. What we care about
+ are the edges between the two types of nodes. So we simply don't
+ store the second type nodes at all. We only store edges, or more
+ precisely, edge labels. Since we need to know if an edge exists
+ efficiently, we implement the edges as a hash table. (This seems
+ to be termed a "disctionary of keys" in computer science jargon.)
+ So the values of the hash table of the first type node are hash
+ tables whose keys are (X, a, b, i) that are in the edge set of the
+ first type node, and whose values are meaningless. */
+
+typedef struct crf_s crf;
+
+crf *new_crf();
+
+/* on error return non-zero */
+BOOL crf_add_node(crf * const restrict crfp, pair2 node);
+
+/* if not found return NULL.
+
+ if found but the node has no edges return an empty hash table. */
+ht *crf_find_node(crf * const restrict crfp, pair2 node);
+
+BOOL crf_add_edge(crf * const restrict crfp, pair2 source,
+ pair4 label);
+
+void destroy_crf(crf * const restrict crfp);
+
+/* We also need to implement the set of pending actions. These are
+ triples of the form:
+
+ (X, k, j).
+
+ Since we need to find all j such that (X, k, j) belongs to the set,
+ we implement the set as a hash table whose keys are (X, k) and
+ whose values are those j such that (X, k, j) belongs to the set. */
+
+/* TODO: Pending actions */
+
+/* We also implement "process descriptors" for the CNP algorithm. Since
+ a process descriptor is of the form
+
+ (X := beta . gamma, b, j, k),
+
+ where X := beta gamma is a rule and b is the length of beta, we can
+ represent a process descriptor as
+
+ (X, a, b, j, k).
+
+ However, we shall notice that there is a natural "grading" on the
+ process descriptors. The k-th grading consists of the descriptors
+ with the last element in the tuple equal to k. In the clustered
+ nonterminal parsing algorithm, the process descriptors that are
+ added as a consequence of dealing with a process descriptor of
+ grading k must be of grade greater than or equal to k. This means
+ that if all process descriptors of grade k have been processed and
+ the descriptors awaiting to be processed all have grades > k, then
+ we won't see any process descriptors of grade <= k again. So we
+ can actually keep an array of process descriptors that consist of
+ descriptors of degree k. This way we don't need to store the k in
+ the descriptor. Therefore a process descriptor is represented as
+
+ (X, a, i, j).
+
+ Since we have no other requirements than almost constant-time
+ lookup, we simply represent the set of descriptors using a hash
+ table with keys (X, a, i, j).
+
+ The main benefit of this is that after we find that all process
+ descriptors in R_k have been processed, we can free those
+ descriptors in U_k. I think this is a good way to lower the space
+ requirements of this algorithm. */
+
+typedef pair4 prodecor;
+
+/* The set named R in the paper */
+typedef struct procr_s procr;
+
+/* The set named U in the paper */
+typedef struct procu_s procu;
+
+/* constructor */
+procr *new_procr(NUM size, BOOL * const restrict errorp);
+procu *new_procu(NUM size, BOOL * const restrict errorp);
+
+/* insert */
+
+/* the function dscAdd in the paper */
+BOOL desc_add(procr * const restrict pr, procu * const restrict pu,
+ NUM grade, prodecor dec);
+
+/* pop */
+
+/* gradep will be set to the grade */
+
+/* if all descriptors of grade k have been poped out, then the
+ corresponding slot in pu will be freed automatically. */
+
+/* if no more elements are left, return NULL */
+prodecor *pop_procr(procr * pr, procu * pu, NUM *gradep);
+
+/* destructor */
+void destroy_procr(procr * const restrict pr);
+void destroy_procu(procu * const restrict pu);
+
+#endif
diff --git a/src/crf.c b/src/crf.c
deleted file mode 100644
index 6e215ee..0000000
--- a/src/crf.c
+++ /dev/null
@@ -1,3 +0,0 @@
-#include "crf.h"
-
-int place(int x) { return x; }
diff --git a/src/crf.h b/src/crf.h
deleted file mode 100644
index 9f86178..0000000
--- a/src/crf.h
+++ /dev/null
@@ -1,43 +0,0 @@
-#ifndef CRF_H
-#define CRF_H
-
-/* This file implements the "Call Return Forest" for use in a GLL
- Clustered Non-terminal Parser. See the following paper for
- details:
-
- Derivation representation using binary subtree sets
-
- by
-
- Elizabeth Scott, Adrian Johnstone and L.Thomas van Binsbergen. */
-
-/* A CRF has two types of nodes.
-
- The first type is the non-leaf nodes. It is labelled as
-
- (X, i),
-
- where X is a non-terminal and i is an input position.
-
- The second type is the leaf nodes. It is labelled as
-
- (X, a, b, i),
-
- where X is a non-terminal, a is an index of a rule in the array of
- rules corresponding to X, b is the length of a prefix of the
- right-hand side of the rule corresponding to a, and i is an input
- position. The triple (X, a, b) is termed a "grammar slot" in the
- parlance.
-
- We don't need constant look-up for the second type of nodes. So we
- implement these as a simple dynamic array. We do need constant
- look-up for the first type of nodes. Thus we implement these as an
- array of hash tables, where the index of the array represents the
- index of the non-terminals, and the hash-table corresponding to a
- non-terminal X contains the input positions i such that (X, i) is a
- first-type node. Additionally, the first-type nodes should also
- carry around an index into the array of second-type nodes. */
-
-int place(int x);
-
-#endif
diff --git a/src/test/check_cnp.c b/src/test/check_cnp.c
new file mode 100644
index 0000000..2fbaac7
--- /dev/null
+++ b/src/test/check_cnp.c
@@ -0,0 +1,103 @@
+#include <stdio.h>
+#include "../util.h"
+#include "../cnp.h"
+
+int
+main(int UNUSED argc, char ** UNUSED argv)
+{
+ procr *pr = NULL;
+ procu *pu = NULL;
+
+ prodecor *prod = NULL;
+
+ crf *crfp = new_crf();
+ pair2 p = { 0 };
+ p.x = 0;
+ p.y = 1;
+
+ if (crf_find_node(crfp, p) == NULL) {
+ if (crf_add_node(crfp, p)) {
+ fleprintf0("fail to add node\n");
+ goto cleanup;
+ } else {
+ printf("Successfully add node (0, 1)\n");
+ }
+ }
+
+ if (crf_find_node(crfp, p) == NULL) {
+ fleprintf0("Fail to find node\n");
+ goto cleanup;
+ } else {
+ printf("Successfully find node for (0, 1)\n");
+ }
+
+ pair4 p4 = { 0 };
+
+ if (crf_add_edge(crfp, p, p4)) {
+ fleprintf0("Fail to add edge\n");
+ goto cleanup;
+ } else {
+ printf("Successfully add edge from (0, 1) to zero\n");
+ }
+
+ if (crf_find_node(crfp, p)) {
+ printf("Printing edges for (0, 1)...\n");
+
+#define TABLE (crf_find_node(crfp, p))
+
+ for (NUM i = 0; i < ht_size(TABLE); i++) {
+#define KEY (*((pair4**) ht_keys(TABLE)+i))
+ printf("(%ld, %ld, %ld, %ld)",
+ KEY->x, KEY->y, KEY->z, KEY->w);
+ if (i+1<ht_size(TABLE)) printf(", ");
+ else printf("\n");
+ }
+ }
+
+#undef TABLE
+#undef KEY
+
+ BOOL error = 0;
+
+ pr = new_procr(10, &error);
+ pu = new_procu(10, &error);
+
+ p4 = (pair4) { 0 };
+
+ p4.x = 1;
+
+ if (desc_add(pr, pu, 2, p4)) {
+ fleprintf0("Fail to add descriptor\n");
+ goto cleanup;
+ }
+
+ NUM grade = 0;
+
+ prod = pop_procr(pr, pu, &grade);
+
+ if (prod == NULL) {
+ printf("The first pop failed. "
+ "But maybe it occurs due to some misses in the minimal "
+ "weight.\n");
+ printf("So we shall pop again\n");
+ prod = pop_procr(pr, pu, &grade);
+
+ if (prod == NULL) {
+ fleprintf0("Fail to pop\n");
+ goto cleanup;
+ }
+ }
+
+ printf("popped a process descriptor: (%ld, %ld, %ld, %ld, %ld)\n",
+ prod->x, prod->y, prod->z, prod->w, grade);
+
+ cleanup:
+
+ destroy_crf(crfp);
+
+ if (pr) destroy_procr(pr);
+ if (pu) destroy_procu(pu);
+ if (prod) free(prod);
+
+ return 0;
+}