CNP save point

CRF and process descriptors seem to work now. It only remains to implement the set of pending actions before I can work on the driver program.
author: JSDurand <mmemmew@gmail.com> 2022-01-28 11:17:01 +0800
committer: JSDurand <mmemmew@gmail.com> 2022-01-28 19:27:54 +0800
commit: 8ace61933130416a0b8a6b250de681a606439f48 (patch)
tree: bddeadb4960cecd4b21896ec23238ed31ee237f0
parent: e8e1c91b40c9c82a0fd8373746a7b8cfb130f467 (diff)
6 files changed, 610 insertions, 47 deletions
diff --git a/src/bsr.h b/src/bsr.h
index b92e20e..6c5e17c 100644
--- a/src/bsr.h
+++ b/src/bsr.h
@@ -83,7 +83,7 @@ bsr *new_bsr(BOOL * const restrict errorp, Grammar *g);
    data is stored. */
 void destroy_bsr(bsr * const restrict bsrp);
 
-/* the function in the paper */
+/* the function bsrAdd in the paper */
 
 /* X = non-terminal index, a = index of alternate, c = length of
    prefix */
diff --git a/src/cnp.c b/src/cnp.c
new file mode 100644
index 0000000..fe8212e
--- /dev/null
+++ b/src/cnp.c
@@ -0,0 +1,362 @@
+#include <stdio.h>
+#include <string.h>
+#include "util.h"
+#include "list.h"
+#include "cnp.h"
+
+struct crf_s {
+  ht *table;
+};
+
+crf *
+new_crf()
+{
+  crf *crfp = NULL;
+  SAFE_MALLOC(crf, crfp, 1, goto cleanup;);
+
+  crfp->table = new_ht2(HT_INIT_CAP);
+
+  if (crfp->table == NULL) goto cleanup;
+
+  return crfp;
+
+ cleanup:
+
+  if (crfp) free(crfp);
+
+  return NULL;
+}
+
+BOOL
+crf_add_node(crf * const restrict crfp, pair2 node)
+{
+  if (ht_find(crfp->table, &node)) return 0;
+
+  pair2 *p2 = NULL;
+
+  NEW_P2(p2, node.x, node.y, return 1;);
+
+  ht *value_table = new_ht4(HT_INIT_CAP);
+
+  if (value_table == NULL) {
+    free(p2);
+    return 1;
+  }
+
+  if (ht_insert(crfp->table, p2, value_table)) {
+    fleprintf0("Fail to insert\n");
+    destroy_ht(value_table, DESTROY_NONE_SELF);
+    free(p2);
+    return 1;
+  }
+
+  return 0;
+}
+
+ht *
+crf_find_node(crf * const restrict crfp, pair2 node)
+{
+  return ht_find(crfp->table, &node);
+}
+
+BOOL
+crf_add_edge(crf * const restrict crfp, pair2 source, pair4 label)
+{
+  if (ht_find(crfp->table, &source) == NULL) {
+    fleprintf0("add the node before adding its edges\n");
+    return 1;
+  }
+
+  if (ht_find(ht_find(crfp->table, &source), &label)) return 0;
+
+  pair4 *p = NULL;
+
+  NEW_P4(p, label.x, label.y, label.z, label.w, return 1;);
+
+  if (ht_insert(ht_find(crfp->table, &source), p, (void*)1)) {
+    fleprintf0("Fail to insert\n");
+    free(p);
+    return 1;
+  }
+
+  return 0;
+}
+
+void
+destroy_crf(crf * const restrict crfp)
+{
+
+#define SIZE (ht_size(crfp->table))
+#define VALUES ((ht**) ht_values(crfp->table))
+
+  for (NUM i = 0; i < SIZE;)
+    destroy_ht(*(VALUES+i++), DESTROY_KEY_SELF);
+
+  destroy_ht(crfp->table, DESTROY_KEY_SELF);
+
+  free(crfp);
+
+#undef SIZE
+#undef VALUES
+
+}
+
+typedef struct procr_array_element_s procr_array_element;
+
+/* The list at index k is the set R_k. */
+struct procr_array_element_s {
+  BOOL initialized;
+  List *ls;
+};
+
+struct procr_s {
+  /* input length */
+  NUM len;
+  /* an array of lists */
+  procr_array_element *array;
+  /* minimal grade that is non-empty */
+  NUM mg;
+};
+
+typedef struct procu_array_element_s procu_array_element;
+
+/* The table at index k represents the set U_k.  Its keys are those
+   (X, a, i, j) such that (X, a, i, j, k) belongs to U_k. */
+struct procu_array_element_s {
+  BOOL initialized;
+  ht *table;
+};
+
+struct procu_s {
+  /* input length */
+  NUM len;
+  /* an array of hash tables */
+  procu_array_element *array;
+};
+
+procr *
+new_procr(NUM size, BOOL * const restrict errorp)
+{
+  *errorp = 0;
+
+  procr *pr = NULL;
+
+  SAFE_MALLOC(procr, pr, 1, goto cleanup;);
+
+  pr->len = size;
+  pr->array = NULL;
+
+  SAFE_MALLOC(procr_array_element, pr->array,
+              sizeof(procr_array_element) * size,
+              goto cleanup;);
+
+  memset(pr->array, 0, sizeof(procr_array_element) * size);
+
+  goto success;
+
+ cleanup:
+
+  *errorp = 1;
+
+  if (pr && pr->array) free(pr->array);
+
+  if (pr) free(pr);
+
+ success:
+
+  return pr;
+}
+
+procu *
+new_procu(NUM size, BOOL * const restrict errorp)
+{
+  *errorp = 0;
+
+  procu *pu = NULL;
+
+  SAFE_MALLOC(procu, pu, 1, goto cleanup;);
+
+  pu->len = size;
+  pu->array = NULL;
+
+  SAFE_MALLOC(procu_array_element, pu->array,
+              sizeof(procu_array_element) * size,
+              goto cleanup;);
+
+  memset(pu->array, 0, sizeof(procu_array_element) * size);
+
+  goto success;
+
+ cleanup:
+
+  *errorp = 1;
+
+  if (pu && pu->array) free(pu->array);
+
+  if (pu) free(pu);
+
+ success:
+
+  return pu;
+}
+
+void
+destroy_procr(procr * const restrict pr)
+{
+  for (NUM i = 0; i < pr->len; i++)
+    if ((pr->array+i)->initialized)
+      destroy_list((pr->array+i)->ls, 1);
+
+  free(pr->array);
+
+  free(pr);
+}
+
+void
+destroy_procu(procu * const restrict pu)
+{
+  for (NUM i = 0; i < pu->len; i++)
+    if ((pu->array+i)->initialized)
+      destroy_ht((pu->array+i)->table, DESTROY_KEY_SELF);
+
+  free(pu->array);
+
+  free(pu);
+}
+
+BOOL
+desc_add(procr * const restrict pr, procu * const restrict pu,
+         NUM grade, prodecor dec)
+{
+  if (grade < 0 || grade >= pu->len) {
+    fleprintf("Invalid grade: %ld\n", grade);
+    goto cleanup;
+  }
+
+  pair4 *p4 = NULL;
+
+#define HELE (pu->array+grade)
+#define LELE (pr->array+grade)
+
+  if (HELE->initialized) {
+    /* If HELE is initialized then LELE should also be initialized. */
+    if (ht_find(HELE->table, &dec) != NULL) goto success;
+
+    NEW_P4(p4, dec.x, dec.y, dec.z, dec.w, goto cleanup;);
+
+    if (ht_insert(HELE->table, p4, (void*)1)) {
+      fleprintf0("Fail to insert\n");
+      goto cleanup;
+    }
+
+    NEW_P4(p4, dec.x, dec.y, dec.z, dec.w,
+           ht_delete(HELE->table, &dec, DELETE_KEY); goto cleanup;);
+
+    if (add_to_list(LELE->ls, p4)) {
+      fleprintf0("Fail to add to list\n");
+      ht_delete(HELE->table, &dec, DELETE_KEY);
+      goto cleanup;
+    }
+
+    goto success;
+  }
+
+  HELE->table = new_ht4(HT_INIT_CAP);
+
+  if (HELE->table == NULL) {
+    fleprintf0("Fail to have new hash table\n");
+    goto cleanup;
+  }
+
+  HELE->initialized = 1;
+
+  NEW_P4(p4, dec.x, dec.y, dec.z, dec.w,
+         destroy_ht(HELE->table, DESTROY_KEY_SELF);
+         HELE->initialized = 0;
+         goto cleanup;);
+
+  if (ht_insert(HELE->table, p4, (void*)1)) {
+    fleprintf0("Fail to insert\n");
+    destroy_ht(HELE->table, DESTROY_KEY_SELF);
+    HELE->initialized = 0;
+    goto cleanup;
+  }
+
+  LELE->ls = new_list();
+
+  if (LELE->ls == NULL) {
+    fleprintf0("Fail to have new list\n");
+    destroy_ht(HELE->table, DESTROY_KEY_SELF);
+    HELE->initialized = 0;
+    goto cleanup;
+  }
+
+  LELE->initialized = 1;
+
+  NEW_P4(p4, dec.x, dec.y, dec.z, dec.w,
+         destroy_ht(HELE->table, DESTROY_KEY_SELF);
+         HELE->initialized = 0;
+         destroy_list(LELE->ls, 1);
+         LELE->initialized = 0;
+         goto cleanup;);
+
+  if (add_to_list(LELE->ls, p4)) {
+    fleprintf0("Fail to add to list\n");
+    destroy_ht(HELE->table, DESTROY_KEY_SELF);
+    HELE->initialized = 0;
+    destroy_list(LELE->ls, 1);
+    LELE->initialized = 0;
+    goto cleanup;
+  }
+
+#undef HELE
+#undef LELE
+
+  goto success;
+
+ cleanup:
+
+  return 1;
+
+ success:
+
+  return 0;
+}
+
+prodecor *
+pop_procr(procr * pr, procu * pu, NUM *gradep)
+{
+  prodecor *result = NULL;
+
+  if (pr->mg >= pr->len) goto success;
+
+#define ELEMENT (pr->array+pr->mg)
+
+  if (!(ELEMENT->initialized)) goto reset_mg;
+
+  *gradep = pr->mg;
+
+  result = pop_from_list(ELEMENT->ls);
+
+  if (list_length(ELEMENT->ls) == 0) {
+    destroy_ht((pu->array+pr->mg)->table, DESTROY_KEY_SELF);
+    (pu->array+pr->mg)->initialized = 0;
+    (pu->array+pr->mg)->table = NULL;
+
+    destroy_list(ELEMENT->ls, 1);
+    ELEMENT->initialized = 0;
+    ELEMENT->ls = NULL;
+
+    /* REVIEW: Maybe we can just increment the minimal grade, as we
+       won't jump over two steps? */
+  reset_mg:
+    while (++(pr->mg) < pr->len) {
+      if (ELEMENT->initialized) break;
+    }
+  }
+
+#undef ELEMENT
+
+ success:
+  return result;
+}
diff --git a/src/cnp.h b/src/cnp.h
new file mode 100644
index 0000000..c7d3965
--- /dev/null
+++ b/src/cnp.h
@@ -0,0 +1,144 @@
+#ifndef CNP_H
+#define CNP_H
+#include "ht.h"
+
+/* This file implements the Clustered Non-terminal Parsing algorithm.
+   See the following paper for details:
+
+   Derivation representation using binary subtree sets
+
+   by
+
+   Elizabeth Scott, Adrian Johnstone and L.Thomas van Binsbergen. */
+
+/* First we implement a "Call Return Forest".
+
+   A CRF has two types of nodes.
+
+   The first type is the non-leaf nodes.  It is labelled as
+
+   (X, i),
+
+   where X is a non-terminal and i is an input position.
+
+   The second type is the leaf nodes.  It is labelled as
+
+   (X, a, b, i),
+
+   where X is a non-terminal, a is an index of a rule in the array of
+   rules corresponding to X, b is the length of a prefix of the
+   right-hand side of the rule corresponding to a, and i is an input
+   position.  The triple (X, a, b) is termed a "grammar slot" in the
+   parlance.
+
+   We need constant look-up for both types of nodes.  So we implement
+   these using hash tables.
+
+   For the first type, we implement these as a hash table, whose keys
+   are those (X, i) in the set.  The values are described below.
+
+   Actually we don't need those second type nodes.  What we care about
+   are the edges between the two types of nodes.  So we simply don't
+   store the second type nodes at all.  We only store edges, or more
+   precisely, edge labels.  Since we need to know if an edge exists
+   efficiently, we implement the edges as a hash table.  (This seems
+   to be termed a "disctionary of keys" in computer science jargon.)
+   So the values of the hash table of the first type node are hash
+   tables whose keys are (X, a, b, i) that are in the edge set of the
+   first type node, and whose values are meaningless. */
+
+typedef struct crf_s crf;
+
+crf *new_crf();
+
+/* on error return non-zero */
+BOOL crf_add_node(crf * const restrict crfp, pair2 node);
+
+/* if not found return NULL.
+
+   if found but the node has no edges return an empty hash table. */
+ht *crf_find_node(crf * const restrict crfp, pair2 node);
+
+BOOL crf_add_edge(crf * const restrict crfp, pair2 source,
+                  pair4 label);
+
+void destroy_crf(crf * const restrict crfp);
+
+/* We also need to implement the set of pending actions.  These are
+   triples of the form:
+
+   (X, k, j).
+
+   Since we need to find all j such that (X, k, j) belongs to the set,
+   we implement the set as a hash table whose keys are (X, k) and
+   whose values are those j such that (X, k, j) belongs to the set. */
+
+/* TODO: Pending actions */
+
+/* We also implement "process descriptors" for the CNP algorithm.  Since
+   a process descriptor is of the form
+
+   (X := beta . gamma, b, j, k),
+
+   where X := beta gamma is a rule and b is the length of beta, we can
+   represent a process descriptor as
+
+   (X, a, b, j, k).
+
+   However, we shall notice that there is a natural "grading" on the
+   process descriptors.  The k-th grading consists of the descriptors
+   with the last element in the tuple equal to k.  In the clustered
+   nonterminal parsing algorithm, the process descriptors that are
+   added as a consequence of dealing with a process descriptor of
+   grading k must be of grade greater than or equal to k.  This means
+   that if all process descriptors of grade k have been processed and
+   the descriptors awaiting to be processed all have grades > k, then
+   we won't see any process descriptors of grade <= k again.  So we
+   can actually keep an array of process descriptors that consist of
+   descriptors of degree k.  This way we don't need to store the k in
+   the descriptor.  Therefore a process descriptor is represented as
+
+   (X, a, i, j).
+
+   Since we have no other requirements than almost constant-time
+   lookup, we simply represent the set of descriptors using a hash
+   table with keys (X, a, i, j).
+
+   The main benefit of this is that after we find that all process
+   descriptors in R_k have been processed, we can free those
+   descriptors in U_k.  I think this is a good way to lower the space
+   requirements of this algorithm. */
+
+typedef pair4 prodecor;
+
+/* The set named R in the paper */
+typedef struct procr_s procr;
+
+/* The set named U in the paper */
+typedef struct procu_s procu;
+
+/* constructor */
+procr *new_procr(NUM size, BOOL * const restrict errorp);
+procu *new_procu(NUM size, BOOL * const restrict errorp);
+
+/* insert */
+
+/* the function dscAdd in the paper */
+BOOL desc_add(procr * const restrict pr, procu * const restrict pu,
+              NUM grade, prodecor dec);
+
+/* pop */
+
+/* gradep will be set to the grade */
+
+/* if all descriptors of grade k have been poped out, then the
+   corresponding slot in pu will be freed automatically. */
+
+/* if no more elements are left, return NULL */
+prodecor *pop_procr(procr * pr, procu * pu, NUM *gradep);
+
+/* destructor */
+void destroy_procr(procr * const restrict pr);
+void destroy_procu(procu * const restrict pu);
+
+#endif
diff --git a/src/crf.c b/src/crf.c
deleted file mode 100644
index 6e215ee..0000000
--- a/src/crf.c
+++ /dev/null
@@ -1,3 +0,0 @@
-#include "crf.h"
-
-int place(int x) { return x; }
diff --git a/src/crf.h b/src/crf.h
deleted file mode 100644
index 9f86178..0000000
--- a/src/crf.h
+++ /dev/null
@@ -1,43 +0,0 @@
-#ifndef CRF_H
-#define CRF_H
-
-/* This file implements the "Call Return Forest" for use in a GLL
-   Clustered Non-terminal Parser.  See the following paper for
-   details:
-
-   Derivation representation using binary subtree sets
-   
-   by
-   
-   Elizabeth Scott, Adrian Johnstone and L.Thomas van Binsbergen. */
-
-/* A CRF has two types of nodes.
-
-   The first type is the non-leaf nodes.  It is labelled as
-
-   (X, i),
-
-   where X is a non-terminal and i is an input position.
-
-   The second type is the leaf nodes.  It is labelled as
-
-   (X, a, b, i),
-
-   where X is a non-terminal, a is an index of a rule in the array of
-   rules corresponding to X, b is the length of a prefix of the
-   right-hand side of the rule corresponding to a, and i is an input
-   position.  The triple (X, a, b) is termed a "grammar slot" in the
-   parlance.
-
-   We don't need constant look-up for the second type of nodes.  So we
-   implement these as a simple dynamic array.  We do need constant
-   look-up for the first type of nodes.  Thus we implement these as an
-   array of hash tables, where the index of the array represents the
-   index of the non-terminals, and the hash-table corresponding to a
-   non-terminal X contains the input positions i such that (X, i) is a
-   first-type node.  Additionally, the first-type nodes should also
-   carry around an index into the array of second-type nodes. */
-
-int place(int x);
-
-#endif
diff --git a/src/test/check_cnp.c b/src/test/check_cnp.c
new file mode 100644
index 0000000..2fbaac7
--- /dev/null
+++ b/src/test/check_cnp.c
@@ -0,0 +1,103 @@
+#include <stdio.h>
+#include "../util.h"
+#include "../cnp.h"
+
+int
+main(int UNUSED argc, char ** UNUSED argv)
+{
+  procr *pr = NULL;
+  procu *pu = NULL;
+
+  prodecor *prod = NULL;
+
+  crf *crfp = new_crf();
+  pair2 p = { 0 };
+  p.x = 0;
+  p.y = 1;
+
+  if (crf_find_node(crfp, p) == NULL) {
+    if (crf_add_node(crfp, p)) {
+      fleprintf0("fail to add node\n");
+      goto cleanup;
+    } else {
+      printf("Successfully add node (0, 1)\n");
+    }
+  }
+
+  if (crf_find_node(crfp, p) == NULL) {
+    fleprintf0("Fail to find node\n");
+    goto cleanup;
+  } else {
+    printf("Successfully find node for (0, 1)\n");
+  }
+
+  pair4 p4 = { 0 };
+
+  if (crf_add_edge(crfp, p, p4)) {
+    fleprintf0("Fail to add edge\n");
+    goto cleanup;
+  } else {
+    printf("Successfully add edge from (0, 1) to zero\n");
+  }
+
+  if (crf_find_node(crfp, p)) {
+    printf("Printing edges for (0, 1)...\n");
+
+#define TABLE (crf_find_node(crfp, p))
+
+    for (NUM i = 0; i < ht_size(TABLE); i++) {
+#define KEY (*((pair4**) ht_keys(TABLE)+i))
+      printf("(%ld, %ld, %ld, %ld)",
+             KEY->x, KEY->y, KEY->z, KEY->w);
+      if (i+1<ht_size(TABLE)) printf(", ");
+      else printf("\n");
+    }
+  }
+
+#undef TABLE
+#undef KEY
+
+  BOOL error = 0;
+
+  pr = new_procr(10, &error);
+  pu = new_procu(10, &error);
+
+  p4 = (pair4) { 0 };
+
+  p4.x = 1;
+
+  if (desc_add(pr, pu, 2, p4)) {
+    fleprintf0("Fail to add descriptor\n");
+    goto cleanup;
+  }
+
+  NUM grade = 0;
+
+  prod = pop_procr(pr, pu, &grade);
+
+  if (prod == NULL) {
+    printf("The first pop failed.  "
+           "But maybe it occurs due to some misses in the minimal "
+           "weight.\n");
+    printf("So we shall pop again\n");
+    prod = pop_procr(pr, pu, &grade);
+
+    if (prod == NULL) {
+      fleprintf0("Fail to pop\n");
+      goto cleanup;
+    }
+  }
+
+  printf("popped a process descriptor: (%ld, %ld, %ld, %ld, %ld)\n",
+         prod->x, prod->y, prod->z, prod->w, grade);
+
+ cleanup:
+
+  destroy_crf(crfp);
+
+  if (pr) destroy_procr(pr);
+  if (pu) destroy_procu(pu);
+  if (prod) free(prod);
+
+  return 0;
+}
author	JSDurand <mmemmew@gmail.com>	2022-01-28 11:17:01 +0800
committer	JSDurand <mmemmew@gmail.com>	2022-01-28 19:27:54 +0800
commit	8ace61933130416a0b8a6b250de681a606439f48 (patch)
tree	bddeadb4960cecd4b21896ec23238ed31ee237f0
parent	e8e1c91b40c9c82a0fd8373746a7b8cfb130f467 (diff)