diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/bsr.h | 2 | ||||
-rw-r--r-- | src/cnp.c | 362 | ||||
-rw-r--r-- | src/cnp.h | 144 | ||||
-rw-r--r-- | src/crf.c | 3 | ||||
-rw-r--r-- | src/crf.h | 43 | ||||
-rw-r--r-- | src/test/check_cnp.c | 103 |
6 files changed, 610 insertions, 47 deletions
@@ -83,7 +83,7 @@ bsr *new_bsr(BOOL * const restrict errorp, Grammar *g); data is stored. */ void destroy_bsr(bsr * const restrict bsrp); -/* the function in the paper */ +/* the function bsrAdd in the paper */ /* X = non-terminal index, a = index of alternate, c = length of prefix */ diff --git a/src/cnp.c b/src/cnp.c new file mode 100644 index 0000000..fe8212e --- /dev/null +++ b/src/cnp.c @@ -0,0 +1,362 @@ +#include <stdio.h> +#include <string.h> +#include "util.h" +#include "list.h" +#include "cnp.h" + +struct crf_s { + ht *table; +}; + +crf * +new_crf() +{ + crf *crfp = NULL; + SAFE_MALLOC(crf, crfp, 1, goto cleanup;); + + crfp->table = new_ht2(HT_INIT_CAP); + + if (crfp->table == NULL) goto cleanup; + + return crfp; + + cleanup: + + if (crfp) free(crfp); + + return NULL; +} + +BOOL +crf_add_node(crf * const restrict crfp, pair2 node) +{ + if (ht_find(crfp->table, &node)) return 0; + + pair2 *p2 = NULL; + + NEW_P2(p2, node.x, node.y, return 1;); + + ht *value_table = new_ht4(HT_INIT_CAP); + + if (value_table == NULL) { + free(p2); + return 1; + } + + if (ht_insert(crfp->table, p2, value_table)) { + fleprintf0("Fail to insert\n"); + destroy_ht(value_table, DESTROY_NONE_SELF); + free(p2); + return 1; + } + + return 0; +} + +ht * +crf_find_node(crf * const restrict crfp, pair2 node) +{ + return ht_find(crfp->table, &node); +} + +BOOL +crf_add_edge(crf * const restrict crfp, pair2 source, pair4 label) +{ + if (ht_find(crfp->table, &source) == NULL) { + fleprintf0("add the node before adding its edges\n"); + return 1; + } + + if (ht_find(ht_find(crfp->table, &source), &label)) return 0; + + pair4 *p = NULL; + + NEW_P4(p, label.x, label.y, label.z, label.w, return 1;); + + if (ht_insert(ht_find(crfp->table, &source), p, (void*)1)) { + fleprintf0("Fail to insert\n"); + free(p); + return 1; + } + + return 0; +} + +void +destroy_crf(crf * const restrict crfp) +{ + +#define SIZE (ht_size(crfp->table)) +#define VALUES ((ht**) ht_values(crfp->table)) + + for (NUM i = 0; i < SIZE;) + destroy_ht(*(VALUES+i++), DESTROY_KEY_SELF); + + destroy_ht(crfp->table, DESTROY_KEY_SELF); + + free(crfp); + +#undef SIZE +#undef VALUES + +} + +typedef struct procr_array_element_s procr_array_element; + +/* The list at index k is the set R_k. */ +struct procr_array_element_s { + BOOL initialized; + List *ls; +}; + +struct procr_s { + /* input length */ + NUM len; + /* an array of lists */ + procr_array_element *array; + /* minimal grade that is non-empty */ + NUM mg; +}; + +typedef struct procu_array_element_s procu_array_element; + +/* The table at index k represents the set U_k. Its keys are those + (X, a, i, j) such that (X, a, i, j, k) belongs to U_k. */ +struct procu_array_element_s { + BOOL initialized; + ht *table; +}; + +struct procu_s { + /* input length */ + NUM len; + /* an array of hash tables */ + procu_array_element *array; +}; + +procr * +new_procr(NUM size, BOOL * const restrict errorp) +{ + *errorp = 0; + + procr *pr = NULL; + + SAFE_MALLOC(procr, pr, 1, goto cleanup;); + + pr->len = size; + pr->array = NULL; + + SAFE_MALLOC(procr_array_element, pr->array, + sizeof(procr_array_element) * size, + goto cleanup;); + + memset(pr->array, 0, sizeof(procr_array_element) * size); + + goto success; + + cleanup: + + *errorp = 1; + + if (pr && pr->array) free(pr->array); + + if (pr) free(pr); + + success: + + return pr; +} + +procu * +new_procu(NUM size, BOOL * const restrict errorp) +{ + *errorp = 0; + + procu *pu = NULL; + + SAFE_MALLOC(procu, pu, 1, goto cleanup;); + + pu->len = size; + pu->array = NULL; + + SAFE_MALLOC(procu_array_element, pu->array, + sizeof(procu_array_element) * size, + goto cleanup;); + + memset(pu->array, 0, sizeof(procu_array_element) * size); + + goto success; + + cleanup: + + *errorp = 1; + + if (pu && pu->array) free(pu->array); + + if (pu) free(pu); + + success: + + return pu; +} + +void +destroy_procr(procr * const restrict pr) +{ + for (NUM i = 0; i < pr->len; i++) + if ((pr->array+i)->initialized) + destroy_list((pr->array+i)->ls, 1); + + free(pr->array); + + free(pr); +} + +void +destroy_procu(procu * const restrict pu) +{ + for (NUM i = 0; i < pu->len; i++) + if ((pu->array+i)->initialized) + destroy_ht((pu->array+i)->table, DESTROY_KEY_SELF); + + free(pu->array); + + free(pu); +} + +BOOL +desc_add(procr * const restrict pr, procu * const restrict pu, + NUM grade, prodecor dec) +{ + if (grade < 0 || grade >= pu->len) { + fleprintf("Invalid grade: %ld\n", grade); + goto cleanup; + } + + pair4 *p4 = NULL; + +#define HELE (pu->array+grade) +#define LELE (pr->array+grade) + + if (HELE->initialized) { + /* If HELE is initialized then LELE should also be initialized. */ + if (ht_find(HELE->table, &dec) != NULL) goto success; + + NEW_P4(p4, dec.x, dec.y, dec.z, dec.w, goto cleanup;); + + if (ht_insert(HELE->table, p4, (void*)1)) { + fleprintf0("Fail to insert\n"); + goto cleanup; + } + + NEW_P4(p4, dec.x, dec.y, dec.z, dec.w, + ht_delete(HELE->table, &dec, DELETE_KEY); goto cleanup;); + + if (add_to_list(LELE->ls, p4)) { + fleprintf0("Fail to add to list\n"); + ht_delete(HELE->table, &dec, DELETE_KEY); + goto cleanup; + } + + goto success; + } + + HELE->table = new_ht4(HT_INIT_CAP); + + if (HELE->table == NULL) { + fleprintf0("Fail to have new hash table\n"); + goto cleanup; + } + + HELE->initialized = 1; + + NEW_P4(p4, dec.x, dec.y, dec.z, dec.w, + destroy_ht(HELE->table, DESTROY_KEY_SELF); + HELE->initialized = 0; + goto cleanup;); + + if (ht_insert(HELE->table, p4, (void*)1)) { + fleprintf0("Fail to insert\n"); + destroy_ht(HELE->table, DESTROY_KEY_SELF); + HELE->initialized = 0; + goto cleanup; + } + + LELE->ls = new_list(); + + if (LELE->ls == NULL) { + fleprintf0("Fail to have new list\n"); + destroy_ht(HELE->table, DESTROY_KEY_SELF); + HELE->initialized = 0; + goto cleanup; + } + + LELE->initialized = 1; + + NEW_P4(p4, dec.x, dec.y, dec.z, dec.w, + destroy_ht(HELE->table, DESTROY_KEY_SELF); + HELE->initialized = 0; + destroy_list(LELE->ls, 1); + LELE->initialized = 0; + goto cleanup;); + + if (add_to_list(LELE->ls, p4)) { + fleprintf0("Fail to add to list\n"); + destroy_ht(HELE->table, DESTROY_KEY_SELF); + HELE->initialized = 0; + destroy_list(LELE->ls, 1); + LELE->initialized = 0; + goto cleanup; + } + +#undef HELE +#undef LELE + + goto success; + + cleanup: + + return 1; + + success: + + return 0; +} + +prodecor * +pop_procr(procr * pr, procu * pu, NUM *gradep) +{ + prodecor *result = NULL; + + if (pr->mg >= pr->len) goto success; + +#define ELEMENT (pr->array+pr->mg) + + if (!(ELEMENT->initialized)) goto reset_mg; + + *gradep = pr->mg; + + result = pop_from_list(ELEMENT->ls); + + if (list_length(ELEMENT->ls) == 0) { + destroy_ht((pu->array+pr->mg)->table, DESTROY_KEY_SELF); + (pu->array+pr->mg)->initialized = 0; + (pu->array+pr->mg)->table = NULL; + + destroy_list(ELEMENT->ls, 1); + ELEMENT->initialized = 0; + ELEMENT->ls = NULL; + + /* REVIEW: Maybe we can just increment the minimal grade, as we + won't jump over two steps? */ + reset_mg: + while (++(pr->mg) < pr->len) { + if (ELEMENT->initialized) break; + } + } + +#undef ELEMENT + + success: + return result; +} diff --git a/src/cnp.h b/src/cnp.h new file mode 100644 index 0000000..c7d3965 --- /dev/null +++ b/src/cnp.h @@ -0,0 +1,144 @@ +#ifndef CNP_H +#define CNP_H +#include "ht.h" + +/* This file implements the Clustered Non-terminal Parsing algorithm. + See the following paper for details: + + Derivation representation using binary subtree sets + + by + + Elizabeth Scott, Adrian Johnstone and L.Thomas van Binsbergen. */ + +/* First we implement a "Call Return Forest". + + A CRF has two types of nodes. + + The first type is the non-leaf nodes. It is labelled as + + (X, i), + + where X is a non-terminal and i is an input position. + + The second type is the leaf nodes. It is labelled as + + (X, a, b, i), + + where X is a non-terminal, a is an index of a rule in the array of + rules corresponding to X, b is the length of a prefix of the + right-hand side of the rule corresponding to a, and i is an input + position. The triple (X, a, b) is termed a "grammar slot" in the + parlance. + + We need constant look-up for both types of nodes. So we implement + these using hash tables. + + For the first type, we implement these as a hash table, whose keys + are those (X, i) in the set. The values are described below. + + Actually we don't need those second type nodes. What we care about + are the edges between the two types of nodes. So we simply don't + store the second type nodes at all. We only store edges, or more + precisely, edge labels. Since we need to know if an edge exists + efficiently, we implement the edges as a hash table. (This seems + to be termed a "disctionary of keys" in computer science jargon.) + So the values of the hash table of the first type node are hash + tables whose keys are (X, a, b, i) that are in the edge set of the + first type node, and whose values are meaningless. */ + +typedef struct crf_s crf; + +crf *new_crf(); + +/* on error return non-zero */ +BOOL crf_add_node(crf * const restrict crfp, pair2 node); + +/* if not found return NULL. + + if found but the node has no edges return an empty hash table. */ +ht *crf_find_node(crf * const restrict crfp, pair2 node); + +BOOL crf_add_edge(crf * const restrict crfp, pair2 source, + pair4 label); + +void destroy_crf(crf * const restrict crfp); + +/* We also need to implement the set of pending actions. These are + triples of the form: + + (X, k, j). + + Since we need to find all j such that (X, k, j) belongs to the set, + we implement the set as a hash table whose keys are (X, k) and + whose values are those j such that (X, k, j) belongs to the set. */ + +/* TODO: Pending actions */ + +/* We also implement "process descriptors" for the CNP algorithm. Since + a process descriptor is of the form + + (X := beta . gamma, b, j, k), + + where X := beta gamma is a rule and b is the length of beta, we can + represent a process descriptor as + + (X, a, b, j, k). + + However, we shall notice that there is a natural "grading" on the + process descriptors. The k-th grading consists of the descriptors + with the last element in the tuple equal to k. In the clustered + nonterminal parsing algorithm, the process descriptors that are + added as a consequence of dealing with a process descriptor of + grading k must be of grade greater than or equal to k. This means + that if all process descriptors of grade k have been processed and + the descriptors awaiting to be processed all have grades > k, then + we won't see any process descriptors of grade <= k again. So we + can actually keep an array of process descriptors that consist of + descriptors of degree k. This way we don't need to store the k in + the descriptor. Therefore a process descriptor is represented as + + (X, a, i, j). + + Since we have no other requirements than almost constant-time + lookup, we simply represent the set of descriptors using a hash + table with keys (X, a, i, j). + + The main benefit of this is that after we find that all process + descriptors in R_k have been processed, we can free those + descriptors in U_k. I think this is a good way to lower the space + requirements of this algorithm. */ + +typedef pair4 prodecor; + +/* The set named R in the paper */ +typedef struct procr_s procr; + +/* The set named U in the paper */ +typedef struct procu_s procu; + +/* constructor */ +procr *new_procr(NUM size, BOOL * const restrict errorp); +procu *new_procu(NUM size, BOOL * const restrict errorp); + +/* insert */ + +/* the function dscAdd in the paper */ +BOOL desc_add(procr * const restrict pr, procu * const restrict pu, + NUM grade, prodecor dec); + +/* pop */ + +/* gradep will be set to the grade */ + +/* if all descriptors of grade k have been poped out, then the + corresponding slot in pu will be freed automatically. */ + +/* if no more elements are left, return NULL */ +prodecor *pop_procr(procr * pr, procu * pu, NUM *gradep); + +/* destructor */ +void destroy_procr(procr * const restrict pr); +void destroy_procu(procu * const restrict pu); + +#endif diff --git a/src/crf.c b/src/crf.c deleted file mode 100644 index 6e215ee..0000000 --- a/src/crf.c +++ /dev/null @@ -1,3 +0,0 @@ -#include "crf.h" - -int place(int x) { return x; } diff --git a/src/crf.h b/src/crf.h deleted file mode 100644 index 9f86178..0000000 --- a/src/crf.h +++ /dev/null @@ -1,43 +0,0 @@ -#ifndef CRF_H -#define CRF_H - -/* This file implements the "Call Return Forest" for use in a GLL - Clustered Non-terminal Parser. See the following paper for - details: - - Derivation representation using binary subtree sets - - by - - Elizabeth Scott, Adrian Johnstone and L.Thomas van Binsbergen. */ - -/* A CRF has two types of nodes. - - The first type is the non-leaf nodes. It is labelled as - - (X, i), - - where X is a non-terminal and i is an input position. - - The second type is the leaf nodes. It is labelled as - - (X, a, b, i), - - where X is a non-terminal, a is an index of a rule in the array of - rules corresponding to X, b is the length of a prefix of the - right-hand side of the rule corresponding to a, and i is an input - position. The triple (X, a, b) is termed a "grammar slot" in the - parlance. - - We don't need constant look-up for the second type of nodes. So we - implement these as a simple dynamic array. We do need constant - look-up for the first type of nodes. Thus we implement these as an - array of hash tables, where the index of the array represents the - index of the non-terminals, and the hash-table corresponding to a - non-terminal X contains the input positions i such that (X, i) is a - first-type node. Additionally, the first-type nodes should also - carry around an index into the array of second-type nodes. */ - -int place(int x); - -#endif diff --git a/src/test/check_cnp.c b/src/test/check_cnp.c new file mode 100644 index 0000000..2fbaac7 --- /dev/null +++ b/src/test/check_cnp.c @@ -0,0 +1,103 @@ +#include <stdio.h> +#include "../util.h" +#include "../cnp.h" + +int +main(int UNUSED argc, char ** UNUSED argv) +{ + procr *pr = NULL; + procu *pu = NULL; + + prodecor *prod = NULL; + + crf *crfp = new_crf(); + pair2 p = { 0 }; + p.x = 0; + p.y = 1; + + if (crf_find_node(crfp, p) == NULL) { + if (crf_add_node(crfp, p)) { + fleprintf0("fail to add node\n"); + goto cleanup; + } else { + printf("Successfully add node (0, 1)\n"); + } + } + + if (crf_find_node(crfp, p) == NULL) { + fleprintf0("Fail to find node\n"); + goto cleanup; + } else { + printf("Successfully find node for (0, 1)\n"); + } + + pair4 p4 = { 0 }; + + if (crf_add_edge(crfp, p, p4)) { + fleprintf0("Fail to add edge\n"); + goto cleanup; + } else { + printf("Successfully add edge from (0, 1) to zero\n"); + } + + if (crf_find_node(crfp, p)) { + printf("Printing edges for (0, 1)...\n"); + +#define TABLE (crf_find_node(crfp, p)) + + for (NUM i = 0; i < ht_size(TABLE); i++) { +#define KEY (*((pair4**) ht_keys(TABLE)+i)) + printf("(%ld, %ld, %ld, %ld)", + KEY->x, KEY->y, KEY->z, KEY->w); + if (i+1<ht_size(TABLE)) printf(", "); + else printf("\n"); + } + } + +#undef TABLE +#undef KEY + + BOOL error = 0; + + pr = new_procr(10, &error); + pu = new_procu(10, &error); + + p4 = (pair4) { 0 }; + + p4.x = 1; + + if (desc_add(pr, pu, 2, p4)) { + fleprintf0("Fail to add descriptor\n"); + goto cleanup; + } + + NUM grade = 0; + + prod = pop_procr(pr, pu, &grade); + + if (prod == NULL) { + printf("The first pop failed. " + "But maybe it occurs due to some misses in the minimal " + "weight.\n"); + printf("So we shall pop again\n"); + prod = pop_procr(pr, pu, &grade); + + if (prod == NULL) { + fleprintf0("Fail to pop\n"); + goto cleanup; + } + } + + printf("popped a process descriptor: (%ld, %ld, %ld, %ld, %ld)\n", + prod->x, prod->y, prod->z, prod->w, grade); + + cleanup: + + destroy_crf(crfp); + + if (pr) destroy_procr(pr); + if (pu) destroy_procu(pu); + if (prod) free(prod); + + return 0; +} |