From 8ace61933130416a0b8a6b250de681a606439f48 Mon Sep 17 00:00:00 2001 From: JSDurand Date: Fri, 28 Jan 2022 11:17:01 +0800 Subject: CNP save point CRF and process descriptors seem to work now. It only remains to implement the set of pending actions before I can work on the driver program. --- src/cnp.h | 144 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 144 insertions(+) create mode 100644 src/cnp.h (limited to 'src/cnp.h') diff --git a/src/cnp.h b/src/cnp.h new file mode 100644 index 0000000..c7d3965 --- /dev/null +++ b/src/cnp.h @@ -0,0 +1,144 @@ +#ifndef CNP_H +#define CNP_H +#include "ht.h" + +/* This file implements the Clustered Non-terminal Parsing algorithm. + See the following paper for details: + + Derivation representation using binary subtree sets + + by + + Elizabeth Scott, Adrian Johnstone and L.Thomas van Binsbergen. */ + +/* First we implement a "Call Return Forest". + + A CRF has two types of nodes. + + The first type is the non-leaf nodes. It is labelled as + + (X, i), + + where X is a non-terminal and i is an input position. + + The second type is the leaf nodes. It is labelled as + + (X, a, b, i), + + where X is a non-terminal, a is an index of a rule in the array of + rules corresponding to X, b is the length of a prefix of the + right-hand side of the rule corresponding to a, and i is an input + position. The triple (X, a, b) is termed a "grammar slot" in the + parlance. + + We need constant look-up for both types of nodes. So we implement + these using hash tables. + + For the first type, we implement these as a hash table, whose keys + are those (X, i) in the set. The values are described below. + + Actually we don't need those second type nodes. What we care about + are the edges between the two types of nodes. So we simply don't + store the second type nodes at all. We only store edges, or more + precisely, edge labels. Since we need to know if an edge exists + efficiently, we implement the edges as a hash table. (This seems + to be termed a "disctionary of keys" in computer science jargon.) + So the values of the hash table of the first type node are hash + tables whose keys are (X, a, b, i) that are in the edge set of the + first type node, and whose values are meaningless. */ + +typedef struct crf_s crf; + +crf *new_crf(); + +/* on error return non-zero */ +BOOL crf_add_node(crf * const restrict crfp, pair2 node); + +/* if not found return NULL. + + if found but the node has no edges return an empty hash table. */ +ht *crf_find_node(crf * const restrict crfp, pair2 node); + +BOOL crf_add_edge(crf * const restrict crfp, pair2 source, + pair4 label); + +void destroy_crf(crf * const restrict crfp); + +/* We also need to implement the set of pending actions. These are + triples of the form: + + (X, k, j). + + Since we need to find all j such that (X, k, j) belongs to the set, + we implement the set as a hash table whose keys are (X, k) and + whose values are those j such that (X, k, j) belongs to the set. */ + +/* TODO: Pending actions */ + +/* We also implement "process descriptors" for the CNP algorithm. Since + a process descriptor is of the form + + (X := beta . gamma, b, j, k), + + where X := beta gamma is a rule and b is the length of beta, we can + represent a process descriptor as + + (X, a, b, j, k). + + However, we shall notice that there is a natural "grading" on the + process descriptors. The k-th grading consists of the descriptors + with the last element in the tuple equal to k. In the clustered + nonterminal parsing algorithm, the process descriptors that are + added as a consequence of dealing with a process descriptor of + grading k must be of grade greater than or equal to k. This means + that if all process descriptors of grade k have been processed and + the descriptors awaiting to be processed all have grades > k, then + we won't see any process descriptors of grade <= k again. So we + can actually keep an array of process descriptors that consist of + descriptors of degree k. This way we don't need to store the k in + the descriptor. Therefore a process descriptor is represented as + + (X, a, i, j). + + Since we have no other requirements than almost constant-time + lookup, we simply represent the set of descriptors using a hash + table with keys (X, a, i, j). + + The main benefit of this is that after we find that all process + descriptors in R_k have been processed, we can free those + descriptors in U_k. I think this is a good way to lower the space + requirements of this algorithm. */ + +typedef pair4 prodecor; + +/* The set named R in the paper */ +typedef struct procr_s procr; + +/* The set named U in the paper */ +typedef struct procu_s procu; + +/* constructor */ +procr *new_procr(NUM size, BOOL * const restrict errorp); +procu *new_procu(NUM size, BOOL * const restrict errorp); + +/* insert */ + +/* the function dscAdd in the paper */ +BOOL desc_add(procr * const restrict pr, procu * const restrict pu, + NUM grade, prodecor dec); + +/* pop */ + +/* gradep will be set to the grade */ + +/* if all descriptors of grade k have been poped out, then the + corresponding slot in pu will be freed automatically. */ + +/* if no more elements are left, return NULL */ +prodecor *pop_procr(procr * pr, procu * pu, NUM *gradep); + +/* destructor */ +void destroy_procr(procr * const restrict pr); +void destroy_procu(procu * const restrict pu); + +#endif -- cgit v1.2.3-18-g5258