#ifndef CNP_H #define CNP_H #include "ht.h" /* This file implements the Clustered Non-terminal Parsing algorithm. See the following paper for details: Derivation representation using binary subtree sets by Elizabeth Scott, Adrian Johnstone and L.Thomas van Binsbergen. */ /* First we implement a "Call Return Forest". A CRF has two types of nodes. The first type is the non-leaf nodes. It is labelled as (X, i), where X is a non-terminal and i is an input position. The second type is the leaf nodes. It is labelled as (X, a, b, i), where X is a non-terminal, a is an index of a rule in the array of rules corresponding to X, b is the length of a prefix of the right-hand side of the rule corresponding to a, and i is an input position. The triple (X, a, b) is termed a "grammar slot" in the parlance. We need constant look-up for both types of nodes. So we implement these using hash tables. For the first type, we implement these as a hash table, whose keys are those (X, i) in the set. The values are described below. Actually we don't need those second type nodes. What we care about are the edges between the two types of nodes. So we simply don't store the second type nodes at all. We only store edges, or more precisely, edge labels. Since we need to know if an edge exists efficiently, we implement the edges as a hash table. (This seems to be termed a "disctionary of keys" in computer science jargon.) So the values of the hash table of the first type node are hash tables whose keys are (X, a, b, i) that are in the edge set of the first type node, and whose values are meaningless. */ typedef struct crf_s crf; crf *new_crf(); /* on error return non-zero */ BOOL crf_add_node(crf * const restrict crfp, pair2 node); /* if not found return NULL. if found but the node has no edges return an empty hash table. */ ht *crf_find_node(crf * const restrict crfp, pair2 node); BOOL crf_add_edge(crf * const restrict crfp, pair2 source, pair4 label); void destroy_crf(crf * const restrict crfp); /* We also need to implement the set of pending actions. These are triples of the form: (X, k, j). Since we need to find all j such that (X, k, j) belongs to the set, we implement the set as a hash table whose keys are (X, k) and whose values are those j such that (X, k, j) belongs to the set. */ /* TODO: Pending actions */ /* We also implement "process descriptors" for the CNP algorithm. Since a process descriptor is of the form (X := beta . gamma, b, j, k), where X := beta gamma is a rule and b is the length of beta, we can represent a process descriptor as (X, a, b, j, k). However, we shall notice that there is a natural "grading" on the process descriptors. The k-th grading consists of the descriptors with the last element in the tuple equal to k. In the clustered nonterminal parsing algorithm, the process descriptors that are added as a consequence of dealing with a process descriptor of grading k must be of grade greater than or equal to k. This means that if all process descriptors of grade k have been processed and the descriptors awaiting to be processed all have grades > k, then we won't see any process descriptors of grade <= k again. So we can actually keep an array of process descriptors that consist of descriptors of degree k. This way we don't need to store the k in the descriptor. Therefore a process descriptor is represented as (X, a, i, j). Since we have no other requirements than almost constant-time lookup, we simply represent the set of descriptors using a hash table with keys (X, a, i, j). The main benefit of this is that after we find that all process descriptors in R_k have been processed, we can free those descriptors in U_k. I think this is a good way to lower the space requirements of this algorithm. */ typedef pair4 prodecor; /* The set named R in the paper */ typedef struct procr_s procr; /* The set named U in the paper */ typedef struct procu_s procu; /* constructor */ procr *new_procr(NUM size, BOOL * const restrict errorp); procu *new_procu(NUM size, BOOL * const restrict errorp); /* insert */ /* the function dscAdd in the paper */ BOOL desc_add(procr * const restrict pr, procu * const restrict pu, NUM grade, prodecor dec); /* pop */ /* gradep will be set to the grade */ /* if all descriptors of grade k have been poped out, then the corresponding slot in pu will be freed automatically. */ /* if no more elements are left, return NULL */ prodecor *pop_procr(procr * pr, procu * pu, NUM *gradep); /* destructor */ void destroy_procr(procr * const restrict pr); void destroy_procu(procu * const restrict pu); #endif