From e555c88b8107caf886da229444c2bed1aaef6c2c Mon Sep 17 00:00:00 2001 From: JSDurand Date: Fri, 28 Jan 2022 23:21:00 +0800 Subject: rename cnp to crf THat file implements support functions for the CNP algorithm, not the algorithm itself. --- src/cnp.h | 142 +------------------------------------------------------------- 1 file changed, 1 insertion(+), 141 deletions(-) (limited to 'src/cnp.h') diff --git a/src/cnp.h b/src/cnp.h index c7d3965..5944ca3 100644 --- a/src/cnp.h +++ b/src/cnp.h @@ -1,144 +1,4 @@ #ifndef CNP_H #define CNP_H -#include "ht.h" - -/* This file implements the Clustered Non-terminal Parsing algorithm. - See the following paper for details: - - Derivation representation using binary subtree sets - - by - - Elizabeth Scott, Adrian Johnstone and L.Thomas van Binsbergen. */ - -/* First we implement a "Call Return Forest". - - A CRF has two types of nodes. - - The first type is the non-leaf nodes. It is labelled as - - (X, i), - - where X is a non-terminal and i is an input position. - - The second type is the leaf nodes. It is labelled as - - (X, a, b, i), - - where X is a non-terminal, a is an index of a rule in the array of - rules corresponding to X, b is the length of a prefix of the - right-hand side of the rule corresponding to a, and i is an input - position. The triple (X, a, b) is termed a "grammar slot" in the - parlance. - - We need constant look-up for both types of nodes. So we implement - these using hash tables. - - For the first type, we implement these as a hash table, whose keys - are those (X, i) in the set. The values are described below. - - Actually we don't need those second type nodes. What we care about - are the edges between the two types of nodes. So we simply don't - store the second type nodes at all. We only store edges, or more - precisely, edge labels. Since we need to know if an edge exists - efficiently, we implement the edges as a hash table. (This seems - to be termed a "disctionary of keys" in computer science jargon.) - So the values of the hash table of the first type node are hash - tables whose keys are (X, a, b, i) that are in the edge set of the - first type node, and whose values are meaningless. */ - -typedef struct crf_s crf; - -crf *new_crf(); - -/* on error return non-zero */ -BOOL crf_add_node(crf * const restrict crfp, pair2 node); - -/* if not found return NULL. - - if found but the node has no edges return an empty hash table. */ -ht *crf_find_node(crf * const restrict crfp, pair2 node); - -BOOL crf_add_edge(crf * const restrict crfp, pair2 source, - pair4 label); - -void destroy_crf(crf * const restrict crfp); - -/* We also need to implement the set of pending actions. These are - triples of the form: - - (X, k, j). - - Since we need to find all j such that (X, k, j) belongs to the set, - we implement the set as a hash table whose keys are (X, k) and - whose values are those j such that (X, k, j) belongs to the set. */ - -/* TODO: Pending actions */ - -/* We also implement "process descriptors" for the CNP algorithm. Since - a process descriptor is of the form - - (X := beta . gamma, b, j, k), - - where X := beta gamma is a rule and b is the length of beta, we can - represent a process descriptor as - - (X, a, b, j, k). - - However, we shall notice that there is a natural "grading" on the - process descriptors. The k-th grading consists of the descriptors - with the last element in the tuple equal to k. In the clustered - nonterminal parsing algorithm, the process descriptors that are - added as a consequence of dealing with a process descriptor of - grading k must be of grade greater than or equal to k. This means - that if all process descriptors of grade k have been processed and - the descriptors awaiting to be processed all have grades > k, then - we won't see any process descriptors of grade <= k again. So we - can actually keep an array of process descriptors that consist of - descriptors of degree k. This way we don't need to store the k in - the descriptor. Therefore a process descriptor is represented as - - (X, a, i, j). - - Since we have no other requirements than almost constant-time - lookup, we simply represent the set of descriptors using a hash - table with keys (X, a, i, j). - - The main benefit of this is that after we find that all process - descriptors in R_k have been processed, we can free those - descriptors in U_k. I think this is a good way to lower the space - requirements of this algorithm. */ - -typedef pair4 prodecor; - -/* The set named R in the paper */ -typedef struct procr_s procr; - -/* The set named U in the paper */ -typedef struct procu_s procu; - -/* constructor */ -procr *new_procr(NUM size, BOOL * const restrict errorp); -procu *new_procu(NUM size, BOOL * const restrict errorp); - -/* insert */ - -/* the function dscAdd in the paper */ -BOOL desc_add(procr * const restrict pr, procu * const restrict pu, - NUM grade, prodecor dec); - -/* pop */ - -/* gradep will be set to the grade */ - -/* if all descriptors of grade k have been poped out, then the - corresponding slot in pu will be freed automatically. */ - -/* if no more elements are left, return NULL */ -prodecor *pop_procr(procr * pr, procu * pu, NUM *gradep); - -/* destructor */ -void destroy_procr(procr * const restrict pr); -void destroy_procu(procu * const restrict pu); - +/* TODO */ #endif -- cgit v1.2.3-18-g5258