From 510b10b96b546fcc6c6b6be85050305ddd192a41 Mon Sep 17 00:00:00 2001 From: JSDurand Date: Sat, 5 Feb 2022 17:30:11 +0800 Subject: replace some hash table usage by tuples Previously I used hash tables, which consume too much memory. Now the critical parts are replaced by a new hand-written library called "tuple.h". Now we can easily parse larger inputs. I haven't tested its limits, though. --- src/crf.h | 80 ++++++++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 54 insertions(+), 26 deletions(-) (limited to 'src/crf.h') diff --git a/src/crf.h b/src/crf.h index 9d1cc78..b9955fe 100644 --- a/src/crf.h +++ b/src/crf.h @@ -1,5 +1,7 @@ #ifndef CRF_H #define CRF_H +#include "tuple.h" +#include "grammar.h" #include "ht.h" /* This file implements some support functions for the Clustered @@ -48,22 +50,42 @@ tables whose keys are (X, a, b, i) that are in the edge set of the first type node, and whose values are meaningless. */ +/* Now, with the newly written library tuple.h, we implement this in a + different way. We first observe that we don't actually need those + nodes. What matters is the edges, the ability to know if an edge + exists between two given nodes, and to loop through the edges + starting from a given node. Then we record the existence of the + edge between (X, i) and (Y, a, b, j) by storing the tuple + + (X, i, Y, a, b, j). + + The only thing that is troublesome is to loop through the edges + from a given node. So we also observe that an edge is possible + only if j <= i. This ensures that we don't search impossible + edges. Hence the worst-case time complexity of the overall + algorithm is preserved (for every possibility, there really might + exist an edge indeed). The rest is premature optimization, I have + to proceed first. :P */ + typedef struct crf_s crf; -crf *new_crf(); +crf *new_crf(Grammar *g, NUM input_len); /* on error return non-zero */ -BOOL crf_add_node(crf * const restrict crfp, pair2 node); +BOOL crf_add_node(crf *crfp, pair2 node); -/* if not found return NULL. +HP_ATTR BOOL crf_find_node(crf *crfp, pair2 node); - if found but the node has no edges return an empty hash table. */ -ht *crf_find_node(crf * const restrict crfp, pair2 node); +HP_ATTR BOOL crf_find_edge(crf *crfp, pair2 source, pair4 label); BOOL crf_add_edge(crf * const restrict crfp, pair2 source, pair4 label); -void destroy_crf(crf * const restrict crfp); +void destroy_crf(crf *crfp); + +void crf_print(crf *crfp); + +H_ATTR void crf_map(crf *crfp, pair2 label, map_pair6_t fn); /* We also need to implement the set of pending actions. These are triples of the form: @@ -76,25 +98,22 @@ void destroy_crf(crf * const restrict crfp); Precisely speaking, the values are hash tables whose keys are those j and whose values are meaningless non-zero values. */ +/* With the new library, this is now implemented as a "luple3". */ + /* Set of Pending Actions */ typedef struct spa_s spa; -spa *new_spa(); - -void destroy_spa(spa * const restrict spap); +spa *new_spa(Grammar *g, NUM input_len); -__attribute__((__pure__, __cold__)) ht *spa_ht(CCR_MOD(spa *) spap); +void destroy_spa(spa *spap); /* On error return non-zero */ -BOOL spa_insert(spa * const restrict spap, pair3 label); +BOOL spa_insert(spa *spap, pair3 label); /* Find if a triple belongs to the set */ -P_ATTR BOOL spa_belong(CCR_MOD(spa *) spap, pair3 label); +P_ATTR BOOL spa_belong(spa *spap, pair3 label); -/* Find if for a pair (X, k) there exist some j such that (X, k, j) - belong to the set. If so, return the hash table whose keys are - those j; otherwise return NULL. */ -P_ATTR ht *spa_find(CCR_MOD(spa *) spap, pair2 label); +H_ATTR void spa_map(spa *spap, pair2 label, map_pair3_t fn); /* We also implement "process descriptors" for the CNP algorithm. Since a process descriptor is of the form @@ -130,6 +149,18 @@ P_ATTR ht *spa_find(CCR_MOD(spa *) spap, pair2 label); descriptors in U_k. I think this is a good way to lower the space requirements of this algorithm. */ +/* With the new library "tuple.h", we represent a process descriptor + as a "luple5". To implement grading, we store + + (X, a, b, j, k) + + as + + (k, X, a, b, j). + + Note that the procr set is still an array of lists, since we need + to pop elements from procr efficiently. */ + typedef pair4 prodecor; /* The set named R in the paper */ @@ -139,14 +170,13 @@ typedef struct procr_s procr; typedef struct procu_s procu; /* constructor */ -procr *new_procr(NUM size, BOOL * const restrict errorp); -procu *new_procu(NUM size, BOOL * const restrict errorp); +procr *new_procr(Grammar *g, NUM input_len, BOOL *errorp); +procu *new_procu(Grammar *g, NUM input_len, BOOL *errorp); /* insert */ /* the function dscAdd in the paper */ -BOOL desc_add(procr * const restrict pr, procu * const restrict pu, - NUM grade, prodecor dec); +BOOL desc_add(procr *pr, procu *pu, NUM grade, prodecor dec); /* pop */ @@ -156,14 +186,12 @@ BOOL desc_add(procr * const restrict pr, procu * const restrict pu, corresponding slot in pu will be freed automatically. */ /* if no more elements are left, return NULL */ -prodecor *pop_procr(procr * pr, procu * pu, NUM *gradep); +prodecor *pop_procr(procr *pr, procu *pu, NUM *gradep); -void print_procr(CCR_MOD(procr *) pr); +void print_procr(procr *pr); /* destructor */ -void destroy_procr(procr * const restrict pr); -void destroy_procu(procu * const restrict pu); - -#define TEST haha +void destroy_procr(procr *pr); +void destroy_procu(procu *pu); #endif -- cgit v1.2.3-18-g5258