From e8e1c91b40c9c82a0fd8373746a7b8cfb130f467 Mon Sep 17 00:00:00 2001 From: JSDurand Date: Fri, 28 Jan 2022 11:16:16 +0800 Subject: BSR A prototype of BSR is roughly finished. --- src/bsr.h | 92 +++++++++++++++++++++++++++++++-------------------------------- 1 file changed, 45 insertions(+), 47 deletions(-) (limited to 'src/bsr.h') diff --git a/src/bsr.h b/src/bsr.h index d6b96e5..b92e20e 100644 --- a/src/bsr.h +++ b/src/bsr.h @@ -1,14 +1,16 @@ #ifndef BSR_H #define BSR_H +#include "util.h" +#include "grammar.h" /* This file implements the "Binary Subtree Representation" of a derivation forest. See the following paper for the miraculous details: Derivation representation using binary subtree sets - + by - + Elizabeth Scott, Adrian Johnstone and L.Thomas van Binsbergen. */ /* Note that there is an implementation of this GLL algorithm in the @@ -50,58 +52,54 @@ length of beta. The simplest way to implement this is to have two "sets", one for - each type. Each set is an array of arrays, where the root array's - indices correspond to non-terminals, and the leaf arrays correspond - to rules of that non-terminal. Then the rest is implemented - through hash-tables, whose keys are tuples of integers. For the - first type, the keys are the (i, k) pairs and the values are those - j. For the second, the keys are the (b, i, k) tupels and the - values are those j. The reason to use i and k as keys is that they - represent the left and the right extent of the BSR respectively, so - we index them in order to find them easily. In addition, this will - be useful if we want to extract SPPF out of a BSR set. + each type. But the two types need to be implemented in different + ways. For both types, we need to know if a BSR belongs to the set + in (almost) constnat time, so we exclude the possibility of linked + lists first. + + For the first type, we also need the ability to loop through all + BSRs of the form (X, ...) for a given non-terminal X. Further, we + need to index the left and the right extents in almost constant + time as well. So we represent the first type as an array of hash + tables, whose indices correspond to nonterminals. Each hash table + has as keys (i, k) and as values hash tables whose keys are those + (a, j) such that (X, a, i, j, k) belong to the set. + + For the second type, we require the capability to loop through all + BSRs of the form (X, a, b, ...) in a set for some given X, a, and + b. So we represent the second type as an array of arrays of arrays + of hash tables, whose array indices correspond to X, a, and b, + respectively. The hash tables have keys (i, k) and values those j + such that (X, a, b, i, j, k) belongs to the set. I don't know if there are better implementations. If I think of better ways in the future, I will re-implement the BSR sets. */ -/* Note that we also need to implement "process descriptors" for the - GLL parser. Since a process descriptor is of the form +typedef struct bsr_s bsr; - (X := beta . gamma, i, j, k), +bsr *new_bsr(BOOL * const restrict errorp, Grammar *g); - where X := beta gamma is a rule and i is the length of beta, we can - represent a process descriptor as +/* No flag to control the deletion, as the user cannot change how the + data is stored. */ +void destroy_bsr(bsr * const restrict bsrp); - (X, a, i, j, k), +/* the function in the paper */ + +/* X = non-terminal index, a = index of alternate, c = length of + prefix */ + +/* i = left-extent of X, and j = right-extent. k = the left-extent of + the right-most terminal or non-terminal in the alternate + corresponding to a. */ + +BOOL bsr_add(CCR_MOD(Grammar *) g, bsr * const restrict b, + NUM X, NUM a, NUM c, NUM i, NUM k, NUM j); + +BOOL bsr_find(CCR_MOD(bsr *) b, CCR_MOD(Grammar *) g, + BOOL * const restrict errorp, + NUM X, NUM a, NUM c, NUM i, NUM k, NUM j); - i.e. as a first-type BSR set. But the meaning of the "i" is - different. - - However, we shall notice that there is a natural "grading" on the - process descriptors. The k-th grading consists of the descriptors - with the last element in the tuple equal to k. In the clustered - nonterminal parsing algorithm, the process descriptors that are - added as a consequence of dealing with a process descriptor of - grading k must be of grade greater than or equal to k. This means - that if all process descriptors of grade k have been processed and - the descriptors awaiting to be processed all have grades > k, then - we won't see any process descriptors of grade <= k again. So we - can actually keep a list of process descriptors that serve the role - of the set U_k in the afore-mentionned paper. This list - corresponds to the grading of the descriptors. This way we don't - need to store the k in the descriptor. Therefore a process - descriptor is represented as - - (X, a, i, j). - - Again we store an array of arrays corresponding to X and a. Thus - the hash table simply has keys i and values j. - - The main benefit of this is that after we find that all process - descriptors in R_k have been processed, we can free those - descriptors in U_k. I think this is a good way to lower the space - requirements of this algorithm. */ - -int place(int x); +/* Change to new line after LINE_LEN many elements. */ +void bsr_print(CCR_MOD(bsr *) b, CCR_MOD(Grammar *) g, NUM line_len); #endif -- cgit v1.2.3-18-g5258