#ifndef BSR_H #define BSR_H #include "util.h" #include "grammar.h" /* This file implements the "Binary Subtree Representation" of a derivation forest. See the following paper for the miraculous details: Derivation representation using binary subtree sets by Elizabeth Scott, Adrian Johnstone and L.Thomas van Binsbergen. */ /* Note that there is an implementation of this GLL algorithm in the following GitHub repository: https://github.com/goccmack/gogll But the implementation there uses array look-up to determine if a BSR has already been added to a set. I guess the reason is that this is not the bottleneck of performance in its applications. But I am not satisfied with that solution. I want to take advantage of hash-table's almost constant-time look-up feature to implement this. */ /* FIXME: Don't use hash-tables, as that uses too much space! */ /* A BSR set has two types. The first type is of the form (X := alpha, i, j, k), where X := alpha is a grammar rule. This is equivalent with a 5-tuple (X, a, i, j, k), where X is the integer corresponding to X and a is the index of the rule X := alpha. The second type is of the form (X := beta . gamma, i, j, k), where beta is a prefix of the rule X := beta gamma. So this is equivalent with a 6-tuple (X, a, b, i, j, k), where a is the index of the rule X := beta gamma and b is the length of beta. The simplest way to implement this is to have two "sets", one for each type. But the two types need to be implemented in different ways. For both types, we need to know if a BSR belongs to the set in (almost) constnat time, so we exclude the possibility of linked lists first. For the first type, we also need the ability to loop through all BSRs of the form (X, ...) for a given non-terminal X. Further, we need to index the left and the right extents in almost constant time as well. So we represent the first type as an array of hash tables, whose indices correspond to nonterminals. Each hash table has as keys (i, k) and as values hash tables whose keys are those (a, j) such that (X, a, i, j, k) belong to the set. For the second type, we require the capability to loop through all BSRs of the form (X, a, b, ...) in a set for some given X, a, and b. So we represent the second type as an array of arrays of arrays of hash tables, whose array indices correspond to X, a, and b, respectively. The hash tables have keys (i, k) and values those j such that (X, a, b, i, j, k) belongs to the set. I don't know if there are better implementations. If I think of better ways in the future, I will re-implement the BSR sets. */ typedef struct bsr_s bsr; bsr *new_bsr(BOOL * const restrict errorp, Grammar *g); /* No flag to control the deletion, as the user cannot change how the data is stored. */ void destroy_bsr(bsr * const restrict bsrp); /* the function bsrAdd in the paper */ /* X = non-terminal index, a = index of alternate, c = length of prefix */ /* i = left-extent of X, and j = right-extent. k = the left-extent of the right-most terminal or non-terminal in the alternate corresponding to a. */ BOOL bsr_add(CCR_MOD(Grammar *) g, bsr * const restrict b, NUM X, NUM a, NUM c, NUM i, NUM k, NUM j); BOOL bsr_find(CCR_MOD(bsr *) b, CCR_MOD(Grammar *) g, BOOL * const restrict errorp, NUM X, NUM a, NUM c, NUM i, NUM k, NUM j); /* if not found return NULL */ ht *bsr_lookup(CCR_MOD(bsr *) b, BOOL * const restrict errorp, NUM X, NUM i, NUM j); /* Change to new line after LINE_LEN many elements. */ void bsr_print(CCR_MOD(bsr *) b, CCR_MOD(Grammar *) g, NUM line_len); #endif