From 3fb5430080199a6d92a63f8259fe4a88df9b83ba Mon Sep 17 00:00:00 2001 From: JSDurand Date: Tue, 1 Feb 2022 12:22:34 +0800 Subject: need to stop abusing hash tables Hash tables take too much space! If I use hash tables, the length of the input will be severely limited, to an unacceptable extent. So we have to use arrays instead. --- src/Makefile.am | 13 ++-- src/bsr.c | 27 +++++++ src/bsr.h | 6 ++ src/grammar.c | 55 ++++++++++++-- src/grammar.h | 14 +++- src/ht.c | 7 +- src/reader.c | 2 +- src/test/check_bsr.c | 2 +- src/test/check_cnp.c | 193 ++++++++++++++++++++++++++++++++++++++--------- src/test/check_grammar.c | 2 +- src/test/check_pred.c | 7 ++ 11 files changed, 270 insertions(+), 58 deletions(-) create mode 100644 src/test/check_pred.c (limited to 'src') diff --git a/src/Makefile.am b/src/Makefile.am index 446961d..b897ec0 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -22,29 +22,32 @@ CLEANFILES = TAGS # tests check_PROGRAMS = check_list check_grammar check_reader check_dfa \ -check_ht check_bsr check_crf check_cnp +check_ht check_bsr check_crf check_cnp check_pred check_list_SOURCES = test/check_list.c list.c check_grammar_SOURCES = test/check_grammar.c list.c ht.c grammar.c \ -utf8.c str.c +utf8.c str.c dfa.c check_reader_SOURCES = test/check_reader.c list.c grammar.c reader.c \ -str.c utf8.c util.c ht.c +str.c utf8.c util.c ht.c dfa.c check_dfa_SOURCES = test/check_dfa.c dfa.c list.c str.c utf8.c check_ht_SOURCES = test/check_ht.c ht.c check_bsr_SOURCES = test/check_bsr.c bsr.c grammar.c list.c util.c \ -ht.c utf8.c str.c +ht.c utf8.c str.c dfa.c check_crf_SOURCES = test/check_crf.c crf.c grammar.c list.c util.c \ -ht.c utf8.c str.c +ht.c utf8.c str.c dfa.c check_cnp_SOURCES = test/check_cnp.c crf.c cnp.c grammar.c list.c \ util.c ht.c utf8.c str.c dfa.c bsr.c +check_pred_SOURCES = test/check_pred.c dfa.c list.c str.c utf8.c \ +grammar.c util.c ht.c + TESTS = $(check_PROGRAMS) AM_COLOR_TESTS = always diff --git a/src/bsr.c b/src/bsr.c index bd0a84c..c034072 100644 --- a/src/bsr.c +++ b/src/bsr.c @@ -474,6 +474,33 @@ bsr_find(CCR_MOD(bsr *) b, CCR_MOD(Grammar *) g, return result; } +ht * +bsr_lookup(CCR_MOD(bsr *) b, BOOL * const restrict errorp, + NUM X, NUM i, NUM j) +{ + *errorp = 0; + ht *result = NULL; + + if (X < 0 || X >= b->f.len) { + fleprintf("Invalid X: %ld\n", X); + goto cleanup; + } + + if (!((b->f.array+X)->initialized)) goto success; + + pair2 p2 = (pair2) { .x = i, .y = j }; + + result = (ht *) ht_find((b->f.array+X)->table, &p2); + + goto success; + + cleanup: + *errorp = 1; + + success: + return result; +} + static void print_sep() { diff --git a/src/bsr.h b/src/bsr.h index 6c5e17c..64e428a 100644 --- a/src/bsr.h +++ b/src/bsr.h @@ -25,6 +25,8 @@ hash-table's almost constant-time look-up feature to implement this. */ +/* FIXME: Don't use hash-tables, as that uses too much space! */ + /* A BSR set has two types. The first type is of the form @@ -99,6 +101,10 @@ BOOL bsr_find(CCR_MOD(bsr *) b, CCR_MOD(Grammar *) g, BOOL * const restrict errorp, NUM X, NUM a, NUM c, NUM i, NUM k, NUM j); +/* if not found return NULL */ +ht *bsr_lookup(CCR_MOD(bsr *) b, BOOL * const restrict errorp, + NUM X, NUM i, NUM j); + /* Change to new line after LINE_LEN many elements. */ void bsr_print(CCR_MOD(bsr *) b, CCR_MOD(Grammar *) g, NUM line_len); diff --git a/src/grammar.c b/src/grammar.c index afb9353..5ed0b96 100644 --- a/src/grammar.c +++ b/src/grammar.c @@ -46,12 +46,44 @@ rg_nth(CCR_MOD(Rule_group *) rg, NUM n) struct PT_DATA_s { dfa *dfap; - /* NULL-terminated strings */ - char *user_name; - char *raw_name; + str *user_name; + str *raw_name; }; -typedef struct PT_DATA_s PTD; +PTD * +new_ptd(str *user_name, str *raw_name, dfa *dfap) +{ + PTD *result = NULL; + SAFE_MALLOC(PTD, result, 1, return NULL;); + + result->dfap = dfap; + result->user_name = user_name; + result->raw_name = raw_name; + + return result; +} + +void +destroy_ptd(PTD *p, int flag) +{ + destroy_dfa(p->dfap); + destroy_str(p->user_name, flag); + destroy_str(p->raw_name, flag); + + free(p); +} + +static void +destroy_ptd_free_all(void *e) +{ + destroy_ptd((PTD *)e, 1); +} + +static void +destroy_ptd_no_free(void *e) +{ + destroy_ptd((PTD *)e, 0); +} /* TODO: Add some statistic counts to assist the hash tables. For example, store the total number of terminals as an integer; then @@ -72,8 +104,8 @@ struct Grammar_s { To print them, first encode them into normal strings. */ List *names; - /* an array of predicates. */ - PTD *predicates; + /* a list of predicates, of type PTD. */ + List *predicates; }; static void @@ -186,10 +218,11 @@ new_grammar() /* We classify the rules into different rule groups. */ BOOL -build_grammar(Grammar *g, const List *rules, CC_MOD(List *) names) +build_grammar(Grammar *g, const List *rules, + CC_MOD(List *) names, CC_MOD(List *) predicates) { g->names = (List *) names; - g->predicates = NULL; + g->predicates = predicates; NUM len = list_length(names); NUM rule_len = list_length(rules); @@ -486,6 +519,12 @@ destroy_grammar(void *grammar, int flag) destroy_list(g->names, 0); + if (g->predicates) { + if (flag) map_list(g->predicates, destroy_ptd_free_all); + else map_list(g->predicates, destroy_ptd_no_free); + destroy_list(g->predicates, 0); + } + free(grammar); } diff --git a/src/grammar.h b/src/grammar.h index d51c612..fd8d5c9 100644 --- a/src/grammar.h +++ b/src/grammar.h @@ -55,7 +55,7 @@ typedef unsigned long PT; * PT_DFA, * PT_SPECIAL * }; - * + * * typedef enum PT_TYPE_e PT_TYPE; */ /* T or NT @@ -70,6 +70,14 @@ enum TNT_TYPE_e { typedef enum TNT_TYPE_e TNT_TYPE; +typedef struct PT_DATA_s PTD; + +/* On error return NULL */ +PTD *new_ptd(str *user_name, str *raw_name, dfa *dfap); + +/* FLAG is used to destroy the strings contained within */ +void destroy_ptd(PTD *p, int flag); + /* If a TNT of type TERMINAL has value END_OF_INPUT, then it means, surprisingly, the end of input. */ enum { END_OF_INPUT = -1 }; @@ -98,8 +106,8 @@ typedef struct Grammar_s Grammar; NAMES is a list of CPA pointers. */ BOOL -build_grammar(Grammar *g, - const List *rules, CC_MOD(List *) names); +build_grammar(Grammar *g, const List *rules, + CC_MOD(List *) names, CC_MOD(List *) predicates); /* This accepts one and only one optional argument, which is of type either T, NT, or PT, depending on TYPE. */ diff --git a/src/ht.c b/src/ht.c index 45953f4..770eff6 100644 --- a/src/ht.c +++ b/src/ht.c @@ -3,7 +3,7 @@ #include #include "ht.h" -#define HT_REHASH_THRESHOLD 0.5 +#define HT_REHASH_THRESHOLD 0.7 P_ATTR static NUM @@ -171,7 +171,10 @@ destroy_ht(ht * restrict htp, HT_DESTROY_FLAG flag) static BOOL ht_expand(ht *htp) { - htp->capability <<= 1; + UNUM newcap = htp->capability << 1; + + if (newcap < htp->capability + (1 << 20)) htp->capability = newcap; + else htp->capability = htp->capability + (1 << 20); void **keys = NULL; NUM size = htp->size; diff --git a/src/reader.c b/src/reader.c index 7676d2d..0ab9b69 100644 --- a/src/reader.c +++ b/src/reader.c @@ -536,7 +536,7 @@ read_grammar_from_bnf(const str * const restrict s) Grammar *g = new_grammar(); - if (build_grammar(g, rules, names)) { + if (build_grammar(g, rules, names, NULL)) { fleprintf0("Failed to build the grammar\n"); map_list(names, destroy_cpa_and_free_all); destroy_list(names, 0); diff --git a/src/test/check_bsr.c b/src/test/check_bsr.c index 4a6ba4e..81a84a7 100644 --- a/src/test/check_bsr.c +++ b/src/test/check_bsr.c @@ -108,7 +108,7 @@ main(int UNUSED argc, char ** UNUSED argv) Grammar *g = new_grammar(); - build_grammar(g, rules, names); + build_grammar(g, rules, names, NULL); print_grammar(g); diff --git a/src/test/check_cnp.c b/src/test/check_cnp.c index 7a0a325..91952ef 100644 --- a/src/test/check_cnp.c +++ b/src/test/check_cnp.c @@ -1,5 +1,17 @@ +#include "time.h" #include "../cnp.h" +#define TIC do { \ + clock_gettime(CLOCK_MONOTONIC_RAW, &tic); \ + } while (0) + +#define TOC do { \ + clock_gettime(CLOCK_MONOTONIC_RAW, &toc); \ + printf("\nTotal time = %f seconds\n", \ + (toc.tv_nsec - tic.tv_nsec) / 1000000000.0 + \ + toc.tv_sec - tic.tv_sec); \ + } while (0) + #define READ_INTO_CPA(N, U, L, I, VA, VI, CP) do { \ U = new_utf8(N, L); \ I = get_info((str *)U, 0); \ @@ -34,6 +46,8 @@ add_to_list(rules, rule); \ } while (0) +#define GRAMMAR 2 + int main(int UNUSED argc, char ** UNUSED argv) { @@ -44,9 +58,6 @@ main(int UNUSED argc, char ** UNUSED argv) List *names = new_list(); char *name = MYALLOC(char, 2); - *(name+1) = 0; - *name = 'B'; - utf8* uname = NULL; str_info info = EMPTY_STR_INFO; @@ -55,58 +66,166 @@ main(int UNUSED argc, char ** UNUSED argv) NUM vindex = 0; cpa *cpap = NULL; - READ_INTO_CPA(name, uname, 1, info, varray, vindex, cpap); - - name = MYALLOC(char, 3); - *(name+1) = 0; - *name = 'U'; - - READ_INTO_CPA(name, uname, 1, info, varray, vindex, cpap); - - /* name = MYALLOC(char, 2); - * *(name+1) = 0; - * *name = 'F'; - * - * READ_INTO_CPA(name, uname, 1, info, varray, vindex, cpap); - * - * name = MYALLOC(char, 2); - * *(name+1) = 0; - * *name = 'D'; - * - * READ_INTO_CPA(name, uname, 1, info, varray, vindex, cpap); */ - - READ_TNT_STRING(0, "nn", 2, (NT) 1, (NT) 0); - rule = new_rule(0, new_list()); - add_to_list(rules, rule); - - READ_TNT_STRING(1, "t", 1, (T) '>'); - READ_TNT_STRING(1, "t", 1, (T) '<'); - READ_TNT_STRING(1, "t", 1, (T) '+'); - READ_TNT_STRING(1, "t", 1, (T) '-'); - READ_TNT_STRING(1, "t", 1, (T) ','); - READ_TNT_STRING(1, "t", 1, (T) '.'); - READ_TNT_STRING(1, "tnt", 3, (T) '[', (NT) 0, (T) ']'); + switch (GRAMMAR) { + case 1: + *(name+1) = 0; + *name = 'S'; + + READ_INTO_CPA(name, uname, 1, info, varray, vindex, cpap); + + name = MYALLOC(char, 3); + *(name+1) = 0; + *name = 'A'; + + READ_INTO_CPA(name, uname, 1, info, varray, vindex, cpap); + + name = MYALLOC(char, 2); + *(name+1) = 0; + *name = 'B'; + + READ_INTO_CPA(name, uname, 1, info, varray, vindex, cpap); + + READ_TNT_STRING(0, "tnn", 3, (T) 'a', (NT) 1, (NT) 2); + READ_TNT_STRING(0, "tnt", 3, (T) 'a', (NT) 1, (T) 'b'); + + READ_TNT_STRING(1, "t", 1, (T) 'a'); + READ_TNT_STRING(1, "t", 1, (T) 'c'); + rule = new_rule(1, new_list()); + add_to_list(rules, rule); + + READ_TNT_STRING(2, "t", 1, (T) 'b'); + READ_TNT_STRING(2, "nt", 2, (NT) 2, (T) 'c'); + rule = new_rule(2, new_list()); + add_to_list(rules, rule); + break; + case 2: + *(name+1) = 0; + *name = 'B'; + + READ_INTO_CPA(name, uname, 1, info, varray, vindex, cpap); + + name = MYALLOC(char, 2); + *(name+1) = 0; + *name = 'U'; + + READ_INTO_CPA(name, uname, 1, info, varray, vindex, cpap); + + READ_TNT_STRING(0, "nn", 2, (NT) 1, (NT) 0); + rule = new_rule(0, new_list()); + add_to_list(rules, rule); + + READ_TNT_STRING(1, "t", 1, (T) '>'); + READ_TNT_STRING(1, "t", 1, (T) '<'); + READ_TNT_STRING(1, "t", 1, (T) '+'); + READ_TNT_STRING(1, "t", 1, (T) '-'); + READ_TNT_STRING(1, "t", 1, (T) ','); + READ_TNT_STRING(1, "t", 1, (T) '.'); + READ_TNT_STRING(1, "tnt", 3, (T) '[', (NT) 0, (T) ']'); + break; + case 3: + *(name+1) = 0; + *name = 'S'; + READ_INTO_CPA(name, uname, 1, info, varray, vindex, cpap); + + READ_TNT_STRING(0, "t", 1, (T) 'b'); + READ_TNT_STRING(0, "nn", 2, (NT) 0, (NT) 0); + READ_TNT_STRING(0, "nnn", 3, (NT) 0, (NT) 0, (NT) 0); + break; + case 4: + *(name+1) = 0; + *name = 'S'; + READ_INTO_CPA(name, uname, 1, info, varray, vindex, cpap); + + name = MYALLOC(char, 2); + *(name+1) = 0; + *name = 'T'; + READ_INTO_CPA(name, uname, 1, info, varray, vindex, cpap); + + name = MYALLOC(char, 2); + *(name+1) = 0; + *name = 'U'; + READ_INTO_CPA(name, uname, 1, info, varray, vindex, cpap); + + READ_TNT_STRING(0, "nnn", 3, (NT) 0, (NT) 1, (NT) 2); + READ_TNT_STRING(1, "nnn", 3, (NT) 1, (NT) 2, (NT) 0); + READ_TNT_STRING(2, "nnn", 3, (NT) 2, (NT) 0, (NT) 1); + rule = new_rule(0, new_list()); + add_to_list(rules, rule); + rule = new_rule(1, new_list()); + add_to_list(rules, rule); + rule = new_rule(2, new_list()); + add_to_list(rules, rule); + READ_TNT_STRING(0, "t", 1, (T) 'a'); + break; + default: + fleprintf0("Forgot to assign grammar!\n"); + break; + } Grammar *g = new_grammar(); - build_grammar(g, rules, names); + build_grammar(g, rules, names, NULL); print_grammar(g); - utf8 *string = new_utf8("[,[.[+]]-]", 10); + utf8 *string = NULL; + + switch (GRAMMAR) { + case 1: + string = new_utf8("aab", 3); + break; + case 2: + string = new_utf8("[+-[.,]><][+-[.,]><][+-[.,]><][+-[.,]><][+-[.,]><][+-[.,]><][+-[.,]><][+-[.,]><][+-[.,]><][+-[.,]><]", 100); + break; + case 3: + string = new_utf8("bbbbb", 5); + break; + case 4: + string = new_utf8("aaaaaaaaaaaa", 12); + break; + default: + fleprintf0("Forgot to assign input!\n"); + break; + } printf("\nPrinting the input...\n%s\n", get_data((str *) string)); + struct timespec tic, toc; + TIC; + Environment *env = cnp_parse(g, (str *) string); + TOC; + if (env) { if (!(env_error_p(env))) { - printf("Successfully parsed the input:\n"); + BOOL error = 0; + ht *result = bsr_lookup + (env_bsrp(env), &error, 0, 0, str_length((str *) string)); + + if (error) { + printf("There are errors!\n"); + goto destroy; + } + + if (result && ht_size(result)) { + printf("\nSuccessfully parsed the input!\n"); + for (NUM i = 0; i < ht_size(result); i++) + printf("(%d, %ld, %d, %ld, %ld)\n", + 0, (*((pair2 **) ht_keys(result)+i))->x, + 0, (*((pair2 **) ht_keys(result)+i))->y, + str_length((str *) string)); + } else { + printf("\nThe input does not parse!\n"); + } + + printf("\nAll BSRs follow:\n\n"); bsr_print(env_bsrp(env), env_grammar(env), 1); } else { printf("There are errors!\n"); } + destroy: destroy_env(env); } diff --git a/src/test/check_grammar.c b/src/test/check_grammar.c index 7e55681..e66b262 100644 --- a/src/test/check_grammar.c +++ b/src/test/check_grammar.c @@ -136,7 +136,7 @@ main(UNUSED int argc, UNUSED char **argv) Grammar *g = new_grammar(); - build_grammar(g, rules, names); + build_grammar(g, rules, names, NULL); print_grammar(g); diff --git a/src/test/check_pred.c b/src/test/check_pred.c new file mode 100644 index 0000000..b0b95b3 --- /dev/null +++ b/src/test/check_pred.c @@ -0,0 +1,7 @@ +#include "../grammar.h" + +int +main(int UNUSED argc, char ** UNUSED argv) +{ + return 77; +} -- cgit v1.2.3-18-g5258