summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJSDurand <mmemmew@gmail.com>2022-02-01 12:22:34 +0800
committerJSDurand <mmemmew@gmail.com>2022-02-01 12:22:34 +0800
commit3fb5430080199a6d92a63f8259fe4a88df9b83ba (patch)
tree767c3266b59566e98234ec81b228cf8115096939
parenteb007d554251456a2a508849edf91b15aab1333e (diff)
need to stop abusing hash tables
Hash tables take too much space! If I use hash tables, the length of the input will be severely limited, to an unacceptable extent. So we have to use arrays instead.
-rw-r--r--src/Makefile.am13
-rw-r--r--src/bsr.c27
-rw-r--r--src/bsr.h6
-rw-r--r--src/grammar.c55
-rw-r--r--src/grammar.h14
-rw-r--r--src/ht.c7
-rw-r--r--src/reader.c2
-rw-r--r--src/test/check_bsr.c2
-rw-r--r--src/test/check_cnp.c193
-rw-r--r--src/test/check_grammar.c2
-rw-r--r--src/test/check_pred.c7
11 files changed, 270 insertions, 58 deletions
diff --git a/src/Makefile.am b/src/Makefile.am
index 446961d..b897ec0 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -22,29 +22,32 @@ CLEANFILES = TAGS
# tests
check_PROGRAMS = check_list check_grammar check_reader check_dfa \
-check_ht check_bsr check_crf check_cnp
+check_ht check_bsr check_crf check_cnp check_pred
check_list_SOURCES = test/check_list.c list.c
check_grammar_SOURCES = test/check_grammar.c list.c ht.c grammar.c \
-utf8.c str.c
+utf8.c str.c dfa.c
check_reader_SOURCES = test/check_reader.c list.c grammar.c reader.c \
-str.c utf8.c util.c ht.c
+str.c utf8.c util.c ht.c dfa.c
check_dfa_SOURCES = test/check_dfa.c dfa.c list.c str.c utf8.c
check_ht_SOURCES = test/check_ht.c ht.c
check_bsr_SOURCES = test/check_bsr.c bsr.c grammar.c list.c util.c \
-ht.c utf8.c str.c
+ht.c utf8.c str.c dfa.c
check_crf_SOURCES = test/check_crf.c crf.c grammar.c list.c util.c \
-ht.c utf8.c str.c
+ht.c utf8.c str.c dfa.c
check_cnp_SOURCES = test/check_cnp.c crf.c cnp.c grammar.c list.c \
util.c ht.c utf8.c str.c dfa.c bsr.c
+check_pred_SOURCES = test/check_pred.c dfa.c list.c str.c utf8.c \
+grammar.c util.c ht.c
+
TESTS = $(check_PROGRAMS)
AM_COLOR_TESTS = always
diff --git a/src/bsr.c b/src/bsr.c
index bd0a84c..c034072 100644
--- a/src/bsr.c
+++ b/src/bsr.c
@@ -474,6 +474,33 @@ bsr_find(CCR_MOD(bsr *) b, CCR_MOD(Grammar *) g,
return result;
}
+ht *
+bsr_lookup(CCR_MOD(bsr *) b, BOOL * const restrict errorp,
+ NUM X, NUM i, NUM j)
+{
+ *errorp = 0;
+ ht *result = NULL;
+
+ if (X < 0 || X >= b->f.len) {
+ fleprintf("Invalid X: %ld\n", X);
+ goto cleanup;
+ }
+
+ if (!((b->f.array+X)->initialized)) goto success;
+
+ pair2 p2 = (pair2) { .x = i, .y = j };
+
+ result = (ht *) ht_find((b->f.array+X)->table, &p2);
+
+ goto success;
+
+ cleanup:
+ *errorp = 1;
+
+ success:
+ return result;
+}
+
static void
print_sep()
{
diff --git a/src/bsr.h b/src/bsr.h
index 6c5e17c..64e428a 100644
--- a/src/bsr.h
+++ b/src/bsr.h
@@ -25,6 +25,8 @@
hash-table's almost constant-time look-up feature to implement
this. */
+/* FIXME: Don't use hash-tables, as that uses too much space! */
+
/* A BSR set has two types.
The first type is of the form
@@ -99,6 +101,10 @@ BOOL bsr_find(CCR_MOD(bsr *) b, CCR_MOD(Grammar *) g,
BOOL * const restrict errorp,
NUM X, NUM a, NUM c, NUM i, NUM k, NUM j);
+/* if not found return NULL */
+ht *bsr_lookup(CCR_MOD(bsr *) b, BOOL * const restrict errorp,
+ NUM X, NUM i, NUM j);
+
/* Change to new line after LINE_LEN many elements. */
void bsr_print(CCR_MOD(bsr *) b, CCR_MOD(Grammar *) g, NUM line_len);
diff --git a/src/grammar.c b/src/grammar.c
index afb9353..5ed0b96 100644
--- a/src/grammar.c
+++ b/src/grammar.c
@@ -46,12 +46,44 @@ rg_nth(CCR_MOD(Rule_group *) rg, NUM n)
struct PT_DATA_s {
dfa *dfap;
- /* NULL-terminated strings */
- char *user_name;
- char *raw_name;
+ str *user_name;
+ str *raw_name;
};
-typedef struct PT_DATA_s PTD;
+PTD *
+new_ptd(str *user_name, str *raw_name, dfa *dfap)
+{
+ PTD *result = NULL;
+ SAFE_MALLOC(PTD, result, 1, return NULL;);
+
+ result->dfap = dfap;
+ result->user_name = user_name;
+ result->raw_name = raw_name;
+
+ return result;
+}
+
+void
+destroy_ptd(PTD *p, int flag)
+{
+ destroy_dfa(p->dfap);
+ destroy_str(p->user_name, flag);
+ destroy_str(p->raw_name, flag);
+
+ free(p);
+}
+
+static void
+destroy_ptd_free_all(void *e)
+{
+ destroy_ptd((PTD *)e, 1);
+}
+
+static void
+destroy_ptd_no_free(void *e)
+{
+ destroy_ptd((PTD *)e, 0);
+}
/* TODO: Add some statistic counts to assist the hash tables. For
example, store the total number of terminals as an integer; then
@@ -72,8 +104,8 @@ struct Grammar_s {
To print them, first encode them into normal strings. */
List *names;
- /* an array of predicates. */
- PTD *predicates;
+ /* a list of predicates, of type PTD. */
+ List *predicates;
};
static void
@@ -186,10 +218,11 @@ new_grammar()
/* We classify the rules into different rule groups. */
BOOL
-build_grammar(Grammar *g, const List *rules, CC_MOD(List *) names)
+build_grammar(Grammar *g, const List *rules,
+ CC_MOD(List *) names, CC_MOD(List *) predicates)
{
g->names = (List *) names;
- g->predicates = NULL;
+ g->predicates = predicates;
NUM len = list_length(names);
NUM rule_len = list_length(rules);
@@ -486,6 +519,12 @@ destroy_grammar(void *grammar, int flag)
destroy_list(g->names, 0);
+ if (g->predicates) {
+ if (flag) map_list(g->predicates, destroy_ptd_free_all);
+ else map_list(g->predicates, destroy_ptd_no_free);
+ destroy_list(g->predicates, 0);
+ }
+
free(grammar);
}
diff --git a/src/grammar.h b/src/grammar.h
index d51c612..fd8d5c9 100644
--- a/src/grammar.h
+++ b/src/grammar.h
@@ -55,7 +55,7 @@ typedef unsigned long PT;
* PT_DFA,
* PT_SPECIAL
* };
- *
+ *
* typedef enum PT_TYPE_e PT_TYPE; */
/* T or NT
@@ -70,6 +70,14 @@ enum TNT_TYPE_e {
typedef enum TNT_TYPE_e TNT_TYPE;
+typedef struct PT_DATA_s PTD;
+
+/* On error return NULL */
+PTD *new_ptd(str *user_name, str *raw_name, dfa *dfap);
+
+/* FLAG is used to destroy the strings contained within */
+void destroy_ptd(PTD *p, int flag);
+
/* If a TNT of type TERMINAL has value END_OF_INPUT, then it means,
surprisingly, the end of input. */
enum { END_OF_INPUT = -1 };
@@ -98,8 +106,8 @@ typedef struct Grammar_s Grammar;
NAMES is a list of CPA pointers. */
BOOL
-build_grammar(Grammar *g,
- const List *rules, CC_MOD(List *) names);
+build_grammar(Grammar *g, const List *rules,
+ CC_MOD(List *) names, CC_MOD(List *) predicates);
/* This accepts one and only one optional argument, which is of type
either T, NT, or PT, depending on TYPE. */
diff --git a/src/ht.c b/src/ht.c
index 45953f4..770eff6 100644
--- a/src/ht.c
+++ b/src/ht.c
@@ -3,7 +3,7 @@
#include <stdio.h>
#include "ht.h"
-#define HT_REHASH_THRESHOLD 0.5
+#define HT_REHASH_THRESHOLD 0.7
P_ATTR
static NUM
@@ -171,7 +171,10 @@ destroy_ht(ht * restrict htp, HT_DESTROY_FLAG flag)
static BOOL
ht_expand(ht *htp)
{
- htp->capability <<= 1;
+ UNUM newcap = htp->capability << 1;
+
+ if (newcap < htp->capability + (1 << 20)) htp->capability = newcap;
+ else htp->capability = htp->capability + (1 << 20);
void **keys = NULL;
NUM size = htp->size;
diff --git a/src/reader.c b/src/reader.c
index 7676d2d..0ab9b69 100644
--- a/src/reader.c
+++ b/src/reader.c
@@ -536,7 +536,7 @@ read_grammar_from_bnf(const str * const restrict s)
Grammar *g = new_grammar();
- if (build_grammar(g, rules, names)) {
+ if (build_grammar(g, rules, names, NULL)) {
fleprintf0("Failed to build the grammar\n");
map_list(names, destroy_cpa_and_free_all);
destroy_list(names, 0);
diff --git a/src/test/check_bsr.c b/src/test/check_bsr.c
index 4a6ba4e..81a84a7 100644
--- a/src/test/check_bsr.c
+++ b/src/test/check_bsr.c
@@ -108,7 +108,7 @@ main(int UNUSED argc, char ** UNUSED argv)
Grammar *g = new_grammar();
- build_grammar(g, rules, names);
+ build_grammar(g, rules, names, NULL);
print_grammar(g);
diff --git a/src/test/check_cnp.c b/src/test/check_cnp.c
index 7a0a325..91952ef 100644
--- a/src/test/check_cnp.c
+++ b/src/test/check_cnp.c
@@ -1,5 +1,17 @@
+#include "time.h"
#include "../cnp.h"
+#define TIC do { \
+ clock_gettime(CLOCK_MONOTONIC_RAW, &tic); \
+ } while (0)
+
+#define TOC do { \
+ clock_gettime(CLOCK_MONOTONIC_RAW, &toc); \
+ printf("\nTotal time = %f seconds\n", \
+ (toc.tv_nsec - tic.tv_nsec) / 1000000000.0 + \
+ toc.tv_sec - tic.tv_sec); \
+ } while (0)
+
#define READ_INTO_CPA(N, U, L, I, VA, VI, CP) do { \
U = new_utf8(N, L); \
I = get_info((str *)U, 0); \
@@ -34,6 +46,8 @@
add_to_list(rules, rule); \
} while (0)
+#define GRAMMAR 2
+
int
main(int UNUSED argc, char ** UNUSED argv)
{
@@ -44,9 +58,6 @@ main(int UNUSED argc, char ** UNUSED argv)
List *names = new_list();
char *name = MYALLOC(char, 2);
- *(name+1) = 0;
- *name = 'B';
-
utf8* uname = NULL;
str_info info = EMPTY_STR_INFO;
@@ -55,58 +66,166 @@ main(int UNUSED argc, char ** UNUSED argv)
NUM vindex = 0;
cpa *cpap = NULL;
- READ_INTO_CPA(name, uname, 1, info, varray, vindex, cpap);
-
- name = MYALLOC(char, 3);
- *(name+1) = 0;
- *name = 'U';
-
- READ_INTO_CPA(name, uname, 1, info, varray, vindex, cpap);
-
- /* name = MYALLOC(char, 2);
- * *(name+1) = 0;
- * *name = 'F';
- *
- * READ_INTO_CPA(name, uname, 1, info, varray, vindex, cpap);
- *
- * name = MYALLOC(char, 2);
- * *(name+1) = 0;
- * *name = 'D';
- *
- * READ_INTO_CPA(name, uname, 1, info, varray, vindex, cpap); */
-
- READ_TNT_STRING(0, "nn", 2, (NT) 1, (NT) 0);
- rule = new_rule(0, new_list());
- add_to_list(rules, rule);
-
- READ_TNT_STRING(1, "t", 1, (T) '>');
- READ_TNT_STRING(1, "t", 1, (T) '<');
- READ_TNT_STRING(1, "t", 1, (T) '+');
- READ_TNT_STRING(1, "t", 1, (T) '-');
- READ_TNT_STRING(1, "t", 1, (T) ',');
- READ_TNT_STRING(1, "t", 1, (T) '.');
- READ_TNT_STRING(1, "tnt", 3, (T) '[', (NT) 0, (T) ']');
+ switch (GRAMMAR) {
+ case 1:
+ *(name+1) = 0;
+ *name = 'S';
+
+ READ_INTO_CPA(name, uname, 1, info, varray, vindex, cpap);
+
+ name = MYALLOC(char, 3);
+ *(name+1) = 0;
+ *name = 'A';
+
+ READ_INTO_CPA(name, uname, 1, info, varray, vindex, cpap);
+
+ name = MYALLOC(char, 2);
+ *(name+1) = 0;
+ *name = 'B';
+
+ READ_INTO_CPA(name, uname, 1, info, varray, vindex, cpap);
+
+ READ_TNT_STRING(0, "tnn", 3, (T) 'a', (NT) 1, (NT) 2);
+ READ_TNT_STRING(0, "tnt", 3, (T) 'a', (NT) 1, (T) 'b');
+
+ READ_TNT_STRING(1, "t", 1, (T) 'a');
+ READ_TNT_STRING(1, "t", 1, (T) 'c');
+ rule = new_rule(1, new_list());
+ add_to_list(rules, rule);
+
+ READ_TNT_STRING(2, "t", 1, (T) 'b');
+ READ_TNT_STRING(2, "nt", 2, (NT) 2, (T) 'c');
+ rule = new_rule(2, new_list());
+ add_to_list(rules, rule);
+ break;
+ case 2:
+ *(name+1) = 0;
+ *name = 'B';
+
+ READ_INTO_CPA(name, uname, 1, info, varray, vindex, cpap);
+
+ name = MYALLOC(char, 2);
+ *(name+1) = 0;
+ *name = 'U';
+
+ READ_INTO_CPA(name, uname, 1, info, varray, vindex, cpap);
+
+ READ_TNT_STRING(0, "nn", 2, (NT) 1, (NT) 0);
+ rule = new_rule(0, new_list());
+ add_to_list(rules, rule);
+
+ READ_TNT_STRING(1, "t", 1, (T) '>');
+ READ_TNT_STRING(1, "t", 1, (T) '<');
+ READ_TNT_STRING(1, "t", 1, (T) '+');
+ READ_TNT_STRING(1, "t", 1, (T) '-');
+ READ_TNT_STRING(1, "t", 1, (T) ',');
+ READ_TNT_STRING(1, "t", 1, (T) '.');
+ READ_TNT_STRING(1, "tnt", 3, (T) '[', (NT) 0, (T) ']');
+ break;
+ case 3:
+ *(name+1) = 0;
+ *name = 'S';
+ READ_INTO_CPA(name, uname, 1, info, varray, vindex, cpap);
+
+ READ_TNT_STRING(0, "t", 1, (T) 'b');
+ READ_TNT_STRING(0, "nn", 2, (NT) 0, (NT) 0);
+ READ_TNT_STRING(0, "nnn", 3, (NT) 0, (NT) 0, (NT) 0);
+ break;
+ case 4:
+ *(name+1) = 0;
+ *name = 'S';
+ READ_INTO_CPA(name, uname, 1, info, varray, vindex, cpap);
+
+ name = MYALLOC(char, 2);
+ *(name+1) = 0;
+ *name = 'T';
+ READ_INTO_CPA(name, uname, 1, info, varray, vindex, cpap);
+
+ name = MYALLOC(char, 2);
+ *(name+1) = 0;
+ *name = 'U';
+ READ_INTO_CPA(name, uname, 1, info, varray, vindex, cpap);
+
+ READ_TNT_STRING(0, "nnn", 3, (NT) 0, (NT) 1, (NT) 2);
+ READ_TNT_STRING(1, "nnn", 3, (NT) 1, (NT) 2, (NT) 0);
+ READ_TNT_STRING(2, "nnn", 3, (NT) 2, (NT) 0, (NT) 1);
+ rule = new_rule(0, new_list());
+ add_to_list(rules, rule);
+ rule = new_rule(1, new_list());
+ add_to_list(rules, rule);
+ rule = new_rule(2, new_list());
+ add_to_list(rules, rule);
+ READ_TNT_STRING(0, "t", 1, (T) 'a');
+ break;
+ default:
+ fleprintf0("Forgot to assign grammar!\n");
+ break;
+ }
Grammar *g = new_grammar();
- build_grammar(g, rules, names);
+ build_grammar(g, rules, names, NULL);
print_grammar(g);
- utf8 *string = new_utf8("[,[.[+]]-]", 10);
+ utf8 *string = NULL;
+
+ switch (GRAMMAR) {
+ case 1:
+ string = new_utf8("aab", 3);
+ break;
+ case 2:
+ string = new_utf8("[+-[.,]><][+-[.,]><][+-[.,]><][+-[.,]><][+-[.,]><][+-[.,]><][+-[.,]><][+-[.,]><][+-[.,]><][+-[.,]><]", 100);
+ break;
+ case 3:
+ string = new_utf8("bbbbb", 5);
+ break;
+ case 4:
+ string = new_utf8("aaaaaaaaaaaa", 12);
+ break;
+ default:
+ fleprintf0("Forgot to assign input!\n");
+ break;
+ }
printf("\nPrinting the input...\n%s\n", get_data((str *) string));
+ struct timespec tic, toc;
+ TIC;
+
Environment *env = cnp_parse(g, (str *) string);
+ TOC;
+
if (env) {
if (!(env_error_p(env))) {
- printf("Successfully parsed the input:\n");
+ BOOL error = 0;
+ ht *result = bsr_lookup
+ (env_bsrp(env), &error, 0, 0, str_length((str *) string));
+
+ if (error) {
+ printf("There are errors!\n");
+ goto destroy;
+ }
+
+ if (result && ht_size(result)) {
+ printf("\nSuccessfully parsed the input!\n");
+ for (NUM i = 0; i < ht_size(result); i++)
+ printf("(%d, %ld, %d, %ld, %ld)\n",
+ 0, (*((pair2 **) ht_keys(result)+i))->x,
+ 0, (*((pair2 **) ht_keys(result)+i))->y,
+ str_length((str *) string));
+ } else {
+ printf("\nThe input does not parse!\n");
+ }
+
+ printf("\nAll BSRs follow:\n\n");
bsr_print(env_bsrp(env), env_grammar(env), 1);
} else {
printf("There are errors!\n");
}
+ destroy:
destroy_env(env);
}
diff --git a/src/test/check_grammar.c b/src/test/check_grammar.c
index 7e55681..e66b262 100644
--- a/src/test/check_grammar.c
+++ b/src/test/check_grammar.c
@@ -136,7 +136,7 @@ main(UNUSED int argc, UNUSED char **argv)
Grammar *g = new_grammar();
- build_grammar(g, rules, names);
+ build_grammar(g, rules, names, NULL);
print_grammar(g);
diff --git a/src/test/check_pred.c b/src/test/check_pred.c
new file mode 100644
index 0000000..b0b95b3
--- /dev/null
+++ b/src/test/check_pred.c
@@ -0,0 +1,7 @@
+#include "../grammar.h"
+
+int
+main(int UNUSED argc, char ** UNUSED argv)
+{
+ return 77;
+}