summaryrefslogtreecommitdiff
path: root/src/grammar.c
diff options
context:
space:
mode:
authorJSDurand <mmemmew@gmail.com>2022-01-21 21:53:50 +0800
committerJSDurand <mmemmew@gmail.com>2022-01-21 21:53:50 +0800
commit53865aad225ffbe5cf3c42736e5a2095092f9fff (patch)
tree697ecbee7fa69ae8c58a0ec2f69cdf84d7cd313f /src/grammar.c
parent5730d6c04258e192195bfbbbed76d68fd78ed458 (diff)
temporary save point
Just to save some work.
Diffstat (limited to 'src/grammar.c')
-rw-r--r--src/grammar.c448
1 files changed, 426 insertions, 22 deletions
diff --git a/src/grammar.c b/src/grammar.c
index 692e858..a24f9cb 100644
--- a/src/grammar.c
+++ b/src/grammar.c
@@ -1,3 +1,5 @@
+#include <string.h>
+#include "ht.h"
#include "grammar.h"
struct Rule_s {
@@ -5,12 +7,51 @@ struct Rule_s {
List *right; /* a list of TNT's */
};
-typedef struct {
+struct Rule_group_s {
/* this group holds rules for this non-terminal */
NT left;
/* a list of lists of TNT's */
List *rights;
-} Rule_group;
+};
+
+NT
+rg_left(CCR_MOD(Rule_group *) rg)
+{
+ return rg->left;
+}
+
+NUM
+rg_len(CCR_MOD(Rule_group *) rg)
+{
+ return list_length(rg->rights);
+}
+
+List *
+rg_right(CCR_MOD(Rule_group *) rg)
+{
+ return rg->rights;
+}
+
+List *
+rg_nth(CCR_MOD(Rule_group *) rg, NUM n)
+{
+ if (n < 0 || n >= list_length(rg->rights)) {
+ fleprintf("Out of bounds: %ld with len = %ld\n",
+ n, list_length(rg->rights));
+ return NULL;
+ }
+
+ return (List *) list_nth(rg->rights, n);
+}
+
+/* TODO: Add some statistic counts to assist the hash tables. For
+ example, store the total number of terminals as an integer; then
+ the hash table can be hinted at initialization to contain that many
+ elements, which can reduce the number of time a hash table needs to
+ expand and re-insert all its keys, which then helps the
+ performance. But this is not the top priority thing to do, and
+ might be postponed until a proto-type of the parser generator is
+ usable already. */
/* rule_grps and names should have the same length */
struct Grammar_s {
@@ -22,6 +63,7 @@ struct Grammar_s {
To print them, first encode them into normal strings. */
List *names;
+ /* TODO: Add an array of predicates. */
};
static void
@@ -80,14 +122,14 @@ print_rule_group(void *rule_grp)
for (int i = 0; i < list_length(rg->rights); i++) {
List *str = (List *) list_nth(rg->rights, i);
- printf("Rule %u => ", rg->left);
+ printf("Rule %lu => ", rg->left);
map_list_between(str, print_tnt, print_sep);
printf("\n");
}
}
TNT *
-new_tnt(int type, ...)
+new_tnt(TNT_TYPE type, ...)
{
va_list args;
va_start(args, type);
@@ -95,8 +137,17 @@ new_tnt(int type, ...)
TNT *result = MYALLOC(TNT, 1);
result->type = type;
- if (type) result->data.nt = va_arg(args, NT);
- else result->data.t = va_arg(args, T);
+ switch (type) {
+ case TERMINAL:
+ result->data.t = va_arg(args, T);
+ break;
+ case NONTERMINAL:
+ result->data.nt = va_arg(args, NT);
+ break;
+ default:
+ result->data.pt = va_arg(args, PT);
+ break;
+ }
va_end(args);
@@ -104,7 +155,7 @@ new_tnt(int type, ...)
}
Rule *
-new_rule(NT left, const List * const right)
+new_rule(NT left, CC_MOD(List *) right)
{
if (!right) return NULL;
@@ -125,7 +176,7 @@ new_grammar()
/* We classify the rules into different rule groups. */
BOOL
-build_grammar(Grammar *g, const List *rules, const List * const names)
+build_grammar(Grammar *g, const List *rules, CC_MOD(List *) names)
{
g->names = (List *) names;
@@ -135,7 +186,7 @@ build_grammar(Grammar *g, const List *rules, const List * const names)
/* If a rule corresponds to a non-terminal not on the list, signal
an error. */
for (int i = 0; i < rule_len; i++) {
- if (((Rule *) list_nth(rules, i))->left >= len) {
+ if ((NUM) (((Rule *) list_nth(rules, i))->left) >= len) {
fleprintf("%d, Rule contains weird non-terminal\n", i);
return 1;
}
@@ -228,17 +279,25 @@ print_tnt(void *element)
{
TNT *tnt = (TNT*) element;
- if (tnt->type)
- printf("NT %u", tnt->data.nt);
- else
- printf("T %lu", tnt->data.t);
+ switch (tnt->type) {
+ case TERMINAL:
+ if (tnt->data.t == END_OF_INPUT) printf("T $");
+ else printf("T %lu", tnt->data.t);
+ break;
+ case NONTERMINAL:
+ printf("NT %lu", tnt->data.nt);
+ break;
+ default:
+ printf("PT %lu", tnt->data.pt);
+ break;
+ }
}
void
print_rule(void *r)
{
Rule *rule = (Rule *) r;
- printf("Rule: %u => ", rule->left);
+ printf("Rule: %lu => ", rule->left);
map_list(rule->right, print_tnt);
@@ -246,8 +305,8 @@ print_rule(void *r)
}
NUM
-find_in_cpa_list(const NUM * const restrict string, NUM size,
- const List * const restrict list)
+find_in_cpa_list(CCR_MOD(NUM *) string, NUM size,
+ CCR_MOD(List *) list)
{
NUM len = list_length(list), index = 0;
@@ -305,7 +364,7 @@ print_name(void *element)
/* REVIEW: Print the names of non-terminals out, instead of printing
the numbers? */
void
-print_grammar(const Grammar * const g)
+print_grammar(CC_MOD(Grammar *) g)
{
printf("Printing a grammar:\n");
map_list_between(g->names, print_name, print_sep);
@@ -335,13 +394,16 @@ new_tnt_string(char *format, int format_len, ...)
for (int point = 0; point < format_len; point++) {
switch (*(format+point)) {
case 'n':
- add_to_list(result, new_tnt(1, va_arg(args, NT)));
+ add_to_list(result, new_tnt(NONTERMINAL, va_arg(args, NT)));
break;
case 't':
- add_to_list(result, new_tnt(0, va_arg(args, T)));
+ add_to_list(result, new_tnt(TERMINAL, va_arg(args, T)));
+ break;
+ case 'p':
+ add_to_list(result, new_tnt(PREDICATE, va_arg(args, PT)));
break;
default:
- eprintf("Wront character: %c\n", *(format+point));
+ eprintf("Wrong character: %c\n", *(format+point));
destroy_list(result, 1);
va_end(args);
return NULL;
@@ -403,9 +465,9 @@ destroy_grammar(void *grammar, int flag)
map_list(g->rule_grps, destroy_rule_group_free_all);
else if (flag == 2)
map_list(g->rule_grps, destroy_rule_group_free_first);
-
+
destroy_list(g->rule_grps, 0);
-
+
/* CPA structure has no concept of freeing first. So if flag is
non-zero, we just free everything, and hope no disaster
ensues. */
@@ -415,3 +477,345 @@ destroy_grammar(void *grammar, int flag)
free(grammar);
}
+
+Rule_group *
+grammar_rule(CCR_MOD(Grammar *) g, NT nt)
+{
+ if ((NUM) nt >= list_length(g->rule_grps)) {
+ fleprintf("Invalid nonterminal: %lu\n", nt);
+ return NULL;
+ }
+
+ return (Rule_group *) list_nth(g->rule_grps, nt);
+}
+
+NUM
+grammar_left_len(CCR_MOD(Grammar *)g)
+{
+ return list_length(g->names);
+}
+
+/* A transitive closure algorithm */
+void
+epsilon_nts(CC_MOD(Grammar *) g, BOOL * const restrict nts)
+{
+ NUM left_len = grammar_left_len(g);
+
+ memset(nts, 0, sizeof(BOOL) * left_len);
+
+ BOOL changed = 0, first_time = 1;
+
+ do {
+ changed = 0;
+
+ for (NUM i = 0; i < left_len; i++) {
+ /* If this non-terminal is already known to produce the empty
+ string, then we don't need to check it again. */
+ if (*(nts+i)) continue;
+
+ Rule_group *rg = grammar_rule(g, (NT) i);
+ NUM rg_length = rg_len(rg);
+
+ for (NUM j = 0; j < rg_length; j++) {
+ List *string = rg_nth(rg, j);
+ if (first_time && list_length(string) == 0) {
+ changed = 1;
+ *(nts+i) = 1;
+ break;
+ }
+
+ NUM string_len = list_length(string);
+
+ BOOL non_epsilon = 0;
+
+ for (NUM k = 0; k < string_len;) {
+ TNT *tnte = (TNT *) list_nth(string, k++);
+ if (tnte->type == NONTERMINAL && *(nts+tnte->data.nt))
+ continue;
+
+ non_epsilon = 1;
+ break;
+ }
+
+ if (!non_epsilon && !(*(nts+i))) {
+ changed = 1;
+ *(nts+i) = 1;
+ }
+ }
+ }
+
+ first_time = 0;
+ } while (changed);
+}
+
+BOOL
+nt_first(CC_MOD(Grammar *) g, CCR_MOD(BOOL *) nts,
+ ht *terminal_hts, ht *predicate_hts)
+{
+ NUM left_len = grammar_left_len(g);
+
+ BOOL changed = 0, first = 1;
+
+ ht *terminal_ht = NULL, *pred_ht = NULL;
+
+ /* lazy macro */
+
+#define BREAKOUT k = string_len
+
+ do {
+ changed = 0;
+
+ for (NUM i = 0; i < left_len; i++) {
+ Rule_group *rg = grammar_rule(g, (NT) i);
+ NUM rg_length = rg_len(rg);
+
+ for (NUM j = 0; j < rg_length;) {
+ List *string = rg_nth(rg, j++);
+ NUM string_len = list_length(string);
+ TNT *top = NULL;
+
+ for (NUM k = 0; k < string_len;) {
+ top = (TNT *) list_nth(string, k++);
+
+ switch (top->type) {
+ case TERMINAL:
+ if (first &&
+ ht_find(terminal_hts+i, top->data.t) == NULL) {
+ changed = 1;
+ ht_insert(terminal_hts+i, top->data.t, (void*)1);
+ BREAKOUT;
+ }
+ break;
+ case PREDICATE:
+ if (first &&
+ ht_find(predicate_hts+i, top->data.pt) == NULL) {
+ changed = 1;
+ ht_insert(predicate_hts+i, top->data.pt, (void*)1);
+ BREAKOUT;
+ }
+ break;
+ default:
+ terminal_ht = terminal_hts+top->data.nt;
+ pred_ht = predicate_hts+top->data.nt;
+ /* Add all entries to the corresponding hash table for the
+ nonterminal I. Also record if new entries have been
+ found. */
+
+ NUM len = ht_size(terminal_ht);
+ NUM *keys = ht_keys(terminal_ht);
+
+ for (NUM ell = 0; ell < len;) {
+ NUM key = *(keys+ell++);
+ if (ht_find(terminal_hts+i, key) == NULL) {
+ changed = 1;
+ ht_insert(terminal_hts+i, key, (void*)1);
+ }
+ }
+
+ len = ht_size(pred_ht);
+ keys = ht_keys(pred_ht);
+
+ for (NUM ell = 0; ell < len;) {
+ NUM key = *(keys+ell++);
+ if (ht_find(predicate_hts+i, key) == NULL) {
+ changed = 1;
+ ht_insert(predicate_hts+i, key, (void*)1);
+ }
+ }
+
+ if (!(*(nts+(top->data.nt)))) BREAKOUT;
+ break;
+ }
+ }
+ }
+ }
+ first = 0;
+ } while (changed);
+
+#undef BREAKOUT
+
+ return 0;
+}
+
+BOOL
+tnt_first(CC_MOD(ht *) terminal_hts, CC_MOD(ht *) predicate_hts,
+ CCR_MOD(BOOL *) nts, NUM len, List *tnts,
+ ht * const restrict result_terminals,
+ ht * const restrict result_predicates)
+{
+ if (tnts == NULL) return 0;
+
+ NUM tnt_len = list_length(tnts);
+
+ if (!tnt_len) return 0;
+
+ NUM temp_len = 0, *keys = NULL;
+
+ TNT *top = NULL;
+
+ NT current = 0;
+
+ for (NUM i = 0; i < tnt_len; i++) {
+ top = (TNT *) list_nth(tnts, i);
+
+ switch (top->type) {
+ case TERMINAL:
+ ht_insert(result_terminals, top->data.t, (void *)1);
+ return 0;
+ break;
+ case PREDICATE:
+ ht_insert(result_predicates, top->data.pt, (void *)1);
+ return 0;
+ break;
+ default:
+ current = top->data.nt;
+
+ if (current >= (NT) len || current < 0) {
+ fleprintf("Wrong non-terminal: %ld>%ld\n", current, len);
+ return 1;
+ }
+
+ temp_len = ht_size(terminal_hts+current);
+ keys = ht_keys(terminal_hts+current);
+
+ for (NUM j = 0; j < temp_len;)
+ ht_insert(result_terminals, *(keys+j++), (void *)1);
+
+ temp_len = ht_size(predicate_hts+current);
+ keys = ht_keys(predicate_hts+current);
+
+ for (NUM j = 0; j < temp_len;)
+ ht_insert(result_predicates, *(keys+j++), (void *)1);
+
+ if (!(*(nts+current))) i = tnt_len;
+
+ break;
+ }
+ }
+
+ return 0;
+}
+
+BOOL
+nt_follow(CC_MOD(Grammar *) g, CCR_MOD(BOOL *) nts,
+ CC_MOD(ht *)terminal_hts, CC_MOD(ht *)predicate_hts,
+ ht * const restrict result_terminals,
+ ht * const restrict result_predicates)
+{
+ ht_insert(result_terminals, END_OF_INPUT, (void*)1);
+
+ NUM left_len = grammar_left_len(g);
+
+ BOOL changed = 0, first = 1;
+
+ NUM ht_len = 0, *keys = NULL;
+
+ List *temp = NULL;
+
+ do {
+ changed = 0;
+
+ for (NUM i = 0; i < left_len; i++) {
+ Rule_group *rg = grammar_rule(g, (NT) i);
+ NUM rg_length = rg_len(rg);
+
+ for (NUM j = 0; j < rg_length;) {
+ List *string = rg_nth(rg, j++);
+ NUM string_len = list_length(string);
+ TNT *top = NULL;
+
+ NUM rest_produce_epsilon = -1;
+
+ for (NUM ell = 0; ell < string_len; ell++) {
+ top = (TNT *) list_nth(string, ell);
+
+ switch (top->type) {
+ case NONTERMINAL:
+ if (!(*(nts+top->data.nt)))
+ rest_produce_epsilon = -1;
+ else if (rest_produce_epsilon < 0)
+ rest_produce_epsilon = ell;
+ break;
+ default:
+ rest_produce_epsilon = -1;
+ break;
+ }
+ }
+
+ for (NUM k = 0; k < string_len; k++) {
+ top = (TNT *) list_nth(string, k);
+
+ switch (top->type) {
+ case NONTERMINAL:
+ if (first && k+1<string_len) {
+ changed = 1;
+ temp = array_to_list
+ (list_array(string)+k+1, string_len-k-1);
+
+ if (temp == NULL) return 1;
+
+ if (tnt_first
+ (terminal_hts, predicate_hts, nts, left_len,
+ temp,
+ result_terminals+top->data.nt,
+ result_predicates+top->data.nt)) {
+ fleprintf("Error in generating the first set for %ld:"
+ " i = %ld, j = %ld, k = %ld, len = %ld\n",
+ top->data.nt, i, j, k, string_len);
+ free(temp);
+ return 1;
+ }
+
+ free(temp);
+ }
+
+ /* Test if the remaining thing produces the emtpy
+ string. */
+
+ if (k+1 == string_len ||
+ (rest_produce_epsilon >= 0 &&
+ rest_produce_epsilon <= k))
+ goto add_to_follow;
+
+ break;
+
+ add_to_follow:
+
+ ht_len = ht_size(result_terminals+i);
+ keys = ht_keys(result_terminals+i);
+
+ for (NUM ell = 0; ell < ht_len;) {
+ NUM key = *(keys+ell++);
+ if (ht_find(result_terminals+top->data.nt,
+ key) == NULL) {
+ changed = 1;
+ ht_insert(result_terminals+top->data.nt,
+ key, (void*)1);
+ }
+ }
+
+ ht_len = ht_size(result_predicates+i);
+ keys = ht_keys(result_predicates+i);
+
+ for (NUM ell = 0; ell < ht_len;) {
+ NUM key = *(keys+ell++);
+ if (ht_find(result_predicates+top->data.nt,
+ key) == NULL) {
+ changed = 1;
+ ht_insert(result_predicates+top->data.nt,
+ key, (void*)1);
+ }
+ }
+
+ break;
+ default:
+ break;
+ }
+ }
+ }
+ }
+ first = 0;
+ } while (changed);
+
+ return 0;
+}