#include #include "ht.h" #include "grammar.h" struct Rule_s { NT left; List *right; /* a list of TNT's */ }; struct Rule_group_s { /* this group holds rules for this non-terminal */ NT left; /* a list of lists of TNT's */ List *rights; }; NT rg_left(CCR_MOD(Rule_group *) rg) { return rg->left; } NUM rg_len(CCR_MOD(Rule_group *) rg) { return list_length(rg->rights); } List * rg_right(CCR_MOD(Rule_group *) rg) { return rg->rights; } List * rg_nth(CCR_MOD(Rule_group *) rg, NUM n) { if (n < 0 || n >= list_length(rg->rights)) { fleprintf("Out of bounds: %ld with len = %ld\n", n, list_length(rg->rights)); return NULL; } return (List *) list_nth(rg->rights, n); } /* TODO: Add some statistic counts to assist the hash tables. For example, store the total number of terminals as an integer; then the hash table can be hinted at initialization to contain that many elements, which can reduce the number of time a hash table needs to expand and re-insert all its keys, which then helps the performance. But this is not the top priority thing to do, and might be postponed until a proto-type of the parser generator is usable already. */ /* rule_grps and names should have the same length */ struct Grammar_s { /* a list of Rule_group's */ List *rule_grps; /* a list of names of non-terminals Each element is an array of code-points. To print them, first encode them into normal strings. */ List *names; /* TODO: Add an array of predicates. */ }; static void destroy_rule_group(void *rule_grp, int flag) { Rule_group *rg = (Rule_group *) rule_grp; switch (flag) { case 1: map_list(rg->rights, destroy_list_free_all); break; case 2: map_list(rg->rights, destroy_list_free_first); break; default: map_list(rg->rights, destroy_list_no_free); break; } destroy_list(rg->rights, 0); free(rule_grp); } UNUSED static void destroy_rule_group_no_free(void *rule_grp) { destroy_rule_group(rule_grp, 0); } UNUSED static void destroy_rule_group_free_all(void *rule_grp) { destroy_rule_group(rule_grp, 1); } UNUSED static void destroy_rule_group_free_first(void *rule_grp) { destroy_rule_group(rule_grp, 2); } static void print_sep() { printf(", "); } static void print_rule_group(void *rule_grp) { Rule_group *rg = (Rule_group *) rule_grp; for (int i = 0; i < list_length(rg->rights); i++) { List *str = (List *) list_nth(rg->rights, i); printf("Rule %lu => ", rg->left); map_list_between(str, print_tnt, print_sep); printf("\n"); } } TNT * new_tnt(TNT_TYPE type, ...) { va_list args; va_start(args, type); TNT *result = MYALLOC(TNT, 1); result->type = type; switch (type) { case TERMINAL: result->data.t = va_arg(args, T); break; case NONTERMINAL: result->data.nt = va_arg(args, NT); break; default: result->data.pt = va_arg(args, PT); break; } va_end(args); return result; } Rule * new_rule(NT left, CC_MOD(List *) right) { if (!right) return NULL; Rule *rule = MYALLOC(Rule, 1); rule->left = left; rule->right = (List *) right; return rule; } Grammar * new_grammar() { Grammar *g = MYALLOC(Grammar, 1); return g; } /* We classify the rules into different rule groups. */ BOOL build_grammar(Grammar *g, const List *rules, CC_MOD(List *) names) { g->names = (List *) names; NUM len = list_length(names); NUM rule_len = list_length(rules); /* If a rule corresponds to a non-terminal not on the list, signal an error. */ for (int i = 0; i < rule_len; i++) { if ((NUM) (((Rule *) list_nth(rules, i))->left) >= len) { fleprintf("%d, Rule contains weird non-terminal\n", i); return 1; } } List *rule_grps = new_list(); if (rule_grps == NULL) { fleprintf0("Cannot create list\n"); return 1; } if (list_assure_size(rule_grps, len)) { fleprintf0("Cannot assure size of rule groups\n"); return 1; } List *temp_ls = NULL; void *temp_pointer = NULL; /* Initialize the list of rule groups */ for (int i = 0; i < len; i++) { if ((temp_ls = new_list()) == NULL) { fleprintf("%d, Cannot create list\n", i); map_list(rule_grps, destroy_rule_group_no_free); destroy_list(rule_grps, 0); return 1; } temp_pointer = MYALLOC(Rule_group, 1); if (temp_pointer == NULL) { fleprintf0("Cannot malloc\n"); map_list(rule_grps, destroy_rule_group_no_free); destroy_list(rule_grps, 0); destroy_list(temp_ls, 0); return 1; } ((Rule_group *) temp_pointer)->left = i; ((Rule_group *) temp_pointer)->rights = temp_ls; int result = add_to_list(rule_grps, temp_pointer); if (result) { fleprintf("%d, Cannot add to list\n", i); map_list(rule_grps, destroy_rule_group_no_free); destroy_list(rule_grps, 0); destroy_list(temp_ls, 0); free(temp_pointer); return 1; } } /* Now fill in rule groups */ for (int i = 0; i < rule_len; i++) { Rule *r = (Rule *) list_nth(rules, i); int result = add_to_list (((Rule_group *) list_nth(rule_grps, r->left))->rights, r->right); if (result) { fleprintf("%d, Cannot add to list\n", i); for (int j = 0; j < list_length(rule_grps); j++) { Rule_group *rg = (Rule_group *) list_nth(rule_grps, j); map_list(rg->rights, destroy_list_free_all); destroy_list(rg->rights, 0); free(rg); } destroy_list(rule_grps, 0); return 1; } } g->rule_grps = rule_grps; return 0; } void print_tnt(void *element) { TNT *tnt = (TNT*) element; switch (tnt->type) { case TERMINAL: if (tnt->data.t == END_OF_INPUT) printf("T $"); else printf("T %lu", tnt->data.t); break; case NONTERMINAL: printf("NT %lu", tnt->data.nt); break; default: printf("PT %lu", tnt->data.pt); break; } } void print_rule(void *r) { Rule *rule = (Rule *) r; printf("Rule: %lu => ", rule->left); map_list(rule->right, print_tnt); printf("\n"); } NUM find_in_cpa_list(CCR_MOD(NUM *) string, NUM size, CCR_MOD(List *) list) { NUM len = list_length(list), index = 0; BOOL foundp = 0; for (; !foundp && index < len;) { cpa *cpap = list_nth(list, index); if (cpap->size != (UNUM) size) { index++; continue; } foundp = 1; for (NUM j = 0; j < size; j++) { if (*(string+j) != *(cpap->array+j)) { foundp = 0; break; } } if (!foundp) index++; } return (foundp) ? index : -1; } void print_name(void *element) { cpa *array = (cpa *) element; char *carray = MYALLOC(char, 5); str *string = new_str(carray, 5); for (UNUM i = 0; i < array->size; i++) { int result = encode(*(array->array+i), string); if (result) { fleprintf("%llu, fail to encode\n", i); str_set_length(string, 5); continue; } carray = get_data(string); *(carray+str_length(string)) = 0; printf("%s", carray); str_set_length(string, 5); } destroy_str(string, 1); } /* REVIEW: Print the names of non-terminals out, instead of printing the numbers? */ void print_grammar(CC_MOD(Grammar *) g) { printf("Printing a grammar:\n"); map_list_between(g->names, print_name, print_sep); printf("\n"); for (int i = 0; i < list_length(g->rule_grps); i++) { print_rule_group(list_nth(g->rule_grps, i)); printf("\n"); } printf("\n"); } List * new_tnt_string(char *format, int format_len, ...) { /* FORMAT_LEN does not include the terminating null byte, and it should be > 0. */ if (format_len <= 0) return NULL; List *result = new_list(); va_list args; va_start(args, format_len); for (int point = 0; point < format_len; point++) { switch (*(format+point)) { case 'n': add_to_list(result, new_tnt(NONTERMINAL, va_arg(args, NT))); break; case 't': add_to_list(result, new_tnt(TERMINAL, va_arg(args, T))); break; case 'p': add_to_list(result, new_tnt(PREDICATE, va_arg(args, PT))); break; default: eprintf("Wrong character: %c\n", *(format+point)); destroy_list(result, 1); va_end(args); return NULL; break; } } va_end(args); return result; } TNT * new_tnt_pointer(size_t size) { return MYALLOC(TNT, size); } void destroy_rule(void *rule, int flag) { Rule * r = (Rule *) rule; destroy_list(r->right, flag); free(rule); } void destroy_rule_and_free_all(void *rule) { destroy_rule(rule, 1); } void destroy_rule_and_free_first(void *rule) { destroy_rule(rule, 2); } void destroy_rule_no_free(void *rule) { destroy_rule(rule, 0); } void destroy_cpa_and_free_all(void *element) { cpa *c = (cpa *) element; free(c->array); free(c); } void destroy_grammar(void *grammar, int flag) { Grammar *g = (Grammar *) grammar; if (flag == 1) map_list(g->rule_grps, destroy_rule_group_free_all); else if (flag == 2) map_list(g->rule_grps, destroy_rule_group_free_first); destroy_list(g->rule_grps, 0); /* CPA structure has no concept of freeing first. So if flag is non-zero, we just free everything, and hope no disaster ensues. */ if (flag) map_list(g->names, destroy_cpa_and_free_all); destroy_list(g->names, 0); free(grammar); } Rule_group * grammar_rule(CCR_MOD(Grammar *) g, NT nt) { if ((NUM) nt >= list_length(g->rule_grps)) { fleprintf("Invalid nonterminal: %lu\n", nt); return NULL; } return (Rule_group *) list_nth(g->rule_grps, nt); } NUM grammar_left_len(CCR_MOD(Grammar *)g) { return list_length(g->names); } P_ATTR List * grammar_names(CCR_MOD(Grammar *)g) { return g->names; } /* A transitive closure algorithm */ void epsilon_nts(CC_MOD(Grammar *) g, BOOL * const restrict nts) { NUM left_len = grammar_left_len(g); memset(nts, 0, sizeof(BOOL) * left_len); BOOL changed = 0, first_time = 1; do { changed = 0; for (NUM i = 0; i < left_len; i++) { /* If this non-terminal is already known to produce the empty string, then we don't need to check it again. */ if (*(nts+i)) continue; Rule_group *rg = grammar_rule(g, (NT) i); NUM rg_length = rg_len(rg); for (NUM j = 0; j < rg_length; j++) { List *string = rg_nth(rg, j); if (first_time && list_length(string) == 0) { changed = 1; *(nts+i) = 1; break; } NUM string_len = list_length(string); BOOL non_epsilon = 0; for (NUM k = 0; k < string_len;) { TNT *tnte = (TNT *) list_nth(string, k++); if (tnte->type == NONTERMINAL && *(nts+tnte->data.nt)) continue; non_epsilon = 1; break; } if (!non_epsilon && !(*(nts+i))) { changed = 1; *(nts+i) = 1; } } } first_time = 0; } while (changed); } BOOL nt_first(CC_MOD(Grammar *) g, CCR_MOD(BOOL *) nts, ht *terminal_hts, ht *predicate_hts) { NUM left_len = grammar_left_len(g); BOOL changed = 0, first = 1; ht *terminal_ht = NULL, *pred_ht = NULL; NUM *temp = NULL; T *tempT = NULL; PT *tempPT = NULL; /* lazy macro */ #define BREAKOUT k = string_len do { changed = 0; for (NUM i = 0; i < left_len; i++) { Rule_group *rg = grammar_rule(g, (NT) i); NUM rg_length = rg_len(rg); /* fleprintf("rg_length = %ld\n", rg_length); */ for (NUM j = 0; j < rg_length; j++) { List *string = rg_nth(rg, j); NUM string_len = list_length(string); /* fleprintf("j = %ld, len = %ld\n", j, string_len); */ TNT *top = NULL; for (NUM k = 0; k < string_len; k++) { top = (TNT *) list_nth(string, k); switch (top->type) { case TERMINAL: SAFE_MALLOC(NUM, tempT, 1, return 1;); *tempT = top->data.t; if (first && ht_find(terminal_hts+i, tempT) == NULL) { changed = 1; ht_insert(terminal_hts+i, tempT, (void*)1); /* fleprintf("After insertion we find %p for %ld\n", * ht_find(terminal_hts+i, tempT), * *tempT); */ BREAKOUT; } else { free(tempT); } break; case PREDICATE: SAFE_MALLOC(NUM, tempPT, 1, return 1;); *tempPT = top->data.pt; if (first && ht_find(predicate_hts+i, tempPT) == NULL) { changed = 1; ht_insert(predicate_hts+i, tempPT, (void*)1); BREAKOUT; } else { free(tempPT); } break; default: terminal_ht = terminal_hts+top->data.nt; pred_ht = predicate_hts+top->data.nt; /* Add all entries to the corresponding hash table for the nonterminal I. Also record if new entries have been found. */ NUM len = ht_size(terminal_ht); void **keys = ht_keys(terminal_ht); for (NUM ell = 0; ell < len;) { NUM *key = *(keys+ell++); SAFE_MALLOC(NUM, temp, 1, return 1;); *temp = *key; /* fleprintf("i = %ld, j = %ld, ", i, j); */ if (ht_find(terminal_hts+i, temp) == NULL) { /* eprintf("new item\n"); */ changed = 1; ht_insert(terminal_hts+i, temp, (void*)1); } else { /* eprintf("already found for key = %ld\n", *temp); */ free(temp); } } len = ht_size(pred_ht); keys = ht_keys(pred_ht); for (NUM ell = 0; ell < len;) { NUM *key = *(keys+ell++); SAFE_MALLOC(NUM, temp, 1, return 1;); *temp = *key; /* fleprintf("i = %ld, j = %ld, ", i, j); */ if (ht_find(predicate_hts+i, temp) == NULL) { /* eprintf("new item\n"); */ changed = 1; ht_insert(predicate_hts+i, temp, (void*)1); } else { free(temp); /* eprintf("already found for key = %ld\n", *temp); */ } } if (!(*(nts+(top->data.nt)))) BREAKOUT; break; } } } } first = 0; } while (changed); #undef BREAKOUT return 0; } BOOL tnt_first(CC_MOD(ht *) terminal_hts, CC_MOD(ht *) predicate_hts, CCR_MOD(BOOL *) nts, NUM len, CCR_MOD(List *) tnts, ht * const restrict result_terminals, ht * const restrict result_predicates) { if (tnts == NULL) return 0; NUM tnt_len = list_length(tnts); if (!tnt_len) return 0; NUM temp_len = 0, **keys = NULL; TNT *top = NULL; NT current = 0; NUM *temp = NULL; T *tempT = NULL; PT *tempPT = NULL; for (NUM i = 0; i < tnt_len; i++) { top = (TNT *) list_nth(tnts, i); switch (top->type) { case TERMINAL: if (ht_find(result_terminals, &(top->data.t)) == NULL) { SAFE_MALLOC(T, tempT, 1, return 1;); *tempT = top->data.t; ht_insert(result_terminals, tempT, (void *)1); } return 0; break; case PREDICATE: if (ht_find(result_predicates, &(top->data.pt)) == NULL) { SAFE_MALLOC(PT, tempPT, 1, return 1;); *tempPT = top->data.pt; ht_insert(result_predicates, tempPT, (void *)1); } return 0; break; default: current = top->data.nt; if (current >= (NT) len || current < 0) { fleprintf("Wrong non-terminal: %ld>%ld\n", current, len); return 1; } temp_len = ht_size(terminal_hts+current); keys = (NUM **) ht_keys(terminal_hts+current); for (NUM j = 0; j < temp_len; j++) { if (ht_find(result_terminals, *(keys+j)) == NULL) { SAFE_MALLOC(NUM, temp, 1, return 1;); *temp = **(keys+j); ht_insert(result_terminals, temp, (void *)1); } } temp_len = ht_size(predicate_hts+current); keys = (NUM **) ht_keys(predicate_hts+current); for (NUM j = 0; j < temp_len; j++) { if (ht_find(result_predicates, *(keys+j)) == NULL) { SAFE_MALLOC(NUM, temp, 1, return 1;); *temp = **(keys+j); ht_insert(result_predicates, temp, (void *)1); } } if (!(*(nts+current))) i = tnt_len; break; } } return 0; } BOOL nt_follow(CC_MOD(Grammar *) g, CCR_MOD(BOOL *) nts, CC_MOD(ht *)terminal_hts, CC_MOD(ht *)predicate_hts, ht * const restrict result_terminals, ht * const restrict result_predicates) { NUM *tempN = NULL; SAFE_MALLOC(NUM, tempN, 1, return 1;); *tempN = END_OF_INPUT; if (ht_find(result_terminals, tempN) == NULL) { ht_insert(result_terminals, tempN, (void*)1); } else { free(tempN); } NUM left_len = grammar_left_len(g); BOOL changed = 0, first = 1; NUM ht_len = 0, **keys = NULL; List *temp = NULL; do { changed = 0; for (NUM i = 0; i < left_len; i++) { Rule_group *rg = grammar_rule(g, (NT) i); NUM rg_length = rg_len(rg); for (NUM j = 0; j < rg_length; j++) { List *string = rg_nth(rg, j); NUM string_len = list_length(string); TNT *top = NULL; NUM rest_produce_epsilon = -1; for (NUM ell = 0; ell < string_len; ell++) { top = (TNT *) list_nth(string, ell); switch (top->type) { case NONTERMINAL: if (!(*(nts+top->data.nt))) rest_produce_epsilon = -1; else if (rest_produce_epsilon < 0) rest_produce_epsilon = ell; break; default: rest_produce_epsilon = -1; break; } } for (NUM k = 0; k < string_len; k++) { top = (TNT *) list_nth(string, k); switch (top->type) { case NONTERMINAL: if (first && k+1data.nt, result_predicates+top->data.nt)) { fleprintf("Error in generating the first set for %ld:" " i = %ld, j = %ld, k = %ld, len = %ld\n", top->data.nt, i, j, k, string_len); free(temp); return 1; } free(temp); } /* Test if the remaining thing produces the emtpy string. */ if (k+1 == string_len || (rest_produce_epsilon >= 0 && rest_produce_epsilon <= k+1)) goto add_to_follow; break; add_to_follow: ht_len = ht_size(result_terminals+i); keys = (NUM **) ht_keys(result_terminals+i); for (NUM ell = 0; ell < ht_len;) { SAFE_MALLOC(NUM, tempN, 1, return 1;); *tempN = **(keys+ell++); if (ht_find(result_terminals+top->data.nt, tempN) == NULL) { changed = 1; ht_insert(result_terminals+top->data.nt, tempN, (void*)1); } else { free(tempN); } } ht_len = ht_size(result_predicates+i); keys = (NUM **) ht_keys(result_predicates+i); for (NUM ell = 0; ell < ht_len;) { SAFE_MALLOC(NUM, tempN, 1, return 1;); *tempN = **(keys+ell++); if (ht_find(result_predicates+top->data.nt, tempN) == NULL) { changed = 1; ht_insert(result_predicates+top->data.nt, tempN, (void*)1); } else { free(tempN); } } break; default: break; } } } } first = 0; } while (changed); return 0; }