From 55dc897da6e81f2a26cfc7e66ac942824773498b Mon Sep 17 00:00:00 2001 From: JSDurand Date: Tue, 4 Jan 2022 11:51:58 +0800 Subject: temporary commit Now we can read grammars from a file. But we need to check if it works for reading strings still. --- src/Makefile.am | 12 +- src/grammar.c | 280 +++++++++++++++++++++++++--- src/grammar.h | 39 +++- src/list.c | 44 ++++- src/list.h | 11 ++ src/reader.c | 466 +++++++++++++++++++++++++++++++++++++++++++++-- src/reader.h | 30 +-- src/str.c | 86 +++++++++ src/str.h | 35 ++++ src/str_partial.h | 13 ++ src/test.txt | 7 + src/test/check_grammar.c | 36 +++- src/test/check_list.c | 29 ++- src/test/check_reader.c | 32 +++- src/utf8.c | 150 +++++++++++++++ src/utf8.h | 25 +++ src/util.c | 49 +++++ src/util.h | 4 +- 18 files changed, 1250 insertions(+), 98 deletions(-) create mode 100644 src/str.c create mode 100644 src/str.h create mode 100644 src/str_partial.h create mode 100644 src/test.txt create mode 100644 src/utf8.c create mode 100644 src/utf8.h create mode 100644 src/util.c (limited to 'src') diff --git a/src/Makefile.am b/src/Makefile.am index ffb9a5c..5eff3c5 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -1,8 +1,10 @@ AM_CFLAGS = -Wall -Wextra noinst_LIBRARIES = libeps.a -libeps_a_SOURCES = grammar.c list.c reader.c \ -grammar.h list.h util.h reader.h +libeps_a_SOURCES = grammar.c list.c util.c reader.c \ +str.c utf8.c \ +grammar.h list.h util.h reader.h \ +str_partial.h str.h utf8.h libeps_a_CFLAGS = $(AM_CFLAGS) --pedantic @@ -23,9 +25,11 @@ check_PROGRAMS = check_list check_grammar check_reader check_list_SOURCES = test/check_list.c list.c -check_grammar_SOURCES = test/check_grammar.c list.c grammar.c +check_grammar_SOURCES = test/check_grammar.c list.c grammar.c str.c \ +utf8.c -check_reader_SOURCES = test/check_reader.c list.c grammar.c reader.c +check_reader_SOURCES = test/check_reader.c list.c grammar.c reader.c \ +str.c utf8.c util.c TESTS = $(check_PROGRAMS) diff --git a/src/grammar.c b/src/grammar.c index fd38ac8..118a10e 100644 --- a/src/grammar.c +++ b/src/grammar.c @@ -1,21 +1,85 @@ #include "grammar.h" -struct TNT_s { - int type; /* 0 => T and NT otherwise */ - union { T t; NT nt; } data; -}; - struct Rule_s { NT left; - List *right; + List *right; /* a list of TNT's */ }; +typedef struct { + /* this group holds rules for this non-terminal */ + NT left; + /* a list of lists of TNT's */ + List *rights; +} Rule_group; + +/* rule_grps and names should have the same length */ struct Grammar_s { - List *rules; - /* a list of names of non-terminals */ + /* a list of Rule_group's */ + List *rule_grps; + /* a list of names of non-terminals + + Each element is an array of code-points. + + To print them, first encode them into normal strings. */ List *names; }; +static void +destroy_rule_group(void *rule_grp, int flag) +{ + Rule_group *rg = (Rule_group *) rule_grp; + + switch (flag) { + case 1: + map_list(rg->rights, destroy_list_free_all); + break; + case 2: + map_list(rg->rights, destroy_list_free_first); + break; + default: + map_list(rg->rights, destroy_list_no_free); + break; + } + + destroy_list(rg->rights, 0); + + free(rule_grp); +} + +UNUSED +static void +destroy_rule_group_no_free(void *rule_grp) +{ + destroy_rule_group(rule_grp, 0); +} + +UNUSED +static void +destroy_rule_group_free_all(void *rule_grp) +{ + destroy_rule_group(rule_grp, 1); +} + +UNUSED +static void +destroy_rule_group_free_first(void *rule_grp) +{ + destroy_rule_group(rule_grp, 2); +} + +static void +print_rule_group(void *rule_grp) +{ + Rule_group *rg = (Rule_group *) rule_grp; + + for (int i = 0; i < list_length(rg->rights); i++) { + List *str = (List *) list_nth(rg->rights, i); + printf("Rule %u => ", rg->left); + map_list(str, print_tnt); + printf("\n"); + } +} + TNT * new_tnt(int type, ...) { @@ -53,11 +117,104 @@ new_grammar() return g; } -void +/* We classify the rules into different rule groups. */ +unsigned char build_grammar(Grammar *g, List *rules, List *names) { - g->rules = rules; g->names = names; + + NUM len = list_length(names); + NUM rule_len = list_length(rules); + + /* If a rule corresponds to a non-terminal not on the list, signal + an error. */ + for (int i = 0; i < rule_len; i++) { + if (((Rule *) list_nth(rules, i))->left >= len) { + fleprintf("%s:%d:%d, Rule contains weird non-terminal\n", i); + return 1; + } + } + + List *rule_grps = new_list(); + + if (rule_grps == NULL) { + fleprintf0("%s:%d, Cannot create list\n"); + return 1; + } + + if (list_assure_size(rule_grps, len)) { + fleprintf0("%s:%d, Cannot assure size of rule groups\n"); + return 1; + } + + List *temp_ls = NULL; + void *temp_pointer = NULL; + + /* Initialize the list of rule groups */ + for (int i = 0; i < len; i++) { + if ((temp_ls = new_list()) == NULL) { + fleprintf("%s:%d:%d, Cannot create list\n", i); + + map_list(rule_grps, destroy_rule_group_no_free); + destroy_list(rule_grps, 0); + return 1; + } + + temp_pointer = MYALLOC(Rule_group, 1); + + if (temp_pointer == NULL) { + fleprintf0("%s:%d, Cannot malloc\n"); + map_list(rule_grps, destroy_rule_group_no_free); + destroy_list(rule_grps, 0); + destroy_list(temp_ls, 0); + return 1; + } + + ((Rule_group *) temp_pointer)->left = i; + ((Rule_group *) temp_pointer)->rights = temp_ls; + + int result = add_to_list(rule_grps, temp_pointer); + + if (result) { + fleprintf("%s:%d:%d, Cannot add to list\n", i); + + map_list(rule_grps, destroy_rule_group_no_free); + destroy_list(rule_grps, 0); + destroy_list(temp_ls, 0); + free(temp_pointer); + return 1; + } + } + + /* Now fill in rule groups */ + + for (int i = 0; i < rule_len; i++) { + Rule *r = (Rule *) list_nth(rules, i); + + int result = add_to_list + (((Rule_group *) list_nth(rule_grps, r->left))->rights, + r->right); + + if (result) { + fleprintf("%s:%d:%d, Cannot add to list\n", i); + + for (int j = 0; j < list_length(rule_grps); j++) { + Rule_group *rg = (Rule_group *) list_nth(rule_grps, j); + + map_list(rg->rights, destroy_list_free_all); + destroy_list(rg->rights, 0); + free(rg); + } + + destroy_list(rule_grps, 0); + + return 1; + } + } + + g->rule_grps = rule_grps; + + return 0; } void @@ -82,12 +239,61 @@ print_rule(void *r) printf("\n"); } +NUM +find_in_cpa_list(NUM *string, NUM size, List *list) +{ + NUM len = list_length(list), index = 0; + + unsigned char foundp = 0; + + for (; !foundp && index < len;) { + cpa *cpap = list_nth(list, index); + + if (cpap->size != (UNUM) size) { + index++; + continue; + } + + foundp = 1; + + for (NUM j = 0; j < size; j++) { + if (*(string+j) != *(cpap->array+j)) { + foundp = 0; + break; + } + } + + if (!foundp) index++; + } + + return (foundp) ? index : -1; +} + /* a local function pointer */ void print_name(void *element) { - char *str = (char *) element; - printf("%s", str); + cpa *array = (cpa *) element; + char *carray = MYALLOC(char, 5); + str *string = new_str(carray, 5); + + for (UNUM i = 0; i < array->size; i++) { + int result = encode(*(array->array+i), string); + + if (result) { + fleprintf("%s:%d:%llu, fail to encode\n", i); + str_set_length(string, 5); + continue; + } + + carray = get_data(string); + *(carray+str_length(string)) = 0; + + printf("%s", carray); + str_set_length(string, 5); + } + + destroy_str(string, 1); } /* a local function pointer */ @@ -103,7 +309,12 @@ print_grammar(Grammar *g) printf("Printing a grammar:\n"); map_list_between(g->names, print_name, print_sep); printf("\n"); - map_list(g->rules, print_rule); + + for (int i = 0; i < list_length(g->rule_grps); i++) { + print_rule_group(list_nth(g->rule_grps, i)); + printf("\n"); + } + printf("\n"); } @@ -142,6 +353,12 @@ new_tnt_string(char *format, int format_len, ...) return result; } +TNT * +new_tnt_pointer(size_t size) +{ + return MYALLOC(TNT, size); +} + void destroy_rule(void *rule, int flag) { @@ -168,23 +385,32 @@ destroy_rule_no_free(void *rule) destroy_rule(rule, 0); } +void +destroy_cpa_and_free_all(void *element) +{ + cpa *c = (cpa *) element; + free(c->array); + free(c); +} + void destroy_grammar(void *grammar, int flag) { Grammar *g = (Grammar *) grammar; - switch (flag) { - case 0: - map_list(g->rules, destroy_rule_no_free); - break; - case 1: - map_list(g->rules, destroy_rule_and_free_all); - break; - default: - map_list(g->rules, destroy_rule_and_free_first); - break; - } - - destroy_list(g->rules, 0); - destroy_list(g->names, flag); + if (flag == 1) + map_list(g->rule_grps, destroy_rule_group_free_all); + else if (flag == 2) + map_list(g->rule_grps, destroy_rule_group_free_first); + + destroy_list(g->rule_grps, 0); + + /* CPA structure has no concept of freeing first. So if flag is + non-zero, we just free everything, and hope no disaster + ensues. */ + if (flag) map_list(g->names, destroy_cpa_and_free_all); + + destroy_list(g->names, 0); + + free(grammar); } diff --git a/src/grammar.h b/src/grammar.h index 8353dd5..05bc1fc 100644 --- a/src/grammar.h +++ b/src/grammar.h @@ -4,6 +4,7 @@ #include #include #include "list.h" +#include "utf8.h" /* The class file for grammars */ @@ -19,6 +20,15 @@ typedef unsigned long T; typedef unsigned NT; +/* T or NT + + Don't worry, it is not explosive. */ + +struct TNT_s { + int type; /* 0 => T and NT otherwise */ + union { T t; NT nt; } data; +}; + typedef struct TNT_s TNT; typedef struct Rule_s Rule; @@ -28,16 +38,30 @@ typedef struct Rule_s Rule; strings as well. */ typedef struct Grammar_s Grammar; -/* G is expected to be allocated before this function. */ -void build_grammar(Grammar *g, List *rules, List *names); +/* G is expected to be allocated before this function. + + NAMES is a list of CPA pointers. */ +unsigned char build_grammar(Grammar *g, List *rules, List *names); /* This accepts one and only one optional argument, which is of type either T or NT, depending on TYPE being 0 or not. */ TNT *new_tnt(int type, ...); +/* An array of code-points. */ + +struct cpa_s { + NUM *array; + UNUM size; +}; + +typedef struct cpa_s cpa; + +NUM find_in_cpa_list(NUM *string, NUM size, List *list); + /* For debugging purposes, we need to print out rules to examine their values. */ +/* assume element is a cpa pointer */ void print_name(void *element); void print_tnt(void *element); @@ -56,6 +80,8 @@ void print_grammar(Grammar *g); Upon failure NULL is returned. */ List *new_tnt_string(char *format, int format_len, ...); +TNT *new_tnt_pointer(size_t size); + /* RIGHT should be created by new_tnt_string or something alike. If RIGHT is NULL, NULL is returned. */ @@ -68,6 +94,15 @@ void destroy_rule(void *rule, int flag); void destroy_rule_and_free_all(void *rule); void destroy_rule_and_free_first(void *rule); void destroy_rule_no_free(void *rule); +void destroy_cpa_and_free_all(void *element); void destroy_grammar(void *grammar, int flag); +/* convenient macro */ + +#define NEW_CPA(X, ARRAY, SIZE) do { \ + X = MYALLOC(cpa, 1); \ + X->array = ARRAY; \ + X->size = SIZE; \ + } while (0) + #endif diff --git a/src/list.c b/src/list.c index d823100..143c693 100644 --- a/src/list.c +++ b/src/list.c @@ -5,8 +5,8 @@ struct List_s { void **array; /* the array of elements */ - NUM len; /* the length of the array */ - NUM capacity; /* the number of pre-allocated bytes + NUM len; /* the length of the array */ + NUM capacity; /* the number of pre-allocated bytes for the array */ }; @@ -183,14 +183,14 @@ list_assure_size(List *ls, NUM size) if (array == NULL) return 1; - for (NUM i = 0; i < ls->capacity; i++) + for (NUM i = 0; i < ls->len; i++) *(array+i) = *(ls->array+i); ls->array = realloc(ls->array, sizeof(void*) * size); if (ls->array == NULL) return 1; - for (NUM i = 0; i < ls->capacity; i++) + for (NUM i = 0; i < ls->len; i++) *(ls->array+i) = *(array+i); free(array); @@ -206,6 +206,8 @@ list_assure_size(List *ls, NUM size) unsigned char list_set_length(List *ls, NUM len) { + list_assure_size(ls, len); + if (ls->len >= len) return 0; NUM *array = MYALLOC(NUM, len - ls->len); @@ -240,6 +242,17 @@ list_to_array(List *ls, NUM element_bytes, NUM *num) return array; } +List * +array_to_list(void **array, NUM size) +{ + List *ls = MYALLOC(List, 1); + + ls->array = array; + ls->len = ls->capacity = size; + + return ls; +} + void destroy_list(List *ls, unsigned char all_free_p) { @@ -250,8 +263,29 @@ destroy_list(List *ls, unsigned char all_free_p) if (all_free_p == 2) free(*(ls->array)); + ls->len = (ls->capacity = 0); + free(ls->array); free(ls); +} - ls->len = (ls->capacity = 0); +void +destroy_list_free_all(void *ls) +{ + List *list = (List *) ls; + destroy_list(list, 1); +} + +void +destroy_list_free_first(void *ls) +{ + List *list = (List *) ls; + destroy_list(list, 2); +} + +void +destroy_list_no_free(void *ls) +{ + List *list = (List *) ls; + destroy_list(list, 0); } diff --git a/src/list.h b/src/list.h index b710aa2..ff401f4 100644 --- a/src/list.h +++ b/src/list.h @@ -69,10 +69,21 @@ unsigned char list_set_length(List *ls, NUM len); The number of elements of the array will be stored in *NUM. */ void *list_to_array(List *ls, NUM element_bytes, NUM *num); +/* Create a list from an array. + + The array will be re-used, so it will be destroyed when the list is + destroyed. */ +List *array_to_list(void ** array, NUM size); + + /* destroy the list: If ALL_FREE_P is 1, this frees every void pointers contained in the list; if it is 2, this frees the first pointer. In any case, the list is de-allocated. */ void destroy_list(List *ls, unsigned char all_free_p); +void destroy_list_free_all(void *ls); +void destroy_list_free_first(void *ls); +void destroy_list_no_free(void *ls); + #endif diff --git a/src/reader.c b/src/reader.c index 4bbcf5a..1e7c49c 100644 --- a/src/reader.c +++ b/src/reader.c @@ -1,20 +1,462 @@ +#include "util.h" +#include "str.h" #include "reader.h" -struct Reader_s { - reader_func_t func; - List *args; /* the list of arguments; could be - NULL. */ -}; +#define COUNT_CPA(S, E, Y, L) { \ + L = 0; \ + for (NUM macroind = S; macroind < E; L++) { \ + str_info macroinfo = get_info(Y, macroind); \ + macroind += macroinfo.step; \ + } \ + } -void build_reader(Reader *reader, reader_func_t func, List *args) +#define READ_CPA(X, S, E, Y, L) { \ + L = 0; \ + for (NUM macroind = S; macroind < E;) { \ + str_info macroinfo = get_info(Y, macroind); \ + *(X+(L++)) = macroinfo.value; \ + macroind += macroinfo.step; \ + } \ + } + +/* Read a string of TNT's into TNTS. TNTS is expected to be already + allocated. */ +static void +read_tnt_string(str *s, NUM start, NUM end, TNT *tnts, List *cpa_list) { - reader->func = func; - reader->args = args; + str_info info; + + /* fleprintf("%s:%d, S = %lu, E = %lu\n", start, end); */ + + unsigned char readingp = 0, stop = 0; + + NUM *string = NULL, string_size = 0, nt_index = 0, tnt_count = 0; + + NUM strindex = 0; + str_info strinfo = EMPTY_STR_INFO; + + for (NUM index = start, last_index = start; + index < end && !stop;) { + info = get_info(s, index); + switch (info.value) { + case 10: + case 13: + /* if a newline is somehow encountered, we stop reading */ + fleprintf0("%s:%d, hi\n"); + stop = 1; + break; + case 9: + case 32: + /* space separated */ + if (readingp) { + COUNT_CPA(last_index, index, s, string_size); + string = MYALLOC(NUM, string_size+1); + READ_CPA(string, last_index, index, s, string_size); + + nt_index = find_in_cpa_list(string, string_size, cpa_list); + + if (nt_index < 0) { + fleprintf("%s:%d:%lu, Undefined non-terminal: ", index); + print_name(&((cpa) { string, string_size })); + printf("\n"); + free(string); + string = NULL; + return; + } + + free(string); + string = NULL; + + *(tnts+tnt_count++) = (TNT) { 1, { .nt = nt_index } }; + + readingp = 0; + } + break; + case 34: + case 39: + /* read a string */ + strindex = index + info.step; + strinfo = get_info(s, strindex); + + for (;strindex < end && + strinfo.value != 34 && + strinfo.value != 39 && + strinfo.value != 10 && + strinfo.value != 13;) { + + if (strinfo.value == 92) { + strindex += info.step; + + unsigned char local_exit = 0; + + /* The following errors should not happen. I am just + paranoid. */ + if (strindex >= end) { + local_exit = 1; + fleprintf("%s:%d:%lu, Escape end\n", strindex); + } + + strinfo = get_info(s, strindex); + + if (strinfo.value == 10 || strinfo.value == 13) { + local_exit = 1; + fleprintf("%s:%d:%lu, Escape newline not allowed\n", + strindex); + } + + /* cleanup */ + /* no cleanup needed */ + if (local_exit) return; + } + + *(tnts+tnt_count++) = (TNT) { 0, { .t = strinfo.value } }; + + strindex += strinfo.step; + } + break; + default: + if (!readingp) { + last_index = index; + readingp = 1; + } + + /* if we are at the end, we need to process one additional + non-terminal */ + if (index < end && index + (NUM) info.step >= end) { + /* fleprintf0("%s:%d, hello\n"); */ + COUNT_CPA(last_index, index+(NUM) info.step, + s, string_size); + /* fleprintf0("%s:%d, hello\n"); */ + string = MYALLOC(NUM, string_size+1); + READ_CPA(string, last_index, index+(NUM) info.step, + s, string_size); + /* fleprintf0("%s:%d, hello\n"); */ + + nt_index = find_in_cpa_list(string, string_size, cpa_list); + /* fleprintf0("%s:%d, hello\n"); */ + + if (nt_index < 0) { + fleprintf("%s:%d:%lu, Undefined non-terminal: ", index); + print_name(&((cpa) { string, string_size })); + printf("\n"); + eprintf("END = %lu\n", end); + free(string); + string = NULL; + return; + } + + free(string); + string = NULL; + + *(tnts+tnt_count++) = (TNT) { 1, { .nt = nt_index } }; + } + break; + } + + index += info.step; + } +} + +static void +print_int(void *p) +{ + printf("%d\n", *((int *)p)); } -void destroy_reader(Reader *reader, int flag) +/* Read a grammar from a string in the format of BNF notation. */ +Grammar * +read_grammar_from_bnf(str *s) { - destroy_list(reader->args, flag); - reader->args = NULL; - reader->func = NULL; + NUM len = str_length(s); + + /* fleprintf("%s:%d, len = %lu\n", len); */ + + /* The first loop collects names of non-terminals */ + + List *names = new_list(); + + List *line_indices = new_list(); + + unsigned char definitionp = 1, readingp = 0; + + /* last index is used to collect strings */ + for (NUM index = 0, last_index = 0; index < len;) { + str_info info = get_info(s, index); + + /* This is used to collect indices of definitions per line. */ + NUM *line_index = NULL; + + NUM *string = NULL, string_size = 0; + + cpa *cpap = NULL; + + switch (info.value) { + /* spaces and tabs are ignored */ + case ' ': + case 9: + readingp = 0; + + if (definitionp) { + fleprintf("%s:%d:%lu, A non-terminal cannot have spaces in " + "the name\n", index); + /* cleanup */ + map_list(names, destroy_cpa_and_free_all); + destroy_list(names, 0); + destroy_list(line_indices, 1); + return NULL; + } + break; + case '\n': + case 13: + definitionp = 1; + readingp = 0; + break; + case ':': + if (definitionp) { + definitionp = 0; + readingp = 0; + /* read a non-terminal */ + COUNT_CPA(last_index, index, s, string_size); + string = MYALLOC(NUM, string_size+1); + READ_CPA(string, last_index, index, s, string_size); + NEW_CPA(cpap, string, string_size); + + line_index = MYALLOC(NUM, 1); + *line_index = find_in_cpa_list(string, string_size, names); + + if (*line_index < 0) { + *line_index = list_length(names); + add_to_list(names, cpap); + } else { + destroy_cpa_and_free_all(cpap); + string = NULL; + } + + add_to_list(line_indices, line_index); + /* fleprintf0("%s:%d, Definition should be over\n"); */ + } + break; + default: + if (!readingp) { + last_index = index; + readingp = 1; + /* fleprintf0("%s:%d, Start reading\n"); */ + } + break; + } + + index += info.step; + } + + printf("%s:%d, print indices\n", __FILE__, __LINE__); + map_list(line_indices, print_int); + printf("\n"); + + /* second round collects all the rules */ + + List *rules = new_list(); + + NUM line_count = 0; + + unsigned char right_start_p = 0; + + definitionp = 1, readingp = 0; + + NUM tnt_count = 0; + NT current_nt = 0; + + /* last index is used to collect strings */ + for (NUM index = 0, last_index = 0; index < len;) { + str_info info = get_info(s, index); + /* fleprintf("%s:%d, index = %lu, value = %lu\n", index, info.value); */ + + TNT *current_tnt_string = NULL; + + void **temp_pointers = NULL; + + switch (info.value) { + /* the quote characters */ + case 34: /* double quote */ + case 39: /* single quote */ + if (definitionp) { + fleprintf("%s:%d:%lu, A non-terminal cannot have quotes in " + "the name\n", index); + /* cleanup */ + map_list(names, destroy_cpa_and_free_all); + destroy_list(names, 0); + + destroy_list(line_indices, 1); + + map_list(rules, destroy_rule_and_free_first); + destroy_list(rules, 0); + return NULL; + } else { + /* read a string of terminals */ + + NUM strindex = index + info.step; + str_info strinfo = get_info(s, strindex); + + for (;strindex < len && + strinfo.value != 13 && + strinfo.value != 10 && + strinfo.value != 34 && + strinfo.value != 39;) { + strinfo = get_info(s, strindex); + + /* escape character */ + if (strinfo.value == 92) { + strindex += strinfo.step; + if (strindex < len) strinfo = get_info(s, strindex); + + unsigned char local_error_p = 0; + + if (strindex >= len) { + local_error_p = 1; + fleprintf("%s:%d:%ld, Escape at the end not allowed\n", + strindex); + } else if (strinfo.value == 13 || strinfo.value == 10) { + local_error_p = 1; + fleprintf("%s:%d:%ld, Escape newline in string not " + "allowed\n", strindex); + } + + if (local_error_p) { + /* cleanup */ + map_list(names, destroy_cpa_and_free_all); + destroy_list(names, 0); + + destroy_list(line_indices, 1); + + map_list(rules, destroy_rule_and_free_first); + destroy_list(rules, 0); + return NULL; + } + } + /* count a terminal */ + tnt_count++; + + strindex += strinfo.step; + } + + if (strinfo.value == 13 || strinfo.value == 10) { + fleprintf("%s:%d:%ld, Newline encountered before string " + "ended\n", strindex); + /* cleanup */ + map_list(names, destroy_cpa_and_free_all); + destroy_list(names, 0); + + destroy_list(line_indices, 1); + + map_list(rules, destroy_rule_and_free_first); + destroy_list(rules, 0); + return NULL; + } + + index = strindex + 1; + } + + break; + /* spaces and tabs are ignored */ + case ' ': + case 9: + if (definitionp) { + fleprintf("%s:%d:%lu, A non-terminal cannot have spaces or " + "tabs in the name\n", index); + /* cleanup */ + map_list(names, destroy_cpa_and_free_all); + destroy_list(names, 0); + + destroy_list(line_indices, 1); + + map_list(rules, destroy_rule_and_free_first); + destroy_list(rules, 0); + return NULL; + } + + readingp = 0; + + break; + case '\n': + case 13: + definitionp = 1; + readingp = 0; + line_count++; + + /* fleprintf("%s:%d, tnt count = %lu\n", tnt_count); */ + + if (right_start_p) { + right_start_p = 0; + + current_tnt_string = new_tnt_pointer(tnt_count); + read_tnt_string(s, last_index, index, current_tnt_string, + names); + + printf("%s:%d:%lu, Printing a TNT string: ", + __FILE__, __LINE__, index); + for (int i = 0; i < (int) tnt_count;) + print_tnt(current_tnt_string+i++); + printf("\n"); + + temp_pointers = MYALLOC(void *, tnt_count); + + for (NUM tempindex = 0; tempindex < tnt_count; tempindex++) + *(temp_pointers + tempindex) = current_tnt_string+tempindex; + + add_to_list + (rules, + new_rule + (current_nt, array_to_list(temp_pointers, tnt_count))); + } else { + /* empty rule */ + add_to_list(rules, new_rule(current_nt, new_list())); + } + + tnt_count = 0; + + break; + case ':': + definitionp = 0; + readingp = 0; + right_start_p = 0; + /* the non-terminal at this line is cached by line_indices */ + current_nt = *((NUM *) list_nth(line_indices, line_count)); + /* fleprintf("%s:%d, current nt = %u, count = %lu\n", + * current_nt, line_count); */ + break; + default: + if (!readingp && !definitionp) tnt_count++; + + if (!readingp && + (definitionp || !right_start_p)) { + last_index = index; + right_start_p = 1; + } + + readingp = 1; + break; + } + + index += info.step; + } + + destroy_list(line_indices, 1); + + /* We build the grammar after the string is processed, so as to + avoid de-allocating the grammar should errors occur. */ + + Grammar *g = new_grammar(); + + if (build_grammar(g, rules, names)) { + fleprintf0("%s:%d, Failed to build the grammar\n"); + map_list(names, destroy_cpa_and_free_all); + destroy_list(names, 0); + map_list(rules, destroy_rule_and_free_first); + destroy_list(rules, 0); + destroy_grammar(g, 0); + return NULL; + } + + destroy_list(rules, 1); + + return g; } diff --git a/src/reader.h b/src/reader.h index 77e7b84..65ee7ef 100644 --- a/src/reader.h +++ b/src/reader.h @@ -1,34 +1,10 @@ #ifndef INPUT_H #define INPUT_H +#include "str.h" +#include "utf8.h" #include "grammar.h" -/* The class of a general reader, which can read a grammar, either - from a file, or elsewhere, such as from an Emacs buffer, the - standard input, or even from some other places. - - The only expectation, or requirement, of such a reader is to - provide a function with a variable number and type of arguments - which returns a grammar. Then the package provides a function to - properly read a grammar, according to the specifications of the - reader. */ - -typedef Grammar *(*reader_func_t)(List *args); - -typedef struct Reader_s Reader; - -/* contructor */ - -/* Build a reader and assign to *reader. It is expected to be - allocated before this function. */ -void build_reader(Reader *reader, reader_func_t func, List *args); - -/* destructor */ - -/* Does not free READER. - - FLAG is passed to destroy_list, which see. */ -void destroy_reader(Reader *reader, int flag); - +Grammar *read_grammar_from_bnf(str *s); #endif diff --git a/src/str.c b/src/str.c new file mode 100644 index 0000000..46dc4a8 --- /dev/null +++ b/src/str.c @@ -0,0 +1,86 @@ +#include "str.h" +#include "str_partial.h" +#include + +H_ATTR +str_info +_info_getter(str *s, UNUM n) +{ + return (str_info) { (NUM) *(s->data+n), 1 }; +} + +/* H_ATTR + * NUM + * _lenner(str *s) + * { + * return s->size; + * } */ + +str * +new_str(char *s, NUM size) +{ + str *str_pointer = MYALLOC(str, 1); + + str_pointer->size = size; + str_pointer->data = s; + str_pointer->getter = _info_getter; + /* str_pointer->lenner = _lenner; */ + + return str_pointer; +} + +void +destroy_str(str *s, int flag) +{ + if (flag) free(s->data); + + s->size = 0; + + free(s); +} + +H_ATTR +str_info +get_info(str *s, UNUM n) +{ + return s->getter(s, n); +} + +H_ATTR +NUM +str_length(str *s) +{ + return s->size; +} + +void +change_str_content(str *s, char *str, NUM size) +{ + s->data = str; + s->size = size; +} + +unsigned char +copy_str(str *dest, str *source) +{ + if (str_length(dest) < str_length(source)) + return 1; + + dest->size = source->size; + + memcpy(dest->data, source->data, source->size); + + return 0; +} + +char * +get_data(str *s) +{ + return s->data; +} + +void +str_set_length(str *s, UNUM size) +{ + s->size = size; +} diff --git a/src/str.h b/src/str.h new file mode 100644 index 0000000..39bddf6 --- /dev/null +++ b/src/str.h @@ -0,0 +1,35 @@ +#ifndef STR_H +#define STR_H + +#include + +typedef struct str_s str; + +str *new_str(char *s, NUM size); + +/* flag non-zero => free the stored character array as well */ +void destroy_str(str *s, int flag); + +/* deliberately expose this struct */ +struct str_info_s { + NUM value; + size_t step; +}; + +typedef struct str_info_s str_info; + +#define EMPTY_STR_INFO ((str_info) { 0, 1 }) + +str_info get_info(str *, UNUM); + +char *get_data(str *s); + +NUM str_length(str *); + +void str_set_length(str *s, UNUM size); + +void change_str_content(str *, char *, NUM); + +unsigned char copy_str(str *dest, str *source); + +#endif diff --git a/src/str_partial.h b/src/str_partial.h new file mode 100644 index 0000000..9d4fbd6 --- /dev/null +++ b/src/str_partial.h @@ -0,0 +1,13 @@ +#include "str.h" + +/* This is meant to be extended, and only has minimal necessary + fields. */ + +typedef str_info (*info_getter) (str *, UNUM); + +struct str_s { + UNUM size; /* the size in bytes, not in chars */ + char *data; /* a void pointer is too general */ + /* polymorphic behaviour */ + info_getter getter; +}; diff --git a/src/test.txt b/src/test.txt new file mode 100644 index 0000000..71e125b --- /dev/null +++ b/src/test.txt @@ -0,0 +1,7 @@ +S: A A C +A: B B B +B: A +A: +C: B C A +B: B B +A: A B A diff --git a/src/test/check_grammar.c b/src/test/check_grammar.c index 39f66ee..98a07cd 100644 --- a/src/test/check_grammar.c +++ b/src/test/check_grammar.c @@ -37,8 +37,7 @@ main(U_ATTR int argc, U_ATTR char **argv) List *rules = new_list(); List *names = new_list(); - - for (int i = 0; i < 3; i++) { + for (int i = 0; i < 4; i++) { tnt_string = new_tnt_string("ntn", 3, (NT) 3*(i+1), (T) 3*(i+1)*(i+1), @@ -48,18 +47,41 @@ main(U_ATTR int argc, U_ATTR char **argv) eprintf("i = %d, cannot create tnt string\n", i); map_list(rules, destroy_rule_and_free_all); - destroy_list(rules, 0); - destroy_list(names, 1); + + map_list(names, destroy_cpa_and_free_all); + destroy_list(names, 0); return 1; } - rule = new_rule(i, tnt_string); + rule = new_rule(i%3, tnt_string); add_to_list(rules, rule); char *name = MYALLOC(char, 7); snprintf(name, 7, "Rule %d", i); - add_to_list(names, name); + + utf8* uname = new_utf8(name, 6); + + str_info info = get_info((str *)uname, 0); + + NUM *varray = MYALLOC(NUM, 6); + NUM vindex = 0; + + for (NUM index = 0; + info.value >= 0 && index < str_length((str *) uname); + index += info.step, vindex++) { + info = get_info((str *)uname, index); + + *(varray+vindex) = info.value; + } + + destroy_str((str *) uname, 1); + + cpa *cpap = MYALLOC(cpa, 1); + cpap->array = varray; + cpap->size = vindex; + + add_to_list(names, cpap); } Grammar *g = new_grammar(); @@ -70,7 +92,7 @@ main(U_ATTR int argc, U_ATTR char **argv) destroy_grammar(g, 1); - free(g); + destroy_list(rules, 1); return 0; } diff --git a/src/test/check_list.c b/src/test/check_list.c index 8677cd3..fe23b2b 100644 --- a/src/test/check_list.c +++ b/src/test/check_list.c @@ -2,7 +2,7 @@ #include #include "../list.h" -void +static void print_int(void *p) { printf("%d\n", *((int *)p)); @@ -155,6 +155,8 @@ main(U_ATTR int argc, U_ATTR char **argv) exit(1); } + eprintf("Successfully created LS2!\n"); + for (NUM i = 0; i < 5; i++) { int *pointer = malloc(sizeof *pointer * 1); @@ -197,7 +199,7 @@ main(U_ATTR int argc, U_ATTR char **argv) } NUM arr[10]; - + for (NUM i = 0; i < 10; i++) { arr[i] = i*i-i+1; if (add_to_list(ls, arr+i)) { @@ -211,7 +213,28 @@ main(U_ATTR int argc, U_ATTR char **argv) destroy_list(ls, 0); + /* test array_to_list */ + int *arr3 = MYALLOC(int, 20); + + void **arr2 = NULL; + + for (NUM i = 0; i < 20; i++) { + *(arr3+i) = i*i+i+1; + } + + arr2 = MYALLOC(void *, 20); + + for (NUM i = 0; i < 20; i++) *(arr2+i) = arr3+i; + + ls = array_to_list(arr2, 20); + + printf("Printing a list from an array\n"); + + map_list(ls, print_int); + + destroy_list(ls, 2); + printf("Every test is successful!\n"); - + return 0; } diff --git a/src/test/check_reader.c b/src/test/check_reader.c index 1d73b3f..8769cd7 100644 --- a/src/test/check_reader.c +++ b/src/test/check_reader.c @@ -6,17 +6,29 @@ #include "../grammar.h" #include "../reader.h" -/* The test of reading grammars should be postponed till later. */ - -/* Grammar * - * read_grammar(List *args) - * { - * - * } */ - - +/* TODO: check string */ int main(U_ATTR int argc, U_ATTR char **argv) { - return 77; + /* return 77; */ + + char *file_name = "test.txt"; + char *buffer = MYALLOC(char, 512); + NUM buffer_size = 0; + + if (read_entire_file(file_name, &buffer, &buffer_size)) { + fleprintf("%s:%d, Cannot read file %s", file_name); + free(buffer); + return 1; + } + + utf8 *s = new_utf8(buffer, buffer_size); + + Grammar *g = read_grammar_from_bnf((str *) s); + + print_grammar(g); + destroy_grammar(g, 2); + destroy_str((str *)s, 1); + + return 0; } diff --git a/src/utf8.c b/src/utf8.c new file mode 100644 index 0000000..f840600 --- /dev/null +++ b/src/utf8.c @@ -0,0 +1,150 @@ +#include "utf8.h" +#include "str_partial.h" +#include +#include + +struct utf8_s { + str s; +}; + +/* The classification and the transition table. */ + +static const +uint8_t utf8d[] = { + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, + 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, + + 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12, + 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, + 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, + 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, + 12,36,12,12,12,12,12,12,12,12,12,12, +}; + +static UTF8_State +decode(UTF8_State *state, uint32_t *codep, uint8_t byte) +{ + uint32_t type = utf8d[byte]; + + *codep = (*state != UTF8_STATE_ACCEPT) ? + (byte & 0x3fu) | (*codep << 6) : + (0xff >> type) & ((uint32_t) byte); + + *state = utf8d[256 + *state + type]; + return *state; +} + +str_info +utf8_get_info(str *s, UNUM n) +{ + UTF8_State state = UTF8_STATE_ACCEPT; + uint32_t code = 0; + NUM orig_n = n; + + /* error */ + if (n >= s->size) return (str_info) { -1, 0 }; + + decode(&state, &code, *(s->data+n)); + + if (state == UTF8_STATE_ACCEPT) + return (str_info) { code, 1 }; + + if (state == UTF8_STATE_REJECT) + return (str_info) { -1, 1 }; + + n++; + + if (n >= s->size) return (str_info) { -1, 1 }; + + for (; n < s->size + && state > UTF8_STATE_REJECT; + n++) { + decode(&state, &code, *(s->data+n)); + } + + if (state == UTF8_STATE_ACCEPT) + return (str_info) { code, n - orig_n }; + + return (str_info) { -1, n - orig_n - 1}; +} + +utf8 * +new_utf8(char *string, UNUM size) +{ + str *s = new_str(string, size); + + s->getter = utf8_get_info; + + return (utf8 *)s; +} + +/* result should be long enough to hold the data */ +unsigned char +encode(NUM code_point, str *result) +{ + /* calculate the number of bits of CODE_POINT */ + int number_of_bits = 0; + + for (;code_point >> number_of_bits;) number_of_bits++; + + if (!number_of_bits) { + result->size = 0; + return 1; + } + + if (number_of_bits <= 7) { + if (!(result->size)) { + eprintf("%s:%d, Result cannot hold the encoded value\n", + __FILE__, __LINE__); + return 1; + } + + result->size = 1; + *(result->data) = (char) code_point; + } else if (number_of_bits <= 11) { + if (result->size < 2) { + eprintf("%s:%d, Result cannot hold the encoded value", + __FILE__, __LINE__); + return 1; + } + + result->size = 2; + *(result->data) = (char) (0xc0 | (code_point >> 6)); + *(result->data+1) = (char) (0x80 | (code_point & 0x3f)); + } else if (number_of_bits <= 16) { + if (result->size < 3) { + eprintf("%s:%d, Result cannot hold the encoded value", + __FILE__, __LINE__); + return 1; + } + + result->size = 3; + *(result->data) = (char) (0xe0 | (code_point >> 12)); + *(result->data+1) = (char) (0x80 | ((code_point >> 6) & 0x3f)); + *(result->data+2) = (char) (0x80 | (code_point & 0x3f)); + } else if (number_of_bits <= 21) { + if (result->size < 4) { + eprintf("%s:%d, Result cannot hold the encoded value", + __FILE__, __LINE__); + return 1; + } + + result->size = 4; + *(result->data) = (char) (0xf0 | (code_point >> 18)); + *(result->data+1) = (char) (0x80 | ((code_point >> 12) & 0x3f)); + *(result->data+2) = (char) (0x80 | ((code_point >> 6) & 0x3f)); + *(result->data+3) = (char) (0x80 | (code_point & 0x3f)); + } else { + eprintf("%s:%d, Invalid code point to encode", + __FILE__, __LINE__); + return 1; + } + + return 0; +} diff --git a/src/utf8.h b/src/utf8.h new file mode 100644 index 0000000..5b3c9f2 --- /dev/null +++ b/src/utf8.h @@ -0,0 +1,25 @@ +#ifndef UTF8_H +#define UTF8_H +#include "str.h" + +typedef struct utf8_s utf8; + +enum UTF8_State_e { + UTF8_STATE_ACCEPT = 0, + UTF8_STATE_REJECT = 12, + UTF8_STATE_1 = 24, + UTF8_STATE_2 = 36, + UTF8_STATE_3 = 48, + UTF8_STATE_4 = 60, + UTF8_STATE_5 = 72, + UTF8_STATE_6 = 84, + UTF8_STATE_7 = 96 +}; + +typedef enum UTF8_State_e UTF8_State; + +utf8 *new_utf8(char *string, UNUM size); + +unsigned char encode(NUM code_point, str *result); + +#endif diff --git a/src/util.c b/src/util.c new file mode 100644 index 0000000..e228fbb --- /dev/null +++ b/src/util.c @@ -0,0 +1,49 @@ +#include +#include +#include "util.h" + +/* Put the entire file content into the string str and the file size + into *len. + + If errors occur, return non-zero; otherwise the return value is + zero. + + Note that *str should be already allocated. + + Note that the length of the string str is actually 1+length, and + the last character is the null character. */ +unsigned char +read_entire_file(const char *file_name, char **str, NUM *len) +{ + long file_size = 0; + + FILE *file = fopen(file_name, "r"); + + if (!file) { + eprintf("%s:%d, Cannot open file \"%s\": ", + __FILE__, __LINE__, file_name); + perror(NULL); + return 1; + } + + fseek(file, 0, SEEK_END); + file_size = ftell(file); + fseek(file, 0, SEEK_SET); + + *len = 1+file_size; + + *str = realloc(*str, 1+file_size); + + if (*str == NULL) { + fleprintf0("%s:%d, Cannot realloc\n"); + return 1; + } + + fread(*str, sizeof(char), file_size, file); + + fclose(file); + + *(*str+file_size) = 0; + + return 0; +} diff --git a/src/util.h b/src/util.h index d657f0e..9120e9b 100644 --- a/src/util.h +++ b/src/util.h @@ -28,7 +28,9 @@ typedef unsigned long long UNUM; /* definitely bigger than size_t */ #define eprintf(...) fprintf(stderr, __VA_ARGS__) +#define fleprintf0(M) eprintf(M, __FILE__, __LINE__) +#define fleprintf(M, ...) eprintf(M, __FILE__, __LINE__, __VA_ARGS__) - +unsigned char read_entire_file(const char *file_name, char **str, NUM *len); #endif -- cgit v1.2.3-18-g5258