summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/Makefile.am12
-rw-r--r--src/grammar.c280
-rw-r--r--src/grammar.h39
-rw-r--r--src/list.c44
-rw-r--r--src/list.h11
-rw-r--r--src/reader.c466
-rw-r--r--src/reader.h30
-rw-r--r--src/str.c86
-rw-r--r--src/str.h35
-rw-r--r--src/str_partial.h13
-rw-r--r--src/test.txt7
-rw-r--r--src/test/check_grammar.c36
-rw-r--r--src/test/check_list.c29
-rw-r--r--src/test/check_reader.c32
-rw-r--r--src/utf8.c150
-rw-r--r--src/utf8.h25
-rw-r--r--src/util.c49
-rw-r--r--src/util.h4
18 files changed, 1250 insertions, 98 deletions
diff --git a/src/Makefile.am b/src/Makefile.am
index ffb9a5c..5eff3c5 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -1,8 +1,10 @@
AM_CFLAGS = -Wall -Wextra
noinst_LIBRARIES = libeps.a
-libeps_a_SOURCES = grammar.c list.c reader.c \
-grammar.h list.h util.h reader.h
+libeps_a_SOURCES = grammar.c list.c util.c reader.c \
+str.c utf8.c \
+grammar.h list.h util.h reader.h \
+str_partial.h str.h utf8.h
libeps_a_CFLAGS = $(AM_CFLAGS) --pedantic
@@ -23,9 +25,11 @@ check_PROGRAMS = check_list check_grammar check_reader
check_list_SOURCES = test/check_list.c list.c
-check_grammar_SOURCES = test/check_grammar.c list.c grammar.c
+check_grammar_SOURCES = test/check_grammar.c list.c grammar.c str.c \
+utf8.c
-check_reader_SOURCES = test/check_reader.c list.c grammar.c reader.c
+check_reader_SOURCES = test/check_reader.c list.c grammar.c reader.c \
+str.c utf8.c util.c
TESTS = $(check_PROGRAMS)
diff --git a/src/grammar.c b/src/grammar.c
index fd38ac8..118a10e 100644
--- a/src/grammar.c
+++ b/src/grammar.c
@@ -1,21 +1,85 @@
#include "grammar.h"
-struct TNT_s {
- int type; /* 0 => T and NT otherwise */
- union { T t; NT nt; } data;
-};
-
struct Rule_s {
NT left;
- List *right;
+ List *right; /* a list of TNT's */
};
+typedef struct {
+ /* this group holds rules for this non-terminal */
+ NT left;
+ /* a list of lists of TNT's */
+ List *rights;
+} Rule_group;
+
+/* rule_grps and names should have the same length */
struct Grammar_s {
- List *rules;
- /* a list of names of non-terminals */
+ /* a list of Rule_group's */
+ List *rule_grps;
+ /* a list of names of non-terminals
+
+ Each element is an array of code-points.
+
+ To print them, first encode them into normal strings. */
List *names;
};
+static void
+destroy_rule_group(void *rule_grp, int flag)
+{
+ Rule_group *rg = (Rule_group *) rule_grp;
+
+ switch (flag) {
+ case 1:
+ map_list(rg->rights, destroy_list_free_all);
+ break;
+ case 2:
+ map_list(rg->rights, destroy_list_free_first);
+ break;
+ default:
+ map_list(rg->rights, destroy_list_no_free);
+ break;
+ }
+
+ destroy_list(rg->rights, 0);
+
+ free(rule_grp);
+}
+
+UNUSED
+static void
+destroy_rule_group_no_free(void *rule_grp)
+{
+ destroy_rule_group(rule_grp, 0);
+}
+
+UNUSED
+static void
+destroy_rule_group_free_all(void *rule_grp)
+{
+ destroy_rule_group(rule_grp, 1);
+}
+
+UNUSED
+static void
+destroy_rule_group_free_first(void *rule_grp)
+{
+ destroy_rule_group(rule_grp, 2);
+}
+
+static void
+print_rule_group(void *rule_grp)
+{
+ Rule_group *rg = (Rule_group *) rule_grp;
+
+ for (int i = 0; i < list_length(rg->rights); i++) {
+ List *str = (List *) list_nth(rg->rights, i);
+ printf("Rule %u => ", rg->left);
+ map_list(str, print_tnt);
+ printf("\n");
+ }
+}
+
TNT *
new_tnt(int type, ...)
{
@@ -53,11 +117,104 @@ new_grammar()
return g;
}
-void
+/* We classify the rules into different rule groups. */
+unsigned char
build_grammar(Grammar *g, List *rules, List *names)
{
- g->rules = rules;
g->names = names;
+
+ NUM len = list_length(names);
+ NUM rule_len = list_length(rules);
+
+ /* If a rule corresponds to a non-terminal not on the list, signal
+ an error. */
+ for (int i = 0; i < rule_len; i++) {
+ if (((Rule *) list_nth(rules, i))->left >= len) {
+ fleprintf("%s:%d:%d, Rule contains weird non-terminal\n", i);
+ return 1;
+ }
+ }
+
+ List *rule_grps = new_list();
+
+ if (rule_grps == NULL) {
+ fleprintf0("%s:%d, Cannot create list\n");
+ return 1;
+ }
+
+ if (list_assure_size(rule_grps, len)) {
+ fleprintf0("%s:%d, Cannot assure size of rule groups\n");
+ return 1;
+ }
+
+ List *temp_ls = NULL;
+ void *temp_pointer = NULL;
+
+ /* Initialize the list of rule groups */
+ for (int i = 0; i < len; i++) {
+ if ((temp_ls = new_list()) == NULL) {
+ fleprintf("%s:%d:%d, Cannot create list\n", i);
+
+ map_list(rule_grps, destroy_rule_group_no_free);
+ destroy_list(rule_grps, 0);
+ return 1;
+ }
+
+ temp_pointer = MYALLOC(Rule_group, 1);
+
+ if (temp_pointer == NULL) {
+ fleprintf0("%s:%d, Cannot malloc\n");
+ map_list(rule_grps, destroy_rule_group_no_free);
+ destroy_list(rule_grps, 0);
+ destroy_list(temp_ls, 0);
+ return 1;
+ }
+
+ ((Rule_group *) temp_pointer)->left = i;
+ ((Rule_group *) temp_pointer)->rights = temp_ls;
+
+ int result = add_to_list(rule_grps, temp_pointer);
+
+ if (result) {
+ fleprintf("%s:%d:%d, Cannot add to list\n", i);
+
+ map_list(rule_grps, destroy_rule_group_no_free);
+ destroy_list(rule_grps, 0);
+ destroy_list(temp_ls, 0);
+ free(temp_pointer);
+ return 1;
+ }
+ }
+
+ /* Now fill in rule groups */
+
+ for (int i = 0; i < rule_len; i++) {
+ Rule *r = (Rule *) list_nth(rules, i);
+
+ int result = add_to_list
+ (((Rule_group *) list_nth(rule_grps, r->left))->rights,
+ r->right);
+
+ if (result) {
+ fleprintf("%s:%d:%d, Cannot add to list\n", i);
+
+ for (int j = 0; j < list_length(rule_grps); j++) {
+ Rule_group *rg = (Rule_group *) list_nth(rule_grps, j);
+
+ map_list(rg->rights, destroy_list_free_all);
+ destroy_list(rg->rights, 0);
+ free(rg);
+ }
+
+ destroy_list(rule_grps, 0);
+
+ return 1;
+ }
+ }
+
+ g->rule_grps = rule_grps;
+
+ return 0;
}
void
@@ -82,12 +239,61 @@ print_rule(void *r)
printf("\n");
}
+NUM
+find_in_cpa_list(NUM *string, NUM size, List *list)
+{
+ NUM len = list_length(list), index = 0;
+
+ unsigned char foundp = 0;
+
+ for (; !foundp && index < len;) {
+ cpa *cpap = list_nth(list, index);
+
+ if (cpap->size != (UNUM) size) {
+ index++;
+ continue;
+ }
+
+ foundp = 1;
+
+ for (NUM j = 0; j < size; j++) {
+ if (*(string+j) != *(cpap->array+j)) {
+ foundp = 0;
+ break;
+ }
+ }
+
+ if (!foundp) index++;
+ }
+
+ return (foundp) ? index : -1;
+}
+
/* a local function pointer */
void
print_name(void *element)
{
- char *str = (char *) element;
- printf("%s", str);
+ cpa *array = (cpa *) element;
+ char *carray = MYALLOC(char, 5);
+ str *string = new_str(carray, 5);
+
+ for (UNUM i = 0; i < array->size; i++) {
+ int result = encode(*(array->array+i), string);
+
+ if (result) {
+ fleprintf("%s:%d:%llu, fail to encode\n", i);
+ str_set_length(string, 5);
+ continue;
+ }
+
+ carray = get_data(string);
+ *(carray+str_length(string)) = 0;
+
+ printf("%s", carray);
+ str_set_length(string, 5);
+ }
+
+ destroy_str(string, 1);
}
/* a local function pointer */
@@ -103,7 +309,12 @@ print_grammar(Grammar *g)
printf("Printing a grammar:\n");
map_list_between(g->names, print_name, print_sep);
printf("\n");
- map_list(g->rules, print_rule);
+
+ for (int i = 0; i < list_length(g->rule_grps); i++) {
+ print_rule_group(list_nth(g->rule_grps, i));
+ printf("\n");
+ }
+
printf("\n");
}
@@ -142,6 +353,12 @@ new_tnt_string(char *format, int format_len, ...)
return result;
}
+TNT *
+new_tnt_pointer(size_t size)
+{
+ return MYALLOC(TNT, size);
+}
+
void
destroy_rule(void *rule, int flag)
{
@@ -169,22 +386,31 @@ destroy_rule_no_free(void *rule)
}
void
+destroy_cpa_and_free_all(void *element)
+{
+ cpa *c = (cpa *) element;
+ free(c->array);
+ free(c);
+}
+
+void
destroy_grammar(void *grammar, int flag)
{
Grammar *g = (Grammar *) grammar;
- switch (flag) {
- case 0:
- map_list(g->rules, destroy_rule_no_free);
- break;
- case 1:
- map_list(g->rules, destroy_rule_and_free_all);
- break;
- default:
- map_list(g->rules, destroy_rule_and_free_first);
- break;
- }
-
- destroy_list(g->rules, 0);
- destroy_list(g->names, flag);
+ if (flag == 1)
+ map_list(g->rule_grps, destroy_rule_group_free_all);
+ else if (flag == 2)
+ map_list(g->rule_grps, destroy_rule_group_free_first);
+
+ destroy_list(g->rule_grps, 0);
+
+ /* CPA structure has no concept of freeing first. So if flag is
+ non-zero, we just free everything, and hope no disaster
+ ensues. */
+ if (flag) map_list(g->names, destroy_cpa_and_free_all);
+
+ destroy_list(g->names, 0);
+
+ free(grammar);
}
diff --git a/src/grammar.h b/src/grammar.h
index 8353dd5..05bc1fc 100644
--- a/src/grammar.h
+++ b/src/grammar.h
@@ -4,6 +4,7 @@
#include <stdarg.h>
#include <stdio.h>
#include "list.h"
+#include "utf8.h"
/* The class file for grammars */
@@ -19,6 +20,15 @@
typedef unsigned long T;
typedef unsigned NT;
+/* T or NT
+
+ Don't worry, it is not explosive. */
+
+struct TNT_s {
+ int type; /* 0 => T and NT otherwise */
+ union { T t; NT nt; } data;
+};
+
typedef struct TNT_s TNT;
typedef struct Rule_s Rule;
@@ -28,16 +38,30 @@ typedef struct Rule_s Rule;
strings as well. */
typedef struct Grammar_s Grammar;
-/* G is expected to be allocated before this function. */
-void build_grammar(Grammar *g, List *rules, List *names);
+/* G is expected to be allocated before this function.
+
+ NAMES is a list of CPA pointers. */
+unsigned char build_grammar(Grammar *g, List *rules, List *names);
/* This accepts one and only one optional argument, which is of type
either T or NT, depending on TYPE being 0 or not. */
TNT *new_tnt(int type, ...);
+/* An array of code-points. */
+
+struct cpa_s {
+ NUM *array;
+ UNUM size;
+};
+
+typedef struct cpa_s cpa;
+
+NUM find_in_cpa_list(NUM *string, NUM size, List *list);
+
/* For debugging purposes, we need to print out rules to examine their
values. */
+/* assume element is a cpa pointer */
void print_name(void *element);
void print_tnt(void *element);
@@ -56,6 +80,8 @@ void print_grammar(Grammar *g);
Upon failure NULL is returned. */
List *new_tnt_string(char *format, int format_len, ...);
+TNT *new_tnt_pointer(size_t size);
+
/* RIGHT should be created by new_tnt_string or something alike.
If RIGHT is NULL, NULL is returned. */
@@ -68,6 +94,15 @@ void destroy_rule(void *rule, int flag);
void destroy_rule_and_free_all(void *rule);
void destroy_rule_and_free_first(void *rule);
void destroy_rule_no_free(void *rule);
+void destroy_cpa_and_free_all(void *element);
void destroy_grammar(void *grammar, int flag);
+/* convenient macro */
+
+#define NEW_CPA(X, ARRAY, SIZE) do { \
+ X = MYALLOC(cpa, 1); \
+ X->array = ARRAY; \
+ X->size = SIZE; \
+ } while (0)
+
#endif
diff --git a/src/list.c b/src/list.c
index d823100..143c693 100644
--- a/src/list.c
+++ b/src/list.c
@@ -5,8 +5,8 @@
struct List_s {
void **array; /* the array of elements */
- NUM len; /* the length of the array */
- NUM capacity; /* the number of pre-allocated bytes
+ NUM len; /* the length of the array */
+ NUM capacity; /* the number of pre-allocated bytes
for the array */
};
@@ -183,14 +183,14 @@ list_assure_size(List *ls, NUM size)
if (array == NULL) return 1;
- for (NUM i = 0; i < ls->capacity; i++)
+ for (NUM i = 0; i < ls->len; i++)
*(array+i) = *(ls->array+i);
ls->array = realloc(ls->array, sizeof(void*) * size);
if (ls->array == NULL) return 1;
- for (NUM i = 0; i < ls->capacity; i++)
+ for (NUM i = 0; i < ls->len; i++)
*(ls->array+i) = *(array+i);
free(array);
@@ -206,6 +206,8 @@ list_assure_size(List *ls, NUM size)
unsigned char
list_set_length(List *ls, NUM len)
{
+ list_assure_size(ls, len);
+
if (ls->len >= len) return 0;
NUM *array = MYALLOC(NUM, len - ls->len);
@@ -240,6 +242,17 @@ list_to_array(List *ls, NUM element_bytes, NUM *num)
return array;
}
+List *
+array_to_list(void **array, NUM size)
+{
+ List *ls = MYALLOC(List, 1);
+
+ ls->array = array;
+ ls->len = ls->capacity = size;
+
+ return ls;
+}
+
void
destroy_list(List *ls, unsigned char all_free_p)
{
@@ -250,8 +263,29 @@ destroy_list(List *ls, unsigned char all_free_p)
if (all_free_p == 2)
free(*(ls->array));
+ ls->len = (ls->capacity = 0);
+
free(ls->array);
free(ls);
+}
- ls->len = (ls->capacity = 0);
+void
+destroy_list_free_all(void *ls)
+{
+ List *list = (List *) ls;
+ destroy_list(list, 1);
+}
+
+void
+destroy_list_free_first(void *ls)
+{
+ List *list = (List *) ls;
+ destroy_list(list, 2);
+}
+
+void
+destroy_list_no_free(void *ls)
+{
+ List *list = (List *) ls;
+ destroy_list(list, 0);
}
diff --git a/src/list.h b/src/list.h
index b710aa2..ff401f4 100644
--- a/src/list.h
+++ b/src/list.h
@@ -69,10 +69,21 @@ unsigned char list_set_length(List *ls, NUM len);
The number of elements of the array will be stored in *NUM. */
void *list_to_array(List *ls, NUM element_bytes, NUM *num);
+/* Create a list from an array.
+
+ The array will be re-used, so it will be destroyed when the list is
+ destroyed. */
+List *array_to_list(void ** array, NUM size);
+
+
/* destroy the list: If ALL_FREE_P is 1, this frees every void
pointers contained in the list; if it is 2, this frees the first
pointer. In any case, the list is de-allocated. */
void destroy_list(List *ls, unsigned char all_free_p);
+void destroy_list_free_all(void *ls);
+void destroy_list_free_first(void *ls);
+void destroy_list_no_free(void *ls);
+
#endif
diff --git a/src/reader.c b/src/reader.c
index 4bbcf5a..1e7c49c 100644
--- a/src/reader.c
+++ b/src/reader.c
@@ -1,20 +1,462 @@
+#include "util.h"
+#include "str.h"
#include "reader.h"
-struct Reader_s {
- reader_func_t func;
- List *args; /* the list of arguments; could be
- NULL. */
-};
+#define COUNT_CPA(S, E, Y, L) { \
+ L = 0; \
+ for (NUM macroind = S; macroind < E; L++) { \
+ str_info macroinfo = get_info(Y, macroind); \
+ macroind += macroinfo.step; \
+ } \
+ }
-void build_reader(Reader *reader, reader_func_t func, List *args)
+#define READ_CPA(X, S, E, Y, L) { \
+ L = 0; \
+ for (NUM macroind = S; macroind < E;) { \
+ str_info macroinfo = get_info(Y, macroind); \
+ *(X+(L++)) = macroinfo.value; \
+ macroind += macroinfo.step; \
+ } \
+ }
+
+/* Read a string of TNT's into TNTS. TNTS is expected to be already
+ allocated. */
+static void
+read_tnt_string(str *s, NUM start, NUM end, TNT *tnts, List *cpa_list)
{
- reader->func = func;
- reader->args = args;
+ str_info info;
+
+ /* fleprintf("%s:%d, S = %lu, E = %lu\n", start, end); */
+
+ unsigned char readingp = 0, stop = 0;
+
+ NUM *string = NULL, string_size = 0, nt_index = 0, tnt_count = 0;
+
+ NUM strindex = 0;
+ str_info strinfo = EMPTY_STR_INFO;
+
+ for (NUM index = start, last_index = start;
+ index < end && !stop;) {
+ info = get_info(s, index);
+ switch (info.value) {
+ case 10:
+ case 13:
+ /* if a newline is somehow encountered, we stop reading */
+ fleprintf0("%s:%d, hi\n");
+ stop = 1;
+ break;
+ case 9:
+ case 32:
+ /* space separated */
+ if (readingp) {
+ COUNT_CPA(last_index, index, s, string_size);
+ string = MYALLOC(NUM, string_size+1);
+ READ_CPA(string, last_index, index, s, string_size);
+
+ nt_index = find_in_cpa_list(string, string_size, cpa_list);
+
+ if (nt_index < 0) {
+ fleprintf("%s:%d:%lu, Undefined non-terminal: ", index);
+ print_name(&((cpa) { string, string_size }));
+ printf("\n");
+ free(string);
+ string = NULL;
+ return;
+ }
+
+ free(string);
+ string = NULL;
+
+ *(tnts+tnt_count++) = (TNT) { 1, { .nt = nt_index } };
+
+ readingp = 0;
+ }
+ break;
+ case 34:
+ case 39:
+ /* read a string */
+ strindex = index + info.step;
+ strinfo = get_info(s, strindex);
+
+ for (;strindex < end &&
+ strinfo.value != 34 &&
+ strinfo.value != 39 &&
+ strinfo.value != 10 &&
+ strinfo.value != 13;) {
+
+ if (strinfo.value == 92) {
+ strindex += info.step;
+
+ unsigned char local_exit = 0;
+
+ /* The following errors should not happen. I am just
+ paranoid. */
+ if (strindex >= end) {
+ local_exit = 1;
+ fleprintf("%s:%d:%lu, Escape end\n", strindex);
+ }
+
+ strinfo = get_info(s, strindex);
+
+ if (strinfo.value == 10 || strinfo.value == 13) {
+ local_exit = 1;
+ fleprintf("%s:%d:%lu, Escape newline not allowed\n",
+ strindex);
+ }
+
+ /* cleanup */
+ /* no cleanup needed */
+ if (local_exit) return;
+ }
+
+ *(tnts+tnt_count++) = (TNT) { 0, { .t = strinfo.value } };
+
+ strindex += strinfo.step;
+ }
+ break;
+ default:
+ if (!readingp) {
+ last_index = index;
+ readingp = 1;
+ }
+
+ /* if we are at the end, we need to process one additional
+ non-terminal */
+ if (index < end && index + (NUM) info.step >= end) {
+ /* fleprintf0("%s:%d, hello\n"); */
+ COUNT_CPA(last_index, index+(NUM) info.step,
+ s, string_size);
+ /* fleprintf0("%s:%d, hello\n"); */
+ string = MYALLOC(NUM, string_size+1);
+ READ_CPA(string, last_index, index+(NUM) info.step,
+ s, string_size);
+ /* fleprintf0("%s:%d, hello\n"); */
+
+ nt_index = find_in_cpa_list(string, string_size, cpa_list);
+ /* fleprintf0("%s:%d, hello\n"); */
+
+ if (nt_index < 0) {
+ fleprintf("%s:%d:%lu, Undefined non-terminal: ", index);
+ print_name(&((cpa) { string, string_size }));
+ printf("\n");
+ eprintf("END = %lu\n", end);
+ free(string);
+ string = NULL;
+ return;
+ }
+
+ free(string);
+ string = NULL;
+
+ *(tnts+tnt_count++) = (TNT) { 1, { .nt = nt_index } };
+ }
+ break;
+ }
+
+ index += info.step;
+ }
+}
+
+static void
+print_int(void *p)
+{
+ printf("%d\n", *((int *)p));
}
-void destroy_reader(Reader *reader, int flag)
+/* Read a grammar from a string in the format of BNF notation. */
+Grammar *
+read_grammar_from_bnf(str *s)
{
- destroy_list(reader->args, flag);
- reader->args = NULL;
- reader->func = NULL;
+ NUM len = str_length(s);
+
+ /* fleprintf("%s:%d, len = %lu\n", len); */
+
+ /* The first loop collects names of non-terminals */
+
+ List *names = new_list();
+
+ List *line_indices = new_list();
+
+ unsigned char definitionp = 1, readingp = 0;
+
+ /* last index is used to collect strings */
+ for (NUM index = 0, last_index = 0; index < len;) {
+ str_info info = get_info(s, index);
+
+ /* This is used to collect indices of definitions per line. */
+ NUM *line_index = NULL;
+
+ NUM *string = NULL, string_size = 0;
+
+ cpa *cpap = NULL;
+
+ switch (info.value) {
+ /* spaces and tabs are ignored */
+ case ' ':
+ case 9:
+ readingp = 0;
+
+ if (definitionp) {
+ fleprintf("%s:%d:%lu, A non-terminal cannot have spaces in "
+ "the name\n", index);
+ /* cleanup */
+ map_list(names, destroy_cpa_and_free_all);
+ destroy_list(names, 0);
+ destroy_list(line_indices, 1);
+ return NULL;
+ }
+ break;
+ case '\n':
+ case 13:
+ definitionp = 1;
+ readingp = 0;
+ break;
+ case ':':
+ if (definitionp) {
+ definitionp = 0;
+ readingp = 0;
+ /* read a non-terminal */
+ COUNT_CPA(last_index, index, s, string_size);
+ string = MYALLOC(NUM, string_size+1);
+ READ_CPA(string, last_index, index, s, string_size);
+ NEW_CPA(cpap, string, string_size);
+
+ line_index = MYALLOC(NUM, 1);
+ *line_index = find_in_cpa_list(string, string_size, names);
+
+ if (*line_index < 0) {
+ *line_index = list_length(names);
+ add_to_list(names, cpap);
+ } else {
+ destroy_cpa_and_free_all(cpap);
+ string = NULL;
+ }
+
+ add_to_list(line_indices, line_index);
+ /* fleprintf0("%s:%d, Definition should be over\n"); */
+ }
+ break;
+ default:
+ if (!readingp) {
+ last_index = index;
+ readingp = 1;
+ /* fleprintf0("%s:%d, Start reading\n"); */
+ }
+ break;
+ }
+
+ index += info.step;
+ }
+
+ printf("%s:%d, print indices\n", __FILE__, __LINE__);
+ map_list(line_indices, print_int);
+ printf("\n");
+
+ /* second round collects all the rules */
+
+ List *rules = new_list();
+
+ NUM line_count = 0;
+
+ unsigned char right_start_p = 0;
+
+ definitionp = 1, readingp = 0;
+
+ NUM tnt_count = 0;
+ NT current_nt = 0;
+
+ /* last index is used to collect strings */
+ for (NUM index = 0, last_index = 0; index < len;) {
+ str_info info = get_info(s, index);
+ /* fleprintf("%s:%d, index = %lu, value = %lu\n", index, info.value); */
+
+ TNT *current_tnt_string = NULL;
+
+ void **temp_pointers = NULL;
+
+ switch (info.value) {
+ /* the quote characters */
+ case 34: /* double quote */
+ case 39: /* single quote */
+ if (definitionp) {
+ fleprintf("%s:%d:%lu, A non-terminal cannot have quotes in "
+ "the name\n", index);
+ /* cleanup */
+ map_list(names, destroy_cpa_and_free_all);
+ destroy_list(names, 0);
+
+ destroy_list(line_indices, 1);
+
+ map_list(rules, destroy_rule_and_free_first);
+ destroy_list(rules, 0);
+ return NULL;
+ } else {
+ /* read a string of terminals */
+
+ NUM strindex = index + info.step;
+ str_info strinfo = get_info(s, strindex);
+
+ for (;strindex < len &&
+ strinfo.value != 13 &&
+ strinfo.value != 10 &&
+ strinfo.value != 34 &&
+ strinfo.value != 39;) {
+ strinfo = get_info(s, strindex);
+
+ /* escape character */
+ if (strinfo.value == 92) {
+ strindex += strinfo.step;
+ if (strindex < len) strinfo = get_info(s, strindex);
+
+ unsigned char local_error_p = 0;
+
+ if (strindex >= len) {
+ local_error_p = 1;
+ fleprintf("%s:%d:%ld, Escape at the end not allowed\n",
+ strindex);
+ } else if (strinfo.value == 13 || strinfo.value == 10) {
+ local_error_p = 1;
+ fleprintf("%s:%d:%ld, Escape newline in string not "
+ "allowed\n", strindex);
+ }
+
+ if (local_error_p) {
+ /* cleanup */
+ map_list(names, destroy_cpa_and_free_all);
+ destroy_list(names, 0);
+
+ destroy_list(line_indices, 1);
+
+ map_list(rules, destroy_rule_and_free_first);
+ destroy_list(rules, 0);
+ return NULL;
+ }
+ }
+ /* count a terminal */
+ tnt_count++;
+
+ strindex += strinfo.step;
+ }
+
+ if (strinfo.value == 13 || strinfo.value == 10) {
+ fleprintf("%s:%d:%ld, Newline encountered before string "
+ "ended\n", strindex);
+ /* cleanup */
+ map_list(names, destroy_cpa_and_free_all);
+ destroy_list(names, 0);
+
+ destroy_list(line_indices, 1);
+
+ map_list(rules, destroy_rule_and_free_first);
+ destroy_list(rules, 0);
+ return NULL;
+ }
+
+ index = strindex + 1;
+ }
+
+ break;
+ /* spaces and tabs are ignored */
+ case ' ':
+ case 9:
+ if (definitionp) {
+ fleprintf("%s:%d:%lu, A non-terminal cannot have spaces or "
+ "tabs in the name\n", index);
+ /* cleanup */
+ map_list(names, destroy_cpa_and_free_all);
+ destroy_list(names, 0);
+
+ destroy_list(line_indices, 1);
+
+ map_list(rules, destroy_rule_and_free_first);
+ destroy_list(rules, 0);
+ return NULL;
+ }
+
+ readingp = 0;
+
+ break;
+ case '\n':
+ case 13:
+ definitionp = 1;
+ readingp = 0;
+ line_count++;
+
+ /* fleprintf("%s:%d, tnt count = %lu\n", tnt_count); */
+
+ if (right_start_p) {
+ right_start_p = 0;
+
+ current_tnt_string = new_tnt_pointer(tnt_count);
+ read_tnt_string(s, last_index, index, current_tnt_string,
+ names);
+
+ printf("%s:%d:%lu, Printing a TNT string: ",
+ __FILE__, __LINE__, index);
+ for (int i = 0; i < (int) tnt_count;)
+ print_tnt(current_tnt_string+i++);
+ printf("\n");
+
+ temp_pointers = MYALLOC(void *, tnt_count);
+
+ for (NUM tempindex = 0; tempindex < tnt_count; tempindex++)
+ *(temp_pointers + tempindex) = current_tnt_string+tempindex;
+
+ add_to_list
+ (rules,
+ new_rule
+ (current_nt, array_to_list(temp_pointers, tnt_count)));
+ } else {
+ /* empty rule */
+ add_to_list(rules, new_rule(current_nt, new_list()));
+ }
+
+ tnt_count = 0;
+
+ break;
+ case ':':
+ definitionp = 0;
+ readingp = 0;
+ right_start_p = 0;
+ /* the non-terminal at this line is cached by line_indices */
+ current_nt = *((NUM *) list_nth(line_indices, line_count));
+ /* fleprintf("%s:%d, current nt = %u, count = %lu\n",
+ * current_nt, line_count); */
+ break;
+ default:
+ if (!readingp && !definitionp) tnt_count++;
+
+ if (!readingp &&
+ (definitionp || !right_start_p)) {
+ last_index = index;
+ right_start_p = 1;
+ }
+
+ readingp = 1;
+ break;
+ }
+
+ index += info.step;
+ }
+
+ destroy_list(line_indices, 1);
+
+ /* We build the grammar after the string is processed, so as to
+ avoid de-allocating the grammar should errors occur. */
+
+ Grammar *g = new_grammar();
+
+ if (build_grammar(g, rules, names)) {
+ fleprintf0("%s:%d, Failed to build the grammar\n");
+ map_list(names, destroy_cpa_and_free_all);
+ destroy_list(names, 0);
+ map_list(rules, destroy_rule_and_free_first);
+ destroy_list(rules, 0);
+ destroy_grammar(g, 0);
+ return NULL;
+ }
+
+ destroy_list(rules, 1);
+
+ return g;
}
diff --git a/src/reader.h b/src/reader.h
index 77e7b84..65ee7ef 100644
--- a/src/reader.h
+++ b/src/reader.h
@@ -1,34 +1,10 @@
#ifndef INPUT_H
#define INPUT_H
+#include "str.h"
+#include "utf8.h"
#include "grammar.h"
-/* The class of a general reader, which can read a grammar, either
- from a file, or elsewhere, such as from an Emacs buffer, the
- standard input, or even from some other places.
-
- The only expectation, or requirement, of such a reader is to
- provide a function with a variable number and type of arguments
- which returns a grammar. Then the package provides a function to
- properly read a grammar, according to the specifications of the
- reader. */
-
-typedef Grammar *(*reader_func_t)(List *args);
-
-typedef struct Reader_s Reader;
-
-/* contructor */
-
-/* Build a reader and assign to *reader. It is expected to be
- allocated before this function. */
-void build_reader(Reader *reader, reader_func_t func, List *args);
-
-/* destructor */
-
-/* Does not free READER.
-
- FLAG is passed to destroy_list, which see. */
-void destroy_reader(Reader *reader, int flag);
-
+Grammar *read_grammar_from_bnf(str *s);
#endif
diff --git a/src/str.c b/src/str.c
new file mode 100644
index 0000000..46dc4a8
--- /dev/null
+++ b/src/str.c
@@ -0,0 +1,86 @@
+#include "str.h"
+#include "str_partial.h"
+#include <string.h>
+
+H_ATTR
+str_info
+_info_getter(str *s, UNUM n)
+{
+ return (str_info) { (NUM) *(s->data+n), 1 };
+}
+
+/* H_ATTR
+ * NUM
+ * _lenner(str *s)
+ * {
+ * return s->size;
+ * } */
+
+str *
+new_str(char *s, NUM size)
+{
+ str *str_pointer = MYALLOC(str, 1);
+
+ str_pointer->size = size;
+ str_pointer->data = s;
+ str_pointer->getter = _info_getter;
+ /* str_pointer->lenner = _lenner; */
+
+ return str_pointer;
+}
+
+void
+destroy_str(str *s, int flag)
+{
+ if (flag) free(s->data);
+
+ s->size = 0;
+
+ free(s);
+}
+
+H_ATTR
+str_info
+get_info(str *s, UNUM n)
+{
+ return s->getter(s, n);
+}
+
+H_ATTR
+NUM
+str_length(str *s)
+{
+ return s->size;
+}
+
+void
+change_str_content(str *s, char *str, NUM size)
+{
+ s->data = str;
+ s->size = size;
+}
+
+unsigned char
+copy_str(str *dest, str *source)
+{
+ if (str_length(dest) < str_length(source))
+ return 1;
+
+ dest->size = source->size;
+
+ memcpy(dest->data, source->data, source->size);
+
+ return 0;
+}
+
+char *
+get_data(str *s)
+{
+ return s->data;
+}
+
+void
+str_set_length(str *s, UNUM size)
+{
+ s->size = size;
+}
diff --git a/src/str.h b/src/str.h
new file mode 100644
index 0000000..39bddf6
--- /dev/null
+++ b/src/str.h
@@ -0,0 +1,35 @@
+#ifndef STR_H
+#define STR_H
+
+#include <util.h>
+
+typedef struct str_s str;
+
+str *new_str(char *s, NUM size);
+
+/* flag non-zero => free the stored character array as well */
+void destroy_str(str *s, int flag);
+
+/* deliberately expose this struct */
+struct str_info_s {
+ NUM value;
+ size_t step;
+};
+
+typedef struct str_info_s str_info;
+
+#define EMPTY_STR_INFO ((str_info) { 0, 1 })
+
+str_info get_info(str *, UNUM);
+
+char *get_data(str *s);
+
+NUM str_length(str *);
+
+void str_set_length(str *s, UNUM size);
+
+void change_str_content(str *, char *, NUM);
+
+unsigned char copy_str(str *dest, str *source);
+
+#endif
diff --git a/src/str_partial.h b/src/str_partial.h
new file mode 100644
index 0000000..9d4fbd6
--- /dev/null
+++ b/src/str_partial.h
@@ -0,0 +1,13 @@
+#include "str.h"
+
+/* This is meant to be extended, and only has minimal necessary
+ fields. */
+
+typedef str_info (*info_getter) (str *, UNUM);
+
+struct str_s {
+ UNUM size; /* the size in bytes, not in chars */
+ char *data; /* a void pointer is too general */
+ /* polymorphic behaviour */
+ info_getter getter;
+};
diff --git a/src/test.txt b/src/test.txt
new file mode 100644
index 0000000..71e125b
--- /dev/null
+++ b/src/test.txt
@@ -0,0 +1,7 @@
+S: A A C
+A: B B B
+B: A
+A:
+C: B C A
+B: B B
+A: A B A
diff --git a/src/test/check_grammar.c b/src/test/check_grammar.c
index 39f66ee..98a07cd 100644
--- a/src/test/check_grammar.c
+++ b/src/test/check_grammar.c
@@ -37,8 +37,7 @@ main(U_ATTR int argc, U_ATTR char **argv)
List *rules = new_list();
List *names = new_list();
-
- for (int i = 0; i < 3; i++) {
+ for (int i = 0; i < 4; i++) {
tnt_string = new_tnt_string("ntn", 3,
(NT) 3*(i+1),
(T) 3*(i+1)*(i+1),
@@ -48,18 +47,41 @@ main(U_ATTR int argc, U_ATTR char **argv)
eprintf("i = %d, cannot create tnt string\n", i);
map_list(rules, destroy_rule_and_free_all);
-
destroy_list(rules, 0);
- destroy_list(names, 1);
+
+ map_list(names, destroy_cpa_and_free_all);
+ destroy_list(names, 0);
return 1;
}
- rule = new_rule(i, tnt_string);
+ rule = new_rule(i%3, tnt_string);
add_to_list(rules, rule);
char *name = MYALLOC(char, 7);
snprintf(name, 7, "Rule %d", i);
- add_to_list(names, name);
+
+ utf8* uname = new_utf8(name, 6);
+
+ str_info info = get_info((str *)uname, 0);
+
+ NUM *varray = MYALLOC(NUM, 6);
+ NUM vindex = 0;
+
+ for (NUM index = 0;
+ info.value >= 0 && index < str_length((str *) uname);
+ index += info.step, vindex++) {
+ info = get_info((str *)uname, index);
+
+ *(varray+vindex) = info.value;
+ }
+
+ destroy_str((str *) uname, 1);
+
+ cpa *cpap = MYALLOC(cpa, 1);
+ cpap->array = varray;
+ cpap->size = vindex;
+
+ add_to_list(names, cpap);
}
Grammar *g = new_grammar();
@@ -70,7 +92,7 @@ main(U_ATTR int argc, U_ATTR char **argv)
destroy_grammar(g, 1);
- free(g);
+ destroy_list(rules, 1);
return 0;
}
diff --git a/src/test/check_list.c b/src/test/check_list.c
index 8677cd3..fe23b2b 100644
--- a/src/test/check_list.c
+++ b/src/test/check_list.c
@@ -2,7 +2,7 @@
#include <stdlib.h>
#include "../list.h"
-void
+static void
print_int(void *p)
{
printf("%d\n", *((int *)p));
@@ -155,6 +155,8 @@ main(U_ATTR int argc, U_ATTR char **argv)
exit(1);
}
+ eprintf("Successfully created LS2!\n");
+
for (NUM i = 0; i < 5; i++) {
int *pointer = malloc(sizeof *pointer * 1);
@@ -197,7 +199,7 @@ main(U_ATTR int argc, U_ATTR char **argv)
}
NUM arr[10];
-
+
for (NUM i = 0; i < 10; i++) {
arr[i] = i*i-i+1;
if (add_to_list(ls, arr+i)) {
@@ -211,7 +213,28 @@ main(U_ATTR int argc, U_ATTR char **argv)
destroy_list(ls, 0);
+ /* test array_to_list */
+ int *arr3 = MYALLOC(int, 20);
+
+ void **arr2 = NULL;
+
+ for (NUM i = 0; i < 20; i++) {
+ *(arr3+i) = i*i+i+1;
+ }
+
+ arr2 = MYALLOC(void *, 20);
+
+ for (NUM i = 0; i < 20; i++) *(arr2+i) = arr3+i;
+
+ ls = array_to_list(arr2, 20);
+
+ printf("Printing a list from an array\n");
+
+ map_list(ls, print_int);
+
+ destroy_list(ls, 2);
+
printf("Every test is successful!\n");
-
+
return 0;
}
diff --git a/src/test/check_reader.c b/src/test/check_reader.c
index 1d73b3f..8769cd7 100644
--- a/src/test/check_reader.c
+++ b/src/test/check_reader.c
@@ -6,17 +6,29 @@
#include "../grammar.h"
#include "../reader.h"
-/* The test of reading grammars should be postponed till later. */
-
-/* Grammar *
- * read_grammar(List *args)
- * {
- *
- * } */
-
-
+/* TODO: check string */
int
main(U_ATTR int argc, U_ATTR char **argv)
{
- return 77;
+ /* return 77; */
+
+ char *file_name = "test.txt";
+ char *buffer = MYALLOC(char, 512);
+ NUM buffer_size = 0;
+
+ if (read_entire_file(file_name, &buffer, &buffer_size)) {
+ fleprintf("%s:%d, Cannot read file %s", file_name);
+ free(buffer);
+ return 1;
+ }
+
+ utf8 *s = new_utf8(buffer, buffer_size);
+
+ Grammar *g = read_grammar_from_bnf((str *) s);
+
+ print_grammar(g);
+ destroy_grammar(g, 2);
+ destroy_str((str *)s, 1);
+
+ return 0;
}
diff --git a/src/utf8.c b/src/utf8.c
new file mode 100644
index 0000000..f840600
--- /dev/null
+++ b/src/utf8.c
@@ -0,0 +1,150 @@
+#include "utf8.h"
+#include "str_partial.h"
+#include <stdint.h>
+#include <stdio.h>
+
+struct utf8_s {
+ str s;
+};
+
+/* The classification and the transition table. */
+
+static const
+uint8_t utf8d[] = {
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+ 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+ 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+ 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
+
+ 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
+ 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
+ 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
+ 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
+ 12,36,12,12,12,12,12,12,12,12,12,12,
+};
+
+static UTF8_State
+decode(UTF8_State *state, uint32_t *codep, uint8_t byte)
+{
+ uint32_t type = utf8d[byte];
+
+ *codep = (*state != UTF8_STATE_ACCEPT) ?
+ (byte & 0x3fu) | (*codep << 6) :
+ (0xff >> type) & ((uint32_t) byte);
+
+ *state = utf8d[256 + *state + type];
+ return *state;
+}
+
+str_info
+utf8_get_info(str *s, UNUM n)
+{
+ UTF8_State state = UTF8_STATE_ACCEPT;
+ uint32_t code = 0;
+ NUM orig_n = n;
+
+ /* error */
+ if (n >= s->size) return (str_info) { -1, 0 };
+
+ decode(&state, &code, *(s->data+n));
+
+ if (state == UTF8_STATE_ACCEPT)
+ return (str_info) { code, 1 };
+
+ if (state == UTF8_STATE_REJECT)
+ return (str_info) { -1, 1 };
+
+ n++;
+
+ if (n >= s->size) return (str_info) { -1, 1 };
+
+ for (; n < s->size
+ && state > UTF8_STATE_REJECT;
+ n++) {
+ decode(&state, &code, *(s->data+n));
+ }
+
+ if (state == UTF8_STATE_ACCEPT)
+ return (str_info) { code, n - orig_n };
+
+ return (str_info) { -1, n - orig_n - 1};
+}
+
+utf8 *
+new_utf8(char *string, UNUM size)
+{
+ str *s = new_str(string, size);
+
+ s->getter = utf8_get_info;
+
+ return (utf8 *)s;
+}
+
+/* result should be long enough to hold the data */
+unsigned char
+encode(NUM code_point, str *result)
+{
+ /* calculate the number of bits of CODE_POINT */
+ int number_of_bits = 0;
+
+ for (;code_point >> number_of_bits;) number_of_bits++;
+
+ if (!number_of_bits) {
+ result->size = 0;
+ return 1;
+ }
+
+ if (number_of_bits <= 7) {
+ if (!(result->size)) {
+ eprintf("%s:%d, Result cannot hold the encoded value\n",
+ __FILE__, __LINE__);
+ return 1;
+ }
+
+ result->size = 1;
+ *(result->data) = (char) code_point;
+ } else if (number_of_bits <= 11) {
+ if (result->size < 2) {
+ eprintf("%s:%d, Result cannot hold the encoded value",
+ __FILE__, __LINE__);
+ return 1;
+ }
+
+ result->size = 2;
+ *(result->data) = (char) (0xc0 | (code_point >> 6));
+ *(result->data+1) = (char) (0x80 | (code_point & 0x3f));
+ } else if (number_of_bits <= 16) {
+ if (result->size < 3) {
+ eprintf("%s:%d, Result cannot hold the encoded value",
+ __FILE__, __LINE__);
+ return 1;
+ }
+
+ result->size = 3;
+ *(result->data) = (char) (0xe0 | (code_point >> 12));
+ *(result->data+1) = (char) (0x80 | ((code_point >> 6) & 0x3f));
+ *(result->data+2) = (char) (0x80 | (code_point & 0x3f));
+ } else if (number_of_bits <= 21) {
+ if (result->size < 4) {
+ eprintf("%s:%d, Result cannot hold the encoded value",
+ __FILE__, __LINE__);
+ return 1;
+ }
+
+ result->size = 4;
+ *(result->data) = (char) (0xf0 | (code_point >> 18));
+ *(result->data+1) = (char) (0x80 | ((code_point >> 12) & 0x3f));
+ *(result->data+2) = (char) (0x80 | ((code_point >> 6) & 0x3f));
+ *(result->data+3) = (char) (0x80 | (code_point & 0x3f));
+ } else {
+ eprintf("%s:%d, Invalid code point to encode",
+ __FILE__, __LINE__);
+ return 1;
+ }
+
+ return 0;
+}
diff --git a/src/utf8.h b/src/utf8.h
new file mode 100644
index 0000000..5b3c9f2
--- /dev/null
+++ b/src/utf8.h
@@ -0,0 +1,25 @@
+#ifndef UTF8_H
+#define UTF8_H
+#include "str.h"
+
+typedef struct utf8_s utf8;
+
+enum UTF8_State_e {
+ UTF8_STATE_ACCEPT = 0,
+ UTF8_STATE_REJECT = 12,
+ UTF8_STATE_1 = 24,
+ UTF8_STATE_2 = 36,
+ UTF8_STATE_3 = 48,
+ UTF8_STATE_4 = 60,
+ UTF8_STATE_5 = 72,
+ UTF8_STATE_6 = 84,
+ UTF8_STATE_7 = 96
+};
+
+typedef enum UTF8_State_e UTF8_State;
+
+utf8 *new_utf8(char *string, UNUM size);
+
+unsigned char encode(NUM code_point, str *result);
+
+#endif
diff --git a/src/util.c b/src/util.c
new file mode 100644
index 0000000..e228fbb
--- /dev/null
+++ b/src/util.c
@@ -0,0 +1,49 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include "util.h"
+
+/* Put the entire file content into the string str and the file size
+ into *len.
+
+ If errors occur, return non-zero; otherwise the return value is
+ zero.
+
+ Note that *str should be already allocated.
+
+ Note that the length of the string str is actually 1+length, and
+ the last character is the null character. */
+unsigned char
+read_entire_file(const char *file_name, char **str, NUM *len)
+{
+ long file_size = 0;
+
+ FILE *file = fopen(file_name, "r");
+
+ if (!file) {
+ eprintf("%s:%d, Cannot open file \"%s\": ",
+ __FILE__, __LINE__, file_name);
+ perror(NULL);
+ return 1;
+ }
+
+ fseek(file, 0, SEEK_END);
+ file_size = ftell(file);
+ fseek(file, 0, SEEK_SET);
+
+ *len = 1+file_size;
+
+ *str = realloc(*str, 1+file_size);
+
+ if (*str == NULL) {
+ fleprintf0("%s:%d, Cannot realloc\n");
+ return 1;
+ }
+
+ fread(*str, sizeof(char), file_size, file);
+
+ fclose(file);
+
+ *(*str+file_size) = 0;
+
+ return 0;
+}
diff --git a/src/util.h b/src/util.h
index d657f0e..9120e9b 100644
--- a/src/util.h
+++ b/src/util.h
@@ -28,7 +28,9 @@ typedef unsigned long long UNUM; /* definitely bigger than size_t */
#define eprintf(...) fprintf(stderr, __VA_ARGS__)
+#define fleprintf0(M) eprintf(M, __FILE__, __LINE__)
+#define fleprintf(M, ...) eprintf(M, __FILE__, __LINE__, __VA_ARGS__)
-
+unsigned char read_entire_file(const char *file_name, char **str, NUM *len);
#endif