#ifndef GRAMMAR_H #define GRAMMAR_H #include #include #include "list.h" #include "utf8.h" /* The class file for grammars */ /* TODO: Terminals should be extended to incorporate the "predicate terminals", such as character classes. */ /* A grammar is a list of rules. A rule replaces a non-terminal with a finite string of terminals and non-terminals. For us, a terminal is a positive integer, and a non-terminal is also an integer. We represent them as unsigned integers. Since we want to deal with unicode points, the terminal is stored as unsigned long, so that we can include all unicode poitns. Moreover, a terminal might be a "predicate terminal", that is, it does not refer to one character only. Instead, it refers to a set of characters. The most efficient way of recognizing a set of characters, of which I know, is to use a deterministic finite automaton. So we represent a predicate terminal as a DFA. Whereas the DFA's are equivalent to regular expressions in the expressive power, I do not need full support for regular expressions for the predicate terminals. In particular, I only want to match a character for one predicate terminal, so there are no such things as repititive and optional constructs in the DFA. Rather, each predicate terminal is a simple table, or a matrix, which must have exactly 256 columns and an indefinite number of rows. The columns represent the input character's bytes to match, and the rows are the states of the DFA. The table must have at least one row: the starting state is always the zeroth row and the accepting state is any state that is out of bounds: either greater than or equal to the number of rows or negative. Of course, it is inefficient to keep the whoe table within the memory, especially since the most part of the table would be unused. So we store a DFA in a compressed format. I have considered whether to apply some more advanced compression techniques, such as using BW transoform followed by the MTF algorithm, but in the end I think that the DFA's that will be used are oft times just small tables, at least I hope so, thus I decided to simply use the Run-Length-Encoding. See the file dfa.h for implementation details. */ typedef unsigned long T; typedef unsigned NT; /* T or NT Don't worry, it is not explosive. */ struct TNT_s { int type; /* 0 => T and NT otherwise */ union { T t; NT nt; } data; }; typedef struct TNT_s TNT; typedef struct Rule_s Rule; /* A grammar is more than a list of rules: it should include a correspondance to associate the names of non-terminals with unsigned integers. Basically, this means it should hold a list of strings as well. */ typedef struct Grammar_s Grammar; /* G is expected to be allocated before this function. NAMES is a list of CPA pointers. */ BOOL build_grammar(Grammar *g, const List *rules, const List * const names); /* This accepts one and only one optional argument, which is of type either T or NT, depending on TYPE being 0 or not. */ TNT *new_tnt(int type, ...); /* An array of code-points. */ struct cpa_s { NUM *array; UNUM size; }; typedef struct cpa_s cpa; NUM find_in_cpa_list(const NUM * const restrict string, NUM size, const List * const restrict list); /* For debugging purposes, we need to print out rules to examine their values. */ /* assume element is a cpa pointer */ void print_name(void *element); void print_tnt(void *element); void print_rule(void *r); /* This will generate errors if G is not a list of rules. */ void print_grammar(const Grammar * const g); /* constructors */ /* FORMAT specifies the types of TNTs. It is a string of t and n's. FORMAT_LEN is the length of FORMAT, excluding the terminating null byte. Upon failure NULL is returned. */ List *new_tnt_string(char *format, int format_len, ...); TNT *new_tnt_pointer(size_t size); /* RIGHT should be created by new_tnt_string or something alike. If RIGHT is NULL, NULL is returned. */ Rule *new_rule(NT left, const List * const right); Grammar *new_grammar(); /* destructors */ void destroy_rule(void *rule, int flag); void destroy_rule_and_free_all(void *rule); void destroy_rule_and_free_first(void *rule); void destroy_rule_no_free(void *rule); void destroy_cpa_and_free_all(void *element); void destroy_grammar(void *grammar, int flag); /* convenient macro */ #define NEW_CPA(X, ARRAY, SIZE) do { \ X = MYALLOC(cpa, 1); \ X->array = ARRAY; \ X->size = SIZE; \ } while (0) #endif