#ifndef GRAMMAR_H #define GRAMMAR_H #include "dfa.h" #include "ht.h" #include #include #include "list.h" #include "utf8.h" /* The class file for grammars */ /* A grammar is a list of rules. A rule replaces a non-terminal with a finite string of terminals and non-terminals. For us, a terminal is a positive integer, and a non-terminal is also an integer. We represent them as unsigned integers. Since we want to deal with unicode points, the terminal is stored as unsigned long, so that we can include all unicode poitns. Moreover, a terminal might be a "predicate terminal", that is, it does not refer to one character only. Instead, it refers to a set of characters. The most efficient way of recognizing a set of characters, of which I know, is to use a deterministic finite automaton. So we represent a predicate terminal as a DFA. Whereas the DFA's are equivalent to regular expressions in the expressive power, I do not need full support for regular expressions for the predicate terminals. In particular, I only want to match a character for one predicate terminal, so there are no such things as repititive and optional constructs in the DFA. Rather, each predicate terminal is a simple table, or a matrix, which must have exactly 256 columns and an indefinite number of rows. The columns represent the input character's bytes to match, and the rows are the states of the DFA. The table must have at least one row: the starting state is always the zeroth row and the accepting state is any state that is out of bounds: either greater than or equal to the number of rows or negative. Of course, it is inefficient to keep the whoe table within the memory, especially since the most part of the table would be unused. So we store a DFA in a compressed format. I have considered whether to apply some more advanced compression techniques, such as using BW transoform followed by the MTF algorithm, but in the end I think that the DFA's that will be used are oft times just small tables, at least I hope so, thus I decided to simply use the Run-Length-Encoding. See the file dfa.h for implementation details. */ typedef long T; typedef unsigned long NT; typedef unsigned long PT; enum PT_TYPE_e { PT_DFA, PT_SPECIAL }; typedef enum PT_TYPE_e PT_TYPE; typedef struct PT_DATA_s PTD; struct PT_DATA_s { PT_TYPE type; union { special_dfa s; dfa *d; } data; char *label; /* NULL-terminated string */ }; /* T or NT Don't worry, it is not explosive. */ enum TNT_TYPE_e { TERMINAL, PREDICATE, NONTERMINAL }; typedef enum TNT_TYPE_e TNT_TYPE; /* If a TNT of type TERMINAL has value END_OF_INPUT, then it means, surprisingly, the end of input. */ enum { END_OF_INPUT = -1 }; struct TNT_s { TNT_TYPE type; union { T t; NT nt; PT pt; } data; }; typedef struct TNT_s TNT; typedef struct Rule_s Rule; typedef struct Rule_group_s Rule_group; NT rg_left(CCR_MOD(Rule_group *) rg); NUM rg_len(CCR_MOD(Rule_group *) rg); List *rg_right(CCR_MOD(Rule_group *) rg); List *rg_nth(CCR_MOD(Rule_group *) rg, NUM n); /* A grammar is more than a list of rules: it should include a correspondance to associate the names of non-terminals with unsigned integers. Basically, this means it should hold a list of strings as well. */ typedef struct Grammar_s Grammar; /* G is expected to be allocated before this function. NAMES is a list of CPA pointers. */ BOOL build_grammar(Grammar *g, const List *rules, CC_MOD(List *) names); /* This accepts one and only one optional argument, which is of type either T, NT, or PT, depending on TYPE. */ TNT *new_tnt(TNT_TYPE type, ...); /* An array of code-points. */ struct cpa_s { NUM *array; UNUM size; }; typedef struct cpa_s cpa; NUM find_in_cpa_list(CCR_MOD(NUM *) string, NUM size, CCR_MOD(List *) list); /* For debugging purposes, we need to print out rules to examine their values. */ /* assume element is a cpa pointer */ void print_name(void *element); void print_tnt(void *element); void print_rule(void *r); /* This will generate errors if G is not a list of rules. */ void print_grammar(CC_MOD(Grammar *) g); /* constructors */ /* FORMAT specifies the types of TNTs. It is a string of t and n's. FORMAT_LEN is the length of FORMAT, excluding the terminating null byte. Upon failure NULL is returned. */ List *new_tnt_string(char *format, int format_len, ...); TNT *new_tnt_pointer(size_t size); /* RIGHT should be created by new_tnt_string or something alike. If RIGHT is NULL, NULL is returned. */ Rule *new_rule(NT left, CC_MOD(List *) right); Grammar *new_grammar(); /* destructors */ void destroy_rule(void *rule, int flag); void destroy_rule_and_free_all(void *rule); void destroy_rule_and_free_first(void *rule); void destroy_rule_no_free(void *rule); void destroy_cpa_and_free_all(void *element); void destroy_grammar(void *grammar, int flag); NUM grammar_left_len(CCR_MOD(Grammar *)g); /* look up a rule */ Rule_group *grammar_rule(CCR_MOD(Grammar *) g, NT nt); /* NTS should be already allocated to be an array of BOOLs. The size of this array should be the same as the length of the array of non-terminals of G. If any of the above two conditions is not satisfied, the behaviour is undefined. After this function returns, the element corresponding to a non-terminal X is non-zero if and only if X has epsilon rules; that is to say, X can produce the empty string. */ void epsilon_nts(CC_MOD(Grammar *) g, BOOL * const restrict nts); /* TERMINAL_HTS and PREDICATE_HTS are supposed to be already allocated to have enough space to contain an array of hash tables of the same length as the array of non-terminals. Each element in the array should be hash tables that are already created. If any of the above conditions is not satisfied, the behaviour is undefined. TERMINAL_HTS contain mappings from non-terminals to sets of terminals. A terminal belongs to the hash table corresponding to the non-terminal X in TERMINAL_HTS if and only if that terminal belongs to the first set of X. Mutatis mutandis for PREDICATE_HTS. The reason for two arrays is that terminals and predicates have two different types in our grammar, and are stored in different arrays, so we cannot simply refer to them as a single integer. The hash tables that result will contain invalid pointers as values after the function returns. On error return non-zero. */ BOOL nt_first(CC_MOD(Grammar *) g, CCR_MOD(BOOL *) nts, ht *terminal_hts, ht *predicate_hts); /* Having computed the first sets of non-terminals, we can compute the first sets of strings of non-terminals and terminals. RESULT_TERMINALS and RESULT_PREDICATES must satisfy similar requirements as in nt_first. The values are meaningless invalid pointers. So one should not free those pointers afterwards. On error return non-zero. */ BOOL tnt_first(CC_MOD(ht *) terminal_hts, CC_MOD(ht *) predicate_hts, CCR_MOD(BOOL *) nts, NUM len, List *tnts, ht * const restrict result_terminals, ht * const restrict result_predicates); /* Mutatis mutandis to NT_FIRST, but for the follow sets of non-terminals. Note that the hash table RESULT_TERMINAL_HTS is special here. It not only contains keys for the terminals in the follow set of the corresponding nonterminal, but it also contains END_OF_INPUT for the nonterminals whose follow set contains the end-of-input marker. On error return non-zero. */ BOOL nt_follow(CC_MOD(Grammar *) g, CCR_MOD(BOOL *) nts, CC_MOD(ht *)terminal_hts, CC_MOD(ht *)predicate_hts, ht * const result_terminals, ht * const result_predicates); /* convenient macro */ #define NEW_CPA(X, ARRAY, SIZE) do { \ X = MYALLOC(cpa, 1); \ X->array = ARRAY; \ X->size = SIZE; \ } while (0) #endif