src/grammar.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143

#ifndef GRAMMAR_H
#define GRAMMAR_H

#include <stdarg.h>
#include <stdio.h>
#include "list.h"
#include "utf8.h"

/* The class file for grammars */

/* TODO: Terminals should be extended to incorporate the "predicate
   terminals", such as character classes. */

/* A grammar is a list of rules.  A rule replaces a non-terminal with
   a finite string of terminals and non-terminals.

   For us, a terminal is a positive integer, and a non-terminal is
   also an integer.  We represent them as unsigned integers.  Since we
   want to deal with unicode points, the terminal is stored as
   unsigned long, so that we can include all unicode poitns.

   Moreover, a terminal might be a "predicate terminal", that is, it
   does not refer to one character only.  Instead, it refers to a set
   of characters.  The most efficient way of recognizing a set of
   characters, of which I know, is to use a deterministic finite
   automaton.  So we represent a predicate terminal as a DFA.

   Whereas the DFA's are equivalent to regular expressions in the
   expressive power, I do not need full support for regular
   expressions for the predicate terminals.  In particular, I only
   want to match a character for one predicate terminal, so there are
   no such things as repititive and optional constructs in the DFA.
   Rather, each predicate terminal is a simple table, or a matrix,
   which must have exactly 256 columns and an indefinite number of
   rows.  The columns represent the input character's bytes to match,
   and the rows are the states of the DFA.  The table must have at
   least one row: the starting state is always the zeroth row and the
   accepting state is any state that is out of bounds: either greater
   than or equal to the number of rows or negative.

   Of course, it is inefficient to keep the whoe table within the
   memory, especially since the most part of the table would be
   unused.  So we store a DFA in a compressed format.  I have
   considered whether to apply some more advanced compression
   techniques, such as using BW transoform followed by the MTF
   algorithm, but in the end I think that the DFA's that will be used
   are oft times just small tables, at least I hope so, thus I decided
   to simply use the Run-Length-Encoding.  See the file dfa.h for
   implementation details. */

typedef unsigned long T;
typedef unsigned NT;

/* T or NT

   Don't worry, it is not explosive. */

struct TNT_s {
  int type;                     /* 0 => T and NT otherwise */
  union { T t; NT nt; } data;
};

typedef struct TNT_s TNT;
typedef struct Rule_s Rule;

/* A grammar is more than a list of rules: it should include a
   correspondance to associate the names of non-terminals with
   unsigned integers.  Basically, this means it should hold a list of
   strings as well. */
typedef struct Grammar_s Grammar;

/* G is expected to be allocated before this function.

   NAMES is a list of CPA pointers. */
BOOL
build_grammar(Grammar *g,
              const List *rules, const List * const names);

/* This accepts one and only one optional argument, which is of type
   either T or NT, depending on TYPE being 0 or not. */
TNT *new_tnt(int type, ...);

/* An array of code-points. */

struct cpa_s {
  NUM *array;
  UNUM size;
};

typedef struct cpa_s cpa;

NUM
find_in_cpa_list(const NUM * const restrict string, NUM size,
                 const List * const restrict list);

/* For debugging purposes, we need to print out rules to examine their
   values. */

/* assume element is a cpa pointer */
void print_name(void *element);

void print_tnt(void *element);

void print_rule(void *r);

/* This will generate errors if G is not a list of rules. */
void print_grammar(const Grammar * const g);

/* constructors */

/* FORMAT specifies the types of TNTs.  It is a string of t and n's.
   FORMAT_LEN is the length of FORMAT, excluding the terminating null
   byte.

   Upon failure NULL is returned. */
List *new_tnt_string(char *format, int format_len, ...);

TNT *new_tnt_pointer(size_t size);

/* RIGHT should be created by new_tnt_string or something alike.

   If RIGHT is NULL, NULL is returned. */
Rule *new_rule(NT left, const List * const right);

Grammar *new_grammar();

/* destructors */
void destroy_rule(void *rule, int flag);
void destroy_rule_and_free_all(void *rule);
void destroy_rule_and_free_first(void *rule);
void destroy_rule_no_free(void *rule);
void destroy_cpa_and_free_all(void *element);
void destroy_grammar(void *grammar, int flag);

/* convenient macro */

#define NEW_CPA(X, ARRAY, SIZE) do {            \
    X = MYALLOC(cpa, 1);                        \
    X->array = ARRAY;                           \
    X->size = SIZE;                             \
  } while (0)

#endif