summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJSDurand <mmemmew@gmail.com>2022-02-06 23:35:42 +0800
committerJSDurand <mmemmew@gmail.com>2022-02-06 23:50:22 +0800
commitaaa12504c6095b2cdfa213a3d4b269bbd5e7038a (patch)
tree9513833a65f0f2687b238fe6d0415bd5877ed8ae
parent3d709982b66314b23b5957041580dd4918561a53 (diff)
dfa: add the type of "ranged dfas"
Strictly speaking, they are not DFA's at all. They contain ranges which can determine whether or not a character belongs to the specified predicate terminal.
-rw-r--r--src/dfa.c197
-rw-r--r--src/dfa.h23
-rw-r--r--src/test/check_dfa.c10
-rw-r--r--src/test/check_pred.c15
4 files changed, 198 insertions, 47 deletions
diff --git a/src/dfa.c b/src/dfa.c
index 9da201d..2edda77 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -11,6 +11,17 @@
*(X->children+macro_index++) = DFA_STATE_UNKNOWN; \
} while (0)
+struct range_s {
+ NUM beg, end;
+};
+
+struct special_dfa_s {
+ NUM len;
+ struct range_s *ranges;
+ /* This is only used when this is a mixed special dfa */
+ NUM boundary;
+};
+
/* The complete, uncompressed table is represented as an array of
arrays, each array of which has size MAX_CHAR_BYTES_NUM. The
compressed table is represented as run-lengths. The ROW_N is the
@@ -32,25 +43,24 @@ enum DFA_TYPE {
DFA_TYPE_NEGATIVE,
DFA_TYPE_BOTH,
DFA_TYPE_SPECIAL,
+ DFA_TYPE_SPECIAL_NEG,
+ DFA_TYPE_SPECIAL_BOTH,
};
struct dfa_s {
enum DFA_TYPE type;
union {
compressed_table table;
- special_dfa fn;
+ special_dfa sp;
} data;
};
dfa *
new_dfa()
{
- dfa *dfap = MYALLOC(dfa, 1);
-
- dfap->type = 0;
+ dfa *dfap = NULL;
- dfap->data.table.row_n = 0;
- dfap->data.table.table = NULL;
+ SAFE_CALLOC(dfa, dfap, 1, return NULL;);
return dfap;
}
@@ -58,8 +68,12 @@ new_dfa()
void
destroy_dfa(dfa *table)
{
- /* function pointers need no destructors */
- if (table->type == DFA_TYPE_SPECIAL) goto free_last;
+ if (table->type == DFA_TYPE_SPECIAL ||
+ table->type == DFA_TYPE_SPECIAL_NEG ||
+ table->type == DFA_TYPE_SPECIAL_BOTH) {
+ free(table->data.sp.ranges);
+ goto free_last;
+ }
for (int i = 0; i < table->data.table.row_n;)
free(*(table->data.table.table+i++));
@@ -123,21 +137,40 @@ print_dfa(const dfa * const restrict table)
switch (table->type) {
case DFA_TYPE_POSITIVE:
printf("Printing a positive table:\n");
+ goto print_table;
break;
case DFA_TYPE_NEGATIVE:
printf("Printing a negative table:\n");
break;
case DFA_TYPE_BOTH:
printf("Printing a mixed table:\n");
+ goto print_table;
+ break;
+ case DFA_TYPE_SPECIAL:
+ printf("Printing a table containing positive ranges:\n");
+ break;
+ case DFA_TYPE_SPECIAL_NEG:
+ printf("Printing a table containing negative ranges:\n");
+ break;
+ case DFA_TYPE_SPECIAL_BOTH:
+ printf("Printing a table containing mixed ranges:\n");
break;
default:
- printf("A special table cannot be printed\n");
+ printf("Table of unknown type\n");
return;
break;
}
- printf("Printing a table:\n");
+ for (int i = 0; i < table->data.sp.len; i++) {
+ printf("Range %d: %ld to %ld\n", i,
+ (table->data.sp.ranges+i)->beg,
+ (table->data.sp.ranges+i)->end);
+ }
+ printf("\n");
+
+ return;
+ print_table:
for (int i = 0; i < table->data.table.row_n; i++) {
printf("Row %d: ", i);
int size = 0;
@@ -232,7 +265,9 @@ dfa_from_bytes(int sequence_size,
/* For debugging */
+#ifdef DEBUG
/* print_tries(states); */
+#endif
/* construct DFA from the trie */
@@ -610,22 +645,135 @@ dfa *dfa_from_bytes_both(int sequence_size,
return dfap;
}
+dfa *
+dfa_from_ranges(int len, CCR_MOD(NUM *) data)
+{
+ struct range_s *rp = NULL;
+ SAFE_MALLOC(struct range_s, rp, len, return NULL;);
+
+ for (int i = 0; i < len; i++) {
+ (rp+i)->beg = *(data+(i<<1));
+ (rp+i)->end = *(data+(i<<1)+1);
+ }
+
+ dfa *dfap = NULL;
+ SAFE_MALLOC(dfa, dfap, 1, free(rp); return NULL;);
+
+ dfap->type = DFA_TYPE_SPECIAL;
+ dfap->data.sp = (special_dfa) { len, rp, 0 };
+
+ return dfap;
+}
+
+dfa *
+dfa_from_ranges_neg(int len, CCR_MOD(NUM *) data)
+{
+ dfa *dfap = dfa_from_ranges(len, data);
+ if (dfap == NULL) return NULL;
+ dfap->type = DFA_TYPE_SPECIAL_NEG;
+ return dfap;
+}
+
+dfa *
+dfa_from_ranges_both(int plen, CCR_MOD(NUM *) pdata,
+ int nlen, CCR_MOD(NUM *) ndata)
+{
+ int len = plen + nlen;
+ struct range_s *rp = NULL;
+ SAFE_MALLOC(struct range_s, rp, len, return NULL;);
+
+ for (int i = 0; i < plen; i++) {
+ (rp+i)->beg = *(pdata+(i<<1));
+ (rp+i)->end = *(pdata+(i<<1)+1);
+ }
+
+ for (int i = 0; i < nlen; i++) {
+ (rp+plen+i)->beg = *(ndata+(i<<1));
+ (rp+plen+i)->end = *(ndata+(i<<1)+1);
+ }
+
+ dfa *dfap = NULL;
+ SAFE_MALLOC(dfa, dfap, 1, free(rp); return NULL;);
+
+ dfap->type = DFA_TYPE_SPECIAL_BOTH;
+ dfap->data.sp = (special_dfa) { len, rp, plen };
+
+ return dfap;
+}
+
BOOL
run_dfa(const dfa * const restrict table, const NUM code)
{
+ /* negative codes have special purposes */
+ if (code < 0) return 0;
+
switch (table->type) {
case DFA_TYPE_POSITIVE:
case DFA_TYPE_NEGATIVE:
case DFA_TYPE_BOTH:
break;
+ case DFA_TYPE_SPECIAL:
+ for (int i = 0; i < table->data.sp.len; i++)
+ if (code >= (table->data.sp.ranges+i)->beg &&
+ code <= (table->data.sp.ranges+i)->end) {
+#ifdef DEBUG
+ fleprintf("code = %ld, beg = %ld, end = %ld\n",
+ code,
+ (table->data.sp.ranges+i)->beg,
+ (table->data.sp.ranges+i)->end);
+#endif
+ return 1;
+ }
+ return 0;
+ break;
+ case DFA_TYPE_SPECIAL_NEG:
+ for (int i = 0; i < table->data.sp.len; i++)
+ if (code >= (table->data.sp.ranges+i)->beg &&
+ code <= (table->data.sp.ranges+i)->end) {
+#ifdef DEBUG
+ fleprintf("code = %ld, beg = %ld, end = %ld\n",
+ code,
+ (table->data.sp.ranges+i)->beg,
+ (table->data.sp.ranges+i)->end);
+#endif
+ return 0;
+ }
+ return 1;
+ break;
+ case DFA_TYPE_SPECIAL_BOTH:
+ for (int i = table->data.sp.boundary;
+ i < table->data.sp.len;
+ i++)
+ if (code >= (table->data.sp.ranges+i)->beg &&
+ code <= (table->data.sp.ranges+i)->end) {
+#ifdef DEBUG
+ fleprintf("code = %ld, beg = %ld, end = %ld\n",
+ code,
+ (table->data.sp.ranges+i)->beg,
+ (table->data.sp.ranges+i)->end);
+#endif
+ return 0;
+ }
+ for (int i = 0; i < table->data.sp.boundary; i++)
+ if (code >= (table->data.sp.ranges+i)->beg &&
+ code <= (table->data.sp.ranges+i)->end) {
+#ifdef DEBUG
+ fleprintf("code = %ld, beg = %ld, end = %ld\n",
+ code,
+ (table->data.sp.ranges+i)->beg,
+ (table->data.sp.ranges+i)->end);
+#endif
+ return 1;
+ }
+ return 0;
+ break;
default:
- return table->data.fn(code);
+ fleprintf("Unknown type: %d\n", table->type);
+ return 0;
break;
}
- /* negative codes have special purposes */
- if (code < 0) return 0;
-
+ /* below is for running the DFA */
char s[5];
str *strp = new_str(s, 5);
@@ -696,24 +844,3 @@ run_dfa(const dfa * const restrict table, const NUM code)
return 0;
}
-
-P_ATTR
-static BOOL
-dfa_any_fun(const NUM UNUSED code) { return 1; }
-
-dfa *
-dfa_from_func(special_dfa func)
-{
- dfa *result = NULL;
- SAFE_MALLOC(dfa, result, 1, return NULL;);
- result->type = DFA_TYPE_SPECIAL;
- result->data.fn = func;
-
- return result;
-}
-
-dfa *
-dfa_any(void)
-{
- return dfa_from_func(dfa_any_fun);
-}
diff --git a/src/dfa.h b/src/dfa.h
index 9116999..9d76a22 100644
--- a/src/dfa.h
+++ b/src/dfa.h
@@ -22,7 +22,9 @@ enum {
/* dfa type */
-typedef BOOL (* special_dfa) (const NUM code);
+/* typedef BOOL (* special_dfa) (const NUM code); */
+
+typedef struct special_dfa_s special_dfa;
typedef struct dfa_s dfa;
@@ -45,6 +47,20 @@ dfa *dfa_from_bytes_both(int sequence_size,
int neg_sequence_size,
CCR_MOD(NUM *) negdata);
+/* 2*LEN is the lengths of DATA. The number at index 2*i is the start
+ of the i-th range and the number at index 2*i+1 is the end of the
+ i-th range.
+
+ On error return NULL. */
+dfa *dfa_from_ranges(int len, CCR_MOD(NUM *) data);
+
+/* mutatis mutandis */
+dfa *dfa_from_ranges_neg(int len, CCR_MOD(NUM *) data);
+
+/* mutatis mutandis */
+dfa *dfa_from_ranges_both(int plen, CCR_MOD(NUM *) pdata,
+ int nlen, CCR_MOD(NUM *) ndata);
+
/* TODO: Reject character bytes from a given DFA. */
/* NOTE: Add all unicode valid points to a DFA, so that we can
@@ -57,11 +73,6 @@ dfa *dfa_from_bytes_both(int sequence_size,
/* TODO: Construct some basic frequently used character classes. */
-dfa *dfa_from_func(special_dfa func);
-
-/* return a new instance of the any class */
-dfa *dfa_any(void);
-
BOOL run_dfa(CCR_MOD(dfa *) table, const NUM code);
#endif
diff --git a/src/test/check_dfa.c b/src/test/check_dfa.c
index a728fb8..426eee4 100644
--- a/src/test/check_dfa.c
+++ b/src/test/check_dfa.c
@@ -104,5 +104,15 @@ main(int UNUSED argc, char ** UNUSED argv)
* printf("the result = %d\n", result); */
if (dfap) destroy_dfa(dfap);
+
+ dfap = dfa_from_ranges(3, (NUM []) { 1, 10, 'a', 'z', 'A', 'Z' });
+
+ if (dfap) {
+ printf("Successfully created a DFA from an array of ranges\n");
+ print_dfa(dfap);
+ }
+
+ if (dfap) destroy_dfa(dfap);
+
return 0;
}
diff --git a/src/test/check_pred.c b/src/test/check_pred.c
index 14081d5..8a3e613 100644
--- a/src/test/check_pred.c
+++ b/src/test/check_pred.c
@@ -1,3 +1,4 @@
+#include "limits.h"
#include "time.h"
#include "../cnp.h"
@@ -88,7 +89,7 @@ main(int UNUSED argc, char ** UNUSED argv)
*(user_name+4) = 'i';
*(user_name+5) = 0;
- user_name_s = new_utf8(user_name, 5);
+ user_name_s = (str *) new_utf8(user_name, 5);
SAFE_MALLOC(char, raw_name, 8, return 1;);
@@ -101,7 +102,7 @@ main(int UNUSED argc, char ** UNUSED argv)
*(raw_name+6) = 'Z';
*(raw_name+7) = 0;
- raw_name_s = new_utf8(raw_name, 6);
+ raw_name_s = (str *) new_utf8(raw_name, 6);
SAFE_MALLOC(NUM, pred_bytes, 52, return 1;);
@@ -111,9 +112,11 @@ main(int UNUSED argc, char ** UNUSED argv)
for (int i = 0; i < 26; i++)
*(pred_bytes+pred_bytes_len++) = 'A' + i;
- if (add_to_list(preds, new_ptd(user_name_s, raw_name_s,
- dfa_from_bytes_neg
- (pred_bytes_len, pred_bytes)))) {
+ if (add_to_list(preds,
+ new_ptd(user_name_s, raw_name_s,
+ dfa_from_ranges_both
+ (2, (NUM[]) { 'a', 'z', 'A', 'Z' },
+ 2, (NUM[]) { 'b', 'w', 'B', 'W' })))) {
fleprintf0("Fail to add a predicate\n");
return 1;
}
@@ -126,7 +129,7 @@ main(int UNUSED argc, char ** UNUSED argv)
print_grammar(g);
- utf8 *string = new_utf8("++--__,.(){}><|", 15);
+ utf8 *string = new_utf8("aaaxxxxaaaaaaaa", 15);
printf("\nPrinting the input...\n%s\n", get_data((str *) string));