From aaa12504c6095b2cdfa213a3d4b269bbd5e7038a Mon Sep 17 00:00:00 2001 From: JSDurand Date: Sun, 6 Feb 2022 23:35:42 +0800 Subject: dfa: add the type of "ranged dfas" Strictly speaking, they are not DFA's at all. They contain ranges which can determine whether or not a character belongs to the specified predicate terminal. --- src/dfa.c | 197 +++++++++++++++++++++++++++++++++++++++++--------- src/dfa.h | 23 ++++-- src/test/check_dfa.c | 10 +++ src/test/check_pred.c | 15 ++-- 4 files changed, 198 insertions(+), 47 deletions(-) diff --git a/src/dfa.c b/src/dfa.c index 9da201d..2edda77 100644 --- a/src/dfa.c +++ b/src/dfa.c @@ -11,6 +11,17 @@ *(X->children+macro_index++) = DFA_STATE_UNKNOWN; \ } while (0) +struct range_s { + NUM beg, end; +}; + +struct special_dfa_s { + NUM len; + struct range_s *ranges; + /* This is only used when this is a mixed special dfa */ + NUM boundary; +}; + /* The complete, uncompressed table is represented as an array of arrays, each array of which has size MAX_CHAR_BYTES_NUM. The compressed table is represented as run-lengths. The ROW_N is the @@ -32,25 +43,24 @@ enum DFA_TYPE { DFA_TYPE_NEGATIVE, DFA_TYPE_BOTH, DFA_TYPE_SPECIAL, + DFA_TYPE_SPECIAL_NEG, + DFA_TYPE_SPECIAL_BOTH, }; struct dfa_s { enum DFA_TYPE type; union { compressed_table table; - special_dfa fn; + special_dfa sp; } data; }; dfa * new_dfa() { - dfa *dfap = MYALLOC(dfa, 1); - - dfap->type = 0; + dfa *dfap = NULL; - dfap->data.table.row_n = 0; - dfap->data.table.table = NULL; + SAFE_CALLOC(dfa, dfap, 1, return NULL;); return dfap; } @@ -58,8 +68,12 @@ new_dfa() void destroy_dfa(dfa *table) { - /* function pointers need no destructors */ - if (table->type == DFA_TYPE_SPECIAL) goto free_last; + if (table->type == DFA_TYPE_SPECIAL || + table->type == DFA_TYPE_SPECIAL_NEG || + table->type == DFA_TYPE_SPECIAL_BOTH) { + free(table->data.sp.ranges); + goto free_last; + } for (int i = 0; i < table->data.table.row_n;) free(*(table->data.table.table+i++)); @@ -123,21 +137,40 @@ print_dfa(const dfa * const restrict table) switch (table->type) { case DFA_TYPE_POSITIVE: printf("Printing a positive table:\n"); + goto print_table; break; case DFA_TYPE_NEGATIVE: printf("Printing a negative table:\n"); break; case DFA_TYPE_BOTH: printf("Printing a mixed table:\n"); + goto print_table; + break; + case DFA_TYPE_SPECIAL: + printf("Printing a table containing positive ranges:\n"); + break; + case DFA_TYPE_SPECIAL_NEG: + printf("Printing a table containing negative ranges:\n"); + break; + case DFA_TYPE_SPECIAL_BOTH: + printf("Printing a table containing mixed ranges:\n"); break; default: - printf("A special table cannot be printed\n"); + printf("Table of unknown type\n"); return; break; } - printf("Printing a table:\n"); + for (int i = 0; i < table->data.sp.len; i++) { + printf("Range %d: %ld to %ld\n", i, + (table->data.sp.ranges+i)->beg, + (table->data.sp.ranges+i)->end); + } + printf("\n"); + + return; + print_table: for (int i = 0; i < table->data.table.row_n; i++) { printf("Row %d: ", i); int size = 0; @@ -232,7 +265,9 @@ dfa_from_bytes(int sequence_size, /* For debugging */ +#ifdef DEBUG /* print_tries(states); */ +#endif /* construct DFA from the trie */ @@ -610,22 +645,135 @@ dfa *dfa_from_bytes_both(int sequence_size, return dfap; } +dfa * +dfa_from_ranges(int len, CCR_MOD(NUM *) data) +{ + struct range_s *rp = NULL; + SAFE_MALLOC(struct range_s, rp, len, return NULL;); + + for (int i = 0; i < len; i++) { + (rp+i)->beg = *(data+(i<<1)); + (rp+i)->end = *(data+(i<<1)+1); + } + + dfa *dfap = NULL; + SAFE_MALLOC(dfa, dfap, 1, free(rp); return NULL;); + + dfap->type = DFA_TYPE_SPECIAL; + dfap->data.sp = (special_dfa) { len, rp, 0 }; + + return dfap; +} + +dfa * +dfa_from_ranges_neg(int len, CCR_MOD(NUM *) data) +{ + dfa *dfap = dfa_from_ranges(len, data); + if (dfap == NULL) return NULL; + dfap->type = DFA_TYPE_SPECIAL_NEG; + return dfap; +} + +dfa * +dfa_from_ranges_both(int plen, CCR_MOD(NUM *) pdata, + int nlen, CCR_MOD(NUM *) ndata) +{ + int len = plen + nlen; + struct range_s *rp = NULL; + SAFE_MALLOC(struct range_s, rp, len, return NULL;); + + for (int i = 0; i < plen; i++) { + (rp+i)->beg = *(pdata+(i<<1)); + (rp+i)->end = *(pdata+(i<<1)+1); + } + + for (int i = 0; i < nlen; i++) { + (rp+plen+i)->beg = *(ndata+(i<<1)); + (rp+plen+i)->end = *(ndata+(i<<1)+1); + } + + dfa *dfap = NULL; + SAFE_MALLOC(dfa, dfap, 1, free(rp); return NULL;); + + dfap->type = DFA_TYPE_SPECIAL_BOTH; + dfap->data.sp = (special_dfa) { len, rp, plen }; + + return dfap; +} + BOOL run_dfa(const dfa * const restrict table, const NUM code) { + /* negative codes have special purposes */ + if (code < 0) return 0; + switch (table->type) { case DFA_TYPE_POSITIVE: case DFA_TYPE_NEGATIVE: case DFA_TYPE_BOTH: break; + case DFA_TYPE_SPECIAL: + for (int i = 0; i < table->data.sp.len; i++) + if (code >= (table->data.sp.ranges+i)->beg && + code <= (table->data.sp.ranges+i)->end) { +#ifdef DEBUG + fleprintf("code = %ld, beg = %ld, end = %ld\n", + code, + (table->data.sp.ranges+i)->beg, + (table->data.sp.ranges+i)->end); +#endif + return 1; + } + return 0; + break; + case DFA_TYPE_SPECIAL_NEG: + for (int i = 0; i < table->data.sp.len; i++) + if (code >= (table->data.sp.ranges+i)->beg && + code <= (table->data.sp.ranges+i)->end) { +#ifdef DEBUG + fleprintf("code = %ld, beg = %ld, end = %ld\n", + code, + (table->data.sp.ranges+i)->beg, + (table->data.sp.ranges+i)->end); +#endif + return 0; + } + return 1; + break; + case DFA_TYPE_SPECIAL_BOTH: + for (int i = table->data.sp.boundary; + i < table->data.sp.len; + i++) + if (code >= (table->data.sp.ranges+i)->beg && + code <= (table->data.sp.ranges+i)->end) { +#ifdef DEBUG + fleprintf("code = %ld, beg = %ld, end = %ld\n", + code, + (table->data.sp.ranges+i)->beg, + (table->data.sp.ranges+i)->end); +#endif + return 0; + } + for (int i = 0; i < table->data.sp.boundary; i++) + if (code >= (table->data.sp.ranges+i)->beg && + code <= (table->data.sp.ranges+i)->end) { +#ifdef DEBUG + fleprintf("code = %ld, beg = %ld, end = %ld\n", + code, + (table->data.sp.ranges+i)->beg, + (table->data.sp.ranges+i)->end); +#endif + return 1; + } + return 0; + break; default: - return table->data.fn(code); + fleprintf("Unknown type: %d\n", table->type); + return 0; break; } - /* negative codes have special purposes */ - if (code < 0) return 0; - + /* below is for running the DFA */ char s[5]; str *strp = new_str(s, 5); @@ -696,24 +844,3 @@ run_dfa(const dfa * const restrict table, const NUM code) return 0; } - -P_ATTR -static BOOL -dfa_any_fun(const NUM UNUSED code) { return 1; } - -dfa * -dfa_from_func(special_dfa func) -{ - dfa *result = NULL; - SAFE_MALLOC(dfa, result, 1, return NULL;); - result->type = DFA_TYPE_SPECIAL; - result->data.fn = func; - - return result; -} - -dfa * -dfa_any(void) -{ - return dfa_from_func(dfa_any_fun); -} diff --git a/src/dfa.h b/src/dfa.h index 9116999..9d76a22 100644 --- a/src/dfa.h +++ b/src/dfa.h @@ -22,7 +22,9 @@ enum { /* dfa type */ -typedef BOOL (* special_dfa) (const NUM code); +/* typedef BOOL (* special_dfa) (const NUM code); */ + +typedef struct special_dfa_s special_dfa; typedef struct dfa_s dfa; @@ -45,6 +47,20 @@ dfa *dfa_from_bytes_both(int sequence_size, int neg_sequence_size, CCR_MOD(NUM *) negdata); +/* 2*LEN is the lengths of DATA. The number at index 2*i is the start + of the i-th range and the number at index 2*i+1 is the end of the + i-th range. + + On error return NULL. */ +dfa *dfa_from_ranges(int len, CCR_MOD(NUM *) data); + +/* mutatis mutandis */ +dfa *dfa_from_ranges_neg(int len, CCR_MOD(NUM *) data); + +/* mutatis mutandis */ +dfa *dfa_from_ranges_both(int plen, CCR_MOD(NUM *) pdata, + int nlen, CCR_MOD(NUM *) ndata); + /* TODO: Reject character bytes from a given DFA. */ /* NOTE: Add all unicode valid points to a DFA, so that we can @@ -57,11 +73,6 @@ dfa *dfa_from_bytes_both(int sequence_size, /* TODO: Construct some basic frequently used character classes. */ -dfa *dfa_from_func(special_dfa func); - -/* return a new instance of the any class */ -dfa *dfa_any(void); - BOOL run_dfa(CCR_MOD(dfa *) table, const NUM code); #endif diff --git a/src/test/check_dfa.c b/src/test/check_dfa.c index a728fb8..426eee4 100644 --- a/src/test/check_dfa.c +++ b/src/test/check_dfa.c @@ -104,5 +104,15 @@ main(int UNUSED argc, char ** UNUSED argv) * printf("the result = %d\n", result); */ if (dfap) destroy_dfa(dfap); + + dfap = dfa_from_ranges(3, (NUM []) { 1, 10, 'a', 'z', 'A', 'Z' }); + + if (dfap) { + printf("Successfully created a DFA from an array of ranges\n"); + print_dfa(dfap); + } + + if (dfap) destroy_dfa(dfap); + return 0; } diff --git a/src/test/check_pred.c b/src/test/check_pred.c index 14081d5..8a3e613 100644 --- a/src/test/check_pred.c +++ b/src/test/check_pred.c @@ -1,3 +1,4 @@ +#include "limits.h" #include "time.h" #include "../cnp.h" @@ -88,7 +89,7 @@ main(int UNUSED argc, char ** UNUSED argv) *(user_name+4) = 'i'; *(user_name+5) = 0; - user_name_s = new_utf8(user_name, 5); + user_name_s = (str *) new_utf8(user_name, 5); SAFE_MALLOC(char, raw_name, 8, return 1;); @@ -101,7 +102,7 @@ main(int UNUSED argc, char ** UNUSED argv) *(raw_name+6) = 'Z'; *(raw_name+7) = 0; - raw_name_s = new_utf8(raw_name, 6); + raw_name_s = (str *) new_utf8(raw_name, 6); SAFE_MALLOC(NUM, pred_bytes, 52, return 1;); @@ -111,9 +112,11 @@ main(int UNUSED argc, char ** UNUSED argv) for (int i = 0; i < 26; i++) *(pred_bytes+pred_bytes_len++) = 'A' + i; - if (add_to_list(preds, new_ptd(user_name_s, raw_name_s, - dfa_from_bytes_neg - (pred_bytes_len, pred_bytes)))) { + if (add_to_list(preds, + new_ptd(user_name_s, raw_name_s, + dfa_from_ranges_both + (2, (NUM[]) { 'a', 'z', 'A', 'Z' }, + 2, (NUM[]) { 'b', 'w', 'B', 'W' })))) { fleprintf0("Fail to add a predicate\n"); return 1; } @@ -126,7 +129,7 @@ main(int UNUSED argc, char ** UNUSED argv) print_grammar(g); - utf8 *string = new_utf8("++--__,.(){}><|", 15); + utf8 *string = new_utf8("aaaxxxxaaaaaaaa", 15); printf("\nPrinting the input...\n%s\n", get_data((str *) string)); -- cgit v1.2.3-18-g5258