diff options
author | JSDurand <mmemmew@gmail.com> | 2022-01-04 11:51:58 +0800 |
---|---|---|
committer | JSDurand <mmemmew@gmail.com> | 2022-01-04 11:51:58 +0800 |
commit | 55dc897da6e81f2a26cfc7e66ac942824773498b (patch) | |
tree | fce0d7d57832907c991d551833bf5eecde947dd2 /src/utf8.c | |
parent | 53b8b6ffab5a968db75e9babddf4e2dbb2c688a3 (diff) |
temporary commit
Now we can read grammars from a file.
But we still need to check whether reading from strings works.
Diffstat (limited to 'src/utf8.c')
-rw-r--r-- | src/utf8.c | 150 |
1 files changed, 150 insertions, 0 deletions
diff --git a/src/utf8.c b/src/utf8.c new file mode 100644 index 0000000..f840600 --- /dev/null +++ b/src/utf8.c @@ -0,0 +1,150 @@ +#include "utf8.h" +#include "str_partial.h" +#include <stdint.h> +#include <stdio.h> + +struct utf8_s { + str s; +}; + +/* The classification and the transition table. */ + +static const +uint8_t utf8d[] = { + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, + 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, + + 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12, + 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, + 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, + 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, + 12,36,12,12,12,12,12,12,12,12,12,12, +}; + +static UTF8_State +decode(UTF8_State *state, uint32_t *codep, uint8_t byte) +{ + uint32_t type = utf8d[byte]; + + *codep = (*state != UTF8_STATE_ACCEPT) ? 
+ (byte & 0x3fu) | (*codep << 6) : + (0xff >> type) & ((uint32_t) byte); + + *state = utf8d[256 + *state + type]; + return *state; +} + +str_info +utf8_get_info(str *s, UNUM n) +{ + UTF8_State state = UTF8_STATE_ACCEPT; + uint32_t code = 0; + NUM orig_n = n; + + /* error */ + if (n >= s->size) return (str_info) { -1, 0 }; + + decode(&state, &code, *(s->data+n)); + + if (state == UTF8_STATE_ACCEPT) + return (str_info) { code, 1 }; + + if (state == UTF8_STATE_REJECT) + return (str_info) { -1, 1 }; + + n++; + + if (n >= s->size) return (str_info) { -1, 1 }; + + for (; n < s->size + && state > UTF8_STATE_REJECT; + n++) { + decode(&state, &code, *(s->data+n)); + } + + if (state == UTF8_STATE_ACCEPT) + return (str_info) { code, n - orig_n }; + + return (str_info) { -1, n - orig_n - 1}; +} + +utf8 * +new_utf8(char *string, UNUM size) +{ + str *s = new_str(string, size); + + s->getter = utf8_get_info; + + return (utf8 *)s; +} + +/* result should be long enough to hold the data */ +unsigned char +encode(NUM code_point, str *result) +{ + /* calculate the number of bits of CODE_POINT */ + int number_of_bits = 0; + + for (;code_point >> number_of_bits;) number_of_bits++; + + if (!number_of_bits) { + result->size = 0; + return 1; + } + + if (number_of_bits <= 7) { + if (!(result->size)) { + eprintf("%s:%d, Result cannot hold the encoded value\n", + __FILE__, __LINE__); + return 1; + } + + result->size = 1; + *(result->data) = (char) code_point; + } else if (number_of_bits <= 11) { + if (result->size < 2) { + eprintf("%s:%d, Result cannot hold the encoded value", + __FILE__, __LINE__); + return 1; + } + + result->size = 2; + *(result->data) = (char) (0xc0 | (code_point >> 6)); + *(result->data+1) = (char) (0x80 | (code_point & 0x3f)); + } else if (number_of_bits <= 16) { + if (result->size < 3) { + eprintf("%s:%d, Result cannot hold the encoded value", + __FILE__, __LINE__); + return 1; + } + + result->size = 3; + *(result->data) = (char) (0xe0 | (code_point >> 12)); + 
*(result->data+1) = (char) (0x80 | ((code_point >> 6) & 0x3f)); + *(result->data+2) = (char) (0x80 | (code_point & 0x3f)); + } else if (number_of_bits <= 21) { + if (result->size < 4) { + eprintf("%s:%d, Result cannot hold the encoded value", + __FILE__, __LINE__); + return 1; + } + + result->size = 4; + *(result->data) = (char) (0xf0 | (code_point >> 18)); + *(result->data+1) = (char) (0x80 | ((code_point >> 12) & 0x3f)); + *(result->data+2) = (char) (0x80 | ((code_point >> 6) & 0x3f)); + *(result->data+3) = (char) (0x80 | (code_point & 0x3f)); + } else { + eprintf("%s:%d, Invalid code point to encode", + __FILE__, __LINE__); + return 1; + } + + return 0; +} |