#include "utf8.h"
#include "str_partial.h"
/* NOTE(review): the targets of the two system includes were lost in
   the copy of the file this was reconstructed from.  The headers
   below are what the code visibly needs: fixed-width integer types,
   (presumably) the stream functions behind eprintf, and NULL.
   Confirm against the original. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct utf8_s { str s; };

/* The classification and the transition table.

   The first 256 entries map every possible byte to a character
   class.  The entries after that form the state transition table: it
   is indexed by 256 + current state + class of the incoming byte, and
   yields the next state.  States are stored pre-multiplied by the
   row width (12), so that no multiplication is needed at run time. */
static const uint8_t utf8d[] = {
  /* Byte classes, 16 bytes per row, for the bytes 0x00 to 0xff. */
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
  8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3,
  11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,

  /* State transitions, one row of 12 classes per state. */
  0,12,24,36,60,96,84,12,12,12,48,72,
  12,12,12,12,12,12,12,12,12,12,12,12,
  12, 0,12,12,12,12,12, 0,12, 0,12,12,
  12,24,12,12,12,12,12,24,12,24,12,12,
  12,12,12,12,12,12,12,24,12,12,12,12,
  12,24,12,12,12,12,12,12,12,24,12,12,
  12,12,12,12,12,12,12,36,12,36,12,12,
  12,36,12,12,12,12,12,36,12,36,12,12,
  12,36,12,12,12,12,12,12,12,12,12,12,
};

/* Feed one BYTE to the decoder.

   STATE holds the decoder state between calls and is updated in
   place; CODEP accumulates the code point decoded so far.  In the
   accepting state the byte starts a new sequence, so its payload bits
   are extracted by masking off the class-dependent prefix; in any
   other state the byte contributes its low six bits to the code point
   under construction.

   Returns the new state, which is also stored in *STATE. */
static UTF8_State
decode(UTF8_State *state, uint32_t *codep, uint8_t byte)
{
  uint32_t type = utf8d[byte];

  *codep = (*state != UTF8_STATE_ACCEPT) ?
    (byte & 0x3fu) | (*codep << 6) :
    (0xff >> type) & ((uint32_t) byte);

  *state = utf8d[256 + *state + type];

  return *state;
}

/* Decode the UTF-8 sequence that starts at byte offset N of S.

   Returns a str_info whose first member is the decoded code point, or
   -1 on failure, and whose second member is a number of bytes:

   - N is not inside S: { -1, 0 }.
   - The first byte alone is a complete sequence: { code, 1 }.
   - The first byte is rejected outright: { -1, 1 }.
   - The first byte is a lead byte but is the last byte of S:
     { -1, 1 }.
   - A multi-byte sequence is accepted: { code, bytes consumed }.
   - Otherwise (a later byte is rejected, or S ends in the middle of
     the sequence): { -1, bytes consumed - 1 }.

   NOTE(review): the loop below treats every state greater than
   UTF8_STATE_REJECT as "sequence still in progress"; this presumably
   relies on UTF8_STATE_ACCEPT being 0 and UTF8_STATE_REJECT being 12,
   as laid out in the transition table -- confirm in utf8.h. */
str_info
utf8_get_info(const str * const restrict s, UNUM n)
{
  UTF8_State state = UTF8_STATE_ACCEPT;
  uint32_t code = 0;
  NUM orig_n = n;

  /* error */
  if (n >= s->size) return (str_info) { -1, 0 };

  decode(&state, &code, *(s->data+n));

  if (state == UTF8_STATE_ACCEPT) return (str_info) { code, 1 };
  if (state == UTF8_STATE_REJECT) return (str_info) { -1, 1 };

  n++;

  /* A lead byte with nothing after it. */
  if (n >= s->size) return (str_info) { -1, 1 };

  /* Consume continuation bytes until the sequence is either accepted
     or rejected, or the input runs out. */
  for (; n < s->size && state > UTF8_STATE_REJECT; n++) {
    decode(&state, &code, *(s->data+n));
  }

  if (state == UTF8_STATE_ACCEPT)
    return (str_info) { code, n - orig_n };

  return (str_info) { -1, n - orig_n - 1 };
}

/* Wrap the SIZE bytes at STRING in a str whose getter decodes UTF-8.

   The str is created by new_str and then given utf8_get_info as its
   getter.  Returns NULL if new_str returns NULL.

   NOTE(review): whether new_str can actually return NULL, and who
   owns STRING afterwards, is decided by new_str and cannot be seen
   from here -- confirm against its definition. */
utf8 *
new_utf8(char *string, UNUM size)
{
  str *s = new_str(string, size);

  /* Do not write through a null pointer if the creation failed. */
  if (s == NULL) return NULL;

  s->getter = utf8_get_info;

  return (utf8 *)s;
}

/* Encode CODE_POINT as UTF-8 into RESULT.

   On entry RESULT->size is the capacity of RESULT->data in bytes; on
   success it is set to the number of bytes written (1 to 4).  RESULT
   should be long enough to hold the data.

   Returns 0 on success and 1 otherwise.  The failures are:

   - CODE_POINT is negative, greater than 0x10FFFF, or a surrogate
     (0xD800 to 0xDFFF): none of these can be represented in valid
     UTF-8, so they are refused with a diagnostic.  Refusing negative
     values up front also keeps them away from any shifting, which
     would not terminate sensibly on a signed type.
   - RESULT is too small for the encoding: a diagnostic is printed
     and RESULT is left untouched.
   - CODE_POINT is 0: RESULT->size is set to 0, nothing is written,
     no diagnostic is printed, and 1 is returned.  NOTE(review): this
     keeps the long-standing behaviour; callers apparently treat
     "nothing to encode" like a failure -- confirm that U+0000 is
     never meant to be encoded. */
BOOL
encode(NUM code_point, str *result)
{
  if (code_point < 0 || code_point > 0x10FFFF ||
      (code_point >= 0xD800 && code_point <= 0xDFFF)) {
    eprintf("%s:%d, Invalid code point to encode\n",
            __FILE__, __LINE__);
    return 1;
  }

  if (code_point == 0) {
    result->size = 0;
    return 1;
  }

  /* The range check above guarantees that the value fits. */
  uint32_t cp = (uint32_t) code_point;

  /* Number of bytes needed: 7, 11, 16 and 21 payload bits fit in 1,
     2, 3 and 4 bytes respectively. */
  UNUM needed = 4;

  if (cp <= 0x7f)
    needed = 1;
  else if (cp <= 0x7ff)
    needed = 2;
  else if (cp <= 0xffff)
    needed = 3;

  if (result->size < needed) {
    eprintf("%s:%d, Result cannot hold the encoded value\n",
            __FILE__, __LINE__);
    return 1;
  }

  result->size = needed;

  switch (needed) {
  case 1:
    result->data[0] = (char) cp;
    break;
  case 2:
    result->data[0] = (char) (0xc0 | (cp >> 6));
    result->data[1] = (char) (0x80 | (cp & 0x3f));
    break;
  case 3:
    result->data[0] = (char) (0xe0 | (cp >> 12));
    result->data[1] = (char) (0x80 | ((cp >> 6) & 0x3f));
    result->data[2] = (char) (0x80 | (cp & 0x3f));
    break;
  default:
    result->data[0] = (char) (0xf0 | (cp >> 18));
    result->data[1] = (char) (0x80 | ((cp >> 12) & 0x3f));
    result->data[2] = (char) (0x80 | ((cp >> 6) & 0x3f));
    result->data[3] = (char) (0x80 | (cp & 0x3f));
    break;
  }

  return 0;
}