summaryrefslogtreecommitdiff
path: root/src/utf8.c
diff options
context:
space:
mode:
authorJSDurand <mmemmew@gmail.com>2022-01-04 11:51:58 +0800
committerJSDurand <mmemmew@gmail.com>2022-01-04 11:51:58 +0800
commit55dc897da6e81f2a26cfc7e66ac942824773498b (patch)
treefce0d7d57832907c991d551833bf5eecde947dd2 /src/utf8.c
parent53b8b6ffab5a968db75e9babddf4e2dbb2c688a3 (diff)
temporary commit
Now we can read grammars from a file. But we need to check if it works for reading strings still.
Diffstat (limited to 'src/utf8.c')
-rw-r--r--src/utf8.c150
1 files changed, 150 insertions, 0 deletions
diff --git a/src/utf8.c b/src/utf8.c
new file mode 100644
index 0000000..f840600
--- /dev/null
+++ b/src/utf8.c
@@ -0,0 +1,150 @@
+#include "utf8.h"
+#include "str_partial.h"
+#include <stdint.h>
+#include <stdio.h>
+
+struct utf8_s {
+ str s; /* the only member; new_utf8() returns a str * cast to utf8 *,
+         * so presumably utf8 is a typedef of this struct declared in
+         * utf8.h -- TODO confirm */
+};
+
+/* The classification and the transition table.
+ *
+ * The first 256 entries are indexed by a byte value and give that
+ * byte's class, a number from 0 to 11; decode() reads them as
+ * utf8d[byte].
+ *
+ * The remaining 108 entries form the transition table, which decode()
+ * reads as utf8d[256 + state + class].  It consists of nine groups of
+ * twelve entries (one entry per class), so the state values stored in
+ * it are the group offsets 0, 12, 24, ..., 96.
+ *
+ * NOTE(review): the layout and values look like Bjoern Hoehrmann's
+ * "Flexible and Economical UTF-8 Decoder" table -- confirm the
+ * provenance and the licence notice it requires.
+ */
+
+static const
+uint8_t utf8d[] = {
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* bytes 0x00..0x1f */
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* bytes 0x20..0x3f */
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* bytes 0x40..0x5f */
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* bytes 0x60..0x7f */
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, /* bytes 0x80..0x9f */
+ 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, /* bytes 0xa0..0xbf */
+ 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* bytes 0xc0..0xdf */
+ 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, /* bytes 0xe0..0xff */
+
+ 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12, /* from state 0, from state 12 */
+ 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, /* from state 24, from state 36 */
+ 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, /* from state 48, from state 60 */
+ 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, /* from state 72, from state 84 */
+ 12,36,12,12,12,12,12,12,12,12,12,12, /* from state 96 */
+};
+
+/* Feed one BYTE to the decoder.
+ *
+ * *STATE holds the decoder state before the call and is replaced by
+ * the state after it; the new state is also the return value.
+ * *CODEP accumulates the bits of the code point being decoded.
+ */
+static UTF8_State
+decode(UTF8_State *state, uint32_t *codep, uint8_t byte)
+{
+  const uint32_t byte_class = utf8d[byte];
+
+  if (*state == UTF8_STATE_ACCEPT) {
+    /* No sequence in progress: start over, keeping only the bits of
+       BYTE selected by the class-dependent mask. */
+    *codep = (0xff >> byte_class) & ((uint32_t) byte);
+  } else {
+    /* Inside a sequence: shift in the low six bits of BYTE. */
+    *codep = (byte & 0x3fu) | (*codep << 6);
+  }
+
+  /* Transitions live after the 256 class entries of utf8d. */
+  *state = utf8d[256 + *state + byte_class];
+
+  return *state;
+}
+
+/* Decode the UTF-8 sequence that starts at byte offset N of S.
+ *
+ * The first member of the returned str_info is the decoded code
+ * point, or -1 if no code point could be decoded.  The second member
+ * is a byte count:
+ *
+ *  - 0 when N is not smaller than S->size;
+ *  - 1 when the first byte alone is accepted or rejected, or when it
+ *    starts a longer sequence but is the last byte of S;
+ *  - the length of the sequence when a multi-byte sequence is
+ *    accepted;
+ *  - one less than the number of bytes examined when a multi-byte
+ *    sequence is rejected or S ends before it is complete.
+ */
+str_info
+utf8_get_info(str *s, UNUM n)
+{
+  UTF8_State state = UTF8_STATE_ACCEPT;
+  uint32_t code = 0;
+  NUM start = n;
+
+  /* error */
+  if (n >= s->size)
+    return (str_info) { -1, 0 };
+
+  /* Look at the leading byte on its own first. */
+  decode(&state, &code, s->data[n]);
+  n++;
+
+  if (state == UTF8_STATE_ACCEPT)
+    return (str_info) { code, 1 };
+
+  /* Either the leading byte is invalid, or it announces more bytes
+     than S has left. */
+  if (state == UTF8_STATE_REJECT || n >= s->size)
+    return (str_info) { -1, 1 };
+
+  /* Consume bytes while the decoder is in the middle of a sequence. */
+  while (n < s->size && state > UTF8_STATE_REJECT) {
+    decode(&state, &code, s->data[n]);
+    n++;
+  }
+
+  if (state == UTF8_STATE_ACCEPT)
+    return (str_info) { code, n - start };
+
+  return (str_info) { -1, n - start - 1 };
+}
+
+/* Create a utf8 string from the SIZE bytes at STRING.
+ *
+ * The object is built by new_str() and then given utf8_get_info as
+ * its getter, so that it is read as UTF-8.  The str pointer is
+ * returned cast to utf8 *.
+ *
+ * Returns NULL if new_str() returns NULL.
+ */
+utf8 *
+new_utf8(char *string, UNUM size)
+{
+  str *s = new_str(string, size);
+
+  /* new_str() is defined elsewhere; should it signal failure with a
+     null pointer, pass that on instead of dereferencing it. */
+  if (s == NULL)
+    return NULL;
+
+  s->getter = utf8_get_info;
+
+  return (utf8 *)s;
+}
+
+/* Encode CODE_POINT as UTF-8 into RESULT.
+ *
+ * result should be long enough to hold the data: on entry
+ * RESULT->size is taken as the number of bytes available in
+ * RESULT->data.  On success the one to four encoded bytes are stored
+ * in RESULT->data, RESULT->size is set to their number, and 0 is
+ * returned.
+ *
+ * Returns 1 without storing any byte when
+ *
+ *  - CODE_POINT is 0: RESULT->size is set to 0 and nothing is
+ *    reported;
+ *  - CODE_POINT is negative, a surrogate (U+D800..U+DFFF) or greater
+ *    than U+10FFFF: such values have no valid UTF-8 encoding and the
+ *    decoder in this file rejects the corresponding byte sequences;
+ *  - RESULT->size is smaller than the number of bytes needed.
+ *
+ * The last two cases are reported through eprintf.
+ */
+unsigned char
+encode(NUM code_point, str *result)
+{
+  UNUM length = 0;
+
+  if (code_point == 0) {
+    result->size = 0;
+    return 1;
+  }
+
+  /* Checked by value rather than by counting bits with shifts: a
+     negative CODE_POINT would never shift down to zero, and the
+     shift count would eventually reach the width of the type. */
+  if (code_point < 0 || code_point > 0x10ffff ||
+      (code_point >= 0xd800 && code_point <= 0xdfff)) {
+    eprintf("%s:%d, Invalid code point to encode\n",
+            __FILE__, __LINE__);
+    return 1;
+  }
+
+  /* 7, 11, 16 and 21 payload bits fit in 1, 2, 3 and 4 bytes. */
+  if (code_point <= 0x7f)
+    length = 1;
+  else if (code_point <= 0x7ff)
+    length = 2;
+  else if (code_point <= 0xffff)
+    length = 3;
+  else
+    length = 4;
+
+  if (result->size < length) {
+    eprintf("%s:%d, Result cannot hold the encoded value\n",
+            __FILE__, __LINE__);
+    return 1;
+  }
+
+  result->size = length;
+
+  switch (length) {
+  case 1:
+    result->data[0] = (char) code_point;
+    break;
+  case 2:
+    result->data[0] = (char) (0xc0 | (code_point >> 6));
+    result->data[1] = (char) (0x80 | (code_point & 0x3f));
+    break;
+  case 3:
+    result->data[0] = (char) (0xe0 | (code_point >> 12));
+    result->data[1] = (char) (0x80 | ((code_point >> 6) & 0x3f));
+    result->data[2] = (char) (0x80 | (code_point & 0x3f));
+    break;
+  default:
+    result->data[0] = (char) (0xf0 | (code_point >> 18));
+    result->data[1] = (char) (0x80 | ((code_point >> 12) & 0x3f));
+    result->data[2] = (char) (0x80 | ((code_point >> 6) & 0x3f));
+    result->data[3] = (char) (0x80 | (code_point & 0x3f));
+    break;
+  }
+
+  return 0;
+}