1 files changed, 150 insertions, 0 deletions
diff --git a/src/utf8.c b/src/utf8.c
new file mode 100644
index 0000000..f840600
--- /dev/null
+++ b/src/utf8.c
@@ -0,0 +1,150 @@
+#include "utf8.h"
+#include "str_partial.h"
+#include <stdint.h>
+#include <stdio.h>
+
+struct utf8_s { /* Wraps a single str; presumably the struct behind the `utf8` type of utf8.h (TODO confirm). */
+  str s;        /* the wrapped string; new_utf8() in this file returns a str * cast to utf8 * */
+};
+
+/* Lookup data for decode(): entries 0..255 map a byte to a class (0..11); the rest is the transition table. */
+/* The layout appears to match Bjoern Hoehrmann's "Flexible and Economical UTF-8 Decoder" (TODO confirm). */
+static const
+uint8_t utf8d[] = {
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* bytes 0x00..0x1f */
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* bytes 0x20..0x3f */
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* bytes 0x40..0x5f */
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* bytes 0x60..0x7f */
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, /* bytes 0x80..0x9f */
+  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, /* bytes 0xa0..0xbf */
+  8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* bytes 0xc0..0xdf */
+  10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, /* bytes 0xe0..0xff */
+  /* Transition part (index 256 on), read as utf8d[256 + state + class]: 12 entries per state, states are multiples of 12. */
+  0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12, /* states 0 and 12 */
+  12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, /* states 24 and 36 */
+  12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, /* states 48 and 60 */
+  12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, /* states 72 and 84 */
+  12,36,12,12,12,12,12,12,12,12,12,12, /* state 96 */
+};
+
+/* Advance the decoder by one BYTE: fold its payload bits into *CODEP, move
+ * *STATE to the next state taken from utf8d, and return that new state. */
+static UTF8_State
+decode(UTF8_State *state, uint32_t *codep, uint8_t byte)
+{
+  const uint32_t cls = utf8d[byte];     /* class of BYTE */
+  if (*state == UTF8_STATE_ACCEPT)      /* fresh start: mask BYTE by its class */
+    *codep = (0xff >> cls) & ((uint32_t) byte);
+  else                                  /* mid-sequence: append the low six bits */
+    *codep = (*codep << 6) | (byte & 0x3fu);
+  return *state = utf8d[256 + *state + cls];
+}
+
+/* Decode the UTF-8 sequence of S that begins at byte offset N.
+ * The result is a positional str_info { first, second }:
+ *   { code point, bytes decoded }  a complete sequence was accepted;
+ *   { -1, 0 }                      N is not below s->size;
+ *   { -1, 1 }                      the first byte is rejected, or it is
+ *                                  the last byte and more are needed;
+ *   { -1, bytes fed - 1 }          a later byte did not lead to the
+ *                                  accept state, or the data ran out.
+ */
+str_info
+utf8_get_info(str *s, UNUM n)
+{
+  NUM start = n;
+  UTF8_State st = UTF8_STATE_ACCEPT;
+  uint32_t cp = 0;
+
+  if (n >= s->size)
+    return (str_info) { -1, 0 };
+
+  decode(&st, &cp, s->data[n++]);
+  if (st == UTF8_STATE_ACCEPT)
+    return (str_info) { cp, 1 };
+  if (st == UTF8_STATE_REJECT || n >= s->size)
+    return (str_info) { -1, 1 };
+
+  /* Keep feeding bytes while data remains and the state is above UTF8_STATE_REJECT. */
+  while (n < s->size && st > UTF8_STATE_REJECT)
+    decode(&st, &cp, s->data[n++]);
+
+  if (st == UTF8_STATE_ACCEPT)
+    return (str_info) { cp, n - start };
+  return (str_info) { -1, n - start - 1 };
+}
+
+/* Build a str from STRING and SIZE with new_str(), install utf8_get_info as its getter, and return
+ * it cast to utf8 *.  NOTE(review): new_str()'s result is used unchecked; confirm it cannot be NULL. */
+utf8 *
+new_utf8(char *string, UNUM size)
+{
+  str *wrapped = new_str(string, size);
+  wrapped->getter = utf8_get_info;
+  return (utf8 *) wrapped;
+}
+  
+/* Encode CODE_POINT as UTF-8 into RESULT.
+ * On entry result->size must hold the number of bytes available in
+ * result->data; on success it is replaced by the number of bytes
+ * written (1 to 4).
+ * Returns 0 on success and 1 on failure.  Failures are:
+ *   - CODE_POINT is 0: result->size is set to 0 and nothing is
+ *     reported (existing behaviour, kept as is);
+ *   - CODE_POINT is negative, above 0x10ffff, or a surrogate
+ *     (0xd800..0xdfff): UTF-8 has no encoding for it (RFC 3629);
+ *   - RESULT is too small for the encoding.
+ * The last two kinds are reported through eprintf and leave RESULT
+ * untouched. */
+unsigned char
+encode(NUM code_point, str *result)
+{
+  unsigned int length;
+
+  if (code_point == 0) {
+    result->size = 0;
+    return 1;
+  }
+
+  /* Validate by range instead of counting bits with shifts: a negative
+   * value need not shift down to zero, so such a loop may never end. */
+  if (code_point < 0 || code_point > 0x10ffff
+      || (code_point >= 0xd800 && code_point <= 0xdfff)) {
+    eprintf("%s:%d, Invalid code point to encode\n", __FILE__, __LINE__);
+    return 1;
+  }
+
+  if (code_point < 0x80) length = 1;          /* at most  7 bits */
+  else if (code_point < 0x800) length = 2;    /* at most 11 bits */
+  else if (code_point < 0x10000) length = 3;  /* at most 16 bits */
+  else length = 4;                            /* at most 21 bits */
+
+  if (result->size < length) {
+    eprintf("%s:%d, Result cannot hold the encoded value\n", __FILE__, __LINE__);
+    return 1;
+  }
+
+  switch (length) {
+  case 1:
+    result->data[0] = (char) code_point;
+    break;
+  case 2:
+    result->data[0] = (char) (0xc0 | (code_point >> 6));
+    result->data[1] = (char) (0x80 | (code_point & 0x3f));
+    break;
+  case 3:
+    result->data[0] = (char) (0xe0 | (code_point >> 12));
+    result->data[1] = (char) (0x80 | ((code_point >> 6) & 0x3f));
+    result->data[2] = (char) (0x80 | (code_point & 0x3f));
+    break;
+  default: /* 4 */
+    result->data[0] = (char) (0xf0 | (code_point >> 18));
+    result->data[1] = (char) (0x80 | ((code_point >> 12) & 0x3f));
+    result->data[2] = (char) (0x80 | ((code_point >> 6) & 0x3f));
+    result->data[3] = (char) (0x80 | (code_point & 0x3f));
+    break;
+  }
+
+  result->size = length;
+  return 0;
+}