src/utf8.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150

#include "utf8.h"
#include "str_partial.h"
#include <stdint.h>
#include <stdio.h>

/*
 * A UTF-8 string: a plain `str' wrapped in a struct so it has a
 * distinct type.  The `str' is the only (and therefore first) member,
 * which is what lets new_utf8() hand out a `str *' cast to `utf8 *'.
 * NOTE(review): presumably `utf8' is a typedef of this struct declared
 * in utf8.h -- confirm.
 */
struct utf8_s {
  str s;
};

/* The classification and the transition table.
 *
 * utf8d[] holds two tables back to back, both consumed by decode():
 *
 *   - entries 0..255 map an input byte to its character class
 *     (a number in 0..11), read as utf8d[byte];
 *
 *   - the remaining entries are the state transition table, read as
 *     utf8d[256 + state + class].  Each state owns 12 consecutive
 *     entries (one per class), so every state value stored here is a
 *     multiple of 12.
 *
 * NOTE(review): the values look like the table from Bjoern Hoehrmann's
 * "Flexible and Economical UTF-8 Decoder" -- confirm before editing
 * any entry by hand.
 */

static const
uint8_t utf8d[] = {
  /* Byte classes.  Rows are 32 bytes wide: the first four rows cover
     0x00..0x7f (all class 0), then 0x80..0x9f, 0xa0..0xbf,
     0xc0..0xdf and 0xe0..0xff. */
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
  8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,

  /* State transitions: 12 entries per state, two states per full
     row; the entry at offset (state + class) is the next state. */
  0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
  12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
  12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
  12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
  12,36,12,12,12,12,12,12,12,12,12,12,
};

/*
 * Feed one BYTE to the decoding automaton.
 *
 * STATE is the automaton state before the byte and is overwritten
 * with the state after it; the new state is also returned.  CODEP
 * accumulates the code point being decoded: when STATE is
 * UTF8_STATE_ACCEPT on entry the byte starts a new sequence and
 * *CODEP is replaced, otherwise six more payload bits are appended
 * to the value already in *CODEP.
 */
static UTF8_State
decode(UTF8_State *state, uint32_t *codep, uint8_t byte)
{
  const uint32_t klass = utf8d[byte];

  if (*state == UTF8_STATE_ACCEPT) {
    /* First byte of a sequence: the class doubles as a shift count
       that masks off the length-prefix bits of the byte. */
    *codep = (0xff >> klass) & ((uint32_t) byte);
  } else {
    /* Inside a sequence: keep the low six bits of the byte. */
    *codep = (byte & 0x3fu) | (*codep << 6);
  }

  /* The transition table starts right after the 256 class entries. */
  *state = utf8d[256 + *state + klass];

  return *state;
}

/*
 * Decode the UTF-8 sequence of S that starts at byte offset N.
 *
 * The returned str_info is built from two values: the first is the
 * decoded code point, or -1 on failure; the second is a byte count.
 *
 *   - N at or past the end of S:               { -1, 0 }
 *   - the first byte alone is a code point:    { code, 1 }
 *   - the first byte is rejected:              { -1, 1 }
 *   - the first byte needs more, none left:    { -1, 1 }
 *   - a multi-byte sequence is accepted:       { code, bytes read }
 *   - anything else (a later byte is rejected, or S ends in the
 *     middle of the sequence):                 { -1, bytes read - 1 }
 */
str_info
utf8_get_info(const str * const restrict s, UNUM n)
{
  UTF8_State dfa = UTF8_STATE_ACCEPT;
  uint32_t cp = 0;
  NUM start = n;

  /* error */
  if (n >= s->size) {
    return (str_info) { -1, 0 };
  }

  /* The first byte settles every one-byte outcome. */
  decode(&dfa, &cp, s->data[n]);
  n++;

  if (dfa == UTF8_STATE_ACCEPT) {
    return (str_info) { cp, 1 };
  }

  if (dfa == UTF8_STATE_REJECT || n >= s->size) {
    return (str_info) { -1, 1 };
  }

  /* Keep feeding bytes while the automaton is mid-sequence. */
  while (n < s->size && dfa > UTF8_STATE_REJECT) {
    decode(&dfa, &cp, s->data[n]);
    n++;
  }

  if (dfa == UTF8_STATE_ACCEPT) {
    return (str_info) { cp, n - start };
  }

  return (str_info) { -1, n - start - 1};
}

/*
 * Build a UTF-8 string from STRING and SIZE.
 *
 * The arguments are passed unchanged to new_str(); the resulting
 * str gets utf8_get_info installed as its getter and is returned
 * cast to `utf8 *'.
 *
 * Returns NULL if new_str() returns NULL.
 * NOTE(review): assumes new_str() signals failure with NULL --
 * confirm against its definition.
 */
utf8 *
new_utf8(char *string, UNUM size)
{
  str *s = new_str(string, size);

  /* Never write through a failed allocation. */
  if (s == NULL) {
    return NULL;
  }

  s->getter = utf8_get_info;

  return (utf8 *)s;
}
  
/*
 * Encode CODE_POINT as UTF-8 into RESULT.
 *
 * RESULT should be long enough to hold the data: on entry
 * RESULT->size is taken as the number of bytes that may be written
 * to RESULT->data.  On success RESULT->size is overwritten with the
 * number of bytes written (1 to 4) and 0 is returned.
 *
 * Returns 1, without writing any byte to RESULT->data, when:
 *   - CODE_POINT is 0 (RESULT->size is set to 0, nothing is logged);
 *   - CODE_POINT cannot be encoded: it is negative, a surrogate
 *     (U+D800..U+DFFF), or above U+10FFFF -- the decoder in this
 *     file rejects the byte sequences such values would produce;
 *   - RESULT->size is too small for the encoding.
 * The last two cases log through eprintf and leave RESULT->size
 * untouched.
 */
BOOL
encode(NUM code_point, str *result)
{
  if (code_point == 0) {
    result->size = 0;
    return 1;
  }

  /* Reject negatives before any shifting: right-shifting a negative
     value never reaches zero, so counting its bits cannot terminate
     in a defined way. */
  if (code_point < 0 || code_point > 0x10ffff ||
      (code_point >= 0xd800 && code_point <= 0xdfff)) {
    eprintf("%s:%d, Invalid code point to encode\n",
            __FILE__, __LINE__);
    return 1;
  }

  if (code_point < 0x80) {
    /* up to 7 bits: 0xxxxxxx */
    if (result->size < 1) {
      eprintf("%s:%d, Result cannot hold the encoded value\n",
              __FILE__, __LINE__);
      return 1;
    }

    result->size = 1;
    *(result->data) = (char) code_point;
  } else if (code_point < 0x800) {
    /* up to 11 bits: 110xxxxx 10xxxxxx */
    if (result->size < 2) {
      eprintf("%s:%d, Result cannot hold the encoded value\n",
              __FILE__, __LINE__);
      return 1;
    }

    result->size = 2;
    *(result->data) = (char) (0xc0 | (code_point >> 6));
    *(result->data+1) = (char) (0x80 | (code_point & 0x3f));
  } else if (code_point < 0x10000) {
    /* up to 16 bits: 1110xxxx 10xxxxxx 10xxxxxx */
    if (result->size < 3) {
      eprintf("%s:%d, Result cannot hold the encoded value\n",
              __FILE__, __LINE__);
      return 1;
    }

    result->size = 3;
    *(result->data) = (char) (0xe0 | (code_point >> 12));
    *(result->data+1) = (char) (0x80 | ((code_point >> 6) & 0x3f));
    *(result->data+2) = (char) (0x80 | (code_point & 0x3f));
  } else {
    /* up to U+10FFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
    if (result->size < 4) {
      eprintf("%s:%d, Result cannot hold the encoded value\n",
              __FILE__, __LINE__);
      return 1;
    }

    result->size = 4;
    *(result->data) = (char) (0xf0 | (code_point >> 18));
    *(result->data+1) = (char) (0x80 | ((code_point >> 12) & 0x3f));
    *(result->data+2) = (char) (0x80 | ((code_point >> 6) & 0x3f));
    *(result->data+3) = (char) (0x80 | (code_point & 0x3f));
  }

  return 0;
}