1*0a6a1f1dSLionel Sambuc /* $NetBSD: normalize.c,v 1.1.1.2 2014/04/24 12:45:56 pettai Exp $ */
2ebfedea0SLionel Sambuc
3ebfedea0SLionel Sambuc /*
4ebfedea0SLionel Sambuc * Copyright (c) 2004 Kungliga Tekniska Högskolan
5ebfedea0SLionel Sambuc * (Royal Institute of Technology, Stockholm, Sweden).
6ebfedea0SLionel Sambuc * All rights reserved.
7ebfedea0SLionel Sambuc *
8ebfedea0SLionel Sambuc * Redistribution and use in source and binary forms, with or without
9ebfedea0SLionel Sambuc * modification, are permitted provided that the following conditions
10ebfedea0SLionel Sambuc * are met:
11ebfedea0SLionel Sambuc *
12ebfedea0SLionel Sambuc * 1. Redistributions of source code must retain the above copyright
13ebfedea0SLionel Sambuc * notice, this list of conditions and the following disclaimer.
14ebfedea0SLionel Sambuc *
15ebfedea0SLionel Sambuc * 2. Redistributions in binary form must reproduce the above copyright
16ebfedea0SLionel Sambuc * notice, this list of conditions and the following disclaimer in the
17ebfedea0SLionel Sambuc * documentation and/or other materials provided with the distribution.
18ebfedea0SLionel Sambuc *
19ebfedea0SLionel Sambuc * 3. Neither the name of the Institute nor the names of its contributors
20ebfedea0SLionel Sambuc * may be used to endorse or promote products derived from this software
21ebfedea0SLionel Sambuc * without specific prior written permission.
22ebfedea0SLionel Sambuc *
23ebfedea0SLionel Sambuc * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND
24ebfedea0SLionel Sambuc * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25ebfedea0SLionel Sambuc * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26ebfedea0SLionel Sambuc * ARE DISCLAIMED. IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE
27ebfedea0SLionel Sambuc * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28ebfedea0SLionel Sambuc * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29ebfedea0SLionel Sambuc * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30ebfedea0SLionel Sambuc * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31ebfedea0SLionel Sambuc * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32ebfedea0SLionel Sambuc * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33ebfedea0SLionel Sambuc * SUCH DAMAGE.
34ebfedea0SLionel Sambuc */
35ebfedea0SLionel Sambuc
36ebfedea0SLionel Sambuc #ifdef HAVE_CONFIG_H
37ebfedea0SLionel Sambuc #include <config.h>
38ebfedea0SLionel Sambuc #endif
39ebfedea0SLionel Sambuc #include "windlocl.h"
40ebfedea0SLionel Sambuc
41ebfedea0SLionel Sambuc #include <assert.h>
42ebfedea0SLionel Sambuc #include <stdlib.h>
43ebfedea0SLionel Sambuc #include <errno.h>
44ebfedea0SLionel Sambuc #include <stdio.h>
45ebfedea0SLionel Sambuc
46ebfedea0SLionel Sambuc #include <krb5/roken.h>
47ebfedea0SLionel Sambuc
48ebfedea0SLionel Sambuc #include "normalize_table.h"
49ebfedea0SLionel Sambuc
50ebfedea0SLionel Sambuc static int
translation_cmp(const void * key,const void * data)51ebfedea0SLionel Sambuc translation_cmp(const void *key, const void *data)
52ebfedea0SLionel Sambuc {
53ebfedea0SLionel Sambuc const struct translation *t1 = (const struct translation *)key;
54ebfedea0SLionel Sambuc const struct translation *t2 = (const struct translation *)data;
55ebfedea0SLionel Sambuc
56ebfedea0SLionel Sambuc return t1->key - t2->key;
57ebfedea0SLionel Sambuc }
58ebfedea0SLionel Sambuc
59ebfedea0SLionel Sambuc enum { s_base = 0xAC00};
60ebfedea0SLionel Sambuc enum { s_count = 11172};
61ebfedea0SLionel Sambuc enum { l_base = 0x1100};
62ebfedea0SLionel Sambuc enum { l_count = 19};
63ebfedea0SLionel Sambuc enum { v_base = 0x1161};
64ebfedea0SLionel Sambuc enum { v_count = 21};
65ebfedea0SLionel Sambuc enum { t_base = 0x11A7};
66ebfedea0SLionel Sambuc enum { t_count = 28};
67ebfedea0SLionel Sambuc enum { n_count = v_count * t_count};
68ebfedea0SLionel Sambuc
69ebfedea0SLionel Sambuc static int
hangul_decomp(const uint32_t * in,size_t in_len,uint32_t * out,size_t * out_len)70ebfedea0SLionel Sambuc hangul_decomp(const uint32_t *in, size_t in_len,
71ebfedea0SLionel Sambuc uint32_t *out, size_t *out_len)
72ebfedea0SLionel Sambuc {
73ebfedea0SLionel Sambuc uint32_t u = *in;
74ebfedea0SLionel Sambuc unsigned s_index;
75ebfedea0SLionel Sambuc unsigned l, v, t;
76ebfedea0SLionel Sambuc unsigned o;
77ebfedea0SLionel Sambuc
78ebfedea0SLionel Sambuc if (u < s_base || u >= s_base + s_count)
79ebfedea0SLionel Sambuc return 0;
80ebfedea0SLionel Sambuc s_index = u - s_base;
81ebfedea0SLionel Sambuc l = l_base + s_index / n_count;
82ebfedea0SLionel Sambuc v = v_base + (s_index % n_count) / t_count;
83ebfedea0SLionel Sambuc t = t_base + s_index % t_count;
84ebfedea0SLionel Sambuc o = 2;
85ebfedea0SLionel Sambuc if (t != t_base)
86ebfedea0SLionel Sambuc ++o;
87ebfedea0SLionel Sambuc if (*out_len < o)
88ebfedea0SLionel Sambuc return WIND_ERR_OVERRUN;
89ebfedea0SLionel Sambuc out[0] = l;
90ebfedea0SLionel Sambuc out[1] = v;
91ebfedea0SLionel Sambuc if (t != t_base)
92ebfedea0SLionel Sambuc out[2] = t;
93ebfedea0SLionel Sambuc *out_len = o;
94ebfedea0SLionel Sambuc return 1;
95ebfedea0SLionel Sambuc }
96ebfedea0SLionel Sambuc
97ebfedea0SLionel Sambuc static uint32_t
hangul_composition(const uint32_t * in,size_t in_len)98ebfedea0SLionel Sambuc hangul_composition(const uint32_t *in, size_t in_len)
99ebfedea0SLionel Sambuc {
100ebfedea0SLionel Sambuc if (in_len < 2)
101ebfedea0SLionel Sambuc return 0;
102ebfedea0SLionel Sambuc if (in[0] >= l_base && in[0] < l_base + l_count) {
103ebfedea0SLionel Sambuc unsigned l_index = in[0] - l_base;
104ebfedea0SLionel Sambuc unsigned v_index;
105ebfedea0SLionel Sambuc
106ebfedea0SLionel Sambuc if (in[1] < v_base || in[1] >= v_base + v_count)
107ebfedea0SLionel Sambuc return 0;
108ebfedea0SLionel Sambuc v_index = in[1] - v_base;
109ebfedea0SLionel Sambuc return (l_index * v_count + v_index) * t_count + s_base;
110ebfedea0SLionel Sambuc } else if (in[0] >= s_base && in[0] < s_base + s_count) {
111ebfedea0SLionel Sambuc unsigned s_index = in[0] - s_base;
112ebfedea0SLionel Sambuc unsigned t_index;
113ebfedea0SLionel Sambuc
114ebfedea0SLionel Sambuc if (s_index % t_count != 0)
115ebfedea0SLionel Sambuc return 0;
116ebfedea0SLionel Sambuc if (in[1] < t_base || in[1] >= t_base + t_count)
117ebfedea0SLionel Sambuc return 0;
118ebfedea0SLionel Sambuc t_index = in[1] - t_base;
119ebfedea0SLionel Sambuc return in[0] + t_index;
120ebfedea0SLionel Sambuc }
121ebfedea0SLionel Sambuc return 0;
122ebfedea0SLionel Sambuc }
123ebfedea0SLionel Sambuc
124ebfedea0SLionel Sambuc static int
compat_decomp(const uint32_t * in,size_t in_len,uint32_t * out,size_t * out_len)125ebfedea0SLionel Sambuc compat_decomp(const uint32_t *in, size_t in_len,
126ebfedea0SLionel Sambuc uint32_t *out, size_t *out_len)
127ebfedea0SLionel Sambuc {
128ebfedea0SLionel Sambuc unsigned i;
129ebfedea0SLionel Sambuc unsigned o = 0;
130ebfedea0SLionel Sambuc
131ebfedea0SLionel Sambuc for (i = 0; i < in_len; ++i) {
132ebfedea0SLionel Sambuc struct translation ts = {in[i]};
133ebfedea0SLionel Sambuc size_t sub_len = *out_len - o;
134ebfedea0SLionel Sambuc int ret;
135ebfedea0SLionel Sambuc
136ebfedea0SLionel Sambuc ret = hangul_decomp(in + i, in_len - i,
137ebfedea0SLionel Sambuc out + o, &sub_len);
138ebfedea0SLionel Sambuc if (ret) {
139ebfedea0SLionel Sambuc if (ret == WIND_ERR_OVERRUN)
140ebfedea0SLionel Sambuc return ret;
141ebfedea0SLionel Sambuc o += sub_len;
142ebfedea0SLionel Sambuc } else {
143ebfedea0SLionel Sambuc void *s = bsearch(&ts,
144ebfedea0SLionel Sambuc _wind_normalize_table,
145ebfedea0SLionel Sambuc _wind_normalize_table_size,
146ebfedea0SLionel Sambuc sizeof(_wind_normalize_table[0]),
147ebfedea0SLionel Sambuc translation_cmp);
148ebfedea0SLionel Sambuc if (s != NULL) {
149ebfedea0SLionel Sambuc const struct translation *t = (const struct translation *)s;
150ebfedea0SLionel Sambuc
151ebfedea0SLionel Sambuc ret = compat_decomp(_wind_normalize_val_table + t->val_offset,
152ebfedea0SLionel Sambuc t->val_len,
153ebfedea0SLionel Sambuc out + o, &sub_len);
154ebfedea0SLionel Sambuc if (ret)
155ebfedea0SLionel Sambuc return ret;
156ebfedea0SLionel Sambuc o += sub_len;
157ebfedea0SLionel Sambuc } else {
158ebfedea0SLionel Sambuc if (o >= *out_len)
159ebfedea0SLionel Sambuc return WIND_ERR_OVERRUN;
160ebfedea0SLionel Sambuc out[o++] = in[i];
161ebfedea0SLionel Sambuc
162ebfedea0SLionel Sambuc }
163ebfedea0SLionel Sambuc }
164ebfedea0SLionel Sambuc }
165ebfedea0SLionel Sambuc *out_len = o;
166ebfedea0SLionel Sambuc return 0;
167ebfedea0SLionel Sambuc }
168ebfedea0SLionel Sambuc
169ebfedea0SLionel Sambuc static void
swap_char(uint32_t * a,uint32_t * b)170ebfedea0SLionel Sambuc swap_char(uint32_t * a, uint32_t * b)
171ebfedea0SLionel Sambuc {
172ebfedea0SLionel Sambuc uint32_t t;
173ebfedea0SLionel Sambuc t = *a;
174ebfedea0SLionel Sambuc *a = *b;
175ebfedea0SLionel Sambuc *b = t;
176ebfedea0SLionel Sambuc }
177ebfedea0SLionel Sambuc
178ebfedea0SLionel Sambuc /* Unicode 5.2.0 D109 Canonical Ordering for a sequence of code points
179ebfedea0SLionel Sambuc * that all have Canonical_Combining_Class > 0 */
180ebfedea0SLionel Sambuc static void
canonical_reorder_sequence(uint32_t * a,size_t len)181ebfedea0SLionel Sambuc canonical_reorder_sequence(uint32_t * a, size_t len)
182ebfedea0SLionel Sambuc {
183ebfedea0SLionel Sambuc size_t i, j;
184ebfedea0SLionel Sambuc
185ebfedea0SLionel Sambuc if (len <= 1)
186ebfedea0SLionel Sambuc return;
187ebfedea0SLionel Sambuc
188ebfedea0SLionel Sambuc for (i = 1; i < len; i++) {
189ebfedea0SLionel Sambuc for (j = i;
190ebfedea0SLionel Sambuc j > 0 &&
191ebfedea0SLionel Sambuc _wind_combining_class(a[j]) < _wind_combining_class(a[j-1]);
192ebfedea0SLionel Sambuc j--)
193ebfedea0SLionel Sambuc swap_char(&a[j], &a[j-1]);
194ebfedea0SLionel Sambuc }
195ebfedea0SLionel Sambuc }
196ebfedea0SLionel Sambuc
197ebfedea0SLionel Sambuc static void
canonical_reorder(uint32_t * tmp,size_t tmp_len)198ebfedea0SLionel Sambuc canonical_reorder(uint32_t *tmp, size_t tmp_len)
199ebfedea0SLionel Sambuc {
200ebfedea0SLionel Sambuc size_t i;
201ebfedea0SLionel Sambuc
202ebfedea0SLionel Sambuc for (i = 0; i < tmp_len; ++i) {
203ebfedea0SLionel Sambuc int cc = _wind_combining_class(tmp[i]);
204ebfedea0SLionel Sambuc if (cc) {
205ebfedea0SLionel Sambuc size_t j;
206ebfedea0SLionel Sambuc for (j = i + 1;
207ebfedea0SLionel Sambuc j < tmp_len && _wind_combining_class(tmp[j]);
208ebfedea0SLionel Sambuc ++j)
209ebfedea0SLionel Sambuc ;
210ebfedea0SLionel Sambuc canonical_reorder_sequence(&tmp[i], j - i);
211ebfedea0SLionel Sambuc i = j;
212ebfedea0SLionel Sambuc }
213ebfedea0SLionel Sambuc }
214ebfedea0SLionel Sambuc }
215ebfedea0SLionel Sambuc
216ebfedea0SLionel Sambuc static uint32_t
find_composition(const uint32_t * in,unsigned in_len)217ebfedea0SLionel Sambuc find_composition(const uint32_t *in, unsigned in_len)
218ebfedea0SLionel Sambuc {
219ebfedea0SLionel Sambuc unsigned short canon_index = 0;
220ebfedea0SLionel Sambuc uint32_t cur;
221ebfedea0SLionel Sambuc unsigned n = 0;
222ebfedea0SLionel Sambuc
223ebfedea0SLionel Sambuc cur = hangul_composition(in, in_len);
224ebfedea0SLionel Sambuc if (cur)
225ebfedea0SLionel Sambuc return cur;
226ebfedea0SLionel Sambuc
227ebfedea0SLionel Sambuc do {
228ebfedea0SLionel Sambuc const struct canon_node *c = &_wind_canon_table[canon_index];
229ebfedea0SLionel Sambuc unsigned i;
230ebfedea0SLionel Sambuc
231ebfedea0SLionel Sambuc if (n % 5 == 0) {
232ebfedea0SLionel Sambuc cur = *in++;
233ebfedea0SLionel Sambuc if (in_len-- == 0)
234ebfedea0SLionel Sambuc return c->val;
235ebfedea0SLionel Sambuc }
236ebfedea0SLionel Sambuc
237ebfedea0SLionel Sambuc i = cur >> 16;
238ebfedea0SLionel Sambuc if (i < c->next_start || i >= c->next_end)
239ebfedea0SLionel Sambuc canon_index = 0;
240ebfedea0SLionel Sambuc else
241ebfedea0SLionel Sambuc canon_index =
242ebfedea0SLionel Sambuc _wind_canon_next_table[c->next_offset + i - c->next_start];
243ebfedea0SLionel Sambuc if (canon_index != 0) {
244ebfedea0SLionel Sambuc cur = (cur << 4) & 0xFFFFF;
245ebfedea0SLionel Sambuc ++n;
246ebfedea0SLionel Sambuc }
247ebfedea0SLionel Sambuc } while (canon_index != 0);
248ebfedea0SLionel Sambuc return 0;
249ebfedea0SLionel Sambuc }
250ebfedea0SLionel Sambuc
251ebfedea0SLionel Sambuc static int
combine(const uint32_t * in,size_t in_len,uint32_t * out,size_t * out_len)252ebfedea0SLionel Sambuc combine(const uint32_t *in, size_t in_len,
253ebfedea0SLionel Sambuc uint32_t *out, size_t *out_len)
254ebfedea0SLionel Sambuc {
255ebfedea0SLionel Sambuc unsigned i;
256ebfedea0SLionel Sambuc int ostarter;
257ebfedea0SLionel Sambuc unsigned o = 0;
258ebfedea0SLionel Sambuc int old_cc;
259ebfedea0SLionel Sambuc
260ebfedea0SLionel Sambuc for (i = 0; i < in_len;) {
261ebfedea0SLionel Sambuc while (i < in_len && _wind_combining_class(in[i]) != 0) {
262ebfedea0SLionel Sambuc out[o++] = in[i++];
263ebfedea0SLionel Sambuc }
264ebfedea0SLionel Sambuc if (i < in_len) {
265ebfedea0SLionel Sambuc if (o >= *out_len)
266ebfedea0SLionel Sambuc return WIND_ERR_OVERRUN;
267ebfedea0SLionel Sambuc ostarter = o;
268ebfedea0SLionel Sambuc out[o++] = in[i++];
269ebfedea0SLionel Sambuc old_cc = -1;
270ebfedea0SLionel Sambuc
271ebfedea0SLionel Sambuc while (i < in_len) {
272ebfedea0SLionel Sambuc uint32_t comb;
273ebfedea0SLionel Sambuc uint32_t v[2];
274ebfedea0SLionel Sambuc int cc;
275ebfedea0SLionel Sambuc
276ebfedea0SLionel Sambuc v[0] = out[ostarter];
277ebfedea0SLionel Sambuc v[1] = in[i];
278ebfedea0SLionel Sambuc
279ebfedea0SLionel Sambuc cc = _wind_combining_class(in[i]);
280ebfedea0SLionel Sambuc if (old_cc != cc && (comb = find_composition(v, 2))) {
281ebfedea0SLionel Sambuc out[ostarter] = comb;
282ebfedea0SLionel Sambuc } else if (cc == 0) {
283ebfedea0SLionel Sambuc break;
284ebfedea0SLionel Sambuc } else {
285ebfedea0SLionel Sambuc if (o >= *out_len)
286ebfedea0SLionel Sambuc return WIND_ERR_OVERRUN;
287ebfedea0SLionel Sambuc out[o++] = in[i];
288ebfedea0SLionel Sambuc old_cc = cc;
289ebfedea0SLionel Sambuc }
290ebfedea0SLionel Sambuc ++i;
291ebfedea0SLionel Sambuc }
292ebfedea0SLionel Sambuc }
293ebfedea0SLionel Sambuc }
294ebfedea0SLionel Sambuc *out_len = o;
295ebfedea0SLionel Sambuc return 0;
296ebfedea0SLionel Sambuc }
297ebfedea0SLionel Sambuc
298ebfedea0SLionel Sambuc int
_wind_stringprep_normalize(const uint32_t * in,size_t in_len,uint32_t * out,size_t * out_len)299ebfedea0SLionel Sambuc _wind_stringprep_normalize(const uint32_t *in, size_t in_len,
300ebfedea0SLionel Sambuc uint32_t *out, size_t *out_len)
301ebfedea0SLionel Sambuc {
302ebfedea0SLionel Sambuc size_t tmp_len;
303ebfedea0SLionel Sambuc uint32_t *tmp;
304ebfedea0SLionel Sambuc int ret;
305ebfedea0SLionel Sambuc
306ebfedea0SLionel Sambuc if (in_len == 0) {
307ebfedea0SLionel Sambuc *out_len = 0;
308ebfedea0SLionel Sambuc return 0;
309ebfedea0SLionel Sambuc }
310ebfedea0SLionel Sambuc
311ebfedea0SLionel Sambuc tmp_len = in_len * 4;
312ebfedea0SLionel Sambuc if (tmp_len < MAX_LENGTH_CANON)
313ebfedea0SLionel Sambuc tmp_len = MAX_LENGTH_CANON;
314ebfedea0SLionel Sambuc tmp = malloc(tmp_len * sizeof(uint32_t));
315ebfedea0SLionel Sambuc if (tmp == NULL)
316ebfedea0SLionel Sambuc return ENOMEM;
317ebfedea0SLionel Sambuc
318ebfedea0SLionel Sambuc ret = compat_decomp(in, in_len, tmp, &tmp_len);
319ebfedea0SLionel Sambuc if (ret) {
320ebfedea0SLionel Sambuc free(tmp);
321ebfedea0SLionel Sambuc return ret;
322ebfedea0SLionel Sambuc }
323ebfedea0SLionel Sambuc canonical_reorder(tmp, tmp_len);
324ebfedea0SLionel Sambuc ret = combine(tmp, tmp_len, out, out_len);
325ebfedea0SLionel Sambuc free(tmp);
326ebfedea0SLionel Sambuc return ret;
327ebfedea0SLionel Sambuc }
328