xref: /netbsd-src/crypto/external/bsd/heimdal/dist/lib/wind/normalize.c (revision afab4e300d3a9fb07dd8c80daf53d0feb3345706)
1*afab4e30Schristos /*	$NetBSD: normalize.c,v 1.3 2023/06/19 21:41:45 christos Exp $	*/
2ca1c9b0cSelric 
3ca1c9b0cSelric /*
4ca1c9b0cSelric  * Copyright (c) 2004 Kungliga Tekniska Högskolan
5ca1c9b0cSelric  * (Royal Institute of Technology, Stockholm, Sweden).
6ca1c9b0cSelric  * All rights reserved.
7ca1c9b0cSelric  *
8ca1c9b0cSelric  * Redistribution and use in source and binary forms, with or without
9ca1c9b0cSelric  * modification, are permitted provided that the following conditions
10ca1c9b0cSelric  * are met:
11ca1c9b0cSelric  *
12ca1c9b0cSelric  * 1. Redistributions of source code must retain the above copyright
13ca1c9b0cSelric  *    notice, this list of conditions and the following disclaimer.
14ca1c9b0cSelric  *
15ca1c9b0cSelric  * 2. Redistributions in binary form must reproduce the above copyright
16ca1c9b0cSelric  *    notice, this list of conditions and the following disclaimer in the
17ca1c9b0cSelric  *    documentation and/or other materials provided with the distribution.
18ca1c9b0cSelric  *
19ca1c9b0cSelric  * 3. Neither the name of the Institute nor the names of its contributors
20ca1c9b0cSelric  *    may be used to endorse or promote products derived from this software
21ca1c9b0cSelric  *    without specific prior written permission.
22ca1c9b0cSelric  *
23ca1c9b0cSelric  * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND
24ca1c9b0cSelric  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25ca1c9b0cSelric  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26ca1c9b0cSelric  * ARE DISCLAIMED.  IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE
27ca1c9b0cSelric  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28ca1c9b0cSelric  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29ca1c9b0cSelric  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30ca1c9b0cSelric  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31ca1c9b0cSelric  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32ca1c9b0cSelric  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33ca1c9b0cSelric  * SUCH DAMAGE.
34ca1c9b0cSelric  */
35ca1c9b0cSelric 
36ca1c9b0cSelric #ifdef HAVE_CONFIG_H
37ca1c9b0cSelric #include <config.h>
38ca1c9b0cSelric #endif
39ca1c9b0cSelric #include "windlocl.h"
40ca1c9b0cSelric 
41ca1c9b0cSelric #include <assert.h>
42ca1c9b0cSelric #include <stdlib.h>
43ca1c9b0cSelric #include <errno.h>
44ca1c9b0cSelric #include <stdio.h>
45ca1c9b0cSelric 
46ca1c9b0cSelric #include <krb5/roken.h>
47ca1c9b0cSelric 
48ca1c9b0cSelric #include "normalize_table.h"
49ca1c9b0cSelric 
50ca1c9b0cSelric static int
translation_cmp(const void * key,const void * data)51ca1c9b0cSelric translation_cmp(const void *key, const void *data)
52ca1c9b0cSelric {
53ca1c9b0cSelric     const struct translation *t1 = (const struct translation *)key;
54ca1c9b0cSelric     const struct translation *t2 = (const struct translation *)data;
55ca1c9b0cSelric 
56ca1c9b0cSelric     return t1->key - t2->key;
57ca1c9b0cSelric }
58ca1c9b0cSelric 
59ca1c9b0cSelric enum { s_base  = 0xAC00};
60ca1c9b0cSelric enum { s_count = 11172};
61ca1c9b0cSelric enum { l_base  = 0x1100};
62ca1c9b0cSelric enum { l_count = 19};
63ca1c9b0cSelric enum { v_base  = 0x1161};
64ca1c9b0cSelric enum { v_count = 21};
65ca1c9b0cSelric enum { t_base  = 0x11A7};
66ca1c9b0cSelric enum { t_count = 28};
67ca1c9b0cSelric enum { n_count = v_count * t_count};
68ca1c9b0cSelric 
69ca1c9b0cSelric static int
hangul_decomp(const uint32_t * in,size_t in_len,uint32_t * out,size_t * out_len)70ca1c9b0cSelric hangul_decomp(const uint32_t *in, size_t in_len,
71ca1c9b0cSelric 	      uint32_t *out, size_t *out_len)
72ca1c9b0cSelric {
73ca1c9b0cSelric     uint32_t u = *in;
74ca1c9b0cSelric     unsigned s_index;
75ca1c9b0cSelric     unsigned l, v, t;
76ca1c9b0cSelric     unsigned o;
77ca1c9b0cSelric 
78ca1c9b0cSelric     if (u < s_base || u >= s_base + s_count)
79ca1c9b0cSelric 	return 0;
80ca1c9b0cSelric     s_index = u - s_base;
81ca1c9b0cSelric     l = l_base + s_index / n_count;
82ca1c9b0cSelric     v = v_base + (s_index % n_count) / t_count;
83ca1c9b0cSelric     t = t_base + s_index % t_count;
84ca1c9b0cSelric     o = 2;
85ca1c9b0cSelric     if (t != t_base)
86ca1c9b0cSelric 	++o;
87ca1c9b0cSelric     if (*out_len < o)
88ca1c9b0cSelric 	return WIND_ERR_OVERRUN;
89ca1c9b0cSelric     out[0] = l;
90ca1c9b0cSelric     out[1] = v;
91ca1c9b0cSelric     if (t != t_base)
92ca1c9b0cSelric 	out[2] = t;
93ca1c9b0cSelric     *out_len = o;
94ca1c9b0cSelric     return 1;
95ca1c9b0cSelric }
96ca1c9b0cSelric 
97ca1c9b0cSelric static uint32_t
hangul_composition(const uint32_t * in,size_t in_len)98ca1c9b0cSelric hangul_composition(const uint32_t *in, size_t in_len)
99ca1c9b0cSelric {
100ca1c9b0cSelric     if (in_len < 2)
101ca1c9b0cSelric 	return 0;
102ca1c9b0cSelric     if (in[0] >= l_base && in[0] < l_base + l_count) {
103ca1c9b0cSelric 	unsigned l_index = in[0] - l_base;
104ca1c9b0cSelric 	unsigned v_index;
105ca1c9b0cSelric 
106ca1c9b0cSelric 	if (in[1] < v_base || in[1] >= v_base + v_count)
107ca1c9b0cSelric 	    return 0;
108ca1c9b0cSelric 	v_index = in[1] - v_base;
109ca1c9b0cSelric 	return (l_index * v_count + v_index) * t_count + s_base;
110ca1c9b0cSelric     } else if (in[0] >= s_base && in[0] < s_base + s_count) {
111ca1c9b0cSelric 	unsigned s_index = in[0] - s_base;
112ca1c9b0cSelric 	unsigned t_index;
113ca1c9b0cSelric 
114ca1c9b0cSelric 	if (s_index % t_count != 0)
115ca1c9b0cSelric 	    return 0;
116ca1c9b0cSelric 	if (in[1] < t_base || in[1] >= t_base + t_count)
117ca1c9b0cSelric 	    return 0;
118ca1c9b0cSelric 	t_index = in[1] - t_base;
119ca1c9b0cSelric 	return in[0] + t_index;
120ca1c9b0cSelric     }
121ca1c9b0cSelric     return 0;
122ca1c9b0cSelric }
123ca1c9b0cSelric 
124ca1c9b0cSelric static int
compat_decomp(const uint32_t * in,size_t in_len,uint32_t * out,size_t * out_len)125ca1c9b0cSelric compat_decomp(const uint32_t *in, size_t in_len,
126ca1c9b0cSelric 	      uint32_t *out, size_t *out_len)
127ca1c9b0cSelric {
128ca1c9b0cSelric     unsigned i;
129ca1c9b0cSelric     unsigned o = 0;
130ca1c9b0cSelric 
131ca1c9b0cSelric     for (i = 0; i < in_len; ++i) {
132b9d004c6Schristos 	struct translation ts = {in[i], 0, 0};
133ca1c9b0cSelric 	size_t sub_len = *out_len - o;
134ca1c9b0cSelric 	int ret;
135ca1c9b0cSelric 
136ca1c9b0cSelric 	ret = hangul_decomp(in + i, in_len - i,
137ca1c9b0cSelric 			    out + o, &sub_len);
138ca1c9b0cSelric 	if (ret) {
139ca1c9b0cSelric 	    if (ret == WIND_ERR_OVERRUN)
140ca1c9b0cSelric 		return ret;
141ca1c9b0cSelric 	    o += sub_len;
142ca1c9b0cSelric 	} else {
143ca1c9b0cSelric 	    void *s = bsearch(&ts,
144ca1c9b0cSelric 			      _wind_normalize_table,
145ca1c9b0cSelric 			      _wind_normalize_table_size,
146ca1c9b0cSelric 			      sizeof(_wind_normalize_table[0]),
147ca1c9b0cSelric 			      translation_cmp);
148ca1c9b0cSelric 	    if (s != NULL) {
149ca1c9b0cSelric 		const struct translation *t = (const struct translation *)s;
150ca1c9b0cSelric 
151ca1c9b0cSelric 		ret = compat_decomp(_wind_normalize_val_table + t->val_offset,
152ca1c9b0cSelric 				    t->val_len,
153ca1c9b0cSelric 				    out + o, &sub_len);
154ca1c9b0cSelric 		if (ret)
155ca1c9b0cSelric 		    return ret;
156ca1c9b0cSelric 		o += sub_len;
157ca1c9b0cSelric 	    } else {
158ca1c9b0cSelric 		if (o >= *out_len)
159ca1c9b0cSelric 		    return WIND_ERR_OVERRUN;
160ca1c9b0cSelric 		out[o++] = in[i];
161ca1c9b0cSelric 
162ca1c9b0cSelric 	    }
163ca1c9b0cSelric 	}
164ca1c9b0cSelric     }
165ca1c9b0cSelric     *out_len = o;
166ca1c9b0cSelric     return 0;
167ca1c9b0cSelric }
168ca1c9b0cSelric 
169ca1c9b0cSelric static void
swap_char(uint32_t * a,uint32_t * b)170ca1c9b0cSelric swap_char(uint32_t * a, uint32_t * b)
171ca1c9b0cSelric {
172ca1c9b0cSelric     uint32_t t;
173ca1c9b0cSelric     t = *a;
174ca1c9b0cSelric     *a = *b;
175ca1c9b0cSelric     *b = t;
176ca1c9b0cSelric }
177ca1c9b0cSelric 
178ca1c9b0cSelric /* Unicode 5.2.0 D109 Canonical Ordering for a sequence of code points
179ca1c9b0cSelric  * that all have Canonical_Combining_Class > 0 */
180ca1c9b0cSelric static void
canonical_reorder_sequence(uint32_t * a,size_t len)181ca1c9b0cSelric canonical_reorder_sequence(uint32_t * a, size_t len)
182ca1c9b0cSelric {
183ca1c9b0cSelric     size_t i, j;
184ca1c9b0cSelric 
185ca1c9b0cSelric     if (len <= 1)
186ca1c9b0cSelric 	return;
187ca1c9b0cSelric 
188ca1c9b0cSelric     for (i = 1; i < len; i++) {
189ca1c9b0cSelric 	for (j = i;
190ca1c9b0cSelric 	     j > 0 &&
191ca1c9b0cSelric 		 _wind_combining_class(a[j]) < _wind_combining_class(a[j-1]);
192ca1c9b0cSelric 	     j--)
193ca1c9b0cSelric 	    swap_char(&a[j], &a[j-1]);
194ca1c9b0cSelric     }
195ca1c9b0cSelric }
196ca1c9b0cSelric 
197ca1c9b0cSelric static void
canonical_reorder(uint32_t * tmp,size_t tmp_len)198ca1c9b0cSelric canonical_reorder(uint32_t *tmp, size_t tmp_len)
199ca1c9b0cSelric {
200ca1c9b0cSelric     size_t i;
201ca1c9b0cSelric 
202ca1c9b0cSelric     for (i = 0; i < tmp_len; ++i) {
203ca1c9b0cSelric 	int cc = _wind_combining_class(tmp[i]);
204ca1c9b0cSelric 	if (cc) {
205ca1c9b0cSelric 	    size_t j;
206ca1c9b0cSelric 	    for (j = i + 1;
207ca1c9b0cSelric 		 j < tmp_len && _wind_combining_class(tmp[j]);
208ca1c9b0cSelric 		 ++j)
209ca1c9b0cSelric 		;
210ca1c9b0cSelric 	    canonical_reorder_sequence(&tmp[i], j - i);
211ca1c9b0cSelric 	    i = j;
212ca1c9b0cSelric 	}
213ca1c9b0cSelric     }
214ca1c9b0cSelric }
215ca1c9b0cSelric 
216ca1c9b0cSelric static uint32_t
find_composition(const uint32_t * in,unsigned in_len)217ca1c9b0cSelric find_composition(const uint32_t *in, unsigned in_len)
218ca1c9b0cSelric {
219ca1c9b0cSelric     unsigned short canon_index = 0;
220ca1c9b0cSelric     uint32_t cur;
221ca1c9b0cSelric     unsigned n = 0;
222ca1c9b0cSelric 
223ca1c9b0cSelric     cur = hangul_composition(in, in_len);
224ca1c9b0cSelric     if (cur)
225ca1c9b0cSelric 	return cur;
226ca1c9b0cSelric 
227ca1c9b0cSelric     do {
228ca1c9b0cSelric 	const struct canon_node *c = &_wind_canon_table[canon_index];
229ca1c9b0cSelric 	unsigned i;
230ca1c9b0cSelric 
231ca1c9b0cSelric 	if (n % 5 == 0) {
232ca1c9b0cSelric 	    if (in_len-- == 0)
233ca1c9b0cSelric 		return c->val;
234*afab4e30Schristos 	    cur = *in++;
235ca1c9b0cSelric 	}
236ca1c9b0cSelric 
237ca1c9b0cSelric 	i = cur >> 16;
238ca1c9b0cSelric 	if (i < c->next_start || i >= c->next_end)
239ca1c9b0cSelric 	    canon_index = 0;
240ca1c9b0cSelric 	else
241ca1c9b0cSelric 	    canon_index =
242ca1c9b0cSelric 		_wind_canon_next_table[c->next_offset + i - c->next_start];
243ca1c9b0cSelric 	if (canon_index != 0) {
244ca1c9b0cSelric 	    cur = (cur << 4) & 0xFFFFF;
245ca1c9b0cSelric 	    ++n;
246ca1c9b0cSelric 	}
247ca1c9b0cSelric     } while (canon_index != 0);
248ca1c9b0cSelric     return 0;
249ca1c9b0cSelric }
250ca1c9b0cSelric 
251ca1c9b0cSelric static int
combine(const uint32_t * in,size_t in_len,uint32_t * out,size_t * out_len)252ca1c9b0cSelric combine(const uint32_t *in, size_t in_len,
253ca1c9b0cSelric 	uint32_t *out, size_t *out_len)
254ca1c9b0cSelric {
255ca1c9b0cSelric     unsigned i;
256ca1c9b0cSelric     int ostarter;
257ca1c9b0cSelric     unsigned o = 0;
258ca1c9b0cSelric     int old_cc;
259ca1c9b0cSelric 
260ca1c9b0cSelric     for (i = 0; i < in_len;) {
261ca1c9b0cSelric 	while (i < in_len && _wind_combining_class(in[i]) != 0) {
262ca1c9b0cSelric 	    out[o++] = in[i++];
263ca1c9b0cSelric 	}
264ca1c9b0cSelric 	if (i < in_len) {
265ca1c9b0cSelric 	    if (o >= *out_len)
266ca1c9b0cSelric 		return WIND_ERR_OVERRUN;
267ca1c9b0cSelric 	    ostarter = o;
268ca1c9b0cSelric 	    out[o++] = in[i++];
269ca1c9b0cSelric 	    old_cc   = -1;
270ca1c9b0cSelric 
271ca1c9b0cSelric 	    while (i < in_len) {
272ca1c9b0cSelric 		uint32_t comb;
273ca1c9b0cSelric 		uint32_t v[2];
274ca1c9b0cSelric 		int cc;
275ca1c9b0cSelric 
276ca1c9b0cSelric 		v[0] = out[ostarter];
277ca1c9b0cSelric 		v[1] = in[i];
278ca1c9b0cSelric 
279ca1c9b0cSelric 		cc = _wind_combining_class(in[i]);
280ca1c9b0cSelric 		if (old_cc != cc && (comb = find_composition(v, 2))) {
281ca1c9b0cSelric 		    out[ostarter] = comb;
282ca1c9b0cSelric 		} else if (cc == 0) {
283ca1c9b0cSelric 		    break;
284ca1c9b0cSelric 		} else {
285ca1c9b0cSelric 		    if (o >= *out_len)
286ca1c9b0cSelric 			return WIND_ERR_OVERRUN;
287ca1c9b0cSelric 		    out[o++] = in[i];
288ca1c9b0cSelric 		    old_cc   = cc;
289ca1c9b0cSelric 		}
290ca1c9b0cSelric 		++i;
291ca1c9b0cSelric 	    }
292ca1c9b0cSelric 	}
293ca1c9b0cSelric     }
294ca1c9b0cSelric     *out_len = o;
295ca1c9b0cSelric     return 0;
296ca1c9b0cSelric }
297ca1c9b0cSelric 
298ca1c9b0cSelric int
_wind_stringprep_normalize(const uint32_t * in,size_t in_len,uint32_t * out,size_t * out_len)299ca1c9b0cSelric _wind_stringprep_normalize(const uint32_t *in, size_t in_len,
300ca1c9b0cSelric 			   uint32_t *out, size_t *out_len)
301ca1c9b0cSelric {
302ca1c9b0cSelric     size_t tmp_len;
303ca1c9b0cSelric     uint32_t *tmp;
304ca1c9b0cSelric     int ret;
305ca1c9b0cSelric 
306ca1c9b0cSelric     if (in_len == 0) {
307ca1c9b0cSelric 	*out_len = 0;
308ca1c9b0cSelric 	return 0;
309ca1c9b0cSelric     }
310ca1c9b0cSelric 
311ca1c9b0cSelric     tmp_len = in_len * 4;
312ca1c9b0cSelric     if (tmp_len < MAX_LENGTH_CANON)
313ca1c9b0cSelric 	tmp_len = MAX_LENGTH_CANON;
314ca1c9b0cSelric     tmp = malloc(tmp_len * sizeof(uint32_t));
315ca1c9b0cSelric     if (tmp == NULL)
316ca1c9b0cSelric 	return ENOMEM;
317ca1c9b0cSelric 
318ca1c9b0cSelric     ret = compat_decomp(in, in_len, tmp, &tmp_len);
319ca1c9b0cSelric     if (ret) {
320ca1c9b0cSelric 	free(tmp);
321ca1c9b0cSelric 	return ret;
322ca1c9b0cSelric     }
323ca1c9b0cSelric     canonical_reorder(tmp, tmp_len);
324ca1c9b0cSelric     ret = combine(tmp, tmp_len, out, out_len);
325ca1c9b0cSelric     free(tmp);
326ca1c9b0cSelric     return ret;
327ca1c9b0cSelric }
328