xref: /minix3/external/bsd/bind/dist/contrib/idn/idnkit-1.0-src/lib/punycode.c (revision 00b67f09dd46474d133c95011a48590a8e8f94c7)
1 /*	$NetBSD: punycode.c,v 1.4 2014/12/10 04:37:55 christos Exp $	*/
2 
3 #ifndef lint
4 static char *rcsid = "Id: punycode.c,v 1.1 2003/06/04 00:26:06 marka Exp ";
5 #endif
6 
7 /*
8  * Copyright (c) 2001,2002 Japan Network Information Center.
9  * All rights reserved.
10  *
11  * By using this file, you agree to the terms and conditions set forth bellow.
12  *
13  * 			LICENSE TERMS AND CONDITIONS
14  *
15  * The following License Terms and Conditions apply, unless a different
16  * license is obtained from Japan Network Information Center ("JPNIC"),
17  * a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda,
18  * Chiyoda-ku, Tokyo 101-0047, Japan.
19  *
20  * 1. Use, Modification and Redistribution (including distribution of any
21  *    modified or derived work) in source and/or binary forms is permitted
22  *    under this License Terms and Conditions.
23  *
24  * 2. Redistribution of source code must retain the copyright notices as they
25  *    appear in each source code file, this License Terms and Conditions.
26  *
27  * 3. Redistribution in binary form must reproduce the Copyright Notice,
28  *    this License Terms and Conditions, in the documentation and/or other
29  *    materials provided with the distribution.  For the purposes of binary
30  *    distribution the "Copyright Notice" refers to the following language:
31  *    "Copyright (c) 2000-2002 Japan Network Information Center.  All rights reserved."
32  *
33  * 4. The name of JPNIC may not be used to endorse or promote products
34  *    derived from this Software without specific prior written approval of
35  *    JPNIC.
36  *
37  * 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC
38  *    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
39  *    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
40  *    PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL JPNIC BE LIABLE
41  *    FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
42  *    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
43  *    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
44  *    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
45  *    WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
46  *    OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
47  *    ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
48  */
49 
50 #include <config.h>
51 
52 #include <stddef.h>
53 #include <stdlib.h>
54 #include <string.h>
55 
56 #include <idn/result.h>
57 #include <idn/assert.h>
58 #include <idn/logmacro.h>
59 #include <idn/converter.h>
60 #include <idn/ucs4.h>
61 #include <idn/debug.h>
62 #include <idn/punycode.h>
63 #include <idn/util.h>
64 
65 /*
66  * Although draft-ietf-idn-punycode-00.txt doesn't specify the ACE
67  * signature, we have to choose one.  In order to prevent the converted
68  * name from beginning with a hyphen, we should choose a prefix rather
69  * than a suffix.
70  */
71 #ifndef IDN_PUNYCODE_PREFIX
72 #define IDN_PUNYCODE_PREFIX	"xn--"
73 #endif
74 
75 #define INVALID_UCS	0x80000000
76 #define MAX_UCS		0x10FFFF
77 
78 /*
79  * As the draft states, it is possible that `delta' may overflow during
80  * the encoding.  The upper bound of 'delta' is:
81  *   <# of chars. of input string> + <max. difference in code point> *
82  *   <# of chars. of input string + 1>
83  * For this value not to be greater than 0xffffffff (since the calculation
84  * is done using unsigned long, which is at least 32bit long), the maxmum
85  * input string size is about 3850 characters, which is long enough for
86  * a domain label...
87  */
88 #define PUNYCODE_MAXINPUT	3800
89 
90 /*
91  * Parameters.
92  */
93 #define PUNYCODE_BASE		36
94 #define PUNYCODE_TMIN		1
95 #define PUNYCODE_TMAX		26
96 #define PUNYCODE_SKEW		38
97 #define PUNYCODE_DAMP		700
98 #define PUNYCODE_INITIAL_BIAS	72
99 #define PUNYCODE_INITIAL_N	0x80
100 
101 static int		punycode_getwc(const char *s, size_t len,
102 				      int bias, unsigned long *vp);
103 static int		punycode_putwc(char *s, size_t len,
104 				      unsigned long delta, int bias);
105 static int		punycode_update_bias(unsigned long delta,
106 					    size_t npoints, int first);
107 
108 idn_result_t
idn__punycode_decode(idn_converter_t ctx,void * privdata,const char * from,unsigned long * to,size_t tolen)109 idn__punycode_decode(idn_converter_t ctx, void *privdata,
110 		    const char *from, unsigned long *to, size_t tolen) {
111 	unsigned long *to_org = to;
112 	unsigned long c, idx;
113 	size_t prefixlen = strlen(IDN_PUNYCODE_PREFIX);
114 	size_t fromlen;
115 	size_t uidx, fidx, ucslen;
116 	int first, bias;
117 	idn_result_t r;
118 
119 	assert(ctx != NULL);
120 
121 	TRACE(("idn__punycode_decode(from=\"%s\", tolen=%d)\n",
122 	       idn__debug_xstring(from, 50), (int)tolen));
123 
124 	if (!idn__util_asciihaveaceprefix(from, IDN_PUNYCODE_PREFIX)) {
125 		if (*from == '\0') {
126 			r = idn_ucs4_utf8toucs4(from, to, tolen);
127 			goto ret;
128 		}
129 		r = idn_invalid_encoding;
130 		goto ret;
131 	}
132 	from += prefixlen;
133 	fromlen = strlen(from);
134 
135 	/*
136 	 * Find the last delimiter, and copy the characters
137 	 * before it verbatim.
138 	 */
139 	ucslen = 0;
140 	for (fidx = fromlen; fidx > 0; fidx--) {
141 		if (from[fidx - 1] == '-') {
142 			if (tolen < fidx) {
143 				r = idn_buffer_overflow;
144 				goto ret;
145 			}
146 			for (uidx = 0; uidx < fidx - 1; uidx++) {
147 				to[uidx] = from[uidx];
148 			}
149 			ucslen = uidx;
150 			break;
151 		}
152 	}
153 
154 	first = 1;
155 	bias = PUNYCODE_INITIAL_BIAS;
156 	c = PUNYCODE_INITIAL_N;
157 	idx = 0;
158 	while (fidx < fromlen) {
159 		int len;
160 		unsigned long delta;
161 		int i;
162 
163 		len = punycode_getwc(from + fidx, fromlen - fidx, bias, &delta);
164 		if (len == 0) {
165 			r = idn_invalid_encoding;
166 			goto ret;
167 		}
168 		fidx += len;
169 
170 		bias = punycode_update_bias(delta, ucslen + 1, first);
171 		first = 0;
172 		idx += delta;
173 		c += idx / (ucslen + 1);
174 		uidx = idx % (ucslen + 1);
175 
176 		/* Insert 'c' at uidx. */
177 		if (tolen-- <= 0) {
178 			r = idn_buffer_overflow;
179 			goto ret;
180 		}
181 		for (i = ucslen; i > uidx; i--)
182 			to[i] = to[i - 1];
183 		to[uidx] = c;
184 
185 		ucslen++;
186 		idx = uidx + 1;
187 	}
188 
189 	/* Terminate with NUL. */
190 	if (tolen <= 0) {
191 		r = idn_buffer_overflow;
192 		goto ret;
193 	}
194 	to[ucslen] = '\0';
195 	r = idn_success;
196 
197 ret:
198 	if (r == idn_success) {
199 		TRACE(("idn__punycode_decode(): succcess (to=\"%s\")\n",
200 		       idn__debug_ucs4xstring(to_org, 50)));
201 	} else {
202 		TRACE(("idn__punycode_decode(): %s\n", idn_result_tostring(r)));
203 	}
204 	return (r);
205 }
206 
207 idn_result_t
idn__punycode_encode(idn_converter_t ctx,void * privdata,const unsigned long * from,char * to,size_t tolen)208 idn__punycode_encode(idn_converter_t ctx, void *privdata,
209 		    const unsigned long *from, char *to, size_t tolen) {
210 	char *to_org = to;
211 	unsigned long cur_code, next_code, delta;
212 	size_t prefixlen = strlen(IDN_PUNYCODE_PREFIX);
213 	size_t fromlen;
214 	size_t ucsdone;
215 	size_t toidx;
216 	int uidx, bias, first;
217 	idn_result_t r;
218 
219 	assert(ctx != NULL);
220 
221 	TRACE(("idn__punycode_encode(from=\"%s\", tolen=%d)\n",
222 	       idn__debug_ucs4xstring(from, 50), (int)tolen));
223 
224 	if (*from == '\0') {
225 		r = idn_ucs4_ucs4toutf8(from, to, tolen);
226 		goto ret;
227 	} else if (idn__util_ucs4haveaceprefix(from, IDN_PUNYCODE_PREFIX)) {
228 		r = idn_prohibited;
229 		goto ret;
230 	}
231 
232 	if (tolen < prefixlen) {
233 		r = idn_buffer_overflow;
234 		goto ret;
235 	}
236 	memcpy(to, IDN_PUNYCODE_PREFIX, prefixlen);
237 	to += prefixlen;
238 	tolen -= prefixlen;
239 
240 	fromlen = idn_ucs4_strlen(from);
241 
242 	/*
243 	 * If the input string is too long (actually too long to be sane),
244 	 * return failure in order to prevent possible overflow.
245 	 */
246 	if (fromlen > PUNYCODE_MAXINPUT) {
247 		ERROR(("idn__punycode_encode(): "
248 		       "the input string is too long to convert Punycode\n",
249 		       idn__debug_ucs4xstring(from, 50)));
250 		r = idn_failure;
251 		goto ret;
252 	}
253 
254 	ucsdone = 0;	/* number of characters processed */
255 	toidx = 0;
256 
257 	/*
258 	 * First, pick up basic code points and copy them to 'to'.
259 	 */
260 	for (uidx = 0; uidx < fromlen; uidx++) {
261 		if (from[uidx] < 0x80) {
262 			if (toidx >= tolen) {
263 				r = idn_buffer_overflow;
264 				goto ret;
265 			}
266 			to[toidx++] = from[uidx];
267 			ucsdone++;
268 		}
269 	}
270 
271 	/*
272 	 * If there are any basic code points, output a delimiter
273 	 * (hyphen-minus).
274 	 */
275 	if (toidx > 0) {
276 		if (toidx >= tolen) {
277 			r = idn_buffer_overflow;
278 			goto ret;
279 		}
280 		to[toidx++] = '-';
281 		to += toidx;
282 		tolen -= toidx;
283 	}
284 
285 	/*
286 	 * Then encode non-basic characters.
287 	 */
288 	first = 1;
289 	cur_code = PUNYCODE_INITIAL_N;
290 	bias = PUNYCODE_INITIAL_BIAS;
291 	delta = 0;
292 	while (ucsdone < fromlen) {
293 		int limit = -1, rest;
294 
295 		/*
296 		 * Find the smallest code point equal to or greater
297 		 * than 'cur_code'.  Also remember the index of the
298 		 * last occurence of the code point.
299 		 */
300 		for (next_code = MAX_UCS, uidx = fromlen - 1;
301 		     uidx >= 0; uidx--) {
302 			if (from[uidx] >= cur_code && from[uidx] < next_code) {
303 				next_code = from[uidx];
304 				limit = uidx;
305 			}
306 		}
307 		/* There must be such code point. */
308 		assert(limit >= 0);
309 
310 		delta += (next_code - cur_code) * (ucsdone + 1);
311 		cur_code = next_code;
312 
313 		/*
314 		 * Scan the input string again, and encode characters
315 		 * whose code point is 'cur_code'.  Use 'limit' to avoid
316 		 * unnecessary scan.
317 		 */
318 		for (uidx = 0, rest = ucsdone; uidx <= limit; uidx++) {
319 			if (from[uidx] < cur_code) {
320 				delta++;
321 				rest--;
322 			} else if (from[uidx] == cur_code) {
323 				int sz = punycode_putwc(to, tolen, delta, bias);
324 				if (sz == 0) {
325 					r = idn_buffer_overflow;
326 					goto ret;
327 				}
328 				to += sz;
329 				tolen -= sz;
330 				ucsdone++;
331 				bias = punycode_update_bias(delta, ucsdone,
332 							   first);
333 				delta = 0;
334 				first = 0;
335 			}
336 		}
337 		delta += rest + 1;
338 		cur_code++;
339 	}
340 
341 	/*
342 	 * Terminate with NUL.
343 	 */
344 	if (tolen <= 0) {
345 		r = idn_buffer_overflow;
346 		goto ret;
347 	}
348 	*to = '\0';
349 	r = idn_success;
350 
351 ret:
352 	if (r == idn_success) {
353 		TRACE(("idn__punycode_encode(): succcess (to=\"%s\")\n",
354 		       idn__debug_xstring(to_org, 50)));
355 	} else {
356 		TRACE(("idn__punycode_encode(): %s\n", idn_result_tostring(r)));
357 	}
358 	return (r);
359 }
360 
361 static int
punycode_getwc(const char * s,size_t len,int bias,unsigned long * vp)362 punycode_getwc(const char *s, size_t len, int bias, unsigned long *vp) {
363 	size_t orglen = len;
364 	unsigned long v = 0, w = 1;
365 	int k;
366 
367 	for (k = PUNYCODE_BASE - bias; len > 0; k += PUNYCODE_BASE) {
368 		int c = *s++;
369 		int t = (k < PUNYCODE_TMIN) ? PUNYCODE_TMIN :
370 			(k > PUNYCODE_TMAX) ? PUNYCODE_TMAX : k;
371 
372 		len--;
373 		if ('a' <= c && c <= 'z')
374 			c = c - 'a';
375 		else if ('A' <= c && c <= 'Z')
376 			c = c - 'A';
377 		else if ('0' <= c && c <= '9')
378 			c = c - '0' + 26;
379 		else
380 			c = -1;
381 
382 		if (c < 0)
383 			return (0);	/* invalid character */
384 
385 		v += c * w;
386 
387 		if (c < t) {
388 			*vp = v;
389 			return (orglen - len);
390 		}
391 
392 		w *= (PUNYCODE_BASE - t);
393 	}
394 
395 	return (0);	/* final character missing */
396 }
397 
398 static int
punycode_putwc(char * s,size_t len,unsigned long delta,int bias)399 punycode_putwc(char *s, size_t len, unsigned long delta, int bias) {
400 	const char *punycode_base36 = "abcdefghijklmnopqrstuvwxyz0123456789";
401 	int k;
402 	char *sorg = s;
403 
404 	for (k = PUNYCODE_BASE - bias; 1; k += PUNYCODE_BASE) {
405 		int t = (k < PUNYCODE_TMIN) ? PUNYCODE_TMIN :
406 			(k > PUNYCODE_TMAX) ? PUNYCODE_TMAX : k;
407 
408 		if (delta < t)
409 			break;
410 		if (len < 1)
411 			return (0);
412 		*s++ = punycode_base36[t + ((delta - t) % (PUNYCODE_BASE - t))];
413 		len--;
414 		delta = (delta - t) / (PUNYCODE_BASE - t);
415 	}
416 	if (len < 1)
417 		return (0);
418 	*s++ = punycode_base36[delta];
419 	return (s - sorg);
420 }
421 
422 static int
punycode_update_bias(unsigned long delta,size_t npoints,int first)423 punycode_update_bias(unsigned long delta, size_t npoints, int first) {
424 	int k = 0;
425 
426 	delta /= first ? PUNYCODE_DAMP : 2;
427 	delta += delta / npoints;
428 
429 	while (delta > ((PUNYCODE_BASE - PUNYCODE_TMIN) * PUNYCODE_TMAX) / 2) {
430 		delta /= PUNYCODE_BASE - PUNYCODE_TMIN;
431 		k++;
432 	}
433 	return (PUNYCODE_BASE * k +
434 		(((PUNYCODE_BASE - PUNYCODE_TMIN + 1) * delta) /
435 		 (delta + PUNYCODE_SKEW)));
436 }
437