10e552da7Schristos /* Copyright (c) 2011, 2018 Ben Noordhuis <info@bnoordhuis.nl>
20e552da7Schristos *
30e552da7Schristos * Permission to use, copy, modify, and/or distribute this software for any
40e552da7Schristos * purpose with or without fee is hereby granted, provided that the above
50e552da7Schristos * copyright notice and this permission notice appear in all copies.
60e552da7Schristos *
70e552da7Schristos * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
80e552da7Schristos * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
90e552da7Schristos * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
100e552da7Schristos * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
110e552da7Schristos * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
120e552da7Schristos * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
130e552da7Schristos * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
140e552da7Schristos */
150e552da7Schristos
160e552da7Schristos /* Derived from https://github.com/bnoordhuis/punycode
170e552da7Schristos * but updated to support IDNA 2008.
180e552da7Schristos */
190e552da7Schristos
200e552da7Schristos #include "uv.h"
210e552da7Schristos #include "idna.h"
22*5f2f4271Schristos #include <assert.h>
230e552da7Schristos #include <string.h>
24*5f2f4271Schristos #include <limits.h> /* UINT_MAX */
250e552da7Schristos
/* Decode the tail of a multi-byte UTF-8 sequence.
 *
 * |a| is the lead byte, already consumed by the caller; |*p| points at the
 * first byte after it and |pe| is one past the end of the input.
 *
 * Returns the decoded code point, or (unsigned) -1 when the sequence is
 * malformed: invalid lead byte, truncated input, a byte that is not a
 * continuation byte, an overlong encoding, a value above U+10FFFF or a
 * surrogate.
 *
 * |*p| is advanced past the continuation bytes whenever enough input is
 * available to read them, even if the sequence is then rejected. It is left
 * untouched when the lead byte is invalid or the input is truncated.
 */
static unsigned uv__utf8_decode1_slow(const char** p,
                                      const char* pe,
                                      unsigned a) {
  unsigned b;
  unsigned c;
  unsigned d;
  unsigned min;
  int need;  /* Number of continuation bytes the lead byte calls for. */

  if (a > 0xF7)
    return -1;  /* Not a valid lead byte. */

  if (a > 0xEF)
    need = 3;
  else if (a > 0xDF)
    need = 2;
  else if (a > 0xBF)
    need = 1;
  else
    return -1;  /* Stray continuation byte in lead position. */

  /* Never reinterpret a truncated long sequence as a shorter one. */
  if (pe - *p < need)
    return -1;  /* Truncated sequence. */

  /* Shorter sequences are padded with synthetic continuation bytes so that
   * all three lengths share the validation and assembly code below.
   */
  switch (need) {
  case 3:
    min = 0x10000;
    a = a & 7;
    b = (unsigned char) *(*p)++;
    c = (unsigned char) *(*p)++;
    d = (unsigned char) *(*p)++;
    break;
  case 2:
    min = 0x800;
    b = 0x80 | (a & 15);
    c = (unsigned char) *(*p)++;
    d = (unsigned char) *(*p)++;
    a = 0;
    break;
  default:
    min = 0x80;
    b = 0x80;
    c = 0x80 | (a & 31);
    d = (unsigned char) *(*p)++;
    a = 0;
    break;
  }

  /* Every trailing byte must individually look like 10xxxxxx. XOR-ing the
   * tag bits together is not sufficient: two bad bytes can cancel out.
   */
  if ((b & 0xC0) != 0x80 || (c & 0xC0) != 0x80 || (d & 0xC0) != 0x80)
    return -1;  /* Invalid continuation byte. */

  b &= 63;
  c &= 63;
  d &= 63;
  a = (a << 18) | (b << 12) | (c << 6) | d;

  if (a < min)
    return -1;  /* Overlong sequence. */

  if (a > 0x10FFFF)
    return -1;  /* Four-byte sequence > U+10FFFF. */

  if (a >= 0xD800 && a <= 0xDFFF)
    return -1;  /* Surrogate pair. */

  return a;
}
910e552da7Schristos
/* Decode one UTF-8 encoded code point starting at |*p|.
 *
 * The caller must guarantee that at least one byte is available
 * (|*p| < |pe|). |*p| is advanced past the bytes that were consumed.
 * Returns the code point, or whatever uv__utf8_decode1_slow() reports
 * for non-ASCII input.
 */
unsigned uv__utf8_decode1(const char** p, const char* pe) {
  unsigned lead;

  assert(*p < pe);

  lead = (unsigned char) **p;
  *p += 1;

  /* Anything with the high bit set needs the multi-byte decoder. */
  if (lead >= 128)
    return uv__utf8_decode1_slow(p, pe, lead);

  return lead;  /* Plain ASCII, by far the most common input. */
}
1040e552da7Schristos
uv__idna_toascii_label(const char * s,const char * se,char ** d,char * de)1050e552da7Schristos static int uv__idna_toascii_label(const char* s, const char* se,
1060e552da7Schristos char** d, char* de) {
1070e552da7Schristos static const char alphabet[] = "abcdefghijklmnopqrstuvwxyz0123456789";
1080e552da7Schristos const char* ss;
1090e552da7Schristos unsigned c;
1100e552da7Schristos unsigned h;
1110e552da7Schristos unsigned k;
1120e552da7Schristos unsigned n;
1130e552da7Schristos unsigned m;
1140e552da7Schristos unsigned q;
1150e552da7Schristos unsigned t;
1160e552da7Schristos unsigned x;
1170e552da7Schristos unsigned y;
1180e552da7Schristos unsigned bias;
1190e552da7Schristos unsigned delta;
1200e552da7Schristos unsigned todo;
1210e552da7Schristos int first;
1220e552da7Schristos
1230e552da7Schristos h = 0;
1240e552da7Schristos ss = s;
1250e552da7Schristos todo = 0;
1260e552da7Schristos
127*5f2f4271Schristos /* Note: after this loop we've visited all UTF-8 characters and know
128*5f2f4271Schristos * they're legal so we no longer need to check for decode errors.
129*5f2f4271Schristos */
130*5f2f4271Schristos while (s < se) {
131*5f2f4271Schristos c = uv__utf8_decode1(&s, se);
132*5f2f4271Schristos
133*5f2f4271Schristos if (c == UINT_MAX)
134*5f2f4271Schristos return UV_EINVAL;
135*5f2f4271Schristos
1360e552da7Schristos if (c < 128)
1370e552da7Schristos h++;
1380e552da7Schristos else
1390e552da7Schristos todo++;
1400e552da7Schristos }
1410e552da7Schristos
142*5f2f4271Schristos /* Only write "xn--" when there are non-ASCII characters. */
1430e552da7Schristos if (todo > 0) {
1440e552da7Schristos if (*d < de) *(*d)++ = 'x';
1450e552da7Schristos if (*d < de) *(*d)++ = 'n';
1460e552da7Schristos if (*d < de) *(*d)++ = '-';
1470e552da7Schristos if (*d < de) *(*d)++ = '-';
1480e552da7Schristos }
1490e552da7Schristos
150*5f2f4271Schristos /* Write ASCII characters. */
1510e552da7Schristos x = 0;
1520e552da7Schristos s = ss;
153*5f2f4271Schristos while (s < se) {
154*5f2f4271Schristos c = uv__utf8_decode1(&s, se);
155*5f2f4271Schristos assert(c != UINT_MAX);
156*5f2f4271Schristos
1570e552da7Schristos if (c > 127)
1580e552da7Schristos continue;
1590e552da7Schristos
1600e552da7Schristos if (*d < de)
1610e552da7Schristos *(*d)++ = c;
1620e552da7Schristos
1630e552da7Schristos if (++x == h)
1640e552da7Schristos break; /* Visited all ASCII characters. */
1650e552da7Schristos }
1660e552da7Schristos
1670e552da7Schristos if (todo == 0)
1680e552da7Schristos return h;
1690e552da7Schristos
1700e552da7Schristos /* Only write separator when we've written ASCII characters first. */
1710e552da7Schristos if (h > 0)
1720e552da7Schristos if (*d < de)
1730e552da7Schristos *(*d)++ = '-';
1740e552da7Schristos
1750e552da7Schristos n = 128;
1760e552da7Schristos bias = 72;
1770e552da7Schristos delta = 0;
1780e552da7Schristos first = 1;
1790e552da7Schristos
1800e552da7Schristos while (todo > 0) {
1810e552da7Schristos m = -1;
1820e552da7Schristos s = ss;
183*5f2f4271Schristos
184*5f2f4271Schristos while (s < se) {
185*5f2f4271Schristos c = uv__utf8_decode1(&s, se);
186*5f2f4271Schristos assert(c != UINT_MAX);
187*5f2f4271Schristos
1880e552da7Schristos if (c >= n)
1890e552da7Schristos if (c < m)
1900e552da7Schristos m = c;
191*5f2f4271Schristos }
1920e552da7Schristos
1930e552da7Schristos x = m - n;
1940e552da7Schristos y = h + 1;
1950e552da7Schristos
1960e552da7Schristos if (x > ~delta / y)
1970e552da7Schristos return UV_E2BIG; /* Overflow. */
1980e552da7Schristos
1990e552da7Schristos delta += x * y;
2000e552da7Schristos n = m;
2010e552da7Schristos
2020e552da7Schristos s = ss;
203*5f2f4271Schristos while (s < se) {
204*5f2f4271Schristos c = uv__utf8_decode1(&s, se);
205*5f2f4271Schristos assert(c != UINT_MAX);
206*5f2f4271Schristos
2070e552da7Schristos if (c < n)
2080e552da7Schristos if (++delta == 0)
2090e552da7Schristos return UV_E2BIG; /* Overflow. */
2100e552da7Schristos
2110e552da7Schristos if (c != n)
2120e552da7Schristos continue;
2130e552da7Schristos
2140e552da7Schristos for (k = 36, q = delta; /* empty */; k += 36) {
2150e552da7Schristos t = 1;
2160e552da7Schristos
2170e552da7Schristos if (k > bias)
2180e552da7Schristos t = k - bias;
2190e552da7Schristos
2200e552da7Schristos if (t > 26)
2210e552da7Schristos t = 26;
2220e552da7Schristos
2230e552da7Schristos if (q < t)
2240e552da7Schristos break;
2250e552da7Schristos
2260e552da7Schristos /* TODO(bnoordhuis) Since 1 <= t <= 26 and therefore
2270e552da7Schristos * 10 <= y <= 35, we can optimize the long division
2280e552da7Schristos * into a table-based reciprocal multiplication.
2290e552da7Schristos */
2300e552da7Schristos x = q - t;
2310e552da7Schristos y = 36 - t; /* 10 <= y <= 35 since 1 <= t <= 26. */
2320e552da7Schristos q = x / y;
2330e552da7Schristos t = t + x % y; /* 1 <= t <= 35 because of y. */
2340e552da7Schristos
2350e552da7Schristos if (*d < de)
2360e552da7Schristos *(*d)++ = alphabet[t];
2370e552da7Schristos }
2380e552da7Schristos
2390e552da7Schristos if (*d < de)
2400e552da7Schristos *(*d)++ = alphabet[q];
2410e552da7Schristos
2420e552da7Schristos delta /= 2;
2430e552da7Schristos
2440e552da7Schristos if (first) {
2450e552da7Schristos delta /= 350;
2460e552da7Schristos first = 0;
2470e552da7Schristos }
2480e552da7Schristos
2490e552da7Schristos /* No overflow check is needed because |delta| was just
2500e552da7Schristos * divided by 2 and |delta+delta >= delta + delta/h|.
2510e552da7Schristos */
2520e552da7Schristos h++;
2530e552da7Schristos delta += delta / h;
2540e552da7Schristos
2550e552da7Schristos for (bias = 0; delta > 35 * 26 / 2; bias += 36)
2560e552da7Schristos delta /= 35;
2570e552da7Schristos
2580e552da7Schristos bias += 36 * delta / (delta + 38);
2590e552da7Schristos delta = 0;
2600e552da7Schristos todo--;
2610e552da7Schristos }
2620e552da7Schristos
2630e552da7Schristos delta++;
2640e552da7Schristos n++;
2650e552da7Schristos }
2660e552da7Schristos
2670e552da7Schristos return 0;
2680e552da7Schristos }
2690e552da7Schristos
uv__idna_toascii(const char * s,const char * se,char * d,char * de)2700e552da7Schristos long uv__idna_toascii(const char* s, const char* se, char* d, char* de) {
2710e552da7Schristos const char* si;
2720e552da7Schristos const char* st;
2730e552da7Schristos unsigned c;
2740e552da7Schristos char* ds;
2750e552da7Schristos int rc;
2760e552da7Schristos
2770e552da7Schristos ds = d;
2780e552da7Schristos
279*5f2f4271Schristos si = s;
280*5f2f4271Schristos while (si < se) {
2810e552da7Schristos st = si;
2820e552da7Schristos c = uv__utf8_decode1(&si, se);
2830e552da7Schristos
284*5f2f4271Schristos if (c == UINT_MAX)
285*5f2f4271Schristos return UV_EINVAL;
286*5f2f4271Schristos
2870e552da7Schristos if (c != '.')
2880e552da7Schristos if (c != 0x3002) /* 。 */
2890e552da7Schristos if (c != 0xFF0E) /* . */
2900e552da7Schristos if (c != 0xFF61) /* 。 */
2910e552da7Schristos continue;
2920e552da7Schristos
2930e552da7Schristos rc = uv__idna_toascii_label(s, st, &d, de);
2940e552da7Schristos
2950e552da7Schristos if (rc < 0)
2960e552da7Schristos return rc;
2970e552da7Schristos
2980e552da7Schristos if (d < de)
2990e552da7Schristos *d++ = '.';
3000e552da7Schristos
3010e552da7Schristos s = si;
3020e552da7Schristos }
3030e552da7Schristos
3040e552da7Schristos if (s < se) {
3050e552da7Schristos rc = uv__idna_toascii_label(s, se, &d, de);
3060e552da7Schristos
3070e552da7Schristos if (rc < 0)
3080e552da7Schristos return rc;
3090e552da7Schristos }
3100e552da7Schristos
3110e552da7Schristos if (d < de)
3120e552da7Schristos *d++ = '\0';
3130e552da7Schristos
3140e552da7Schristos return d - ds; /* Number of bytes written. */
3150e552da7Schristos }
316