xref: /netbsd-src/external/mit/libuv/dist/src/idna.c (revision 5f2f42719cd62ff11fd913b40b7ce19f07c4fd25)
/* Copyright (c) 2011, 2018 Ben Noordhuis <info@bnoordhuis.nl>
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */
150e552da7Schristos 
/* Derived from https://github.com/bnoordhuis/punycode
 * but updated to support IDNA 2008.
 */
190e552da7Schristos 
200e552da7Schristos #include "uv.h"
210e552da7Schristos #include "idna.h"
22*5f2f4271Schristos #include <assert.h>
230e552da7Schristos #include <string.h>
24*5f2f4271Schristos #include <limits.h> /* UINT_MAX */
250e552da7Schristos 
/* Decode the remainder of a multi-byte UTF-8 sequence.
 *
 * |a| is the lead byte (>= 0x80), already consumed by the caller; |*p| points
 * at the first trail byte and |pe| is one past the end of the input.
 *
 * Returns the decoded code point, or (unsigned) -1 when the sequence is
 * malformed: bad lead byte, truncated input, a trail byte that is not of the
 * form 10xxxxxx, an overlong encoding, a value above U+10FFFF or a surrogate.
 *
 * On truncation nothing is consumed; otherwise |*p| is advanced past all
 * trail bytes the lead byte calls for, even when one of them is invalid.
 */
static unsigned uv__utf8_decode1_slow(const char** p,
                                      const char* pe,
                                      unsigned a) {
  unsigned trail;
  unsigned bad;
  unsigned min;
  int need;
  int i;

  if (a > 0xF7)
    return (unsigned) -1;

  /* The lead byte alone determines how many trail bytes must follow. */
  if (a > 0xEF) {
    need = 3;
    min = 0x10000;
    a &= 7;
  } else if (a > 0xDF) {
    need = 2;
    min = 0x800;
    a &= 15;
  } else if (a > 0xBF) {
    need = 1;
    min = 0x80;
    a &= 31;
  } else {
    return (unsigned) -1;  /* Stray continuation byte. */
  }

  /* A truncated sequence must not be reinterpreted as a shorter one. */
  if (pe - *p < need)
    return (unsigned) -1;

  /* Check every trail byte on its own; XOR-ing them together would let
   * invalid bytes cancel each other out.
   */
  bad = 0;
  for (i = 0; i < need; i++) {
    trail = (unsigned char) *(*p)++;
    bad |= (trail & 0xC0) ^ 0x80;
    a = (a << 6) | (trail & 63);
  }

  if (bad != 0)
    return (unsigned) -1;  /* Invalid sequence. */

  if (a < min)
    return (unsigned) -1;  /* Overlong sequence. */

  if (a > 0x10FFFF)
    return (unsigned) -1;  /* Four-byte sequence > U+10FFFF. */

  if (a >= 0xD800 && a <= 0xDFFF)
    return (unsigned) -1;  /* Surrogate pair. */

  return a;
}
910e552da7Schristos 
/* Decode one UTF-8 encoded code point starting at |*p| and advance |*p|.
 * The caller must guarantee that at least one byte is available (*p < pe).
 * Single-byte (ASCII) input is handled inline; anything else is handed to
 * uv__utf8_decode1_slow(), whose result is returned unchanged.
 */
unsigned uv__utf8_decode1(const char** p, const char* pe) {
  unsigned lead;

  assert(*p < pe);

  lead = (unsigned char) **p;
  *p += 1;

  if (lead >= 128)
    return uv__utf8_decode1_slow(p, pe, lead);

  return lead;  /* ASCII, common case. */
}
1040e552da7Schristos 
uv__idna_toascii_label(const char * s,const char * se,char ** d,char * de)1050e552da7Schristos static int uv__idna_toascii_label(const char* s, const char* se,
1060e552da7Schristos                                   char** d, char* de) {
1070e552da7Schristos   static const char alphabet[] = "abcdefghijklmnopqrstuvwxyz0123456789";
1080e552da7Schristos   const char* ss;
1090e552da7Schristos   unsigned c;
1100e552da7Schristos   unsigned h;
1110e552da7Schristos   unsigned k;
1120e552da7Schristos   unsigned n;
1130e552da7Schristos   unsigned m;
1140e552da7Schristos   unsigned q;
1150e552da7Schristos   unsigned t;
1160e552da7Schristos   unsigned x;
1170e552da7Schristos   unsigned y;
1180e552da7Schristos   unsigned bias;
1190e552da7Schristos   unsigned delta;
1200e552da7Schristos   unsigned todo;
1210e552da7Schristos   int first;
1220e552da7Schristos 
1230e552da7Schristos   h = 0;
1240e552da7Schristos   ss = s;
1250e552da7Schristos   todo = 0;
1260e552da7Schristos 
127*5f2f4271Schristos   /* Note: after this loop we've visited all UTF-8 characters and know
128*5f2f4271Schristos    * they're legal so we no longer need to check for decode errors.
129*5f2f4271Schristos    */
130*5f2f4271Schristos   while (s < se) {
131*5f2f4271Schristos     c = uv__utf8_decode1(&s, se);
132*5f2f4271Schristos 
133*5f2f4271Schristos     if (c == UINT_MAX)
134*5f2f4271Schristos       return UV_EINVAL;
135*5f2f4271Schristos 
1360e552da7Schristos     if (c < 128)
1370e552da7Schristos       h++;
1380e552da7Schristos     else
1390e552da7Schristos       todo++;
1400e552da7Schristos   }
1410e552da7Schristos 
142*5f2f4271Schristos   /* Only write "xn--" when there are non-ASCII characters. */
1430e552da7Schristos   if (todo > 0) {
1440e552da7Schristos     if (*d < de) *(*d)++ = 'x';
1450e552da7Schristos     if (*d < de) *(*d)++ = 'n';
1460e552da7Schristos     if (*d < de) *(*d)++ = '-';
1470e552da7Schristos     if (*d < de) *(*d)++ = '-';
1480e552da7Schristos   }
1490e552da7Schristos 
150*5f2f4271Schristos   /* Write ASCII characters. */
1510e552da7Schristos   x = 0;
1520e552da7Schristos   s = ss;
153*5f2f4271Schristos   while (s < se) {
154*5f2f4271Schristos     c = uv__utf8_decode1(&s, se);
155*5f2f4271Schristos     assert(c != UINT_MAX);
156*5f2f4271Schristos 
1570e552da7Schristos     if (c > 127)
1580e552da7Schristos       continue;
1590e552da7Schristos 
1600e552da7Schristos     if (*d < de)
1610e552da7Schristos       *(*d)++ = c;
1620e552da7Schristos 
1630e552da7Schristos     if (++x == h)
1640e552da7Schristos       break;  /* Visited all ASCII characters. */
1650e552da7Schristos   }
1660e552da7Schristos 
1670e552da7Schristos   if (todo == 0)
1680e552da7Schristos     return h;
1690e552da7Schristos 
1700e552da7Schristos   /* Only write separator when we've written ASCII characters first. */
1710e552da7Schristos   if (h > 0)
1720e552da7Schristos     if (*d < de)
1730e552da7Schristos       *(*d)++ = '-';
1740e552da7Schristos 
1750e552da7Schristos   n = 128;
1760e552da7Schristos   bias = 72;
1770e552da7Schristos   delta = 0;
1780e552da7Schristos   first = 1;
1790e552da7Schristos 
1800e552da7Schristos   while (todo > 0) {
1810e552da7Schristos     m = -1;
1820e552da7Schristos     s = ss;
183*5f2f4271Schristos 
184*5f2f4271Schristos     while (s < se) {
185*5f2f4271Schristos       c = uv__utf8_decode1(&s, se);
186*5f2f4271Schristos       assert(c != UINT_MAX);
187*5f2f4271Schristos 
1880e552da7Schristos       if (c >= n)
1890e552da7Schristos         if (c < m)
1900e552da7Schristos           m = c;
191*5f2f4271Schristos     }
1920e552da7Schristos 
1930e552da7Schristos     x = m - n;
1940e552da7Schristos     y = h + 1;
1950e552da7Schristos 
1960e552da7Schristos     if (x > ~delta / y)
1970e552da7Schristos       return UV_E2BIG;  /* Overflow. */
1980e552da7Schristos 
1990e552da7Schristos     delta += x * y;
2000e552da7Schristos     n = m;
2010e552da7Schristos 
2020e552da7Schristos     s = ss;
203*5f2f4271Schristos     while (s < se) {
204*5f2f4271Schristos       c = uv__utf8_decode1(&s, se);
205*5f2f4271Schristos       assert(c != UINT_MAX);
206*5f2f4271Schristos 
2070e552da7Schristos       if (c < n)
2080e552da7Schristos         if (++delta == 0)
2090e552da7Schristos           return UV_E2BIG;  /* Overflow. */
2100e552da7Schristos 
2110e552da7Schristos       if (c != n)
2120e552da7Schristos         continue;
2130e552da7Schristos 
2140e552da7Schristos       for (k = 36, q = delta; /* empty */; k += 36) {
2150e552da7Schristos         t = 1;
2160e552da7Schristos 
2170e552da7Schristos         if (k > bias)
2180e552da7Schristos           t = k - bias;
2190e552da7Schristos 
2200e552da7Schristos         if (t > 26)
2210e552da7Schristos           t = 26;
2220e552da7Schristos 
2230e552da7Schristos         if (q < t)
2240e552da7Schristos           break;
2250e552da7Schristos 
2260e552da7Schristos         /* TODO(bnoordhuis) Since 1 <= t <= 26 and therefore
2270e552da7Schristos          * 10 <= y <= 35, we can optimize the long division
2280e552da7Schristos          * into a table-based reciprocal multiplication.
2290e552da7Schristos          */
2300e552da7Schristos         x = q - t;
2310e552da7Schristos         y = 36 - t;  /* 10 <= y <= 35 since 1 <= t <= 26. */
2320e552da7Schristos         q = x / y;
2330e552da7Schristos         t = t + x % y;  /* 1 <= t <= 35 because of y. */
2340e552da7Schristos 
2350e552da7Schristos         if (*d < de)
2360e552da7Schristos           *(*d)++ = alphabet[t];
2370e552da7Schristos       }
2380e552da7Schristos 
2390e552da7Schristos       if (*d < de)
2400e552da7Schristos         *(*d)++ = alphabet[q];
2410e552da7Schristos 
2420e552da7Schristos       delta /= 2;
2430e552da7Schristos 
2440e552da7Schristos       if (first) {
2450e552da7Schristos         delta /= 350;
2460e552da7Schristos         first = 0;
2470e552da7Schristos       }
2480e552da7Schristos 
2490e552da7Schristos       /* No overflow check is needed because |delta| was just
2500e552da7Schristos        * divided by 2 and |delta+delta >= delta + delta/h|.
2510e552da7Schristos        */
2520e552da7Schristos       h++;
2530e552da7Schristos       delta += delta / h;
2540e552da7Schristos 
2550e552da7Schristos       for (bias = 0; delta > 35 * 26 / 2; bias += 36)
2560e552da7Schristos         delta /= 35;
2570e552da7Schristos 
2580e552da7Schristos       bias += 36 * delta / (delta + 38);
2590e552da7Schristos       delta = 0;
2600e552da7Schristos       todo--;
2610e552da7Schristos     }
2620e552da7Schristos 
2630e552da7Schristos     delta++;
2640e552da7Schristos     n++;
2650e552da7Schristos   }
2660e552da7Schristos 
2670e552da7Schristos   return 0;
2680e552da7Schristos }
2690e552da7Schristos 
uv__idna_toascii(const char * s,const char * se,char * d,char * de)2700e552da7Schristos long uv__idna_toascii(const char* s, const char* se, char* d, char* de) {
2710e552da7Schristos   const char* si;
2720e552da7Schristos   const char* st;
2730e552da7Schristos   unsigned c;
2740e552da7Schristos   char* ds;
2750e552da7Schristos   int rc;
2760e552da7Schristos 
2770e552da7Schristos   ds = d;
2780e552da7Schristos 
279*5f2f4271Schristos   si = s;
280*5f2f4271Schristos   while (si < se) {
2810e552da7Schristos     st = si;
2820e552da7Schristos     c = uv__utf8_decode1(&si, se);
2830e552da7Schristos 
284*5f2f4271Schristos     if (c == UINT_MAX)
285*5f2f4271Schristos       return UV_EINVAL;
286*5f2f4271Schristos 
2870e552da7Schristos     if (c != '.')
2880e552da7Schristos       if (c != 0x3002)  /* 。 */
2890e552da7Schristos         if (c != 0xFF0E)  /* . */
2900e552da7Schristos           if (c != 0xFF61)  /* 。 */
2910e552da7Schristos             continue;
2920e552da7Schristos 
2930e552da7Schristos     rc = uv__idna_toascii_label(s, st, &d, de);
2940e552da7Schristos 
2950e552da7Schristos     if (rc < 0)
2960e552da7Schristos       return rc;
2970e552da7Schristos 
2980e552da7Schristos     if (d < de)
2990e552da7Schristos       *d++ = '.';
3000e552da7Schristos 
3010e552da7Schristos     s = si;
3020e552da7Schristos   }
3030e552da7Schristos 
3040e552da7Schristos   if (s < se) {
3050e552da7Schristos     rc = uv__idna_toascii_label(s, se, &d, de);
3060e552da7Schristos 
3070e552da7Schristos     if (rc < 0)
3080e552da7Schristos       return rc;
3090e552da7Schristos   }
3100e552da7Schristos 
3110e552da7Schristos   if (d < de)
3120e552da7Schristos     *d++ = '\0';
3130e552da7Schristos 
3140e552da7Schristos   return d - ds;  /* Number of bytes written. */
3150e552da7Schristos }
316