xref: /netbsd-src/external/mit/libuv/dist/src/idna.c (revision fb5eed702691094bd687fbf1ded189c87457cd35)
1 /* Copyright (c) 2011, 2018 Ben Noordhuis <info@bnoordhuis.nl>
2  *
3  * Permission to use, copy, modify, and/or distribute this software for any
4  * purpose with or without fee is hereby granted, provided that the above
5  * copyright notice and this permission notice appear in all copies.
6  *
7  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
8  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
9  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
10  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
11  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
12  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
13  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
14  */
15 
16 /* Derived from https://github.com/bnoordhuis/punycode
17  * but updated to support IDNA 2008.
18  */
19 
20 #include "uv.h"
21 #include "idna.h"
22 #include <string.h>
23 
/* Decode the tail of a multi-byte UTF-8 sequence.
 *
 * |a| is the lead byte, which the caller has already consumed. |*p| points
 * at the byte following it and |pe| points one past the last readable byte.
 *
 * On success the code point is returned and |*p| is advanced past the
 * sequence. (unsigned) -1 is returned when:
 *
 * - |a| is not a valid lead byte (a stray continuation byte or > 0xF7),
 * - fewer continuation bytes than the lead byte calls for remain before |pe|
 *   (nothing at or beyond |pe| is read and |*p| is left untouched),
 * - a continuation byte is not of the form 10xxxxxx,
 * - the sequence is overlong, encodes a surrogate or exceeds U+10FFFF.
 */
static unsigned uv__utf8_decode1_slow(const char** p,
                                      const char* pe,
                                      unsigned a) {
  unsigned b;
  unsigned c;
  unsigned d;
  unsigned min;

  if (a > 0xF7)
    return -1;

  /* Two- and three-byte sequences are widened to the four-byte layout by
   * synthesizing the missing leading bytes; that lets a single validation
   * and assembly path below handle every length.
   */
  if (a > 0xEF) {
    if (pe - *p < 3)
      return -1;  /* Truncated four-byte sequence. */
    min = 0x10000;
    a = a & 7;
    b = (unsigned char) *(*p)++;
    c = (unsigned char) *(*p)++;
    d = (unsigned char) *(*p)++;
  } else if (a > 0xDF) {
    if (pe - *p < 2)
      return -1;  /* Truncated three-byte sequence. */
    min = 0x800;
    b = 0x80 | (a & 15);
    c = (unsigned char) *(*p)++;
    d = (unsigned char) *(*p)++;
    a = 0;
  } else if (a > 0xBF) {
    if (pe - *p < 1)
      return -1;  /* Truncated two-byte sequence. */
    min = 0x80;
    b = 0x80;
    c = 0x80 | (a & 31);
    d = (unsigned char) *(*p)++;
    a = 0;
  } else {
    return -1;  /* Invalid continuation byte. */
  }

  /* Every byte must individually look like 10xxxxxx. Checking only the XOR
   * of the three bytes would let pairs of bad bytes cancel each other out.
   */
  if ((b & 0xC0) != 0x80 || (c & 0xC0) != 0x80 || (d & 0xC0) != 0x80)
    return -1;  /* Invalid sequence. */

  b &= 63;
  c &= 63;
  d &= 63;
  a = (a << 18) | (b << 12) | (c << 6) | d;

  if (a < min)
    return -1;  /* Overlong sequence. */

  if (a > 0x10FFFF)
    return -1;  /* Four-byte sequence > U+10FFFF. */

  if (a >= 0xD800 && a <= 0xDFFF)
    return -1;  /* Surrogate pair. */

  return a;
}
87 
/* Decode a single code point starting at |*p| and step |*p| past it.
 *
 * The first byte is read unconditionally, before |pe| is looked at, so the
 * caller has to make sure |*p| points at a readable byte. Bytes below 0x80
 * are returned as-is; everything else is handed to uv__utf8_decode1_slow()
 * together with |pe|, and its result is returned unchanged.
 */
unsigned uv__utf8_decode1(const char** p, const char* pe) {
  const unsigned lead = (unsigned char) **p;

  *p += 1;

  /* High bit set: not ASCII, take the multi-byte path. */
  if (lead & 0x80)
    return uv__utf8_decode1_slow(p, pe, lead);

  return lead;  /* ASCII, common case. */
}
98 
99 #define foreach_codepoint(c, p, pe) \
100   for (; (void) (*p <= pe && (c = uv__utf8_decode1(p, pe))), *p <= pe;)
101 
102 static int uv__idna_toascii_label(const char* s, const char* se,
103                                   char** d, char* de) {
104   static const char alphabet[] = "abcdefghijklmnopqrstuvwxyz0123456789";
105   const char* ss;
106   unsigned c;
107   unsigned h;
108   unsigned k;
109   unsigned n;
110   unsigned m;
111   unsigned q;
112   unsigned t;
113   unsigned x;
114   unsigned y;
115   unsigned bias;
116   unsigned delta;
117   unsigned todo;
118   int first;
119 
120   h = 0;
121   ss = s;
122   todo = 0;
123 
124   foreach_codepoint(c, &s, se) {
125     if (c < 128)
126       h++;
127     else if (c == (unsigned) -1)
128       return UV_EINVAL;
129     else
130       todo++;
131   }
132 
133   if (todo > 0) {
134     if (*d < de) *(*d)++ = 'x';
135     if (*d < de) *(*d)++ = 'n';
136     if (*d < de) *(*d)++ = '-';
137     if (*d < de) *(*d)++ = '-';
138   }
139 
140   x = 0;
141   s = ss;
142   foreach_codepoint(c, &s, se) {
143     if (c > 127)
144       continue;
145 
146     if (*d < de)
147       *(*d)++ = c;
148 
149     if (++x == h)
150       break;  /* Visited all ASCII characters. */
151   }
152 
153   if (todo == 0)
154     return h;
155 
156   /* Only write separator when we've written ASCII characters first. */
157   if (h > 0)
158     if (*d < de)
159       *(*d)++ = '-';
160 
161   n = 128;
162   bias = 72;
163   delta = 0;
164   first = 1;
165 
166   while (todo > 0) {
167     m = -1;
168     s = ss;
169     foreach_codepoint(c, &s, se)
170       if (c >= n)
171         if (c < m)
172           m = c;
173 
174     x = m - n;
175     y = h + 1;
176 
177     if (x > ~delta / y)
178       return UV_E2BIG;  /* Overflow. */
179 
180     delta += x * y;
181     n = m;
182 
183     s = ss;
184     foreach_codepoint(c, &s, se) {
185       if (c < n)
186         if (++delta == 0)
187           return UV_E2BIG;  /* Overflow. */
188 
189       if (c != n)
190         continue;
191 
192       for (k = 36, q = delta; /* empty */; k += 36) {
193         t = 1;
194 
195         if (k > bias)
196           t = k - bias;
197 
198         if (t > 26)
199           t = 26;
200 
201         if (q < t)
202           break;
203 
204         /* TODO(bnoordhuis) Since 1 <= t <= 26 and therefore
205          * 10 <= y <= 35, we can optimize the long division
206          * into a table-based reciprocal multiplication.
207          */
208         x = q - t;
209         y = 36 - t;  /* 10 <= y <= 35 since 1 <= t <= 26. */
210         q = x / y;
211         t = t + x % y;  /* 1 <= t <= 35 because of y. */
212 
213         if (*d < de)
214           *(*d)++ = alphabet[t];
215       }
216 
217       if (*d < de)
218         *(*d)++ = alphabet[q];
219 
220       delta /= 2;
221 
222       if (first) {
223         delta /= 350;
224         first = 0;
225       }
226 
227       /* No overflow check is needed because |delta| was just
228        * divided by 2 and |delta+delta >= delta + delta/h|.
229        */
230       h++;
231       delta += delta / h;
232 
233       for (bias = 0; delta > 35 * 26 / 2; bias += 36)
234         delta /= 35;
235 
236       bias += 36 * delta / (delta + 38);
237       delta = 0;
238       todo--;
239     }
240 
241     delta++;
242     n++;
243   }
244 
245   return 0;
246 }
247 
248 #undef foreach_codepoint
249 
250 long uv__idna_toascii(const char* s, const char* se, char* d, char* de) {
251   const char* si;
252   const char* st;
253   unsigned c;
254   char* ds;
255   int rc;
256 
257   ds = d;
258 
259   for (si = s; si < se; /* empty */) {
260     st = si;
261     c = uv__utf8_decode1(&si, se);
262 
263     if (c != '.')
264       if (c != 0x3002)  /* 。 */
265         if (c != 0xFF0E)  /* . */
266           if (c != 0xFF61)  /* 。 */
267             continue;
268 
269     rc = uv__idna_toascii_label(s, st, &d, de);
270 
271     if (rc < 0)
272       return rc;
273 
274     if (d < de)
275       *d++ = '.';
276 
277     s = si;
278   }
279 
280   if (s < se) {
281     rc = uv__idna_toascii_label(s, se, &d, de);
282 
283     if (rc < 0)
284       return rc;
285   }
286 
287   if (d < de)
288     *d++ = '\0';
289 
290   return d - ds;  /* Number of bytes written. */
291 }
292