1 /* Copyright (c) 2011, 2018 Ben Noordhuis <info@bnoordhuis.nl> 2 * 3 * Permission to use, copy, modify, and/or distribute this software for any 4 * purpose with or without fee is hereby granted, provided that the above 5 * copyright notice and this permission notice appear in all copies. 6 * 7 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 8 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 9 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 10 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 11 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 12 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 13 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 14 */ 15 16 /* Derived from https://github.com/bnoordhuis/punycode 17 * but updated to support IDNA 2008. 18 */ 19 20 #include "uv.h" 21 #include "idna.h" 22 #include <string.h> 23 24 static unsigned uv__utf8_decode1_slow(const char** p, 25 const char* pe, 26 unsigned a) { 27 unsigned b; 28 unsigned c; 29 unsigned d; 30 unsigned min; 31 32 if (a > 0xF7) 33 return -1; 34 35 switch (*p - pe) { 36 default: 37 if (a > 0xEF) { 38 min = 0x10000; 39 a = a & 7; 40 b = (unsigned char) *(*p)++; 41 c = (unsigned char) *(*p)++; 42 d = (unsigned char) *(*p)++; 43 break; 44 } 45 /* Fall through. */ 46 case 2: 47 if (a > 0xDF) { 48 min = 0x800; 49 b = 0x80 | (a & 15); 50 c = (unsigned char) *(*p)++; 51 d = (unsigned char) *(*p)++; 52 a = 0; 53 break; 54 } 55 /* Fall through. */ 56 case 1: 57 if (a > 0xBF) { 58 min = 0x80; 59 b = 0x80; 60 c = 0x80 | (a & 31); 61 d = (unsigned char) *(*p)++; 62 a = 0; 63 break; 64 } 65 return -1; /* Invalid continuation byte. */ 66 } 67 68 if (0x80 != (0xC0 & (b ^ c ^ d))) 69 return -1; /* Invalid sequence. */ 70 71 b &= 63; 72 c &= 63; 73 d &= 63; 74 a = (a << 18) | (b << 12) | (c << 6) | d; 75 76 if (a < min) 77 return -1; /* Overlong sequence. */ 78 79 if (a > 0x10FFFF) 80 return -1; /* Four-byte sequence > U+10FFFF. */ 81 82 if (a >= 0xD800 && a <= 0xDFFF) 83 return -1; /* Surrogate pair. */ 84 85 return a; 86 } 87 88 unsigned uv__utf8_decode1(const char** p, const char* pe) { 89 unsigned a; 90 91 a = (unsigned char) *(*p)++; 92 93 if (a < 128) 94 return a; /* ASCII, common case. */ 95 96 return uv__utf8_decode1_slow(p, pe, a); 97 } 98 99 #define foreach_codepoint(c, p, pe) \ 100 for (; (void) (*p <= pe && (c = uv__utf8_decode1(p, pe))), *p <= pe;) 101 102 static int uv__idna_toascii_label(const char* s, const char* se, 103 char** d, char* de) { 104 static const char alphabet[] = "abcdefghijklmnopqrstuvwxyz0123456789"; 105 const char* ss; 106 unsigned c; 107 unsigned h; 108 unsigned k; 109 unsigned n; 110 unsigned m; 111 unsigned q; 112 unsigned t; 113 unsigned x; 114 unsigned y; 115 unsigned bias; 116 unsigned delta; 117 unsigned todo; 118 int first; 119 120 h = 0; 121 ss = s; 122 todo = 0; 123 124 foreach_codepoint(c, &s, se) { 125 if (c < 128) 126 h++; 127 else if (c == (unsigned) -1) 128 return UV_EINVAL; 129 else 130 todo++; 131 } 132 133 if (todo > 0) { 134 if (*d < de) *(*d)++ = 'x'; 135 if (*d < de) *(*d)++ = 'n'; 136 if (*d < de) *(*d)++ = '-'; 137 if (*d < de) *(*d)++ = '-'; 138 } 139 140 x = 0; 141 s = ss; 142 foreach_codepoint(c, &s, se) { 143 if (c > 127) 144 continue; 145 146 if (*d < de) 147 *(*d)++ = c; 148 149 if (++x == h) 150 break; /* Visited all ASCII characters. */ 151 } 152 153 if (todo == 0) 154 return h; 155 156 /* Only write separator when we've written ASCII characters first. */ 157 if (h > 0) 158 if (*d < de) 159 *(*d)++ = '-'; 160 161 n = 128; 162 bias = 72; 163 delta = 0; 164 first = 1; 165 166 while (todo > 0) { 167 m = -1; 168 s = ss; 169 foreach_codepoint(c, &s, se) 170 if (c >= n) 171 if (c < m) 172 m = c; 173 174 x = m - n; 175 y = h + 1; 176 177 if (x > ~delta / y) 178 return UV_E2BIG; /* Overflow. */ 179 180 delta += x * y; 181 n = m; 182 183 s = ss; 184 foreach_codepoint(c, &s, se) { 185 if (c < n) 186 if (++delta == 0) 187 return UV_E2BIG; /* Overflow. */ 188 189 if (c != n) 190 continue; 191 192 for (k = 36, q = delta; /* empty */; k += 36) { 193 t = 1; 194 195 if (k > bias) 196 t = k - bias; 197 198 if (t > 26) 199 t = 26; 200 201 if (q < t) 202 break; 203 204 /* TODO(bnoordhuis) Since 1 <= t <= 26 and therefore 205 * 10 <= y <= 35, we can optimize the long division 206 * into a table-based reciprocal multiplication. 207 */ 208 x = q - t; 209 y = 36 - t; /* 10 <= y <= 35 since 1 <= t <= 26. */ 210 q = x / y; 211 t = t + x % y; /* 1 <= t <= 35 because of y. */ 212 213 if (*d < de) 214 *(*d)++ = alphabet[t]; 215 } 216 217 if (*d < de) 218 *(*d)++ = alphabet[q]; 219 220 delta /= 2; 221 222 if (first) { 223 delta /= 350; 224 first = 0; 225 } 226 227 /* No overflow check is needed because |delta| was just 228 * divided by 2 and |delta+delta >= delta + delta/h|. 229 */ 230 h++; 231 delta += delta / h; 232 233 for (bias = 0; delta > 35 * 26 / 2; bias += 36) 234 delta /= 35; 235 236 bias += 36 * delta / (delta + 38); 237 delta = 0; 238 todo--; 239 } 240 241 delta++; 242 n++; 243 } 244 245 return 0; 246 } 247 248 #undef foreach_codepoint 249 250 long uv__idna_toascii(const char* s, const char* se, char* d, char* de) { 251 const char* si; 252 const char* st; 253 unsigned c; 254 char* ds; 255 int rc; 256 257 ds = d; 258 259 for (si = s; si < se; /* empty */) { 260 st = si; 261 c = uv__utf8_decode1(&si, se); 262 263 if (c != '.') 264 if (c != 0x3002) /* 。 */ 265 if (c != 0xFF0E) /* . */ 266 if (c != 0xFF61) /* 。 */ 267 continue; 268 269 rc = uv__idna_toascii_label(s, st, &d, de); 270 271 if (rc < 0) 272 return rc; 273 274 if (d < de) 275 *d++ = '.'; 276 277 s = si; 278 } 279 280 if (s < se) { 281 rc = uv__idna_toascii_label(s, se, &d, de); 282 283 if (rc < 0) 284 return rc; 285 } 286 287 if (d < de) 288 *d++ = '\0'; 289 290 return d - ds; /* Number of bytes written. */ 291 } 292