// Written in the D programming language.

/**
 * Encode and decode Uniform Resource Identifiers (URIs).
 * URIs are used in internet transfer protocols.
 * Valid URI characters consist of letters, digits,
 * and the characters $(B ;/?:@&=+$,-_.!~*'())
 * Reserved URI characters are $(B ;/?:@&=+$,)
 * Escape sequences consist of $(B %) followed by two hex digits.
 *
 * See_Also:
 *  $(LINK2 http://www.ietf.org/rfc/rfc3986.txt, RFC 3986)<br>
 *  $(LINK2 http://en.wikipedia.org/wiki/Uniform_resource_identifier, Wikipedia)
 * Copyright: Copyright Digital Mars 2000 - 2009.
 * License:   $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
 * Authors:   $(HTTP digitalmars.com, Walter Bright)
 * Source:    $(PHOBOSSRC std/_uri.d)
 */
/*          Copyright Digital Mars 2000 - 2009.
 * Distributed under the Boost Software License, Version 1.0.
 *    (See accompanying file LICENSE_1_0.txt or copy at
 *          http://www.boost.org/LICENSE_1_0.txt)
 */
module std.uri;

//debug=uri;        // uncomment to turn on debugging writefln's
debug(uri) import std.stdio;
import std.traits : isSomeChar;

/** This Exception is thrown if something goes wrong when encoding or
decoding a URI.
*/
class URIException : Exception
{
    import std.exception : basicExceptionCtors;
    mixin basicExceptionCtors;
}

// Character-class bits stored in uri_flags; callers OR them together to
// build the set of characters that is left unescaped (encoding) or kept
// escaped (decoding).
private enum
{
    URI_Alpha = 1,
    URI_Reserved = 2,
    URI_Mark = 4,
    URI_Digit = 8,
    URI_Hash = 0x10,        // '#'
}

// Upper-case hex digits used when emitting %XX escapes.
private immutable char[16] hex2ascii = "0123456789ABCDEF";

// Class bits for each of the 128 ASCII characters, built by a function
// literal that is invoked immediately to initialise the table.
private immutable ubyte[128] uri_flags =      // indexed by character
    ({
        ubyte[128] uflags;

        // Compile time initialize
        uflags['#'] |= URI_Hash;

        foreach (c; 'A' .. 'Z' + 1)
        {
            uflags[c] |= URI_Alpha;
            uflags[c + 0x20] |= URI_Alpha;   // lowercase letters
        }
        foreach (c; '0' .. '9' + 1) uflags[c] |= URI_Digit;
        foreach (c; ";/?:@&=+$,")   uflags[c] |= URI_Reserved;
        foreach (c; "-_.!~*'()")    uflags[c] |= URI_Mark;
        return uflags;
    })();

/* Percent-encodes a UTF-32 string.
 *
 * Every code point below 128 whose uri_flags entry intersects unescapedSet
 * is copied through unchanged. Every other code point is converted to its
 * UTF-8 octets and each octet is written as '%' followed by two upper-case
 * hex digits.
 *
 * Params:
 *  str          = the text to encode
 *  unescapedSet = OR-combination of URI_* bits naming the characters that
 *                 must NOT be escaped
 *
 * Returns: a newly allocated string holding the encoded text.
 *
 * Throws: URIException if a code point is greater than 0x1FFFFF.
 */
private string URI_Encode(dstring str, uint unescapedSet)
{
    import core.exception : OutOfMemoryError;
    import core.stdc.stdlib : alloca;

    uint j;
    uint k;
    dchar V;
    dchar C;

    // result buffer: starts as a 50-char stack array and is regrown on demand
    char[50] buffer = void;
    char* R;
    uint Rlen;
    uint Rsize; // alloc'd size

    immutable len = str.length;

    R = buffer.ptr;
    Rsize = buffer.length;
    Rlen = 0;

    for (k = 0; k != len; k++)
    {
        C = str[k];
        // if (C in unescapedSet)
        if (C < uri_flags.length && uri_flags[C] & unescapedSet)
        {
            // Unescaped character: needs exactly one output char.
            if (Rlen == Rsize)
            {
                char* R2;

                // Double the capacity; buffers above 1024 chars come from
                // the GC heap, smaller ones from alloca.
                Rsize *= 2;
                if (Rsize > 1024)
                {
                    R2 = (new char[Rsize]).ptr;
                }
                else
                {
                    R2 = cast(char *) alloca(Rsize * char.sizeof);
                    if (!R2)
                        throw new OutOfMemoryError("Alloca failure");
                }
                R2[0 .. Rlen] = R[0 .. Rlen];
                R = R2;
            }
            R[Rlen] = cast(char) C;
            Rlen++;
        }
        else
        {
            char[6] Octet;
            uint L;         // number of UTF-8 octets produced for V

            V = C;

            // Transform V into octets
            if (V <= 0x7F)
            {
                Octet[0] = cast(char) V;
                L = 1;
            }
            else if (V <= 0x7FF)
            {
                Octet[0] = cast(char)(0xC0 | (V >> 6));
                Octet[1] = cast(char)(0x80 | (V & 0x3F));
                L = 2;
            }
            else if (V <= 0xFFFF)
            {
                Octet[0] = cast(char)(0xE0 | (V >> 12));
                Octet[1] = cast(char)(0x80 | ((V >> 6) & 0x3F));
                Octet[2] = cast(char)(0x80 | (V & 0x3F));
                L = 3;
            }
            else if (V <= 0x1FFFFF)
            {
                Octet[0] = cast(char)(0xF0 | (V >> 18));
                Octet[1] = cast(char)(0x80 | ((V >> 12) & 0x3F));
                Octet[2] = cast(char)(0x80 | ((V >> 6) & 0x3F));
                Octet[3] = cast(char)(0x80 | (V & 0x3F));
                L = 4;
            }
            else
            {
                throw new URIException("Undefined UTF-32 code point");
            }

            // Each octet expands to three output chars ("%XX").
            if (Rlen + L * 3 > Rsize)
            {
                char *R2;

                Rsize = 2 * (Rlen + L * 3);
                if (Rsize > 1024)
                {
                    R2 = (new char[Rsize]).ptr;
                }
                else
                {
                    R2 = cast(char *) alloca(Rsize * char.sizeof);
                    if (!R2)
                        throw new OutOfMemoryError("Alloca failure");
                }
                R2[0 .. Rlen] = R[0 .. Rlen];
                R = R2;
            }

            for (j = 0; j < L; j++)
            {
                R[Rlen] = '%';
                R[Rlen + 1] = hex2ascii[Octet[j] >> 4];
                R[Rlen + 2] = hex2ascii[Octet[j] & 15];

                Rlen += 3;
            }
        }
    }

    // The working buffer may be stack memory, so hand back a GC copy.
    return R[0 .. Rlen].idup;
}

/* Converts one hexadecimal digit character to its numeric value.
 * The argument is not validated here; the callers in this module check it
 * with isHexDigit first.
 */
private uint ascii2hex(dchar c) @nogc @safe pure nothrow
{
    return (c <= '9') ? c - '0' :
        (c <= 'F') ? c - 'A' + 10 :
        c - 'a' + 10;
}

/* Decodes the %XX escape sequences in uri.
 *
 * Characters other than '%' are copied through unchanged. An escape (or a
 * run of escapes forming one multi-byte UTF-8 sequence) is replaced by the
 * code point it denotes, unless that code point is below 128 and its
 * uri_flags entry intersects reservedSet, in which case the original escape
 * text is kept verbatim.
 *
 * Params:
 *  uri         = the text to decode
 *  reservedSet = OR-combination of URI_* bits naming the characters whose
 *                escapes must be left in place
 *
 * Returns: the decoded text as a newly allocated dstring.
 *
 * Throws: URIException on a truncated escape, non-hex digits after '%',
 *  a malformed multi-byte sequence, or a decoded value above 0x10FFFF.
 */
private dstring URI_Decode(Char)(in Char[] uri, uint reservedSet)
if (isSomeChar!Char)
{
    import core.exception : OutOfMemoryError;
    import core.stdc.stdlib : alloca;
    import std.ascii : isHexDigit;

    uint j;
    uint k;
    uint V;
    dchar C;

    // Result array: alloca'd for small inputs, GC-allocated for large ones
    dchar* R;
    uint Rlen;

    immutable len = uri.length;

    // Nothing to decode; returning here also avoids requesting a
    // zero-byte block from alloca.
    if (len == 0)
        return null;

    auto s = uri.ptr;

    // Preallocate result buffer R guaranteed to be large enough for result
    // (decoding never produces more elements than the input has).
    auto Rsize = len;
    if (Rsize > 1024 / dchar.sizeof)
    {
        R = (new dchar[Rsize]).ptr;
    }
    else
    {
        R = cast(dchar *) alloca(Rsize * dchar.sizeof);
        if (!R)
            throw new OutOfMemoryError("Alloca failure");
    }
    Rlen = 0;

    for (k = 0; k != len; k++)
    {
        char B;
        uint start;

        C = s[k];
        if (C != '%')
        {
            R[Rlen] = C;
            Rlen++;
            continue;
        }
        start = k;      // remember where the escape began, in case it is kept
        if (k + 2 >= len)
            throw new URIException("Unexpected end of URI");
        if (!isHexDigit(s[k + 1]) || !isHexDigit(s[k + 2]))
            throw new URIException("Expected two hexadecimal digits after '%'");
        B = cast(char)((ascii2hex(s[k + 1]) << 4) + ascii2hex(s[k + 2]));
        k += 2;
        if ((B & 0x80) == 0)
        {
            // Single-octet (ASCII) value.
            C = B;
        }
        else
        {
            uint n;

            // Count the leading 1 bits of B: n is the length in octets of
            // the multi-byte sequence that B introduces.
            for (n = 1; ; n++)
            {
                if (n > 4)
                    throw new URIException("UTF-32 code point size too large");
                if (((B << n) & 0x80) == 0)
                {
                    // 10xxxxxx is a continuation octet, not a lead octet.
                    if (n == 1)
                        throw new URIException("UTF-32 code point size too small");
                    break;
                }
            }

            // Pick off (7 - n) significant bits of B from first byte of octet
            V = B & ((1 << (7 - n)) - 1);   // (!!!)

            // The remaining n - 1 octets need three input characters each.
            if (k + (3 * (n - 1)) >= len)
                throw new URIException("UTF-32 unaligned String");
            for (j = 1; j != n; j++)
            {
                k++;
                if (s[k] != '%')
                    throw new URIException("Expected: '%'");
                if (!isHexDigit(s[k + 1]) || !isHexDigit(s[k + 2]))
                    throw new URIException("Expected two hexadecimal digits after '%'");
                B = cast(char)((ascii2hex(s[k + 1]) << 4) + ascii2hex(s[k + 2]));
                if ((B & 0xC0) != 0x80)
                    throw new URIException("Incorrect UTF-32 multi-byte sequence");
                k += 2;
                V = (V << 6) | (B & 0x3F);
            }
            if (V > 0x10FFFF)
                throw new URIException("Unknown UTF-32 code point");
            C = V;
        }
        if (C < uri_flags.length && uri_flags[C] & reservedSet)
        {
            // Keep the escape as written.
            // R ~= s[start .. k + 1];
            immutable width = (k + 1) - start;
            for (int ii = 0; ii < width; ii++)
                R[Rlen + ii] = s[start + ii];
            Rlen += width;
        }
        else
        {
            R[Rlen] = C;
            Rlen++;
        }
    }
    assert(Rlen <= Rsize);  // enforce our preallocation size guarantee

    // Copy the working buffer (possibly on the stack) to array in memory
    return R[0 .. Rlen].idup;
}

/*************************************
 * Decodes the URI string encodedURI into a UTF-8 string and returns it.
 * Escape sequences that resolve to reserved URI characters are not replaced.
 * Escape sequences that resolve to the '#' character are not replaced.
 *
 * Params:
 *  encodedURI = the escaped URI text
 * Returns:
 *  the decoded text, encoded as UTF-8
 * Throws:
 *  $(LREF URIException) if encodedURI contains a malformed escape sequence
 */

string decode(Char)(in Char[] encodedURI)
if (isSomeChar!Char)
{
    import std.algorithm.iteration : each;
    import std.utf : encode;
    auto s = URI_Decode(encodedURI, URI_Reserved | URI_Hash);
    char[] r;
    s.each!(c => encode(r, c));
    return r;
}

/*******************************
 * Decodes the URI string encodedURI into a UTF-8 string and returns it. All
 * escape sequences are decoded.
 *
 * Params:
 *  encodedURIComponent = the escaped URI component text
 * Returns:
 *  the decoded text, encoded as UTF-8
 * Throws:
 *  $(LREF URIException) if encodedURIComponent contains a malformed
 *  escape sequence
 */

string decodeComponent(Char)(in Char[] encodedURIComponent)
if (isSomeChar!Char)
{
    import std.algorithm.iteration : each;
    import std.utf : encode;
    auto s = URI_Decode(encodedURIComponent, 0);
    char[] r;
    s.each!(c => encode(r, c));
    return r;
}

/*****************************
 * Encodes the UTF-8 string uri into a URI and returns that URI. Any character
 * not a valid URI character is escaped. The '#' character is not escaped.
 *
 * Params:
 *  uri = the text to encode
 * Returns:
 *  the encoded URI
 */

string encode(Char)(in Char[] uri)
if (isSomeChar!Char)
{
    import std.utf : toUTF32;
    auto s = toUTF32(uri);
    return URI_Encode(s, URI_Reserved | URI_Hash | URI_Alpha | URI_Digit | URI_Mark);
}

/********************************
 * Encodes the UTF-8 string uriComponent into a URI and returns that URI.
 * Any character not a letter, digit, or one of -_.!~*'() is escaped.
 *
 * Params:
 *  uriComponent = the text to encode
 * Returns:
 *  the encoded URI component
 */

string encodeComponent(Char)(in Char[] uriComponent)
if (isSomeChar!Char)
{
    import std.utf : toUTF32;
    auto s = toUTF32(uriComponent);
    return URI_Encode(s, URI_Alpha | URI_Digit | URI_Mark);
}

/* Encode associative array using www-form-urlencoding
 *
 * Keys and values are each passed through encodeComponent, joined with '=',
 * and the pairs are joined with '&' in the associative array's iteration
 * order.
 *
 * Params:
 *      values = an associative array containing the values to be encoded.
 *
 * Returns:
 *      A string encoded using www-form-urlencoding; the empty string when
 *      values is empty.
 */
package string urlEncode(in string[string] values)
{
    if (values.length == 0)
        return "";

    import std.array : Appender;
    import std.format : formattedWrite;

    Appender!string enc;
    enc.reserve(values.length * 128);

    bool first = true;
    foreach (k, v; values)
    {
        if (!first)
            enc.put('&');
        formattedWrite(enc, "%s=%s", encodeComponent(k), encodeComponent(v));
        first = false;
    }
    return enc.data;
}

@system unittest
{
    // @system because urlEncode -> encodeComponent -> URI_Encode
    // URI_Encode uses alloca and pointer slicing
    string[string] a;
    assert(urlEncode(a) == "");
    assert(urlEncode(["name1" : "value1"]) == "name1=value1");
    auto enc = urlEncode(["name1" : "value1", "name2" : "value2"]);
    assert(enc == "name1=value1&name2=value2" || enc == "name2=value2&name1=value1");
}

/***************************
 * Does string s[] start with a URL?
 *
 * Only the prefixes "http://" and "https://" (compared case-insensitively)
 * are recognised, and at least one '.' must follow the prefix.
 *
 * Returns:
 *  -1   it does not
 *  len  it does, and s[0 .. len] is the slice of s[] that is that URL
 */

ptrdiff_t uriLength(Char)(in Char[] s)
if (isSomeChar!Char)
{
    /* Must start with one of:
     *  http://
     *  https://
     * (a bare "www." prefix is not accepted by the checks below)
     */
    import std.ascii : isAlphaNum;
    import std.uni : icmp;

    ptrdiff_t i;

    if (s.length <= 4)
        return -1;

    if (s.length > 7 && icmp(s[0 .. 7], "http://") == 0)
    {
        i = 7;
    }
    else
    {
        if (s.length > 8 && icmp(s[0 .. 8], "https://") == 0)
            i = 8;
        else
            return -1;
    }

    // Advance over the characters accepted as part of the URL, remembering
    // the position of the last '.' seen.
    ptrdiff_t lastdot;
    for (; i < s.length; i++)
    {
        auto c = s[i];
        if (isAlphaNum(c))
            continue;
        if (c == '-' || c == '_' || c == '?' ||
                c == '=' || c == '%' || c == '&' ||
                c == '/' || c == '+' || c == '#' ||
                c == '~' || c == '$')
            continue;
        if (c == '.')
        {
            lastdot = i;
            continue;
        }
        break;
    }
    // No dot after the scheme prefix: not treated as a URL.
    if (!lastdot)
        return -1;

    return i;
}

///
@safe unittest
{
    string s1 = "http://www.digitalmars.com/~fred/fredsRX.html#foo end!";
    assert(uriLength(s1) == 49);
    string s2 = "no uri here";
    assert(uriLength(s2) == -1);
    assert(uriLength("issue 14924") < 0);
}


/***************************
 * Does string s[] start with an email address?
 *
 * The local part must begin with a letter and may continue with letters,
 * digits, '-', '_' and '.'. The part after the '@' may contain letters,
 * digits, '-', '_' and '.', must contain a '.', and must have two or three
 * characters after its last '.'.
 *
 * Returns:
 *  -1    it does not (this includes an empty s)
 *  len   it does, and s[0 .. len] is the slice of s[] that is that email address
 * References:
 *  RFC2822
 */
ptrdiff_t emailLength(Char)(in Char[] s)
if (isSomeChar!Char)
{
    import std.ascii : isAlpha, isAlphaNum;

    ptrdiff_t i;

    // An empty string cannot start with an address; checking here keeps
    // the s[0] access below within bounds.
    if (s.length == 0)
        return -1;

    if (!isAlpha(s[0]))
        return -1;

    for (i = 1; 1; i++)
    {
        if (i == s.length)
            return -1;
        auto c = s[i];
        if (isAlphaNum(c))
            continue;
        if (c == '-' || c == '_' || c == '.')
            continue;
        if (c != '@')
            return -1;
        i++;
        break;
    }

    /* Now do the part past the '@'
     */
    ptrdiff_t lastdot;
    for (; i < s.length; i++)
    {
        auto c = s[i];
        if (isAlphaNum(c))
            continue;
        if (c == '-' || c == '_')
            continue;
        if (c == '.')
        {
            lastdot = i;
            continue;
        }
        break;
    }
    // i - lastdot counts the '.' itself plus the characters after it.
    if (!lastdot || (i - lastdot != 3 && i - lastdot != 4))
        return -1;

    return i;
}

///
@safe unittest
{
    string s1 = "my.e-mail@www.example-domain.com with garbage added";
    assert(emailLength(s1) == 32);
    string s2 = "no email address here";
    assert(emailLength(s2) == -1);
    assert(emailLength("issue 14924") < 0);
    assert(emailLength("") == -1);
}


@system unittest
{
    //@system because of encode -> URI_Encode
    debug(uri) writeln("uri.encodeURI.unittest");

    string source = "http://www.digitalmars.com/~fred/fred's RX.html#foo";
    string target = "http://www.digitalmars.com/~fred/fred's%20RX.html#foo";

    auto result = encode(source);
    debug(uri) writefln("result = '%s'", result);
    assert(result == target);
    result = decode(target);
    debug(uri) writefln("result = '%s'", result);
    assert(result == source);

    result = encode(decode("%E3%81%82%E3%81%82"));
    assert(result == "%E3%81%82%E3%81%82");

    result = encodeComponent("c++");
    assert(result == "c%2B%2B");

    auto str = new char[10_000_000];
    str[] = 'A';
    result = encodeComponent(str);
    foreach (char c; result)
        assert(c == 'A');

    result = decode("%41%42%43");
    debug(uri) writeln(result);

    import std.meta : AliasSeq;
    foreach (StringType; AliasSeq!(char[], wchar[], dchar[], string, wstring, dstring))
    {
        import std.conv : to;
        StringType decoded1 = source.to!StringType;
        string encoded1 = encode(decoded1);
        assert(decoded1 == source.to!StringType); // check that `decoded1` wasn't changed
        assert(encoded1 == target);
        assert(decoded1 == decode(encoded1).to!StringType);

        StringType encoded2 = target.to!StringType;
        string decoded2 = decode(encoded2);
        assert(encoded2 == target.to!StringType); // check that `encoded2` wasn't changed
        assert(decoded2 == source);
        assert(encoded2 == encode(decoded2).to!StringType);
    }
}