xref: /netbsd-src/external/gpl3/gcc.old/dist/libphobos/src/std/uri.d (revision 627f7eb200a4419d89b531d55fccd2ee3ffdcde0)
1 // Written in the D programming language.
2 
3 /**
4  * Encode and decode Uniform Resource Identifiers (URIs).
5  * URIs are used in internet transfer protocols.
6  * Valid URI characters consist of letters, digits,
7  * and the characters $(B ;/?:@&=+$,-_.!~*'())
8  * Reserved URI characters are $(B ;/?:@&=+$,)
9  * Escape sequences consist of $(B %) followed by two hex digits.
10  *
11  * See_Also:
12  *  $(LINK2 http://www.ietf.org/rfc/rfc3986.txt, RFC 3986)<br>
13  *  $(LINK2 http://en.wikipedia.org/wiki/Uniform_resource_identifier, Wikipedia)
14  * Copyright: Copyright Digital Mars 2000 - 2009.
15  * License:   $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
16  * Authors:   $(HTTP digitalmars.com, Walter Bright)
17  * Source:    $(PHOBOSSRC std/_uri.d)
18  */
19 /*          Copyright Digital Mars 2000 - 2009.
20  * Distributed under the Boost Software License, Version 1.0.
21  *    (See accompanying file LICENSE_1_0.txt or copy at
22  *          http://www.boost.org/LICENSE_1_0.txt)
23  */
24 module std.uri;
25 
26 //debug=uri;        // uncomment to turn on debugging writefln's
27 debug(uri) import std.stdio;
28 import std.traits : isSomeChar;
29 
/**
 * Exception type used by this module to report URI encoding and decoding
 * failures.
 *
 * The private `URI_Encode` and `URI_Decode` routines throw it with a
 * descriptive message, for example when a `%` escape is truncated, is not
 * followed by two hexadecimal digits, or does not form a valid multi-byte
 * sequence.
 */
class URIException : Exception
{
    import std.exception : basicExceptionCtors;
    // Constructors are supplied by the std.exception mixin rather than
    // written out by hand; this module only uses the message-taking form.
    mixin basicExceptionCtors;
}
38 
// Character-class bits. Each entry of `uri_flags` is an OR of these, and the
// encode/decode routines are handed an OR of them as the set to test against.
private enum
{
    URI_Alpha    = 1 << 0,  // ASCII letters
    URI_Reserved = 1 << 1,  // ;/?:@&=+$,
    URI_Mark     = 1 << 2,  // -_.!~*'()
    URI_Digit    = 1 << 3,  // 0-9
    URI_Hash     = 1 << 4,  // '#'
}

// Upper-case hexadecimal digit for each nibble value 0 .. 15.
private immutable char[16] hex2ascii = "0123456789ABCDEF";

// Classification table for the 128 ASCII characters, indexed by character
// value and built once at compile time by the function literal below.
private immutable ubyte[128] uri_flags = ()
{
    ubyte[128] table;   // every entry starts out as 0 (no class)

    foreach (offset; 0 .. 26)
    {
        table['A' + offset] |= URI_Alpha;
        table['a' + offset] |= URI_Alpha;
    }
    foreach (digit; '0' .. '9' + 1)
        table[digit] |= URI_Digit;
    foreach (reserved; ";/?:@&=+$,")
        table[reserved] |= URI_Reserved;
    foreach (mark; "-_.!~*'()")
        table[mark] |= URI_Mark;

    table['#'] |= URI_Hash;

    return table;
}();
67 
/*
 * Percent-encodes a UTF-32 string.
 *
 * Params:
 *      str          = the text to encode, one code point per element.
 *      unescapedSet = OR of URI_* class bits. An ASCII character whose
 *                     uri_flags entry shares a bit with this mask is copied
 *                     to the output unchanged; every other code point is
 *                     converted to its UTF-8 octets and each octet is written
 *                     as '%' followed by two upper-case hex digits.
 *
 * Returns:
 *      A newly allocated string holding the encoded text.
 *
 * Throws:
 *      URIException if a code point is larger than 0x1FFFFF.
 */
private string URI_Encode(dstring str, uint unescapedSet)
{
    // The result is built in a small stack buffer and moves to the GC heap
    // only if it outgrows it. Working through a slice (instead of a raw
    // pointer) keeps every write bounds-checked.
    char[50] stackBuf = void;
    char[] result = stackBuf[];
    size_t used;    // number of valid chars in result

    // Guarantees that `extra` more chars fit behind result[0 .. used].
    void ensureRoom(size_t extra)
    {
        if (used + extra <= result.length)
            return;
        // Over-allocate by a factor of two so growth stays amortised.
        auto grown = new char[2 * (used + extra)];
        grown[0 .. used] = result[0 .. used];
        result = grown;
    }

    foreach (immutable dchar c; str)
    {
        // if (c in unescapedSet)
        if (c < uri_flags.length && (uri_flags[c] & unescapedSet))
        {
            ensureRoom(1);
            result[used++] = cast(char) c;
            continue;
        }

        // Transform the code point into its UTF-8 octets.
        char[4] octets = void;
        size_t count;

        if (c <= 0x7F)
        {
            octets[0] = cast(char) c;
            count = 1;
        }
        else if (c <= 0x7FF)
        {
            octets[0] = cast(char)(0xC0 | (c >> 6));
            octets[1] = cast(char)(0x80 | (c & 0x3F));
            count = 2;
        }
        else if (c <= 0xFFFF)
        {
            octets[0] = cast(char)(0xE0 | (c >> 12));
            octets[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
            octets[2] = cast(char)(0x80 | (c & 0x3F));
            count = 3;
        }
        else if (c <= 0x1FFFFF)
        {
            octets[0] = cast(char)(0xF0 | (c >> 18));
            octets[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
            octets[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
            octets[3] = cast(char)(0x80 | (c & 0x3F));
            count = 4;
        }
        else
        {
            throw new URIException("Undefined UTF-32 code point");
        }

        // Each octet becomes three output chars: '%' and two hex digits.
        ensureRoom(count * 3);
        foreach (immutable char octet; octets[0 .. count])
        {
            result[used++] = '%';
            result[used++] = hex2ascii[octet >> 4];
            result[used++] = hex2ascii[octet & 15];
        }
    }

    // Copy out of the (possibly stack-resident) work buffer.
    return result[0 .. used].idup;
}
188 
// Converts one hexadecimal digit character to its numeric value.
// The argument is not validated: the arithmetic is only meaningful for
// '0'-'9', 'A'-'F' and 'a'-'f', so callers check with isHexDigit first.
private uint ascii2hex(dchar c) @nogc @safe pure nothrow
{
    if (c <= '9')
        return c - '0';
    if (c <= 'F')
        return c - 'A' + 10;
    return c - 'a' + 10;
}
195 
/*
 * Decodes the percent-escapes in a URI into UTF-32 text.
 *
 * Params:
 *      uri         = the encoded text (any character width).
 *      reservedSet = OR of URI_* class bits. When an escape decodes to an
 *                    ASCII character whose uri_flags entry shares a bit with
 *                    this mask, the escape is left in the output as written
 *                    instead of being replaced by the character.
 *
 * Returns:
 *      A newly allocated dstring with the decoded text.
 *
 * Throws:
 *      URIException if an escape is truncated, is not followed by two
 *      hexadecimal digits, or does not form an acceptable multi-byte
 *      sequence (bad lead octet, bad continuation octet, or a value above
 *      0x10FFFF).
 *
 * Note: elements of `uri` outside an escape are copied to the output one
 * for one; they are not UTF-decoded here.
 */
private dstring URI_Decode(Char)(in Char[] uri, uint reservedSet)
if (isSomeChar!Char)
{
    import std.ascii : isHexDigit;

    immutable len = uri.length;

    // The output never has more elements than the input, so a buffer of
    // `len` elements always suffices. Short inputs use a 1 KiB stack array,
    // longer ones the GC heap. Indexing a slice keeps accesses bounds-checked
    // and makes the empty-input case need no allocation at all.
    dchar[1024 / dchar.sizeof] stackBuf = void;
    dchar[] R = (len <= stackBuf.length) ? stackBuf[0 .. len] : new dchar[len];
    size_t Rlen;

    for (size_t k = 0; k != len; k++)
    {
        dchar C = uri[k];
        if (C != '%')
        {
            R[Rlen++] = C;
            continue;
        }

        // Remember where the escape starts in case it must be kept verbatim.
        immutable start = k;
        if (k + 2 >= len)
            throw new URIException("Unexpected end of URI");
        if (!isHexDigit(uri[k + 1]) || !isHexDigit(uri[k + 2]))
            throw new URIException("Expected two hexadecimal digits after '%'");
        char B = cast(char)((ascii2hex(uri[k + 1]) << 4) + ascii2hex(uri[k + 2]));
        k += 2;

        if ((B & 0x80) == 0)
        {
            // Single octet: plain ASCII.
            C = B;
        }
        else
        {
            // Count the leading 1 bits of the first octet; that is the
            // number of octets (n) in the sequence.
            uint n;
            for (n = 1; ; n++)
            {
                if (n > 4)
                    throw new URIException("UTF-32 code point size too large");
                if (((B << n) & 0x80) == 0)
                {
                    // A lone continuation octet (10xxxxxx) cannot start a sequence.
                    if (n == 1)
                        throw new URIException("UTF-32 code point size too small");
                    break;
                }
            }

            // Pick off (7 - n) significant bits of B from first byte of octet
            uint V = B & ((1 << (7 - n)) - 1);

            // The remaining n - 1 octets need three characters each ("%XX").
            if (k + (3 * (n - 1)) >= len)
                throw new URIException("UTF-32 unaligned String");
            for (uint j = 1; j != n; j++)
            {
                k++;
                if (uri[k] != '%')
                    throw new URIException("Expected: '%'");
                if (!isHexDigit(uri[k + 1]) || !isHexDigit(uri[k + 2]))
                    throw new URIException("Expected two hexadecimal digits after '%'");
                B = cast(char)((ascii2hex(uri[k + 1]) << 4) + ascii2hex(uri[k + 2]));
                if ((B & 0xC0) != 0x80)
                    throw new URIException("Incorrect UTF-32 multi-byte sequence");
                k += 2;
                V = (V << 6) | (B & 0x3F);
            }
            if (V > 0x10FFFF)
                throw new URIException("Unknown UTF-32 code point");
            C = V;
        }

        if (C < uri_flags.length && (uri_flags[C] & reservedSet))
        {
            // Protected character: emit the escape exactly as it was written.
            foreach (i; start .. k + 1)
                R[Rlen++] = uri[i];
        }
        else
        {
            R[Rlen++] = C;
        }
    }
    assert(Rlen <= R.length);  // enforce our preallocation size guarantee

    // Copy out of the (possibly stack-resident) work buffer.
    return R[0 .. Rlen].idup;
}
309 
/*************************************
 * Decodes the URI string encodedURI and returns the result as a UTF-8 string.
 * An escape sequence that stands for a reserved URI character or for '#'
 * is kept in its escaped form; all other escape sequences are replaced by
 * the character they encode.
 */

string decode(Char)(in Char[] encodedURI)
if (isSomeChar!Char)
{
    import std.utf : encode;

    // URI_Decode yields UTF-32; append each code point to a UTF-8 buffer.
    char[] utf8;
    foreach (dchar codePoint; URI_Decode(encodedURI, URI_Reserved | URI_Hash))
        encode(utf8, codePoint);
    return utf8;
}
326 
/*******************************
 * Decodes the URI component encodedURIComponent and returns the result as a
 * UTF-8 string. Every escape sequence is replaced by the character it
 * encodes; none are protected.
 */

string decodeComponent(Char)(in Char[] encodedURIComponent)
if (isSomeChar!Char)
{
    import std.utf : encode;

    // An empty reserved set (0) means no escape is left untouched.
    char[] utf8;
    foreach (dchar codePoint; URI_Decode(encodedURIComponent, 0))
        encode(utf8, codePoint);
    return utf8;
}
342 
/*****************************
 * Encodes the string uri as a URI and returns it. Letters, digits, the mark
 * characters -_.!~*'(), the reserved characters ;/?:@&=+$, and '#' are
 * passed through unchanged; every other character is escaped.
 */

string encode(Char)(in Char[] uri)
if (isSomeChar!Char)
{
    import std.utf : toUTF32;

    // Character classes that are copied to the output without escaping.
    enum uint passThrough = URI_Reserved | URI_Hash | URI_Alpha | URI_Digit | URI_Mark;
    return URI_Encode(toUTF32(uri), passThrough);
}
355 
/********************************
 * Encodes the string uriComponent as a URI component and returns it.
 * Only letters, digits and the mark characters -_.!~*'() are passed through
 * unchanged; every other character, including the reserved characters and
 * '#', is escaped.
 */

string encodeComponent(Char)(in Char[] uriComponent)
if (isSomeChar!Char)
{
    import std.utf : toUTF32;

    // Character classes that are copied to the output without escaping.
    enum uint passThrough = URI_Alpha | URI_Digit | URI_Mark;
    return URI_Encode(toUTF32(uriComponent), passThrough);
}
368 
/* Encode an associative array using www-form-urlencoding.
 *
 * Each entry becomes "key=value", with key and value passed through
 * encodeComponent, and the entries are joined with '&' in the order the
 * associative array is iterated.
 *
 * Params:
 *      values = an associative array containing the values to be encoded.
 *
 * Returns:
 *      A string encoded using www-form-urlencoding; "" if values is empty.
 */
package string urlEncode(in string[string] values)
{
    import std.array : Appender;

    if (values.length == 0)
        return "";

    Appender!string enc;
    // Up-front size guess of 128 chars per entry to limit reallocations.
    enc.reserve(values.length * 128);

    bool needSeparator = false;
    foreach (key, value; values)
    {
        if (needSeparator)
            enc.put('&');
        needSeparator = true;

        enc.put(encodeComponent(key));
        enc.put('=');
        enc.put(encodeComponent(value));
    }
    return enc.data;
}
398 
// Checks urlEncode for zero, one and two entries.
@system unittest
{
    // @system because urlEncode -> encodeComponent -> URI_Encode
    // URI_Encode uses alloca and pointer slicing
    string[string] a;
    // An empty associative array encodes to the empty string.
    assert(urlEncode(a) == "");
    assert(urlEncode(["name1" : "value1"]) == "name1=value1");
    auto enc = urlEncode(["name1" : "value1", "name2" : "value2"]);
    // Associative-array iteration order is not fixed, so accept either order.
    assert(enc == "name1=value1&name2=value2" || enc == "name2=value2&name1=value1");
}
409 
/***************************
 * Does string s[] start with a URL?
 *
 * A URL is recognised only when s[] begins with "http://" or "https://"
 * (compared case-insensitively) followed by at least one more character.
 * It extends over letters, digits and the characters -_?=%&/+#~$. and must
 * contain at least one '.' after the prefix.
 *
 * Returns:
 *  -1   it does not
 *  len  it does, and s[0 .. len] is the slice of s[] that is that URL
 */

ptrdiff_t uriLength(Char)(in Char[] s)
if (isSomeChar!Char)
{
    import std.ascii : isAlphaNum;
    import std.uni : icmp;

    // Skip the scheme prefix; anything else is not treated as a URL.
    ptrdiff_t pos;
    if (s.length > 7 && icmp(s[0 .. 7], "http://") == 0)
        pos = 7;
    else if (s.length > 8 && icmp(s[0 .. 8], "https://") == 0)
        pos = 8;
    else
        return -1;

    // Advance over every character allowed in the URL, noting the last dot.
    ptrdiff_t lastDot;
    scan:
    for (; pos < s.length; pos++)
    {
        switch (s[pos])
        {
            case '-', '_', '?', '=', '%', '&', '/', '+', '#', '~', '$':
                break;
            case '.':
                lastDot = pos;
                break;
            default:
                if (!isAlphaNum(s[pos]))
                    break scan;     // first character that cannot be part of the URL
                break;
        }
    }

    // lastDot is still 0 when no dot was seen after the prefix.
    return lastDot ? pos : -1;
}
468 
///
@safe unittest
{
    // The URL ends at the space before "end!"; '#' and '~' are part of it.
    string s1 = "http://www.digitalmars.com/~fred/fredsRX.html#foo end!";
    assert(uriLength(s1) == 49);
    // No http:// or https:// prefix, so no URL is found.
    string s2 = "no uri here";
    assert(uriLength(s2) == -1);
    // Regression check: text that merely starts with a letter is rejected.
    assert(uriLength("issue 14924") < 0);
}
478 
479 
/***************************
 * Does string s[] start with an email address?
 *
 * The local part must begin with a letter and may continue with letters,
 * digits and the characters -_. up to an '@'. The domain part may contain
 * letters, digits and the characters -_. and must contain a '.' that is
 * followed by exactly two or three more domain characters.
 *
 * Returns:
 *  -1    it does not (this includes an empty s[])
 *  len   it does, and s[0 .. len] is the slice of s[] that is that email address
 * References:
 *  RFC2822
 */
ptrdiff_t emailLength(Char)(in Char[] s)
if (isSomeChar!Char)
{
    import std.ascii : isAlpha, isAlphaNum;

    // Check the length first so that s[0] is never read from an empty slice.
    if (s.length == 0 || !isAlpha(s[0]))
        return -1;

    ptrdiff_t i;

    // Scan the local part up to and including the '@'.
    for (i = 1; 1; i++)
    {
        if (i == s.length)
            return -1;          // ran out of input before any '@'
        auto c = s[i];
        if (isAlphaNum(c))
            continue;
        if (c == '-' || c == '_' || c == '.')
            continue;
        if (c != '@')
            return -1;
        i++;
        break;
    }

    /* Now do the part past the '@'
     */
    ptrdiff_t lastdot;
    for (; i < s.length; i++)
    {
        auto c = s[i];
        if (isAlphaNum(c))
            continue;
        if (c == '-' || c == '_')
            continue;
        if (c == '.')
        {
            lastdot = i;
            continue;
        }
        break;
    }

    // lastdot is still 0 when the domain has no dot. Otherwise i - lastdot
    // counts the dot plus the characters after it, so 3 or 4 means a final
    // label of two or three characters.
    if (!lastdot || (i - lastdot != 3 && i - lastdot != 4))
        return -1;

    return i;
}
535 
///
@safe unittest
{
    // The address ends at the space after ".com".
    string s1 = "my.e-mail@www.example-domain.com with garbage added";
    assert(emailLength(s1) == 32);
    // No '@' is reached before a disallowed character, so no address is found.
    string s2 = "no email address here";
    assert(emailLength(s2) == -1);
    // Regression check: text that merely starts with a letter is rejected.
    assert(emailLength("issue 14924") < 0);
}
545 
546 
// Round-trip checks for encode/decode/encodeComponent, including a very
// large input and all six string types.
@system unittest
{
    //@system because of encode -> URI_Encode
    debug(uri) writeln("uri.encodeURI.unittest");

    // The only character that needs escaping here is the space.
    string source = "http://www.digitalmars.com/~fred/fred's RX.html#foo";
    string target = "http://www.digitalmars.com/~fred/fred's%20RX.html#foo";

    auto result = encode(source);
    debug(uri) writefln("result = '%s'", result);
    assert(result == target);
    result = decode(target);
    debug(uri) writefln("result = '%s'", result);
    assert(result == source);

    // Two three-octet escape sequences must survive decode followed by encode.
    result = encode(decode("%E3%81%82%E3%81%82"));
    assert(result == "%E3%81%82%E3%81%82");

    // '+' is a reserved character, so encodeComponent escapes it.
    result = encodeComponent("c++");
    assert(result == "c%2B%2B");

    // A 10-million-character input forces the result buffer to grow many times.
    auto str = new char[10_000_000];
    str[] = 'A';
    result = encodeComponent(str);
    foreach (char c; result)
        assert(c == 'A');

    // Only checks that decoding plain ASCII escapes does not throw.
    result = decode("%41%42%43");
    debug(uri) writeln(result);

    import std.meta : AliasSeq;
    // Repeat the round trip for mutable and immutable char, wchar and dchar strings.
    foreach (StringType; AliasSeq!(char[], wchar[], dchar[], string, wstring, dstring))
    {
        import std.conv : to;
        StringType decoded1 = source.to!StringType;
        string encoded1 = encode(decoded1);
        assert(decoded1 == source.to!StringType); // check that `decoded1` wasn't changed
        assert(encoded1 == target);
        assert(decoded1 == decode(encoded1).to!StringType);

        StringType encoded2 = target.to!StringType;
        string decoded2 = decode(encoded2);
        assert(encoded2 == target.to!StringType); // check that `encoded2` wasn't changed
        assert(decoded2 == source);
        assert(encoded2 == encode(decoded2).to!StringType);
    }
}
594