implement URIs;

#
# RFC3986, URI Generic Syntax
#

include "sys.m";
	sys: Sys;

include "string.m";
	S: String;

include "uris.m";

# character sets named after the RFC3986 grammar productions
Alpha: con "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
Digit: con "0123456789";

GenDelims: con ":/?#[]@";
SubDelims: con "!$&'()*+,;=";
Reserved: con GenDelims + SubDelims;
HexDigit: con Digit+"abcdefABCDEF";

Escape: con GenDelims+"%";	# "%" must be encoded as %25

Unreserved: con Alpha+Digit+"-._~";

# per-character flag bits stored in ctype: F_Esc is 1, F_Scheme is 2
F_Esc, F_Scheme: con byte(1<<iota);

# flag table indexed by character value; allocated with 256 entries by init
ctype: array of byte;

#
# set flag f in the ctype entry of every character of s
#
classify(s: string, f: byte)
{
	n := len s;
	for(k := 0; k < n; k++){
		ch := s[k];
		ctype[ch] = ctype[ch] | f;
	}
}

#
# load the Sys and String modules and build the ctype table;
# raises an exception string if the String module cannot be loaded
#
init()
{
	sys = load Sys Sys->PATH;
	S = load String String->PATH;
	if(S == nil)
		raise sys->sprint("can't load %s: %r", String->PATH);

	ctype = array [256] of { * => byte 0 };

	# everything up to and including space, and every value from 16r80 up, is escaped
	for(c := 0; c < len ctype; c++)
		if(c <= ' ' || c >= 16r80)
			ctype[c] |= F_Esc;

	# so are the general delimiters and "%"
	classify(Escape, F_Esc);

	# characters allowed in a scheme name
	classify(Alpha+Digit+"+-.", F_Scheme);
}

#	scheme://<user>:<passwd>@<host>:<port>/<path>?<query>#<fragment>
#
#	^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
#
#	delimiters:  :/?#   /?#   ?#   #
#

#
# parse url into its components.
# the scheme is folded to lower case; userinfo, host, path and fragment are
# percent-decoded; port and query are stored as written.  query keeps its
# leading "?" and fragment its leading "#".  when an authority is present the
# path always starts with "/".
#
URI.parse(url: string): ref URI
{
	scheme, userinfo, host, port, path, query, frag: string;

	# scheme: a non-empty run of scheme characters followed by ':'
	# (an empty scheme is rejected so that a leading ':' is not silently dropped)
	for(i := 0; i < len url; i++){
		c := url[i];
		if(c == ':' && i > 0){
			scheme = S->tolower(url[0:i]);
			url = url[i+1:];
			break;
		}
		if(c < 0 || c >= len ctype || (ctype[c] & F_Scheme) == byte 0)
			break;
	}

	if(S->prefix("//", url)){
		authority: string;
		# RFC3986 3.2: the authority ends at the next '/', '?' or '#', or at the end
		(authority, path) = S->splitl(url[2:], "/?#");
		(up, hp) := splitl(authority, "@");
		if(hp == "")
			hp = authority;
		else
			userinfo = up;
		if(hp != nil && hp[0] == '['){	# another rfc hack, for IPv6 addresses, which contain :
			(host, hp) = S->splitstrr(hp, "]");
			if(hp != nil && hp[0] == ':')
				port = hp[1:];
			else
				host += hp;	# put it back
		}else
			(host, port) = splitl(hp, ":");
		# what follows an authority is empty or starts with one of "/?#";
		# give it a root path unless it already has one
		if(path == nil || path[0] != '/')
			path = "/" + path;
	}else
		path = url;
	(path, frag) = S->splitstrl(path, "#");	# includes # in frag
	(path, query) = S->splitstrl(path, "?");	# includes ? in query
	return ref URI(scheme, dec(userinfo), dec(host), port, dec(path), query, dec(frag));
}

#
# split the userinfo at its first ':' into (user, password); the ':' is dropped
#
URI.userpw(u: self ref URI): (string, string)
{
	return splitl(u.userinfo, ":");
}

#
# return the textual form of u, percent-encoding each component as needed;
# the query is emitted exactly as stored
#
URI.text(u: self ref URI): string
{
	s := "";
	if(u.scheme != nil)
		s += u.scheme + ":";
	if(u.hasauthority())
		s += "//" + u.authority();
	return s + enc(u.path, "/@:") + u.query + enc1(u.fragment, "@:/?");
}

#
# return a new URI with the same field values as u
#
URI.copy(u: self ref URI): ref URI
{
	return ref *u;
}

#
# return a copy of u with userinfo, query and fragment cleared
#
URI.pathonly(u: self ref URI): ref URI
{
	v := ref *u;
	v.userinfo = nil;
	v.query = nil;
	v.fragment = nil;
	return v;
}

#
# resolve u against base b and return the result as a new URI.
# if u has a scheme, or b is nil, only dot segments are removed from the path.
#
URI.addbase(u: self ref URI, b: ref URI): ref URI
{
	# RFC3986 5.2.2, rearranged
	r := ref *u;
	if(r.scheme == nil && b != nil){
		r.scheme = b.scheme;
		if(!r.hasauthority()){
			r.userinfo = b.userinfo;
			r.host = b.host;
			r.port = b.port;
			if(r.path == nil){
				r.path = b.path;
				if(r.query == nil)
					r.query = b.query;
			}else if(r.path[0] != '/'){
				# 5.2.3: merge paths
				p1: string;
				if(b.path == "" && b.hasauthority())
					p1 = "/";
				else
					(p1, nil) = S->splitstrr(b.path, "/");	# base path up to and including its last '/'
				r.path = p1 + r.path;
			}
		}
	}
	r.path = removedots(r.path);
	return r;
}

#
# return a copy of u with "." and ".." segments removed from its path
#
URI.nodots(u: self ref URI): ref URI
{
	return u.addbase(nil);
}

#
# true if any of host, userinfo or port is set
#
URI.hasauthority(u: self ref URI): int
{
	return u.host != nil || u.userinfo != nil || u.port != nil;
}

#
# true if u has a scheme
#
URI.isabsolute(u: self ref URI): int
{
	return u.scheme != nil;
}

#
# return the encoded authority, userinfo@host:port, without the leading "//";
# the port is included only when a host is present
#
URI.authority(u: self ref URI): string
{
	s := enc(u.userinfo, ":");
	if(s != nil)
		s += "@";
	if(u.host != nil){
		s += enc(u.host, "[]:");	# assumes : appears inside []; could enforce it
		if(u.port != nil)
			s += ":" + enc(u.port,nil);
	}
	return s;
}

#
# simplified version of procedure in RFC3986 5.2.4:
# it extracts a complete segment from the input first, then analyses it.
# returns s with "." and ".." segments resolved; nil gives "".
#
removedots(s: string): string
{
	if(s == nil)
		return "";
	out := "";
	for(p := 0; p < len s;){
		# extract the next segment and any preceding /
		q := p;
		if(s[p] == '/')
			p++;
		while(p < len s && s[p] != '/')
			p++;
		seg := s[q: p];
		e := p;
		if(e < len s)
			e++;
		case s[q: e] {	# includes any following /
		"../" or "./" =>
			p = e;	# 5.2.4 2A: the prefix is removed together with its slash
		"/./" or "/." =>
			if(p >= len s)
				s += "/";	# a trailing "/." leaves "/" to be output
		"/../" or "/.." =>
			if(p >= len s)
				s += "/";	# a trailing "/.." leaves "/" to be output
			if(out != nil){
				# drop the last output segment and its preceding /
				for(q = len out; --q > 0 && out[q] != '/';)
					{}	# skip
				out = out[0: q];
			}
		"." or ".." =>
			;	# null effect
		* =>	# including "/"
			out += seg;
		}
	}
	return out;
}

#
# similar to splitstrl but trims the matched character from the result
#
splitl(s, c: string): (string, string)
{
	(a, b) := S->splitstrl(s, c);
	if(b != "")
		b = b[1:];
	return (a, b);
}

#
# value of the two hexadecimal digits that start s,
# or -1 if s is too short or either character is not a hex digit
#
hex2(s: string): int
{
	n := 0;
	for(i := 0; i < 2; i++){
		if(i >= len s)
			return -1;
		n <<= 4;
		case c := s[i] {
		'0' to '9' =>
			n += c-'0';
		'a' to 'f' =>
			n += 10+(c-'a');
		'A' to 'F' =>
			n += 10+(c-'A');
		* =>
			return -1;
		}
	}
	return n;
}

#
# percent-decode s.  each %XX becomes the byte 16rXX and the resulting bytes
# are converted back to a string; a '%' not followed by two hex digits is kept
# as it is, and so is "%00", because only values above zero are accepted.
# a literal zero character is replaced by a space.
#
dec(s: string): string
{
	# nothing to do unless s has a '%' or a zero character
	for(i := 0;; i++){
		if(i >= len s)
			return s;
		if(s[i] == '%' || s[i] == 0)
			break;
	}
	t := s[0:i];
	a := array[Sys->UTFmax*len s] of byte;	# upper bound
	o := 0;
	while(i < len s){
		c := s[i++];
		if(c < 16r80){
			case c {
			'%' =>
				if((v := hex2(s[i:])) > 0){
					c = v;
					i += 2;
				}
			0 =>
				c = ' ';	# shouldn't happen
			}
			a[o++] = byte c;
		}else
			o += sys->char2byte(c, a, o);	# string contained Unicode
	}
	return t + string a[0:o];
}

#
# like enc, but the first character of s is copied unencoded
# (URI.text uses it for the fragment, which starts with "#")
#
enc1(s: string, safe: string): string
{
	if(len s > 1)
		return s[0:1] + enc(s[1:], safe);
	return s;
}

#
# percent-encode s; characters listed in safe are left alone.
# encoding depends on context (eg, &=/: not escaped in `query' string)
#
enc(s: string, safe: string): string
{
	# find the first character that needs encoding
	for(i := 0;; i++){
		if(i >= len s)
			return s;	# use as-is
		c := s[i];
		if(c >= 16r80 || (ctype[c] & F_Esc) != byte 0 && !S->in(c, safe))
			break;
	}
	t := s[0: i];
	b := array of byte s[i:];	# encode the remainder byte by byte
	for(i = 0; i < len b; i++){
		c := int b[i];
		if((ctype[c] & F_Esc) != byte 0 && !S->in(c, safe))
			t += sys->sprint("%%%.2X", c);
		else
			t[len t] = c;
	}
	return t;
}

#
# true if u and v agree in every component except the fragment; false if v is nil
#
URI.eq(u: self ref URI, v: ref URI): int
{
	if(v == nil)
		return 0;
	return u.scheme == v.scheme && u.userinfo == v.userinfo &&
		u.host == v.host && u.port == v.port && u.path == v.path &&	# path might need canon
		u.query == v.query;	# not fragment
}

#
# true if u and v agree in every component, fragment included
#
URI.eqf(u: self ref URI, v: ref URI): int
{
	return u.eq(v) && u.fragment == v.fragment;
}