xref: /inferno-os/appl/lib/w3c/uris.b (revision 2b69dba5038ffd0b59cf30a4c44bce549e5097f8)
1implement URIs;
2
3#
4# RFC3986, URI Generic Syntax
5#
6
7include "sys.m";
8	sys: Sys;
9
10include "string.m";
11	S: String;
12
13include "uris.m";
14
# Character sets used below; names follow RFC3986 terminology.
Alpha: con "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
Digit: con "0123456789";

GenDelims: con ":/?#[]@";
SubDelims: con "!$&'()*+,;=";
Reserved: con GenDelims + SubDelims;
HexDigit: con Digit+"abcdefABCDEF";

# characters that enc() escapes unless the caller lists them as safe
Escape: con GenDelims+"%";	# "%" must be encoded as %25

Unreserved: con Alpha+Digit+"-._~";

# flag bits stored in ctype: F_Esc is 1<<0, F_Scheme is 1<<1
F_Esc, F_Scheme: con byte(1<<iota);

# per-character flag table; allocated (256 entries) and filled in by init()
ctype: array of byte;
30
# Mark every character of s with flag bit(s) f in the ctype table.
# ctype must already be allocated, and each character of s is used
# directly as an index into it.
classify(s: string, f: byte)
{
	n := len s;
	k := 0;
	while(k < n){
		ch := s[k];
		ctype[ch] = ctype[ch] | f;
		k++;
	}
}
36
# Load the modules this file depends on and build the ctype table.
# Raises a string exception if the String module cannot be loaded.
init()
{
	sys = load Sys Sys->PATH;
	S = load String String->PATH;
	if(S == nil)
		raise sys->sprint("can't load %s: %r", String->PATH);

	ctype = array [256] of { * => byte 0 };

	# always-escaped characters: the Escape set, everything up to and
	# including space, and every value with the top bit set
	classify(Escape, F_Esc);
	for(c := 0; c < len ctype; c++){
		if(c <= ' ' || c >= 16r80)
			ctype[c] |= F_Esc;
	}

	# characters accepted while scanning for a scheme
	classify(Alpha+Digit+"+-.", F_Scheme);
}
52
#      scheme://<user>:<passwd>@<host>:<port>/<path>?<query>#<fragment>
#
#      ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
#
#	delimiters:  :/?#  /?#  ?#  #
#
# Split url into its components and return them as a new URI.
# - scheme is lower-cased and must be non-empty: a leading ":" is not
#   treated as a scheme separator and stays in the path
# - the authority ends at the first "/", "?" or "#", as in the pattern above
# - an absolute URI with an authority always gets a path starting with "/"
# - query keeps its leading "?" and fragment its leading "#"
# - userinfo, host, path and fragment are percent-decoded; port and query are not
#
URI.parse(url: string): ref URI
{
	scheme, userinfo, host, port, path, query, frag: string;
	for(i := 0; i < len url; i++){
		c := url[i];
		if(c == ':' && i > 0){	# i > 0: an empty scheme is not a scheme
			scheme = S->tolower(url[0:i]);
			url = url[i+1:];
			break;
		}
		if(c < 0 || c >= len ctype || (ctype[c] & F_Scheme) == byte 0)
			break;
	}

	if(S->prefix("//", url)){
		authority: string;
		# authority stops at the first of / ? # (not only at /), so that
		# "//host?q" and "//host#f" do not swallow the query or fragment
		(authority, path) = S->splitl(url[2:], "/?#");
		# split on "@" directly so that "user@" (empty host) is still
		# recognised as having userinfo
		(up, hp) := S->splitstrl(authority, "@");
		if(hp == nil)
			hp = authority;
		else{
			userinfo = up;
			hp = hp[1:];	# drop the "@"
		}
		if(hp != nil && hp[0] == '['){	# another rfc hack, for IPv6 addresses, which contain :
			(host, hp) = S->splitstrr(hp, "]");
			if(hp != nil && hp[0] == ':')
				port = hp[1:];
			else
				host += hp;	# put it back
		}else
			(host, port) = splitl(hp, ":");
		# normalise an empty path to "/", keeping any query/fragment that follows
		if(path == nil || path[0] != '/')
			path = "/" + path;
	}else
		path = url;
	(path, frag) = S->splitstrl(path, "#");		# includes # in frag
	(path, query) = S->splitstrl(path, "?");	#  includes ? in query
	return ref URI(scheme, dec(userinfo), dec(host), port, dec(path), query, dec(frag));
}
97
# Split the userinfo component at its first ":" and return the two
# parts; the second is empty when there is no ":".
URI.userpw(u: self ref URI): (string, string)
{
	(user, pw) := splitl(u.userinfo, ":");
	return (user, pw);
}
102
# Reassemble the URI as a string, percent-encoding path and fragment.
# The query is emitted exactly as stored (with its leading "?").
URI.text(u: self ref URI): string
{
	head := "";
	if(u.scheme != nil)
		head = u.scheme + ":";
	if(u.hasauthority())
		head += "//" + u.authority();
	path := enc(u.path, "/@:");
	frag := enc1(u.fragment, "@:/?");	# enc1 leaves the leading "#" alone
	return head + path + u.query + frag;
}
112
# Return a newly allocated URI holding the same field values as u.
URI.copy(u: self ref URI): ref URI
{
	dup := ref *u;
	return dup;
}
117
# Return a copy of u with userinfo, query and fragment cleared;
# scheme, host, port and path are kept.
URI.pathonly(u: self ref URI): ref URI
{
	stripped := u.copy();
	stripped.fragment = nil;
	stripped.query = nil;
	stripped.userinfo = nil;
	return stripped;
}
126
# Resolve u against base b (RFC3986 5.2.2, rearranged) and return the
# result as a new URI; u itself is not modified.
# When u already has a scheme, or b is nil, only dot segments are removed.
URI.addbase(u: self ref URI, b: ref URI): ref URI
{
	r := ref *u;
	inherit := r.scheme == nil && b != nil;
	if(inherit)
		r.scheme = b.scheme;
	if(inherit && !r.hasauthority()){
		# no authority of its own: take the base's
		r.userinfo = b.userinfo;
		r.host = b.host;
		r.port = b.port;
		if(r.path == nil){
			# empty path: use the base path, and its query unless u has one
			r.path = b.path;
			if(r.query == nil)
				r.query = b.query;
		}else if(r.path[0] != '/'){
			# 5.2.3: merge paths
			dir: string;
			if(b.path == "" && b.hasauthority())
				dir = "/";
			else
				(dir, nil) = S->splitstrr(b.path, "/");	# base path up to and including its last /
			r.path = dir + r.path;
		}
	}
	r.path = removedots(r.path);
	return r;
}
154
# Return a copy of u with "." and ".." segments removed from its path
# (addbase with no base does exactly that).
URI.nodots(u: self ref URI): ref URI
{
	cleaned := u.addbase(nil);
	return cleaned;
}
159
# Return 1 if any authority component (host, userinfo or port) is set, else 0.
URI.hasauthority(u: self ref URI): int
{
	if(u.host != nil)
		return 1;
	if(u.userinfo != nil)
		return 1;
	if(u.port != nil)
		return 1;
	return 0;
}
164
# Return 1 if u has a scheme, else 0.
URI.isabsolute(u: self ref URI): int
{
	if(u.scheme == nil)
		return 0;
	return 1;
}
169
# Build the authority text "[userinfo@]host[:port]", percent-encoded.
# The port is only emitted when a host is present.
URI.authority(u: self ref URI): string
{
	auth := enc(u.userinfo, ":");
	if(auth != nil)
		auth += "@";
	if(u.host == nil)
		return auth;
	auth += enc(u.host, "[]:");	# assumes : appears inside []; could enforce it
	if(u.port != nil)
		auth += ":" + enc(u.port,nil);
	return auth;
}
182
#
# simplified version of procedure in RFC3986 5.2.4:
# it extracts a complete segment from the input first, then analyses it
#
# Returns s with "." and ".." segments resolved; a nil s yields "".
# A segment is an optional leading "/" followed by everything up to
# (not including) the next "/".  Only the first segment of a relative
# path can lack the leading "/".
#
removedots(s: string): string
{
	if(s == nil)
		return "";
	out := "";
	for(p := 0; p < len s;){
		# extract the first segment and any preceding /
		q := p;
		if(s[p] == '/')
			p++;
		while(p < len s && s[p] != '/')
			p++;
		seg := s[q: p];
		# e also covers the "/" that follows the segment, if there is one
		e := p;
		if(e < len s)
			e++;
		case s[q: e] {	# includes any following /
		"../" or "./" =>
			# leading relative dot segment: drop it together with its "/"
			p = e;
		"/./" or "/." =>
			# a trailing "/." stands for "/": append one so the next pass emits it
			if(p >= len s)
				s += "/";
		"/../" or "/.." =>
			if(p >= len s)
				s += "/";
			if(out != nil){
				# remove the last output segment and its preceding /
				for(q = len out; --q > 0 && out[q] != '/';)
					{}	# skip
				out = out[0: q];
			}
		"." or ".." =>	;	# null effect
		* =>		# including "/"
			out += seg;
		}
	}
	return out;
}
222
#
# like S->splitstrl, but the first character of the second part (the
# start of the matched separator c) is trimmed from the result
#
splitl(s, c: string): (string, string)
{
	(head, tail) := S->splitstrl(s, c);
	if(tail == "")
		return (head, tail);
	return (head, tail[1:]);
}
233
# Convert the first two characters of s, taken as hexadecimal digits
# (either case), to their value 0..255.
# Returns -1 if s is shorter than two characters or either is not a hex digit.
hex2(s: string): int
{
	if(len s < 2)
		return -1;
	val := 0;
	for(k := 0; k < 2; k++){
		ch := s[k];
		d: int;
		if(ch >= '0' && ch <= '9')
			d = ch - '0';
		else if(ch >= 'a' && ch <= 'f')
			d = 10 + (ch - 'a');
		else if(ch >= 'A' && ch <= 'F')
			d = 10 + (ch - 'A');
		else
			return -1;
		val = (val << 4) + d;
	}
	return val;
}
254
# Percent-decode s.
# Returns s itself when it contains neither "%" nor a zero character.
# Otherwise each valid %XX with a non-zero value becomes that byte, and
# the resulting byte sequence is converted back to a string.  A "%" not
# followed by two hex digits, or "%00", is left as a literal "%".
dec(s: string): string
{
	n := len s;
	start := 0;
	while(start < n && s[start] != '%' && s[start] != 0)
		start++;
	if(start >= n)
		return s;	# nothing to decode

	buf := array[Sys->UTFmax*len s] of byte;	# upper bound
	used := 0;
	for(k := start; k < n;){
		ch := s[k++];
		if(ch >= 16r80){
			used += sys->char2byte(ch, buf, used);	# string contained Unicode
			continue;
		}
		if(ch == '%'){
			v := hex2(s[k:]);
			if(v > 0){
				ch = v;
				k += 2;
			}
		}else if(ch == 0)
			ch = ' ';	# shouldn't happen
		buf[used++] = byte ch;
	}
	return s[0:start] + string buf[0:used];
}
284
# Encode s as enc() does, but pass its first character through
# untouched.  Strings of length 0 or 1 are returned as they are.
enc1(s: string, safe: string): string
{
	if(len s <= 1)
		return s;
	return s[0:1] + enc(s[1:], safe);
}
291
# encoding depends on context (eg, &=/: not escaped in `query' string)
#
# Percent-encode s.  A character is escaped when it is flagged F_Esc in
# ctype and does not appear in safe; characters >= 16r80 are converted
# to bytes first and each byte is escaped.  Returns s itself when
# nothing needs escaping.
enc(s: string, safe: string): string
{
	n := len s;
	k := 0;
	for(; k < n; k++){
		ch := s[k];
		if(ch >= 16r80)
			break;
		if((ctype[ch] & F_Esc) != byte 0 && !S->in(ch, safe))
			break;
	}
	if(k >= n)
		return s;	# use as-is

	out := s[0: k];
	raw := array of byte s[k:];
	for(j := 0; j < len raw; j++){
		b := int raw[j];
		if((ctype[b] & F_Esc) == byte 0 || S->in(b, safe))
			out[len out] = b;
		else
			out += sys->sprint("%%%.2X", b);
	}
	return out;
}
313
# Return 1 if u and v agree in every component except the fragment,
# 0 otherwise (including when v is nil).  Components are compared as
# stored; no normalisation is applied.
URI.eq(u: self ref URI, v: ref URI): int
{
	if(v == nil)
		return 0;
	if(u.scheme != v.scheme)
		return 0;
	if(u.userinfo != v.userinfo)
		return 0;
	if(u.host != v.host)
		return 0;
	if(u.port != v.port)
		return 0;
	if(u.path != v.path)	# path might need canon
		return 0;
	return u.query == v.query;	# not fragment
}
322
# Like eq, but the fragments must match as well.
URI.eqf(u: self ref URI, v: ref URI): int
{
	if(!u.eq(v))
		return 0;
	return u.fragment == v.fragment;
}
327