xref: /plan9/sys/src/libhtml/utils.c (revision 9a747e4fd48b9f4522c70c07e8f882a15030f964)
1 #include <u.h>
2 #include <libc.h>
3 #include <draw.h>
4 #include <html.h>
5 #include "impl.h"
6 
7 Rune* whitespace = L" \t\n\r";
8 Rune* notwhitespace = L"^ \t\n\r";
9 
10 // All lists start out like List structure.
11 // List itself can be used as list of int.
12 int
13 _listlen(List* l)
14 {
15 	int n = 0;
16 
17 	while(l != nil) {
18 		l = l->next;
19 		n++;
20 	}
21 	return n;
22 }
23 
24 // Cons
25 List*
26 _newlist(int val, List* rest)
27 {
28 	List* ans;
29 
30 	ans = (List*)emalloc(sizeof(List));
31 	ans->val = val;
32 	ans->next = rest;
33 	return ans;
34 }
35 
36 // Reverse a list in place
37 List*
38 _revlist(List* l)
39 {
40 	List* newl;
41 	List* nextl;
42 
43 	newl = nil;
44 	while(l != nil) {
45 		nextl = l->next;
46 		l->next = newl;
47 		newl = l;
48 		l = nextl;
49 	}
50 	return newl;
51 }
52 
53 // The next few routines take a "character class" as argument.
54 //    e.g., "a-zA-Z", or "^ \t\n"
// (ranges are indicated by - except in first or last position;
//  ^ in first position means "not in" the following class)
57 
58 // Splitl splits s[0:n] just before first character of class cl.
59 // Answers go in (p1, n1) and (p2, n2).
60 // If no split, the whole thing goes in the first component.
61 // Note: answers contain pointers into original string.
62 void
63 _splitl(Rune* s, int n, Rune* cl, Rune** p1, int* n1, Rune** p2, int* n2)
64 {
65 	Rune* p;
66 
67 	p = _Strnclass(s, cl, n);
68 	*p1 = s;
69 	if(p == nil) {
70 		*n1 = n;
71 		*p2 = nil;
72 		*n2 = 0;
73 	}
74 	else {
75 		*p2 = p;
76 		*n1 = p-s;
77 		*n2 = n-*n1;
78 	}
79 }
80 
81 // Splitr splits s[0:n] just after last character of class cl.
82 // Answers go in (p1, n1) and (p2, n2).
83 // If no split, the whole thing goes in the last component.
84 // Note: answers contain pointers into original string.
85 void
86 _splitr(Rune* s, int n, Rune* cl, Rune** p1, int* n1, Rune** p2, int* n2)
87 {
88 	Rune* p;
89 
90 	p = _Strnrclass(s, cl, n);
91 	if(p == nil) {
92 		*p1 = nil;
93 		*n1 = 0;
94 		*p2 = s;
95 		*n2 = n;
96 	}
97 	else {
98 		*p1 = s;
99 		*p2 = p+1;
100 		*n1 = *p2-s;
101 		*n2 = n-*n1;
102 	}
103 }
104 
105 // Splitall splits s[0:n] into parts that are separated by characters from class cl.
106 // Each part will have nonzero length.
107 // At most alen parts are found, and pointers to their starts go into
108 // the strarr array, while their lengths go into the lenarr array.
109 // The return value is the number of parts found.
110 int
111 _splitall(Rune* s, int n, Rune* cl, Rune** strarr, int* lenarr, int alen)
112 {
113 	int i;
114 	Rune* p;
115 	Rune* q;
116 	Rune* slast;
117 
118 	if(s == nil || n == 0)
119 		return 0;
120 	i = 0;
121 	p = s;
122 	slast = s+n;
123 	while(p < slast && i < alen) {
124 		while(p < slast && _inclass(*p, cl))
125 			p++;
126 		if(p == slast)
127 			break;
128 		q = _Strnclass(p, cl, slast-p);
129 		if(q == nil)
130 			q = slast;
131 		assert(q > p && q <= slast);
132 		strarr[i] = p;
133 		lenarr[i] = q-p;
134 		i++;
135 		p = q;
136 	}
137 	return i;
138 }
139 
140 // Find part of s that excludes leading and trailing whitespace,
141 // and return that part in *pans (and its length in *panslen).
142 void
143 _trimwhite(Rune* s, int n, Rune** pans, int* panslen)
144 {
145 	Rune* p;
146 	Rune* q;
147 
148 	p = nil;
149 	if(n > 0) {
150 		p = _Strnclass(s, notwhitespace, n);
151 		if(p != nil) {
152 			q = _Strnrclass(s, notwhitespace, n);
153 			assert(q != nil);
154 			n = q+1-p;
155 		}
156 	}
157 	*pans = p;
158 	*panslen = n;
159 }
160 
161 // _Strclass returns a pointer to the first element of s that is
162 // a member of class cl, nil if none.
163 Rune*
164 _Strclass(Rune* s, Rune* cl)
165 {
166 	Rune* p;
167 
168 	for(p = s; *p != 0; p++)
169 		if(_inclass(*p, cl))
170 			return p;
171 	return nil;
172 }
173 
174 // _Strnclass returns a pointer to the first element of s[0:n] that is
175 // a member of class cl, nil if none.
176 Rune*
177 _Strnclass(Rune* s, Rune* cl, int n)
178 {
179 	Rune* p;
180 
181 	for(p = s; n-- && *p != 0; p++)
182 		if(_inclass(*p, cl))
183 			return p;
184 	return nil;
185 }
186 
187 // _Strrclass returns a pointer to the last element of s that is
188 // a member of class cl, nil if none
189 Rune*
190 _Strrclass(Rune* s, Rune* cl)
191 {
192 	Rune* p;
193 
194 	if(s == nil || *s == 0)
195 		return nil;
196 	p = s + runestrlen(s) - 1;
197 	while(p >= s) {
198 		if(_inclass(*p, cl))
199 			return p;
200 		p--;
201 	};
202 	return nil;
203 }
204 
205 // _Strnrclass returns a pointer to the last element of s[0:n] that is
206 // a member of class cl, nil if none
207 Rune*
208 _Strnrclass(Rune* s, Rune* cl, int n)
209 {
210 	Rune* p;
211 
212 	if(s == nil || *s == 0 || n == 0)
213 		return nil;
214 	p = s + n - 1;
215 	while(p >= s) {
216 		if(_inclass(*p, cl))
217 			return p;
218 		p--;
219 	};
220 	return nil;
221 }
222 
223 // Is c in the class cl?
224 int
225 _inclass(Rune c, Rune* cl)
226 {
227 	int	n;
228 	int	ans;
229 	int	negate;
230 	int	i;
231 
232 	n = runestrlen(cl);
233 	if(n == 0)
234 		return 0;
235 	ans = 0;
236 	negate = 0;
237 	if(cl[0] == '^') {
238 		negate = 1;
239 		cl++;
240 		n--;
241 	}
242 	for(i = 0; i < n; i++) {
243 		if(cl[i] == '-' && i > 0 && i < n - 1) {
244 			if(c >= cl[i - 1] && c <= cl[i + 1]) {
245 				ans = 1;
246 				break;
247 			}
248 			i++;
249 		}
250 		else if(c == cl[i]) {
251 			ans = 1;
252 			break;
253 		}
254 	}
255 	if(negate)
256 		ans = !ans;
257 	return ans;
258 }
259 
260 // Is pre a prefix of s?
261 int
262 _prefix(Rune* pre, Rune* s)
263 {
264 	int	ns;
265 	int	n;
266 	int	k;
267 
268 	ns = runestrlen(s);
269 	n = runestrlen(pre);
270 	if(ns < n)
271 		return 0;
272 	for(k = 0; k < n; k++) {
273 		if(pre[k] != s[k])
274 			return 0;
275 	}
276 	return 1;
277 }
278 
279 // Like Strcmp, but use exactly n chars of s1 (assume s1 has at least n chars).
280 // Also, do a case-insensitive match, assuming s2
281 // has no chars in [A-Z], only their lowercase versions.
282 // (This routine is used for in-place keyword lookup, where s2 is in a keyword
283 // list and s1 is some substring, possibly mixed-case, in a buffer.)
284 int
285 _Strncmpci(Rune *s1, int n1, Rune *s2)
286 {
287 	Rune c1, c2;
288 
289 	for(;;) {
290 		if(n1-- == 0) {
291 			if(*s2 == 0)
292 				return 0;
293 			return -1;
294 		}
295 		c1 = *s1++;
296 		c2 = *s2++;
297 		if(c1 >= 'A' && c1 <= 'Z')
298 			c1 = c1 - 'A' + 'a';
299 		if(c1 != c2) {
300 			if(c1 > c2)
301 				return 1;
302 			return -1;
303 		}
304 	}
305 }
306 
307 // emalloc and copy
308 Rune*
309 _Strdup(Rune* s)
310 {
311 	if(s == nil)
312 		return nil;
313 	return _Strndup(s, runestrlen(s));
314 }
315 
316 // emalloc and copy n chars of s (assume s is at least that long),
317 // and add 0 terminator.
318 // Return nil if n==0.
319 Rune*
320 _Strndup(Rune* s, int n)
321 {
322 	Rune* ans;
323 
324 	if(n <= 0)
325 		return nil;
326 	ans = _newstr(n);
327 	memmove(ans, s, n*sizeof(Rune));
328 	ans[n] = 0;
329 	return ans;
330 }
331 // emalloc enough room for n Runes, plus 1 null terminator.
332 // (Not initialized to anything.)
333 Rune*
334 _newstr(int n)
335 {
336 	return (Rune*)emalloc((n+1)*sizeof(Rune));
337 }
338 
339 // emalloc and copy s+t
340 Rune*
341 _Strdup2(Rune* s, Rune* t)
342 {
343 	int ns, nt;
344 	Rune* ans;
345 	Rune* p;
346 
347 	ns = runestrlen(s);
348 	nt = runestrlen(t);
349 	if(ns+nt == 0)
350 		return nil;
351 	ans = _newstr(ns+nt);
352 	p = _Stradd(ans, s, ns);
353 	p = _Stradd(p, t, nt);
354 	*p = 0;
355 	return ans;
356 }
357 
358 // Return emalloc'd substring s[start:stop],
359 Rune*
360 _Strsubstr(Rune* s, int start, int stop)
361 {
362 	Rune* t;
363 
364 	if(start == stop)
365 		return nil;
366 	t = _Strndup(s+start, stop-start);
367 	return t;
368 }
369 
370 // Copy n chars to s1 from s2, and return s1+n
371 Rune*
372 _Stradd(Rune* s1, Rune* s2, int n)
373 {
374 	if(n == 0)
375 		return s1;
376 	memmove(s1, s2, n*sizeof(Rune));
377 	return s1+n;
378 }
379 
380 // Like strtol, but converting from Rune* string
381 
382 #define LONG_MAX	2147483647L
383 #define LONG_MIN	-2147483648L
384 
385 long
386 _Strtol(Rune* nptr, Rune** endptr, int base)
387 {
388 	Rune* p;
389 	long n, nn;
390 	int c, ovfl, v, neg, ndig;
391 
392 	p = nptr;
393 	neg = 0;
394 	n = 0;
395 	ndig = 0;
396 	ovfl = 0;
397 
398 	/*
399 	 * White space
400 	 */
401 	for(;;p++){
402 		switch(*p){
403 		case ' ':
404 		case '\t':
405 		case '\n':
406 		case '\f':
407 		case '\r':
408 		case '\v':
409 			continue;
410 		}
411 		break;
412 	}
413 
414 	/*
415 	 * Sign
416 	 */
417 	if(*p=='-' || *p=='+')
418 		if(*p++ == '-')
419 			neg = 1;
420 
421 	/*
422 	 * Base
423 	 */
424 	if(base==0){
425 		if(*p != '0')
426 			base = 10;
427 		else{
428 			base = 8;
429 			if(p[1]=='x' || p[1]=='X'){
430 				p += 2;
431 				base = 16;
432 			}
433 		}
434 	}else if(base==16 && *p=='0'){
435 		if(p[1]=='x' || p[1]=='X')
436 			p += 2;
437 	}else if(base<0 || 36<base)
438 		goto Return;
439 
440 	/*
441 	 * Non-empty sequence of digits
442 	 */
443 	for(;; p++,ndig++){
444 		c = *p;
445 		v = base;
446 		if('0'<=c && c<='9')
447 			v = c - '0';
448 		else if('a'<=c && c<='z')
449 			v = c - 'a' + 10;
450 		else if('A'<=c && c<='Z')
451 			v = c - 'A' + 10;
452 		if(v >= base)
453 			break;
454 		nn = n*base + v;
455 		if(nn < n)
456 			ovfl = 1;
457 		n = nn;
458 	}
459 
460     Return:
461 	if(ndig == 0)
462 		p = nptr;
463 	if(endptr)
464 		*endptr = p;
465 	if(ovfl){
466 		if(neg)
467 			return LONG_MIN;
468 		return LONG_MAX;
469 	}
470 	if(neg)
471 		return -n;
472 	return n;
473 }
474 
475 // Convert buf[0:n], bytes whose character set is chset,
476 // into a emalloc'd null-terminated Unicode string.
477 Rune*
478 toStr(uchar* buf, int n, int chset)
479 {
480 	int i;
481 	int m;
482 	Rune ch;
483 	Rune* ans;
484 
485 	switch(chset) {
486 	case US_Ascii:
487 	case ISO_8859_1:
488 		ans = (Rune*)emalloc((n+1)*sizeof(Rune));
489 		for(i = 0; i < n; i++)
490 			ans[i] = buf[i];
491 		ans[n] = 0;
492 		break;
493 
494 	case UTF_8:
495 		m = 0;
496 		for(i = 0; i < n; ) {
497 			i += chartorune(&ch, (char*)(buf+i));
498 			m++;
499 		}
500 		ans = (Rune*)emalloc((m+1)*sizeof(Rune));
501 		m = 0;
502 		for(i = 0; i < n; ) {
503 			i += chartorune(&ch, (char*)(buf+i));
504 			ans[m++] = ch;
505 		}
506 		ans[m] = 0;
507 		break;
508 
509 	default:
510 		ans = nil;
511 		assert(0);
512 	}
513 	return ans;
514 }
515 
516 // Convert buf[0:n], Unicode characters,
517 // into an emalloc'd null-terminated string in character set chset.
518 // Use 0x80 for unconvertable characters.
519 uchar*
520 fromStr(Rune* buf, int n, int chset)
521 {
522 	uchar* ans;
523 	int i, lim, m;
524 	Rune ch;
525 	uchar* p;
526 	uchar s[UTFmax];
527 
528 	ans = nil;
529 	switch(chset) {
530 	case US_Ascii:
531 	case ISO_8859_1:
532 		ans = (uchar*)emalloc(n+1);
533 		lim = (chset==US_Ascii)? 127 : 255;
534 		for(i = 0; i < n; i++) {
535 			ch = buf[i];
536 			if(ch > lim)
537 				ch = 0x80;
538 			ans[i] = ch;
539 		}
540 		ans[n] = 0;
541 		break;
542 
543 	case UTF_8:
544 		m = 0;
545 		for(i = 0; i < n; i++) {
546 			m += runetochar((char*)s, &buf[i]);
547 		}
548 		ans = (uchar*)emalloc(m+1);
549 		p = ans;
550 		for(i = 0; i < n; i++)
551 			p += runetochar((char*)p, &buf[i]);
552 		*p = 0;
553 		break;
554 
555 	default:
556 		assert(0);
557 	}
558 	return ans;
559 
560 }
561 
562 // Convert n to emalloc'd String.
563 Rune*
564 _ltoStr(int n)
565 {
566 	int m;
567 	uchar buf[20];
568 
569 	m = snprint((char*)buf, sizeof(buf), "%d", n);
570 	return toStr(buf, m, US_Ascii);
571 }
572