xref: /plan9/sys/src/libhtml/utils.c (revision c93608cc76758b2be624199c6208a0f90bad298d)
1 #include <u.h>
2 #include <libc.h>
3 #include <draw.h>
4 #include <html.h>
5 #include "impl.h"
6 
7 Rune* whitespace = L" \t\n\r";
8 Rune* notwhitespace = L"^ \t\n\r";
9 
10 // All lists start out like List structure.
11 // List itself can be used as list of int.
12 int
_listlen(List * l)13 _listlen(List* l)
14 {
15 	int n = 0;
16 
17 	while(l != nil) {
18 		l = l->next;
19 		n++;
20 	}
21 	return n;
22 }
23 
24 // Cons
25 List*
_newlist(int val,List * rest)26 _newlist(int val, List* rest)
27 {
28 	List* ans;
29 
30 	ans = (List*)emalloc(sizeof(List));
31 	ans->val = val;
32 	ans->next = rest;
33 	return ans;
34 }
35 
36 // Reverse a list in place
37 List*
_revlist(List * l)38 _revlist(List* l)
39 {
40 	List* newl;
41 	List* nextl;
42 
43 	newl = nil;
44 	while(l != nil) {
45 		nextl = l->next;
46 		l->next = newl;
47 		newl = l;
48 		l = nextl;
49 	}
50 	return newl;
51 }
52 
53 // The next few routines take a "character class" as argument.
54 //    e.g., "a-zA-Z", or "^ \t\n"
55 // (ranges indicated by - except in first position;
56 //  ^ is first position means "not in" the following class)
57 
58 // Splitl splits s[0:n] just before first character of class cl.
59 // Answers go in (p1, n1) and (p2, n2).
60 // If no split, the whole thing goes in the first component.
61 // Note: answers contain pointers into original string.
62 void
_splitl(Rune * s,int n,Rune * cl,Rune ** p1,int * n1,Rune ** p2,int * n2)63 _splitl(Rune* s, int n, Rune* cl, Rune** p1, int* n1, Rune** p2, int* n2)
64 {
65 	Rune* p;
66 
67 	p = _Strnclass(s, cl, n);
68 	*p1 = s;
69 	if(p == nil) {
70 		*n1 = n;
71 		*p2 = nil;
72 		*n2 = 0;
73 	}
74 	else {
75 		*p2 = p;
76 		*n1 = p-s;
77 		*n2 = n-*n1;
78 	}
79 }
80 
81 // Splitr splits s[0:n] just after last character of class cl.
82 // Answers go in (p1, n1) and (p2, n2).
83 // If no split, the whole thing goes in the last component.
84 // Note: answers contain pointers into original string.
85 void
_splitr(Rune * s,int n,Rune * cl,Rune ** p1,int * n1,Rune ** p2,int * n2)86 _splitr(Rune* s, int n, Rune* cl, Rune** p1, int* n1, Rune** p2, int* n2)
87 {
88 	Rune* p;
89 
90 	p = _Strnrclass(s, cl, n);
91 	if(p == nil) {
92 		*p1 = nil;
93 		*n1 = 0;
94 		*p2 = s;
95 		*n2 = n;
96 	}
97 	else {
98 		*p1 = s;
99 		*p2 = p+1;
100 		*n1 = *p2-s;
101 		*n2 = n-*n1;
102 	}
103 }
104 
105 // Splitall splits s[0:n] into parts that are separated by characters from class cl.
106 // Each part will have nonzero length.
107 // At most alen parts are found, and pointers to their starts go into
108 // the strarr array, while their lengths go into the lenarr array.
109 // The return value is the number of parts found.
110 int
_splitall(Rune * s,int n,Rune * cl,Rune ** strarr,int * lenarr,int alen)111 _splitall(Rune* s, int n, Rune* cl, Rune** strarr, int* lenarr, int alen)
112 {
113 	int i;
114 	Rune* p;
115 	Rune* q;
116 	Rune* slast;
117 
118 	if(s == nil || n == 0)
119 		return 0;
120 	i = 0;
121 	p = s;
122 	slast = s+n;
123 	while(p < slast && i < alen) {
124 		while(p < slast && _inclass(*p, cl))
125 			p++;
126 		if(p == slast)
127 			break;
128 		q = _Strnclass(p, cl, slast-p);
129 		if(q == nil)
130 			q = slast;
131 		assert(q > p && q <= slast);
132 		strarr[i] = p;
133 		lenarr[i] = q-p;
134 		i++;
135 		p = q;
136 	}
137 	return i;
138 }
139 
140 // Find part of s that excludes leading and trailing whitespace,
141 // and return that part in *pans (and its length in *panslen).
142 void
_trimwhite(Rune * s,int n,Rune ** pans,int * panslen)143 _trimwhite(Rune* s, int n, Rune** pans, int* panslen)
144 {
145 	Rune* p;
146 	Rune* q;
147 
148 	p = nil;
149 	if(n > 0) {
150 		p = _Strnclass(s, notwhitespace, n);
151 		if(p != nil) {
152 			q = _Strnrclass(s, notwhitespace, n);
153 			assert(q != nil);
154 			n = q+1-p;
155 		}
156 	}
157 	*pans = p;
158 	*panslen = n;
159 }
160 
161 // _Strclass returns a pointer to the first element of s that is
162 // a member of class cl, nil if none.
163 Rune*
_Strclass(Rune * s,Rune * cl)164 _Strclass(Rune* s, Rune* cl)
165 {
166 	Rune* p;
167 
168 	for(p = s; *p != 0; p++)
169 		if(_inclass(*p, cl))
170 			return p;
171 	return nil;
172 }
173 
174 // _Strnclass returns a pointer to the first element of s[0:n] that is
175 // a member of class cl, nil if none.
176 Rune*
_Strnclass(Rune * s,Rune * cl,int n)177 _Strnclass(Rune* s, Rune* cl, int n)
178 {
179 	Rune* p;
180 
181 	for(p = s; n-- && *p != 0; p++)
182 		if(_inclass(*p, cl))
183 			return p;
184 	return nil;
185 }
186 
187 // _Strrclass returns a pointer to the last element of s that is
188 // a member of class cl, nil if none
189 Rune*
_Strrclass(Rune * s,Rune * cl)190 _Strrclass(Rune* s, Rune* cl)
191 {
192 	Rune* p;
193 
194 	if(s == nil || *s == 0)
195 		return nil;
196 	p = s + runestrlen(s) - 1;
197 	while(p >= s) {
198 		if(_inclass(*p, cl))
199 			return p;
200 		p--;
201 	};
202 	return nil;
203 }
204 
205 // _Strnrclass returns a pointer to the last element of s[0:n] that is
206 // a member of class cl, nil if none
207 Rune*
_Strnrclass(Rune * s,Rune * cl,int n)208 _Strnrclass(Rune* s, Rune* cl, int n)
209 {
210 	Rune* p;
211 
212 	if(s == nil || *s == 0 || n == 0)
213 		return nil;
214 	p = s + n - 1;
215 	while(p >= s) {
216 		if(_inclass(*p, cl))
217 			return p;
218 		p--;
219 	};
220 	return nil;
221 }
222 
223 // Is c in the class cl?
224 int
_inclass(Rune c,Rune * cl)225 _inclass(Rune c, Rune* cl)
226 {
227 	int	n;
228 	int	ans;
229 	int	negate;
230 	int	i;
231 
232 	n = _Strlen(cl);
233 	if(n == 0)
234 		return 0;
235 	ans = 0;
236 	negate = 0;
237 	if(cl[0] == '^') {
238 		negate = 1;
239 		cl++;
240 		n--;
241 	}
242 	for(i = 0; i < n; i++) {
243 		if(cl[i] == '-' && i > 0 && i < n - 1) {
244 			if(c >= cl[i - 1] && c <= cl[i + 1]) {
245 				ans = 1;
246 				break;
247 			}
248 			i++;
249 		}
250 		else if(c == cl[i]) {
251 			ans = 1;
252 			break;
253 		}
254 	}
255 	if(negate)
256 		ans = !ans;
257 	return ans;
258 }
259 
260 // Is pre a prefix of s?
261 int
_prefix(Rune * pre,Rune * s)262 _prefix(Rune* pre, Rune* s)
263 {
264 	int	ns;
265 	int	n;
266 	int	k;
267 
268 	ns = _Strlen(s);
269 	n = _Strlen(pre);
270 	if(ns < n)
271 		return 0;
272 	for(k = 0; k < n; k++) {
273 		if(pre[k] != s[k])
274 			return 0;
275 	}
276 	return 1;
277 }
278 
279 // Number of runes in (null-terminated) s
280 int
_Strlen(Rune * s)281 _Strlen(Rune* s)
282 {
283 	if(s == nil)
284 		return 0;
285 	return runestrlen(s);
286 }
287 
288 // -1, 0, 1 as s1 is lexicographically less, equal greater than s2
289 int
_Strcmp(Rune * s1,Rune * s2)290 _Strcmp(Rune *s1, Rune *s2)
291 {
292 	if(s1 == nil)
293 		return (s2 == nil || *s2 == 0) ? 0 : -1;
294 	if(s2 == nil)
295 		return (*s1 == 0) ? 0 : 1;
296 	return runestrcmp(s1, s2);
297 }
298 
299 // Like Strcmp, but use exactly n chars of s1 (assume s1 has at least n chars).
300 // Also, do a case-insensitive match, assuming s2
301 // has no chars in [A-Z], only their lowercase versions.
302 // (This routine is used for in-place keyword lookup, where s2 is in a keyword
303 // list and s1 is some substring, possibly mixed-case, in a buffer.)
304 int
_Strncmpci(Rune * s1,int n1,Rune * s2)305 _Strncmpci(Rune *s1, int n1, Rune *s2)
306 {
307 	Rune c1, c2;
308 
309 	for(;;) {
310 		if(n1-- == 0) {
311 			if(*s2 == 0)
312 				return 0;
313 			return -1;
314 		}
315 		c1 = *s1++;
316 		c2 = *s2++;
317 		if(c1 >= 'A' && c1 <= 'Z')
318 			c1 = c1 - 'A' + 'a';
319 		if(c1 != c2) {
320 			if(c1 > c2)
321 				return 1;
322 			return -1;
323 		}
324 	}
325 }
326 
327 // emalloc and copy
328 Rune*
_Strdup(Rune * s)329 _Strdup(Rune* s)
330 {
331 	if(s == nil)
332 		return nil;
333 	return _Strndup(s, runestrlen(s));
334 }
335 
336 // emalloc and copy n chars of s (assume s is at least that long),
337 // and add 0 terminator.
338 // Return nil if n==0.
339 Rune*
_Strndup(Rune * s,int n)340 _Strndup(Rune* s, int n)
341 {
342 	Rune* ans;
343 
344 	if(n <= 0)
345 		return nil;
346 	ans = _newstr(n);
347 	memmove(ans, s, n*sizeof(Rune));
348 	ans[n] = 0;
349 	return ans;
350 }
351 // emalloc enough room for n Runes, plus 1 null terminator.
352 // (Not initialized to anything.)
353 Rune*
_newstr(int n)354 _newstr(int n)
355 {
356 	return (Rune*)emalloc((n+1)*sizeof(Rune));
357 }
358 
359 // emalloc and copy s+t
360 Rune*
_Strdup2(Rune * s,Rune * t)361 _Strdup2(Rune* s, Rune* t)
362 {
363 	int ns, nt;
364 	Rune* ans;
365 	Rune* p;
366 
367 	ns = _Strlen(s);
368 	nt = _Strlen(t);
369 	if(ns+nt == 0)
370 		return nil;
371 	ans = _newstr(ns+nt);
372 	p = _Stradd(ans, s, ns);
373 	p = _Stradd(p, t, nt);
374 	*p = 0;
375 	return ans;
376 }
377 
378 // Return emalloc'd substring s[start:stop],
379 Rune*
_Strsubstr(Rune * s,int start,int stop)380 _Strsubstr(Rune* s, int start, int stop)
381 {
382 	Rune* t;
383 
384 	if(start == stop)
385 		return nil;
386 	t = _Strndup(s+start, stop-start);
387 	return t;
388 }
389 
390 // Copy n chars to s1 from s2, and return s1+n
391 Rune*
_Stradd(Rune * s1,Rune * s2,int n)392 _Stradd(Rune* s1, Rune* s2, int n)
393 {
394 	if(n == 0)
395 		return s1;
396 	memmove(s1, s2, n*sizeof(Rune));
397 	return s1+n;
398 }
399 
400 // Like strtol, but converting from Rune* string
401 
402 #define LONG_MAX	2147483647L
403 #define LONG_MIN	-2147483648L
404 
405 long
_Strtol(Rune * nptr,Rune ** endptr,int base)406 _Strtol(Rune* nptr, Rune** endptr, int base)
407 {
408 	Rune* p;
409 	long n, nn;
410 	int c, ovfl, v, neg, ndig;
411 
412 	p = nptr;
413 	neg = 0;
414 	n = 0;
415 	ndig = 0;
416 	ovfl = 0;
417 
418 	/*
419 	 * White space
420 	 */
421 	for(;;p++){
422 		switch(*p){
423 		case ' ':
424 		case '\t':
425 		case '\n':
426 		case '\f':
427 		case '\r':
428 		case '\v':
429 			continue;
430 		}
431 		break;
432 	}
433 
434 	/*
435 	 * Sign
436 	 */
437 	if(*p=='-' || *p=='+')
438 		if(*p++ == '-')
439 			neg = 1;
440 
441 	/*
442 	 * Base
443 	 */
444 	if(base==0){
445 		if(*p != '0')
446 			base = 10;
447 		else{
448 			base = 8;
449 			if(p[1]=='x' || p[1]=='X'){
450 				p += 2;
451 				base = 16;
452 			}
453 		}
454 	}else if(base==16 && *p=='0'){
455 		if(p[1]=='x' || p[1]=='X')
456 			p += 2;
457 	}else if(base<0 || 36<base)
458 		goto Return;
459 
460 	/*
461 	 * Non-empty sequence of digits
462 	 */
463 	for(;; p++,ndig++){
464 		c = *p;
465 		v = base;
466 		if('0'<=c && c<='9')
467 			v = c - '0';
468 		else if('a'<=c && c<='z')
469 			v = c - 'a' + 10;
470 		else if('A'<=c && c<='Z')
471 			v = c - 'A' + 10;
472 		if(v >= base)
473 			break;
474 		nn = n*base + v;
475 		if(nn < n)
476 			ovfl = 1;
477 		n = nn;
478 	}
479 
480     Return:
481 	if(ndig == 0)
482 		p = nptr;
483 	if(endptr)
484 		*endptr = p;
485 	if(ovfl){
486 		if(neg)
487 			return LONG_MIN;
488 		return LONG_MAX;
489 	}
490 	if(neg)
491 		return -n;
492 	return n;
493 }
494 
495 // Convert buf[0:n], bytes whose character set is chset,
496 // into a emalloc'd null-terminated Unicode string.
497 Rune*
toStr(uchar * buf,int n,int chset)498 toStr(uchar* buf, int n, int chset)
499 {
500 	int i;
501 	int m;
502 	Rune ch;
503 	Rune* ans;
504 
505 	switch(chset) {
506 	case US_Ascii:
507 	case ISO_8859_1:
508 		ans = (Rune*)emalloc((n+1)*sizeof(Rune));
509 		for(i = 0; i < n; i++)
510 			ans[i] = buf[i];
511 		ans[n] = 0;
512 		break;
513 
514 	case UTF_8:
515 		m = 0;
516 		for(i = 0; i < n; ) {
517 			i += chartorune(&ch, (char*)(buf+i));
518 			m++;
519 		}
520 		ans = (Rune*)emalloc((m+1)*sizeof(Rune));
521 		m = 0;
522 		for(i = 0; i < n; ) {
523 			i += chartorune(&ch, (char*)(buf+i));
524 			ans[m++] = ch;
525 		}
526 		ans[m] = 0;
527 		break;
528 
529 	default:
530 		ans = nil;
531 		assert(0);
532 	}
533 	return ans;
534 }
535 
536 // Convert buf[0:n], Unicode characters,
537 // into an emalloc'd null-terminated string in character set chset.
538 // Use 0x80 for unconvertable characters.
539 uchar*
fromStr(Rune * buf,int n,int chset)540 fromStr(Rune* buf, int n, int chset)
541 {
542 	uchar* ans;
543 	int i, lim, m;
544 	Rune ch;
545 	uchar* p;
546 	uchar s[UTFmax];
547 
548 	ans = nil;
549 	switch(chset) {
550 	case US_Ascii:
551 	case ISO_8859_1:
552 		ans = (uchar*)emalloc(n+1);
553 		lim = (chset==US_Ascii)? 127 : 255;
554 		for(i = 0; i < n; i++) {
555 			ch = buf[i];
556 			if(ch > lim)
557 				ch = 0x80;
558 			ans[i] = ch;
559 		}
560 		ans[n] = 0;
561 		break;
562 
563 	case UTF_8:
564 		m = 0;
565 		for(i = 0; i < n; i++) {
566 			m += runetochar((char*)s, &buf[i]);
567 		}
568 		ans = (uchar*)emalloc(m+1);
569 		p = ans;
570 		for(i = 0; i < n; i++)
571 			p += runetochar((char*)p, &buf[i]);
572 		*p = 0;
573 		break;
574 
575 	default:
576 		assert(0);
577 	}
578 	return ans;
579 
580 }
581