xref: /plan9-contrib/sys/src/libhtml/lex.c (revision ec59a3ddbfceee0efe34584c2c9981a5e5ff1ec4)
1 #include <u.h>
2 #include <libc.h>
3 #include <draw.h>
4 #include <ctype.h>
5 #include <html.h>
6 #include "impl.h"
7 
8 typedef struct TokenSource TokenSource;
9 struct TokenSource
10 {
11 	int			i;		// index of next byte to use
12 	uchar*		data;		// all the data
13 	int			edata;	// data[0:edata] is valid
14 	int			chset;	// one of US_Ascii, etc.
15 	int			mtype;	// TextHtml or TextPlain
16 };
17 
// Negative sentinel codes (distinct from any character value).
// NOTE(review): not referenced in the code visible in this file;
// the names suggest end-of-buffer / end-of-file -- confirm before use.
enum {
	EOB = -1,
	EOF = -2
};
22 
// True when c (a non-negative character code) may continue a tag,
// attribute or entity name: a Latin-1 letter or digit, '-' or '.'.
// The argument is evaluated more than once.
#define ISNAMCHAR(c)	((c) < 256 && (isalnum(c) || (c) == '-' || (c) == '.'))

// Capacities, in Runes, of the lexer's scratch buffers.
#define SMALLBUFSIZE 240
#define BIGBUFSIZE 2000
27 
28 // HTML 4.0 tag names.
29 // Keep sorted, and in correspondence with enum in iparse.h.
30 Rune* tagnames[] = {
31 	L" ",
32 	L"!",
33 	L"a",
34 	L"abbr",
35 	L"acronym",
36 	L"address",
37 	L"applet",
38 	L"area",
39 	L"b",
40 	L"base",
41 	L"basefont",
42 	L"bdo",
43 	L"big",
44 	L"blink",
45 	L"blockquote",
46 	L"body",
47 	L"bq",
48 	L"br",
49 	L"button",
50 	L"caption",
51 	L"center",
52 	L"cite",
53 	L"code",
54 	L"col",
55 	L"colgroup",
56 	L"dd",
57 	L"del",
58 	L"dfn",
59 	L"dir",
60 	L"div",
61 	L"dl",
62 	L"dt",
63 	L"em",
64 	L"fieldset",
65 	L"font",
66 	L"form",
67 	L"frame",
68 	L"frameset",
69 	L"h1",
70 	L"h2",
71 	L"h3",
72 	L"h4",
73 	L"h5",
74 	L"h6",
75 	L"head",
76 	L"hr",
77 	L"html",
78 	L"i",
79 	L"iframe",
80 	L"img",
81 	L"input",
82 	L"ins",
83 	L"isindex",
84 	L"kbd",
85 	L"label",
86 	L"legend",
87 	L"li",
88 	L"link",
89 	L"map",
90 	L"menu",
91 	L"meta",
92 	L"nobr",
93 	L"noframes",
94 	L"noscript",
95 	L"object",
96 	L"ol",
97 	L"optgroup",
98 	L"option",
99 	L"p",
100 	L"param",
101 	L"pre",
102 	L"q",
103 	L"s",
104 	L"samp",
105 	L"script",
106 	L"select",
107 	L"small",
108 	L"span",
109 	L"strike",
110 	L"strong",
111 	L"style",
112 	L"sub",
113 	L"sup",
114 	L"table",
115 	L"tbody",
116 	L"td",
117 	L"textarea",
118 	L"tfoot",
119 	L"th",
120 	L"thead",
121 	L"title",
122 	L"tr",
123 	L"tt",
124 	L"u",
125 	L"ul",
126 	L"var"
127 };
128 
129 // HTML 4.0 attribute names.
130 // Keep sorted, and in correspondence with enum in i.h.
131 Rune* attrnames[] = {
132 	L"abbr",
133 	L"accept-charset",
134 	L"access-key",
135 	L"action",
136 	L"align",
137 	L"alink",
138 	L"alt",
139 	L"archive",
140 	L"axis",
141 	L"background",
142 	L"bgcolor",
143 	L"border",
144 	L"cellpadding",
145 	L"cellspacing",
146 	L"char",
147 	L"charoff",
148 	L"charset",
149 	L"checked",
150 	L"cite",
151 	L"class",
152 	L"classid",
153 	L"clear",
154 	L"code",
155 	L"codebase",
156 	L"codetype",
157 	L"color",
158 	L"cols",
159 	L"colspan",
160 	L"compact",
161 	L"content",
162 	L"coords",
163 	L"data",
164 	L"datetime",
165 	L"declare",
166 	L"defer",
167 	L"dir",
168 	L"disabled",
169 	L"enctype",
170 	L"face",
171 	L"for",
172 	L"frame",
173 	L"frameborder",
174 	L"headers",
175 	L"height",
176 	L"href",
177 	L"hreflang",
178 	L"hspace",
179 	L"http-equiv",
180 	L"id",
181 	L"ismap",
182 	L"label",
183 	L"lang",
184 	L"link",
185 	L"longdesc",
186 	L"marginheight",
187 	L"marginwidth",
188 	L"maxlength",
189 	L"media",
190 	L"method",
191 	L"multiple",
192 	L"name",
193 	L"nohref",
194 	L"noresize",
195 	L"noshade",
196 	L"nowrap",
197 	L"object",
198 	L"onblur",
199 	L"onchange",
200 	L"onclick",
201 	L"ondblclick",
202 	L"onfocus",
203 	L"onkeypress",
204 	L"onkeyup",
205 	L"onload",
206 	L"onmousedown",
207 	L"onmousemove",
208 	L"onmouseout",
209 	L"onmouseover",
210 	L"onmouseup",
211 	L"onreset",
212 	L"onselect",
213 	L"onsubmit",
214 	L"onunload",
215 	L"profile",
216 	L"prompt",
217 	L"readonly",
218 	L"rel",
219 	L"rev",
220 	L"rows",
221 	L"rowspan",
222 	L"rules",
223 	L"scheme",
224 	L"scope",
225 	L"scrolling",
226 	L"selected",
227 	L"shape",
228 	L"size",
229 	L"span",
230 	L"src",
231 	L"standby",
232 	L"start",
233 	L"style",
234 	L"summary",
235 	L"tabindex",
236 	L"target",
237 	L"text",
238 	L"title",
239 	L"type",
240 	L"usemap",
241 	L"valign",
242 	L"value",
243 	L"valuetype",
244 	L"version",
245 	L"vlink",
246 	L"vspace",
247 	L"width"
248 };
249 
250 
251 // Character entity to unicode character number map.
252 // Keep sorted by name.
253 StringInt	chartab[]= {
254 	{L"AElig", 198},
255 	{L"Aacute", 193},
256 	{L"Acirc", 194},
257 	{L"Agrave", 192},
258 	{L"Alpha", 913},
259 	{L"Aring", 197},
260 	{L"Atilde", 195},
261 	{L"Auml", 196},
262 	{L"Beta", 914},
263 	{L"Ccedil", 199},
264 	{L"Chi", 935},
265 	{L"Dagger", 8225},
266 	{L"Delta", 916},
267 	{L"ETH", 208},
268 	{L"Eacute", 201},
269 	{L"Ecirc", 202},
270 	{L"Egrave", 200},
271 	{L"Epsilon", 917},
272 	{L"Eta", 919},
273 	{L"Euml", 203},
274 	{L"Gamma", 915},
275 	{L"Iacute", 205},
276 	{L"Icirc", 206},
277 	{L"Igrave", 204},
278 	{L"Iota", 921},
279 	{L"Iuml", 207},
280 	{L"Kappa", 922},
281 	{L"Lambda", 923},
282 	{L"Mu", 924},
283 	{L"Ntilde", 209},
284 	{L"Nu", 925},
285 	{L"OElig", 338},
286 	{L"Oacute", 211},
287 	{L"Ocirc", 212},
288 	{L"Ograve", 210},
289 	{L"Omega", 937},
290 	{L"Omicron", 927},
291 	{L"Oslash", 216},
292 	{L"Otilde", 213},
293 	{L"Ouml", 214},
294 	{L"Phi", 934},
295 	{L"Pi", 928},
296 	{L"Prime", 8243},
297 	{L"Psi", 936},
298 	{L"Rho", 929},
299 	{L"Scaron", 352},
300 	{L"Sigma", 931},
301 	{L"THORN", 222},
302 	{L"Tau", 932},
303 	{L"Theta", 920},
304 	{L"Uacute", 218},
305 	{L"Ucirc", 219},
306 	{L"Ugrave", 217},
307 	{L"Upsilon", 933},
308 	{L"Uuml", 220},
309 	{L"Xi", 926},
310 	{L"Yacute", 221},
311 	{L"Yuml", 376},
312 	{L"Zeta", 918},
313 	{L"aacute", 225},
314 	{L"acirc", 226},
315 	{L"acute", 180},
316 	{L"aelig", 230},
317 	{L"agrave", 224},
318 	{L"alefsym", 8501},
319 	{L"alpha", 945},
320 	{L"amp", 38},
321 	{L"and", 8743},
322 	{L"ang", 8736},
323 	{L"aring", 229},
324 	{L"asymp", 8776},
325 	{L"atilde", 227},
326 	{L"auml", 228},
327 	{L"bdquo", 8222},
328 	{L"beta", 946},
329 	{L"brvbar", 166},
330 	{L"bull", 8226},
331 	{L"cap", 8745},
332 	{L"ccedil", 231},
333 	{L"cdots", 8943},
334 	{L"cedil", 184},
335 	{L"cent", 162},
336 	{L"chi", 967},
337 	{L"circ", 710},
338 	{L"clubs", 9827},
339 	{L"cong", 8773},
340 	{L"copy", 169},
341 	{L"crarr", 8629},
342 	{L"cup", 8746},
343 	{L"curren", 164},
344 	{L"dArr", 8659},
345 	{L"dagger", 8224},
346 	{L"darr", 8595},
347 	{L"ddots", 8945},
348 	{L"deg", 176},
349 	{L"delta", 948},
350 	{L"diams", 9830},
351 	{L"divide", 247},
352 	{L"eacute", 233},
353 	{L"ecirc", 234},
354 	{L"egrave", 232},
355 	{L"emdash", 8212},	/* non-standard but commonly used */
356 	{L"empty", 8709},
357 	{L"emsp", 8195},
358 	{L"endash", 8211},	/* non-standard but commonly used */
359 	{L"ensp", 8194},
360 	{L"epsilon", 949},
361 	{L"equiv", 8801},
362 	{L"eta", 951},
363 	{L"eth", 240},
364 	{L"euml", 235},
365 	{L"euro", 8364},
366 	{L"exist", 8707},
367 	{L"fnof", 402},
368 	{L"forall", 8704},
369 	{L"frac12", 189},
370 	{L"frac14", 188},
371 	{L"frac34", 190},
372 	{L"frasl", 8260},
373 	{L"gamma", 947},
374 	{L"ge", 8805},
375 	{L"gt", 62},
376 	{L"hArr", 8660},
377 	{L"harr", 8596},
378 	{L"hearts", 9829},
379 	{L"hellip", 8230},
380 	{L"iacute", 237},
381 	{L"icirc", 238},
382 	{L"iexcl", 161},
383 	{L"igrave", 236},
384 	{L"image", 8465},
385 	{L"infin", 8734},
386 	{L"int", 8747},
387 	{L"iota", 953},
388 	{L"iquest", 191},
389 	{L"isin", 8712},
390 	{L"iuml", 239},
391 	{L"kappa", 954},
392 	{L"lArr", 8656},
393 	{L"lambda", 955},
394 	{L"lang", 9001},
395 	{L"laquo", 171},
396 	{L"larr", 8592},
397 	{L"lceil", 8968},
398 	{L"ldots", 8230},
399 	{L"ldquo", 8220},
400 	{L"le", 8804},
401 	{L"lfloor", 8970},
402 	{L"lowast", 8727},
403 	{L"loz", 9674},
404 	{L"lrm", 8206},
405 	{L"lsaquo", 8249},
406 	{L"lsquo", 8216},
407 	{L"lt", 60},
408 	{L"macr", 175},
409 	{L"mdash", 8212},
410 	{L"micro", 181},
411 	{L"middot", 183},
412 	{L"minus", 8722},
413 	{L"mu", 956},
414 	{L"nabla", 8711},
415 	{L"nbsp", 160},
416 	{L"ndash", 8211},
417 	{L"ne", 8800},
418 	{L"ni", 8715},
419 	{L"not", 172},
420 	{L"notin", 8713},
421 	{L"nsub", 8836},
422 	{L"ntilde", 241},
423 	{L"nu", 957},
424 	{L"oacute", 243},
425 	{L"ocirc", 244},
426 	{L"oelig", 339},
427 	{L"ograve", 242},
428 	{L"oline", 8254},
429 	{L"omega", 969},
430 	{L"omicron", 959},
431 	{L"oplus", 8853},
432 	{L"or", 8744},
433 	{L"ordf", 170},
434 	{L"ordm", 186},
435 	{L"oslash", 248},
436 	{L"otilde", 245},
437 	{L"otimes", 8855},
438 	{L"ouml", 246},
439 	{L"para", 182},
440 	{L"part", 8706},
441 	{L"permil", 8240},
442 	{L"perp", 8869},
443 	{L"phi", 966},
444 	{L"pi", 960},
445 	{L"piv", 982},
446 	{L"plusmn", 177},
447 	{L"pound", 163},
448 	{L"prime", 8242},
449 	{L"prod", 8719},
450 	{L"prop", 8733},
451 	{L"psi", 968},
452 	{L"quad", 8193},
453 	{L"quot", 34},
454 	{L"rArr", 8658},
455 	{L"radic", 8730},
456 	{L"rang", 9002},
457 	{L"raquo", 187},
458 	{L"rarr", 8594},
459 	{L"rceil", 8969},
460 	{L"rdquo", 8221},
461 	{L"real", 8476},
462 	{L"reg", 174},
463 	{L"rfloor", 8971},
464 	{L"rho", 961},
465 	{L"rlm", 8207},
466 	{L"rsaquo", 8250},
467 	{L"rsquo", 8217},
468 	{L"sbquo", 8218},
469 	{L"scaron", 353},
470 	{L"sdot", 8901},
471 	{L"sect", 167},
472 	{L"shy", 173},
473 	{L"sigma", 963},
474 	{L"sigmaf", 962},
475 	{L"sim", 8764},
476 	{L"sp", 8194},
477 	{L"spades", 9824},
478 	{L"sub", 8834},
479 	{L"sube", 8838},
480 	{L"sum", 8721},
481 	{L"sup", 8835},
482 	{L"sup1", 185},
483 	{L"sup2", 178},
484 	{L"sup3", 179},
485 	{L"supe", 8839},
486 	{L"szlig", 223},
487 	{L"tau", 964},
488 	{L"there4", 8756},
489 	{L"theta", 952},
490 	{L"thetasym", 977},
491 	{L"thinsp", 8201},
492 	{L"thorn", 254},
493 	{L"tilde", 732},
494 	{L"times", 215},
495 	{L"trade", 8482},
496 	{L"uArr", 8657},
497 	{L"uacute", 250},
498 	{L"uarr", 8593},
499 	{L"ucirc", 251},
500 	{L"ugrave", 249},
501 	{L"uml", 168},
502 	{L"upsih", 978},
503 	{L"upsilon", 965},
504 	{L"uuml", 252},
505 	{L"varepsilon", 8712},
506 	{L"varphi", 981},
507 	{L"varpi", 982},
508 	{L"varrho", 1009},
509 	{L"vdots", 8942},
510 	{L"vsigma", 962},
511 	{L"vtheta", 977},
512 	{L"weierp", 8472},
513 	{L"xi", 958},
514 	{L"yacute", 253},
515 	{L"yen", 165},
516 	{L"yuml", 255},
517 	{L"zeta", 950},
518 	{L"zwj", 8205},
519 	{L"zwnj", 8204}
520 };
521 #define NCHARTAB (sizeof(chartab)/sizeof(chartab[0]))
522 
// Character codes Winstart..Winend are the ones Windows interpolates
// into the Latin-1 set.  They aren't supposed to appear in HTML,
// but they do, so they are remapped to Unicode through winchars.
enum {
	Winstart = 127,
	Winend = 159
};

// winchars[c - Winstart] is the Unicode value used for code c.
// Codes that Windows-1252 leaves undefined (and 127) become a
// bullet (8226).  Codes 128, 142 and 158 are the Windows-1252 euro sign,
// Z-caron and z-caron; they used to be shown as bullets as well.
static int	winchars[]= {
	8226,								// 127
	8364, 8226, 8218, 402, 8222, 8230, 8224, 8225,	// 128-135
	710, 8240, 352, 8249, 338, 8226, 381, 8226,	// 136-143
	8226, 8216, 8217, 8220, 8221, 8226, 8211, 8212,	// 144-151
	732, 8482, 353, 8250, 339, 8226, 382, 376	// 152-159
};
536 
537 static StringInt*	tagtable;		// initialized from tagnames
538 static StringInt*	attrtable;		// initialized from attrnames
539 
540 static void		lexinit();
541 static int		getplaindata(TokenSource* ts, Token* a, int* pai);
542 static int		getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai);
543 static int		getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai);
544 static int		gettag(TokenSource* ts, int starti, Token* a, int* pai);
545 static Rune*	buftostr(Rune* s, Rune* buf, int j);
546 static int		comment(TokenSource* ts);
547 static int		findstr(TokenSource* ts, Rune* s);
548 static int		ampersand(TokenSource* ts);
549 static int		lowerc(int c);
550 static int		getchar(TokenSource* ts);
551 static void		ungetchar(TokenSource* ts, int c);
552 static void		backup(TokenSource* ts, int savei);
553 static void		freeinsidetoken(Token* t);
554 static void		freeattrs(Attr* ahead);
555 static Attr*	newattr(int attid, Rune* value, Attr* link);
556 static int		Tconv(Fmt* f);
557 
558 int	dbglex = 0;
559 static int lexinited = 0;
560 
561 static void
562 lexinit(void)
563 {
564 	tagtable = _makestrinttab(tagnames, Numtags);
565 	attrtable = _makestrinttab(attrnames, Numattrs);
566 	fmtinstall('T', Tconv);
567 	lexinited = 1;
568 }
569 
570 static TokenSource*
571 newtokensource(uchar* data, int edata, int chset, int mtype)
572 {
573 	TokenSource*	ans;
574 
575 	assert(chset == US_Ascii || chset == ISO_8859_1 ||
576 			chset == UTF_8 || chset == Unicode);
577 	ans = (TokenSource*)emalloc(sizeof(TokenSource));
578 	ans->i = 0;
579 	ans->data = data;
580 	ans->edata = edata;
581 	ans->chset = chset;
582 	ans->mtype = mtype;
583 	return ans;
584 }
585 
586 enum {
587 	ToksChunk = 500
588 };
589 
590 // Call this to get the tokens.
591 //  The number of returned tokens is returned in *plen.
592 Token*
593 _gettoks(uchar* data, int datalen, int chset, int mtype, int* plen)
594 {
595 	TokenSource*	ts;
596 	Token*		a;
597 	int	alen;
598 	int	ai;
599 	int	starti;
600 	int	c;
601 	int	tag;
602 
603 	if(!lexinited)
604 		lexinit();
605 	ts = newtokensource(data, datalen, chset, mtype);
606 	alen = ToksChunk;
607 	a = (Token*)emalloc(alen * sizeof(Token));
608 	ai = 0;
609 	if(dbglex)
610 		fprint(2, "_gettoks starts, ts.i=%d, ts.edata=%d\n", ts->i, ts->edata);
611 	if(ts->mtype == TextHtml) {
612 		for(;;) {
613 			if(ai == alen) {
614 				a = (Token*)erealloc(a, (alen+ToksChunk)*sizeof(Token));
615 				alen += ToksChunk;
616 			}
617 			starti = ts->i;
618 			c = getchar(ts);
619 			if(c < 0)
620 				break;
621 			if(c == '<') {
622 				tag = gettag(ts, starti, a, &ai);
623 				if(tag == Tscript) {
624 					// special rules for getting Data after....
625 					starti = ts->i;
626 					c = getchar(ts);
627 					tag = getscriptdata(ts, c, starti, a, &ai);
628 				}
629 			}
630 			else
631 				tag = getdata(ts, c, starti, a, &ai);
632 			if(tag == -1)
633 				break;
634 			else if(dbglex > 1 && tag != Comment)
635 				fprint(2, "lex: got token %T\n", &a[ai-1]);
636 		}
637 	}
638 	else {
639 		// plain text (non-html) tokens
640 		for(;;) {
641 			if(ai == alen) {
642 				a = (Token*)erealloc(a, (alen+ToksChunk)*sizeof(Token));
643 				alen += ToksChunk;
644 			}
645 			tag = getplaindata(ts, a, &ai);
646 			if(tag == -1)
647 				break;
648 			if(dbglex > 1)
649 				fprint(2, "lex: got token %T\n", &a[ai]);
650 		}
651 	}
652 	if(dbglex)
653 		fprint(2, "lex: returning %d tokens\n", ai);
654 	*plen = ai;
655 	if(ai == 0)
656 		return nil;
657 	return a;
658 }
659 
660 // For case where source isn't HTML.
661 // Just make data tokens, one per line (or partial line,
662 // at end of buffer), ignoring non-whitespace control
663 // characters and dumping \r's.
664 // If find non-empty token, fill in a[*pai], bump *pai, and return Data.
665 // Otherwise return -1;
666 static int
667 getplaindata(TokenSource* ts, Token* a, int* pai)
668 {
669 	Rune*	s;
670 	int	j;
671 	int	starti;
672 	int	c;
673 	Token*	tok;
674 	Rune	buf[BIGBUFSIZE];
675 
676 	s = nil;
677 	j = 0;
678 	starti = ts->i;
679 	for(c = getchar(ts); c >= 0; c = getchar(ts)) {
680 		if(c < ' ') {
681 			if(isspace(c)) {
682 				if(c == '\r') {
683 					// ignore it unless no following '\n',
684 					// in which case treat it like '\n'
685 					c = getchar(ts);
686 					if(c != '\n') {
687 						if(c >= 0)
688 							ungetchar(ts, c);
689 						c = '\n';
690 					}
691 				}
692 			}
693 			else
694 				c = 0;
695 		}
696 		if(c != 0) {
697 			buf[j++] = c;
698 			if(j == sizeof(buf)-1) {
699 				s = buftostr(s, buf, j);
700 				j = 0;
701 			}
702 		}
703 		if(c == '\n')
704 			break;
705 	}
706 	s = buftostr(s, buf, j);
707 	if(s == nil)
708 		return -1;
709 	tok = &a[(*pai)++];
710 	tok->tag = Data;
711 	tok->text = s;
712 	tok->attr = nil;
713 	tok->starti = starti;
714 	return Data;
715 }
716 
717 // Return concatenation of s and buf[0:j]
718 static Rune*
719 buftostr(Rune* s, Rune* buf, int j)
720 {
721 	buf[j] = 0;
722 	if(s == nil)
723 		s = _Strndup(buf, j);
724 	else
725 		s = _Strdup2(s, buf);
726 	return s;
727 }
728 
729 // Gather data up to next start-of-tag or end-of-buffer.
730 // Translate entity references (&amp;).
731 // Ignore non-whitespace control characters and get rid of \r's.
732 // If find non-empty token, fill in a[*pai], bump *pai, and return Data.
733 // Otherwise return -1;
734 static int
735 getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai)
736 {
737 	Rune*	s;
738 	int	j;
739 	int	c;
740 	Token*	tok;
741 	Rune	buf[BIGBUFSIZE];
742 
743 	s = nil;
744 	j = 0;
745 	c = firstc;
746 	while(c >= 0) {
747 		if(c == '&') {
748 			c = ampersand(ts);
749 			if(c < 0)
750 				break;
751 		}
752 		else if(c < ' ') {
753 			if(isspace(c)) {
754 				if(c == '\r') {
755 					// ignore it unless no following '\n',
756 					// in which case treat it like '\n'
757 					c = getchar(ts);
758 					if(c != '\n') {
759 						if(c >= 0)
760 							ungetchar(ts, c);
761 						c = '\n';
762 					}
763 				}
764 			}
765 			else {
766 				if(warn)
767 					fprint(2, "warning: non-whitespace control character %d ignored\n", c);
768 				c = 0;
769 			}
770 		}
771 		else if(c == '<') {
772 			ungetchar(ts, c);
773 			break;
774 		}
775 		if(c != 0) {
776 			buf[j++] = c;
777 			if(j == BIGBUFSIZE-1) {
778 				s = buftostr(s, buf, j);
779 				j = 0;
780 			}
781 		}
782 		c = getchar(ts);
783 	}
784 	s = buftostr(s, buf, j);
785 	if(s == nil)
786 		return -1;
787 	tok = &a[(*pai)++];
788 	tok->tag = Data;
789 	tok->text = s;
790 	tok->attr = nil;
791 	tok->starti = starti;
792 	return Data;
793 }
794 
795 // The rules for lexing scripts are different (ugh).
796 // Gather up everything until see a </SCRIPT>.
797 static int
798 getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai)
799 {
800 	Rune*	s;
801 	int	j;
802 	int	tstarti;
803 	int	savei;
804 	int	c;
805 	int	tag;
806 	int	done;
807 	Token*	tok;
808 	Rune	buf[BIGBUFSIZE];
809 
810 	s = nil;
811 	j = 0;
812 	tstarti = starti;
813 	c = firstc;
814 	done = 0;
815 	while(c >= 0) {
816 		if(c == '<') {
817 			// other browsers ignore stuff to end of line after <!
818 			savei = ts->i;
819 			c = getchar(ts);
820 			if(c == '!') {
821 				while(c >= 0 && c != '\n' && c != '\r')
822 					c = getchar(ts);
823 				if(c == '\r')
824 					c = getchar(ts);
825 				if(c == '\n')
826 					c = getchar(ts);
827 			}
828 			else if(c >= 0) {
829 				backup(ts, savei);
830 				tag = gettag(ts, tstarti, a, pai);
831 				if(tag == -1)
832 					break;
833 				if(tag != Comment)
834 					(*pai)--;
835 				backup(ts, tstarti);
836 				if(tag == Tscript + RBRA) {
837 					done = 1;
838 					break;
839 				}
840 				// here tag was not </SCRIPT>, so take as regular data
841 				c = getchar(ts);
842 			}
843 		}
844 		if(c < 0)
845 			break;
846 		if(c != 0) {
847 			buf[j++] = c;
848 			if(j == BIGBUFSIZE-1) {
849 				s = buftostr(s, buf, j);
850 				j = 0;
851 			}
852 		}
853 		tstarti = ts->i;
854 		c = getchar(ts);
855 	}
856 	if(done || ts->i == ts->edata) {
857 		s = buftostr(s, buf, j);
858 		tok = &a[(*pai)++];
859 		tok->tag = Data;
860 		tok->text = s;
861 		tok->attr = nil;
862 		tok->starti = starti;
863 		return Data;
864 	}
865 	backup(ts, starti);
866 	return -1;
867 }
868 
869 // We've just seen a '<'.  Gather up stuff to closing '>' (if buffer
870 // ends before then, return -1).
871 // If it's a tag, look up the name, gather the attributes, and return
872 // the appropriate token.
873 // Else it's either just plain data or some kind of ignorable stuff:
874 // return Data or Comment as appropriate.
875 // If it's not a Comment, put it in a[*pai] and bump *pai.
876 static int
877 gettag(TokenSource* ts, int starti, Token* a, int* pai)
878 {
879 	int	rbra;
880 	int	ans;
881 	Attr*	al;
882 	int	nexti;
883 	int	c;
884 	int	ti;
885 	int	afnd;
886 	int	attid;
887 	int	quote;
888 	Rune*	val;
889 	int	nv;
890 	int	i;
891 	int	tag;
892 	Token*	tok;
893 	Rune	buf[BIGBUFSIZE];
894 
895 	rbra = 0;
896 	nexti = ts->i;
897 	tok = &a[*pai];
898 	tok->tag = Notfound;
899 	tok->text = nil;
900 	tok->attr = nil;
901 	tok->starti = starti;
902 	c = getchar(ts);
903 	if(c == '/') {
904 		rbra = RBRA;
905 		c = getchar(ts);
906 	}
907 	if(c < 0)
908 		goto eob_done;
909 	if(c >= 256 || !isalpha(c)) {
910 		// not a tag
911 		if(c == '!') {
912 			ans = comment(ts);
913 			if(ans != -1)
914 				return ans;
915 			goto eob_done;
916 		}
917 		else {
918 			backup(ts, nexti);
919 			tok->tag = Data;
920 			tok->text = _Strdup(L"<");
921 			(*pai)++;
922 			return Data;
923 		}
924 	}
925 	// c starts a tagname
926 	buf[0] = c;
927 	i = 1;
928 	while(1) {
929 		c = getchar(ts);
930 		if(c < 0)
931 			goto eob_done;
932 		if(!ISNAMCHAR(c))
933 			break;
934 		// if name is bigger than buf it won't be found anyway...
935 		if(i < BIGBUFSIZE)
936 			buf[i++] = c;
937 	}
938 	if(_lookup(tagtable, Numtags, buf, i, &tag))
939 		tok->tag = tag + rbra;
940 	else
941 		tok->text = _Strndup(buf, i);	// for warning print, in build
942 
943 	// attribute gathering loop
944 	al = nil;
945 	while(1) {
946 		// look for "ws name" or "ws name ws = ws val"  (ws=whitespace)
947 		// skip whitespace
948 attrloop_continue:
949 		while(c < 256 && isspace(c)) {
950 			c = getchar(ts);
951 			if(c < 0)
952 				goto eob_done;
953 		}
954 		if(c == '>')
955 			goto attrloop_done;
956 		if(c == '<') {
957 			if(warn)
958 				fprint(2, "warning: unclosed tag\n");
959 			ungetchar(ts, c);
960 			goto attrloop_done;
961 		}
962 		if(c >= 256 || !isalpha(c)) {
963 			if(warn)
964 				fprint(2, "warning: expected attribute name\n");
965 			// skipt to next attribute name
966 			while(1) {
967 				c = getchar(ts);
968 				if(c < 0)
969 					goto eob_done;
970 				if(c < 256 && isalpha(c))
971 					goto attrloop_continue;
972 				if(c == '<') {
973 					if(warn)
974 						fprint(2, "warning: unclosed tag\n");
975 					ungetchar(ts, 60);
976 					goto attrloop_done;
977 				}
978 				if(c == '>')
979 					goto attrloop_done;
980 			}
981 		}
982 		// gather attribute name
983 		buf[0] = c;
984 		i = 1;
985 		while(1) {
986 			c = getchar(ts);
987 			if(c < 0)
988 				goto eob_done;
989 			if(!ISNAMCHAR(c))
990 				break;
991 			if(i < BIGBUFSIZE-1)
992 				buf[i++] = c;
993 		}
994 		afnd = _lookup(attrtable, Numattrs, buf, i, &attid);
995 		if(warn && !afnd) {
996 			buf[i] = 0;
997 			fprint(2, "warning: unknown attribute name %S\n", buf);
998 		}
999 		// skip whitespace
1000 		while(c < 256 && isspace(c)) {
1001 			c = getchar(ts);
1002 			if(c < 0)
1003 				goto eob_done;
1004 		}
1005 		if(c != '=') {
1006 			if(afnd)
1007 				al = newattr(attid, nil, al);
1008 			goto attrloop_continue;
1009 		}
1010 		//# c is '=' here;  skip whitespace
1011 		while(1) {
1012 			c = getchar(ts);
1013 			if(c < 0)
1014 				goto eob_done;
1015 			if(c >= 256 || !isspace(c))
1016 				break;
1017 		}
1018 		quote = 0;
1019 		if(c == '\'' || c == '"') {
1020 			quote = c;
1021 			c = getchar(ts);
1022 			if(c < 0)
1023 				goto eob_done;
1024 		}
1025 		val = nil;
1026 		nv = 0;
1027 		while(1) {
1028 valloop_continue:
1029 			if(c < 0)
1030 				goto eob_done;
1031 			if(c == '>') {
1032 				if(quote) {
1033 					// c might be part of string (though not good style)
1034 					// but if line ends before close quote, assume
1035 					// there was an unmatched quote
1036 					ti = ts->i;
1037 					while(1) {
1038 						c = getchar(ts);
1039 						if(c < 0)
1040 							goto eob_done;
1041 						if(c == quote) {
1042 							backup(ts, ti);
1043 							buf[nv++] = '>';
1044 							if(nv == BIGBUFSIZE-1) {
1045 								val = buftostr(val, buf, nv);
1046 								nv = 0;
1047 							}
1048 							c = getchar(ts);
1049 							goto valloop_continue;
1050 						}
1051 						if(c == '\n') {
1052 							if(warn)
1053 								fprint(2, "warning: apparent unmatched quote\n");
1054 							backup(ts, ti);
1055 							c = '>';
1056 							goto valloop_done;
1057 						}
1058 					}
1059 				}
1060 				else
1061 					goto valloop_done;
1062 			}
1063 			if(quote) {
1064 				if(c == quote) {
1065 					c = getchar(ts);
1066 					if(c < 0)
1067 						goto eob_done;
1068 					goto valloop_done;
1069 				}
1070 				if(c == '\r') {
1071 					c = getchar(ts);
1072 					goto valloop_continue;
1073 				}
1074 				if(c == '\t' || c == '\n')
1075 					c = ' ';
1076 			}
1077 			else {
1078 				if(c < 256 && isspace(c))
1079 					goto valloop_done;
1080 			}
1081 			if(c == '&') {
1082 				c = ampersand(ts);
1083 				if(c == -1)
1084 					goto eob_done;
1085 			}
1086 			buf[nv++] = c;
1087 			if(nv == BIGBUFSIZE-1) {
1088 				val = buftostr(val, buf, nv);
1089 				nv = 0;
1090 			}
1091 			c = getchar(ts);
1092 		}
1093 valloop_done:
1094 		if(afnd) {
1095 			val = buftostr(val, buf, nv);
1096 			al = newattr(attid, val, al);
1097 		}
1098 	}
1099 
1100 attrloop_done:
1101 	tok->attr = al;
1102 	(*pai)++;
1103 	return tok->tag;
1104 
1105 eob_done:
1106 	if(warn)
1107 		fprint(2, "warning: incomplete tag at end of page\n");
1108 	backup(ts, nexti);
1109 	tok->tag = Data;
1110 	tok->text = _Strdup(L"<");
1111 	return Data;
1112 }
1113 
1114 // We've just read a '<!' at position starti,
1115 // so this may be a comment or other ignored section, or it may
1116 // be just a literal string if there is no close before end of file
1117 // (other browsers do that).
1118 // The accepted practice seems to be (note: contrary to SGML spec!):
1119 // If see <!--, look for --> to close, or if none, > to close.
1120 // If see <!(not --), look for > to close.
1121 // If no close before end of file, leave original characters in as literal data.
1122 //
1123 // If we see ignorable stuff, return Comment.
1124 // Else return nil (caller should back up and try again when more data arrives,
1125 // unless at end of file, in which case caller should just make '<' a data token).
1126 static int
1127 comment(TokenSource* ts)
1128 {
1129 	int	nexti;
1130 	int	havecomment;
1131 	int	c;
1132 
1133 	nexti = ts->i;
1134 	havecomment = 0;
1135 	c = getchar(ts);
1136 	if(c == '-') {
1137 		c = getchar(ts);
1138 		if(c == '-') {
1139 			if(findstr(ts, L"-->"))
1140 				havecomment = 1;
1141 			else
1142 				backup(ts, nexti);
1143 		}
1144 	}
1145 	if(!havecomment) {
1146 		if(c == '>')
1147 			havecomment = 1;
1148 		else if(c >= 0) {
1149 			if(findstr(ts, L">"))
1150 				havecomment = 1;
1151 		}
1152 	}
1153 	if(havecomment)
1154 		return Comment;
1155 	return -1;
1156 }
1157 
1158 // Look for string s in token source.
1159 // If found, return 1, with buffer at next char after s,
1160 // else return 0 (caller should back up).
1161 static int
1162 findstr(TokenSource* ts, Rune* s)
1163 {
1164 	int	c0;
1165 	int	n;
1166 	int	nexti;
1167 	int	i;
1168 	int	c;
1169 
1170 	c0 = s[0];
1171 	n = runestrlen(s);
1172 	while(1) {
1173 		c = getchar(ts);
1174 		if(c < 0)
1175 			break;
1176 		if(c == c0) {
1177 			if(n == 1)
1178 				return 1;
1179 			nexti = ts->i;
1180 			for(i = 1; i < n; i++) {
1181 				c = getchar(ts);
1182 				if(c < 0)
1183 					goto mainloop_done;
1184 				if(c != s[i])
1185 					break;
1186 			}
1187 			if(i == n)
1188 				return 1;
1189 			backup(ts, nexti);
1190 		}
1191 	}
1192 mainloop_done:
1193 	return 0;
1194 }
1195 
1196 // We've just read an '&'; look for an entity reference
1197 // name, and if found, return translated char.
1198 // if there is a complete entity name but it isn't known,
1199 // try prefixes (gets around some buggy HTML out there),
1200 // and if that fails, back up to just past the '&' and return '&'.
1201 // If the entity can't be completed in the current buffer, back up
1202 // to the '&' and return -1.
1203 static int
1204 ampersand(TokenSource* ts)
1205 {
1206 	int	savei;
1207 	int	c;
1208 	int	fnd;
1209 	int	ans;
1210 	int	v;
1211 	int	i;
1212 	int	k;
1213 	Rune	buf[SMALLBUFSIZE];
1214 
1215 	savei = ts->i;
1216 	c = getchar(ts);
1217 	fnd = 0;
1218 	ans = -1;
1219 	if(c == '#') {
1220 		c = getchar(ts);
1221 		v = 0;
1222 		while(c >= 0) {
1223 			if(!(c < 256 && isdigit(c)))
1224 				break;
1225 			v = v*10 + c - 48;
1226 			c = getchar(ts);
1227 		}
1228 		if(c >= 0) {
1229 			if(!(c == ';' || c == '\n' || c == '\r'))
1230 				ungetchar(ts, c);
1231 			c = v;
1232 			if(c == 160)
1233 				c = 160;
1234 			if(c >= Winstart && c <= Winend) {
1235 				c = winchars[c - Winstart];
1236 			}
1237 			ans = c;
1238 			fnd = 1;
1239 		}
1240 	}
1241 	else if(c < 256 && isalpha(c)) {
1242 		buf[0] = c;
1243 		k = 1;
1244 		while(1) {
1245 			c = getchar(ts);
1246 			if(c < 0)
1247 				break;
1248 			if(ISNAMCHAR(c)) {
1249 				if(k < SMALLBUFSIZE-1)
1250 					buf[k++] = c;
1251 			}
1252 			else {
1253 				if(!(c == ';' || c == '\n' || c == '\r'))
1254 					ungetchar(ts, c);
1255 				break;
1256 			}
1257 		}
1258 		if(c >= 0) {
1259 			fnd = _lookup(chartab, NCHARTAB, buf, k, &ans);
1260 			if(!fnd) {
1261 				// Try prefixes of s
1262 				if(c == ';' || c == '\n' || c == '\r')
1263 					ungetchar(ts, c);
1264 				i = k;
1265 				while(--k > 0) {
1266 					fnd = _lookup(chartab, NCHARTAB, buf, k, &ans);
1267 					if(fnd) {
1268 						while(i > k) {
1269 							i--;
1270 							ungetchar(ts, buf[i]);
1271 						}
1272 						break;
1273 					}
1274 				}
1275 			}
1276 		}
1277 	}
1278 	if(!fnd) {
1279 		backup(ts, savei);
1280 		ans = '&';
1281 	}
1282 	return ans;
1283 }
1284 
1285 // Get next char, obeying ts.chset.
1286 // Returns -1 if no complete character left before current end of data.
1287 static int
1288 getchar(TokenSource* ts)
1289 {
1290 	uchar*	buf;
1291 	int	c;
1292 	int	n;
1293 	int	ok;
1294 	Rune	r;
1295 
1296 	if(ts->i >= ts->edata)
1297 		return -1;
1298 	buf = ts->data;
1299 	c = buf[ts->i];
1300 	switch(ts->chset) {
1301 	case ISO_8859_1:
1302 		if(c >= Winstart && c <= Winend)
1303 			c = winchars[c - Winstart];
1304 		ts->i++;
1305 		break;
1306 	case US_Ascii:
1307 		if(c > 127) {
1308 			if(warn)
1309 				fprint(2, "non-ascii char (%x) when US-ASCII specified\n", c);
1310 		}
1311 		ts->i++;
1312 		break;
1313 	case UTF_8:
1314 		ok = fullrune((char*)(buf+ts->i), ts->edata-ts->i);
1315 		n = chartorune(&r, (char*)(buf+ts->i));
1316 		if(ok) {
1317 			if(warn && c == 0x80)
1318 				fprint(2, "warning: invalid utf-8 sequence (starts with %x)\n", ts->data[ts->i]);
1319 			ts->i += n;
1320 			c = r;
1321 		}
1322 		else {
1323 			// not enough bytes in buf to complete utf-8 char
1324 			ts->i = ts->edata;	// mark "all used"
1325 			c = -1;
1326 		}
1327 		break;
1328 	case Unicode:
1329 		if(ts->i < ts->edata - 1) {
1330 			//standards say most-significant byte first
1331 			c = (c << 8)|(buf[ts->i + 1]);
1332 			ts->i += 2;
1333 		}
1334 		else {
1335 			ts->i = ts->edata;	// mark "all used"
1336 			c = -1;
1337 		}
1338 		break;
1339 	}
1340 	return c;
1341 }
1342 
1343 // Assuming c was the last character returned by getchar, set
1344 // things up so that next getchar will get that same character
1345 // followed by the current 'next character', etc.
1346 static void
1347 ungetchar(TokenSource* ts, int c)
1348 {
1349 	int	n;
1350 	Rune	r;
1351 	char	a[UTFmax];
1352 
1353 	n = 1;
1354 	switch(ts->chset) {
1355 	case UTF_8:
1356 		if(c >= 128) {
1357 			r = c;
1358 			n = runetochar(a, &r);
1359 		}
1360 		break;
1361 	case Unicode:
1362 		n = 2;
1363 		break;
1364 	}
1365 	ts->i -= n;
1366 }
1367 
1368 // Restore ts so that it is at the state where the index was savei.
1369 static void
1370 backup(TokenSource* ts, int savei)
1371 {
1372 	if(dbglex)
1373 		fprint(2, "lex: backup; i=%d, savei=%d\n", ts->i, savei);
1374 	ts->i = savei;
1375 }
1376 
1377 
1378 // Look for value associated with attribute attid in token t.
1379 // If there is one, return 1 and put the value in *pans,
1380 // else return 0.
1381 // If xfer is true, transfer ownership of the string to the caller
1382 // (nil it out here); otherwise, caller must duplicate the answer
1383 // if it needs to save it.
1384 // OK to have pans==0, in which case this is just looking
1385 // to see if token is present.
1386 int
1387 _tokaval(Token* t, int attid, Rune** pans, int xfer)
1388 {
1389 	Attr*	attr;
1390 
1391 	attr = t->attr;
1392 	while(attr != nil) {
1393 		if(attr->attid == attid) {
1394 			if(pans != nil)
1395 				*pans = attr->value;
1396 			if(xfer)
1397 				attr->value = nil;
1398 			return 1;
1399 		}
1400 		attr = attr->next;
1401 	}
1402 	if(pans != nil)
1403 		*pans = nil;
1404 	return 0;
1405 }
1406 
1407 static int
1408 Tconv(Fmt *f)
1409 {
1410 	Token*	t;
1411 	int	i;
1412 	int	tag;
1413 	char*	srbra;
1414 	Rune*	aname;
1415 	Rune*	tname;
1416 	Attr*	a;
1417 	char	buf[BIGBUFSIZE];
1418 
1419 	t = va_arg(f->args, Token*);
1420 	if(t == nil)
1421 		sprint(buf, "<null>");
1422 	else {
1423 		i = 0;
1424 		if(dbglex > 1)
1425 			i = snprint(buf, sizeof(buf), "[%d]", t->starti);
1426 		tag = t->tag;
1427 		if(tag == Data) {
1428 			i += snprint(buf+i, sizeof(buf)-i-1, "'%S'", t->text);
1429 		}
1430 		else {
1431 			srbra = "";
1432 			if(tag >= RBRA) {
1433 				tag -= RBRA;
1434 				srbra = "/";
1435 			}
1436 			tname = tagnames[tag];
1437 			if(tag == Notfound)
1438 				tname = L"?";
1439 			i += snprint(buf+i, sizeof(buf)-i-1, "<%s%S", srbra, tname);
1440 			for(a = t->attr; a != nil; a = a->next) {
1441 				aname = attrnames[a->attid];
1442 				i += snprint(buf+i, sizeof(buf)-i-1, " %S", aname);
1443 				if(a->value != nil)
1444 					i += snprint(buf+i, sizeof(buf)-i-1, "=%S", a->value);
1445 			}
1446 			i += snprint(buf+i, sizeof(buf)-i-1, ">");
1447 		}
1448 		buf[i] = 0;
1449 	}
1450 	return fmtstrcpy(f, buf);
1451 }
1452 
1453 // Attrs own their constituent strings, but build may eventually
1454 // transfer some values to its items and nil them out in the Attr.
1455 static Attr*
1456 newattr(int attid, Rune* value, Attr* link)
1457 {
1458 	Attr* ans;
1459 
1460 	ans = (Attr*)emalloc(sizeof(Attr));
1461 	ans->attid = attid;
1462 	ans->value = value;
1463 	ans->next = link;
1464 	return ans;
1465 }
1466 
1467 // Free list of Attrs linked through next field
1468 static void
1469 freeattrs(Attr* ahead)
1470 {
1471 	Attr* a;
1472 	Attr* nexta;
1473 
1474 	a = ahead;
1475 	while(a != nil) {
1476 		nexta = a->next;
1477 		free(a->value);
1478 		free(a);
1479 		a = nexta;
1480 	}
1481 }
1482 
1483 // Free array of Tokens.
1484 // Allocated space might have room for more than n tokens,
1485 // but only n of them are initialized.
1486 // If caller has transferred ownership of constitutent strings
1487 // or attributes, it must have nil'd out the pointers in the Tokens.
1488 void
1489 _freetokens(Token* tarray, int n)
1490 {
1491 	int i;
1492 	Token* t;
1493 
1494 	if(tarray == nil)
1495 		return;
1496 	for(i = 0; i < n; i++) {
1497 		t = &tarray[i];
1498 		free(t->text);
1499 		freeattrs(t->attr);
1500 	}
1501 	free(tarray);
1502 }
1503