xref: /plan9/sys/src/libhtml/lex.c (revision edc15dd6d93c9a1b27df91ec433ef75c20e7345d)
1 #include <u.h>
2 #include <libc.h>
3 #include <draw.h>
4 #include <ctype.h>
5 #include <html.h>
6 #include "impl.h"
7 
8 typedef struct TokenSource TokenSource;
9 struct TokenSource
10 {
11 	int			i;		// index of next byte to use
12 	uchar*		data;		// all the data
13 	int			edata;	// data[0:edata] is valid
14 	int			chset;	// one of US_Ascii, etc.
15 	int			mtype;	// TextHtml or TextPlain
16 };
17 
// Negative sentinel values (presumably end-of-buffer and end-of-file).
// Nothing else in the visible part of this file refers to them:
// getchar() reports "no more input" with a literal -1.
enum {
	EOB = -1,
	EOF = -2
};
22 
// True if c may continue a tag or attribute name: an 8-bit letter
// or digit, '-' or '.'.  The argument is evaluated several times, so
// it must be free of side effects; callers reject c < 0 beforehand.
#define ISNAMCHAR(c)	((c) < 256 && ((c) == '-' || (c) == '.' || isdigit(c) || isalpha(c)))

// Element counts of the on-stack scratch buffers used by the lexer.
#define SMALLBUFSIZE	240
#define BIGBUFSIZE	2000
27 
// HTML 4.0 tag names, indexed by tag id.
// Keep sorted, and in correspondence with the tag enum.
// NOTE(review): the original comment placed that enum in iparse.h, but
// this file only includes impl.h -- confirm which header is meant.
// lexinit() builds tagtable from this array (Numtags entries) and
// Tconv() indexes it directly to print a token's name.
Rune* tagnames[] = {
	L" ",
	L"!",
	L"a",
	L"abbr",
	L"acronym",
	L"address",
	L"applet",
	L"area",
	L"b",
	L"base",
	L"basefont",
	L"bdo",
	L"big",
	L"blink",
	L"blockquote",
	L"body",
	L"bq",
	L"br",
	L"button",
	L"caption",
	L"center",
	L"cite",
	L"code",
	L"col",
	L"colgroup",
	L"dd",
	L"del",
	L"dfn",
	L"dir",
	L"div",
	L"dl",
	L"dt",
	L"em",
	L"fieldset",
	L"font",
	L"form",
	L"frame",
	L"frameset",
	L"h1",
	L"h2",
	L"h3",
	L"h4",
	L"h5",
	L"h6",
	L"head",
	L"hr",
	L"html",
	L"i",
	L"iframe",
	L"img",
	L"input",
	L"ins",
	L"isindex",
	L"kbd",
	L"label",
	L"legend",
	L"li",
	L"link",
	L"map",
	L"menu",
	L"meta",
	L"nobr",
	L"noframes",
	L"noscript",
	L"object",
	L"ol",
	L"optgroup",
	L"option",
	L"p",
	L"param",
	L"pre",
	L"q",
	L"s",
	L"samp",
	L"script",
	L"select",
	L"small",
	L"span",
	L"strike",
	L"strong",
	L"style",
	L"sub",
	L"sup",
	L"table",
	L"tbody",
	L"td",
	L"textarea",
	L"tfoot",
	L"th",
	L"thead",
	L"title",
	L"tr",
	L"tt",
	L"u",
	L"ul",
	L"var"
};
128 
// HTML 4.0 attribute names, indexed by attribute id.
// Keep sorted, and in correspondence with enum in impl.h.
// lexinit() builds attrtable from this array (Numattrs entries) and
// Tconv() indexes it by Attr.attid to print attribute names.
// NOTE(review): HTML 4 spells the attribute "accesskey" (no hyphen) and
// also defines "onkeydown", which is absent here -- confirm against the
// enum before touching either, since entries must stay in step with it.
Rune* attrnames[] = {
	L"abbr",
	L"accept-charset",
	L"access-key",
	L"action",
	L"align",
	L"alink",
	L"alt",
	L"archive",
	L"axis",
	L"background",
	L"bgcolor",
	L"border",
	L"cellpadding",
	L"cellspacing",
	L"char",
	L"charoff",
	L"charset",
	L"checked",
	L"cite",
	L"class",
	L"classid",
	L"clear",
	L"code",
	L"codebase",
	L"codetype",
	L"color",
	L"cols",
	L"colspan",
	L"compact",
	L"content",
	L"coords",
	L"data",
	L"datetime",
	L"declare",
	L"defer",
	L"dir",
	L"disabled",
	L"enctype",
	L"face",
	L"for",
	L"frame",
	L"frameborder",
	L"headers",
	L"height",
	L"href",
	L"hreflang",
	L"hspace",
	L"http-equiv",
	L"id",
	L"ismap",
	L"label",
	L"lang",
	L"link",
	L"longdesc",
	L"marginheight",
	L"marginwidth",
	L"maxlength",
	L"media",
	L"method",
	L"multiple",
	L"name",
	L"nohref",
	L"noresize",
	L"noshade",
	L"nowrap",
	L"object",
	L"onblur",
	L"onchange",
	L"onclick",
	L"ondblclick",
	L"onfocus",
	L"onkeypress",
	L"onkeyup",
	L"onload",
	L"onmousedown",
	L"onmousemove",
	L"onmouseout",
	L"onmouseover",
	L"onmouseup",
	L"onreset",
	L"onselect",
	L"onsubmit",
	L"onunload",
	L"profile",
	L"prompt",
	L"readonly",
	L"rel",
	L"rev",
	L"rows",
	L"rowspan",
	L"rules",
	L"scheme",
	L"scope",
	L"scrolling",
	L"selected",
	L"shape",
	L"size",
	L"span",
	L"src",
	L"standby",
	L"start",
	L"style",
	L"summary",
	L"tabindex",
	L"target",
	L"text",
	L"title",
	L"type",
	L"usemap",
	L"valign",
	L"value",
	L"valuetype",
	L"version",
	L"vlink",
	L"vspace",
	L"width"
};
249 
250 
// Character entity to unicode character number map.
// Keep sorted by name: ampersand() searches it with _lookup().
// Upper-case names come before lower-case ones (e.g. "dArr" before
// "dagger"), so the order is by character code, not case-folded.
// NOTE(review): "varepsilon" maps to 8712, the same value as "isin" --
// confirm that is intended.
StringInt	chartab[]= {
	{L"AElig", 198},
	{L"Aacute", 193},
	{L"Acirc", 194},
	{L"Agrave", 192},
	{L"Alpha", 913},
	{L"Aring", 197},
	{L"Atilde", 195},
	{L"Auml", 196},
	{L"Beta", 914},
	{L"Ccedil", 199},
	{L"Chi", 935},
	{L"Dagger", 8225},
	{L"Delta", 916},
	{L"ETH", 208},
	{L"Eacute", 201},
	{L"Ecirc", 202},
	{L"Egrave", 200},
	{L"Epsilon", 917},
	{L"Eta", 919},
	{L"Euml", 203},
	{L"Gamma", 915},
	{L"Iacute", 205},
	{L"Icirc", 206},
	{L"Igrave", 204},
	{L"Iota", 921},
	{L"Iuml", 207},
	{L"Kappa", 922},
	{L"Lambda", 923},
	{L"Mu", 924},
	{L"Ntilde", 209},
	{L"Nu", 925},
	{L"OElig", 338},
	{L"Oacute", 211},
	{L"Ocirc", 212},
	{L"Ograve", 210},
	{L"Omega", 937},
	{L"Omicron", 927},
	{L"Oslash", 216},
	{L"Otilde", 213},
	{L"Ouml", 214},
	{L"Phi", 934},
	{L"Pi", 928},
	{L"Prime", 8243},
	{L"Psi", 936},
	{L"Rho", 929},
	{L"Scaron", 352},
	{L"Sigma", 931},
	{L"THORN", 222},
	{L"Tau", 932},
	{L"Theta", 920},
	{L"Uacute", 218},
	{L"Ucirc", 219},
	{L"Ugrave", 217},
	{L"Upsilon", 933},
	{L"Uuml", 220},
	{L"Xi", 926},
	{L"Yacute", 221},
	{L"Yuml", 376},
	{L"Zeta", 918},
	{L"aacute", 225},
	{L"acirc", 226},
	{L"acute", 180},
	{L"aelig", 230},
	{L"agrave", 224},
	{L"alefsym", 8501},
	{L"alpha", 945},
	{L"amp", 38},
	{L"and", 8743},
	{L"ang", 8736},
	{L"aring", 229},
	{L"asymp", 8776},
	{L"atilde", 227},
	{L"auml", 228},
	{L"bdquo", 8222},
	{L"beta", 946},
	{L"brvbar", 166},
	{L"bull", 8226},
	{L"cap", 8745},
	{L"ccedil", 231},
	{L"cdots", 8943},
	{L"cedil", 184},
	{L"cent", 162},
	{L"chi", 967},
	{L"circ", 710},
	{L"clubs", 9827},
	{L"cong", 8773},
	{L"copy", 169},
	{L"crarr", 8629},
	{L"cup", 8746},
	{L"curren", 164},
	{L"dArr", 8659},
	{L"dagger", 8224},
	{L"darr", 8595},
	{L"ddots", 8945},
	{L"deg", 176},
	{L"delta", 948},
	{L"diams", 9830},
	{L"divide", 247},
	{L"eacute", 233},
	{L"ecirc", 234},
	{L"egrave", 232},
	{L"emdash", 8212},	/* non-standard but commonly used */
	{L"empty", 8709},
	{L"emsp", 8195},
	{L"endash", 8211},	/* non-standard but commonly used */
	{L"ensp", 8194},
	{L"epsilon", 949},
	{L"equiv", 8801},
	{L"eta", 951},
	{L"eth", 240},
	{L"euml", 235},
	{L"euro", 8364},
	{L"exist", 8707},
	{L"fnof", 402},
	{L"forall", 8704},
	{L"frac12", 189},
	{L"frac14", 188},
	{L"frac34", 190},
	{L"frasl", 8260},
	{L"gamma", 947},
	{L"ge", 8805},
	{L"gt", 62},
	{L"hArr", 8660},
	{L"harr", 8596},
	{L"hearts", 9829},
	{L"hellip", 8230},
	{L"iacute", 237},
	{L"icirc", 238},
	{L"iexcl", 161},
	{L"igrave", 236},
	{L"image", 8465},
	{L"infin", 8734},
	{L"int", 8747},
	{L"iota", 953},
	{L"iquest", 191},
	{L"isin", 8712},
	{L"iuml", 239},
	{L"kappa", 954},
	{L"lArr", 8656},
	{L"lambda", 955},
	{L"lang", 9001},
	{L"laquo", 171},
	{L"larr", 8592},
	{L"lceil", 8968},
	{L"ldots", 8230},
	{L"ldquo", 8220},
	{L"le", 8804},
	{L"lfloor", 8970},
	{L"lowast", 8727},
	{L"loz", 9674},
	{L"lrm", 8206},
	{L"lsaquo", 8249},
	{L"lsquo", 8216},
	{L"lt", 60},
	{L"macr", 175},
	{L"mdash", 8212},
	{L"micro", 181},
	{L"middot", 183},
	{L"minus", 8722},
	{L"mu", 956},
	{L"nabla", 8711},
	{L"nbsp", 160},
	{L"ndash", 8211},
	{L"ne", 8800},
	{L"ni", 8715},
	{L"not", 172},
	{L"notin", 8713},
	{L"nsub", 8836},
	{L"ntilde", 241},
	{L"nu", 957},
	{L"oacute", 243},
	{L"ocirc", 244},
	{L"oelig", 339},
	{L"ograve", 242},
	{L"oline", 8254},
	{L"omega", 969},
	{L"omicron", 959},
	{L"oplus", 8853},
	{L"or", 8744},
	{L"ordf", 170},
	{L"ordm", 186},
	{L"oslash", 248},
	{L"otilde", 245},
	{L"otimes", 8855},
	{L"ouml", 246},
	{L"para", 182},
	{L"part", 8706},
	{L"permil", 8240},
	{L"perp", 8869},
	{L"phi", 966},
	{L"pi", 960},
	{L"piv", 982},
	{L"plusmn", 177},
	{L"pound", 163},
	{L"prime", 8242},
	{L"prod", 8719},
	{L"prop", 8733},
	{L"psi", 968},
	{L"quad", 8193},
	{L"quot", 34},
	{L"rArr", 8658},
	{L"radic", 8730},
	{L"rang", 9002},
	{L"raquo", 187},
	{L"rarr", 8594},
	{L"rceil", 8969},
	{L"rdquo", 8221},
	{L"real", 8476},
	{L"reg", 174},
	{L"rfloor", 8971},
	{L"rho", 961},
	{L"rlm", 8207},
	{L"rsaquo", 8250},
	{L"rsquo", 8217},
	{L"sbquo", 8218},
	{L"scaron", 353},
	{L"sdot", 8901},
	{L"sect", 167},
	{L"shy", 173},
	{L"sigma", 963},
	{L"sigmaf", 962},
	{L"sim", 8764},
	{L"sp", 8194},
	{L"spades", 9824},
	{L"sub", 8834},
	{L"sube", 8838},
	{L"sum", 8721},
	{L"sup", 8835},
	{L"sup1", 185},
	{L"sup2", 178},
	{L"sup3", 179},
	{L"supe", 8839},
	{L"szlig", 223},
	{L"tau", 964},
	{L"there4", 8756},
	{L"theta", 952},
	{L"thetasym", 977},
	{L"thinsp", 8201},
	{L"thorn", 254},
	{L"tilde", 732},
	{L"times", 215},
	{L"trade", 8482},
	{L"uArr", 8657},
	{L"uacute", 250},
	{L"uarr", 8593},
	{L"ucirc", 251},
	{L"ugrave", 249},
	{L"uml", 168},
	{L"upsih", 978},
	{L"upsilon", 965},
	{L"uuml", 252},
	{L"varepsilon", 8712},
	{L"varphi", 981},
	{L"varpi", 982},
	{L"varrho", 1009},
	{L"vdots", 8942},
	{L"vsigma", 962},
	{L"vtheta", 977},
	{L"weierp", 8472},
	{L"xi", 958},
	{L"yacute", 253},
	{L"yen", 165},
	{L"yuml", 255},
	{L"zeta", 950},
	{L"zwj", 8205},
	{L"zwnj", 8204}
};
// Number of entries in chartab.
#define NCHARTAB (sizeof(chartab)/sizeof(chartab[0]))
522 
// Characters Winstart..Winend are those that Windows
// uses interpolated into the Latin1 set.
// They aren't supposed to appear in HTML, but they do....
enum {
	Winstart = 127,
	Winend = 159
};

// Unicode replacement for each of those codes, indexed by
// (code - Winstart).  8226 is a bullet.
static int	winchars[] = {
	8226,						// 127
	8226, 8226, 8218, 402, 8222, 8230, 8224, 8225,	// 128-135
	710, 8240, 352, 8249, 338, 8226, 8226, 8226,	// 136-143
	8226, 8216, 8217, 8220, 8221, 8226, 8211, 8212,	// 144-151
	732, 8482, 353, 8250, 339, 8226, 8226, 376	// 152-159
};
536 
// Name lookup tables, built once by lexinit().
static StringInt*	tagtable;		// initialized from tagnames
static StringInt*	attrtable;		// initialized from attrnames

// Forward declarations of the file-local helpers.
// NOTE(review): lowerc and freeinsidetoken are declared here but no
// definition appears in the part of the file visible to this review --
// confirm whether the prototypes are still needed.
static void	lexinit(void);
static int		getplaindata(TokenSource* ts, Token* a, int* pai);
static int		getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai);
static int		getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai, int findtag);
static int		gettag(TokenSource* ts, int starti, Token* a, int* pai);
static Rune*	buftostr(Rune* s, Rune* buf, int j);
static int		comment(TokenSource* ts);
static int		findstr(TokenSource* ts, Rune* s);
static int		ampersand(TokenSource* ts);
static int		lowerc(int c);
static int		getchar(TokenSource* ts);
static void		ungetchar(TokenSource* ts, int c);
static void		backup(TokenSource* ts, int savei);
static void		freeinsidetoken(Token* t);
static void		freeattrs(Attr* ahead);
static Attr*	newattr(int attid, Rune* value, Attr* link);
static int		Tconv(Fmt* f);

// Debug level: nonzero traces _gettoks and backup on fd 2;
// values above 1 also print each token and its start offset.
int	dbglex = 0;
// Set to 1 by lexinit(); _gettoks() checks it so setup runs once.
static int lexinited = 0;
560 
// One-time setup, run from _gettoks() while lexinited is still 0:
// build the tag and attribute lookup tables from tagnames and
// attrnames, and install Tconv as the handler for the 'T' print verb.
static void
lexinit(void)
{
	tagtable = _makestrinttab(tagnames, Numtags);
	attrtable = _makestrinttab(attrnames, Numattrs);
	fmtinstall('T', Tconv);
	lexinited = 1;
}
569 
570 static TokenSource*
newtokensource(uchar * data,int edata,int chset,int mtype)571 newtokensource(uchar* data, int edata, int chset, int mtype)
572 {
573 	TokenSource*	ans;
574 
575 	assert(chset == US_Ascii || chset == ISO_8859_1 ||
576 			chset == UTF_8 || chset == Unicode);
577 	ans = (TokenSource*)emalloc(sizeof(TokenSource));
578 	ans->i = 0;
579 	ans->data = data;
580 	ans->edata = edata;
581 	ans->chset = chset;
582 	ans->mtype = mtype;
583 	return ans;
584 }
585 
586 enum {
587 	ToksChunk = 500,
588 };
589 
590 // Call this to get the tokens.
591 //  The number of returned tokens is returned in *plen.
592 Token*
_gettoks(uchar * data,int datalen,int chset,int mtype,int * plen)593 _gettoks(uchar* data, int datalen, int chset, int mtype, int* plen)
594 {
595 	TokenSource*	ts;
596 	Token*		a;
597 	int	alen;
598 	int	ai;
599 	int	starti;
600 	int	c;
601 	int	tag;
602 
603 	if(!lexinited)
604 		lexinit();
605 	ts = newtokensource(data, datalen, chset, mtype);
606 	if(dbglex)
607 		fprint(2, "_gettoks starts, ts.i=%d, ts.edata=%d\n", ts->i, ts->edata);
608 	alen = 0;
609 	ai = 0;
610 	a = 0;
611 	if(ts->mtype == TextHtml) {
612 		for(;;) {
613 			if(alen - ai < ToksChunk/32) {
614 				alen += ToksChunk;
615 				a = erealloc(a, alen*sizeof *a);
616 			}
617 			starti = ts->i;
618 			c = getchar(ts);
619 			if(c < 0)
620 				break;
621 			if(c == '<') {
622 				tag = gettag(ts, starti, a, &ai);
623 				if(tag == Tscript || tag == Tstyle) {
624 					// special rules for getting Data after....
625 					starti = ts->i;
626 					c = getchar(ts);
627 					tag = getscriptdata(ts, c, starti, a, &ai, tag);
628 				}
629 			}
630 			else
631 				tag = getdata(ts, c, starti, a, &ai);
632 			if(tag == -1)
633 				break;
634 			else if(dbglex > 1 && tag != Comment)
635 				fprint(2, "lex: got token %T\n", &a[ai-1]);
636 		}
637 	}
638 	else {
639 		// plain text (non-html) tokens
640 		for(;;) {
641 			if(alen - ai < ToksChunk/32) {
642 				alen += ToksChunk;
643 				a = erealloc(a, alen*sizeof *a);
644 			}
645 			tag = getplaindata(ts, a, &ai);
646 			if(tag == -1)
647 				break;
648 			if(dbglex > 1)
649 				fprint(2, "lex: got token %T\n", &a[ai]);
650 		}
651 	}
652 	free(ts);
653 	if(dbglex)
654 		fprint(2, "lex: returning %d tokens\n", ai);
655 	*plen = ai;
656 	if(ai == 0){
657 		free(a);
658 		a = 0;
659 	}
660 	return a;
661 }
662 
663 // For case where source isn't HTML.
664 // Just make data tokens, one per line (or partial line,
665 // at end of buffer), ignoring non-whitespace control
666 // characters and dumping \r's.
667 // If find non-empty token, fill in a[*pai], bump *pai, and return Data.
668 // Otherwise return -1;
669 static int
getplaindata(TokenSource * ts,Token * a,int * pai)670 getplaindata(TokenSource* ts, Token* a, int* pai)
671 {
672 	Rune*	s;
673 	int	j;
674 	int	starti;
675 	int	c;
676 	Token*	tok;
677 	Rune	buf[BIGBUFSIZE];
678 
679 	s = nil;
680 	j = 0;
681 	starti = ts->i;
682 	for(c = getchar(ts); c >= 0; c = getchar(ts)) {
683 		if(c < ' ') {
684 			if(isspace(c)) {
685 				if(c == '\r') {
686 					// ignore it unless no following '\n',
687 					// in which case treat it like '\n'
688 					c = getchar(ts);
689 					if(c != '\n') {
690 						if(c >= 0)
691 							ungetchar(ts, c);
692 						c = '\n';
693 					}
694 				}
695 			}
696 			else
697 				c = 0;
698 		}
699 		if(c != 0) {
700 			buf[j++] = c;
701 			if(j == nelem(buf)-1) {
702 				s = buftostr(s, buf, j);
703 				j = 0;
704 			}
705 		}
706 		if(c == '\n')
707 			break;
708 	}
709 	s = buftostr(s, buf, j);
710 	if(s == nil)
711 		return -1;
712 	tok = &a[(*pai)++];
713 	tok->tag = Data;
714 	tok->text = s;
715 	tok->attr = nil;
716 	tok->starti = starti;
717 	return Data;
718 }
719 
720 // Return concatenation of s and buf[0:j]
721 static Rune*
buftostr(Rune * s,Rune * buf,int j)722 buftostr(Rune* s, Rune* buf, int j)
723 {
724 	int i;
725 
726 	if(s == nil)
727 		s = _Strndup(buf, j);
728 	else {
729 		i = _Strlen(s);
730 		s = realloc(s, ( i+j+1)*sizeof *s);
731 		memcpy(&s[i], buf, j*sizeof *s);
732 		s[i+j] = 0;
733 	}
734 	return s;
735 }
736 
737 // Gather data up to next start-of-tag or end-of-buffer.
738 // Translate entity references (&amp;).
739 // Ignore non-whitespace control characters and get rid of \r's.
740 // If find non-empty token, fill in a[*pai], bump *pai, and return Data.
741 // Otherwise return -1;
742 static int
getdata(TokenSource * ts,int firstc,int starti,Token * a,int * pai)743 getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai)
744 {
745 	Rune*	s;
746 	int	j;
747 	int	c;
748 	Token*	tok;
749 	Rune	buf[SMALLBUFSIZE];
750 
751 	s = nil;
752 	j = 0;
753 	for(c = firstc; c >= 0; c = getchar(ts)){
754 		if(c == '&') {
755 			c = ampersand(ts);
756 			if(c < 0)
757 				break;
758 		}
759 		else if(c < ' ') {
760 			if(isspace(c)) {
761 				if(c == '\r') {
762 					// ignore it unless no following '\n',
763 					// in which case treat it like '\n'
764 					c = getchar(ts);
765 					if(c != '\n') {
766 						if(c >= 0)
767 							ungetchar(ts, c);
768 						c = '\n';
769 					}
770 				}
771 			}
772 			else {
773 				if(warn)
774 					fprint(2, "warning: non-whitespace control character %d ignored\n", c);
775 				c = 0;
776 			}
777 		}
778 		else if(c == '<') {
779 			ungetchar(ts, c);
780 			break;
781 		}
782 		if(c != 0) {
783 			buf[j++] = c;
784 			if(j == nelem(buf)-1) {
785 				s = buftostr(s, buf, j);
786 				j = 0;
787 			}
788 		}
789 	}
790 	s = buftostr(s, buf, j);
791 	if(s == nil)
792 		return -1;
793 	tok = &a[(*pai)++];
794 	tok->tag = Data;
795 	tok->text = s;
796 	tok->attr = nil;
797 	tok->starti = starti;
798 	return Data;
799 }
800 
801 // The rules for lexing scripts are different (ugh).
802 // Gather up everything until see an "</" tagnames[tok] ">"
803 static int
getscriptdata(TokenSource * ts,int firstc,int starti,Token * a,int * pai,int findtag)804 getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai, int findtag)
805 {
806 	Rune*	s;
807 	int	j;
808 	int	tstarti;
809 	int	savei;
810 	int	c;
811 	int	tag;
812 	int	done;
813 	Token*	tok;
814 	Rune	buf[BIGBUFSIZE];
815 
816 	s = nil;
817 	j = 0;
818 	tstarti = starti;
819 	c = firstc;
820 	done = 0;
821 	while(c >= 0) {
822 		if(c == '<') {
823 			// other browsers ignore stuff to end of line after <!
824 			savei = ts->i;
825 			c = getchar(ts);
826 			if(c == '!') {
827 				if(comment(ts) == -1)
828 					break;
829 				if(c == '\r')
830 					c = getchar(ts);
831 				if(c == '\n')
832 					c = getchar(ts);
833 			}
834 			else if(c >= 0) {
835 				backup(ts, savei);
836 				tag = gettag(ts, tstarti, a, pai);
837 				if(tag == -1)
838 					break;
839 				if(tag != Comment)
840 					(*pai)--;
841 				backup(ts, tstarti);
842 				if(tag == findtag + RBRA) {
843 					done = 1;
844 					break;
845 				}
846 				// here tag was not the one we were looking for, so take as regular data
847 				c = getchar(ts);
848 			}
849 		}
850 		if(c < 0)
851 			break;
852 		if(c != 0) {
853 			buf[j++] = c;
854 			if(j == nelem(buf)-1) {
855 				s = buftostr(s, buf, j);
856 				j = 0;
857 			}
858 		}
859 		tstarti = ts->i;
860 		c = getchar(ts);
861 	}
862 	if(done || ts->i == ts->edata) {
863 		s = buftostr(s, buf, j);
864 		tok = &a[(*pai)++];
865 		tok->tag = Data;
866 		tok->text = s;
867 		tok->attr = nil;
868 		tok->starti = starti;
869 		return Data;
870 	}
871 	free(s);
872 	backup(ts, starti);
873 	return -1;
874 }
875 
876 // We've just seen a '<'.  Gather up stuff to closing '>' (if buffer
877 // ends before then, return -1).
878 // If it's a tag, look up the name, gather the attributes, and return
879 // the appropriate token.
880 // Else it's either just plain data or some kind of ignorable stuff:
881 // return Data or Comment as appropriate.
882 // If it's not a Comment, put it in a[*pai] and bump *pai.
883 static int
gettag(TokenSource * ts,int starti,Token * a,int * pai)884 gettag(TokenSource* ts, int starti, Token* a, int* pai)
885 {
886 	int	rbra;
887 	int	ans;
888 	Attr*	al;
889 	int	nexti;
890 	int	c;
891 	int	ti;
892 	int	afnd;
893 	int	attid;
894 	int	quote;
895 	Rune*	val;
896 	int	nv;
897 	int	i;
898 	int	tag;
899 	Token*	tok;
900 	Rune	buf[BIGBUFSIZE];
901 
902 	rbra = 0;
903 	nexti = ts->i;
904 	tok = &a[*pai];
905 	tok->tag = Notfound;
906 	tok->text = nil;
907 	tok->attr = nil;
908 	tok->starti = starti;
909 	c = getchar(ts);
910 	if(c == '/') {
911 		rbra = RBRA;
912 		c = getchar(ts);
913 	}
914 	if(c < 0)
915 		goto eob_done;
916 	if(c >= 256 || !isalpha(c)) {
917 		// not a tag
918 		if(c == '!') {
919 			ans = comment(ts);
920 			if(ans != -1)
921 				return ans;
922 			goto eob_done;
923 		}
924 		else {
925 			backup(ts, nexti);
926 			tok->tag = Data;
927 			tok->text = _Strdup(L"<");
928 			(*pai)++;
929 			return Data;
930 		}
931 	}
932 	// c starts a tagname
933 	buf[0] = c;
934 	i = 1;
935 	while(1) {
936 		c = getchar(ts);
937 		if(c < 0)
938 			goto eob_done;
939 		if(!ISNAMCHAR(c))
940 			break;
941 		// if name is bigger than buf it won't be found anyway...
942 		if(i < BIGBUFSIZE)
943 			buf[i++] = c;
944 	}
945 	if(_lookup(tagtable, Numtags, buf, i, &tag))
946 		tok->tag = tag + rbra;
947 	else
948 		tok->text = _Strndup(buf, i);	// for warning print, in build
949 	// attribute gathering loop
950 	al = nil;
951 	while(1) {
952 		// look for "ws name" or "ws name ws = ws val"  (ws=whitespace)
953 		// skip whitespace
954 attrloop_continue:
955 		while(c < 256 && isspace(c)) {
956 			c = getchar(ts);
957 			if(c < 0)
958 				goto eob_done;
959 		}
960 		if(c == '>')
961 			goto attrloop_done;
962 		if(c == '<') {
963 			if(warn)
964 				fprint(2, "warning: unclosed tag\n");
965 			ungetchar(ts, c);
966 			goto attrloop_done;
967 		}
968 		if(c >= 256 || !isalpha(c)) {
969 			if(warn)
970 				fprint(2, "warning: expected attribute name\n");
971 			// skipt to next attribute name
972 			while(1) {
973 				c = getchar(ts);
974 				if(c < 0)
975 					goto eob_done;
976 				if(c < 256 && isalpha(c))
977 					goto attrloop_continue;
978 				if(c == '<') {
979 					if(warn)
980 						fprint(2, "warning: unclosed tag\n");
981 					ungetchar(ts, 60);
982 					goto attrloop_done;
983 				}
984 				if(c == '>')
985 					goto attrloop_done;
986 			}
987 		}
988 		// gather attribute name
989 		buf[0] = c;
990 		i = 1;
991 		while(1) {
992 			c = getchar(ts);
993 			if(c < 0)
994 				goto eob_done;
995 			if(!ISNAMCHAR(c))
996 				break;
997 			if(i < BIGBUFSIZE-1)
998 				buf[i++] = c;
999 		}
1000 		afnd = _lookup(attrtable, Numattrs, buf, i, &attid);
1001 		if(warn && !afnd) {
1002 			buf[i] = 0;
1003 			fprint(2, "warning: unknown attribute name %S\n", buf);
1004 		}
1005 		// skip whitespace
1006 		while(c < 256 && isspace(c)) {
1007 			c = getchar(ts);
1008 			if(c < 0)
1009 				goto eob_done;
1010 		}
1011 		if(c != '=') {
1012 			if(afnd)
1013 				al = newattr(attid, nil, al);
1014 			goto attrloop_continue;
1015 		}
1016 		//# c is '=' here;  skip whitespace
1017 		while(1) {
1018 			c = getchar(ts);
1019 			if(c < 0)
1020 				goto eob_done;
1021 			if(c >= 256 || !isspace(c))
1022 				break;
1023 		}
1024 		quote = 0;
1025 		if(c == '\'' || c == '"') {
1026 			quote = c;
1027 			c = getchar(ts);
1028 			if(c < 0)
1029 				goto eob_done;
1030 		}
1031 		val = nil;
1032 		nv = 0;
1033 		while(1) {
1034 valloop_continue:
1035 			if(c < 0)
1036 				goto eob_done;
1037 			if(c == '>') {
1038 				if(quote) {
1039 					// c might be part of string (though not good style)
1040 					// but if line ends before close quote, assume
1041 					// there was an unmatched quote
1042 					ti = ts->i;
1043 					while(1) {
1044 						c = getchar(ts);
1045 						if(c < 0)
1046 							goto eob_done;
1047 						if(c == quote) {
1048 							backup(ts, ti);
1049 							buf[nv++] = '>';
1050 							if(nv == BIGBUFSIZE-1) {
1051 								val = buftostr(val, buf, nv);
1052 								nv = 0;
1053 							}
1054 							c = getchar(ts);
1055 							goto valloop_continue;
1056 						}
1057 						if(c == '\n') {
1058 							if(warn)
1059 								fprint(2, "warning: apparent unmatched quote\n");
1060 							backup(ts, ti);
1061 							c = '>';
1062 							goto valloop_done;
1063 						}
1064 					}
1065 				}
1066 				else
1067 					goto valloop_done;
1068 			}
1069 			if(quote) {
1070 				if(c == quote) {
1071 					c = getchar(ts);
1072 					if(c < 0)
1073 						goto eob_done;
1074 					goto valloop_done;
1075 				}
1076 				if(c == '\r') {
1077 					c = getchar(ts);
1078 					goto valloop_continue;
1079 				}
1080 				if(c == '\t' || c == '\n')
1081 					c = ' ';
1082 			}
1083 			else {
1084 				if(c < 256 && isspace(c))
1085 					goto valloop_done;
1086 			}
1087 			if(c == '&') {
1088 				c = ampersand(ts);
1089 				if(c == -1)
1090 					goto eob_done;
1091 			}
1092 			buf[nv++] = c;
1093 			if(nv == BIGBUFSIZE-1) {
1094 				val = buftostr(val, buf, nv);
1095 				nv = 0;
1096 			}
1097 			c = getchar(ts);
1098 		}
1099 valloop_done:
1100 		if(afnd) {
1101 			val = buftostr(val, buf, nv);
1102 			al = newattr(attid, val, al);
1103 		}
1104 	}
1105 
1106 attrloop_done:
1107 	tok->attr = al;
1108 	(*pai)++;
1109 	return tok->tag;
1110 
1111 eob_done:
1112 	if(warn)
1113 		fprint(2, "warning: incomplete tag at end of page\n");
1114 	backup(ts, nexti);
1115 	tok->tag = Data;
1116 	tok->text = _Strdup(L"<");
1117 	return Data;
1118 }
1119 
1120 // We've just read a '<!' at position starti,
1121 // so this may be a comment or other ignored section, or it may
1122 // be just a literal string if there is no close before end of file
1123 // (other browsers do that).
1124 // The accepted practice seems to be (note: contrary to SGML spec!):
1125 // If see <!--, look for --> to close, or if none, > to close.
1126 // If see <!(not --), look for > to close.
1127 // If no close before end of file, leave original characters in as literal data.
1128 //
1129 // If we see ignorable stuff, return Comment.
1130 // Else return nil (caller should back up and try again when more data arrives,
1131 // unless at end of file, in which case caller should just make '<' a data token).
1132 static int
comment(TokenSource * ts)1133 comment(TokenSource* ts)
1134 {
1135 	int	nexti;
1136 	int	havecomment;
1137 	int	c;
1138 
1139 	nexti = ts->i;
1140 	havecomment = 0;
1141 	c = getchar(ts);
1142 	if(c == '-') {
1143 		c = getchar(ts);
1144 		if(c == '-') {
1145 			if(findstr(ts, L"-->"))
1146 				havecomment = 1;
1147 			else
1148 				backup(ts, nexti);
1149 		}
1150 	}
1151 	if(!havecomment) {
1152 		if(c == '>')
1153 			havecomment = 1;
1154 		else if(c >= 0) {
1155 			if(findstr(ts, L">"))
1156 				havecomment = 1;
1157 		}
1158 	}
1159 	if(havecomment)
1160 		return Comment;
1161 	return -1;
1162 }
1163 
1164 // Look for string s in token source.
1165 // If found, return 1, with buffer at next char after s,
1166 // else return 0 (caller should back up).
1167 static int
findstr(TokenSource * ts,Rune * s)1168 findstr(TokenSource* ts, Rune* s)
1169 {
1170 	int	c0;
1171 	int	n;
1172 	int	nexti;
1173 	int	i;
1174 	int	c;
1175 
1176 	c0 = s[0];
1177 	n = runestrlen(s);
1178 	while(1) {
1179 		c = getchar(ts);
1180 		if(c < 0)
1181 			break;
1182 		if(c == c0) {
1183 			if(n == 1)
1184 				return 1;
1185 			nexti = ts->i;
1186 			for(i = 1; i < n; i++) {
1187 				c = getchar(ts);
1188 				if(c < 0)
1189 					goto mainloop_done;
1190 				if(c != s[i])
1191 					break;
1192 			}
1193 			if(i == n)
1194 				return 1;
1195 			backup(ts, nexti);
1196 		}
1197 	}
1198 mainloop_done:
1199 	return 0;
1200 }
1201 
1202 // We've just read an '&'; look for an entity reference
1203 // name, and if found, return translated char.
1204 // if there is a complete entity name but it isn't known,
1205 // back up to just past the '&' and return '&'.
1206 // If the entity can't be completed in the current buffer, back up
1207 // to the '&' and return -1.
1208 static int
ampersand(TokenSource * ts)1209 ampersand(TokenSource* ts)
1210 {
1211 	int	savei;
1212 	int	c;
1213 	int	fnd;
1214 	int	ans;
1215 	int	v;
1216 	int	k;
1217 	Rune	buf[25];
1218 
1219 	savei = ts->i;
1220 	c = getchar(ts);
1221 	fnd = 0;
1222 	ans = -1;
1223 	if(c == '#') {
1224 		c = getchar(ts);
1225 		v = 0;
1226 		if(c == 'X' || c == 'x')
1227 			for(c = getchar(ts); c < 256; c = getchar(ts))
1228 				if(c >= '0' && c <= '9')
1229 					v = v*16+c-'0';
1230 				else if(c >= 'A' && c<= 'F')
1231 					v = v*16+c-'A'+10;
1232 				else if(c >= 'a' && c <= 'f')
1233 					v = v*16+c-'a'+10;
1234 				else
1235 					break;
1236 		else
1237 			while(c >= 0) {
1238 				if(!(c < 256 && isdigit(c)))
1239 					break;
1240 				v = v*10 + c - 48;
1241 				c = getchar(ts);
1242 			}
1243 		if(c >= 0) {
1244 			if(!(c == ';' || c == '\n' || c == '\r'))
1245 				ungetchar(ts, c);
1246 			c = v;
1247 			if(c == 160)
1248 				c = 160;
1249 			if(c >= Winstart && c <= Winend) {
1250 				c = winchars[c - Winstart];
1251 			}
1252 			ans = c;
1253 			fnd = 1;
1254 		}
1255 	}
1256 	else if(c < 256 && isalpha(c)) {
1257 		buf[0] = c;
1258 		k = 1;
1259 		while(1) {
1260 			c = getchar(ts);
1261 			if(c < 0)
1262 				break;
1263 			if(c < 256 && (isalpha(c) || isdigit(c))) {
1264 				if(k < nelem(buf)-1)
1265 					buf[k++] = c;
1266 			}
1267 			else {
1268 				if(!(c == ';' || c == '\n' || c == '\r'))
1269 					ungetchar(ts, c);
1270 				break;
1271 			}
1272 		}
1273 		if(c >= 256 || c != '=' && !(isalpha(c) || isdigit(c)))
1274 			fnd = _lookup(chartab, NCHARTAB, buf, k, &ans);
1275 	}
1276 	if(!fnd) {
1277 		backup(ts, savei);
1278 		ans = '&';
1279 	}
1280 	return ans;
1281 }
1282 
1283 // Get next char, obeying ts.chset.
1284 // Returns -1 if no complete character left before current end of data.
1285 static int
getchar(TokenSource * ts)1286 getchar(TokenSource* ts)
1287 {
1288 	uchar*	buf;
1289 	int	c;
1290 	int	n;
1291 	int	ok;
1292 	Rune	r;
1293 
1294 	if(ts->i >= ts->edata)
1295 		return -1;
1296 	buf = ts->data;
1297 	c = buf[ts->i];
1298 	switch(ts->chset) {
1299 	case ISO_8859_1:
1300 		if(c >= Winstart && c <= Winend)
1301 			c = winchars[c - Winstart];
1302 		ts->i++;
1303 		break;
1304 	case US_Ascii:
1305 		if(c > 127) {
1306 			if(warn)
1307 				fprint(2, "non-ascii char (%x) when US-ASCII specified\n", c);
1308 		}
1309 		ts->i++;
1310 		break;
1311 	case UTF_8:
1312 		ok = fullrune((char*)(buf+ts->i), ts->edata-ts->i);
1313 		n = chartorune(&r, (char*)(buf+ts->i));
1314 		if(ok) {
1315 			if(warn && c == 0x80)
1316 				fprint(2, "warning: invalid utf-8 sequence (starts with %x)\n", ts->data[ts->i]);
1317 			ts->i += n;
1318 			c = r;
1319 		}
1320 		else {
1321 			// not enough bytes in buf to complete utf-8 char
1322 			ts->i = ts->edata;	// mark "all used"
1323 			c = -1;
1324 		}
1325 		break;
1326 	case Unicode:
1327 		if(ts->i < ts->edata - 1) {
1328 			//standards say most-significant byte first
1329 			c = (c << 8)|(buf[ts->i + 1]);
1330 			ts->i += 2;
1331 		}
1332 		else {
1333 			ts->i = ts->edata;	// mark "all used"
1334 			c = -1;
1335 		}
1336 		break;
1337 	default:
1338 		return -1;
1339 	}
1340 	return c;
1341 }
1342 
1343 // Assuming c was the last character returned by getchar, set
1344 // things up so that next getchar will get that same character
1345 // followed by the current 'next character', etc.
1346 static void
ungetchar(TokenSource * ts,int c)1347 ungetchar(TokenSource* ts, int c)
1348 {
1349 	int	n;
1350 	Rune	r;
1351 	char	a[UTFmax];
1352 
1353 	n = 1;
1354 	switch(ts->chset) {
1355 	case UTF_8:
1356 		if(c >= 128) {
1357 			r = c;
1358 			n = runetochar(a, &r);
1359 		}
1360 		break;
1361 	case Unicode:
1362 		n = 2;
1363 		break;
1364 	}
1365 	ts->i -= n;
1366 }
1367 
// Restore ts so that it is at the state where the index was savei.
// savei must be a value previously read from ts->i; the only state
// restored is that index.  Traces the move when dbglex is set.
static void
backup(TokenSource* ts, int savei)
{
	if(dbglex)
		fprint(2, "lex: backup; i=%d, savei=%d\n", ts->i, savei);
	ts->i = savei;
}
1376 
1377 
1378 // Look for value associated with attribute attid in token t.
1379 // If there is one, return 1 and put the value in *pans,
1380 // else return 0.
1381 // If xfer is true, transfer ownership of the string to the caller
1382 // (nil it out here); otherwise, caller must duplicate the answer
1383 // if it needs to save it.
1384 // OK to have pans==0, in which case this is just looking
1385 // to see if token is present.
1386 int
_tokaval(Token * t,int attid,Rune ** pans,int xfer)1387 _tokaval(Token* t, int attid, Rune** pans, int xfer)
1388 {
1389 	Attr*	attr;
1390 
1391 	attr = t->attr;
1392 	while(attr != nil) {
1393 		if(attr->attid == attid) {
1394 			if(pans != nil)
1395 				*pans = attr->value;
1396 			if(xfer)
1397 				attr->value = nil;
1398 			return 1;
1399 		}
1400 		attr = attr->next;
1401 	}
1402 	if(pans != nil)
1403 		*pans = nil;
1404 	return 0;
1405 }
1406 
1407 static int
Tconv(Fmt * f)1408 Tconv(Fmt *f)
1409 {
1410 	Token*	t;
1411 	int	i;
1412 	int	tag;
1413 	char*	srbra;
1414 	Rune*	aname;
1415 	Rune*	tname;
1416 	Attr*	a;
1417 	char	buf[BIGBUFSIZE];
1418 
1419 	t = va_arg(f->args, Token*);
1420 	if(t == nil)
1421 		sprint(buf, "<null>");
1422 	else {
1423 		i = 0;
1424 		if(dbglex > 1)
1425 			i = snprint(buf, sizeof(buf), "[%d]", t->starti);
1426 		tag = t->tag;
1427 		if(tag == Data) {
1428 			i += snprint(buf+i, sizeof(buf)-i-1, "'%S'", t->text);
1429 		}
1430 		else {
1431 			srbra = "";
1432 			if(tag >= RBRA) {
1433 				tag -= RBRA;
1434 				srbra = "/";
1435 			}
1436 			tname = tagnames[tag];
1437 			if(tag == Notfound)
1438 				tname = L"?";
1439 			i += snprint(buf+i, sizeof(buf)-i-1, "<%s%S", srbra, tname);
1440 			for(a = t->attr; a != nil; a = a->next) {
1441 				aname = attrnames[a->attid];
1442 				i += snprint(buf+i, sizeof(buf)-i-1, " %S", aname);
1443 				if(a->value != nil)
1444 					i += snprint(buf+i, sizeof(buf)-i-1, "=%S", a->value);
1445 			}
1446 			i += snprint(buf+i, sizeof(buf)-i-1, ">");
1447 		}
1448 		buf[i] = 0;
1449 	}
1450 	return fmtstrcpy(f, buf);
1451 }
1452 
1453 // Attrs own their constituent strings, but build may eventually
1454 // transfer some values to its items and nil them out in the Attr.
1455 static Attr*
newattr(int attid,Rune * value,Attr * link)1456 newattr(int attid, Rune* value, Attr* link)
1457 {
1458 	Attr* ans;
1459 
1460 	ans = (Attr*)emalloc(sizeof(Attr));
1461 	ans->attid = attid;
1462 	ans->value = value;
1463 	ans->next = link;
1464 	return ans;
1465 }
1466 
1467 // Free list of Attrs linked through next field
1468 static void
freeattrs(Attr * ahead)1469 freeattrs(Attr* ahead)
1470 {
1471 	Attr* a;
1472 	Attr* nexta;
1473 
1474 	a = ahead;
1475 	while(a != nil) {
1476 		nexta = a->next;
1477 		free(a->value);
1478 		free(a);
1479 		a = nexta;
1480 	}
1481 }
1482 
1483 // Free array of Tokens.
1484 // Allocated space might have room for more than n tokens,
1485 // but only n of them are initialized.
1486 // If caller has transferred ownership of constitutent strings
1487 // or attributes, it must have nil'd out the pointers in the Tokens.
1488 void
_freetokens(Token * tarray,int n)1489 _freetokens(Token* tarray, int n)
1490 {
1491 	int i;
1492 	Token* t;
1493 
1494 	if(tarray == nil)
1495 		return;
1496 	for(i = 0; i < n; i++) {
1497 		t = &tarray[i];
1498 		free(t->text);
1499 		freeattrs(t->attr);
1500 	}
1501 	free(tarray);
1502 }
1503