1 #include <u.h>
2 #include <libc.h>
3 #include <draw.h>
4 #include <ctype.h>
5 #include <html.h>
6 #include "impl.h"
7
8 typedef struct TokenSource TokenSource;
9 struct TokenSource
10 {
11 int i; // index of next byte to use
12 uchar* data; // all the data
13 int edata; // data[0:edata] is valid
14 int chset; // one of US_Ascii, etc.
15 int mtype; // TextHtml or TextPlain
16 };
17
18 enum {
19 EOF = -2,
20 EOB = -1
21 };
22
// Nonzero when c can continue a tag or attribute name: it must be
// below 256 and be a letter, a digit, '-' or '.'.
// The argument is evaluated more than once; pass a plain variable.
#define ISNAMCHAR(c) ((c) < 256 && (isalnum(c) || (c) == '-' || (c) == '.'))

// Element counts of the on-stack scratch buffers used in this file.
#define SMALLBUFSIZE 240
#define BIGBUFSIZE 2000
27
28 // HTML 4.0 tag names.
29 // Keep sorted, and in correspondence with enum in iparse.h.
30 Rune* tagnames[] = {
31 L" ",
32 L"!",
33 L"a",
34 L"abbr",
35 L"acronym",
36 L"address",
37 L"applet",
38 L"area",
39 L"b",
40 L"base",
41 L"basefont",
42 L"bdo",
43 L"big",
44 L"blink",
45 L"blockquote",
46 L"body",
47 L"bq",
48 L"br",
49 L"button",
50 L"caption",
51 L"center",
52 L"cite",
53 L"code",
54 L"col",
55 L"colgroup",
56 L"dd",
57 L"del",
58 L"dfn",
59 L"dir",
60 L"div",
61 L"dl",
62 L"dt",
63 L"em",
64 L"fieldset",
65 L"font",
66 L"form",
67 L"frame",
68 L"frameset",
69 L"h1",
70 L"h2",
71 L"h3",
72 L"h4",
73 L"h5",
74 L"h6",
75 L"head",
76 L"hr",
77 L"html",
78 L"i",
79 L"iframe",
80 L"img",
81 L"input",
82 L"ins",
83 L"isindex",
84 L"kbd",
85 L"label",
86 L"legend",
87 L"li",
88 L"link",
89 L"map",
90 L"menu",
91 L"meta",
92 L"nobr",
93 L"noframes",
94 L"noscript",
95 L"object",
96 L"ol",
97 L"optgroup",
98 L"option",
99 L"p",
100 L"param",
101 L"pre",
102 L"q",
103 L"s",
104 L"samp",
105 L"script",
106 L"select",
107 L"small",
108 L"span",
109 L"strike",
110 L"strong",
111 L"style",
112 L"sub",
113 L"sup",
114 L"table",
115 L"tbody",
116 L"td",
117 L"textarea",
118 L"tfoot",
119 L"th",
120 L"thead",
121 L"title",
122 L"tr",
123 L"tt",
124 L"u",
125 L"ul",
126 L"var"
127 };
128
129 // HTML 4.0 attribute names.
130 // Keep sorted, and in correspondence with enum in impl.h.
131 Rune* attrnames[] = {
132 L"abbr",
133 L"accept-charset",
134 L"access-key",
135 L"action",
136 L"align",
137 L"alink",
138 L"alt",
139 L"archive",
140 L"axis",
141 L"background",
142 L"bgcolor",
143 L"border",
144 L"cellpadding",
145 L"cellspacing",
146 L"char",
147 L"charoff",
148 L"charset",
149 L"checked",
150 L"cite",
151 L"class",
152 L"classid",
153 L"clear",
154 L"code",
155 L"codebase",
156 L"codetype",
157 L"color",
158 L"cols",
159 L"colspan",
160 L"compact",
161 L"content",
162 L"coords",
163 L"data",
164 L"datetime",
165 L"declare",
166 L"defer",
167 L"dir",
168 L"disabled",
169 L"enctype",
170 L"face",
171 L"for",
172 L"frame",
173 L"frameborder",
174 L"headers",
175 L"height",
176 L"href",
177 L"hreflang",
178 L"hspace",
179 L"http-equiv",
180 L"id",
181 L"ismap",
182 L"label",
183 L"lang",
184 L"link",
185 L"longdesc",
186 L"marginheight",
187 L"marginwidth",
188 L"maxlength",
189 L"media",
190 L"method",
191 L"multiple",
192 L"name",
193 L"nohref",
194 L"noresize",
195 L"noshade",
196 L"nowrap",
197 L"object",
198 L"onblur",
199 L"onchange",
200 L"onclick",
201 L"ondblclick",
202 L"onfocus",
203 L"onkeypress",
204 L"onkeyup",
205 L"onload",
206 L"onmousedown",
207 L"onmousemove",
208 L"onmouseout",
209 L"onmouseover",
210 L"onmouseup",
211 L"onreset",
212 L"onselect",
213 L"onsubmit",
214 L"onunload",
215 L"profile",
216 L"prompt",
217 L"readonly",
218 L"rel",
219 L"rev",
220 L"rows",
221 L"rowspan",
222 L"rules",
223 L"scheme",
224 L"scope",
225 L"scrolling",
226 L"selected",
227 L"shape",
228 L"size",
229 L"span",
230 L"src",
231 L"standby",
232 L"start",
233 L"style",
234 L"summary",
235 L"tabindex",
236 L"target",
237 L"text",
238 L"title",
239 L"type",
240 L"usemap",
241 L"valign",
242 L"value",
243 L"valuetype",
244 L"version",
245 L"vlink",
246 L"vspace",
247 L"width"
248 };
249
250
251 // Character entity to unicode character number map.
252 // Keep sorted by name.
253 StringInt chartab[]= {
254 {L"AElig", 198},
255 {L"Aacute", 193},
256 {L"Acirc", 194},
257 {L"Agrave", 192},
258 {L"Alpha", 913},
259 {L"Aring", 197},
260 {L"Atilde", 195},
261 {L"Auml", 196},
262 {L"Beta", 914},
263 {L"Ccedil", 199},
264 {L"Chi", 935},
265 {L"Dagger", 8225},
266 {L"Delta", 916},
267 {L"ETH", 208},
268 {L"Eacute", 201},
269 {L"Ecirc", 202},
270 {L"Egrave", 200},
271 {L"Epsilon", 917},
272 {L"Eta", 919},
273 {L"Euml", 203},
274 {L"Gamma", 915},
275 {L"Iacute", 205},
276 {L"Icirc", 206},
277 {L"Igrave", 204},
278 {L"Iota", 921},
279 {L"Iuml", 207},
280 {L"Kappa", 922},
281 {L"Lambda", 923},
282 {L"Mu", 924},
283 {L"Ntilde", 209},
284 {L"Nu", 925},
285 {L"OElig", 338},
286 {L"Oacute", 211},
287 {L"Ocirc", 212},
288 {L"Ograve", 210},
289 {L"Omega", 937},
290 {L"Omicron", 927},
291 {L"Oslash", 216},
292 {L"Otilde", 213},
293 {L"Ouml", 214},
294 {L"Phi", 934},
295 {L"Pi", 928},
296 {L"Prime", 8243},
297 {L"Psi", 936},
298 {L"Rho", 929},
299 {L"Scaron", 352},
300 {L"Sigma", 931},
301 {L"THORN", 222},
302 {L"Tau", 932},
303 {L"Theta", 920},
304 {L"Uacute", 218},
305 {L"Ucirc", 219},
306 {L"Ugrave", 217},
307 {L"Upsilon", 933},
308 {L"Uuml", 220},
309 {L"Xi", 926},
310 {L"Yacute", 221},
311 {L"Yuml", 376},
312 {L"Zeta", 918},
313 {L"aacute", 225},
314 {L"acirc", 226},
315 {L"acute", 180},
316 {L"aelig", 230},
317 {L"agrave", 224},
318 {L"alefsym", 8501},
319 {L"alpha", 945},
320 {L"amp", 38},
321 {L"and", 8743},
322 {L"ang", 8736},
323 {L"aring", 229},
324 {L"asymp", 8776},
325 {L"atilde", 227},
326 {L"auml", 228},
327 {L"bdquo", 8222},
328 {L"beta", 946},
329 {L"brvbar", 166},
330 {L"bull", 8226},
331 {L"cap", 8745},
332 {L"ccedil", 231},
333 {L"cdots", 8943},
334 {L"cedil", 184},
335 {L"cent", 162},
336 {L"chi", 967},
337 {L"circ", 710},
338 {L"clubs", 9827},
339 {L"cong", 8773},
340 {L"copy", 169},
341 {L"crarr", 8629},
342 {L"cup", 8746},
343 {L"curren", 164},
344 {L"dArr", 8659},
345 {L"dagger", 8224},
346 {L"darr", 8595},
347 {L"ddots", 8945},
348 {L"deg", 176},
349 {L"delta", 948},
350 {L"diams", 9830},
351 {L"divide", 247},
352 {L"eacute", 233},
353 {L"ecirc", 234},
354 {L"egrave", 232},
355 {L"emdash", 8212}, /* non-standard but commonly used */
356 {L"empty", 8709},
357 {L"emsp", 8195},
358 {L"endash", 8211}, /* non-standard but commonly used */
359 {L"ensp", 8194},
360 {L"epsilon", 949},
361 {L"equiv", 8801},
362 {L"eta", 951},
363 {L"eth", 240},
364 {L"euml", 235},
365 {L"euro", 8364},
366 {L"exist", 8707},
367 {L"fnof", 402},
368 {L"forall", 8704},
369 {L"frac12", 189},
370 {L"frac14", 188},
371 {L"frac34", 190},
372 {L"frasl", 8260},
373 {L"gamma", 947},
374 {L"ge", 8805},
375 {L"gt", 62},
376 {L"hArr", 8660},
377 {L"harr", 8596},
378 {L"hearts", 9829},
379 {L"hellip", 8230},
380 {L"iacute", 237},
381 {L"icirc", 238},
382 {L"iexcl", 161},
383 {L"igrave", 236},
384 {L"image", 8465},
385 {L"infin", 8734},
386 {L"int", 8747},
387 {L"iota", 953},
388 {L"iquest", 191},
389 {L"isin", 8712},
390 {L"iuml", 239},
391 {L"kappa", 954},
392 {L"lArr", 8656},
393 {L"lambda", 955},
394 {L"lang", 9001},
395 {L"laquo", 171},
396 {L"larr", 8592},
397 {L"lceil", 8968},
398 {L"ldots", 8230},
399 {L"ldquo", 8220},
400 {L"le", 8804},
401 {L"lfloor", 8970},
402 {L"lowast", 8727},
403 {L"loz", 9674},
404 {L"lrm", 8206},
405 {L"lsaquo", 8249},
406 {L"lsquo", 8216},
407 {L"lt", 60},
408 {L"macr", 175},
409 {L"mdash", 8212},
410 {L"micro", 181},
411 {L"middot", 183},
412 {L"minus", 8722},
413 {L"mu", 956},
414 {L"nabla", 8711},
415 {L"nbsp", 160},
416 {L"ndash", 8211},
417 {L"ne", 8800},
418 {L"ni", 8715},
419 {L"not", 172},
420 {L"notin", 8713},
421 {L"nsub", 8836},
422 {L"ntilde", 241},
423 {L"nu", 957},
424 {L"oacute", 243},
425 {L"ocirc", 244},
426 {L"oelig", 339},
427 {L"ograve", 242},
428 {L"oline", 8254},
429 {L"omega", 969},
430 {L"omicron", 959},
431 {L"oplus", 8853},
432 {L"or", 8744},
433 {L"ordf", 170},
434 {L"ordm", 186},
435 {L"oslash", 248},
436 {L"otilde", 245},
437 {L"otimes", 8855},
438 {L"ouml", 246},
439 {L"para", 182},
440 {L"part", 8706},
441 {L"permil", 8240},
442 {L"perp", 8869},
443 {L"phi", 966},
444 {L"pi", 960},
445 {L"piv", 982},
446 {L"plusmn", 177},
447 {L"pound", 163},
448 {L"prime", 8242},
449 {L"prod", 8719},
450 {L"prop", 8733},
451 {L"psi", 968},
452 {L"quad", 8193},
453 {L"quot", 34},
454 {L"rArr", 8658},
455 {L"radic", 8730},
456 {L"rang", 9002},
457 {L"raquo", 187},
458 {L"rarr", 8594},
459 {L"rceil", 8969},
460 {L"rdquo", 8221},
461 {L"real", 8476},
462 {L"reg", 174},
463 {L"rfloor", 8971},
464 {L"rho", 961},
465 {L"rlm", 8207},
466 {L"rsaquo", 8250},
467 {L"rsquo", 8217},
468 {L"sbquo", 8218},
469 {L"scaron", 353},
470 {L"sdot", 8901},
471 {L"sect", 167},
472 {L"shy", 173},
473 {L"sigma", 963},
474 {L"sigmaf", 962},
475 {L"sim", 8764},
476 {L"sp", 8194},
477 {L"spades", 9824},
478 {L"sub", 8834},
479 {L"sube", 8838},
480 {L"sum", 8721},
481 {L"sup", 8835},
482 {L"sup1", 185},
483 {L"sup2", 178},
484 {L"sup3", 179},
485 {L"supe", 8839},
486 {L"szlig", 223},
487 {L"tau", 964},
488 {L"there4", 8756},
489 {L"theta", 952},
490 {L"thetasym", 977},
491 {L"thinsp", 8201},
492 {L"thorn", 254},
493 {L"tilde", 732},
494 {L"times", 215},
495 {L"trade", 8482},
496 {L"uArr", 8657},
497 {L"uacute", 250},
498 {L"uarr", 8593},
499 {L"ucirc", 251},
500 {L"ugrave", 249},
501 {L"uml", 168},
502 {L"upsih", 978},
503 {L"upsilon", 965},
504 {L"uuml", 252},
505 {L"varepsilon", 8712},
506 {L"varphi", 981},
507 {L"varpi", 982},
508 {L"varrho", 1009},
509 {L"vdots", 8942},
510 {L"vsigma", 962},
511 {L"vtheta", 977},
512 {L"weierp", 8472},
513 {L"xi", 958},
514 {L"yacute", 253},
515 {L"yen", 165},
516 {L"yuml", 255},
517 {L"zeta", 950},
518 {L"zwj", 8205},
519 {L"zwnj", 8204}
520 };
521 #define NCHARTAB (sizeof(chartab)/sizeof(chartab[0]))
522
// Characters Winstart..Winend are those that Windows (code page 1252)
// uses interpolated into the Latin1 set.
// They aren't supposed to appear in HTML, but they do....
enum {
	Winstart = 127,
	Winend = 159
};

// winchars[c - Winstart] is the Unicode code point substituted for c.
// DEL and the positions code page 1252 leaves unassigned (129, 141,
// 143, 144, 157) become a bullet, 8226.  0x80, 0x8E and 0x9E carry
// their code page 1252 meanings: euro sign, Z caron and z caron.
static int winchars[]= {
	8226,						// 127
	8364, 8226, 8218,  402, 8222, 8230, 8224, 8225,	// 128-135
	 710, 8240,  352, 8249,  338, 8226,  381, 8226,	// 136-143
	8226, 8216, 8217, 8220, 8221, 8226, 8211, 8212,	// 144-151
	 732, 8482,  353, 8250,  339, 8226,  382,  376	// 152-159
};
536
537 static StringInt* tagtable; // initialized from tagnames
538 static StringInt* attrtable; // initialized from attrnames
539
540 static void lexinit(void);
541 static int getplaindata(TokenSource* ts, Token* a, int* pai);
542 static int getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai);
543 static int getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai, int findtag);
544 static int gettag(TokenSource* ts, int starti, Token* a, int* pai);
545 static Rune* buftostr(Rune* s, Rune* buf, int j);
546 static int comment(TokenSource* ts);
547 static int findstr(TokenSource* ts, Rune* s);
548 static int ampersand(TokenSource* ts);
549 static int lowerc(int c);
550 static int getchar(TokenSource* ts);
551 static void ungetchar(TokenSource* ts, int c);
552 static void backup(TokenSource* ts, int savei);
553 static void freeinsidetoken(Token* t);
554 static void freeattrs(Attr* ahead);
555 static Attr* newattr(int attid, Rune* value, Attr* link);
556 static int Tconv(Fmt* f);
557
558 int dbglex = 0;
559 static int lexinited = 0;
560
561 static void
lexinit(void)562 lexinit(void)
563 {
564 tagtable = _makestrinttab(tagnames, Numtags);
565 attrtable = _makestrinttab(attrnames, Numattrs);
566 fmtinstall('T', Tconv);
567 lexinited = 1;
568 }
569
570 static TokenSource*
newtokensource(uchar * data,int edata,int chset,int mtype)571 newtokensource(uchar* data, int edata, int chset, int mtype)
572 {
573 TokenSource* ans;
574
575 assert(chset == US_Ascii || chset == ISO_8859_1 ||
576 chset == UTF_8 || chset == Unicode);
577 ans = (TokenSource*)emalloc(sizeof(TokenSource));
578 ans->i = 0;
579 ans->data = data;
580 ans->edata = edata;
581 ans->chset = chset;
582 ans->mtype = mtype;
583 return ans;
584 }
585
586 enum {
587 ToksChunk = 500,
588 };
589
590 // Call this to get the tokens.
591 // The number of returned tokens is returned in *plen.
592 Token*
_gettoks(uchar * data,int datalen,int chset,int mtype,int * plen)593 _gettoks(uchar* data, int datalen, int chset, int mtype, int* plen)
594 {
595 TokenSource* ts;
596 Token* a;
597 int alen;
598 int ai;
599 int starti;
600 int c;
601 int tag;
602
603 if(!lexinited)
604 lexinit();
605 ts = newtokensource(data, datalen, chset, mtype);
606 if(dbglex)
607 fprint(2, "_gettoks starts, ts.i=%d, ts.edata=%d\n", ts->i, ts->edata);
608 alen = 0;
609 ai = 0;
610 a = 0;
611 if(ts->mtype == TextHtml) {
612 for(;;) {
613 if(alen - ai < ToksChunk/32) {
614 alen += ToksChunk;
615 a = erealloc(a, alen*sizeof *a);
616 }
617 starti = ts->i;
618 c = getchar(ts);
619 if(c < 0)
620 break;
621 if(c == '<') {
622 tag = gettag(ts, starti, a, &ai);
623 if(tag == Tscript || tag == Tstyle) {
624 // special rules for getting Data after....
625 starti = ts->i;
626 c = getchar(ts);
627 tag = getscriptdata(ts, c, starti, a, &ai, tag);
628 }
629 }
630 else
631 tag = getdata(ts, c, starti, a, &ai);
632 if(tag == -1)
633 break;
634 else if(dbglex > 1 && tag != Comment)
635 fprint(2, "lex: got token %T\n", &a[ai-1]);
636 }
637 }
638 else {
639 // plain text (non-html) tokens
640 for(;;) {
641 if(alen - ai < ToksChunk/32) {
642 alen += ToksChunk;
643 a = erealloc(a, alen*sizeof *a);
644 }
645 tag = getplaindata(ts, a, &ai);
646 if(tag == -1)
647 break;
648 if(dbglex > 1)
649 fprint(2, "lex: got token %T\n", &a[ai]);
650 }
651 }
652 free(ts);
653 if(dbglex)
654 fprint(2, "lex: returning %d tokens\n", ai);
655 *plen = ai;
656 if(ai == 0){
657 free(a);
658 a = 0;
659 }
660 return a;
661 }
662
663 // For case where source isn't HTML.
664 // Just make data tokens, one per line (or partial line,
665 // at end of buffer), ignoring non-whitespace control
666 // characters and dumping \r's.
667 // If find non-empty token, fill in a[*pai], bump *pai, and return Data.
668 // Otherwise return -1;
669 static int
getplaindata(TokenSource * ts,Token * a,int * pai)670 getplaindata(TokenSource* ts, Token* a, int* pai)
671 {
672 Rune* s;
673 int j;
674 int starti;
675 int c;
676 Token* tok;
677 Rune buf[BIGBUFSIZE];
678
679 s = nil;
680 j = 0;
681 starti = ts->i;
682 for(c = getchar(ts); c >= 0; c = getchar(ts)) {
683 if(c < ' ') {
684 if(isspace(c)) {
685 if(c == '\r') {
686 // ignore it unless no following '\n',
687 // in which case treat it like '\n'
688 c = getchar(ts);
689 if(c != '\n') {
690 if(c >= 0)
691 ungetchar(ts, c);
692 c = '\n';
693 }
694 }
695 }
696 else
697 c = 0;
698 }
699 if(c != 0) {
700 buf[j++] = c;
701 if(j == nelem(buf)-1) {
702 s = buftostr(s, buf, j);
703 j = 0;
704 }
705 }
706 if(c == '\n')
707 break;
708 }
709 s = buftostr(s, buf, j);
710 if(s == nil)
711 return -1;
712 tok = &a[(*pai)++];
713 tok->tag = Data;
714 tok->text = s;
715 tok->attr = nil;
716 tok->starti = starti;
717 return Data;
718 }
719
720 // Return concatenation of s and buf[0:j]
721 static Rune*
buftostr(Rune * s,Rune * buf,int j)722 buftostr(Rune* s, Rune* buf, int j)
723 {
724 int i;
725
726 if(s == nil)
727 s = _Strndup(buf, j);
728 else {
729 i = _Strlen(s);
730 s = realloc(s, ( i+j+1)*sizeof *s);
731 memcpy(&s[i], buf, j*sizeof *s);
732 s[i+j] = 0;
733 }
734 return s;
735 }
736
737 // Gather data up to next start-of-tag or end-of-buffer.
738 // Translate entity references (&).
739 // Ignore non-whitespace control characters and get rid of \r's.
740 // If find non-empty token, fill in a[*pai], bump *pai, and return Data.
741 // Otherwise return -1;
742 static int
getdata(TokenSource * ts,int firstc,int starti,Token * a,int * pai)743 getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai)
744 {
745 Rune* s;
746 int j;
747 int c;
748 Token* tok;
749 Rune buf[SMALLBUFSIZE];
750
751 s = nil;
752 j = 0;
753 for(c = firstc; c >= 0; c = getchar(ts)){
754 if(c == '&') {
755 c = ampersand(ts);
756 if(c < 0)
757 break;
758 }
759 else if(c < ' ') {
760 if(isspace(c)) {
761 if(c == '\r') {
762 // ignore it unless no following '\n',
763 // in which case treat it like '\n'
764 c = getchar(ts);
765 if(c != '\n') {
766 if(c >= 0)
767 ungetchar(ts, c);
768 c = '\n';
769 }
770 }
771 }
772 else {
773 if(warn)
774 fprint(2, "warning: non-whitespace control character %d ignored\n", c);
775 c = 0;
776 }
777 }
778 else if(c == '<') {
779 ungetchar(ts, c);
780 break;
781 }
782 if(c != 0) {
783 buf[j++] = c;
784 if(j == nelem(buf)-1) {
785 s = buftostr(s, buf, j);
786 j = 0;
787 }
788 }
789 }
790 s = buftostr(s, buf, j);
791 if(s == nil)
792 return -1;
793 tok = &a[(*pai)++];
794 tok->tag = Data;
795 tok->text = s;
796 tok->attr = nil;
797 tok->starti = starti;
798 return Data;
799 }
800
801 // The rules for lexing scripts are different (ugh).
802 // Gather up everything until see an "</" tagnames[tok] ">"
803 static int
getscriptdata(TokenSource * ts,int firstc,int starti,Token * a,int * pai,int findtag)804 getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai, int findtag)
805 {
806 Rune* s;
807 int j;
808 int tstarti;
809 int savei;
810 int c;
811 int tag;
812 int done;
813 Token* tok;
814 Rune buf[BIGBUFSIZE];
815
816 s = nil;
817 j = 0;
818 tstarti = starti;
819 c = firstc;
820 done = 0;
821 while(c >= 0) {
822 if(c == '<') {
823 // other browsers ignore stuff to end of line after <!
824 savei = ts->i;
825 c = getchar(ts);
826 if(c == '!') {
827 if(comment(ts) == -1)
828 break;
829 if(c == '\r')
830 c = getchar(ts);
831 if(c == '\n')
832 c = getchar(ts);
833 }
834 else if(c >= 0) {
835 backup(ts, savei);
836 tag = gettag(ts, tstarti, a, pai);
837 if(tag == -1)
838 break;
839 if(tag != Comment)
840 (*pai)--;
841 backup(ts, tstarti);
842 if(tag == findtag + RBRA) {
843 done = 1;
844 break;
845 }
846 // here tag was not the one we were looking for, so take as regular data
847 c = getchar(ts);
848 }
849 }
850 if(c < 0)
851 break;
852 if(c != 0) {
853 buf[j++] = c;
854 if(j == nelem(buf)-1) {
855 s = buftostr(s, buf, j);
856 j = 0;
857 }
858 }
859 tstarti = ts->i;
860 c = getchar(ts);
861 }
862 if(done || ts->i == ts->edata) {
863 s = buftostr(s, buf, j);
864 tok = &a[(*pai)++];
865 tok->tag = Data;
866 tok->text = s;
867 tok->attr = nil;
868 tok->starti = starti;
869 return Data;
870 }
871 free(s);
872 backup(ts, starti);
873 return -1;
874 }
875
876 // We've just seen a '<'. Gather up stuff to closing '>' (if buffer
877 // ends before then, return -1).
878 // If it's a tag, look up the name, gather the attributes, and return
879 // the appropriate token.
880 // Else it's either just plain data or some kind of ignorable stuff:
881 // return Data or Comment as appropriate.
882 // If it's not a Comment, put it in a[*pai] and bump *pai.
883 static int
gettag(TokenSource * ts,int starti,Token * a,int * pai)884 gettag(TokenSource* ts, int starti, Token* a, int* pai)
885 {
886 int rbra;
887 int ans;
888 Attr* al;
889 int nexti;
890 int c;
891 int ti;
892 int afnd;
893 int attid;
894 int quote;
895 Rune* val;
896 int nv;
897 int i;
898 int tag;
899 Token* tok;
900 Rune buf[BIGBUFSIZE];
901
902 rbra = 0;
903 nexti = ts->i;
904 tok = &a[*pai];
905 tok->tag = Notfound;
906 tok->text = nil;
907 tok->attr = nil;
908 tok->starti = starti;
909 c = getchar(ts);
910 if(c == '/') {
911 rbra = RBRA;
912 c = getchar(ts);
913 }
914 if(c < 0)
915 goto eob_done;
916 if(c >= 256 || !isalpha(c)) {
917 // not a tag
918 if(c == '!') {
919 ans = comment(ts);
920 if(ans != -1)
921 return ans;
922 goto eob_done;
923 }
924 else {
925 backup(ts, nexti);
926 tok->tag = Data;
927 tok->text = _Strdup(L"<");
928 (*pai)++;
929 return Data;
930 }
931 }
932 // c starts a tagname
933 buf[0] = c;
934 i = 1;
935 while(1) {
936 c = getchar(ts);
937 if(c < 0)
938 goto eob_done;
939 if(!ISNAMCHAR(c))
940 break;
941 // if name is bigger than buf it won't be found anyway...
942 if(i < BIGBUFSIZE)
943 buf[i++] = c;
944 }
945 if(_lookup(tagtable, Numtags, buf, i, &tag))
946 tok->tag = tag + rbra;
947 else
948 tok->text = _Strndup(buf, i); // for warning print, in build
949 // attribute gathering loop
950 al = nil;
951 while(1) {
952 // look for "ws name" or "ws name ws = ws val" (ws=whitespace)
953 // skip whitespace
954 attrloop_continue:
955 while(c < 256 && isspace(c)) {
956 c = getchar(ts);
957 if(c < 0)
958 goto eob_done;
959 }
960 if(c == '>')
961 goto attrloop_done;
962 if(c == '<') {
963 if(warn)
964 fprint(2, "warning: unclosed tag\n");
965 ungetchar(ts, c);
966 goto attrloop_done;
967 }
968 if(c >= 256 || !isalpha(c)) {
969 if(warn)
970 fprint(2, "warning: expected attribute name\n");
971 // skipt to next attribute name
972 while(1) {
973 c = getchar(ts);
974 if(c < 0)
975 goto eob_done;
976 if(c < 256 && isalpha(c))
977 goto attrloop_continue;
978 if(c == '<') {
979 if(warn)
980 fprint(2, "warning: unclosed tag\n");
981 ungetchar(ts, 60);
982 goto attrloop_done;
983 }
984 if(c == '>')
985 goto attrloop_done;
986 }
987 }
988 // gather attribute name
989 buf[0] = c;
990 i = 1;
991 while(1) {
992 c = getchar(ts);
993 if(c < 0)
994 goto eob_done;
995 if(!ISNAMCHAR(c))
996 break;
997 if(i < BIGBUFSIZE-1)
998 buf[i++] = c;
999 }
1000 afnd = _lookup(attrtable, Numattrs, buf, i, &attid);
1001 if(warn && !afnd) {
1002 buf[i] = 0;
1003 fprint(2, "warning: unknown attribute name %S\n", buf);
1004 }
1005 // skip whitespace
1006 while(c < 256 && isspace(c)) {
1007 c = getchar(ts);
1008 if(c < 0)
1009 goto eob_done;
1010 }
1011 if(c != '=') {
1012 if(afnd)
1013 al = newattr(attid, nil, al);
1014 goto attrloop_continue;
1015 }
1016 //# c is '=' here; skip whitespace
1017 while(1) {
1018 c = getchar(ts);
1019 if(c < 0)
1020 goto eob_done;
1021 if(c >= 256 || !isspace(c))
1022 break;
1023 }
1024 quote = 0;
1025 if(c == '\'' || c == '"') {
1026 quote = c;
1027 c = getchar(ts);
1028 if(c < 0)
1029 goto eob_done;
1030 }
1031 val = nil;
1032 nv = 0;
1033 while(1) {
1034 valloop_continue:
1035 if(c < 0)
1036 goto eob_done;
1037 if(c == '>') {
1038 if(quote) {
1039 // c might be part of string (though not good style)
1040 // but if line ends before close quote, assume
1041 // there was an unmatched quote
1042 ti = ts->i;
1043 while(1) {
1044 c = getchar(ts);
1045 if(c < 0)
1046 goto eob_done;
1047 if(c == quote) {
1048 backup(ts, ti);
1049 buf[nv++] = '>';
1050 if(nv == BIGBUFSIZE-1) {
1051 val = buftostr(val, buf, nv);
1052 nv = 0;
1053 }
1054 c = getchar(ts);
1055 goto valloop_continue;
1056 }
1057 if(c == '\n') {
1058 if(warn)
1059 fprint(2, "warning: apparent unmatched quote\n");
1060 backup(ts, ti);
1061 c = '>';
1062 goto valloop_done;
1063 }
1064 }
1065 }
1066 else
1067 goto valloop_done;
1068 }
1069 if(quote) {
1070 if(c == quote) {
1071 c = getchar(ts);
1072 if(c < 0)
1073 goto eob_done;
1074 goto valloop_done;
1075 }
1076 if(c == '\r') {
1077 c = getchar(ts);
1078 goto valloop_continue;
1079 }
1080 if(c == '\t' || c == '\n')
1081 c = ' ';
1082 }
1083 else {
1084 if(c < 256 && isspace(c))
1085 goto valloop_done;
1086 }
1087 if(c == '&') {
1088 c = ampersand(ts);
1089 if(c == -1)
1090 goto eob_done;
1091 }
1092 buf[nv++] = c;
1093 if(nv == BIGBUFSIZE-1) {
1094 val = buftostr(val, buf, nv);
1095 nv = 0;
1096 }
1097 c = getchar(ts);
1098 }
1099 valloop_done:
1100 if(afnd) {
1101 val = buftostr(val, buf, nv);
1102 al = newattr(attid, val, al);
1103 }
1104 }
1105
1106 attrloop_done:
1107 tok->attr = al;
1108 (*pai)++;
1109 return tok->tag;
1110
1111 eob_done:
1112 if(warn)
1113 fprint(2, "warning: incomplete tag at end of page\n");
1114 backup(ts, nexti);
1115 tok->tag = Data;
1116 tok->text = _Strdup(L"<");
1117 return Data;
1118 }
1119
1120 // We've just read a '<!' at position starti,
1121 // so this may be a comment or other ignored section, or it may
1122 // be just a literal string if there is no close before end of file
1123 // (other browsers do that).
1124 // The accepted practice seems to be (note: contrary to SGML spec!):
1125 // If see <!--, look for --> to close, or if none, > to close.
1126 // If see <!(not --), look for > to close.
1127 // If no close before end of file, leave original characters in as literal data.
1128 //
1129 // If we see ignorable stuff, return Comment.
1130 // Else return nil (caller should back up and try again when more data arrives,
1131 // unless at end of file, in which case caller should just make '<' a data token).
1132 static int
comment(TokenSource * ts)1133 comment(TokenSource* ts)
1134 {
1135 int nexti;
1136 int havecomment;
1137 int c;
1138
1139 nexti = ts->i;
1140 havecomment = 0;
1141 c = getchar(ts);
1142 if(c == '-') {
1143 c = getchar(ts);
1144 if(c == '-') {
1145 if(findstr(ts, L"-->"))
1146 havecomment = 1;
1147 else
1148 backup(ts, nexti);
1149 }
1150 }
1151 if(!havecomment) {
1152 if(c == '>')
1153 havecomment = 1;
1154 else if(c >= 0) {
1155 if(findstr(ts, L">"))
1156 havecomment = 1;
1157 }
1158 }
1159 if(havecomment)
1160 return Comment;
1161 return -1;
1162 }
1163
1164 // Look for string s in token source.
1165 // If found, return 1, with buffer at next char after s,
1166 // else return 0 (caller should back up).
1167 static int
findstr(TokenSource * ts,Rune * s)1168 findstr(TokenSource* ts, Rune* s)
1169 {
1170 int c0;
1171 int n;
1172 int nexti;
1173 int i;
1174 int c;
1175
1176 c0 = s[0];
1177 n = runestrlen(s);
1178 while(1) {
1179 c = getchar(ts);
1180 if(c < 0)
1181 break;
1182 if(c == c0) {
1183 if(n == 1)
1184 return 1;
1185 nexti = ts->i;
1186 for(i = 1; i < n; i++) {
1187 c = getchar(ts);
1188 if(c < 0)
1189 goto mainloop_done;
1190 if(c != s[i])
1191 break;
1192 }
1193 if(i == n)
1194 return 1;
1195 backup(ts, nexti);
1196 }
1197 }
1198 mainloop_done:
1199 return 0;
1200 }
1201
1202 // We've just read an '&'; look for an entity reference
1203 // name, and if found, return translated char.
1204 // if there is a complete entity name but it isn't known,
1205 // back up to just past the '&' and return '&'.
1206 // If the entity can't be completed in the current buffer, back up
1207 // to the '&' and return -1.
1208 static int
ampersand(TokenSource * ts)1209 ampersand(TokenSource* ts)
1210 {
1211 int savei;
1212 int c;
1213 int fnd;
1214 int ans;
1215 int v;
1216 int k;
1217 Rune buf[25];
1218
1219 savei = ts->i;
1220 c = getchar(ts);
1221 fnd = 0;
1222 ans = -1;
1223 if(c == '#') {
1224 c = getchar(ts);
1225 v = 0;
1226 if(c == 'X' || c == 'x')
1227 for(c = getchar(ts); c < 256; c = getchar(ts))
1228 if(c >= '0' && c <= '9')
1229 v = v*16+c-'0';
1230 else if(c >= 'A' && c<= 'F')
1231 v = v*16+c-'A'+10;
1232 else if(c >= 'a' && c <= 'f')
1233 v = v*16+c-'a'+10;
1234 else
1235 break;
1236 else
1237 while(c >= 0) {
1238 if(!(c < 256 && isdigit(c)))
1239 break;
1240 v = v*10 + c - 48;
1241 c = getchar(ts);
1242 }
1243 if(c >= 0) {
1244 if(!(c == ';' || c == '\n' || c == '\r'))
1245 ungetchar(ts, c);
1246 c = v;
1247 if(c == 160)
1248 c = 160;
1249 if(c >= Winstart && c <= Winend) {
1250 c = winchars[c - Winstart];
1251 }
1252 ans = c;
1253 fnd = 1;
1254 }
1255 }
1256 else if(c < 256 && isalpha(c)) {
1257 buf[0] = c;
1258 k = 1;
1259 while(1) {
1260 c = getchar(ts);
1261 if(c < 0)
1262 break;
1263 if(c < 256 && (isalpha(c) || isdigit(c))) {
1264 if(k < nelem(buf)-1)
1265 buf[k++] = c;
1266 }
1267 else {
1268 if(!(c == ';' || c == '\n' || c == '\r'))
1269 ungetchar(ts, c);
1270 break;
1271 }
1272 }
1273 if(c >= 256 || c != '=' && !(isalpha(c) || isdigit(c)))
1274 fnd = _lookup(chartab, NCHARTAB, buf, k, &ans);
1275 }
1276 if(!fnd) {
1277 backup(ts, savei);
1278 ans = '&';
1279 }
1280 return ans;
1281 }
1282
1283 // Get next char, obeying ts.chset.
1284 // Returns -1 if no complete character left before current end of data.
1285 static int
getchar(TokenSource * ts)1286 getchar(TokenSource* ts)
1287 {
1288 uchar* buf;
1289 int c;
1290 int n;
1291 int ok;
1292 Rune r;
1293
1294 if(ts->i >= ts->edata)
1295 return -1;
1296 buf = ts->data;
1297 c = buf[ts->i];
1298 switch(ts->chset) {
1299 case ISO_8859_1:
1300 if(c >= Winstart && c <= Winend)
1301 c = winchars[c - Winstart];
1302 ts->i++;
1303 break;
1304 case US_Ascii:
1305 if(c > 127) {
1306 if(warn)
1307 fprint(2, "non-ascii char (%x) when US-ASCII specified\n", c);
1308 }
1309 ts->i++;
1310 break;
1311 case UTF_8:
1312 ok = fullrune((char*)(buf+ts->i), ts->edata-ts->i);
1313 n = chartorune(&r, (char*)(buf+ts->i));
1314 if(ok) {
1315 if(warn && c == 0x80)
1316 fprint(2, "warning: invalid utf-8 sequence (starts with %x)\n", ts->data[ts->i]);
1317 ts->i += n;
1318 c = r;
1319 }
1320 else {
1321 // not enough bytes in buf to complete utf-8 char
1322 ts->i = ts->edata; // mark "all used"
1323 c = -1;
1324 }
1325 break;
1326 case Unicode:
1327 if(ts->i < ts->edata - 1) {
1328 //standards say most-significant byte first
1329 c = (c << 8)|(buf[ts->i + 1]);
1330 ts->i += 2;
1331 }
1332 else {
1333 ts->i = ts->edata; // mark "all used"
1334 c = -1;
1335 }
1336 break;
1337 default:
1338 return -1;
1339 }
1340 return c;
1341 }
1342
1343 // Assuming c was the last character returned by getchar, set
1344 // things up so that next getchar will get that same character
1345 // followed by the current 'next character', etc.
1346 static void
ungetchar(TokenSource * ts,int c)1347 ungetchar(TokenSource* ts, int c)
1348 {
1349 int n;
1350 Rune r;
1351 char a[UTFmax];
1352
1353 n = 1;
1354 switch(ts->chset) {
1355 case UTF_8:
1356 if(c >= 128) {
1357 r = c;
1358 n = runetochar(a, &r);
1359 }
1360 break;
1361 case Unicode:
1362 n = 2;
1363 break;
1364 }
1365 ts->i -= n;
1366 }
1367
1368 // Restore ts so that it is at the state where the index was savei.
1369 static void
backup(TokenSource * ts,int savei)1370 backup(TokenSource* ts, int savei)
1371 {
1372 if(dbglex)
1373 fprint(2, "lex: backup; i=%d, savei=%d\n", ts->i, savei);
1374 ts->i = savei;
1375 }
1376
1377
1378 // Look for value associated with attribute attid in token t.
1379 // If there is one, return 1 and put the value in *pans,
1380 // else return 0.
1381 // If xfer is true, transfer ownership of the string to the caller
1382 // (nil it out here); otherwise, caller must duplicate the answer
1383 // if it needs to save it.
1384 // OK to have pans==0, in which case this is just looking
1385 // to see if token is present.
1386 int
_tokaval(Token * t,int attid,Rune ** pans,int xfer)1387 _tokaval(Token* t, int attid, Rune** pans, int xfer)
1388 {
1389 Attr* attr;
1390
1391 attr = t->attr;
1392 while(attr != nil) {
1393 if(attr->attid == attid) {
1394 if(pans != nil)
1395 *pans = attr->value;
1396 if(xfer)
1397 attr->value = nil;
1398 return 1;
1399 }
1400 attr = attr->next;
1401 }
1402 if(pans != nil)
1403 *pans = nil;
1404 return 0;
1405 }
1406
1407 static int
Tconv(Fmt * f)1408 Tconv(Fmt *f)
1409 {
1410 Token* t;
1411 int i;
1412 int tag;
1413 char* srbra;
1414 Rune* aname;
1415 Rune* tname;
1416 Attr* a;
1417 char buf[BIGBUFSIZE];
1418
1419 t = va_arg(f->args, Token*);
1420 if(t == nil)
1421 sprint(buf, "<null>");
1422 else {
1423 i = 0;
1424 if(dbglex > 1)
1425 i = snprint(buf, sizeof(buf), "[%d]", t->starti);
1426 tag = t->tag;
1427 if(tag == Data) {
1428 i += snprint(buf+i, sizeof(buf)-i-1, "'%S'", t->text);
1429 }
1430 else {
1431 srbra = "";
1432 if(tag >= RBRA) {
1433 tag -= RBRA;
1434 srbra = "/";
1435 }
1436 tname = tagnames[tag];
1437 if(tag == Notfound)
1438 tname = L"?";
1439 i += snprint(buf+i, sizeof(buf)-i-1, "<%s%S", srbra, tname);
1440 for(a = t->attr; a != nil; a = a->next) {
1441 aname = attrnames[a->attid];
1442 i += snprint(buf+i, sizeof(buf)-i-1, " %S", aname);
1443 if(a->value != nil)
1444 i += snprint(buf+i, sizeof(buf)-i-1, "=%S", a->value);
1445 }
1446 i += snprint(buf+i, sizeof(buf)-i-1, ">");
1447 }
1448 buf[i] = 0;
1449 }
1450 return fmtstrcpy(f, buf);
1451 }
1452
1453 // Attrs own their constituent strings, but build may eventually
1454 // transfer some values to its items and nil them out in the Attr.
1455 static Attr*
newattr(int attid,Rune * value,Attr * link)1456 newattr(int attid, Rune* value, Attr* link)
1457 {
1458 Attr* ans;
1459
1460 ans = (Attr*)emalloc(sizeof(Attr));
1461 ans->attid = attid;
1462 ans->value = value;
1463 ans->next = link;
1464 return ans;
1465 }
1466
1467 // Free list of Attrs linked through next field
1468 static void
freeattrs(Attr * ahead)1469 freeattrs(Attr* ahead)
1470 {
1471 Attr* a;
1472 Attr* nexta;
1473
1474 a = ahead;
1475 while(a != nil) {
1476 nexta = a->next;
1477 free(a->value);
1478 free(a);
1479 a = nexta;
1480 }
1481 }
1482
1483 // Free array of Tokens.
1484 // Allocated space might have room for more than n tokens,
1485 // but only n of them are initialized.
1486 // If caller has transferred ownership of constitutent strings
1487 // or attributes, it must have nil'd out the pointers in the Tokens.
1488 void
_freetokens(Token * tarray,int n)1489 _freetokens(Token* tarray, int n)
1490 {
1491 int i;
1492 Token* t;
1493
1494 if(tarray == nil)
1495 return;
1496 for(i = 0; i < n; i++) {
1497 t = &tarray[i];
1498 free(t->text);
1499 freeattrs(t->attr);
1500 }
1501 free(tarray);
1502 }
1503