1 #include <u.h> 2 #include <libc.h> 3 #include <draw.h> 4 #include <ctype.h> 5 #include <html.h> 6 #include "impl.h" 7 8 typedef struct TokenSource TokenSource; 9 struct TokenSource 10 { 11 int i; // index of next byte to use 12 uchar* data; // all the data 13 int edata; // data[0:edata] is valid 14 int chset; // one of US_Ascii, etc. 15 int mtype; // TextHtml or TextPlain 16 }; 17 18 enum { 19 EOF = -2, 20 EOB = -1 21 }; 22 23 #define ISNAMCHAR(c) ((c)<256 && (isalpha(c) || isdigit(c) || (c) == '-' || (c) == '.')) 24 25 #define SMALLBUFSIZE 240 26 #define BIGBUFSIZE 2000 27 28 // HTML 4.0 tag names. 29 // Keep sorted, and in correspondence with enum in iparse.h. 30 Rune* tagnames[] = { 31 L" ", 32 L"!", 33 L"a", 34 L"abbr", 35 L"acronym", 36 L"address", 37 L"applet", 38 L"area", 39 L"b", 40 L"base", 41 L"basefont", 42 L"bdo", 43 L"big", 44 L"blink", 45 L"blockquote", 46 L"body", 47 L"bq", 48 L"br", 49 L"button", 50 L"caption", 51 L"center", 52 L"cite", 53 L"code", 54 L"col", 55 L"colgroup", 56 L"dd", 57 L"del", 58 L"dfn", 59 L"dir", 60 L"div", 61 L"dl", 62 L"dt", 63 L"em", 64 L"fieldset", 65 L"font", 66 L"form", 67 L"frame", 68 L"frameset", 69 L"h1", 70 L"h2", 71 L"h3", 72 L"h4", 73 L"h5", 74 L"h6", 75 L"head", 76 L"hr", 77 L"html", 78 L"i", 79 L"iframe", 80 L"img", 81 L"input", 82 L"ins", 83 L"isindex", 84 L"kbd", 85 L"label", 86 L"legend", 87 L"li", 88 L"link", 89 L"map", 90 L"menu", 91 L"meta", 92 L"nobr", 93 L"noframes", 94 L"noscript", 95 L"object", 96 L"ol", 97 L"optgroup", 98 L"option", 99 L"p", 100 L"param", 101 L"pre", 102 L"q", 103 L"s", 104 L"samp", 105 L"script", 106 L"select", 107 L"small", 108 L"span", 109 L"strike", 110 L"strong", 111 L"style", 112 L"sub", 113 L"sup", 114 L"table", 115 L"tbody", 116 L"td", 117 L"textarea", 118 L"tfoot", 119 L"th", 120 L"thead", 121 L"title", 122 L"tr", 123 L"tt", 124 L"u", 125 L"ul", 126 L"var" 127 }; 128 129 // HTML 4.0 attribute names. 130 // Keep sorted, and in correspondence with enum in i.h. 131 Rune* attrnames[] = { 132 L"abbr", 133 L"accept-charset", 134 L"access-key", 135 L"action", 136 L"align", 137 L"alink", 138 L"alt", 139 L"archive", 140 L"axis", 141 L"background", 142 L"bgcolor", 143 L"border", 144 L"cellpadding", 145 L"cellspacing", 146 L"char", 147 L"charoff", 148 L"charset", 149 L"checked", 150 L"cite", 151 L"class", 152 L"classid", 153 L"clear", 154 L"code", 155 L"codebase", 156 L"codetype", 157 L"color", 158 L"cols", 159 L"colspan", 160 L"compact", 161 L"content", 162 L"coords", 163 L"data", 164 L"datetime", 165 L"declare", 166 L"defer", 167 L"dir", 168 L"disabled", 169 L"enctype", 170 L"face", 171 L"for", 172 L"frame", 173 L"frameborder", 174 L"headers", 175 L"height", 176 L"href", 177 L"hreflang", 178 L"hspace", 179 L"http-equiv", 180 L"id", 181 L"ismap", 182 L"label", 183 L"lang", 184 L"link", 185 L"longdesc", 186 L"marginheight", 187 L"marginwidth", 188 L"maxlength", 189 L"media", 190 L"method", 191 L"multiple", 192 L"name", 193 L"nohref", 194 L"noresize", 195 L"noshade", 196 L"nowrap", 197 L"object", 198 L"onblur", 199 L"onchange", 200 L"onclick", 201 L"ondblclick", 202 L"onfocus", 203 L"onkeypress", 204 L"onkeyup", 205 L"onload", 206 L"onmousedown", 207 L"onmousemove", 208 L"onmouseout", 209 L"onmouseover", 210 L"onmouseup", 211 L"onreset", 212 L"onselect", 213 L"onsubmit", 214 L"onunload", 215 L"profile", 216 L"prompt", 217 L"readonly", 218 L"rel", 219 L"rev", 220 L"rows", 221 L"rowspan", 222 L"rules", 223 L"scheme", 224 L"scope", 225 L"scrolling", 226 L"selected", 227 L"shape", 228 L"size", 229 L"span", 230 L"src", 231 L"standby", 232 L"start", 233 L"style", 234 L"summary", 235 L"tabindex", 236 L"target", 237 L"text", 238 L"title", 239 L"type", 240 L"usemap", 241 L"valign", 242 L"value", 243 L"valuetype", 244 L"version", 245 L"vlink", 246 L"vspace", 247 L"width" 248 }; 249 250 251 // Character entity to unicode character number map. 252 // Keep sorted by name. 253 StringInt chartab[]= { 254 {L"AElig", 198}, 255 {L"Aacute", 193}, 256 {L"Acirc", 194}, 257 {L"Agrave", 192}, 258 {L"Alpha", 913}, 259 {L"Aring", 197}, 260 {L"Atilde", 195}, 261 {L"Auml", 196}, 262 {L"Beta", 914}, 263 {L"Ccedil", 199}, 264 {L"Chi", 935}, 265 {L"Dagger", 8225}, 266 {L"Delta", 916}, 267 {L"ETH", 208}, 268 {L"Eacute", 201}, 269 {L"Ecirc", 202}, 270 {L"Egrave", 200}, 271 {L"Epsilon", 917}, 272 {L"Eta", 919}, 273 {L"Euml", 203}, 274 {L"Gamma", 915}, 275 {L"Iacute", 205}, 276 {L"Icirc", 206}, 277 {L"Igrave", 204}, 278 {L"Iota", 921}, 279 {L"Iuml", 207}, 280 {L"Kappa", 922}, 281 {L"Lambda", 923}, 282 {L"Mu", 924}, 283 {L"Ntilde", 209}, 284 {L"Nu", 925}, 285 {L"OElig", 338}, 286 {L"Oacute", 211}, 287 {L"Ocirc", 212}, 288 {L"Ograve", 210}, 289 {L"Omega", 937}, 290 {L"Omicron", 927}, 291 {L"Oslash", 216}, 292 {L"Otilde", 213}, 293 {L"Ouml", 214}, 294 {L"Phi", 934}, 295 {L"Pi", 928}, 296 {L"Prime", 8243}, 297 {L"Psi", 936}, 298 {L"Rho", 929}, 299 {L"Scaron", 352}, 300 {L"Sigma", 931}, 301 {L"THORN", 222}, 302 {L"Tau", 932}, 303 {L"Theta", 920}, 304 {L"Uacute", 218}, 305 {L"Ucirc", 219}, 306 {L"Ugrave", 217}, 307 {L"Upsilon", 933}, 308 {L"Uuml", 220}, 309 {L"Xi", 926}, 310 {L"Yacute", 221}, 311 {L"Yuml", 376}, 312 {L"Zeta", 918}, 313 {L"aacute", 225}, 314 {L"acirc", 226}, 315 {L"acute", 180}, 316 {L"aelig", 230}, 317 {L"agrave", 224}, 318 {L"alefsym", 8501}, 319 {L"alpha", 945}, 320 {L"amp", 38}, 321 {L"and", 8743}, 322 {L"ang", 8736}, 323 {L"aring", 229}, 324 {L"asymp", 8776}, 325 {L"atilde", 227}, 326 {L"auml", 228}, 327 {L"bdquo", 8222}, 328 {L"beta", 946}, 329 {L"brvbar", 166}, 330 {L"bull", 8226}, 331 {L"cap", 8745}, 332 {L"ccedil", 231}, 333 {L"cdots", 8943}, 334 {L"cedil", 184}, 335 {L"cent", 162}, 336 {L"chi", 967}, 337 {L"circ", 710}, 338 {L"clubs", 9827}, 339 {L"cong", 8773}, 340 {L"copy", 169}, 341 {L"crarr", 8629}, 342 {L"cup", 8746}, 343 {L"curren", 164}, 344 {L"dArr", 8659}, 345 {L"dagger", 8224}, 346 {L"darr", 8595}, 347 {L"ddots", 8945}, 348 {L"deg", 176}, 349 {L"delta", 948}, 350 {L"diams", 9830}, 351 {L"divide", 247}, 352 {L"eacute", 233}, 353 {L"ecirc", 234}, 354 {L"egrave", 232}, 355 {L"emdash", 8212}, /* non-standard but commonly used */ 356 {L"empty", 8709}, 357 {L"emsp", 8195}, 358 {L"endash", 8211}, /* non-standard but commonly used */ 359 {L"ensp", 8194}, 360 {L"epsilon", 949}, 361 {L"equiv", 8801}, 362 {L"eta", 951}, 363 {L"eth", 240}, 364 {L"euml", 235}, 365 {L"euro", 8364}, 366 {L"exist", 8707}, 367 {L"fnof", 402}, 368 {L"forall", 8704}, 369 {L"frac12", 189}, 370 {L"frac14", 188}, 371 {L"frac34", 190}, 372 {L"frasl", 8260}, 373 {L"gamma", 947}, 374 {L"ge", 8805}, 375 {L"gt", 62}, 376 {L"hArr", 8660}, 377 {L"harr", 8596}, 378 {L"hearts", 9829}, 379 {L"hellip", 8230}, 380 {L"iacute", 237}, 381 {L"icirc", 238}, 382 {L"iexcl", 161}, 383 {L"igrave", 236}, 384 {L"image", 8465}, 385 {L"infin", 8734}, 386 {L"int", 8747}, 387 {L"iota", 953}, 388 {L"iquest", 191}, 389 {L"isin", 8712}, 390 {L"iuml", 239}, 391 {L"kappa", 954}, 392 {L"lArr", 8656}, 393 {L"lambda", 955}, 394 {L"lang", 9001}, 395 {L"laquo", 171}, 396 {L"larr", 8592}, 397 {L"lceil", 8968}, 398 {L"ldots", 8230}, 399 {L"ldquo", 8220}, 400 {L"le", 8804}, 401 {L"lfloor", 8970}, 402 {L"lowast", 8727}, 403 {L"loz", 9674}, 404 {L"lrm", 8206}, 405 {L"lsaquo", 8249}, 406 {L"lsquo", 8216}, 407 {L"lt", 60}, 408 {L"macr", 175}, 409 {L"mdash", 8212}, 410 {L"micro", 181}, 411 {L"middot", 183}, 412 {L"minus", 8722}, 413 {L"mu", 956}, 414 {L"nabla", 8711}, 415 {L"nbsp", 160}, 416 {L"ndash", 8211}, 417 {L"ne", 8800}, 418 {L"ni", 8715}, 419 {L"not", 172}, 420 {L"notin", 8713}, 421 {L"nsub", 8836}, 422 {L"ntilde", 241}, 423 {L"nu", 957}, 424 {L"oacute", 243}, 425 {L"ocirc", 244}, 426 {L"oelig", 339}, 427 {L"ograve", 242}, 428 {L"oline", 8254}, 429 {L"omega", 969}, 430 {L"omicron", 959}, 431 {L"oplus", 8853}, 432 {L"or", 8744}, 433 {L"ordf", 170}, 434 {L"ordm", 186}, 435 {L"oslash", 248}, 436 {L"otilde", 245}, 437 {L"otimes", 8855}, 438 {L"ouml", 246}, 439 {L"para", 182}, 440 {L"part", 8706}, 441 {L"permil", 8240}, 442 {L"perp", 8869}, 443 {L"phi", 966}, 444 {L"pi", 960}, 445 {L"piv", 982}, 446 {L"plusmn", 177}, 447 {L"pound", 163}, 448 {L"prime", 8242}, 449 {L"prod", 8719}, 450 {L"prop", 8733}, 451 {L"psi", 968}, 452 {L"quad", 8193}, 453 {L"quot", 34}, 454 {L"rArr", 8658}, 455 {L"radic", 8730}, 456 {L"rang", 9002}, 457 {L"raquo", 187}, 458 {L"rarr", 8594}, 459 {L"rceil", 8969}, 460 {L"rdquo", 8221}, 461 {L"real", 8476}, 462 {L"reg", 174}, 463 {L"rfloor", 8971}, 464 {L"rho", 961}, 465 {L"rlm", 8207}, 466 {L"rsaquo", 8250}, 467 {L"rsquo", 8217}, 468 {L"sbquo", 8218}, 469 {L"scaron", 353}, 470 {L"sdot", 8901}, 471 {L"sect", 167}, 472 {L"shy", 173}, 473 {L"sigma", 963}, 474 {L"sigmaf", 962}, 475 {L"sim", 8764}, 476 {L"sp", 8194}, 477 {L"spades", 9824}, 478 {L"sub", 8834}, 479 {L"sube", 8838}, 480 {L"sum", 8721}, 481 {L"sup", 8835}, 482 {L"sup1", 185}, 483 {L"sup2", 178}, 484 {L"sup3", 179}, 485 {L"supe", 8839}, 486 {L"szlig", 223}, 487 {L"tau", 964}, 488 {L"there4", 8756}, 489 {L"theta", 952}, 490 {L"thetasym", 977}, 491 {L"thinsp", 8201}, 492 {L"thorn", 254}, 493 {L"tilde", 732}, 494 {L"times", 215}, 495 {L"trade", 8482}, 496 {L"uArr", 8657}, 497 {L"uacute", 250}, 498 {L"uarr", 8593}, 499 {L"ucirc", 251}, 500 {L"ugrave", 249}, 501 {L"uml", 168}, 502 {L"upsih", 978}, 503 {L"upsilon", 965}, 504 {L"uuml", 252}, 505 {L"varepsilon", 8712}, 506 {L"varphi", 981}, 507 {L"varpi", 982}, 508 {L"varrho", 1009}, 509 {L"vdots", 8942}, 510 {L"vsigma", 962}, 511 {L"vtheta", 977}, 512 {L"weierp", 8472}, 513 {L"xi", 958}, 514 {L"yacute", 253}, 515 {L"yen", 165}, 516 {L"yuml", 255}, 517 {L"zeta", 950}, 518 {L"zwj", 8205}, 519 {L"zwnj", 8204} 520 }; 521 #define NCHARTAB (sizeof(chartab)/sizeof(chartab[0])) 522 523 // Characters Winstart..Winend are those that Windows 524 // uses interpolated into the Latin1 set. 525 // They aren't supposed to appear in HTML, but they do.... 526 enum { 527 Winstart = 127, 528 Winend = 159 529 }; 530 531 static int winchars[]= { 8226, // 8226 is a bullet 532 8226, 8226, 8218, 402, 8222, 8230, 8224, 8225, 533 710, 8240, 352, 8249, 338, 8226, 8226, 8226, 534 8226, 8216, 8217, 8220, 8221, 8226, 8211, 8212, 535 732, 8482, 353, 8250, 339, 8226, 8226, 376}; 536 537 static StringInt* tagtable; // initialized from tagnames 538 static StringInt* attrtable; // initialized from attrnames 539 540 static void lexinit(); 541 static int getplaindata(TokenSource* ts, Token* a, int* pai); 542 static int getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai); 543 static int getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai); 544 static int gettag(TokenSource* ts, int starti, Token* a, int* pai); 545 static Rune* buftostr(Rune* s, Rune* buf, int j); 546 static int comment(TokenSource* ts); 547 static int findstr(TokenSource* ts, Rune* s); 548 static int ampersand(TokenSource* ts); 549 static int lowerc(int c); 550 static int getchar(TokenSource* ts); 551 static void ungetchar(TokenSource* ts, int c); 552 static void backup(TokenSource* ts, int savei); 553 static void freeinsidetoken(Token* t); 554 static void freeattrs(Attr* ahead); 555 static Attr* newattr(int attid, Rune* value, Attr* link); 556 static int Tconv(Fmt* f); 557 558 int dbglex = 0; 559 static int lexinited = 0; 560 561 static void 562 lexinit(void) 563 { 564 tagtable = _makestrinttab(tagnames, Numtags); 565 attrtable = _makestrinttab(attrnames, Numattrs); 566 fmtinstall('T', Tconv); 567 lexinited = 1; 568 } 569 570 static TokenSource* 571 newtokensource(uchar* data, int edata, int chset, int mtype) 572 { 573 TokenSource* ans; 574 575 assert(chset == US_Ascii || chset == ISO_8859_1 || 576 chset == UTF_8 || chset == Unicode); 577 ans = (TokenSource*)emalloc(sizeof(TokenSource)); 578 ans->i = 0; 579 ans->data = data; 580 ans->edata = edata; 581 ans->chset = chset; 582 ans->mtype = mtype; 583 return ans; 584 } 585 586 enum { 587 ToksChunk = 500 588 }; 589 590 // Call this to get the tokens. 591 // The number of returned tokens is returned in *plen. 592 Token* 593 _gettoks(uchar* data, int datalen, int chset, int mtype, int* plen) 594 { 595 TokenSource* ts; 596 Token* a; 597 int alen; 598 int ai; 599 int starti; 600 int c; 601 int tag; 602 603 if(!lexinited) 604 lexinit(); 605 ts = newtokensource(data, datalen, chset, mtype); 606 alen = ToksChunk; 607 a = (Token*)emalloc(alen * sizeof(Token)); 608 ai = 0; 609 if(dbglex) 610 fprint(2, "_gettoks starts, ts.i=%d, ts.edata=%d\n", ts->i, ts->edata); 611 if(ts->mtype == TextHtml) { 612 for(;;) { 613 if(ai == alen) { 614 a = (Token*)erealloc(a, (alen+ToksChunk)*sizeof(Token)); 615 alen += ToksChunk; 616 } 617 starti = ts->i; 618 c = getchar(ts); 619 if(c < 0) 620 break; 621 if(c == '<') { 622 tag = gettag(ts, starti, a, &ai); 623 if(tag == Tscript) { 624 // special rules for getting Data after.... 625 starti = ts->i; 626 c = getchar(ts); 627 tag = getscriptdata(ts, c, starti, a, &ai); 628 } 629 } 630 else 631 tag = getdata(ts, c, starti, a, &ai); 632 if(tag == -1) 633 break; 634 else if(dbglex > 1 && tag != Comment) 635 fprint(2, "lex: got token %T\n", &a[ai-1]); 636 } 637 } 638 else { 639 // plain text (non-html) tokens 640 for(;;) { 641 if(ai == alen) { 642 a = (Token*)erealloc(a, (alen+ToksChunk)*sizeof(Token)); 643 alen += ToksChunk; 644 } 645 tag = getplaindata(ts, a, &ai); 646 if(tag == -1) 647 break; 648 if(dbglex > 1) 649 fprint(2, "lex: got token %T\n", &a[ai]); 650 } 651 } 652 if(dbglex) 653 fprint(2, "lex: returning %d tokens\n", ai); 654 *plen = ai; 655 if(ai == 0) 656 return nil; 657 return a; 658 } 659 660 // For case where source isn't HTML. 661 // Just make data tokens, one per line (or partial line, 662 // at end of buffer), ignoring non-whitespace control 663 // characters and dumping \r's. 664 // If find non-empty token, fill in a[*pai], bump *pai, and return Data. 665 // Otherwise return -1; 666 static int 667 getplaindata(TokenSource* ts, Token* a, int* pai) 668 { 669 Rune* s; 670 int j; 671 int starti; 672 int c; 673 Token* tok; 674 Rune buf[BIGBUFSIZE]; 675 676 s = nil; 677 j = 0; 678 starti = ts->i; 679 for(c = getchar(ts); c >= 0; c = getchar(ts)) { 680 if(c < ' ') { 681 if(isspace(c)) { 682 if(c == '\r') { 683 // ignore it unless no following '\n', 684 // in which case treat it like '\n' 685 c = getchar(ts); 686 if(c != '\n') { 687 if(c >= 0) 688 ungetchar(ts, c); 689 c = '\n'; 690 } 691 } 692 } 693 else 694 c = 0; 695 } 696 if(c != 0) { 697 buf[j++] = c; 698 if(j == sizeof(buf)-1) { 699 s = buftostr(s, buf, j); 700 j = 0; 701 } 702 } 703 if(c == '\n') 704 break; 705 } 706 s = buftostr(s, buf, j); 707 if(s == nil) 708 return -1; 709 tok = &a[(*pai)++]; 710 tok->tag = Data; 711 tok->text = s; 712 tok->attr = nil; 713 tok->starti = starti; 714 return Data; 715 } 716 717 // Return concatenation of s and buf[0:j] 718 static Rune* 719 buftostr(Rune* s, Rune* buf, int j) 720 { 721 buf[j] = 0; 722 if(s == nil) 723 s = _Strndup(buf, j); 724 else 725 s = _Strdup2(s, buf); 726 return s; 727 } 728 729 // Gather data up to next start-of-tag or end-of-buffer. 730 // Translate entity references (&). 731 // Ignore non-whitespace control characters and get rid of \r's. 732 // If find non-empty token, fill in a[*pai], bump *pai, and return Data. 733 // Otherwise return -1; 734 static int 735 getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai) 736 { 737 Rune* s; 738 int j; 739 int c; 740 Token* tok; 741 Rune buf[BIGBUFSIZE]; 742 743 s = nil; 744 j = 0; 745 c = firstc; 746 while(c >= 0) { 747 if(c == '&') { 748 c = ampersand(ts); 749 if(c < 0) 750 break; 751 } 752 else if(c < ' ') { 753 if(isspace(c)) { 754 if(c == '\r') { 755 // ignore it unless no following '\n', 756 // in which case treat it like '\n' 757 c = getchar(ts); 758 if(c != '\n') { 759 if(c >= 0) 760 ungetchar(ts, c); 761 c = '\n'; 762 } 763 } 764 } 765 else { 766 if(warn) 767 fprint(2, "warning: non-whitespace control character %d ignored\n", c); 768 c = 0; 769 } 770 } 771 else if(c == '<') { 772 ungetchar(ts, c); 773 break; 774 } 775 if(c != 0) { 776 buf[j++] = c; 777 if(j == BIGBUFSIZE-1) { 778 s = buftostr(s, buf, j); 779 j = 0; 780 } 781 } 782 c = getchar(ts); 783 } 784 s = buftostr(s, buf, j); 785 if(s == nil) 786 return -1; 787 tok = &a[(*pai)++]; 788 tok->tag = Data; 789 tok->text = s; 790 tok->attr = nil; 791 tok->starti = starti; 792 return Data; 793 } 794 795 // The rules for lexing scripts are different (ugh). 796 // Gather up everything until see a </SCRIPT>. 797 static int 798 getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai) 799 { 800 Rune* s; 801 int j; 802 int tstarti; 803 int savei; 804 int c; 805 int tag; 806 int done; 807 Token* tok; 808 Rune buf[BIGBUFSIZE]; 809 810 s = nil; 811 j = 0; 812 tstarti = starti; 813 c = firstc; 814 done = 0; 815 while(c >= 0) { 816 if(c == '<') { 817 // other browsers ignore stuff to end of line after <! 818 savei = ts->i; 819 c = getchar(ts); 820 if(c == '!') { 821 while(c >= 0 && c != '\n' && c != '\r') 822 c = getchar(ts); 823 if(c == '\r') 824 c = getchar(ts); 825 if(c == '\n') 826 c = getchar(ts); 827 } 828 else if(c >= 0) { 829 backup(ts, savei); 830 tag = gettag(ts, tstarti, a, pai); 831 if(tag == -1) 832 break; 833 if(tag != Comment) 834 (*pai)--; 835 backup(ts, tstarti); 836 if(tag == Tscript + RBRA) { 837 done = 1; 838 break; 839 } 840 // here tag was not </SCRIPT>, so take as regular data 841 c = getchar(ts); 842 } 843 } 844 if(c < 0) 845 break; 846 if(c != 0) { 847 buf[j++] = c; 848 if(j == BIGBUFSIZE-1) { 849 s = buftostr(s, buf, j); 850 j = 0; 851 } 852 } 853 tstarti = ts->i; 854 c = getchar(ts); 855 } 856 if(done || ts->i == ts->edata) { 857 s = buftostr(s, buf, j); 858 tok = &a[(*pai)++]; 859 tok->tag = Data; 860 tok->text = s; 861 tok->attr = nil; 862 tok->starti = starti; 863 return Data; 864 } 865 backup(ts, starti); 866 return -1; 867 } 868 869 // We've just seen a '<'. Gather up stuff to closing '>' (if buffer 870 // ends before then, return -1). 871 // If it's a tag, look up the name, gather the attributes, and return 872 // the appropriate token. 873 // Else it's either just plain data or some kind of ignorable stuff: 874 // return Data or Comment as appropriate. 875 // If it's not a Comment, put it in a[*pai] and bump *pai. 876 static int 877 gettag(TokenSource* ts, int starti, Token* a, int* pai) 878 { 879 int rbra; 880 int ans; 881 Attr* al; 882 int nexti; 883 int c; 884 int ti; 885 int afnd; 886 int attid; 887 int quote; 888 Rune* val; 889 int nv; 890 int i; 891 int tag; 892 Token* tok; 893 Rune buf[BIGBUFSIZE]; 894 895 rbra = 0; 896 nexti = ts->i; 897 tok = &a[*pai]; 898 tok->tag = Notfound; 899 tok->text = nil; 900 tok->attr = nil; 901 tok->starti = starti; 902 c = getchar(ts); 903 if(c == '/') { 904 rbra = RBRA; 905 c = getchar(ts); 906 } 907 if(c < 0) 908 goto eob_done; 909 if(c >= 256 || !isalpha(c)) { 910 // not a tag 911 if(c == '!') { 912 ans = comment(ts); 913 if(ans != -1) 914 return ans; 915 goto eob_done; 916 } 917 else { 918 backup(ts, nexti); 919 tok->tag = Data; 920 tok->text = _Strdup(L"<"); 921 (*pai)++; 922 return Data; 923 } 924 } 925 // c starts a tagname 926 buf[0] = c; 927 i = 1; 928 while(1) { 929 c = getchar(ts); 930 if(c < 0) 931 goto eob_done; 932 if(!ISNAMCHAR(c)) 933 break; 934 // if name is bigger than buf it won't be found anyway... 935 if(i < BIGBUFSIZE) 936 buf[i++] = c; 937 } 938 if(_lookup(tagtable, Numtags, buf, i, &tag)) 939 tok->tag = tag + rbra; 940 else 941 tok->text = _Strndup(buf, i); // for warning print, in build 942 943 // attribute gathering loop 944 al = nil; 945 while(1) { 946 // look for "ws name" or "ws name ws = ws val" (ws=whitespace) 947 // skip whitespace 948 attrloop_continue: 949 while(c < 256 && isspace(c)) { 950 c = getchar(ts); 951 if(c < 0) 952 goto eob_done; 953 } 954 if(c == '>') 955 goto attrloop_done; 956 if(c == '<') { 957 if(warn) 958 fprint(2, "warning: unclosed tag\n"); 959 ungetchar(ts, c); 960 goto attrloop_done; 961 } 962 if(c >= 256 || !isalpha(c)) { 963 if(warn) 964 fprint(2, "warning: expected attribute name\n"); 965 // skipt to next attribute name 966 while(1) { 967 c = getchar(ts); 968 if(c < 0) 969 goto eob_done; 970 if(c < 256 && isalpha(c)) 971 goto attrloop_continue; 972 if(c == '<') { 973 if(warn) 974 fprint(2, "warning: unclosed tag\n"); 975 ungetchar(ts, 60); 976 goto attrloop_done; 977 } 978 if(c == '>') 979 goto attrloop_done; 980 } 981 } 982 // gather attribute name 983 buf[0] = c; 984 i = 1; 985 while(1) { 986 c = getchar(ts); 987 if(c < 0) 988 goto eob_done; 989 if(!ISNAMCHAR(c)) 990 break; 991 if(i < BIGBUFSIZE-1) 992 buf[i++] = c; 993 } 994 afnd = _lookup(attrtable, Numattrs, buf, i, &attid); 995 if(warn && !afnd) { 996 buf[i] = 0; 997 fprint(2, "warning: unknown attribute name %S\n", buf); 998 } 999 // skip whitespace 1000 while(c < 256 && isspace(c)) { 1001 c = getchar(ts); 1002 if(c < 0) 1003 goto eob_done; 1004 } 1005 if(c != '=') { 1006 if(afnd) 1007 al = newattr(attid, nil, al); 1008 goto attrloop_continue; 1009 } 1010 //# c is '=' here; skip whitespace 1011 while(1) { 1012 c = getchar(ts); 1013 if(c < 0) 1014 goto eob_done; 1015 if(c >= 256 || !isspace(c)) 1016 break; 1017 } 1018 quote = 0; 1019 if(c == '\'' || c == '"') { 1020 quote = c; 1021 c = getchar(ts); 1022 if(c < 0) 1023 goto eob_done; 1024 } 1025 val = nil; 1026 nv = 0; 1027 while(1) { 1028 valloop_continue: 1029 if(c < 0) 1030 goto eob_done; 1031 if(c == '>') { 1032 if(quote) { 1033 // c might be part of string (though not good style) 1034 // but if line ends before close quote, assume 1035 // there was an unmatched quote 1036 ti = ts->i; 1037 while(1) { 1038 c = getchar(ts); 1039 if(c < 0) 1040 goto eob_done; 1041 if(c == quote) { 1042 backup(ts, ti); 1043 buf[nv++] = '>'; 1044 if(nv == BIGBUFSIZE-1) { 1045 val = buftostr(val, buf, nv); 1046 nv = 0; 1047 } 1048 c = getchar(ts); 1049 goto valloop_continue; 1050 } 1051 if(c == '\n') { 1052 if(warn) 1053 fprint(2, "warning: apparent unmatched quote\n"); 1054 backup(ts, ti); 1055 c = '>'; 1056 goto valloop_done; 1057 } 1058 } 1059 } 1060 else 1061 goto valloop_done; 1062 } 1063 if(quote) { 1064 if(c == quote) { 1065 c = getchar(ts); 1066 if(c < 0) 1067 goto eob_done; 1068 goto valloop_done; 1069 } 1070 if(c == '\r') { 1071 c = getchar(ts); 1072 goto valloop_continue; 1073 } 1074 if(c == '\t' || c == '\n') 1075 c = ' '; 1076 } 1077 else { 1078 if(c < 256 && isspace(c)) 1079 goto valloop_done; 1080 } 1081 if(c == '&') { 1082 c = ampersand(ts); 1083 if(c == -1) 1084 goto eob_done; 1085 } 1086 buf[nv++] = c; 1087 if(nv == BIGBUFSIZE-1) { 1088 val = buftostr(val, buf, nv); 1089 nv = 0; 1090 } 1091 c = getchar(ts); 1092 } 1093 valloop_done: 1094 if(afnd) { 1095 val = buftostr(val, buf, nv); 1096 al = newattr(attid, val, al); 1097 } 1098 } 1099 1100 attrloop_done: 1101 tok->attr = al; 1102 (*pai)++; 1103 return tok->tag; 1104 1105 eob_done: 1106 if(warn) 1107 fprint(2, "warning: incomplete tag at end of page\n"); 1108 backup(ts, nexti); 1109 tok->tag = Data; 1110 tok->text = _Strdup(L"<"); 1111 return Data; 1112 } 1113 1114 // We've just read a '<!' at position starti, 1115 // so this may be a comment or other ignored section, or it may 1116 // be just a literal string if there is no close before end of file 1117 // (other browsers do that). 1118 // The accepted practice seems to be (note: contrary to SGML spec!): 1119 // If see <!--, look for --> to close, or if none, > to close. 1120 // If see <!(not --), look for > to close. 1121 // If no close before end of file, leave original characters in as literal data. 1122 // 1123 // If we see ignorable stuff, return Comment. 1124 // Else return nil (caller should back up and try again when more data arrives, 1125 // unless at end of file, in which case caller should just make '<' a data token). 1126 static int 1127 comment(TokenSource* ts) 1128 { 1129 int nexti; 1130 int havecomment; 1131 int c; 1132 1133 nexti = ts->i; 1134 havecomment = 0; 1135 c = getchar(ts); 1136 if(c == '-') { 1137 c = getchar(ts); 1138 if(c == '-') { 1139 if(findstr(ts, L"-->")) 1140 havecomment = 1; 1141 else 1142 backup(ts, nexti); 1143 } 1144 } 1145 if(!havecomment) { 1146 if(c == '>') 1147 havecomment = 1; 1148 else if(c >= 0) { 1149 if(findstr(ts, L">")) 1150 havecomment = 1; 1151 } 1152 } 1153 if(havecomment) 1154 return Comment; 1155 return -1; 1156 } 1157 1158 // Look for string s in token source. 1159 // If found, return 1, with buffer at next char after s, 1160 // else return 0 (caller should back up). 1161 static int 1162 findstr(TokenSource* ts, Rune* s) 1163 { 1164 int c0; 1165 int n; 1166 int nexti; 1167 int i; 1168 int c; 1169 1170 c0 = s[0]; 1171 n = runestrlen(s); 1172 while(1) { 1173 c = getchar(ts); 1174 if(c < 0) 1175 break; 1176 if(c == c0) { 1177 if(n == 1) 1178 return 1; 1179 nexti = ts->i; 1180 for(i = 1; i < n; i++) { 1181 c = getchar(ts); 1182 if(c < 0) 1183 goto mainloop_done; 1184 if(c != s[i]) 1185 break; 1186 } 1187 if(i == n) 1188 return 1; 1189 backup(ts, nexti); 1190 } 1191 } 1192 mainloop_done: 1193 return 0; 1194 } 1195 1196 // We've just read an '&'; look for an entity reference 1197 // name, and if found, return translated char. 1198 // if there is a complete entity name but it isn't known, 1199 // try prefixes (gets around some buggy HTML out there), 1200 // and if that fails, back up to just past the '&' and return '&'. 1201 // If the entity can't be completed in the current buffer, back up 1202 // to the '&' and return -1. 1203 static int 1204 ampersand(TokenSource* ts) 1205 { 1206 int savei; 1207 int c; 1208 int fnd; 1209 int ans; 1210 int v; 1211 int i; 1212 int k; 1213 Rune buf[SMALLBUFSIZE]; 1214 1215 savei = ts->i; 1216 c = getchar(ts); 1217 fnd = 0; 1218 ans = -1; 1219 if(c == '#') { 1220 c = getchar(ts); 1221 v = 0; 1222 while(c >= 0) { 1223 if(!(c < 256 && isdigit(c))) 1224 break; 1225 v = v*10 + c - 48; 1226 c = getchar(ts); 1227 } 1228 if(c >= 0) { 1229 if(!(c == ';' || c == '\n' || c == '\r')) 1230 ungetchar(ts, c); 1231 c = v; 1232 if(c == 160) 1233 c = 160; 1234 if(c >= Winstart && c <= Winend) { 1235 c = winchars[c - Winstart]; 1236 } 1237 ans = c; 1238 fnd = 1; 1239 } 1240 } 1241 else if(c < 256 && isalpha(c)) { 1242 buf[0] = c; 1243 k = 1; 1244 while(1) { 1245 c = getchar(ts); 1246 if(c < 0) 1247 break; 1248 if(ISNAMCHAR(c)) { 1249 if(k < SMALLBUFSIZE-1) 1250 buf[k++] = c; 1251 } 1252 else { 1253 if(!(c == ';' || c == '\n' || c == '\r')) 1254 ungetchar(ts, c); 1255 break; 1256 } 1257 } 1258 if(c >= 0) { 1259 fnd = _lookup(chartab, NCHARTAB, buf, k, &ans); 1260 if(!fnd) { 1261 // Try prefixes of s 1262 if(c == ';' || c == '\n' || c == '\r') 1263 ungetchar(ts, c); 1264 i = k; 1265 while(--k > 0) { 1266 fnd = _lookup(chartab, NCHARTAB, buf, k, &ans); 1267 if(fnd) { 1268 while(i > k) { 1269 i--; 1270 ungetchar(ts, buf[i]); 1271 } 1272 break; 1273 } 1274 } 1275 } 1276 } 1277 } 1278 if(!fnd) { 1279 backup(ts, savei); 1280 ans = '&'; 1281 } 1282 return ans; 1283 } 1284 1285 // Get next char, obeying ts.chset. 1286 // Returns -1 if no complete character left before current end of data. 1287 static int 1288 getchar(TokenSource* ts) 1289 { 1290 uchar* buf; 1291 int c; 1292 int n; 1293 int ok; 1294 Rune r; 1295 1296 if(ts->i >= ts->edata) 1297 return -1; 1298 buf = ts->data; 1299 c = buf[ts->i]; 1300 switch(ts->chset) { 1301 case ISO_8859_1: 1302 if(c >= Winstart && c <= Winend) 1303 c = winchars[c - Winstart]; 1304 ts->i++; 1305 break; 1306 case US_Ascii: 1307 if(c > 127) { 1308 if(warn) 1309 fprint(2, "non-ascii char (%x) when US-ASCII specified\n", c); 1310 } 1311 ts->i++; 1312 break; 1313 case UTF_8: 1314 ok = fullrune((char*)(buf+ts->i), ts->edata-ts->i); 1315 n = chartorune(&r, (char*)(buf+ts->i)); 1316 if(ok) { 1317 if(warn && c == 0x80) 1318 fprint(2, "warning: invalid utf-8 sequence (starts with %x)\n", ts->data[ts->i]); 1319 ts->i += n; 1320 c = r; 1321 } 1322 else { 1323 // not enough bytes in buf to complete utf-8 char 1324 ts->i = ts->edata; // mark "all used" 1325 c = -1; 1326 } 1327 break; 1328 case Unicode: 1329 if(ts->i < ts->edata - 1) { 1330 //standards say most-significant byte first 1331 c = (c << 8)|(buf[ts->i + 1]); 1332 ts->i += 2; 1333 } 1334 else { 1335 ts->i = ts->edata; // mark "all used" 1336 c = -1; 1337 } 1338 break; 1339 } 1340 return c; 1341 } 1342 1343 // Assuming c was the last character returned by getchar, set 1344 // things up so that next getchar will get that same character 1345 // followed by the current 'next character', etc. 1346 static void 1347 ungetchar(TokenSource* ts, int c) 1348 { 1349 int n; 1350 Rune r; 1351 char a[UTFmax]; 1352 1353 n = 1; 1354 switch(ts->chset) { 1355 case UTF_8: 1356 if(c >= 128) { 1357 r = c; 1358 n = runetochar(a, &r); 1359 } 1360 break; 1361 case Unicode: 1362 n = 2; 1363 break; 1364 } 1365 ts->i -= n; 1366 } 1367 1368 // Restore ts so that it is at the state where the index was savei. 1369 static void 1370 backup(TokenSource* ts, int savei) 1371 { 1372 if(dbglex) 1373 fprint(2, "lex: backup; i=%d, savei=%d\n", ts->i, savei); 1374 ts->i = savei; 1375 } 1376 1377 1378 // Look for value associated with attribute attid in token t. 1379 // If there is one, return 1 and put the value in *pans, 1380 // else return 0. 1381 // If xfer is true, transfer ownership of the string to the caller 1382 // (nil it out here); otherwise, caller must duplicate the answer 1383 // if it needs to save it. 1384 // OK to have pans==0, in which case this is just looking 1385 // to see if token is present. 1386 int 1387 _tokaval(Token* t, int attid, Rune** pans, int xfer) 1388 { 1389 Attr* attr; 1390 1391 attr = t->attr; 1392 while(attr != nil) { 1393 if(attr->attid == attid) { 1394 if(pans != nil) 1395 *pans = attr->value; 1396 if(xfer) 1397 attr->value = nil; 1398 return 1; 1399 } 1400 attr = attr->next; 1401 } 1402 if(pans != nil) 1403 *pans = nil; 1404 return 0; 1405 } 1406 1407 static int 1408 Tconv(Fmt *f) 1409 { 1410 Token* t; 1411 int i; 1412 int tag; 1413 char* srbra; 1414 Rune* aname; 1415 Rune* tname; 1416 Attr* a; 1417 char buf[BIGBUFSIZE]; 1418 1419 t = va_arg(f->args, Token*); 1420 if(t == nil) 1421 sprint(buf, "<null>"); 1422 else { 1423 i = 0; 1424 if(dbglex > 1) 1425 i = snprint(buf, sizeof(buf), "[%d]", t->starti); 1426 tag = t->tag; 1427 if(tag == Data) { 1428 i += snprint(buf+i, sizeof(buf)-i-1, "'%S'", t->text); 1429 } 1430 else { 1431 srbra = ""; 1432 if(tag >= RBRA) { 1433 tag -= RBRA; 1434 srbra = "/"; 1435 } 1436 tname = tagnames[tag]; 1437 if(tag == Notfound) 1438 tname = L"?"; 1439 i += snprint(buf+i, sizeof(buf)-i-1, "<%s%S", srbra, tname); 1440 for(a = t->attr; a != nil; a = a->next) { 1441 aname = attrnames[a->attid]; 1442 i += snprint(buf+i, sizeof(buf)-i-1, " %S", aname); 1443 if(a->value != nil) 1444 i += snprint(buf+i, sizeof(buf)-i-1, "=%S", a->value); 1445 } 1446 i += snprint(buf+i, sizeof(buf)-i-1, ">"); 1447 } 1448 buf[i] = 0; 1449 } 1450 return fmtstrcpy(f, buf); 1451 } 1452 1453 // Attrs own their constituent strings, but build may eventually 1454 // transfer some values to its items and nil them out in the Attr. 1455 static Attr* 1456 newattr(int attid, Rune* value, Attr* link) 1457 { 1458 Attr* ans; 1459 1460 ans = (Attr*)emalloc(sizeof(Attr)); 1461 ans->attid = attid; 1462 ans->value = value; 1463 ans->next = link; 1464 return ans; 1465 } 1466 1467 // Free list of Attrs linked through next field 1468 static void 1469 freeattrs(Attr* ahead) 1470 { 1471 Attr* a; 1472 Attr* nexta; 1473 1474 a = ahead; 1475 while(a != nil) { 1476 nexta = a->next; 1477 free(a->value); 1478 free(a); 1479 a = nexta; 1480 } 1481 } 1482 1483 // Free array of Tokens. 1484 // Allocated space might have room for more than n tokens, 1485 // but only n of them are initialized. 1486 // If caller has transferred ownership of constitutent strings 1487 // or attributes, it must have nil'd out the pointers in the Tokens. 1488 void 1489 _freetokens(Token* tarray, int n) 1490 { 1491 int i; 1492 Token* t; 1493 1494 if(tarray == nil) 1495 return; 1496 for(i = 0; i < n; i++) { 1497 t = &tarray[i]; 1498 free(t->text); 1499 freeattrs(t->attr); 1500 } 1501 free(tarray); 1502 } 1503