implement HTML;

#
# HTML lexer: turns a byte buffer into an array of Lex tokens
# (tags with their attributes, and runs of character data).
#

include "sys.m";
include "html.m";
include "strinttab.m";

sys: Sys;
T: StringIntTab;

Stringtab: adt
{
	name:	string;
	val:	int;
};

# Named character entities, searched by ampersand() with T->lookup.
# The entries are kept in ascending string order.
# NOTE(review): T->lookup presumably relies on that ordering (binary
# search) - confirm against strinttab before inserting new names.
chartab:= array[] of { T->StringInt
	("AElig", 'Æ'),
	("Aacute", 'Á'),
	("Acirc", 'Â'),
	("Agrave", 'À'),
	("Aring", 'Å'),
	("Atilde", 'Ã'),
	("Auml", 'Ä'),
	("Ccedil", 'Ç'),
	("ETH", 'Ð'),
	("Eacute", 'É'),
	("Ecirc", 'Ê'),
	("Egrave", 'È'),
	("Euml", 'Ë'),
	("Iacute", 'Í'),
	("Icirc", 'Î'),
	("Igrave", 'Ì'),
	("Iuml", 'Ï'),
	("Ntilde", 'Ñ'),
	("Oacute", 'Ó'),
	("Ocirc", 'Ô'),
	("Ograve", 'Ò'),
	("Oslash", 'Ø'),
	("Otilde", 'Õ'),
	("Ouml", 'Ö'),
	("THORN", 'Þ'),
	("Uacute", 'Ú'),
	("Ucirc", 'Û'),
	("Ugrave", 'Ù'),
	("Uuml", 'Ü'),
	("Yacute", 'Ý'),
	("aacute", 'á'),
	("acirc", 'â'),
	("acute", '´'),
	("aelig", 'æ'),
	("agrave", 'à'),
	("alpha", 'α'),
	("amp", '&'),
	("aring", 'å'),
	("atilde", 'ã'),
	("auml", 'ä'),
	("beta", 'β'),
	("brvbar", '¦'),
	("ccedil", 'ç'),
	("cdots", '⋯'),
	("cedil", '¸'),
	("cent", '¢'),
	("chi", 'χ'),
	("copy", '©'),
	("curren", '¤'),
	("ddots", '⋱'),
	("deg", '°'),
	("delta", 'δ'),
	("divide", '÷'),
	("eacute", 'é'),
	("ecirc", 'ê'),
	("egrave", 'è'),
	("emdash", '—'),
	("emsp", ' '),
	("endash", '–'),
	("ensp", ' '),
	("epsilon", 'ε'),
	("eta", 'η'),
	("eth", 'ð'),
	("euml", 'ë'),
	("frac12", '½'),
	("frac14", '¼'),
	("frac34", '¾'),
	("gamma", 'γ'),
	("gt", '>'),
	("iacute", 'í'),
	("icirc", 'î'),
	("iexcl", '¡'),
	("igrave", 'ì'),
	("iota", 'ι'),
	("iquest", '¿'),
	("iuml", 'ï'),
	("kappa", 'κ'),
	("lambda", 'λ'),
	("laquo", '«'),
	("ldots", '…'),
	("lt", '<'),
	("macr", '¯'),
	("micro", 'µ'),
	("middot", '·'),
	("mu", 'μ'),
	("nbsp", ' '),
	("not", '¬'),
	("ntilde", 'ñ'),
	("nu", 'ν'),
	("oacute", 'ó'),
	("ocirc", 'ô'),
	("ograve", 'ò'),
	("omega", 'ω'),
	("omicron", 'ο'),
	("ordf", 'ª'),
	("ordm", 'º'),
	("oslash", 'ø'),
	("otilde", 'õ'),
	("ouml", 'ö'),
	("para", '¶'),
	("phi", 'φ'),
	("pi", 'π'),
	("plusmn", '±'),
	("pound", '£'),
	("psi", 'ψ'),
	("quad", ' '),
	("quot", '"'),
	("raquo", '»'),
	("reg", '®'),
	("rho", 'ρ'),
	("sect", '§'),
	# soft hyphen, written as an escape because the raw character is invisible
	("shy", '\u00AD'),
	("sigma", 'σ'),
	("sp", ' '),
	("sup1", '¹'),
	("sup2", '²'),
	("sup3", '³'),
	("szlig", 'ß'),
	("tau", 'τ'),
	("theta", 'θ'),
	("thinsp", ' '),
	("thorn", 'þ'),
	("times", '×'),
	("trade", '™'),
	("uacute", 'ú'),
	("ucirc", 'û'),
	("ugrave", 'ù'),
	("uml", '¨'),
	("upsilon", 'υ'),
	("uuml", 'ü'),
	("varepsilon", '∈'),
	("varphi", 'ϕ'),
	("varpi", 'ϖ'),
	("varrho", 'ϱ'),
	("vdots", '⋮'),
	("vsigma", 'ς'),
	("vtheta", 'ϑ'),
	("xi", 'ξ'),
	("yacute", 'ý'),
	("yen", '¥'),
	("yuml", 'ÿ'),
	("zeta", 'ζ'),
};

# Lower-case tag names and their tag codes, searched by gettag() with
# T->lookup and by lex2string() with T->revlookup.  Kept in ascending
# string order, like chartab.
# NOTE(review): "att_footer" contains '_', which ctype does not class as
# a name character, so gettag() stops at "att" and this entry can never
# match - confirm whether '_' should be given class N.
htmlstringtab := array[] of { T->StringInt
	("a", Ta),
	("address", Taddress),
	("applet", Tapplet),
	("area", Tarea),
	("att_footer", Tatt_footer),
	("b", Tb),
	("base", Tbase),
	("basefont", Tbasefont),
	("big", Tbig),
	("blink", Tblink),
	("blockquote", Tblockquote),
	("body", Tbody),
	("bq", Tbq),
	("br", Tbr),
	("caption", Tcaption),
	("center", Tcenter),
	("cite", Tcite),
	("code", Tcode),
	("col", Tcol),
	("colgroup", Tcolgroup),
	("dd", Tdd),
	("dfn", Tdfn),
	("dir", Tdir),
	("div", Tdiv),
	("dl", Tdl),
	("dt", Tdt),
	("em", Tem),
	("font", Tfont),
	("form", Tform),
	("frame", Tframe),
	("frameset", Tframeset),
	("h1", Th1),
	("h2", Th2),
	("h3", Th3),
	("h4", Th4),
	("h5", Th5),
	("h6", Th6),
	("head", Thead),
	("hr", Thr),
	("html", Thtml),
	("i", Ti),
	("img", Timg),
	("input", Tinput),
	("isindex", Tisindex),
	("item", Titem),
	("kbd", Tkbd),
	("li", Tli),
	("link", Tlink),
	("map", Tmap),
	("menu", Tmenu),
	("meta", Tmeta),
	("nobr", Tnobr),
	("noframes", Tnoframes),
	("ol", Tol),
	("option", Toption),
	("p", Tp),
	("param", Tparam),
	("pre", Tpre),
	("q", Tq),
	("samp", Tsamp),
	("script", Tscript),
	("select", Tselect),
	("small", Tsmall),
	("strike", Tstrike),
	("strong", Tstrong),
	("style", Tstyle),
	("sub", Tsub),
	("sup", Tsup),
	("t", Tt),
	("table", Ttable),
	("tbody", Ttbody),
	("td", Ttd),
	("textarea", Ttextarea),
	("textflow", Ttextflow),
	("tfoot", Ttfoot),
	("th", Tth),
	("thead", Tthead),
	("title", Ttitle),
	("tr", Ttr),
	("tt", Ttt),
	("u", Tu),
	("ul", Tul),
	("var", Tvar)
};

# Character class bits stored in ctype:
#	W	white space (space, newline, tab, carriage return)
#	D	decimal digit
#	L	lower-case ASCII letter
#	U	upper-case ASCII letter
#	N	other characters allowed inside a name ('.' and '-')
W, D, L, U, N: con byte (1<<iota);
NCTYPE: con 256;

# Class of each of the first NCTYPE character codes; codes not listed
# have no class.  Callers must check c < NCTYPE before indexing when c
# may be a decoded character rather than a raw byte.
ctype := array[NCTYPE] of {
	'0'=>D, '1'=>D, '2'=>D, '3'=>D, '4'=>D,
	'5'=>D, '6'=>D, '7'=>D, '8'=>D, '9'=>D,
	'A'=>U, 'B'=>U, 'C'=>U, 'D'=>U, 'E'=>U, 'F'=>U,
	'G'=>U, 'H'=>U, 'I'=>U, 'J'=>U, 'K'=>U, 'L'=>U,
	'M'=>U, 'N'=>U, 'O'=>U, 'P'=>U, 'Q'=>U, 'R'=>U,
	'S'=>U, 'T'=>U, 'U'=>U, 'V'=>U, 'W'=>U, 'X'=>U,
	'Y'=>U, 'Z'=>U,
	'a'=>L, 'b'=>L, 'c'=>L, 'd'=>L, 'e'=>L, 'f'=>L,
	'g'=>L, 'h'=>L, 'i'=>L, 'j'=>L, 'k'=>L, 'l'=>L,
	'm'=>L, 'n'=>L, 'o'=>L, 'p'=>L, 'q'=>L, 'r'=>L,
	's'=>L, 't'=>L, 'u'=>L, 'v'=>L, 'w'=>L, 'x'=>L,
	'y'=>L, 'z'=>L,
	'.'=>N, '-'=>N,
	' '=>W, '\n'=>W, '\t'=>W, '\r'=>W,
	* => byte 0
};

# Split the bytes of b into tokens.
#	b	the HTML source
#	charset	UTF8 decodes with sys->byte2char; any other value
#		(normally Latin1) takes one byte per character
#	keepwh	if zero, white space between tokens is discarded and
#		newlines inside data become spaces; if non-zero both are kept
# Returns one Lex per tag or run of data, in source order.  Comments and
# NUL bytes between tokens are dropped.  Returns nil if the StringIntTab
# module cannot be loaded.
lex(b: array of byte, charset: int, keepwh: int): array of ref Lex
{
	if(sys == nil)
		sys = load Sys Sys->PATH;
	if(T == nil)
		T = load StringIntTab StringIntTab->PATH;
	if(T == nil) {
		sys->print("HTML->lex: couldn't load %s\n", StringIntTab->PATH);
		return nil;
	}

	a: array of ref Lex;
	ai := 0;
	i := 0;
	nb := len b;
	for(;;){
	Whitespace:
		for(;;){
			# ignore nulls
			while(i<nb && (int b[i] == 0))
				i++;
			# skip white space
			# NOTE(review): the second test is redundant if the literal
			# is an ASCII space (already class W) - confirm whether
			# another space character was intended.
			if(!keepwh) {
				while(i<nb) {
					c := int b[i];
					if(!(int (ctype[c]&W)) && c != ' ')
						break;
					i++;
				}
			}
			# skip comments; i+3 < nb guarantees all four opener bytes exist
			if(i+3 < nb && int b[i]=='<' && int b[i+1]=='!'
			&& int b[i+2]=='-' && int b[i+3]=='-') {
				i += 4;
				# i+2 < nb lets the terminator be the final three bytes
				while(i+2 < nb){
					if(int b[i]=='-' && int b[i+1]=='-' && int b[i+2]=='>'){
						i += 3;
						continue Whitespace;
					}
					i++;
				}
				# unterminated comment: discard the rest of the input
				# rather than lexing its tail as text
				i = nb;
				continue Whitespace;
			}
			break;
		}
		if(i == nb)
			break;
		# grow the token array 500 entries at a time
		if(ai == len a){
			na := array[len a + 500] of ref Lex;
			if(a != nil)
				na[0:] = a;
			a = na;
		}
		if(int b[i] == '<'){
			lx : ref Lex;
			(lx, i) = gettag(b, i, charset);
			a[ai++] = lx;
		}
		else {
			s: string;
			(s, i) = getdata(b, i, keepwh, charset);
			a[ai++] = ref Lex (Data, s, nil);
		}
	}
	return a[0:ai];
}

# Decode the character starting at b[i] (the caller guarantees i < len b).
# Returns the character and the index of the byte after it.  The index
# always advances by at least one: sys->byte2char consumes no bytes when
# the buffer ends inside a UTF sequence, which would otherwise stall the
# callers' scanning loops.  Any charset other than UTF8 is read one byte
# per character, so an unrecognised value cannot stall them either.
nextchar(b: array of byte, i, charset: int): (int, int)
{
	if(charset == UTF8) {
		(c, n, nil) := sys->byte2char(b, i);
		if(n <= 0)
			n = 1;
		return (c, i+n);
	}
	return (int b[i], i+1);
}

# Collect character data starting at b[i], stopping before the next '<'
# or at the end of b.
#	keepnls	if zero, newlines and carriage returns become spaces;
#		otherwise they become '\n'
# NUL and 16r1a are dropped, entities are expanded by ampersand(), and a
# carriage return adjacent to a newline is dropped so that each line
# ending yields a single character.
# Returns the text and the index at which scanning stopped.
getdata(b: array of byte, i: int, keepnls, charset: int): (string, int)
{
	s:= "";
	j:= 0;
	c: int;
	nb := len b;

loop:
	while(i < nb){
		oldi := i;
		(c, i) = nextchar(b, i, charset);
		case c {
		0 or 16r1a =>
			continue loop;
		'<' =>
			# leave the '<' for gettag
			i = oldi;
			break loop;
		'&' =>
			(c, i) = ampersand(b, i);
		'\n' =>
			if(!keepnls)
				c = ' ';
		'\r' =>
			# LF CR: the newline has already been emitted
			if(oldi > 0 && int b[oldi-1] == '\n')
				continue loop;
			# CR LF: the following newline will be emitted
			if(i < nb && int b[i] == '\n')
				continue loop;
			if(keepnls)
				c = '\n';
			else
				c = ' ';
		}
		s[j++] = c;
	}
	return (s, i);
}

# Parse the tag whose '<' is at b[i].
# Returns the token and the index just past the tag.  The token's tag is
# the code from htmlstringtab (plus RBRA for a closing tag) when the name
# is known; otherwise it is Notfound and text holds the lower-cased name,
# or the raw bytes after '<' when no name could be read.  attr holds the
# (name, value) pairs in reverse source order; an attribute written
# without '=' gets the value "".
gettag(b: array of byte, i, charset: int): (ref Lex, int)
{
	rbra := 0;
	nb := len b;
	ans := ref Lex(Notfound, "", nil);
	al: list of Attr;
	if(++i == nb)
		return (ans, i);
	istart := i;
	c := int b[i];
	if(c == '/') {
		rbra = RBRA;
		if(++i == nb)
			return (ans, i);
		c = int b[i];
	}
	if(c>=NCTYPE || !int (ctype[c]&(L|U))) {
		# not a tag name (e.g. "<!..."): swallow through the closing '>'
		while(i < nb) {
			c = int b[i++];
			if(c == '>')
				break;
		}
		ans.text = string b[istart:i];
		return (ans, i);
	}
	namstart := i;
	while(c<NCTYPE && int (ctype[c]&(L|U|D|N))) {
		if(++i == nb) {
			ans.text = string b[istart:i];
			return (ans, i);
		}
		c = int b[i];
	}
	name := lowercase(b, namstart, i);
	(fnd, tag) := T->lookup(htmlstringtab, name);
	if(fnd)
		ans.tag = tag+rbra;
	else
		ans.text = name;
attrloop:
	while(i < nb){
		# look for "ws name" or "ws name ws = ws val"  (ws=whitespace)
		# skip whitespace
		while(c<NCTYPE && int (ctype[c]&W)) {
			if(++i == nb)
				break attrloop;
			c = int b[i];
		}
		if(c == '>') {
			i++;
			break;
		}
		if(c == '<')
			break;	# error: unclosed tag
		if(c>=NCTYPE || !int (ctype[c]&(L|U))) {
			# error, not the start of a name
			# skip to end of tag
			while(i < nb) {
				c = int b[i++];
				if(c == '>')
					break;
			}
			break attrloop;
		}
		# gather name
		namstart = i;
		while(c<NCTYPE && int (ctype[c]&(L|U|D|N))) {
			if(++i == nb)
				break attrloop;
			c = int b[i];
		}
		name = lowercase(b, namstart, i);
		# skip whitespace
		while(c<NCTYPE && int (ctype[c]&W)) {
			if(++i == nb)
				break attrloop;
			c = int b[i];
		}
		if(c != '=') {
			# no value for this attr
			al = (name, "") :: al;
			continue attrloop;
		}
		# skip whitespace
		if(++i == nb)
			break attrloop;
		c = int b[i];
		while(c<NCTYPE && int (ctype[c]&W)) {
			if(++i == nb)
				break attrloop;
			c = int b[i];
		}
		# gather value
		quote := 0;
		if(c == '\'' || c == '"') {
			quote = c;
			i++;
		}
		val := "";
		nv := 0;
	valloop:
		while(i < nb) {
			(c, i) = nextchar(b, i, charset);
			if(c == '>') {
				if(quote) {
					# c might be part of string (though not good style)
					# but if line ends before close quote, assume
					# there was an unmatched quote
					for(k := i; k < nb; k++) {
						c = int b[k];
						if(c == quote) {
							val[nv++] = '>';
							continue valloop;
						}
						if(c == '\n') {
							i--;
							break valloop;
						}
					}
				}
				# back up so the attribute loop sees the '>'
				i--;
				break valloop;
			}
			if(quote) {
				if(c == quote)
					break valloop;
				# newlines inside a quoted value are dropped
				if(c == '\n')
					continue valloop;
				if(c == '\t' || c == '\r')
					c = ' ';
			}
			else {
				if(c<NCTYPE && int (ctype[c]&W))
					break valloop;
			}
			if(c == '&')
				(c, i) = ampersand(b, i);
			val[nv++] = c;
		}
		al = (name, val) :: al;
		if(i < nb)
			c = int b[i];
	}
	ans.attr = al;
	return (ans, i);
}

# Expand the character reference whose text starts at b[i], the byte
# after the '&'.
# Handles decimal references "#ddd" with value 1..255 (160, the
# non-breaking space, is returned as ' ') and the names in chartab.
# A following ';' or newline is consumed.
# Returns the character and the index after the reference, or ('&', i)
# with i unchanged when there is nothing recognisable to expand.
ampersand(b: array of byte, i: int): (int, int)
{
	starti := i;
	c := 0;
	nb := len b;
	# a lone '&' at the end of the input is kept as literal text
	if(i >= nb)
		return ('&', i);
	fnd := 0;
	ans := 0;
	if(int b[i] == '#'){
		i++;
		while(i<nb){
			d := int b[i];
			if(!(int (ctype[d]&D)))
				break;
			# stop accumulating once out of range so a long digit
			# string cannot overflow and wrap back into 1..255
			if(c < 256)
				c = c*10 + d-'0';
			i++;
		}
		if(0<c && c<256) {
			if(c==160)
				c = ' ';	# non-breaking space
			ans = c;
			fnd = 1;
		}
	}
	else {
		s := "";
		k := 0;
		c = int b[i];
		if(int (ctype[c]&(L|U))) {
			while(i<nb) {
				c = int b[i];
				if(!(int (ctype[c]&(L|U|D|N))))
					break;
				s[k++] = c;
				i++;
			}
		}
		(fnd, ans) = T->lookup(chartab, s);
	}
	if(!fnd)
		return ('&', starti);
	if(i<nb && (int b[i]==';' || int b[i]=='\n'))
		i++;
	return (ans, i);
}

# Return b[istart:iend] as a string with ASCII upper-case letters
# converted to lower case; each byte is taken as one character.
lowercase(b: array of byte, istart, iend: int): string
{
	l := "";
	j := 0;
	for(i:=istart; i<iend; i++) {
		c := int b[i];
		if(c < NCTYPE && int (ctype[c]&U))
			l[j] = c-'A'+'a';
		else
			l[j] = c;
		j++;
	}
	return l;
}

# Return a copy of s with ASCII lower-case letters converted to upper case.
uppercase(s: string): string
{
	l := "";

	for(i:=0; i<len s; i++) {
		c := s[i];
		if(c < NCTYPE && int (ctype[c]&L))
			l[i] = c+'A'-'a';
		else
			l[i] = c;
	}
	return l;
}

# Look up name in an attribute list.
# Returns (1, value) for the first attribute with that name, else (0, "").
attrvalue(attr: list of Attr, name: string): (int, string)
{
	while(attr != nil){
		a := hd attr;
		if(a.name == name)
			return (1, a.value);
		attr = tl attr;
	}
	return (0, "");
}

# Find the first token in html with the given tag and look up attr in it.
# Returns attrvalue's result for that token, or (0, "") if no token has
# the tag.  Later tokens with the same tag are not examined.
globalattr(html: array of ref Lex, tag: int, attr: string): (int, string)
{
	for(i:=0; i<len html; i++)
		if(html[i].tag == tag)
			return attrvalue(html[i].attr, attr);
	return (0, "");
}

# Scan h from index i: return 1 if one of the listed opening tags is
# reached before any Data token, 0 if Data comes first or the array ends.
# Tokens with any other tag are skipped.
isbreak(h: array of ref Lex, i: int): int
{
	for(; i<len h; i++){
		case h[i].tag{
		Th1 or Th2 or Th3 or Th4 or Th5 or Th6 or
		Tbr or Tp or Tbody or Taddress or Tblockquote or
		Tul or Tdl or Tdir or Tmenu or Tol or Tpre or Thr or Tform =>
			return 1;
		Data =>
			return 0;
		}
	}
	return 0;
}

# for debugging
# Format a token as text: data as 'text', a tag as <NAME attr='value' ...>
# with a leading '/' for closing tags.  The name is omitted when the tag
# code is not in htmlstringtab.
lex2string(l: ref Lex): string
{
	ans := "";
	tag := l.tag;
	if(tag == HTML->Data)
		ans = "'" + l.text + "'";
	else {
		ans = "<";
		if(tag >= RBRA) {
			tag -= RBRA;
			ans = ans + "/";
		}
		# T is normally loaded by lex(); load it here so this routine
		# also works on tokens when lex() has not run in this instance
		tname: string;
		if(T == nil)
			T = load StringIntTab StringIntTab->PATH;
		if(T != nil)
			tname = T->revlookup(htmlstringtab, tag);
		if(tname != nil)
			ans = ans + uppercase(tname);
		for(al := l.attr; al != nil; al = tl al) {
			a := hd al;
			ans = ans + " " + a.name + "='" + a.value + "'";
		}
		ans = ans + ">";
	}
	return ans;
}