implement HTML;

#
# Small HTML lexer.  lex() turns a byte buffer into an array of Lex
# tokens (tags with their attributes, or runs of character data);
# the remaining functions are helpers for walking that array.
#

include "sys.m";
include "html.m";
include "strinttab.m";

sys: Sys;
T: StringIntTab;

Stringtab: adt
{
	name:	string;
	val:	int;
};

#
# Character entity names (without the leading '&') and the character
# each stands for.  Searched with StringIntTab->lookup, so keep the
# existing (ASCII-sorted) order when adding entries.
#
# NOTE(review): the literals for emsp, ensp, nbsp, quad, shy, sp and
# thinsp are whitespace/invisible characters that could not be read
# reliably from the reviewed copy.  They are written here as explicit
# escapes: emsp/ensp/nbsp/shy/thinsp use the HTML 4 code points; quad
# and sp are non-standard names whose values (U+2001, U+2002) are an
# assumption -- confirm against the original file.
#
chartab:= array[] of { T->StringInt
	("AElig", 'Æ'),
	("Aacute", 'Á'),
	("Acirc", 'Â'),
	("Agrave", 'À'),
	("Aring", 'Å'),
	("Atilde", 'Ã'),
	("Auml", 'Ä'),
	("Ccedil", 'Ç'),
	("ETH", 'Ð'),
	("Eacute", 'É'),
	("Ecirc", 'Ê'),
	("Egrave", 'È'),
	("Euml", 'Ë'),
	("Iacute", 'Í'),
	("Icirc", 'Î'),
	("Igrave", 'Ì'),
	("Iuml", 'Ï'),
	("Ntilde", 'Ñ'),
	("Oacute", 'Ó'),
	("Ocirc", 'Ô'),
	("Ograve", 'Ò'),
	("Oslash", 'Ø'),
	("Otilde", 'Õ'),
	("Ouml", 'Ö'),
	("THORN", 'Þ'),
	("Uacute", 'Ú'),
	("Ucirc", 'Û'),
	("Ugrave", 'Ù'),
	("Uuml", 'Ü'),
	("Yacute", 'Ý'),
	("aacute", 'á'),
	("acirc", 'â'),
	("acute", '´'),
	("aelig", 'æ'),
	("agrave", 'à'),
	("alpha", 'α'),
	("amp", '&'),
	("aring", 'å'),
	("atilde", 'ã'),
	("auml", 'ä'),
	("beta", 'β'),
	("brvbar", '¦'),
	("ccedil", 'ç'),
	("cdots", '⋯'),
	("cedil", '¸'),
	("cent", '¢'),
	("chi", 'χ'),
	("copy", '©'),
	("curren", '¤'),
	("ddots", '⋱'),
	("deg", '°'),
	("delta", 'δ'),
	("divide", '÷'),
	("eacute", 'é'),
	("ecirc", 'ê'),
	("egrave", 'è'),
	("emdash", '—'),
	("emsp", '\u2003'),
	("endash", '–'),
	("ensp", '\u2002'),
	("epsilon", 'ε'),
	("eta", 'η'),
	("eth", 'ð'),
	("euml", 'ë'),
	("frac12", '½'),
	("frac14", '¼'),
	("frac34", '¾'),
	("gamma", 'γ'),
	("gt", '>'),
	("iacute", 'í'),
	("icirc", 'î'),
	("iexcl", '¡'),
	("igrave", 'ì'),
	("iota", 'ι'),
	("iquest", '¿'),
	("iuml", 'ï'),
	("kappa", 'κ'),
	("lambda", 'λ'),
	("laquo", '«'),
	("ldots", '…'),
	("lt", '<'),
	("macr", '¯'),
	("micro", 'µ'),
	("middot", '·'),
	("mu", 'μ'),
	("nbsp", '\u00A0'),
	("not", '¬'),
	("ntilde", 'ñ'),
	("nu", 'ν'),
	("oacute", 'ó'),
	("ocirc", 'ô'),
	("ograve", 'ò'),
	("omega", 'ω'),
	("omicron", 'ο'),
	("ordf", 'ª'),
	("ordm", 'º'),
	("oslash", 'ø'),
	("otilde", 'õ'),
	("ouml", 'ö'),
	("para", '¶'),
	("phi", 'φ'),
	("pi", 'π'),
	("plusmn", '±'),
	("pound", '£'),
	("psi", 'ψ'),
	("quad", '\u2001'),
	("quot", '"'),
	("raquo", '»'),
	("reg", '®'),
	("rho", 'ρ'),
	("sect", '§'),
	("shy", '\u00AD'),
	("sigma", 'σ'),
	("sp", '\u2002'),
	("sup1", '¹'),
	("sup2", '²'),
	("sup3", '³'),
	("szlig", 'ß'),
	("tau", 'τ'),
	("theta", 'θ'),
	("thinsp", '\u2009'),
	("thorn", 'þ'),
	("times", '×'),
	("trade", '™'),
	("uacute", 'ú'),
	("ucirc", 'û'),
	("ugrave", 'ù'),
	("uml", '¨'),
	("upsilon", 'υ'),
	("uuml", 'ü'),
	("varepsilon", '∈'),
	("varphi", 'ϕ'),
	("varpi", 'ϖ'),
	("varrho", 'ϱ'),
	("vdots", '⋮'),
	("vsigma", 'ς'),
	("vtheta", 'ϑ'),
	("xi", 'ξ'),
	("yacute", 'ý'),
	("yen", '¥'),
	("yuml", 'ÿ'),
	("zeta", 'ζ'),
};

#
# Lower-case tag names and their tag codes.  Used both ways:
# name -> code in gettag, code -> name in lex2string.
# Keep the existing sorted order when adding entries.
#
htmlstringtab := array[] of { T->StringInt
	("a", Ta),
	("address", Taddress),
	("applet", Tapplet),
	("area", Tarea),
	("att_footer", Tatt_footer),
	("b", Tb),
	("base", Tbase),
	("basefont", Tbasefont),
	("big", Tbig),
	("blink", Tblink),
	("blockquote", Tblockquote),
	("body", Tbody),
	("bq", Tbq),
	("br", Tbr),
	("caption", Tcaption),
	("center", Tcenter),
	("cite", Tcite),
	("code", Tcode),
	("col", Tcol),
	("colgroup", Tcolgroup),
	("dd", Tdd),
	("dfn", Tdfn),
	("dir", Tdir),
	("div", Tdiv),
	("dl", Tdl),
	("dt", Tdt),
	("em", Tem),
	("font", Tfont),
	("form", Tform),
	("frame", Tframe),
	("frameset", Tframeset),
	("h1", Th1),
	("h2", Th2),
	("h3", Th3),
	("h4", Th4),
	("h5", Th5),
	("h6", Th6),
	("head", Thead),
	("hr", Thr),
	("html", Thtml),
	("i", Ti),
	("img", Timg),
	("input", Tinput),
	("isindex", Tisindex),
	("item", Titem),
	("kbd", Tkbd),
	("li", Tli),
	("link", Tlink),
	("map", Tmap),
	("menu", Tmenu),
	("meta", Tmeta),
	("nobr", Tnobr),
	("noframes", Tnoframes),
	("ol", Tol),
	("option", Toption),
	("p", Tp),
	("param", Tparam),
	("pre", Tpre),
	("q", Tq),
	("samp", Tsamp),
	("script", Tscript),
	("select", Tselect),
	("small", Tsmall),
	("strike", Tstrike),
	("strong", Tstrong),
	("style", Tstyle),
	("sub", Tsub),
	("sup", Tsup),
	("t", Tt),
	("table", Ttable),
	("tbody", Ttbody),
	("td", Ttd),
	("textarea", Ttextarea),
	("textflow", Ttextflow),
	("tfoot", Ttfoot),
	("th", Tth),
	("thead", Tthead),
	("title", Ttitle),
	("tr", Ttr),
	("tt", Ttt),
	("u", Tu),
	("ul", Tul),
	("var", Tvar)
};

# Character-class bits stored in ctype:
# W whitespace, D digit, L lower-case letter, U upper-case letter,
# N other characters allowed inside a name ('.' and '-').
W, D, L, U, N: con byte (1<<iota);
NCTYPE: con 256;

# Class bits for each of the first NCTYPE character codes; anything
# not listed has no class.  Callers must check c < NCTYPE before
# indexing with a decoded (possibly non-Latin-1) character.
ctype := array[NCTYPE] of {
	'0'=>D, '1'=>D, '2'=>D, '3'=>D, '4'=>D,
	'5'=>D, '6'=>D, '7'=>D, '8'=>D, '9'=>D,
	'A'=>U, 'B'=>U, 'C'=>U, 'D'=>U, 'E'=>U, 'F'=>U,
	'G'=>U, 'H'=>U, 'I'=>U, 'J'=>U, 'K'=>U, 'L'=>U,
	'M'=>U, 'N'=>U, 'O'=>U, 'P'=>U, 'Q'=>U, 'R'=>U,
	'S'=>U, 'T'=>U, 'U'=>U, 'V'=>U, 'W'=>U, 'X'=>U,
	'Y'=>U, 'Z'=>U,
	'a'=>L, 'b'=>L, 'c'=>L, 'd'=>L, 'e'=>L, 'f'=>L,
	'g'=>L, 'h'=>L, 'i'=>L, 'j'=>L, 'k'=>L, 'l'=>L,
	'm'=>L, 'n'=>L, 'o'=>L, 'p'=>L, 'q'=>L, 'r'=>L,
	's'=>L, 't'=>L, 'u'=>L, 'v'=>L, 'w'=>L, 'x'=>L,
	'y'=>L, 'z'=>L,
	'.'=>N, '-'=>N,
	' '=>W, '\n'=>W, '\t'=>W, '\r'=>W,
	* => byte 0
};

#
# Split the HTML text in b into tokens.
#	b	raw document bytes
#	charset	Latin1 or UTF8; any other value is decoded as Latin1
#	keepwh	if zero, white space between tokens is dropped and
#		line breaks inside data become spaces; if non-zero
#		both are kept
# Returns the tokens in document order, or nil if the StringIntTab
# module cannot be loaded.  NUL bytes and <!-- ... --> comments that
# occur between tokens are skipped.
#
lex(b: array of byte, charset: int, keepwh: int): array of ref Lex
{
	if(sys == nil)
		sys = load Sys Sys->PATH;
	if(T == nil)
		T = load StringIntTab StringIntTab->PATH;
	if(T == nil) {
		sys->print("HTML->lex: couldn't load %s\n", StringIntTab->PATH);
		return nil;
	}

	a: array of ref Lex;
	ai := 0;
	i := 0;
	nb := len b;
	for(;;){
	Whitespace:
		for(;;){
			# ignore nulls
			while(i<nb && (int b[i] == 0))
				i++;
			# skip white space
			if(!keepwh) {
				while(i<nb) {
					c := int b[i];
					# NOTE(review): the second test compares against a
					# space-like literal; a plain space is already in
					# class W, so it is taken to be the Latin-1
					# non-breaking space -- confirm.
					if(!(int (ctype[c]&W)) && c != '\u00A0')
						break;
					i++;
				}
			}
			# skip comments
			if(i<nb-4 && int b[i]=='<' && int b[i+1]=='!'
					&& int b[i+2]=='-' && int b[i+3]=='-') {
				i += 4;
				# <= so that a "-->" ending exactly at the end of
				# the buffer is still recognised
				while(i<=nb-3){
					if(int b[i]=='-' && int b[i+1]=='-' && int b[i+2]=='>'){
						i += 3;
						continue Whitespace;
					}
					i++;
				}
				# unterminated comment: it swallows the rest of the input
				i = nb;
				continue Whitespace;
			}
			break;
		}
		if(i >= nb)
			break;
		# grow the result array in chunks of 500 tokens
		if(ai == len a){
			na := array[len a + 500] of ref Lex;
			if(a != nil)
				na[0:] = a;
			a = na;
		}
		if(int b[i] == '<'){
			lx : ref Lex;
			(lx, i) = gettag(b, i, charset);
			a[ai++] = lx;
		}
		else {
			s: string;
			(s, i) = getdata(b, i, keepwh, charset);
			a[ai++] = ref Lex (Data, s, nil);
		}
	}
	return a[0:ai];
}

#
# Decode the character starting at b[i] (caller guarantees i < len b).
# Returns (character, index of the next undecoded byte); the index
# always advances, so callers cannot loop forever on bad input.
# UTF8 is decoded with sys->byte2char; a sequence that byte2char
# cannot consume (e.g. truncated at the end of the buffer) yields
# Sys->UTFerror and skips one byte.  Every other charset value is
# treated as Latin1: one byte, one character.
#
nextchar(b: array of byte, i: int, charset: int): (int, int)
{
	if(charset == UTF8) {
		c, n: int;
		(c, n, nil) = sys->byte2char(b, i);
		if(n > 0)
			return (c, i+n);
		return (Sys->UTFerror, i+1);
	}
	return (int b[i], i+1);
}

#
# Collect character data starting at b[i] up to (not including) the
# next '<' or the end of the buffer.
#	keepnls	if non-zero, line breaks are kept as '\n';
#		otherwise each becomes a single space
# Entity references are expanded via ampersand(); NUL and 16r1a
# characters are dropped.  Returns (text, index where scanning stopped).
#
getdata(b: array of byte, i: int, keepnls, charset: int): (string, int)
{
	s:= "";
	j:= 0;
	c: int;
	nb := len b;

loop:
	while(i < nb){
		oldi := i;
		(c, i) = nextchar(b, i, charset);
		case c {
		0 or 16r1a =>
			continue loop;
		'<' =>
			# leave the '<' for the caller
			i = oldi;
			break loop;
		'&' =>
			(c, i) = ampersand(b, i);
		'\n' =>
			if(!keepnls)
				c = ' ';
		'\r' =>
			# a CR next to an LF (either order) is part of the same
			# line break; the LF alone produces the output character
			if(oldi > 0 && int b[oldi-1] == '\n')
				continue loop;
			if(i < nb && int b[i] == '\n')
				continue loop;
			if(keepnls)
				c = '\n';
			else
				c = ' ';
		}
		s[j++] = c;
	}
	return (s, i);
}

#
# Parse one tag; b[i] is the opening '<'.
# Returns (token, index just past the tag).
#   - Known tag name: token.tag is its code, plus RBRA for "</name".
#   - Unknown name: token.tag stays Notfound and token.text is the
#     lower-cased name (the closing-tag marker is not recorded).
#   - "<" not followed by a letter (e.g. "<!DOCTYPE ...>"): the bytes
#     after '<' up to and including '>' are returned in token.text.
# Attribute names are lower-cased; values have entities expanded.
# Attributes are prepended as they are found, so token.attr lists
# them in reverse source order.
#
gettag(b: array of byte, i, charset: int): (ref Lex, int)
{
	rbra := 0;
	nb := len b;
	ans := ref Lex(Notfound, "", nil);
	al: list of Attr;
	if(++i == nb)
		return (ans, i);
	istart := i;
	c := int b[i];
	if(c == '/') {
		rbra = RBRA;
		if(++i == nb)
			return (ans, i);
		c = int b[i];
	}
	if(c>=NCTYPE || !int (ctype[c]&(L|U))) {
		# not a tag name: swallow everything through the next '>'
		while(i < nb) {
			c = int b[i++];
			if(c == '>')
				break;
		}
		ans.text = string b[istart:i];
		return (ans, i);
	}
	namstart := i;
	while(c<NCTYPE && int (ctype[c]&(L|U|D|N))) {
		if(++i == nb) {
			ans.text = string b[istart:i];
			return (ans, i);
		}
		c = int b[i];
	}
	name := lowercase(b, namstart, i);
	(fnd, tag) := T->lookup(htmlstringtab, name);
	if(fnd)
		ans.tag = tag+rbra;
	else
		ans.text = name;
attrloop:
	while(i < nb){
		# look for "ws name" or "ws name ws = ws val"  (ws=whitespace)
		# skip whitespace
		while(c<NCTYPE && int (ctype[c]&W)) {
			if(++i == nb)
				break attrloop;
			c = int b[i];
		}
		if(c == '>') {
			i++;
			break;
		}
		if(c == '<')
			break;	# error: unclosed tag
		if(c>=NCTYPE || !int (ctype[c]&(L|U))) {
			# error, not the start of a name
			# skip to end of tag
			while(i < nb) {
				c = int b[i++];
				if(c == '>')
					break;
			}
			break attrloop;
		}
		# gather name
		namstart = i;
		while(c<NCTYPE && int (ctype[c]&(L|U|D|N))) {
			if(++i == nb)
				break attrloop;
			c = int b[i];
		}
		name = lowercase(b, namstart, i);
		# skip whitespace
		while(c<NCTYPE && int (ctype[c]&W)) {
			if(++i == nb)
				break attrloop;
			c = int b[i];
		}
		if(c != '=') {
			# no value for this attr
			al = (name, "") :: al;
			continue attrloop;
		}
		# skip whitespace
		if(++i == nb)
			break attrloop;
		c = int b[i];
		while(c<NCTYPE && int (ctype[c]&W)) {
			if(++i == nb)
				break attrloop;
			c = int b[i];
		}
		# gather value
		quote := 0;
		if(c == '\'' || c == '"') {
			quote = c;
			i++;
		}
		val := "";
		nv := 0;
	valloop:
		while(i < nb) {
			(c, i) = nextchar(b, i, charset);
			if(c == '>') {
				if(quote) {
					# c might be part of string (though not good style)
					# but if line ends before close quote, assume
					# there was an unmatched quote
					for(k := i; k < nb; k++) {
						c = int b[k];
						if(c == quote) {
							val[nv++] = '>';
							continue valloop;
						}
						if(c == '\n') {
							i--;
							break valloop;
						}
					}
				}
				# back up so the attribute loop sees the '>'
				i--;
				break valloop;
			}
			if(quote) {
				if(c == quote)
					break valloop;
				# newlines inside a quoted value are dropped
				if(c == '\n')
					continue valloop;
				if(c == '\t' || c == '\r')
					c = ' ';
			}
			else {
				# an unquoted value ends at the first white space
				if(c<NCTYPE && int (ctype[c]&W))
					break valloop;
			}
			if(c == '&')
				(c, i) = ampersand(b, i);
			val[nv++] = c;
		}
		al = (name, val) :: al;
		if(i < nb)
			c = int b[i];
	}
	ans.attr = al;
	return (ans, i);
}

#
# Expand an entity reference; b[i] is the byte just after the '&'.
# Handles decimal "#nnn" references with 0 < nnn < 256 (160, the
# non-breaking space, is returned as an ordinary space) and the
# names in chartab.  A trailing ';' or newline is consumed.
# Returns (character, index after the reference).  If the reference
# is not recognised the result is ('&', i) so the following text is
# read as ordinary data; at end of buffer it is ('?', i).
#
ampersand(b: array of byte, i: int): (int, int)
{
	starti := i;
	c := 0;
	nb := len b;
	if(i >= nb)
		return ('?', i);
	fnd := 0;
	ans := 0;
	if(int b[i] == '#'){
		i++;
		while(i<nb){
			d := int b[i];
			if(!(int (ctype[d]&D)))
				break;
			# stop accumulating once the value is out of any useful
			# range, so a long digit string cannot wrap around into
			# a small (valid-looking) number
			if(c < 16r110000)
				c = c*10 + d-'0';
			i++;
		}
		if(0<c && c<256) {
			if(c==160)
				c = ' ';   # non-breaking space
			ans = c;
			fnd = 1;
		}
	}
	else {
		s := "";
		k := 0;
		c = int b[i];
		if(int (ctype[c]&(L|U))) {
			while(i<nb) {
				c = int b[i];
				if(!(int (ctype[c]&(L|U|D|N))))
					break;
				s[k++] = c;
				i++;
			}
		}
		(fnd, ans) = T->lookup(chartab, s);
	}
	if(!fnd)
		return ('&', starti);
	if(i<nb && (int b[i]==';' || int b[i]=='\n'))
		i++;
	return (ans, i);
}

#
# Return b[istart:iend] as a string with ASCII upper-case letters
# folded to lower case; every other byte is copied unchanged.
#
lowercase(b: array of byte, istart, iend: int): string
{
	l := "";
	j := 0;
	for(i:=istart; i<iend; i++) {
		c := int b[i];
		if(c < NCTYPE && int (ctype[c]&U))
			l[j] = c-'A'+'a';
		else
			l[j] = c;
		j++;
	}
	return l;
}

#
# Return a copy of s with ASCII lower-case letters folded to upper
# case; every other character is copied unchanged.
#
uppercase(s: string): string
{
	l := "";

	for(i:=0; i<len s; i++) {
		c := s[i];
		if(c < NCTYPE && int (ctype[c]&L))
			l[i] = c+'A'-'a';
		else
			l[i] = c;
	}
	return l;
}

#
# Look up attribute name (exact match; gettag stores names in lower
# case) in attr.  Returns (1, value) for the first match in the list,
# or (0, "") if there is none.
#
attrvalue(attr: list of Attr, name: string): (int, string)
{
	while(attr != nil){
		a := hd attr;
		if(a.name == name)
			return (1, a.value);
		attr = tl attr;
	}
	return (0, "");
}

#
# Find the first token in html whose tag code is tag and return the
# result of attrvalue() for attr on that token; later tokens with the
# same tag are not examined.  Returns (0, "") if no token matches.
#
globalattr(html: array of ref Lex, tag: int, attr: string): (int, string)
{
	for(i:=0; i<len html; i++)
		if(html[i].tag == tag)
			return attrvalue(html[i].attr, attr);
	return (0, "");
}

#
# Scan forward from h[i]: return 1 if one of the listed
# block-level start tags is reached before any Data token,
# and 0 if Data (or the end of the array) comes first.
# Tokens with any other tag are skipped over.
#
isbreak(h: array of ref Lex, i: int): int
{
	for(; i<len h; i++){
		case h[i].tag{
		Th1 or Th2 or Th3 or Th4 or Th5 or Th6 or
		Tbr or Tp or Tbody or Taddress or Tblockquote or
		Tul or Tdl or Tdir or Tmenu or Tol or Tpre or Thr or Tform =>
			return 1;
		Data =>
			return 0;
		}
	}
	return 0;
}

# for debugging
#
# Render a token as text: data as 'text'; a tag as <NAME a='v' ...>
# with a leading '/' for closing tags.  A tag code with no entry in
# htmlstringtab is printed without a name.
#
lex2string(l: ref Lex): string
{
	ans := "";
	tag := l.tag;
	if(tag == HTML->Data)
		ans = "'" + l.text + "'";
	else {
		ans = "<";
		if(tag >= RBRA) {
			tag -= RBRA;
			ans = ans + "/";
		}
		tname := T->revlookup(htmlstringtab, tag);
		if(tname != nil)
			ans = ans + uppercase(tname);
		for(al := l.attr; al != nil; al = tl al) {
			a := hd al;
			ans = ans + " " + a.name + "='" + a.value + "'";
		}
		ans = ans + ">";
	}
	return ans;
}