1 #include <u.h> 2 #include <libc.h> 3 #include <bio.h> 4 #include "hdr.h" 5 #include "conv.h" 6 7 typedef struct Hchar Hchar; 8 struct Hchar 9 { 10 char *s; 11 Rune r; 12 }; 13 14 /* <, >, ", & intentionally omitted */ 15 16 static Hchar byname[] = 17 { 18 {"AElig", 198}, 19 {"Aacute", 193}, 20 {"Acirc", 194}, 21 {"Agrave", 192}, 22 {"Alpha", 913}, 23 {"Aring", 197}, 24 {"Atilde", 195}, 25 {"Auml", 196}, 26 {"Beta", 914}, 27 {"Ccedil", 199}, 28 {"Chi", 935}, 29 {"Dagger", 8225}, 30 {"Delta", 916}, 31 {"ETH", 208}, 32 {"Eacute", 201}, 33 {"Ecirc", 202}, 34 {"Egrave", 200}, 35 {"Epsilon", 917}, 36 {"Eta", 919}, 37 {"Euml", 203}, 38 {"Gamma", 915}, 39 {"Iacute", 205}, 40 {"Icirc", 206}, 41 {"Igrave", 204}, 42 {"Iota", 921}, 43 {"Iuml", 207}, 44 {"Kappa", 922}, 45 {"Lambda", 923}, 46 {"Mu", 924}, 47 {"Ntilde", 209}, 48 {"Nu", 925}, 49 {"OElig", 338}, 50 {"Oacute", 211}, 51 {"Ocirc", 212}, 52 {"Ograve", 210}, 53 {"Omega", 937}, 54 {"Omicron", 927}, 55 {"Oslash", 216}, 56 {"Otilde", 213}, 57 {"Ouml", 214}, 58 {"Phi", 934}, 59 {"Pi", 928}, 60 {"Prime", 8243}, 61 {"Psi", 936}, 62 {"Rho", 929}, 63 {"Scaron", 352}, 64 {"Sigma", 931}, 65 {"THORN", 222}, 66 {"Tau", 932}, 67 {"Theta", 920}, 68 {"Uacute", 218}, 69 {"Ucirc", 219}, 70 {"Ugrave", 217}, 71 {"Upsilon", 933}, 72 {"Uuml", 220}, 73 {"Xi", 926}, 74 {"Yacute", 221}, 75 {"Yuml", 376}, 76 {"Zeta", 918}, 77 {"aacute", 225}, 78 {"acirc", 226}, 79 {"acute", 180}, 80 {"aelig", 230}, 81 {"agrave", 224}, 82 {"alefsym", 8501}, 83 {"alpha", 945}, 84 {"amp", 38}, 85 {"and", 8743}, 86 {"ang", 8736}, 87 {"aring", 229}, 88 {"asymp", 8776}, 89 {"atilde", 227}, 90 {"auml", 228}, 91 {"bdquo", 8222}, 92 {"beta", 946}, 93 {"brvbar", 166}, 94 {"bull", 8226}, 95 {"cap", 8745}, 96 {"ccedil", 231}, 97 {"cdots", 8943}, 98 {"cedil", 184}, 99 {"cent", 162}, 100 {"chi", 967}, 101 {"circ", 710}, 102 {"clubs", 9827}, 103 {"cong", 8773}, 104 {"copy", 169}, 105 {"crarr", 8629}, 106 {"cup", 8746}, 107 {"curren", 164}, 108 {"dArr", 8659}, 109 {"dagger", 8224}, 110 {"darr", 8595}, 111 {"ddots", 8945}, 112 {"deg", 176}, 113 {"delta", 948}, 114 {"diams", 9830}, 115 {"divide", 247}, 116 {"eacute", 233}, 117 {"ecirc", 234}, 118 {"egrave", 232}, 119 {"emdash", 8212}, /* non-standard but commonly used */ 120 {"empty", 8709}, 121 {"emsp", 8195}, 122 {"endash", 8211}, /* non-standard but commonly used */ 123 {"ensp", 8194}, 124 {"epsilon", 949}, 125 {"equiv", 8801}, 126 {"eta", 951}, 127 {"eth", 240}, 128 {"euml", 235}, 129 {"euro", 8364}, 130 {"exist", 8707}, 131 {"fnof", 402}, 132 {"forall", 8704}, 133 {"frac12", 189}, 134 {"frac14", 188}, 135 {"frac34", 190}, 136 {"frasl", 8260}, 137 {"gamma", 947}, 138 {"ge", 8805}, 139 {"gt", 62}, 140 {"hArr", 8660}, 141 {"harr", 8596}, 142 {"hearts", 9829}, 143 {"hellip", 8230}, 144 {"iacute", 237}, 145 {"icirc", 238}, 146 {"iexcl", 161}, 147 {"igrave", 236}, 148 {"image", 8465}, 149 {"infin", 8734}, 150 {"int", 8747}, 151 {"iota", 953}, 152 {"iquest", 191}, 153 {"isin", 8712}, 154 {"iuml", 239}, 155 {"kappa", 954}, 156 {"lArr", 8656}, 157 {"lambda", 955}, 158 {"lang", 9001}, 159 {"laquo", 171}, 160 {"larr", 8592}, 161 {"lceil", 8968}, 162 {"ldots", 8230}, 163 {"ldquo", 8220}, 164 {"le", 8804}, 165 {"lfloor", 8970}, 166 {"lowast", 8727}, 167 {"loz", 9674}, 168 {"lrm", 8206}, 169 {"lsaquo", 8249}, 170 {"lsquo", 8216}, 171 {"lt", 60}, 172 {"macr", 175}, 173 {"mdash", 8212}, 174 {"micro", 181}, 175 {"middot", 183}, 176 {"minus", 8722}, 177 {"mu", 956}, 178 {"nabla", 8711}, 179 {"nbsp", 160}, 180 {"ndash", 8211}, 181 {"ne", 8800}, 182 {"ni", 8715}, 183 {"not", 172}, 184 {"notin", 8713}, 185 {"nsub", 8836}, 186 {"ntilde", 241}, 187 {"nu", 957}, 188 {"oacute", 243}, 189 {"ocirc", 244}, 190 {"oelig", 339}, 191 {"ograve", 242}, 192 {"oline", 8254}, 193 {"omega", 969}, 194 {"omicron", 959}, 195 {"oplus", 8853}, 196 {"or", 8744}, 197 {"ordf", 170}, 198 {"ordm", 186}, 199 {"oslash", 248}, 200 {"otilde", 245}, 201 {"otimes", 8855}, 202 {"ouml", 246}, 203 {"para", 182}, 204 {"part", 8706}, 205 {"permil", 8240}, 206 {"perp", 8869}, 207 {"phi", 966}, 208 {"pi", 960}, 209 {"piv", 982}, 210 {"plusmn", 177}, 211 {"pound", 163}, 212 {"prime", 8242}, 213 {"prod", 8719}, 214 {"prop", 8733}, 215 {"psi", 968}, 216 {"quad", 8193}, 217 {"quot", 34}, 218 {"rArr", 8658}, 219 {"radic", 8730}, 220 {"rang", 9002}, 221 {"raquo", 187}, 222 {"rarr", 8594}, 223 {"rceil", 8969}, 224 {"rdquo", 8221}, 225 {"real", 8476}, 226 {"reg", 174}, 227 {"rfloor", 8971}, 228 {"rho", 961}, 229 {"rlm", 8207}, 230 {"rsaquo", 8250}, 231 {"rsquo", 8217}, 232 {"sbquo", 8218}, 233 {"scaron", 353}, 234 {"sdot", 8901}, 235 {"sect", 167}, 236 {"shy", 173}, 237 {"sigma", 963}, 238 {"sigmaf", 962}, 239 {"sim", 8764}, 240 {"sp", 8194}, 241 {"spades", 9824}, 242 {"sub", 8834}, 243 {"sube", 8838}, 244 {"sum", 8721}, 245 {"sup", 8835}, 246 {"sup1", 185}, 247 {"sup2", 178}, 248 {"sup3", 179}, 249 {"supe", 8839}, 250 {"szlig", 223}, 251 {"tau", 964}, 252 {"there4", 8756}, 253 {"theta", 952}, 254 {"thetasym", 977}, 255 {"thinsp", 8201}, 256 {"thorn", 254}, 257 {"tilde", 732}, 258 {"times", 215}, 259 {"trade", 8482}, 260 {"uArr", 8657}, 261 {"uacute", 250}, 262 {"uarr", 8593}, 263 {"ucirc", 251}, 264 {"ugrave", 249}, 265 {"uml", 168}, 266 {"upsih", 978}, 267 {"upsilon", 965}, 268 {"uuml", 252}, 269 {"varepsilon", 8712}, 270 {"varphi", 981}, 271 {"varpi", 982}, 272 {"varrho", 1009}, 273 {"vdots", 8942}, 274 {"vsigma", 962}, 275 {"vtheta", 977}, 276 {"weierp", 8472}, 277 {"xi", 958}, 278 {"yacute", 253}, 279 {"yen", 165}, 280 {"yuml", 255}, 281 {"zeta", 950}, 282 {"zwj", 8205}, 283 {"zwnj", 8204} 284 }; 285 286 static Hchar byrune[nelem(byname)]; 287 288 static int 289 hnamecmp(const void *va, const void *vb) 290 { 291 Hchar *a, *b; 292 293 a = (Hchar*)va; 294 b = (Hchar*)vb; 295 return strcmp(a->s, b->s); 296 } 297 298 static int 299 hrunecmp(const void *va, const void *vb) 300 { 301 Hchar *a, *b; 302 303 a = (Hchar*)va; 304 b = (Hchar*)vb; 305 return a->r - b->r; 306 } 307 308 static void 309 html_init(void) 310 { 311 static int init; 312 313 if(init) 314 return; 315 init = 1; 316 memmove(byrune, byname, sizeof byrune); 317 qsort(byname, nelem(byname), sizeof byname[0], hnamecmp); 318 qsort(byrune, nelem(byrune), sizeof byrune[0], hrunecmp); 319 } 320 321 static Rune 322 findbyname(char *s) 323 { 324 Hchar *h; 325 int n, m, x; 326 327 h = byname; 328 n = nelem(byname); 329 while(n > 0){ 330 m = n/2; 331 x = strcmp(h[m].s, s); 332 if(x == 0) 333 return h[m].r; 334 if(x < 0){ 335 h += m+1; 336 n -= m+1; 337 }else 338 n = m; 339 } 340 return Runeerror; 341 } 342 343 static char* 344 findbyrune(Rune r) 345 { 346 Hchar *h; 347 int n, m; 348 349 h = byrune; 350 n = nelem(byrune); 351 while(n > 0){ 352 m = n/2; 353 if(h[m].r == r) 354 return h[m].s; 355 if(h[m].r < r){ 356 h += m+1; 357 n -= m+1; 358 }else 359 n = m; 360 } 361 return nil; 362 } 363 364 void 365 html_in(int fd, long *x, struct convert *out) 366 { 367 char buf[100], *p; 368 Biobuf b; 369 Rune rbuf[N]; 370 Rune *r, *er; 371 int c, i; 372 373 USED(x); 374 375 html_init(); 376 r = rbuf; 377 er = rbuf+N; 378 Binit(&b, fd, OREAD); 379 while((c = Bgetrune(&b)) != Beof){ 380 if(r >= er){ 381 OUT(out, rbuf, r-rbuf); 382 r = rbuf; 383 } 384 if(c == '&'){ 385 buf[0] = c; 386 for(i=1; i<nelem(buf)-1;){ 387 c = Bgetc(&b); 388 if(c == Beof) 389 break; 390 buf[i++] = c; 391 if(strchr("; \t\r\n", c)) 392 break; 393 } 394 buf[i] = 0; 395 if(buf[i-1] == ';'){ 396 buf[i-1] = 0; 397 if((c = findbyname(buf+1)) != Runeerror){ 398 *r++ = c; 399 continue; 400 } 401 buf[i-1] = ';'; 402 if(buf[1] == '#'){ 403 if(buf[2] == 'x') 404 c = strtol(buf+3, &p, 16); 405 else 406 c = strtol(buf+2, &p, 10); 407 if(*p != ';' || c >= NRUNE || c < 0) 408 goto bad; 409 *r++ = c; 410 continue; 411 } 412 } 413 bad: 414 for(p=buf; p<buf+i; ){ 415 p += chartorune(r++, p); 416 if(r >= er){ 417 OUT(out, rbuf, r-rbuf); 418 r = rbuf; 419 } 420 } 421 continue; 422 } 423 *r++ = c; 424 } 425 if(r > rbuf) 426 OUT(out, rbuf, r-rbuf); 427 OUT(out, rbuf, 0); 428 } 429 430 /* 431 * use biobuf because can use more than UTFmax bytes per rune 432 */ 433 void 434 html_out(Rune *r, int n, long *x) 435 { 436 char *s; 437 Biobuf b; 438 Rune *er; 439 440 USED(x); 441 html_init(); 442 Binit(&b, 1, OWRITE); 443 er = r+n; 444 for(; r<er; r++){ 445 if(*r < Runeself) 446 Bputrune(&b, *r); 447 else if((s = findbyrune(*r)) != nil) 448 Bprint(&b, "&%s;", s); 449 else 450 Bprint(&b, "&#%d;", *r); 451 } 452 Bflush(&b); 453 } 454 455