1 #include <u.h> 2 #include <libc.h> 3 #include <bio.h> 4 #include <draw.h> 5 #include <regexp.h> 6 #include <html.h> 7 #include <ctype.h> 8 #include "dat.h" 9 10 char urlexpr[] = 11 "^(https?|ftp|file|gopher|mailto|news|nntp|telnet|wais|prospero)" 12 "://([a-zA-Z0-9_@\\-]+([.:][a-zA-Z0-9_@\\-]+)*)"; 13 Reprog *urlprog; 14 15 int newitextitem; 16 int inword = 0; 17 int col = 0; 18 int wordi = 0; 19 20 char* 21 loadhtml(int fd) 22 { 23 URLwin *u; 24 Bytes *b; 25 int n; 26 char buf[4096]; 27 28 u = emalloc(sizeof(URLwin)); 29 u->infd = fd; 30 u->outfd = 1; 31 u->url = estrdup(url); 32 u->type = TextHtml; 33 34 b = emalloc(sizeof(Bytes)); 35 while((n = read(fd, buf, sizeof buf)) > 0) 36 growbytes(b, buf, n); 37 if(b->b == nil) 38 return nil; /* empty file */ 39 rendertext(u, b); 40 freeurlwin(u); 41 return nil; 42 } 43 44 char* 45 runetobyte(Rune *r, int n) 46 { 47 char *s; 48 49 if(n == 0) 50 return emalloc(1); 51 s = smprint("%.*S", n, r); 52 if(s == nil) 53 error("malloc failed"); 54 return s; 55 } 56 57 int 58 closingpunct(char c) 59 { 60 return strchr(".,:;'\")]}>!?", c) != nil; 61 } 62 63 void 64 emitword(Bytes *b, Rune *r, int nr) 65 { 66 char *s; 67 int space; 68 69 if(nr == 0) 70 return; 71 s = smprint("%.*S", nr, r); 72 space = b->n > 0 && !isspace(b->b[b->n-1]) && (!newitextitem || !closingpunct(*s)); 73 if(col > 0 && col+space+nr > width){ 74 growbytes(b, "\n", 1); 75 space = 0; 76 col = 0; 77 } 78 if(space && col > 0){ 79 growbytes(b, " ", 1); 80 col++; 81 } 82 growbytes(b, s, strlen(s)); 83 col += nr; 84 free(s); 85 inword = 0; 86 newitextitem = 0; 87 } 88 89 void 90 renderrunes(Bytes *b, Rune *r) 91 { 92 int i, n; 93 94 newitextitem = 1; 95 96 n = runestrlen(r); 97 for(i=0; i<n; i++){ 98 switch(r[i]){ 99 case '\n': 100 if(inword) 101 emitword(b, r+wordi, i-wordi); 102 col = 0; 103 if(b->n == 0) 104 break; /* don't start with blank lines */ 105 if(b->n<2 || b->b[b->n-1]!='\n' || b->b[b->n-2]!='\n') 106 growbytes(b, "\n", 1); 107 break; 108 case ' ': 109 if(inword) 110 emitword(b, r+wordi, i-wordi); 111 break; 112 default: 113 if(!inword) 114 wordi = i; 115 inword = 1; 116 break; 117 } 118 } 119 if(inword) 120 emitword(b, r+wordi, i-wordi); 121 } 122 123 void 124 renderbytes(Bytes *b, char *fmt, ...) 125 { 126 Rune *r; 127 va_list arg; 128 129 va_start(arg, fmt); 130 r = runevsmprint(fmt, arg); 131 va_end(arg); 132 renderrunes(b, r); 133 free(r); 134 } 135 136 char* 137 baseurl(char *url) 138 { 139 char *base, *slash; 140 Resub rs[10]; 141 142 if(url == nil) 143 return nil; 144 if(urlprog == nil){ 145 urlprog = regcomp(urlexpr); 146 if(urlprog == nil) 147 error("can't compile URL regexp"); 148 } 149 memset(rs, 0, sizeof rs); 150 if(regexec(urlprog, url, rs, nelem(rs)) == 0) 151 return nil; 152 base = estrdup(url); 153 slash = strrchr(base, '/'); 154 if(slash!=nil && slash>=&base[rs[0].ep-rs[0].sp]) 155 *slash = '\0'; 156 else 157 base[rs[0].ep-rs[0].sp] = '\0'; 158 return base; 159 } 160 161 char* 162 fullurl(URLwin *u, Rune *rhref) 163 { 164 char *base, *href, *hrefbase; 165 char *result; 166 167 if(rhref == nil) 168 return estrdup("NULL URL"); 169 href = runetobyte(rhref, runestrlen(rhref)); 170 hrefbase = baseurl(href); 171 result = nil; 172 if(hrefbase==nil && (base = baseurl(u->url))!=nil){ 173 result = estrdup(base); 174 if(base[strlen(base)-1]!='/' && (href==nil || href[0]!='/')) 175 result = eappend(result, "/", ""); 176 free(base); 177 } 178 if(href){ 179 if(result) 180 result = eappend(result, "", href); 181 else 182 result = estrdup(href); 183 } 184 free(hrefbase); 185 if(result == nil) 186 return estrdup("***unknown***"); 187 return result; 188 } 189 190 void 191 render(URLwin *u, Bytes *t, Item *items, int curanchor) 192 { 193 Item *il; 194 Itext *it; 195 Ifloat *ifl; 196 Ispacer *is; 197 Itable *ita; 198 Iimage *im; 199 Anchor *a; 200 Table *tab; 201 Tablecell *cell; 202 char *href; 203 204 inword = 0; 205 col = 0; 206 wordi = 0; 207 208 for(il=items; il!=nil; il=il->next){ 209 if(il->state & IFbrk) 210 renderbytes(t, "\n"); 211 if(il->state & IFbrksp) 212 renderbytes(t, "\n"); 213 214 switch(il->tag){ 215 case Itexttag: 216 it = (Itext*)il; 217 if(it->state & IFwrap) 218 renderrunes(t, it->s); 219 else { 220 newitextitem = 1; 221 emitword(t, it->s, runestrlen(it->s)); 222 } 223 break; 224 case Iruletag: 225 if(t->n>0 && t->b[t->n-1]!='\n') 226 renderbytes(t, "\n"); 227 renderbytes(t, "=======\n"); 228 break; 229 case Iimagetag: 230 if(!aflag) 231 break; 232 im = (Iimage*)il; 233 if(im->imsrc){ 234 href = fullurl(u, im->imsrc); 235 renderbytes(t, "[image %s]", href); 236 free(href); 237 } 238 break; 239 case Iformfieldtag: 240 if(aflag) 241 renderbytes(t, "[formfield]"); 242 break; 243 case Itabletag: 244 ita = (Itable*)il; 245 tab = ita->table; 246 for(cell=tab->cells; cell!=nil; cell=cell->next){ 247 render(u, t, cell->content, curanchor); 248 } 249 if(t->n>0 && t->b[t->n-1]!='\n') 250 renderbytes(t, "\n"); 251 break; 252 case Ifloattag: 253 ifl = (Ifloat*)il; 254 render(u, t, ifl->item, curanchor); 255 break; 256 case Ispacertag: 257 is = (Ispacer*)il; 258 if(is->spkind != ISPnull) 259 renderbytes(t, " "); 260 break; 261 default: 262 error("unknown item tag %d\n", il->tag); 263 } 264 if(il->anchorid != 0 && il->anchorid!=curanchor){ 265 for(a=u->docinfo->anchors; a!=nil; a=a->next) 266 if(aflag && a->index == il->anchorid){ 267 href = fullurl(u, a->href); 268 renderbytes(t, "[%s]", href); 269 free(href); 270 break; 271 } 272 curanchor = il->anchorid; 273 } 274 } 275 if(t->n>0 && t->b[t->n-1]!='\n') 276 renderbytes(t, "\n"); 277 } 278 279 void 280 rerender(URLwin *u) 281 { 282 Bytes *t; 283 284 t = emalloc(sizeof(Bytes)); 285 286 render(u, t, u->items, 0); 287 288 if(t->n) 289 write(u->outfd, (char*)t->b, t->n); 290 free(t->b); 291 free(t); 292 } 293 294 /* 295 * Somewhat of a hack. Not a full parse, just looks for strings in the beginning 296 * of the document (cistrstr only looks at first somewhat bytes). 297 */ 298 int 299 charset(char *s) 300 { 301 char *meta, *emeta, *charset; 302 303 if(defcharset == 0) 304 defcharset = ISO_8859_1; 305 meta = cistrstr(s, "<meta"); 306 if(meta == nil) 307 return defcharset; 308 for(emeta=meta; *emeta!='>' && *emeta!='\0'; emeta++) 309 ; 310 charset = cistrstr(s, "charset="); 311 if(charset == nil) 312 return defcharset; 313 charset += 8; 314 if(*charset == '"') 315 charset++; 316 if(cistrncmp(charset, "utf-8", 5) || cistrncmp(charset, "utf8", 4)) 317 return UTF_8; 318 return defcharset; 319 } 320 321 void 322 rendertext(URLwin *u, Bytes *b) 323 { 324 Rune *rurl; 325 326 rurl = toStr((uchar*)u->url, strlen(u->url), ISO_8859_1); 327 u->items = parsehtml(b->b, b->n, rurl, u->type, charset((char*)b->b), &u->docinfo); 328 // free(rurl); 329 330 rerender(u); 331 } 332 333 334 void 335 freeurlwin(URLwin *u) 336 { 337 freeitems(u->items); 338 u->items = nil; 339 freedocinfo(u->docinfo); 340 u->docinfo = nil; 341 free(u); 342 } 343