1 #include <u.h> 2 #include <libc.h> 3 #include <bio.h> 4 #include <draw.h> 5 #include <regexp.h> 6 #include <html.h> 7 #include <ctype.h> 8 #include "dat.h" 9 10 char urlexpr[] = "^(https?|ftp|file|gopher|mailto|news|nntp|telnet|wais|prospero)://([a-zA-Z0-9_@\\-]+([.:][a-zA-Z0-9_@\\-]+)*)"; 11 Reprog *urlprog; 12 13 int inword = 0; 14 int col = 0; 15 int wordi = 0; 16 17 char* 18 loadhtml(int fd) 19 { 20 URLwin *u; 21 Bytes *b; 22 int n; 23 char buf[4096]; 24 25 u = emalloc(sizeof(URLwin)); 26 u->infd = fd; 27 u->outfd = 1; 28 u->url = estrdup(url); 29 u->type = TextHtml; 30 31 b = emalloc(sizeof(Bytes)); 32 while((n = read(fd, buf, sizeof buf)) > 0) 33 growbytes(b, buf, n); 34 if(b->b == nil) 35 return nil; /* empty file */ 36 rendertext(u, b); 37 freeurlwin(u); 38 return nil; 39 } 40 41 char* 42 runetobyte(Rune *r, int n) 43 { 44 char *s; 45 46 if(n == 0) 47 return emalloc(1); 48 s = smprint("%.*S", n, r); 49 if(s == nil) 50 error("malloc failed"); 51 return s; 52 } 53 54 int 55 closingpunct(int c) 56 { 57 return strchr(".,:;'\")]}>!?", c) != nil; 58 } 59 60 void 61 emitword(Bytes *b, Rune *r, int nr) 62 { 63 char *s; 64 int space; 65 66 if(nr == 0) 67 return; 68 s = smprint("%.*S", nr, r); 69 space = (b->n>0) && !isspace(b->b[b->n-1]) && !closingpunct(r[0]); 70 if(col>0 && col+space+nr > width){ 71 growbytes(b, "\n", 1); 72 space = 0; 73 col = 0; 74 } 75 if(space && col>0){ 76 growbytes(b, " ", 1); 77 col++; 78 } 79 growbytes(b, s, strlen(s)); 80 col += nr; 81 free(s); 82 inword = 0; 83 } 84 85 void 86 renderrunes(Bytes *b, Rune *r) 87 { 88 int i, n; 89 90 n = runestrlen(r); 91 for(i=0; i<n; i++){ 92 switch(r[i]){ 93 case '\n': 94 if(inword) 95 emitword(b, r+wordi, i-wordi); 96 col = 0; 97 if(b->n == 0) 98 break; /* don't start with blank lines */ 99 if(b->n<2 || b->b[b->n-1]!='\n' || b->b[b->n-2]!='\n') 100 growbytes(b, "\n", 1); 101 break; 102 case ' ': 103 if(inword) 104 emitword(b, r+wordi, i-wordi); 105 break; 106 default: 107 if(!inword) 108 wordi = i; 109 inword = 1; 110 break; 111 } 112 } 113 if(inword) 114 emitword(b, r+wordi, i-wordi); 115 } 116 117 void 118 renderbytes(Bytes *b, char *fmt, ...) 119 { 120 Rune *r; 121 va_list arg; 122 123 va_start(arg, fmt); 124 r = runevsmprint(fmt, arg); 125 va_end(arg); 126 renderrunes(b, r); 127 free(r); 128 } 129 130 char* 131 baseurl(char *url) 132 { 133 char *base, *slash; 134 Resub rs[10]; 135 136 if(url == nil) 137 return nil; 138 if(urlprog == nil){ 139 urlprog = regcomp(urlexpr); 140 if(urlprog == nil) 141 error("can't compile URL regexp"); 142 } 143 memset(rs, 0, sizeof rs); 144 if(regexec(urlprog, url, rs, nelem(rs)) == 0) 145 return nil; 146 base = estrdup(url); 147 slash = strrchr(base, '/'); 148 if(slash!=nil && slash>=&base[rs[0].ep-rs[0].sp]) 149 *slash = '\0'; 150 else 151 base[rs[0].ep-rs[0].sp] = '\0'; 152 return base; 153 } 154 155 char* 156 fullurl(URLwin *u, Rune *rhref) 157 { 158 char *base, *href, *hrefbase; 159 char *result; 160 161 if(rhref == nil) 162 return estrdup("NULL URL"); 163 href = runetobyte(rhref, runestrlen(rhref)); 164 hrefbase = baseurl(href); 165 result = nil; 166 if(hrefbase==nil && (base = baseurl(u->url))!=nil){ 167 result = estrdup(base); 168 if(base[strlen(base)-1]!='/' && (href==nil || href[0]!='/')) 169 result = eappend(result, "/", ""); 170 free(base); 171 } 172 if(href){ 173 if(result) 174 result = eappend(result, "", href); 175 else 176 result = estrdup(href); 177 } 178 free(hrefbase); 179 if(result == nil) 180 return estrdup("***unknown***"); 181 return result; 182 } 183 184 void 185 render(URLwin *u, Bytes *t, Item *items, int curanchor) 186 { 187 Item *il; 188 Itext *it; 189 Ifloat *ifl; 190 Ispacer *is; 191 Itable *ita; 192 Iimage *im; 193 Anchor *a; 194 Table *tab; 195 Tablecell *cell; 196 char *href; 197 198 inword = 0; 199 col = 0; 200 wordi = 0; 201 202 for(il=items; il!=nil; il=il->next){ 203 if(il->state & IFbrk) 204 renderbytes(t, "\n"); 205 if(il->state & IFbrksp) 206 renderbytes(t, "\n"); 207 208 switch(il->tag){ 209 case Itexttag: 210 it = (Itext*)il; 211 if(it->state & IFwrap) 212 renderrunes(t, it->s); 213 else 214 emitword(t, it->s, runestrlen(it->s)); 215 break; 216 case Iruletag: 217 if(t->n>0 && t->b[t->n-1]!='\n') 218 renderbytes(t, "\n"); 219 renderbytes(t, "=======\n"); 220 break; 221 case Iimagetag: 222 if(!aflag) 223 break; 224 im = (Iimage*)il; 225 if(im->imsrc){ 226 href = fullurl(u, im->imsrc); 227 renderbytes(t, "[image %s]", href); 228 free(href); 229 } 230 break; 231 case Iformfieldtag: 232 if(aflag) 233 renderbytes(t, "[formfield]"); 234 break; 235 case Itabletag: 236 ita = (Itable*)il; 237 tab = ita->table; 238 for(cell=tab->cells; cell!=nil; cell=cell->next){ 239 render(u, t, cell->content, curanchor); 240 } 241 if(t->n>0 && t->b[t->n-1]!='\n') 242 renderbytes(t, "\n"); 243 break; 244 case Ifloattag: 245 ifl = (Ifloat*)il; 246 render(u, t, ifl->item, curanchor); 247 break; 248 case Ispacertag: 249 is = (Ispacer*)il; 250 if(is->spkind != ISPnull) 251 renderbytes(t, " "); 252 break; 253 default: 254 error("unknown item tag %d\n", il->tag); 255 } 256 if(il->anchorid != 0 && il->anchorid!=curanchor){ 257 for(a=u->docinfo->anchors; a!=nil; a=a->next) 258 if(aflag && a->index == il->anchorid){ 259 href = fullurl(u, a->href); 260 renderbytes(t, "[%s]", href); 261 free(href); 262 break; 263 } 264 curanchor = il->anchorid; 265 } 266 } 267 if(t->n>0 && t->b[t->n-1]!='\n') 268 renderbytes(t, "\n"); 269 } 270 271 void 272 rerender(URLwin *u) 273 { 274 Bytes *t; 275 276 t = emalloc(sizeof(Bytes)); 277 278 render(u, t, u->items, 0); 279 280 if(t->n) 281 write(u->outfd, (char*)t->b, t->n); 282 free(t->b); 283 free(t); 284 } 285 286 /* 287 * Somewhat of a hack. Not a full parse, just looks for strings in the beginning 288 * of the document (cistrstr only looks at first somewhat bytes). 289 */ 290 int 291 charset(char *s) 292 { 293 char *meta, *emeta, *charset; 294 295 if(defcharset == 0) 296 defcharset = ISO_8859_1; 297 meta = cistrstr(s, "<meta"); 298 if(meta == nil) 299 return defcharset; 300 for(emeta=meta; *emeta!='>' && *emeta!='\0'; emeta++) 301 ; 302 charset = cistrstr(s, "charset="); 303 if(charset == nil) 304 return defcharset; 305 charset += 8; 306 if(*charset == '"') 307 charset++; 308 if(cistrncmp(charset, "utf-8", 5) || cistrncmp(charset, "utf8", 4)) 309 return UTF_8; 310 return defcharset; 311 } 312 313 void 314 rendertext(URLwin *u, Bytes *b) 315 { 316 Rune *rurl; 317 318 rurl = toStr((uchar*)u->url, strlen(u->url), ISO_8859_1); 319 u->items = parsehtml(b->b, b->n, rurl, u->type, charset((char*)b->b), &u->docinfo); 320 // free(rurl); 321 322 rerender(u); 323 } 324 325 326 void 327 freeurlwin(URLwin *u) 328 { 329 freeitems(u->items); 330 u->items = nil; 331 freedocinfo(u->docinfo); 332 u->docinfo = nil; 333 free(u); 334 } 335