1 #include <u.h> 2 #include <libc.h> 3 #include <bio.h> 4 #include <draw.h> 5 #include <regexp.h> 6 #include <html.h> 7 #include <ctype.h> 8 #include "dat.h" 9 10 char urlexpr[] = 11 "^(https?|ftp|file|gopher|mailto|news|nntp|telnet|wais|prospero)" 12 "://([a-zA-Z0-9_@\\-]+([.:][a-zA-Z0-9_@\\-]+)*)"; 13 Reprog *urlprog; 14 15 int inword = 0; 16 int col = 0; 17 int wordi = 0; 18 19 char* 20 loadhtml(int fd) 21 { 22 URLwin *u; 23 Bytes *b; 24 int n; 25 char buf[4096]; 26 27 u = emalloc(sizeof(URLwin)); 28 u->infd = fd; 29 u->outfd = 1; 30 u->url = estrdup(url); 31 u->type = TextHtml; 32 33 b = emalloc(sizeof(Bytes)); 34 while((n = read(fd, buf, sizeof buf)) > 0) 35 growbytes(b, buf, n); 36 if(b->b == nil) 37 return nil; /* empty file */ 38 rendertext(u, b); 39 freeurlwin(u); 40 return nil; 41 } 42 43 char* 44 runetobyte(Rune *r, int n) 45 { 46 char *s; 47 48 if(n == 0) 49 return emalloc(1); 50 s = smprint("%.*S", n, r); 51 if(s == nil) 52 error("malloc failed"); 53 return s; 54 } 55 56 int 57 closingpunct(char c) 58 { 59 return strchr(".,:;'\")]}>!?", c) != nil; 60 } 61 62 void 63 emitword(Bytes *b, Rune *r, int nr) 64 { 65 char *s; 66 int space; 67 68 if(nr == 0) 69 return; 70 s = smprint("%.*S", nr, r); 71 space = b->n > 0 && !isspace(b->b[b->n-1]) && !closingpunct(*s); 72 if(col > 0 && col+space+nr > width){ 73 growbytes(b, "\n", 1); 74 space = 0; 75 col = 0; 76 } 77 if(space && col > 0){ 78 growbytes(b, " ", 1); 79 col++; 80 } 81 growbytes(b, s, strlen(s)); 82 col += nr; 83 free(s); 84 inword = 0; 85 } 86 87 void 88 renderrunes(Bytes *b, Rune *r) 89 { 90 int i, n; 91 92 n = runestrlen(r); 93 for(i=0; i<n; i++){ 94 switch(r[i]){ 95 case '\n': 96 if(inword) 97 emitword(b, r+wordi, i-wordi); 98 col = 0; 99 if(b->n == 0) 100 break; /* don't start with blank lines */ 101 if(b->n<2 || b->b[b->n-1]!='\n' || b->b[b->n-2]!='\n') 102 growbytes(b, "\n", 1); 103 break; 104 case ' ': 105 if(inword) 106 emitword(b, r+wordi, i-wordi); 107 break; 108 default: 109 if(!inword) 110 wordi = i; 111 inword = 1; 112 break; 113 } 114 } 115 if(inword) 116 emitword(b, r+wordi, i-wordi); 117 } 118 119 void 120 renderbytes(Bytes *b, char *fmt, ...) 121 { 122 Rune *r; 123 va_list arg; 124 125 va_start(arg, fmt); 126 r = runevsmprint(fmt, arg); 127 va_end(arg); 128 renderrunes(b, r); 129 free(r); 130 } 131 132 char* 133 baseurl(char *url) 134 { 135 char *base, *slash; 136 Resub rs[10]; 137 138 if(url == nil) 139 return nil; 140 if(urlprog == nil){ 141 urlprog = regcomp(urlexpr); 142 if(urlprog == nil) 143 error("can't compile URL regexp"); 144 } 145 memset(rs, 0, sizeof rs); 146 if(regexec(urlprog, url, rs, nelem(rs)) == 0) 147 return nil; 148 base = estrdup(url); 149 slash = strrchr(base, '/'); 150 if(slash!=nil && slash>=&base[rs[0].ep-rs[0].sp]) 151 *slash = '\0'; 152 else 153 base[rs[0].ep-rs[0].sp] = '\0'; 154 return base; 155 } 156 157 char* 158 fullurl(URLwin *u, Rune *rhref) 159 { 160 char *base, *href, *hrefbase; 161 char *result; 162 163 if(rhref == nil) 164 return estrdup("NULL URL"); 165 href = runetobyte(rhref, runestrlen(rhref)); 166 hrefbase = baseurl(href); 167 result = nil; 168 if(hrefbase==nil && (base = baseurl(u->url))!=nil){ 169 result = estrdup(base); 170 if(base[strlen(base)-1]!='/' && (href==nil || href[0]!='/')) 171 result = eappend(result, "/", ""); 172 free(base); 173 } 174 if(href){ 175 if(result) 176 result = eappend(result, "", href); 177 else 178 result = estrdup(href); 179 } 180 free(hrefbase); 181 if(result == nil) 182 return estrdup("***unknown***"); 183 return result; 184 } 185 186 void 187 render(URLwin *u, Bytes *t, Item *items, int curanchor) 188 { 189 Item *il; 190 Itext *it; 191 Ifloat *ifl; 192 Ispacer *is; 193 Itable *ita; 194 Iimage *im; 195 Anchor *a; 196 Table *tab; 197 Tablecell *cell; 198 char *href; 199 200 inword = 0; 201 col = 0; 202 wordi = 0; 203 204 for(il=items; il!=nil; il=il->next){ 205 if(il->state & IFbrk) 206 renderbytes(t, "\n"); 207 if(il->state & IFbrksp) 208 renderbytes(t, "\n"); 209 210 switch(il->tag){ 211 case Itexttag: 212 it = (Itext*)il; 213 if(it->state & IFwrap) 214 renderrunes(t, it->s); 215 else 216 emitword(t, it->s, runestrlen(it->s)); 217 break; 218 case Iruletag: 219 if(t->n>0 && t->b[t->n-1]!='\n') 220 renderbytes(t, "\n"); 221 renderbytes(t, "=======\n"); 222 break; 223 case Iimagetag: 224 if(!aflag) 225 break; 226 im = (Iimage*)il; 227 if(im->imsrc){ 228 href = fullurl(u, im->imsrc); 229 renderbytes(t, "[image %s]", href); 230 free(href); 231 } 232 break; 233 case Iformfieldtag: 234 if(aflag) 235 renderbytes(t, "[formfield]"); 236 break; 237 case Itabletag: 238 ita = (Itable*)il; 239 tab = ita->table; 240 for(cell=tab->cells; cell!=nil; cell=cell->next){ 241 render(u, t, cell->content, curanchor); 242 } 243 if(t->n>0 && t->b[t->n-1]!='\n') 244 renderbytes(t, "\n"); 245 break; 246 case Ifloattag: 247 ifl = (Ifloat*)il; 248 render(u, t, ifl->item, curanchor); 249 break; 250 case Ispacertag: 251 is = (Ispacer*)il; 252 if(is->spkind != ISPnull) 253 renderbytes(t, " "); 254 break; 255 default: 256 error("unknown item tag %d\n", il->tag); 257 } 258 if(il->anchorid != 0 && il->anchorid!=curanchor){ 259 for(a=u->docinfo->anchors; a!=nil; a=a->next) 260 if(aflag && a->index == il->anchorid){ 261 href = fullurl(u, a->href); 262 renderbytes(t, "[%s]", href); 263 free(href); 264 break; 265 } 266 curanchor = il->anchorid; 267 } 268 } 269 if(t->n>0 && t->b[t->n-1]!='\n') 270 renderbytes(t, "\n"); 271 } 272 273 void 274 rerender(URLwin *u) 275 { 276 Bytes *t; 277 278 t = emalloc(sizeof(Bytes)); 279 280 render(u, t, u->items, 0); 281 282 if(t->n) 283 write(u->outfd, (char*)t->b, t->n); 284 free(t->b); 285 free(t); 286 } 287 288 /* 289 * Somewhat of a hack. Not a full parse, just looks for strings in the beginning 290 * of the document (cistrstr only looks at first somewhat bytes). 291 */ 292 int 293 charset(char *s) 294 { 295 char *meta, *emeta, *charset; 296 297 if(defcharset == 0) 298 defcharset = ISO_8859_1; 299 meta = cistrstr(s, "<meta"); 300 if(meta == nil) 301 return defcharset; 302 for(emeta=meta; *emeta!='>' && *emeta!='\0'; emeta++) 303 ; 304 charset = cistrstr(s, "charset="); 305 if(charset == nil) 306 return defcharset; 307 charset += 8; 308 if(*charset == '"') 309 charset++; 310 if(cistrncmp(charset, "utf-8", 5) || cistrncmp(charset, "utf8", 4)) 311 return UTF_8; 312 return defcharset; 313 } 314 315 void 316 rendertext(URLwin *u, Bytes *b) 317 { 318 Rune *rurl; 319 320 rurl = toStr((uchar*)u->url, strlen(u->url), ISO_8859_1); 321 u->items = parsehtml(b->b, b->n, rurl, u->type, charset((char*)b->b), &u->docinfo); 322 // free(rurl); 323 324 rerender(u); 325 } 326 327 328 void 329 freeurlwin(URLwin *u) 330 { 331 freeitems(u->items); 332 u->items = nil; 333 freedocinfo(u->docinfo); 334 u->docinfo = nil; 335 free(u); 336 } 337