1 #include <u.h>
2 #include <libc.h>
3 #include <bio.h>
4 #include <draw.h>
5 #include <regexp.h>
6 #include <html.h>
7 #include <ctype.h>
8 #include "dat.h"
9
10 char urlexpr[] =
11 "^(https?|ftp|file|gopher|mailto|news|nntp|telnet|wais|prospero)"
12 "://([a-zA-Z0-9_@\\-]+([.:][a-zA-Z0-9_@\\-]+)*)";
13 Reprog *urlprog;
14
15 int newitextitem;
16 int inword = 0;
17 int col = 0;
18 int wordi = 0;
19
20 char*
loadhtml(int fd)21 loadhtml(int fd)
22 {
23 URLwin *u;
24 Bytes *b;
25 int n;
26 char buf[4096];
27
28 u = emalloc(sizeof(URLwin));
29 u->infd = fd;
30 u->outfd = 1;
31 u->url = estrdup(url);
32 u->type = TextHtml;
33
34 b = emalloc(sizeof(Bytes));
35 while((n = read(fd, buf, sizeof buf)) > 0)
36 growbytes(b, buf, n);
37 if(b->b == nil)
38 return nil; /* empty file */
39 rendertext(u, b);
40 freeurlwin(u);
41 return nil;
42 }
43
44 char*
runetobyte(Rune * r,int n)45 runetobyte(Rune *r, int n)
46 {
47 char *s;
48
49 if(n == 0)
50 return emalloc(1);
51 s = smprint("%.*S", n, r);
52 if(s == nil)
53 error("malloc failed");
54 return s;
55 }
56
/*
 * Report whether c is a closing punctuation character, i.e. one that
 * should be attached to the preceding word without an intervening
 * space.  Returns 1 for one of . , : ; ' " ) ] } > ! ? and 0 for
 * everything else, including the NUL byte (a plain strchr lookup
 * would match the string terminator and wrongly report NUL).
 */
int
closingpunct(char c)
{
	switch(c){
	case '.':
	case ',':
	case ':':
	case ';':
	case '\'':
	case '"':
	case ')':
	case ']':
	case '}':
	case '>':
	case '!':
	case '?':
		return 1;
	}
	return 0;
}
62
63 void
emitword(Bytes * b,Rune * r,int nr)64 emitword(Bytes *b, Rune *r, int nr)
65 {
66 char *s;
67 int space;
68
69 if(nr == 0)
70 return;
71 s = smprint("%.*S", nr, r);
72 space = b->n > 0 && !isspace(b->b[b->n-1]) && (!newitextitem || !closingpunct(*s));
73 if(col > 0 && col+space+nr > width){
74 growbytes(b, "\n", 1);
75 space = 0;
76 col = 0;
77 }
78 if(space && col > 0){
79 growbytes(b, " ", 1);
80 col++;
81 }
82 growbytes(b, s, strlen(s));
83 col += nr;
84 free(s);
85 inword = 0;
86 newitextitem = 0;
87 }
88
89 void
renderrunes(Bytes * b,Rune * r)90 renderrunes(Bytes *b, Rune *r)
91 {
92 int i, n;
93
94 newitextitem = 1;
95
96 n = runestrlen(r);
97 for(i=0; i<n; i++){
98 switch(r[i]){
99 case '\n':
100 if(inword)
101 emitword(b, r+wordi, i-wordi);
102 col = 0;
103 if(b->n == 0)
104 break; /* don't start with blank lines */
105 if(b->n<2 || b->b[b->n-1]!='\n' || b->b[b->n-2]!='\n')
106 growbytes(b, "\n", 1);
107 break;
108 case ' ':
109 if(inword)
110 emitword(b, r+wordi, i-wordi);
111 break;
112 default:
113 if(!inword)
114 wordi = i;
115 inword = 1;
116 break;
117 }
118 }
119 if(inword)
120 emitword(b, r+wordi, i-wordi);
121 }
122
123 void
renderbytes(Bytes * b,char * fmt,...)124 renderbytes(Bytes *b, char *fmt, ...)
125 {
126 Rune *r;
127 va_list arg;
128
129 va_start(arg, fmt);
130 r = runevsmprint(fmt, arg);
131 va_end(arg);
132 renderrunes(b, r);
133 free(r);
134 }
135
136 char*
baseurl(char * url)137 baseurl(char *url)
138 {
139 char *base, *slash;
140 Resub rs[10];
141
142 if(url == nil)
143 return nil;
144 if(urlprog == nil){
145 urlprog = regcomp(urlexpr);
146 if(urlprog == nil)
147 error("can't compile URL regexp");
148 }
149 memset(rs, 0, sizeof rs);
150 if(regexec(urlprog, url, rs, nelem(rs)) == 0)
151 return nil;
152 base = estrdup(url);
153 slash = strrchr(base, '/');
154 if(slash!=nil && slash>=&base[rs[0].ep-rs[0].sp])
155 *slash = '\0';
156 else
157 base[rs[0].ep-rs[0].sp] = '\0';
158 return base;
159 }
160
161 char*
fullurl(URLwin * u,Rune * rhref)162 fullurl(URLwin *u, Rune *rhref)
163 {
164 char *base, *href, *hrefbase;
165 char *result;
166
167 if(rhref == nil)
168 return estrdup("NULL URL");
169 href = runetobyte(rhref, runestrlen(rhref));
170 hrefbase = baseurl(href);
171 result = nil;
172 if(hrefbase==nil && (base = baseurl(u->url))!=nil){
173 result = estrdup(base);
174 if(base[strlen(base)-1]!='/' && (href==nil || href[0]!='/'))
175 result = eappend(result, "/", "");
176 free(base);
177 }
178 if(href){
179 if(result)
180 result = eappend(result, "", href);
181 else
182 result = estrdup(href);
183 }
184 free(hrefbase);
185 if(result == nil)
186 return estrdup("***unknown***");
187 return result;
188 }
189
190 void
render(URLwin * u,Bytes * t,Item * items,int curanchor)191 render(URLwin *u, Bytes *t, Item *items, int curanchor)
192 {
193 Item *il;
194 Itext *it;
195 Ifloat *ifl;
196 Ispacer *is;
197 Itable *ita;
198 Iimage *im;
199 Anchor *a;
200 Table *tab;
201 Tablecell *cell;
202 char *href;
203
204 inword = 0;
205 col = 0;
206 wordi = 0;
207
208 for(il=items; il!=nil; il=il->next){
209 if(il->state & IFbrk)
210 renderbytes(t, "\n");
211 if(il->state & IFbrksp)
212 renderbytes(t, "\n");
213
214 switch(il->tag){
215 case Itexttag:
216 it = (Itext*)il;
217 if(it->state & IFwrap)
218 renderrunes(t, it->s);
219 else {
220 newitextitem = 1;
221 emitword(t, it->s, runestrlen(it->s));
222 }
223 break;
224 case Iruletag:
225 if(t->n>0 && t->b[t->n-1]!='\n')
226 renderbytes(t, "\n");
227 renderbytes(t, "=======\n");
228 break;
229 case Iimagetag:
230 if(!aflag)
231 break;
232 im = (Iimage*)il;
233 if(im->imsrc){
234 href = fullurl(u, im->imsrc);
235 renderbytes(t, "[image %s]", href);
236 free(href);
237 }
238 break;
239 case Iformfieldtag:
240 if(aflag)
241 renderbytes(t, "[formfield]");
242 break;
243 case Itabletag:
244 ita = (Itable*)il;
245 tab = ita->table;
246 for(cell=tab->cells; cell!=nil; cell=cell->next){
247 render(u, t, cell->content, curanchor);
248 }
249 if(t->n>0 && t->b[t->n-1]!='\n')
250 renderbytes(t, "\n");
251 break;
252 case Ifloattag:
253 ifl = (Ifloat*)il;
254 render(u, t, ifl->item, curanchor);
255 break;
256 case Ispacertag:
257 is = (Ispacer*)il;
258 if(is->spkind != ISPnull)
259 renderbytes(t, " ");
260 break;
261 default:
262 error("unknown item tag %d\n", il->tag);
263 }
264 if(il->anchorid != 0 && il->anchorid!=curanchor){
265 for(a=u->docinfo->anchors; a!=nil; a=a->next)
266 if(aflag && a->index == il->anchorid){
267 href = fullurl(u, a->href);
268 renderbytes(t, "[%s]", href);
269 free(href);
270 break;
271 }
272 curanchor = il->anchorid;
273 }
274 }
275 if(t->n>0 && t->b[t->n-1]!='\n')
276 renderbytes(t, "\n");
277 }
278
279 void
rerender(URLwin * u)280 rerender(URLwin *u)
281 {
282 Bytes *t;
283
284 t = emalloc(sizeof(Bytes));
285
286 render(u, t, u->items, 0);
287
288 if(t->n)
289 write(u->outfd, (char*)t->b, t->n);
290 free(t->b);
291 free(t);
292 }
293
294 /*
295 * Somewhat of a hack. Not a full parse, just looks for strings in the beginning
296 * of the document (cistrstr only looks at first somewhat bytes).
297 */
298 int
charset(char * s)299 charset(char *s)
300 {
301 char *meta, *emeta, *charset;
302
303 if(defcharset == 0)
304 defcharset = ISO_8859_1;
305 meta = cistrstr(s, "<meta");
306 if(meta == nil)
307 return defcharset;
308 for(emeta=meta; *emeta!='>' && *emeta!='\0'; emeta++)
309 ;
310 charset = cistrstr(s, "charset=");
311 if(charset == nil)
312 return defcharset;
313 charset += 8;
314 if(*charset == '"')
315 charset++;
316 if(cistrncmp(charset, "utf-8", 5) || cistrncmp(charset, "utf8", 4))
317 return UTF_8;
318 return defcharset;
319 }
320
321 void
rendertext(URLwin * u,Bytes * b)322 rendertext(URLwin *u, Bytes *b)
323 {
324 Rune *rurl;
325
326 rurl = toStr((uchar*)u->url, strlen(u->url), ISO_8859_1);
327 u->items = parsehtml(b->b, b->n, rurl, u->type, charset((char*)b->b), &u->docinfo);
328 // free(rurl);
329
330 rerender(u);
331 }
332
333
334 void
freeurlwin(URLwin * u)335 freeurlwin(URLwin *u)
336 {
337 freeitems(u->items);
338 u->items = nil;
339 freedocinfo(u->docinfo);
340 u->docinfo = nil;
341 free(u);
342 }
343