xref: /inferno-os/appl/cmd/webgrab.b (revision 9274481003af38a88988b4e9a3a2c3e0df206bee)
# Webgrab -- for getting html pages and the subordinate files (images, frame children)
# they refer to (using "src=..." or "background=..." in a tag) into the local file space.
# Assume http: scheme if none specified.
# Usage:
#	webgrab [-r] [-v[v]] [-p postbody] [-o stem] url
#  If stem is specified, file will be saved in stem.html and images will
#  go in stem_1.jpg (or .gif, ...), stem_2.jpg, etc.
#  If stem is not specified, derive it from url (see getstem comment, below).
# If -r is specified, get "raw", i.e., no image fetching/html munging.
# If -p is specified, send postbody in a POST request
# (as application/x-www-form-urlencoded) instead of issuing a GET.
# If -v is specified (verbose), print some progress information,
# with more if -vv is given.

12
13implement Webgrab;
14
15include "sys.m";
16	sys: Sys;
17	FD: import sys;
18
19include "draw.m";
20
21include "string.m";
22	S: String;
23
24include "url.m";
25	U: Url;
26	ParsedUrl: import U;
27
28include "daytime.m";
29	DT: Daytime;
30
31include "bufio.m";
32	B: Bufio;
33
34include "dial.m";
35	D: Dial;
36
37include "arg.m";
38
# Module interface: webgrab runs as a command, so init is the only export.
Webgrab: module
{
	init:	fn(ctxt: ref Draw->Context, args: list of string);
};

# Module-level state shared by the functions below.
stderr: ref Sys->FD;		# diagnostic output; set in init
verbose := 0;			# incremented once for each -v option
postbody: string;		# argument of -p; when set, httpget sends a POST

httpproxy: ref ParsedUrl;	# proxy read from the config file, nil if none
noproxydoms: list of string;	# domains that don't require proxy
50
# Command entry point: load the modules, parse the options, default the
# url's scheme to http, pick a stem, read the proxy configuration and
# fetch the page.
init(nil: ref Draw->Context, args: list of string)
{
	sys = load Sys Sys->PATH;
	stderr = sys->fildes(2);
	S = load String String->PATH;
	U = load Url Url->PATH;
	DT = load Daytime Daytime->PATH;
	D = load Dial Dial->PATH;
	B = load Bufio Bufio->PATH;
	arg := load Arg Arg->PATH;
	# D (Dial) must be checked too: httpget calls D->netmkaddr and D->dial,
	# and a failed load would otherwise only show up there as a nil module.
	if(S == nil || U == nil || DT == nil || D == nil || B == nil || arg == nil)
		error_exit("can't load a module");
	U->init();
	stem := "";
	rawflag := 0;
	arg->init(args);
	arg->setusage("webgrab [-r] [-v[v]] [-p postbody] [-o stem] url");
	while((o := arg->opt()) != 0)
		case o {
		'r' =>
			rawflag = 1;
		'v' =>
			verbose++;
		'o' =>
			stem = arg->earg();
		'p' =>
			postbody = arg->earg();
		* =>
			arg->usage();
		}
	args = arg->argv();
	if(len args != 1)
		arg->usage();
	url := hd args;
	arg = nil;
	# no "//" and no ":" anywhere means no scheme was given: assume http
	(nil, xr) := S->splitstrl(url, "//");
	(nil, yr) := S->splitl(url, ":");
	if(xr == "" && yr == "")
		url = "http://" + url;
	u := U->makeurl(url);
	if(stem == "")
		stem = getstem(u);
	readconfig();
	grab(u, stem, rawflag);
}
97
# Read /services/webget/config, if it can be opened, and set httpproxy and
# noproxydoms from it.  Lines starting with '#' are skipped; other lines
# are split into a key and a value at the first blank, tab or '='.
readconfig()
{
	cfgio := B->open("/services/webget/config", sys->OREAD);
	if(cfgio == nil)
		return;
	while((line := cfgio.gets('\n')) != "") {
		if(line[0] == '#')
			continue;
		(key, val) := S->splitl(line, " \t=");
		val = S->drop(val, " \t=");
		val = S->take(val, "^\r\n");	# value stops at the end of line
		if(val == "")
			continue;
		if(key == "httpproxy") {
			if(val == "none")
				continue;
			# val should be host or host:port
			httpproxy = U->makeurl("http://" + val);
			if(verbose)
				sys->fprint(stderr, "Using http proxy %s\n", httpproxy.tostring());
		}
		else if(key == "noproxy" || key == "noproxydoms")
			(nil, noproxydoms) = sys->tokenize(val, ";, \t");
	}
	cfgio.close();
}
129
# Make up a stem for forming save-file-names, based on url u.
# Take the last non-empty component of u.path (backing up one component
# if the path ends in '/'), or the host if the path gives nothing.
# Drop a final extension, and if what is left still contains a '.'
# (e.g., www.lucent) keep only the part after the last '.'.
# If all else fails, use "grabout".
getstem(u: ref ParsedUrl) : string
{
	name := "";
	path := u.path;
	if(path != "") {
		(dir, last) := S->splitr(path, "/");
		# path ended with '/'; try next to last component
		if(last == "" && dir != "")
			(dir, last) = S->splitr(dir[0:len dir - 1], "/");
		name = last;
	}
	if(name == "")
		name = u.host;
	if(name == "")
		return "grabout";

	# base is everything up to and including the last '.', ext the rest
	(base, ext) := S->splitr(name, ".");
	if(base == "")
		name = ext;
	else
		name = base[0:len base - 1];
	(nil, name) = S->splitr(name, ".");
	if(name == "")
		return "grabout";
	return name;
}
163
# Fetch u and save it.  Non-html data, or any data when rawflag is set,
# is written to the file named stem.  Otherwise the html is rewritten by
# subfix, saved as stem.html, and each subordinate file it refers to is
# fetched and saved under the local name subfix chose for it.
grab(u: ref ParsedUrl, stem: string, rawflag: int)
{
	(err, contents, fd, actual) := httpget(u);
	if(err != "")
		error_exit(err);
	html := is_html(contents);
	if(html)
		contents = addfetchcomment(contents, u, actual);
	if(!html || rawflag) {
		writebytes(stem, contents, fd);
		return;
	}

	# get subordinates, modify contents
	(fixed, subs) := subfix(contents, stem);
	writebytes(stem + ".html", fixed, fd);
	while(subs != nil) {
		(fname, suburl) := hd subs;
		subs = tl subs;
		subu := U->makeurl(suburl);
		subu.makeabsolute(actual);	# resolve against where the page really came from
		(suberr, subcontents, subfd, nil) := httpget(subu);
		if(suberr != "")
			sys->fprint(stderr, "webgrab: can't fetch subordinate %s from %s: %s\n", fname, subu.tostring(), suberr);
		else
			writebytes(fname, subcontents, subfd);
	}
}
192
# Fix the html in array a so that referenced subordinate files (SRC= or BACKGROUND= fields of tags)
# are replaced with local names (stem_1.xxx, stem_2.xxx, etc.),
# and return the fixed array along with a list of (local name, subordinate url)
# of the files to be fetched, in order of first appearance.
# If nothing is replaced, a itself is returned with a nil list.
# A replaced value is written without its surrounding quotes.
# Text inside <!-- ... --> and inside quoted attribute values is not scanned.
subfix(a: array of byte, stem: string) : (array of byte, list of (string, string))
{
	alen := len a;
	if(alen == 0)
		return (a, nil);
	nsubs := 0;
	newa := array[alen + 1000] of byte;
	newai := 0;		# bytes of newa filled so far
	j := 0;			# a[0:j] has already been copied or replaced
	intag := 0;
	incom := 0;
	quote := 0;		# the open quote character while inside a quoted value
	subs : list of (string, string) = nil;
	for(i := 0; i < alen; i++) {
		c := int a[i];
		if(incom) {
			if(amatch(a, i, alen, "-->")) {
				incom = 0;
				i = i+2;
			}
		}
		else if(intag) {
			if(quote == 0 && (amatch(a, i, alen, "src") || amatch(a, i, alen, "background"))) {
				v := "";
				eqi := 0;
				k := i+10;
				if(amatch(a, i, alen, "src"))
					k = i+3;
				while(k < alen && iswhite(int a[k]))
					k++;
				if(k < alen && int a[k] == '=') {
					eqi = k;
					k++;
					while(k < alen && iswhite(int a[k]))
						k++;
					if(k < alen) {
						kstart := k;
						# the value's own quote is kept local so it cannot
						# disturb the tag-level quote state below
						q := int a[k];
						if(q == '\'' || q == '"') {
							k++;
							while(k < alen && int a[k] != q)
								k++;
							v = string a[kstart+1:k];
							k++;	# step over the closing quote (may reach alen+1)
						}
						else {
							while(k < alen && !iswhite(int a[k]) && int a[k] != '>')
								k++;
							v = string a[kstart:k];
						}
					}
				}
				if(v != "") {
					# reuse the local name if this url was seen before
					f := "";
					for(l := subs; l != nil; l = tl l) {
						(ff, uu) := hd l;
						if(v == uu) {
							f = ff;
							break;
						}
					}
					if(f == "") {
						nsubs++;
						f = stem + "_" + string nsubs + getsuff(v);
						subs = (f, v) :: subs;
					}
					xa := array of byte f;
					# grow newa when local names are longer than the urls
					# they replace by more than the slack allowed for
					need := newai + (eqi+1-j) + len xa;
					if(need > len newa) {
						bigger := array[need + alen + 1000] of byte;
						bigger[0:] = newa[0:newai];
						newa = bigger;
					}
					newa[newai:] = a[j:eqi+1];
					newai += eqi+1-j;
					newa[newai:] = xa;
					newai += len xa;
					j = k;
				}
				i = k-1;
			}
			# track quoting so that '>' or "src" inside an attribute
			# value is not mistaken for markup
			if(quote) {
				if(c == quote)
					quote = 0;
			}
			else if(c == '"' || c == '\'')
				quote = c;
			else if(c == '>')
				intag = 0;
		}
		else if(c == '<') {
			if(amatch(a, i, alen, "<!--"))
				incom = 1;
			else
				intag = 1;
		}
	}
	if(nsubs == 0)
		return (a, nil);
	# copy whatever follows the last replacement; j can be alen+1 after
	# an unterminated quoted value, in which case nothing remains
	tail := 0;
	if(j < alen)
		tail = alen - j;
	ans := array[newai + tail] of byte;
	ans[0:] = newa[0:newai];
	if(tail > 0)
		ans[newai:] = a[j:alen];
	# subs was built by prepending; reverse it into order of appearance
	anssubs : list of (string, string) = nil;
	for(ll := subs; ll != nil; ll = tl ll)
		anssubs = hd ll :: anssubs;
	return (ans, anssubs);
}
300
# Overwrite with c the byte that follows each (case-insensitive)
# occurrence of f in a; f is expected in lower case, as amatch requires.
# An occurrence that ends exactly at the end of a is left alone.
fixnames(a: array of byte, f: string, c: byte)
{
	flen := len f;
	alen := len a;
	for(pos := 0; pos + flen < alen; pos++)
		if(amatch(a, pos, alen, f))
			a[pos + flen] = c;
}
312
# Return 1 if the bytes of a starting at index i spell s, ignoring the
# case of ASCII letters in a; s itself must be given in lower case.
# alen is the number of valid bytes in a.  Return 0 otherwise, including
# when s would run past alen.
amatch(a: array of byte, i, alen: int, s: string) : int
{
	slen := len s;
	for(k := 0; k < slen; k++) {
		if(i+k >= alen)
			return 0;
		ch := int a[i+k];
		if(ch >= 'A' && ch <= 'Z')
			ch += 'a' - 'A';
		if(ch != s[k])
			return 0;
	}
	return 1;
}
328
# Return the file suffix (including the '.') of the last component of the
# path of url ustr, or "" if that component has no '.'.
getsuff(ustr: string) : string
{
	u := U->makeurl(ustr);
	path := u.path;
	# scan backwards, stopping at the start of the last component
	for(n := len path; n > 0; n--) {
		ch := path[n-1];
		if(ch == '.')
			return path[n-1:];
		if(ch == '/')
			break;
	}
	return "";
}
343
# Return 1 if c is a space, tab, newline or carriage return, else 0.
iswhite(c: int) : int
{
	case c {
	' ' or '\t' or '\n' or '\r' =>
		return 1;
	}
	return 0;
}
348
# Return a copy of a with an html comment appended that gives the local
# time of the fetch and the url u it was requested from, plus the url
# actu it was redirected to when that differs.
addfetchcomment(a: array of byte, u, actu: ref ParsedUrl) : array of byte
{
	requested := u.tostring();
	fetched := actu.tostring();
	redirect := "";
	if(requested != fetched)
		redirect = ", redirected to " + fetched;
	stamp := DT->text(DT->local(DT->now()));
	note := array of byte ("\n<!-- Fetched " + stamp + " from " + requested + redirect + " -->\n");
	out := array[len a + len note] of byte;
	out[0:] = a;
	out[len a:] = note;
	return out;
}
365
# Get u with an HTTP/1.0 GET (or a POST of postbody when -p was given),
# going through httpproxy when one is configured and the host needs it,
# and following at most 10 redirects (300, 301, 302 with a Location).
# Return (error string, data, fd, actual url):
#	error string is "" on success;
#	data is what followed the headers in the first chunk read;
#	fd is the connection if there may be more to read from it, else nil;
#	actual url is the source after redirection.
# u.port is set to "80" if it was empty; u is otherwise left unchanged.
httpget(u: ref ParsedUrl) : (string, array of byte, ref Sys->FD, ref ParsedUrl)
{
	ans, body : array of byte;
	restfd: ref Sys->FD;
	req : string;

	for(redir := 0; redir < 10; redir++) {
		if(u.port == "")
			u.port = "80";	# default IP port for HTTP
		if(verbose)
			sys->fprint(stderr, "connecting to %s\n", u.host);

		useproxy := httpproxy != nil && need_proxy(u.host);
		dialhost := u.host;
		port := u.port;
		if(useproxy) {
			dialhost = httpproxy.host;
			port = httpproxy.port;
		}
		dest := D->netmkaddr(dialhost, "tcp", port);
		net := D->dial(dest, nil);
		if(net == nil)
			return (sys->sprint("can't dial %s: %r", dest), nil, nil, nil);

		# prepare request
		# Build the query in a local: u is shared with the caller, which
		# later prints it and resolves relative urls against it.
		query := "";
		if(u.query != "")
			query = "?" + u.query;
		# u.path has no leading '/'.  A proxy needs the absolute url.
		target := "/" + u.path + query;
		if(useproxy)
			target = "http://" + u.host + ":" + u.port + target;

		if(postbody == nil){
			req = sys->sprint("GET %s HTTP/1.0\r\n"+
					"Host: %s\r\n"+
					"User-agent: Inferno/webgrab\r\n"+
					"Cache-Control: no-cache\r\n"+
					"Pragma: no-cache\r\n\r\n",
					target, u.host);
		}else{
			# Content-length counts bytes on the wire, not characters
			pbytes := array of byte postbody;
			req = sys->sprint("POST %s HTTP/1.0\r\n"+
					"Host: %s\r\n"+
					"Content-type: application/x-www-form-urlencoded\r\n"+
					"Content-length: %d\r\n"+
					"User-agent: Inferno/webgrab\r\n"+
					"\r\n"+"%s",
					target, u.host, len pbytes, postbody);
		}

		if(verbose)
			sys->fprint(stderr, "writing request: %s\n", req);
		areq := array of byte req;
		n := sys->write(net.dfd, areq, len areq);
		if(n != len areq)
			return (sys->sprint("write problem: %r"), nil, nil, nil);
		(ans, restfd) = readbytes(net.dfd);
		(status, rest) := stripline(ans);
		if(verbose)
			sys->fprint(stderr, "response: %s\n", status);
		(vers, statusrest) := S->splitl(status, " ");
		if(!S->prefix("HTTP/", vers))
			return ("bad reply status: " + status, rest, restfd, nil);
		code := int statusrest;
		location := "";
		body = rest;
		# consume header lines up to the empty line that ends them
		for(;;) {
			hline: string;
			(hline, body) = stripline(body);
			if(hline == "")
				break;
			if(verbose > 1)
				sys->fprint(stderr, "%s\n", hline);
			if(!iswhite(hline[0])) {
				(hname, hrest) := S->splitl(hline, ":");
				if(hrest != "") {
					hname = S->tolower(hname);
					hval := S->drop(hrest, ": \t");
					hval = S->take(hval, "^ \t");
					if(hname == "location")
						location = hval;
				}
			}
		}
		if(code == 200)
			break;
		if((code == 300 || code == 301 || code == 302) && location != "") {
			# MultipleChoices, MovedPerm, or MovedTemp
			if(verbose)
				sys->fprint(stderr, "redirect to %s\n", location);
			# Location may be relative; resolve it against the current url
			next := U->makeurl(location);
			next.makeabsolute(u);
			u = next;
			continue;
		}
		return ("status not ok: " + status, rest, restfd, u);
	}
	# leaving the loop by running out of attempts is a failure, not a
	# successful fetch of the last redirect response
	if(redir >= 10)
		return ("too many redirects", nil, nil, u);
	return ("", body, restfd, u);
}
473
474
# Return 0 if host h ends with one of the entries of noproxydoms, and so
# should be dialled directly; return 1 if it must go through the proxy.
# With an empty noproxydoms every host needs the proxy.
need_proxy(h: string) : int
{
	lh := len h;
	# take hd inside the loop so that every entry is compared,
	# not just the first one
	for(doml := noproxydoms; doml != nil; doml = tl doml) {
		dom := hd doml;
		ld := len dom;
		if(lh >= ld && h[lh-ld:] == dom)
			return 0;	# domain is on the noproxy list
	}
	return 1;
}
490
# Simple guess test for HTML: return 1 if the first byte of a that is not
# white space is '<', else 0 (including when a is empty or all white).
is_html(a: array of byte) : int
{
	n := len a;
	for(i := 0; i < n; i++) {
		if(iswhite(int a[i]))
			continue;
		return a[i] == byte '<';
	}
	return 0;
}
501
# Read from fd until Sys->ATOMICIO bytes have been collected or a read
# returns 0 or less.  Return the bytes read together with fd if the buffer
# filled up (there may be more to read), or nil if reading stopped early.
readbytes(fd: ref Sys->FD) : (array of byte, ref Sys->FD)
{
	buf := array[Sys->ATOMICIO] of byte;
	got := 0;
	while(got < len buf) {
		n := sys->read(fd, buf[got:], len buf - got);
		if(n <= 0)
			return (buf[0:got], nil);
		got += n;
	}
	return (buf[0:got], fd);
}
518
# Write a, followed by everything that can still be read from fd (when fd
# is not nil), to the file named f, creating it; "-" means standard output.
# Problems are reported on stderr and end the copy.  For a real file the
# name and the total number of bytes written are reported when done.
writebytes(f: string, a: array of byte, fd: ref Sys->FD)
{
	ofd: ref Sys->FD;
	if(f == "-")
		ofd = sys->fildes(1);
	else
		ofd = sys->create(f, Sys->OWRITE, 8r666);
	if(ofd == nil) {
		sys->fprint(stderr, "webgrab: can't create %s: %r\n", f);
		return;
	}
	total := len a;
	i := 0;
	while(i < total) {
		w := sys->write(ofd, a[i:], total-i);
		# a write that makes no progress must stop the loop too
		if(w <= 0) {
			sys->fprint(stderr, "webgrab: write error: %r\n");
			return;
		}
		i += w;
	}
	if(fd != nil) {
		buf := array[Sys->ATOMICIO] of byte;
		while((n := sys->read(fd, buf, len buf)) > 0) {
			if(sys->write(ofd, buf, n) != n) {
				sys->fprint(stderr, "webgrab: write error: %r\n");
				return;
			}
			# count streamed bytes as they are written; after the
			# loop n is only the final read status
			total += n;
		}
		if(n < 0) {
			sys->fprint(stderr, "webgrab: read error: %r\n");
			return;
		}
	}
	if(f != "-")
		sys->fprint(stderr, "created %s, %d bytes\n", f, total);
}
557
# Split b at its first "\r\n": return the text before it as a string and
# the bytes after it.  If b contains no "\r\n", return ("", b).
stripline(b: array of byte) : (string, array of byte)
{
	cr := byte '\r';
	lf := byte '\n';
	for(i := 1; i < len b; i++)
		if(b[i-1] == cr && b[i] == lf)
			return (string b[0:i-1], b[i+1:]);
	return ("", b);
}
566
# Print msg on file descriptor 2 and terminate by raising "fail:error".
error_exit(msg: string)
{
	errfd := sys->fildes(2);
	sys->fprint(errfd, "%s\n", msg);
	raise "fail:error";
}
572