# Webgrab -- for getting html pages and the subordinate files (images, frame children)
# they refer to (using "src=..." in a tag) into the local file space.
# Assume http: scheme if none specified.
# Usage:
#	webgrab [-r] [-v[v]] [-p postbody] [-o stem] url
# If stem is specified, file will be saved in stem.html and images will
# go in stem_1.jpg (or .gif, ...), stem_2.jpg, etc.
# If stem is not specified, derive it from url (see getstem comment, below).
# If -r is specified, get "raw", i.e., no image fetching/html munging.
# If -p is specified, send postbody in a POST request instead of issuing a GET.
# If -v is specified (verbose), print some progress information,
# with more if -vv is given.

implement Webgrab;

include "sys.m";
	sys: Sys;
	FD: import sys;

include "draw.m";

include "string.m";
	S: String;

include "url.m";
	U: Url;
	ParsedUrl: import U;

include "daytime.m";
	DT: Daytime;

include "bufio.m";
	B: Bufio;

include "dial.m";
	D: Dial;

include "arg.m";

Webgrab: module
{
	init: fn(ctxt: ref Draw->Context, args: list of string);
};

stderr: ref FD;
verbose := 0;		# incremented once per -v on the command line
postbody : string;	# argument of -p; when set, requests are sent as POST

httpproxy: ref Url->ParsedUrl;	# set by readconfig from an "httpproxy" entry
noproxydoms: list of string;	# domains that don't require proxy

# Command entry point: load the required modules, parse the options,
# supply a default "http://" scheme when the url has neither "//" nor ":",
# pick a stem for the output file names, read the proxy configuration
# and fetch the page.
init(nil: ref Draw->Context, args: list of string)
{
	sys = load Sys Sys->PATH;
	stderr = sys->fildes(2);
	S = load String String->PATH;
	U = load Url Url->PATH;
	DT = load Daytime Daytime->PATH;
	D = load Dial Dial->PATH;
	B = load Bufio Bufio->PATH;
	arg := load Arg Arg->PATH;
	# D is used by httpget to dial the server, so it must be checked too
	if(S == nil || U == nil || DT == nil || D == nil || B == nil || arg == nil)
		error_exit("can't load a module");
	U->init();
	stem := "";
	rawflag := 0;
	arg->init(args);
	arg->setusage("webgrab [-r] [-v[v]] [-p postbody] [-o stem] url");
	url := "";
	while((o := arg->opt()) != 0)
		case o {
		'r' =>
			rawflag = 1;
		'v' =>
			verbose++;
		'o' =>
			stem = arg->earg();
		'p' =>
			postbody = arg->earg();
		* =>
			arg->usage();
		}
	args = arg->argv();
	if(len args != 1)
		arg->usage();
	url = hd args;
	arg = nil;
	# no "//" and no ":" anywhere in the argument: treat it as a bare host/path
	(nil,xr) := S->splitstrl(url,"//");
	(nil,yr) := S->splitl(url,":");
	if(xr == "" && yr == "")
		url = "http://" + url;
	u := U->makeurl(url);
	if(stem == "")
		stem = getstem(u);
	readconfig();
	grab(u, stem, rawflag);
}

# Read /services/webget/config, if it can be opened, and pick up the
# proxy settings.  Lines starting with '#' are skipped.  Other lines are
# split into a key and a value at the first blank, tab or '='.
#	httpproxy	host or host:port of the proxy ("none" leaves it unset)
#	noproxy, noproxydoms	list of domains, separated by any of ";, \t"
readconfig()
{
	cfgio := B->open("/services/webget/config", sys->OREAD);
	if(cfgio != nil) {
		for(;;) {
			line := B->cfgio.gets('\n');
			if(line == "") {
				# end of file
				B->cfgio.close();
				break;
			}
			if(line[0]=='#')
				continue;
			(key, val) := S->splitl(line, " \t=");
			# strip the separator run, then keep everything up to the end of line
			val = S->take(S->drop(val, " \t="), "^\r\n");
			if(val == "")
				continue;
			case key {
			"httpproxy" =>
				if(val == "none")
					continue;
				# val should be host or host:port
				httpproxy = U->makeurl("http://" + val);
				if(verbose)
					sys->fprint(stderr, "Using http proxy %s\n", httpproxy.tostring());
			"noproxy" or
			"noproxydoms" =>
				(nil, noproxydoms) = sys->tokenize(val, ";, \t");
			}
		}
	}
}

# Make up a stem for forming save-file-names, based on url u.
# Use the last non-nil component of u.path, without a final extension,
# else use the host.  Then, if the stem still contains a '.' (e.g., www.lucent)
# use the part after the final '.'.
# Finally, if all else fails, use "grabout".
getstem(u: ref ParsedUrl) : string
{
	stem := "";
	if(u.path != "") {
		(l, r) := S->splitr(u.path, "/");
		if(r == "") {
			# path ended with '/'; try next to last component
			if(l != "")
				(l, r) = S->splitr(l[0:len l - 1], "/");
		}
		if(r != "")
			stem = r;
	}
	if(stem == "")
		stem = u.host;
	if(stem != "") {
		ext: string;
		# drop the final extension, if there is one
		(stem, ext) = S->splitr(stem, ".");
		if(stem == "")
			stem = ext;	# there was no '.' at all
		else
			stem = stem[0:len stem - 1];	# remove the trailing '.'
		# keep only what follows the last remaining '.'
		(nil, stem) = S->splitr(stem, ".");
	}
	if(stem == "")
		stem = "grabout";
	return stem;
}

# Return a followed by everything that can still be read from fd.
# A read error is reported on stderr and whatever was read so far is returned.
readrest(a: array of byte, fd: ref Sys->FD) : array of byte
{
	n := len a;
	buf := array[n + Sys->ATOMICIO] of byte;
	if(n > 0)
		buf[0:] = a;
	for(;;) {
		if(n == len buf) {
			# full: double the buffer so the total copying stays linear
			nbuf := array[2 * len buf] of byte;
			nbuf[0:] = buf;
			buf = nbuf;
		}
		m := sys->read(fd, buf[n:], len buf - n);
		if(m < 0)
			sys->fprint(stderr, "webgrab: read error: %r\n");
		if(m <= 0)
			break;
		n += m;
	}
	return buf[0:n];
}

# Fetch u and save it under a name derived from stem.
# Content that does not look like html, or any content when rawflag is set,
# is written to the file "stem" as is (html still gets the fetch comment).
# Otherwise the page is rewritten by subfix, saved as "stem.html", and
# every subordinate file it refers to is fetched and saved as well.
# A failure to fetch u is fatal; a failure on a subordinate is only reported.
grab(u: ref ParsedUrl, stem: string, rawflag: int)
{
	(err, contents, fd, actual) := httpget(u);
	if(err != "")
		error_exit(err);
	ish := is_html(contents);
	if(ish) {
		# httpget hands back only the first buffer-full plus an fd for
		# the remainder.  Collect the remainder now, so that the fetch
		# comment really ends up at the end of the page and subfix
		# sees the whole document rather than a truncated prefix.
		if(fd != nil) {
			contents = readrest(contents, fd);
			fd = nil;
		}
		contents = addfetchcomment(contents, u, actual);
	}
	if(rawflag || !ish) {
		writebytes(stem, contents, fd);
		return;
	}
	# get subordinates, modify contents
	subs : list of (string, string);
	(contents, subs) = subfix(contents, stem);
	writebytes(stem + ".html", contents, fd);
	for(l := subs; l != nil; l = tl l) {
		(fname, suburl) := hd l;
		subu := U->makeurl(suburl);
		# references in the page are relative to where it really came from
		subu.makeabsolute(actual);
		(suberr, subcontents, subfd, nil) := httpget(subu);
		if(suberr != "") {
			sys->fprint(stderr, "webgrab: can't fetch subordinate %s from %s: %s\n", fname, subu.tostring(), suberr);
			continue;
		}
		writebytes(fname, subcontents, subfd);
	}
}

# Fix the html in array a so that referenced subordinate files (SRC= or BACKGROUND= fields of tags)
# are replaced with local names (stem_1.xxx, stem_2.xxx, etc.),
# and return the fixed array along with a list of (local name, subordinate url)
# of images to be fetched.
# Text between "<!--" and "-->" is skipped, so references inside html
# comments are neither rewritten nor fetched.
# If nothing was replaced, a itself is returned together with a nil list.
subfix(a: array of byte, stem: string) : (array of byte, list of (string, string))
{
	alen := len a;
	if(alen == 0)
		return (a, nil);
	nsubs := 0;
	newa := array[alen + 1000] of byte;	# enlarged by growbuf when needed
	newai := 0;	# number of bytes of newa filled so far
	j := 0;		# a[0:j] has already been copied to, or replaced in, newa
	intag := 0;
	incom := 0;
	quote := 0;
	subs : list of (string, string) = nil;
	for(i := 0; i < alen; i++) {
		c := int a[i];
		if(incom) {
			if(amatch(a, i, alen, "-->")) {
				incom = 0;
				i = i+2;
			}
		}
		else if(intag) {
			if(quote==0 && (amatch(a, i, alen, "src") || amatch(a, i, alen, "background"))) {
				v := "";
				eqi := 0;
				# k scans forward from just after the attribute name
				k := i+10;
				if(amatch(a, i, alen, "src"))
					k = i+3;
				for(; k < alen; k++)
					if(!iswhite(int a[k]))
						break;
				if(k < alen && int a[k] == '=') {
					eqi = k;
					k++;
					while(k<alen && iswhite(int a[k]))
						k++;
					if(k<alen) {
						kstart := k;
						c = int a[k];
						if(c == '\'' || c== '"') {
							quote = int a[k++];
							while(k<alen && (int a[k])!=quote)
								k++;
							v = string a[kstart+1:k];
							# step over the closing quote; when the value is
							# unterminated stay at alen so that j and i
							# never point beyond the end of a
							if(k < alen)
								k++;
						}
						else {
							while(k<alen && !iswhite(int a[k]) && int a[k] != '>')
								k++;
							v = string a[kstart:k];
						}
					}
				}
				if(v != "") {
					# reuse the local name if this url was seen before
					f := "";
					for(l := subs; l != nil; l = tl l) {
						(ff,uu) := hd l;
						if(v == uu) {
							f = ff;
							break;
						}
					}
					if(f == "") {
						nsubs++;
						f = stem + "_" + string nsubs + getsuff(v);
						subs = (f, v) :: subs;
					}
					# copy everything up to and including '=', then the
					# local name in place of the original value
					xa := array of byte f;
					newa = growbuf(newa, newai, newai + (eqi+1-j) + len xa);
					newa[newai:] = a[j:eqi+1];
					newai += eqi+1-j;
					newa[newai:] = xa;
					newai += len xa;
					j = k;
				}
				i = k-1;
			}
			if(c == '>' && quote == 0)
				intag = 0;
			# NOTE(review): quote only ever becomes non-zero in the value
			# scan above, where c is left holding the opening quote, so this
			# resets it straight away; quoted values of other attributes
			# are not tracked.  Left as it was -- confirm intent before changing.
			if(quote) {
				if(quote == c)
					quote = 0;
				else if(c == '"' || c == '\'')
					quote = c;
			}
		}
		else if(c == '<') {
			if(amatch(a, i, alen, "<!--"))
				incom = 1;
			else
				intag = 1;
		}
	}
	if(nsubs == 0)
		return (a, nil);
	# the loop always ends with i == alen; copy the untouched tail
	if(alen > j) {
		newa = growbuf(newa, newai, newai + alen - j);
		newa[newai:] = a[j:alen];
		newai += alen-j;
	}
	ans := array[newai] of byte;
	ans[0:] = newa[0:newai];
	# subs was built by prepending; put it back into document order
	anssubs : list of (string, string) = nil;
	for(ll := subs; ll != nil; ll = tl ll)
		anssubs = hd ll :: anssubs;
	return (ans, anssubs);
}

# Return a buffer able to hold at least need bytes whose first used bytes
# are those of buf; buf itself is returned when it is already big enough.
growbuf(buf: array of byte, used, need: int) : array of byte
{
	if(need <= len buf)
		return buf;
	n := 2 * len buf;
	if(n < need)
		n = need;
	nbuf := array[n] of byte;
	if(used > 0)
		nbuf[0:] = buf[0:used];
	return nbuf;
}

# add c after all f's in a
# (the byte following each match is overwritten with c)
fixnames(a: array of byte, f: string, c: byte)
{
	alen := len a;
	n := alen - len f;
	for(i := 0; i < n; i++) {
		if(amatch(a, i, alen, f)) {
			a[i+len f] = c;
		}
	}
}

# Return 1 if s occurs in a at index i, else 0.  Upper case ASCII letters
# in a are folded to lower case before comparing, so s should be lower case.
amatch(a: array of byte, i, alen: int, s: string) : int
{
	slen := len s;
	for(k := 0; i+k < alen && k < slen; k++) {
		c := int a[i+k];
		if(c >= 'A' && c <= 'Z')
			c = c + (int 'a' - int 'A');
		if(c != s[k])
			break;
	}
	if(k == slen) {
		return 1;
	}
	return 0;
}

# Return the suffix (from the last '.', inclusive) of the final path
# component of url ustr, or "" if that component has no '.'.
getsuff(ustr: string) : string
{
	u := U->makeurl(ustr);
	if(u.path != "") {
		for(i := len u.path - 1; i >= 0; i--) {
			c := u.path[i];
			if(c == '.')
				return u.path[i:];
			if(c == '/')
				break;
		}
	}
	return "";
}

# Return non-zero if c is a blank, tab, newline or carriage return.
iswhite(c: int) : int
{
	return (c==' ' || c=='\t' || c=='\n' || c=='\r');
}

# Add a comment to end of a giving date and source of fetch
# (and the url actually used, actu, when it differs from the one asked for, u).
# Returns a new array; a is not modified.
addfetchcomment(a: array of byte, u, actu: ref ParsedUrl) : array of byte
{
	now := DT->text(DT->local(DT->now()));
	ustr := u.tostring();
	actustr := actu.tostring();
	comment := "\n<!-- Fetched " + now + " from " + ustr;
	if(ustr != actustr)
		comment += ", redirected to " + actustr;
	comment += " -->\n";
	acom := array of byte comment;
	newa := array[len a + len acom] of byte;
	newa[0:] = a;
	newa[len a:] = acom;
	return newa;
}

# Get u, return (error string, body, fd, actual url of source, after redirection).
# body holds only what arrived in the first buffer read by readbytes, after
# the headers; fd, when not nil, is the connection from which the rest of
# the body can still be read.  The error string is "" on success.
# Status codes 300, 301 and 302 with a Location header are followed, up to 10 times.
# The only change made to u is that an empty port is filled in with "80".
httpget(u: ref ParsedUrl) : (string, array of byte, ref Sys->FD, ref ParsedUrl)
{
	ans, body : array of byte;
	restfd: ref Sys->FD;
	req : string;

	for(redir := 0; redir < 10; redir++) {
		if(u.port == "")
			u.port = "80";	# default IP port for HTTP
		if(verbose)
			sys->fprint(stderr, "connecting to %s\n", u.host);
		dialhost, port: string;

		useproxy := httpproxy != nil && need_proxy(u.host);
		if(useproxy) {
			dialhost = httpproxy.host;
			port = httpproxy.port;
			# the config entry may be just "host": use the http default
			if(port == "")
				port = "80";
		}
		else {
			dialhost = u.host;
			port = u.port;
		}
		dest := D->netmkaddr(dialhost, "tcp", port);
		net := D->dial(dest, nil);
		if(net == nil)
			return (sys->sprint("can't dial %s: %r", dest), nil, nil, nil);

		# prepare request
		# Build the "?query" part in a local: u belongs to the caller,
		# who goes on to use u.tostring() and u as a base url.
		query := "";
		if(u.query != "")
			query = "?" + u.query;
		target := "/" + u.path + query;
		if(useproxy) {
			# a proxy has to be given the full url of the resource
			hostport := u.host;
			if(u.port != "80")
				hostport += ":" + u.port;
			target = "http://" + hostport + target;
		}

		if (postbody == nil){
			req = sys->sprint("GET %s HTTP/1.0\r\n"+
				"Host: %s\r\n"+
				"User-agent: Inferno/webgrab\r\n"+
				"Cache-Control: no-cache\r\n"+
				"Pragma: no-cache\r\n\r\n",
				target, u.host);
		}else{
			# Content-length counts bytes, not characters
			pb := array of byte postbody;
			req = sys->sprint("POST %s HTTP/1.0\r\n"+
				"Host: %s\r\n"+
				"Content-type: application/x-www-form-urlencoded\r\n"+
				"Content-length: %d\r\n"+
				"User-agent: Inferno/webgrab\r\n"+
				"\r\n"+"%s",
				target, u.host, len pb, postbody);
		}

		if(verbose)
			sys->fprint(stderr, "writing request: %s\n", req);
		areq := array of byte req;
		n := sys->write(net.dfd, areq, len areq);
		if(n != len areq)
			return (sys->sprint("write problem: %r"), nil, nil, nil);
		(ans, restfd) = readbytes(net.dfd);
		(status, rest) := stripline(ans);
		if(verbose)
			sys->fprint(stderr, "response: %s\n", status);
		(vers, statusrest) := S->splitl(status, " ");
		if(!S->prefix("HTTP/", vers))
			return ("bad reply status: " + status, rest, restfd, nil);
		code := int statusrest;
		location := "";
		body = rest;
		# consume the header lines; the empty line ends them
		for(;;) {
			hline: string;
			(hline, body) = stripline(body);
			if(hline == "")
				break;
			if(verbose > 1)
				sys->fprint(stderr, "%s\n", hline);
			# lines starting with white space are skipped
			if(!iswhite(hline[0])) {
				(hname, hrest) := S->splitl(hline, ":");
				if(hrest != "") {
					hname = S->tolower(hname);
					hval := S->drop(hrest, ": \t");
					hval = S->take(hval, "^ \t");
					if(hname == "location")
						location = hval;
				}
			}
		}
		if(code != 200) {
			if((code == 300 || code == 301 || code == 302) && location != "") {
				# MultipleChoices, MovedPerm, or MovedTemp
				if(verbose)
					sys->fprint(stderr, "redirect to %s\n", location);
				# Location may be relative: resolve it against the
				# url that produced the redirect
				nu := U->makeurl(location);
				nu.makeabsolute(u);
				u = nu;
				continue;
			}
			return ("status not ok: " + status, rest, restfd, u);
		}
		return ("", body, restfd, u);
	}
	# still being redirected after the last permitted attempt
	return ("too many redirects", nil, nil, u);
}


# Return 0 if host h ends with one of the names in noproxydoms,
# 1 otherwise (including when the list is empty: all domains need proxy).
need_proxy(h: string) : int
{
	lh := len h;
	for(doml := noproxydoms; doml != nil; doml = tl doml) {
		dom := hd doml;
		ld := len dom;
		if(lh >= ld && h[lh-ld:] == dom)
			return 0;	# domain is on the noproxy list
	}

	return 1;
}

# Simple guess test for HTML: first non-white byte is '<'
is_html(a: array of byte) : int
{
	for(i := 0; i < len a; i++)
		if(!iswhite(int a[i]))
			break;
	if(i < len a && a[i] == byte '<')
		return 1;
	return 0;
}

# Read from fd until a buffer of Sys->ATOMICIO bytes is full or a read
# returns nothing more.  Returns the bytes read and fd, or nil in place of
# fd when reading stopped because a read returned 0 or failed.
readbytes(fd: ref Sys->FD) : (array of byte, ref Sys->FD)
{
	buf := array[Sys->ATOMICIO] of byte;
	i := 0;
	avail := len buf;
	while (avail > 0) {
		n := sys->read(fd, buf[i:], avail);
		if(n <= 0) {
			fd = nil;
			break;
		}
		i += n;
		avail -= n;
	}
	return (buf[0:i], fd);
}

# Write a, followed by everything that can be read from fd (if fd is not
# nil), to file f; "-" stands for standard output.  Problems are reported
# on stderr.  For a real file the total byte count is reported on stderr.
writebytes(f: string, a: array of byte, fd: ref Sys->FD)
{
	ofd: ref Sys->FD;
	if (f == "-")
		ofd = sys->fildes(1);
	else
		ofd = sys->create(f, Sys->OWRITE, 8r666);
	if(ofd == nil) {
		sys->fprint(stderr, "webgrab: can't create %s: %r\n", f);
		return;
	}
	i := 0;
	clen := len a;
	while(i < clen) {
		n := sys->write(ofd, a[i:], clen-i);
		# a write that makes no progress is treated as an error too,
		# otherwise this loop would never finish
		if(n <= 0) {
			sys->fprint(stderr, "webgrab: write error: %r\n");
			return;
		}
		i += n;
	}
	if(fd != nil) {
		buf := array[Sys->ATOMICIO] of byte;
		while((n := sys->read(fd, buf, len buf)) > 0) {
			if(sys->write(ofd, buf, n) != n) {
				sys->fprint(stderr, "webgrab: write error: %r\n");
				return;
			}
			# count the copied bytes for the report below
			clen += n;
		}
		if(n < 0) {
			sys->fprint(stderr, "webgrab: read error: %r\n");
			return;
		}
	}
	if (f != "-")
		sys->fprint(stderr, "created %s, %d bytes\n", f, clen);
}

# Split b at its first "\r\n": return (the text before it, the bytes after it).
# If b contains no "\r\n", return ("", b).
stripline(b: array of byte) : (string, array of byte)
{
	n := len b - 1;
	for(i := 0; i < n; i++)
		if(b[i] == byte '\r' && b[i+1] == byte '\n')
			return (string b[0:i], b[i+2:]);
	return ("", b);
}

# Print msg on standard error and terminate by raising "fail:error".
error_exit(msg: string)
{
	sys->fprint(sys->fildes(2), "%s\n", msg);
	raise "fail:error";
}