xref: /netbsd-src/external/bsd/fetch/dist/libfetch/http.c (revision 500db002748d9818288e46e10f026a2b09548086)
1 /*	$NetBSD: http.c,v 1.1.1.4 2009/03/10 00:44:23 joerg Exp $	*/
2 /*-
3  * Copyright (c) 2000-2004 Dag-Erling Coïdan Smørgrav
4  * Copyright (c) 2003 Thomas Klausner <wiz@NetBSD.org>
5  * Copyright (c) 2008, 2009 Joerg Sonnenberger <joerg@NetBSD.org>
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer
13  *    in this position and unchanged.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  * 3. The name of the author may not be used to endorse or promote products
18  *    derived from this software without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
21  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
22  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
23  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
24  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
25  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
29  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30  *
31  * $FreeBSD: http.c,v 1.83 2008/02/06 11:39:55 des Exp $
32  */
33 
34 /*
35  * The following copyright applies to the base64 code:
36  *
37  *-
38  * Copyright 1997 Massachusetts Institute of Technology
39  *
40  * Permission to use, copy, modify, and distribute this software and
41  * its documentation for any purpose and without fee is hereby
42  * granted, provided that both the above copyright notice and this
43  * permission notice appear in all copies, that both the above
44  * copyright notice and this permission notice appear in all
45  * supporting documentation, and that the name of M.I.T. not be used
46  * in advertising or publicity pertaining to distribution of the
47  * software without specific, written prior permission.  M.I.T. makes
48  * no representations about the suitability of this software for any
49  * purpose.  It is provided "as is" without express or implied
50  * warranty.
51  *
52  * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''.  M.I.T. DISCLAIMS
53  * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE,
54  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
55  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT
56  * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
57  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
58  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
59  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
60  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
61  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
62  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63  * SUCH DAMAGE.
64  */
65 
66 #ifdef __linux__
67 /* Keep this down to Linux, it can create surprises elsewhere. */
68 #define _GNU_SOURCE
69 #endif
70 
71 #if HAVE_CONFIG_H
72 #include "config.h"
73 #endif
74 #ifndef NETBSD
75 #include <nbcompat.h>
76 #endif
77 
78 #include <sys/types.h>
79 #include <sys/socket.h>
80 
81 #include <ctype.h>
82 #include <errno.h>
83 #include <locale.h>
84 #include <stdarg.h>
85 #ifndef NETBSD
86 #include <nbcompat/netdb.h>
87 #include <nbcompat/stdio.h>
88 #else
89 #include <netdb.h>
90 #include <stdio.h>
91 #endif
92 #include <stdlib.h>
93 #include <string.h>
94 #include <time.h>
95 #include <unistd.h>
96 
97 #include <arpa/inet.h>
98 
99 #include <netinet/in.h>
100 #include <netinet/tcp.h>
101 
102 #include "fetch.h"
103 #include "common.h"
104 #include "httperr.h"
105 
/* Maximum number of redirects to follow */
#define MAX_REDIRECT 5

/* Symbolic names for reply codes we care about */
#define HTTP_OK			200
#define HTTP_PARTIAL		206
#define HTTP_MOVED_PERM		301
#define HTTP_MOVED_TEMP		302
#define HTTP_SEE_OTHER		303
#define HTTP_NOT_MODIFIED	304
#define HTTP_TEMP_REDIRECT	307
#define HTTP_NEED_AUTH		401
#define HTTP_NEED_PROXY_AUTH	407
#define HTTP_BAD_RANGE		416
/* Pseudo code used internally when the status line cannot be parsed */
#define HTTP_PROTOCOL_ERROR	999

/* True for the reply codes that are treated as redirects */
#define HTTP_REDIRECT(xyz) ((xyz) == HTTP_MOVED_PERM \
			    || (xyz) == HTTP_MOVED_TEMP \
			    || (xyz) == HTTP_TEMP_REDIRECT \
			    || (xyz) == HTTP_SEE_OTHER)

/*
 * True for every client (4xx) and server (5xx) error code.  Both bounds
 * are inclusive so that 400 (Bad Request) and 599 are classified as
 * errors as well.
 */
#define HTTP_ERROR(xyz) ((xyz) >= 400 && (xyz) <= 599)
128 
129 
130 /*****************************************************************************
131  * I/O functions for decoding chunked streams
132  */
133 
134 struct httpio
135 {
136 	conn_t		*conn;		/* connection */
137 	int		 chunked;	/* chunked mode */
138 	char		*buf;		/* chunk buffer */
139 	size_t		 bufsize;	/* size of chunk buffer */
140 	ssize_t		 buflen;	/* amount of data currently in buffer */
141 	int		 bufpos;	/* current read offset in buffer */
142 	int		 eof;		/* end-of-file flag */
143 	int		 error;		/* error flag */
144 	size_t		 chunksize;	/* remaining size of current chunk */
145 };
146 
147 /*
148  * Get next chunk header
149  */
150 static int
151 http_new_chunk(struct httpio *io)
152 {
153 	char *p;
154 
155 	if (fetch_getln(io->conn) == -1)
156 		return (-1);
157 
158 	if (io->conn->buflen < 2 || !isxdigit((unsigned char)*io->conn->buf))
159 		return (-1);
160 
161 	for (p = io->conn->buf; *p && !isspace((unsigned char)*p); ++p) {
162 		if (*p == ';')
163 			break;
164 		if (!isxdigit((unsigned char)*p))
165 			return (-1);
166 		if (isdigit((unsigned char)*p)) {
167 			io->chunksize = io->chunksize * 16 +
168 			    *p - '0';
169 		} else {
170 			io->chunksize = io->chunksize * 16 +
171 			    10 + tolower((unsigned char)*p) - 'a';
172 		}
173 	}
174 
175 	return (io->chunksize);
176 }
177 
178 /*
179  * Grow the input buffer to at least len bytes
180  */
181 static int
182 http_growbuf(struct httpio *io, size_t len)
183 {
184 	char *tmp;
185 
186 	if (io->bufsize >= len)
187 		return (0);
188 
189 	if ((tmp = realloc(io->buf, len)) == NULL)
190 		return (-1);
191 	io->buf = tmp;
192 	io->bufsize = len;
193 	return (0);
194 }
195 
196 /*
197  * Fill the input buffer, do chunk decoding on the fly
198  */
199 static int
200 http_fillbuf(struct httpio *io, size_t len)
201 {
202 	if (io->error)
203 		return (-1);
204 	if (io->eof)
205 		return (0);
206 
207 	if (io->chunked == 0) {
208 		if (http_growbuf(io, len) == -1)
209 			return (-1);
210 		if ((io->buflen = fetch_read(io->conn, io->buf, len)) == -1) {
211 			io->error = 1;
212 			return (-1);
213 		}
214 		io->bufpos = 0;
215 		return (io->buflen);
216 	}
217 
218 	if (io->chunksize == 0) {
219 		switch (http_new_chunk(io)) {
220 		case -1:
221 			io->error = 1;
222 			return (-1);
223 		case 0:
224 			io->eof = 1;
225 			return (0);
226 		}
227 	}
228 
229 	if (len > io->chunksize)
230 		len = io->chunksize;
231 	if (http_growbuf(io, len) == -1)
232 		return (-1);
233 	if ((io->buflen = fetch_read(io->conn, io->buf, len)) == -1) {
234 		io->error = 1;
235 		return (-1);
236 	}
237 	io->chunksize -= io->buflen;
238 
239 	if (io->chunksize == 0) {
240 		char endl[2];
241 		ssize_t len2;
242 
243 		len2 = fetch_read(io->conn, endl, 2);
244 		if (len2 == 1 && fetch_read(io->conn, endl + 1, 1) != 1)
245 			return (-1);
246 		if (len2 == -1 || endl[0] != '\r' || endl[1] != '\n')
247 			return (-1);
248 	}
249 
250 	io->bufpos = 0;
251 
252 	return (io->buflen);
253 }
254 
255 /*
256  * Read function
257  */
258 static ssize_t
259 http_readfn(void *v, void *buf, size_t len)
260 {
261 	struct httpio *io = (struct httpio *)v;
262 	size_t l, pos;
263 
264 	if (io->error)
265 		return (-1);
266 	if (io->eof)
267 		return (0);
268 
269 	for (pos = 0; len > 0; pos += l, len -= l) {
270 		/* empty buffer */
271 		if (!io->buf || io->bufpos == io->buflen)
272 			if (http_fillbuf(io, len) < 1)
273 				break;
274 		l = io->buflen - io->bufpos;
275 		if (len < l)
276 			l = len;
277 		memcpy((char *)buf + pos, io->buf + io->bufpos, l);
278 		io->bufpos += l;
279 	}
280 
281 	if (!pos && io->error)
282 		return (-1);
283 	return (pos);
284 }
285 
286 /*
287  * Write function
288  */
289 static ssize_t
290 http_writefn(void *v, const void *buf, size_t len)
291 {
292 	struct httpio *io = (struct httpio *)v;
293 
294 	return (fetch_write(io->conn, buf, len));
295 }
296 
297 /*
298  * Close function
299  */
300 static void
301 http_closefn(void *v)
302 {
303 	struct httpio *io = (struct httpio *)v;
304 
305 	fetch_close(io->conn);
306 	if (io->buf)
307 		free(io->buf);
308 	free(io);
309 }
310 
311 /*
312  * Wrap a file descriptor up
313  */
314 static fetchIO *
315 http_funopen(conn_t *conn, int chunked)
316 {
317 	struct httpio *io;
318 	fetchIO *f;
319 
320 	if ((io = calloc(1, sizeof(*io))) == NULL) {
321 		fetch_syserr();
322 		return (NULL);
323 	}
324 	io->conn = conn;
325 	io->chunked = chunked;
326 	f = fetchIO_unopen(io, http_readfn, http_writefn, http_closefn);
327 	if (f == NULL) {
328 		fetch_syserr();
329 		free(io);
330 		return (NULL);
331 	}
332 	return (f);
333 }
334 
335 
336 /*****************************************************************************
337  * Helper functions for talking to the server and parsing its replies
338  */
339 
/*
 * Header types.  Values at or below hdr_end terminate the header loop;
 * everything above identifies a header line.
 */
typedef enum {
	hdr_syserror		= -2,	/* reading the line failed */
	hdr_error		= -1,	/* reported as a protocol error */
	hdr_end			= 0,	/* empty line: end of headers */
	hdr_unknown		= 1,	/* a header we do not look at */
	hdr_content_length,
	hdr_content_range,
	hdr_last_modified,
	hdr_location,
	hdr_transfer_encoding,
	hdr_www_authenticate
} hdr_t;
353 
354 /* Names of interesting headers */
355 static struct {
356 	hdr_t		 num;
357 	const char	*name;
358 } hdr_names[] = {
359 	{ hdr_content_length,		"Content-Length" },
360 	{ hdr_content_range,		"Content-Range" },
361 	{ hdr_last_modified,		"Last-Modified" },
362 	{ hdr_location,			"Location" },
363 	{ hdr_transfer_encoding,	"Transfer-Encoding" },
364 	{ hdr_www_authenticate,		"WWW-Authenticate" },
365 	{ hdr_unknown,			NULL },
366 };
367 
368 /*
369  * Send a formatted line; optionally echo to terminal
370  */
371 static int
372 http_cmd(conn_t *conn, const char *fmt, ...)
373 {
374 	va_list ap;
375 	size_t len;
376 	char *msg;
377 	int r;
378 
379 	va_start(ap, fmt);
380 	len = vasprintf(&msg, fmt, ap);
381 	va_end(ap);
382 
383 	if (msg == NULL) {
384 		errno = ENOMEM;
385 		fetch_syserr();
386 		return (-1);
387 	}
388 
389 	r = fetch_putln(conn, msg, len);
390 	free(msg);
391 
392 	if (r == -1) {
393 		fetch_syserr();
394 		return (-1);
395 	}
396 
397 	return (0);
398 }
399 
400 /*
401  * Get and parse status line
402  */
403 static int
404 http_get_reply(conn_t *conn)
405 {
406 	char *p;
407 
408 	if (fetch_getln(conn) == -1)
409 		return (-1);
410 	/*
411 	 * A valid status line looks like "HTTP/m.n xyz reason" where m
412 	 * and n are the major and minor protocol version numbers and xyz
413 	 * is the reply code.
414 	 * Unfortunately, there are servers out there (NCSA 1.5.1, to name
415 	 * just one) that do not send a version number, so we can't rely
416 	 * on finding one, but if we do, insist on it being 1.0 or 1.1.
417 	 * We don't care about the reason phrase.
418 	 */
419 	if (strncmp(conn->buf, "HTTP", 4) != 0)
420 		return (HTTP_PROTOCOL_ERROR);
421 	p = conn->buf + 4;
422 	if (*p == '/') {
423 		if (p[1] != '1' || p[2] != '.' || (p[3] != '0' && p[3] != '1'))
424 			return (HTTP_PROTOCOL_ERROR);
425 		p += 4;
426 	}
427 	if (*p != ' ' ||
428 	    !isdigit((unsigned char)p[1]) ||
429 	    !isdigit((unsigned char)p[2]) ||
430 	    !isdigit((unsigned char)p[3]))
431 		return (HTTP_PROTOCOL_ERROR);
432 
433 	conn->err = (p[1] - '0') * 100 + (p[2] - '0') * 10 + (p[3] - '0');
434 	return (conn->err);
435 }
436 
/*
 * Check a header; if the type matches the given string, return a pointer
 * to the beginning of the value.
 *
 * str is the header name to look for and hdr the complete header line.
 * The name is compared case-insensitively and has to be followed
 * directly by a colon.  Every character of the name is checked,
 * including the last one.
 *
 * Returns a pointer into hdr at the first non-whitespace character
 * after the colon (possibly the terminating NUL), or NULL if the line
 * is not the requested header.
 */
static const char *
http_match(const char *str, const char *hdr)
{
	for (; *str != '\0'; ++str, ++hdr) {
		/* a short hdr fails here as well: NUL never equals *str */
		if (tolower((unsigned char)*str) !=
		    tolower((unsigned char)*hdr))
			return (NULL);
	}
	if (*hdr != ':')
		return (NULL);
	/* skip the colon and any whitespace in front of the value */
	++hdr;
	while (isspace((unsigned char)*hdr))
		++hdr;
	return (hdr);
}
453 
454 /*
455  * Get the next header and return the appropriate symbolic code.
456  */
457 static hdr_t
458 http_next_header(conn_t *conn, const char **p)
459 {
460 	int i;
461 
462 	if (fetch_getln(conn) == -1)
463 		return (hdr_syserror);
464 	while (conn->buflen && isspace((unsigned char)conn->buf[conn->buflen - 1]))
465 		conn->buflen--;
466 	conn->buf[conn->buflen] = '\0';
467 	if (conn->buflen == 0)
468 		return (hdr_end);
469 	/*
470 	 * We could check for malformed headers but we don't really care.
471 	 * A valid header starts with a token immediately followed by a
472 	 * colon; a token is any sequence of non-control, non-whitespace
473 	 * characters except "()<>@,;:\\\"{}".
474 	 */
475 	for (i = 0; hdr_names[i].num != hdr_unknown; i++)
476 		if ((*p = http_match(hdr_names[i].name, conn->buf)) != NULL)
477 			return (hdr_names[i].num);
478 	return (hdr_unknown);
479 }
480 
/*
 * Parse a last-modified header
 *
 * p must hold a date of the form "Sun, 06 Nov 1994 08:49:37 GMT".
 * LC_TIME is switched to "C" while parsing so that day and month names
 * are matched in English, and is restored afterwards.  If the current
 * locale name cannot be saved (setlocale() returns NULL or the name
 * does not fit the local buffer) the locale is left alone and the date
 * is parsed under whatever locale is active.
 *
 * Returns 0 and stores the timestamp in *mtime on success, or -1 with
 * *mtime untouched if the date cannot be parsed.
 */
static int
http_parse_mtime(const char *p, time_t *mtime)
{
	char locale[64];
	const char *cur, *r;
	struct tm tm;
	size_t n;
	int switched;

	/*
	 * Copy the name: the string returned by setlocale() may be
	 * overwritten by the next call.
	 */
	switched = 0;
	cur = setlocale(LC_TIME, NULL);
	if (cur != NULL && (n = strlen(cur)) < sizeof(locale)) {
		memcpy(locale, cur, n + 1);
		setlocale(LC_TIME, "C");
		switched = 1;
	}

	/* strptime() only fills in the fields named by the format */
	memset(&tm, 0, sizeof(tm));
	r = strptime(p, "%a, %d %b %Y %H:%M:%S GMT", &tm);
	/* XXX should add support for date-2 and date-3 */
	if (switched)
		setlocale(LC_TIME, locale);
	if (r == NULL)
		return (-1);
	*mtime = timegm(&tm);
	return (0);
}
500 
/*
 * Parse a non-empty run of decimal digits at *pp into an off_t.
 *
 * On success *val receives the number, *pp is advanced past the last
 * digit and 0 is returned.  Returns -1, leaving *pp and *val
 * untouched, if there is no digit at *pp or the number does not fit
 * into an off_t.
 */
static int
http_parse_off(const char **pp, off_t *val)
{
	/*
	 * Largest off_t value; assumes 8-bit bytes and an off_t that is
	 * no wider than unsigned long long.
	 */
	const off_t max = (off_t)((~(unsigned long long)0) >>
	    (8 * (sizeof(unsigned long long) - sizeof(off_t)) + 1));
	const char *p = *pp;
	off_t v = 0;
	int digit;

	if (!isdigit((unsigned char)*p))
		return (-1);
	for (; isdigit((unsigned char)*p); ++p) {
		digit = *p - '0';
		/* v * 10 + digit must not exceed max */
		if (v > (max - digit) / 10)
			return (-1);
		v = v * 10 + digit;
	}
	*pp = p;
	*val = v;
	return (0);
}

/*
 * Parse a content-length header
 *
 * The value must consist of decimal digits only.  Returns 0 and stores
 * the number in *length, or -1 (leaving *length untouched) if the
 * value is empty, contains anything else, or overflows off_t.
 */
static int
http_parse_length(const char *p, off_t *length)
{
	off_t len;

	if (http_parse_off(&p, &len) == -1 || *p != '\0')
		return (-1);
	*length = len;
	return (0);
}

/*
 * Parse a content-range header
 *
 * Accepted forms are "bytes first-last/size" and "bytes * /size"
 * (written here with a space to keep the comment well-formed; the
 * header has none).  On success *offset receives first, *length the
 * number of bytes in the range and *size the total size; for the "*"
 * form *offset is -1 and *length is 0.
 *
 * Returns 0 on success, or -1 with the outputs untouched if the value
 * is malformed, a number overflows off_t, first is greater than last,
 * or the range does not fit into size.
 */
static int
http_parse_range(const char *p, off_t *offset, off_t *length, off_t *size)
{
	off_t first, last, len;

	if (strncasecmp(p, "bytes ", 6) != 0)
		return (-1);
	p += 6;
	if (*p == '*') {
		first = last = -1;
		++p;
	} else {
		if (http_parse_off(&p, &first) == -1 || *p != '-')
			return (-1);
		++p;
		if (http_parse_off(&p, &last) == -1)
			return (-1);
	}
	if (first > last || *p != '/')
		return (-1);
	++p;
	if (http_parse_off(&p, &len) == -1 || *p != '\0')
		return (-1);
	/* same as len < last - first + 1, minus the overflow-prone +1 */
	if (len <= last - first)
		return (-1);
	if (first == -1)
		*length = 0;
	else
		*length = last - first + 1;
	*offset = first;
	*size = len;
	return (0);
}
553 
554 
555 /*****************************************************************************
556  * Helper functions for authorization
557  */
558 
/*
 * Base64 encoding
 *
 * Encodes the NUL-terminated string src using the standard alphabet
 * with '=' padding.  Input bytes are read as unsigned values so that
 * bytes of 0x80 and above (for example UTF-8 sequences) are encoded
 * correctly.
 *
 * Returns a newly allocated NUL-terminated string which the caller
 * must free, or NULL if the allocation fails.
 */
static char *
http_base64(const char *src)
{
	static const char base64[] =
	    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
	    "abcdefghijklmnopqrstuvwxyz"
	    "0123456789+/";
	const unsigned char *in = (const unsigned char *)src;
	char *str, *dst;
	size_t l;
	unsigned int t;

	l = strlen(src);
	/* four output characters per (partial) group of three bytes */
	if ((str = malloc(((l + 2) / 3) * 4 + 1)) == NULL)
		return (NULL);
	dst = str;

	for (; l >= 3; in += 3, l -= 3, dst += 4) {
		t = ((unsigned int)in[0] << 16) |
		    ((unsigned int)in[1] << 8) | in[2];
		dst[0] = base64[(t >> 18) & 0x3f];
		dst[1] = base64[(t >> 12) & 0x3f];
		dst[2] = base64[(t >> 6) & 0x3f];
		dst[3] = base64[t & 0x3f];
	}

	/* encode the one or two left-over bytes and pad the group */
	switch (l) {
	case 2:
		t = ((unsigned int)in[0] << 16) | ((unsigned int)in[1] << 8);
		dst[0] = base64[(t >> 18) & 0x3f];
		dst[1] = base64[(t >> 12) & 0x3f];
		dst[2] = base64[(t >> 6) & 0x3f];
		dst[3] = '=';
		dst += 4;
		break;
	case 1:
		t = (unsigned int)in[0] << 16;
		dst[0] = base64[(t >> 18) & 0x3f];
		dst[1] = base64[(t >> 12) & 0x3f];
		dst[2] = dst[3] = '=';
		dst += 4;
		break;
	default:
		break;
	}

	*dst = '\0';
	return (str);
}
614 
615 /*
616  * Encode username and password
617  */
618 static int
619 http_basic_auth(conn_t *conn, const char *hdr, const char *usr, const char *pwd)
620 {
621 	char *upw, *auth;
622 	int r;
623 
624 	if (asprintf(&upw, "%s:%s", usr, pwd) == -1)
625 		return (-1);
626 	auth = http_base64(upw);
627 	free(upw);
628 	if (auth == NULL)
629 		return (-1);
630 	r = http_cmd(conn, "%s: Basic %s", hdr, auth);
631 	free(auth);
632 	return (r);
633 }
634 
635 /*
636  * Send an authorization header
637  */
638 static int
639 http_authorize(conn_t *conn, const char *hdr, const char *p)
640 {
641 	/* basic authorization */
642 	if (strncasecmp(p, "basic:", 6) == 0) {
643 		char *user, *pwd, *str;
644 		int r;
645 
646 		/* skip realm */
647 		for (p += 6; *p && *p != ':'; ++p)
648 			/* nothing */ ;
649 		if (!*p || strchr(++p, ':') == NULL)
650 			return (-1);
651 		if ((str = strdup(p)) == NULL)
652 			return (-1); /* XXX */
653 		user = str;
654 		pwd = strchr(str, ':');
655 		*pwd++ = '\0';
656 		r = http_basic_auth(conn, hdr, user, pwd);
657 		free(str);
658 		return (r);
659 	}
660 	return (-1);
661 }
662 
663 
664 /*****************************************************************************
665  * Helper functions for connecting to a server or proxy
666  */
667 
668 /*
669  * Connect to the correct HTTP server or proxy.
670  */
671 static conn_t *
672 http_connect(struct url *URL, struct url *purl, const char *flags)
673 {
674 	conn_t *conn;
675 	int af, verbose;
676 #ifdef TCP_NOPUSH
677 	int val;
678 #endif
679 
680 #ifdef INET6
681 	af = AF_UNSPEC;
682 #else
683 	af = AF_INET;
684 #endif
685 
686 	verbose = CHECK_FLAG('v');
687 	if (CHECK_FLAG('4'))
688 		af = AF_INET;
689 #ifdef INET6
690 	else if (CHECK_FLAG('6'))
691 		af = AF_INET6;
692 #endif
693 
694 	if (purl && strcasecmp(URL->scheme, SCHEME_HTTPS) != 0) {
695 		URL = purl;
696 	} else if (strcasecmp(URL->scheme, SCHEME_FTP) == 0) {
697 		/* can't talk http to an ftp server */
698 		/* XXX should set an error code */
699 		return (NULL);
700 	}
701 
702 	if ((conn = fetch_connect(URL->host, URL->port, af, verbose)) == NULL)
703 		/* fetch_connect() has already set an error code */
704 		return (NULL);
705 	if (strcasecmp(URL->scheme, SCHEME_HTTPS) == 0 &&
706 	    fetch_ssl(conn, verbose) == -1) {
707 		fetch_close(conn);
708 		/* grrr */
709 #ifdef EAUTH
710 		errno = EAUTH;
711 #else
712 		errno = EPERM;
713 #endif
714 		fetch_syserr();
715 		return (NULL);
716 	}
717 
718 #ifdef TCP_NOPUSH
719 	val = 1;
720 	setsockopt(conn->sd, IPPROTO_TCP, TCP_NOPUSH, &val, sizeof(val));
721 #endif
722 
723 	return (conn);
724 }
725 
726 static struct url *
727 http_get_proxy(struct url * url, const char *flags)
728 {
729 	struct url *purl;
730 	char *p;
731 
732 	if (flags != NULL && strchr(flags, 'd') != NULL)
733 		return (NULL);
734 	if (fetch_no_proxy_match(url->host))
735 		return (NULL);
736 	if (((p = getenv("HTTP_PROXY")) || (p = getenv("http_proxy"))) &&
737 	    *p && (purl = fetchParseURL(p))) {
738 		if (!*purl->scheme)
739 			strcpy(purl->scheme, SCHEME_HTTP);
740 		if (!purl->port)
741 			purl->port = fetch_default_proxy_port(purl->scheme);
742 		if (strcasecmp(purl->scheme, SCHEME_HTTP) == 0)
743 			return (purl);
744 		fetchFreeURL(purl);
745 	}
746 	return (NULL);
747 }
748 
749 static void
750 set_if_modified_since(conn_t *conn, time_t last_modified)
751 {
752 	static const char weekdays[] = "SunMonTueWedThuFriSat";
753 	static const char months[] = "JanFebMarAprMayJunJulAugSepOctNovDec";
754 	struct tm tm;
755 	char buf[80];
756 	gmtime_r(&last_modified, &tm);
757 	snprintf(buf, sizeof(buf), "%.3s, %02d %.3s %4d %02d:%02d:%02d GMT",
758 	    weekdays + tm.tm_wday * 3, tm.tm_mday, months + tm.tm_mon * 3,
759 	    tm.tm_year + 1900, tm.tm_hour, tm.tm_min, tm.tm_sec);
760 	http_cmd(conn, "If-Modified-Since: %s", buf);
761 }
762 
763 
764 /*****************************************************************************
765  * Core
766  */
767 
768 /*
769  * Send a request and process the reply
770  *
771  * XXX This function is way too long, the do..while loop should be split
772  * XXX off into a separate function.
773  */
774 fetchIO *
775 http_request(struct url *URL, const char *op, struct url_stat *us,
776     struct url *purl, const char *flags)
777 {
778 	conn_t *conn;
779 	struct url *url, *new;
780 	int chunked, direct, if_modified_since, need_auth, noredirect, verbose;
781 	int e, i, n, val;
782 	off_t offset, clength, length, size;
783 	time_t mtime;
784 	const char *p;
785 	fetchIO *f;
786 	hdr_t h;
787 	char hbuf[URL_HOSTLEN + 7], *host;
788 
789 	direct = CHECK_FLAG('d');
790 	noredirect = CHECK_FLAG('A');
791 	verbose = CHECK_FLAG('v');
792 	if_modified_since = CHECK_FLAG('i');
793 
794 	if (direct && purl) {
795 		fetchFreeURL(purl);
796 		purl = NULL;
797 	}
798 
799 	/* try the provided URL first */
800 	url = URL;
801 
802 	/* if the A flag is set, we only get one try */
803 	n = noredirect ? 1 : MAX_REDIRECT;
804 	i = 0;
805 
806 	e = HTTP_PROTOCOL_ERROR;
807 	need_auth = 0;
808 	do {
809 		new = NULL;
810 		chunked = 0;
811 		offset = 0;
812 		clength = -1;
813 		length = -1;
814 		size = -1;
815 		mtime = 0;
816 
817 		/* check port */
818 		if (!url->port)
819 			url->port = fetch_default_port(url->scheme);
820 
821 		/* were we redirected to an FTP URL? */
822 		if (purl == NULL && strcmp(url->scheme, SCHEME_FTP) == 0) {
823 			if (strcmp(op, "GET") == 0)
824 				return (ftp_request(url, "RETR", NULL, us, purl, flags));
825 			else if (strcmp(op, "HEAD") == 0)
826 				return (ftp_request(url, "STAT", NULL, us, purl, flags));
827 		}
828 
829 		/* connect to server or proxy */
830 		if ((conn = http_connect(url, purl, flags)) == NULL)
831 			goto ouch;
832 
833 		host = url->host;
834 #ifdef INET6
835 		if (strchr(url->host, ':')) {
836 			snprintf(hbuf, sizeof(hbuf), "[%s]", url->host);
837 			host = hbuf;
838 		}
839 #endif
840 		if (url->port != fetch_default_port(url->scheme)) {
841 			if (host != hbuf) {
842 				strcpy(hbuf, host);
843 				host = hbuf;
844 			}
845 			snprintf(hbuf + strlen(hbuf),
846 			    sizeof(hbuf) - strlen(hbuf), ":%d", url->port);
847 		}
848 
849 		/* send request */
850 		if (verbose)
851 			fetch_info("requesting %s://%s%s",
852 			    url->scheme, host, url->doc);
853 		if (purl) {
854 			http_cmd(conn, "%s %s://%s%s HTTP/1.1",
855 			    op, url->scheme, host, url->doc);
856 		} else {
857 			http_cmd(conn, "%s %s HTTP/1.1",
858 			    op, url->doc);
859 		}
860 
861 		if (if_modified_since && url->last_modified > 0)
862 			set_if_modified_since(conn, url->last_modified);
863 
864 		/* virtual host */
865 		http_cmd(conn, "Host: %s", host);
866 
867 		/* proxy authorization */
868 		if (purl) {
869 			if (*purl->user || *purl->pwd)
870 				http_basic_auth(conn, "Proxy-Authorization",
871 				    purl->user, purl->pwd);
872 			else if ((p = getenv("HTTP_PROXY_AUTH")) != NULL && *p != '\0')
873 				http_authorize(conn, "Proxy-Authorization", p);
874 		}
875 
876 		/* server authorization */
877 		if (need_auth || *url->user || *url->pwd) {
878 			if (*url->user || *url->pwd)
879 				http_basic_auth(conn, "Authorization", url->user, url->pwd);
880 			else if ((p = getenv("HTTP_AUTH")) != NULL && *p != '\0')
881 				http_authorize(conn, "Authorization", p);
882 			else if (fetchAuthMethod && fetchAuthMethod(url) == 0) {
883 				http_basic_auth(conn, "Authorization", url->user, url->pwd);
884 			} else {
885 				http_seterr(HTTP_NEED_AUTH);
886 				goto ouch;
887 			}
888 		}
889 
890 		/* other headers */
891 		if ((p = getenv("HTTP_REFERER")) != NULL && *p != '\0') {
892 			if (strcasecmp(p, "auto") == 0)
893 				http_cmd(conn, "Referer: %s://%s%s",
894 				    url->scheme, host, url->doc);
895 			else
896 				http_cmd(conn, "Referer: %s", p);
897 		}
898 		if ((p = getenv("HTTP_USER_AGENT")) != NULL && *p != '\0')
899 			http_cmd(conn, "User-Agent: %s", p);
900 		else
901 			http_cmd(conn, "User-Agent: %s ", _LIBFETCH_VER);
902 		if (url->offset > 0)
903 			http_cmd(conn, "Range: bytes=%lld-", (long long)url->offset);
904 		http_cmd(conn, "Connection: close");
905 		http_cmd(conn, "");
906 
907 		/*
908 		 * Force the queued request to be dispatched.  Normally, one
909 		 * would do this with shutdown(2) but squid proxies can be
910 		 * configured to disallow such half-closed connections.  To
911 		 * be compatible with such configurations, fiddle with socket
912 		 * options to force the pending data to be written.
913 		 */
914 #ifdef TCP_NOPUSH
915 		val = 0;
916 		setsockopt(conn->sd, IPPROTO_TCP, TCP_NOPUSH, &val,
917 			   sizeof(val));
918 #endif
919 		val = 1;
920 		setsockopt(conn->sd, IPPROTO_TCP, TCP_NODELAY, &val,
921 			   sizeof(val));
922 
923 		/* get reply */
924 		switch (http_get_reply(conn)) {
925 		case HTTP_OK:
926 		case HTTP_PARTIAL:
927 		case HTTP_NOT_MODIFIED:
928 			/* fine */
929 			break;
930 		case HTTP_MOVED_PERM:
931 		case HTTP_MOVED_TEMP:
932 		case HTTP_SEE_OTHER:
933 			/*
934 			 * Not so fine, but we still have to read the
935 			 * headers to get the new location.
936 			 */
937 			break;
938 		case HTTP_NEED_AUTH:
939 			if (need_auth) {
940 				/*
941 				 * We already sent out authorization code,
942 				 * so there's nothing more we can do.
943 				 */
944 				http_seterr(conn->err);
945 				goto ouch;
946 			}
947 			/* try again, but send the password this time */
948 			if (verbose)
949 				fetch_info("server requires authorization");
950 			break;
951 		case HTTP_NEED_PROXY_AUTH:
952 			/*
953 			 * If we're talking to a proxy, we already sent
954 			 * our proxy authorization code, so there's
955 			 * nothing more we can do.
956 			 */
957 			http_seterr(conn->err);
958 			goto ouch;
959 		case HTTP_BAD_RANGE:
960 			/*
961 			 * This can happen if we ask for 0 bytes because
962 			 * we already have the whole file.  Consider this
963 			 * a success for now, and check sizes later.
964 			 */
965 			break;
966 		case HTTP_PROTOCOL_ERROR:
967 			/* fall through */
968 		case -1:
969 			fetch_syserr();
970 			goto ouch;
971 		default:
972 			http_seterr(conn->err);
973 			if (!verbose)
974 				goto ouch;
975 			/* fall through so we can get the full error message */
976 		}
977 
978 		/* get headers */
979 		do {
980 			switch ((h = http_next_header(conn, &p))) {
981 			case hdr_syserror:
982 				fetch_syserr();
983 				goto ouch;
984 			case hdr_error:
985 				http_seterr(HTTP_PROTOCOL_ERROR);
986 				goto ouch;
987 			case hdr_content_length:
988 				http_parse_length(p, &clength);
989 				break;
990 			case hdr_content_range:
991 				http_parse_range(p, &offset, &length, &size);
992 				break;
993 			case hdr_last_modified:
994 				http_parse_mtime(p, &mtime);
995 				break;
996 			case hdr_location:
997 				if (!HTTP_REDIRECT(conn->err))
998 					break;
999 				if (new)
1000 					free(new);
1001 				if (verbose)
1002 					fetch_info("%d redirect to %s", conn->err, p);
1003 				if (*p == '/')
1004 					/* absolute path */
1005 					new = fetchMakeURL(url->scheme, url->host, url->port, p,
1006 					    url->user, url->pwd);
1007 				else
1008 					new = fetchParseURL(p);
1009 				if (new == NULL) {
1010 					/* XXX should set an error code */
1011 					goto ouch;
1012 				}
1013 				if (!*new->user && !*new->pwd) {
1014 					strcpy(new->user, url->user);
1015 					strcpy(new->pwd, url->pwd);
1016 				}
1017 				new->offset = url->offset;
1018 				new->length = url->length;
1019 				break;
1020 			case hdr_transfer_encoding:
1021 				/* XXX weak test*/
1022 				chunked = (strcasecmp(p, "chunked") == 0);
1023 				break;
1024 			case hdr_www_authenticate:
1025 				if (conn->err != HTTP_NEED_AUTH)
1026 					break;
1027 				/* if we were smarter, we'd check the method and realm */
1028 				break;
1029 			case hdr_end:
1030 				/* fall through */
1031 			case hdr_unknown:
1032 				/* ignore */
1033 				break;
1034 			}
1035 		} while (h > hdr_end);
1036 
1037 		/* we need to provide authentication */
1038 		if (conn->err == HTTP_NEED_AUTH) {
1039 			e = conn->err;
1040 			need_auth = 1;
1041 			fetch_close(conn);
1042 			conn = NULL;
1043 			continue;
1044 		}
1045 
1046 		/* requested range not satisfiable */
1047 		if (conn->err == HTTP_BAD_RANGE) {
1048 			if (url->offset == size && url->length == 0) {
1049 				/* asked for 0 bytes; fake it */
1050 				offset = url->offset;
1051 				conn->err = HTTP_OK;
1052 				break;
1053 			} else {
1054 				http_seterr(conn->err);
1055 				goto ouch;
1056 			}
1057 		}
1058 
1059 		/* we have a hit or an error */
1060 		if (conn->err == HTTP_OK ||
1061 		    conn->err == HTTP_PARTIAL ||
1062 		    conn->err == HTTP_NOT_MODIFIED ||
1063 		    HTTP_ERROR(conn->err))
1064 			break;
1065 
1066 		/* all other cases: we got a redirect */
1067 		e = conn->err;
1068 		need_auth = 0;
1069 		fetch_close(conn);
1070 		conn = NULL;
1071 		if (!new)
1072 			break;
1073 		if (url != URL)
1074 			fetchFreeURL(url);
1075 		url = new;
1076 	} while (++i < n);
1077 
1078 	/* we failed, or ran out of retries */
1079 	if (conn == NULL) {
1080 		http_seterr(e);
1081 		goto ouch;
1082 	}
1083 
1084 	/* check for inconsistencies */
1085 	if (clength != -1 && length != -1 && clength != length) {
1086 		http_seterr(HTTP_PROTOCOL_ERROR);
1087 		goto ouch;
1088 	}
1089 	if (clength == -1)
1090 		clength = length;
1091 	if (clength != -1)
1092 		length = offset + clength;
1093 	if (length != -1 && size != -1 && length != size) {
1094 		http_seterr(HTTP_PROTOCOL_ERROR);
1095 		goto ouch;
1096 	}
1097 	if (size == -1)
1098 		size = length;
1099 
1100 	/* fill in stats */
1101 	if (us) {
1102 		us->size = size;
1103 		us->atime = us->mtime = mtime;
1104 	}
1105 
1106 	/* too far? */
1107 	if (URL->offset > 0 && offset > URL->offset) {
1108 		http_seterr(HTTP_PROTOCOL_ERROR);
1109 		goto ouch;
1110 	}
1111 
1112 	/* report back real offset and size */
1113 	URL->offset = offset;
1114 	URL->length = clength;
1115 
1116 	if (conn->err == HTTP_NOT_MODIFIED) {
1117 		http_seterr(HTTP_NOT_MODIFIED);
1118 		return (NULL);
1119 	}
1120 
1121 	/* wrap it up in a fetchIO */
1122 	if ((f = http_funopen(conn, chunked)) == NULL) {
1123 		fetch_syserr();
1124 		goto ouch;
1125 	}
1126 
1127 	if (url != URL)
1128 		fetchFreeURL(url);
1129 	if (purl)
1130 		fetchFreeURL(purl);
1131 
1132 	if (HTTP_ERROR(conn->err)) {
1133 		fetchIO_close(f);
1134 		f = NULL;
1135 	}
1136 
1137 	return (f);
1138 
1139 ouch:
1140 	if (url != URL)
1141 		fetchFreeURL(url);
1142 	if (purl)
1143 		fetchFreeURL(purl);
1144 	if (conn != NULL)
1145 		fetch_close(conn);
1146 	return (NULL);
1147 }
1148 
1149 
1150 /*****************************************************************************
1151  * Entry points
1152  */
1153 
1154 /*
1155  * Retrieve and stat a file by HTTP
1156  */
1157 fetchIO *
1158 fetchXGetHTTP(struct url *URL, struct url_stat *us, const char *flags)
1159 {
1160 	return (http_request(URL, "GET", us, http_get_proxy(URL, flags), flags));
1161 }
1162 
1163 /*
1164  * Retrieve a file by HTTP
1165  */
1166 fetchIO *
1167 fetchGetHTTP(struct url *URL, const char *flags)
1168 {
1169 	return (fetchXGetHTTP(URL, NULL, flags));
1170 }
1171 
1172 /*
1173  * Store a file by HTTP
1174  */
1175 fetchIO *
1176 fetchPutHTTP(struct url *URL, const char *flags)
1177 {
1178 	fprintf(stderr, "fetchPutHTTP(): not implemented\n");
1179 	return (NULL);
1180 }
1181 
1182 /*
1183  * Get an HTTP document's metadata
1184  */
1185 int
1186 fetchStatHTTP(struct url *URL, struct url_stat *us, const char *flags)
1187 {
1188 	fetchIO *f;
1189 
1190 	f = http_request(URL, "HEAD", us, http_get_proxy(URL, flags), flags);
1191 	if (f == NULL)
1192 		return (-1);
1193 	fetchIO_close(f);
1194 	return (0);
1195 }
1196 
/*
 * States of the HTML index scanner driven by parse_index().  The
 * scanner only cares about the href attribute of <a> tags; every
 * other piece of markup is skipped.
 */
enum http_states {
	ST_NONE,	/* plain text, not inside markup */
	ST_LT,		/* "<" seen */
	ST_LTA,		/* "<a" seen */
	ST_TAGA,	/* inside an a-tag, "<a " seen, between keywords */
	ST_H,		/* "<a h" seen */
	ST_R,		/* "<a hr" seen */
	ST_E,		/* "<a hre" seen */
	ST_F,		/* "<a href" seen */
	ST_HREF,	/* "<a href=" seen */
	ST_HREFQ,	/* inside the quoted href value */
	ST_TAG,		/* inside a tag that is not an a-tag; disregarded */
	ST_TAGAX,	/* inside an unknown keyword of an a-tag */
	ST_TAGAQ	/* inside a quoted argument of an a-tag other than href */
};
1212 
1213 struct index_parser {
1214 	struct url_list *ue;
1215 	struct url *url;
1216 	enum http_states state;
1217 };
1218 
1219 static size_t
1220 parse_index(struct index_parser *parser, const char *buf, size_t len)
1221 {
1222 	char *end_attr, p = *buf;
1223 
1224 	switch (parser->state) {
1225 	case ST_NONE:
1226 		/* Plain text, not in markup */
1227 		if (p == '<')
1228 			parser->state = ST_LT;
1229 		return 1;
1230 	case ST_LT:
1231 		/* In tag -- "<" already found */
1232 		if (p == '>')
1233 			parser->state = ST_NONE;
1234 		else if (p == 'a' || p == 'A')
1235 			parser->state = ST_LTA;
1236 		else if (!isspace((unsigned char)p))
1237 			parser->state = ST_TAG;
1238 		return 1;
1239 	case ST_LTA:
1240 		/* In tag -- "<a" already found */
1241 		if (p == '>')
1242 			parser->state = ST_NONE;
1243 		else if (p == '"')
1244 			parser->state = ST_TAGAQ;
1245 		else if (isspace((unsigned char)p))
1246 			parser->state = ST_TAGA;
1247 		else
1248 			parser->state = ST_TAG;
1249 		return 1;
1250 	case ST_TAG:
1251 		/* In tag, but not "<a" -- disregard */
1252 		if (p == '>')
1253 			parser->state = ST_NONE;
1254 		return 1;
1255 	case ST_TAGA:
1256 		/* In a-tag -- "<a " already found */
1257 		if (p == '>')
1258 			parser->state = ST_NONE;
1259 		else if (p == '"')
1260 			parser->state = ST_TAGAQ;
1261 		else if (p == 'h' || p == 'H')
1262 			parser->state = ST_H;
1263 		else if (!isspace((unsigned char)p))
1264 			parser->state = ST_TAGAX;
1265 		return 1;
1266 	case ST_TAGAX:
1267 		/* In unknown keyword in a-tag */
1268 		if (p == '>')
1269 			parser->state = ST_NONE;
1270 		else if (p == '"')
1271 			parser->state = ST_TAGAQ;
1272 		else if (isspace((unsigned char)p))
1273 			parser->state = ST_TAGA;
1274 		return 1;
1275 	case ST_TAGAQ:
1276 		/* In a-tag, unknown argument for keys. */
1277 		if (p == '>')
1278 			parser->state = ST_NONE;
1279 		else if (p == '"')
1280 			parser->state = ST_TAGA;
1281 		return 1;
1282 	case ST_H:
1283 		/* In a-tag -- "<a h" already found */
1284 		if (p == '>')
1285 			parser->state = ST_NONE;
1286 		else if (p == '"')
1287 			parser->state = ST_TAGAQ;
1288 		else if (p == 'r' || p == 'R')
1289 			parser->state = ST_R;
1290 		else if (isspace((unsigned char)p))
1291 			parser->state = ST_TAGA;
1292 		else
1293 			parser->state = ST_TAGAX;
1294 		return 1;
1295 	case ST_R:
1296 		/* In a-tag -- "<a hr" already found */
1297 		if (p == '>')
1298 			parser->state = ST_NONE;
1299 		else if (p == '"')
1300 			parser->state = ST_TAGAQ;
1301 		else if (p == 'e' || p == 'E')
1302 			parser->state = ST_E;
1303 		else if (isspace((unsigned char)p))
1304 			parser->state = ST_TAGA;
1305 		else
1306 			parser->state = ST_TAGAX;
1307 		return 1;
1308 	case ST_E:
1309 		/* In a-tag -- "<a hre" already found */
1310 		if (p == '>')
1311 			parser->state = ST_NONE;
1312 		else if (p == '"')
1313 			parser->state = ST_TAGAQ;
1314 		else if (p == 'f' || p == 'F')
1315 			parser->state = ST_F;
1316 		else if (isspace((unsigned char)p))
1317 			parser->state = ST_TAGA;
1318 		else
1319 			parser->state = ST_TAGAX;
1320 		return 1;
1321 	case ST_F:
1322 		/* In a-tag -- "<a href" already found */
1323 		if (p == '>')
1324 			parser->state = ST_NONE;
1325 		else if (p == '"')
1326 			parser->state = ST_TAGAQ;
1327 		else if (p == '=')
1328 			parser->state = ST_HREF;
1329 		else if (!isspace((unsigned char)p))
1330 			parser->state = ST_TAGAX;
1331 		return 1;
1332 	case ST_HREF:
1333 		/* In a-tag -- "<a href=" already found */
1334 		if (p == '>')
1335 			parser->state = ST_NONE;
1336 		else if (p == '"')
1337 			parser->state = ST_HREFQ;
1338 		else if (!isspace((unsigned char)p))
1339 			parser->state = ST_TAGA;
1340 		return 1;
1341 	case ST_HREFQ:
1342 		/* In href of the a-tag */
1343 		end_attr = memchr(buf, '"', len);
1344 		if (end_attr == NULL)
1345 			return 0;
1346 		*end_attr = '\0';
1347 		parser->state = ST_TAGA;
1348 		fetch_add_entry(parser->ue, parser->url, buf, 1);
1349 		return end_attr + 1 - buf;
1350 	}
1351 	abort();
1352 }
1353 
1354 /*
1355  * List a directory
1356  */
1357 int
1358 fetchListHTTP(struct url_list *ue, struct url *url, const char *pattern, const char *flags)
1359 {
1360 	fetchIO *f;
1361 	char buf[2 * PATH_MAX];
1362 	size_t buf_len, processed, sum_processed;
1363 	ssize_t read_len;
1364 	struct index_parser state;
1365 
1366 	state.url = url;
1367 	state.state = ST_NONE;
1368 	state.ue = ue;
1369 
1370 	f = fetchGetHTTP(url, flags);
1371 	if (f == NULL)
1372 		return -1;
1373 
1374 	buf_len = 0;
1375 
1376 	while ((read_len = fetchIO_read(f, buf + buf_len, sizeof(buf) - buf_len)) > 0) {
1377 		buf_len += read_len;
1378 		sum_processed = 0;
1379 		do {
1380 			processed = parse_index(&state, buf + sum_processed, buf_len);
1381 			buf_len -= processed;
1382 			sum_processed += processed;
1383 		} while (processed != 0 && buf_len > 0);
1384 		memmove(buf, buf + sum_processed, buf_len);
1385 	}
1386 
1387 	fetchIO_close(f);
1388 	return read_len < 0 ? -1 : 0;
1389 }
1390