src/libhttpd/parse.c

#include <u.h>
#include <libc.h>
#include <libsec.h>
#include <bin.h>
#include <httpd.h>
#include "escape.h"

typedef struct Hlex	Hlex;
typedef struct MimeHead	MimeHead;

enum
{
	/*
	 * tokens
	 */
	Word	= 1,
	QString,
};

#define UlongMax	4294967295UL

struct Hlex
{
	int	tok;
	int	eoh;
	int	eol;			/* end of header line encountered? */
	uchar	*hstart;		/* start of header */
	jmp_buf	jmp;			/* jmp here to parse header */
	char	wordval[HMaxWord];
	HConnect *c;
};

struct MimeHead
{
	char	*name;
	void	(*parse)(Hlex*, char*);
	uchar	seen;
	uchar	ignore;
};

static void	mimeaccept(Hlex*, char*);
static void	mimeacceptchar(Hlex*, char*);
static void	mimeacceptenc(Hlex*, char*);
static void	mimeacceptlang(Hlex*, char*);
static void	mimeagent(Hlex*, char*);
static void	mimeauthorization(Hlex*, char*);
static void	mimeconnection(Hlex*, char*);
static void	mimecontlen(Hlex*, char*);
static void	mimeexpect(Hlex*, char*);
static void	mimefresh(Hlex*, char*);
static void	mimefrom(Hlex*, char*);
static void	mimehost(Hlex*, char*);
static void	mimeifrange(Hlex*, char*);
static void	mimeignore(Hlex*, char*);
static void	mimematch(Hlex*, char*);
static void	mimemodified(Hlex*, char*);
static void	mimenomatch(Hlex*, char*);
static void	mimerange(Hlex*, char*);
static void	mimetransenc(Hlex*, char*);
static void	mimeunmodified(Hlex*, char*);

/*
 * headers seen also include
 * allow  cache-control chargeto
 * content-encoding content-language content-location content-md5 content-range content-type
 * date etag expires forwarded last-modified max-forwards pragma
 * proxy-agent proxy-authorization proxy-connection
 * ua-color ua-cpu ua-os ua-pixels
 * upgrade via x-afs-tokens x-serial-number
 */
static MimeHead	mimehead[] =
{
	{"accept",		mimeaccept},
	{"accept-charset",	mimeacceptchar},
	{"accept-encoding",	mimeacceptenc},
	{"accept-language",	mimeacceptlang},
	{"authorization",	mimeauthorization},
	{"connection",		mimeconnection},
	{"content-length",	mimecontlen},
	{"expect",		mimeexpect},
	{"fresh",		mimefresh},
	{"from",		mimefrom},
	{"host",		mimehost},
	{"if-match",		mimematch},
	{"if-modified-since",	mimemodified},
	{"if-none-match",	mimenomatch},
	{"if-range",		mimeifrange},
	{"if-unmodified-since",	mimeunmodified},
	{"range",		mimerange},
	{"transfer-encoding",	mimetransenc},
	{"user-agent",		mimeagent},
};

char*		hmydomain;
char*		hversion = "HTTP/1.1";

static	void	lexhead(Hlex*);
static	void	parsejump(Hlex*, char*);
static	int	getc(Hlex*);
static	void	ungetc(Hlex*);
static	int	wordcr(Hlex*);
static	int	wordnl(Hlex*);
static	void	word(Hlex*, char*);
static	int	lex1(Hlex*, int);
static	int	lex(Hlex*);
static	int	lexbase64(Hlex*);
static	ulong	digtoul(char *s, char **e);

/*
 * flush an clean up junk from a request
 */
void
hreqcleanup(HConnect *c)
{
	int i;

	hxferenc(&c->hout, 0);
	memset(&c->req, 0, sizeof(c->req));
	memset(&c->head, 0, sizeof(c->head));
	c->hpos = c->header;
	c->hstop = c->header;
	binfree(&c->bin);
	for(i = 0; i < nelem(mimehead); i++){
		mimehead[i].seen = 0;
		mimehead[i].ignore = 0;
	}
}

/*
 * list of tokens
 * if the client is HTTP/1.0,
 * ignore headers which match one of the tokens.
 * restarts parsing if necessary.
 */
static void
mimeconnection(Hlex *h, char *)
{
	char *u, *p;
	int reparse, i;

	reparse = 0;
	for(;;){
		while(lex(h) != Word)
			if(h->tok != ',')
				goto breakout;

		if(cistrcmp(h->wordval, "keep-alive") == 0)
			h->c->head.persist = 1;
		else if(cistrcmp(h->wordval, "close") == 0)
			h->c->head.closeit = 1;
		else if(!http11(h->c)){
			for(i = 0; i < nelem(mimehead); i++){
				if(cistrcmp(mimehead[i].name, h->wordval) == 0){
					reparse = mimehead[i].seen && !mimehead[i].ignore;
					mimehead[i].ignore = 1;
					if(cistrcmp(mimehead[i].name, "authorization") == 0){
						h->c->head.authuser = nil;
						h->c->head.authpass = nil;
					}
				}
			}
		}

		if(lex(h) != ',')
			break;
	}

breakout:;
	/*
	 * if need to ignore headers we've already parsed,
	 * reset & start over.  need to save authorization
	 * info because it's written over when parsed.
	 */
	if(reparse){
		u = h->c->head.authuser;
		p = h->c->head.authpass;
		memset(&h->c->head, 0, sizeof(h->c->head));
		h->c->head.authuser = u;
		h->c->head.authpass = p;

		h->c->hpos = h->hstart;
		longjmp(h->jmp, 1);
	}
}

int
hparseheaders(HConnect *c, int timeout)
{
	Hlex h;

	c->head.fresh_thresh = 0;
	c->head.fresh_have = 0;
	c->head.persist = 0;
	if(c->req.vermaj == 0){
		c->head.host = hmydomain;
		return 1;
	}

	memset(&h, 0, sizeof(h));
	h.c = c;
	alarm(timeout);
	if(!hgethead(c, 1))
		return -1;
	alarm(0);
	h.hstart = c->hpos;

	if(setjmp(h.jmp) == -1)
		return -1;

	h.eol = 0;
	h.eoh = 0;
	h.tok = '\n';
	while(lex(&h) != '\n'){
		if(h.tok == Word && lex(&h) == ':')
			parsejump(&h, hstrdup(c, h.wordval));
		while(h.tok != '\n')
			lex(&h);
		h.eol = h.eoh;
	}

	if(http11(c)){
		/*
		 * according to the http/1.1 spec,
		 * these rules must be followed
		 */
		if(c->head.host == nil){
			hfail(c, HBadReq, nil);
			return -1;
		}
		if(c->req.urihost != nil)
			c->head.host = c->req.urihost;
		/*
		 * also need to check host is actually this one
		 */
	}else if(c->head.host == nil)
		c->head.host = hmydomain;
	return 1;
}

/*
 * mimeparams	: | mimeparams ";" mimepara
 * mimeparam	: token "=" token | token "=" qstring
 */
static HSPairs*
mimeparams(Hlex *h)
{
	HSPairs *p;
	char *s;

	p = nil;
	for(;;){
		if(lex(h) != Word)
			break;
		s = hstrdup(h->c, h->wordval);
		if(lex(h) != Word && h->tok != QString)
			break;
		p = hmkspairs(h->c, s, hstrdup(h->c, h->wordval), p);
	}
	return hrevspairs(p);
}

/*
 * mimehfields	: mimehfield | mimehfields commas mimehfield
 * mimehfield	: token mimeparams
 * commas	: "," | commas ","
 */
static HFields*
mimehfields(Hlex *h)
{
	HFields *f;

	f = nil;
	for(;;){
		while(lex(h) != Word)
			if(h->tok != ',')
				goto breakout;

		f = hmkhfields(h->c, hstrdup(h->c, h->wordval), nil, f);

		if(lex(h) == ';')
			f->params = mimeparams(h);
		if(h->tok != ',')
			break;
	}
breakout:;
	return hrevhfields(f);
}

/*
 * parse a list of acceptable types, encodings, languages, etc.
 */
static HContent*
mimeok(Hlex *h, char *name, int multipart, HContent *head)
{
	char *generic, *specific, *s;
	float v;

	/*
	 * each type is separated by one or more commas
	 */
	while(lex(h) != Word)
		if(h->tok != ',')
			return head;

	generic = hstrdup(h->c, h->wordval);
	lex(h);
	if(h->tok == '/' || multipart){
		/*
		 * at one time, IE5 improperly said '*' for single types
		 */
		if(h->tok != '/')
			return nil;
		if(lex(h) != Word)
			return head;
		specific = hstrdup(h->c, h->wordval);
		if(!multipart && strcmp(specific, "*") != 0)
			return head;
		lex(h);
	}else
		specific = nil;
	head = hmkcontent(h->c, generic, specific, head);

	for(;;){
		switch(h->tok){
		case ';':
			/*
			 * should make a list of these params
			 * for accept, they fall into two classes:
			 *	up to a q=..., they modify the media type.
			 *	afterwards, they acceptance criteria
			 */
			if(lex(h) == Word){
				s = hstrdup(h->c, h->wordval);
				if(lex(h) != '=' || lex(h) != Word && h->tok != QString)
					return head;
				v = strtod(h->wordval, nil);
				if(strcmp(s, "q") == 0)
					head->q = v;
				else if(strcmp(s, "mxb") == 0)
					head->mxb = v;
			}
			break;
		case ',':
			return  mimeok(h, name, multipart, head);
		default:
			return head;
		}
		lex(h);
	}
	return head;
}

/*
 * parse a list of entity tags
 * 1#entity-tag
 * entity-tag = [weak] opaque-tag
 * weak = "W/"
 * opaque-tag = quoted-string
 */
static HETag*
mimeetag(Hlex *h, HETag *head)
{
	HETag *e;
	int weak;

	for(;;){
		while(lex(h) != Word && h->tok != QString)
			if(h->tok != ',')
				return head;

		weak = 0;
		if(h->tok == Word && strcmp(h->wordval, "*") != 0){
			if(strcmp(h->wordval, "W") != 0)
				return head;
			if(lex(h) != '/' || lex(h) != QString)
				return head;
			weak = 1;
		}

		e = halloc(h->c, sizeof(HETag));
		e->etag = hstrdup(h->c, h->wordval);
		e->weak = weak;
		e->next = head;
		head = e;

		if(lex(h) != ',')
			return head;
	}
	return head;
}

/*
 * ranges-specifier = byte-ranges-specifier
 * byte-ranges-specifier = "bytes" "=" byte-range-set
 * byte-range-set = 1#(byte-range-spec|suffix-byte-range-spec)
 * byte-range-spec = byte-pos "-" [byte-pos]
 * byte-pos = 1*DIGIT
 * suffix-byte-range-spec = "-" suffix-length
 * suffix-length = 1*DIGIT
 *
 * syntactically invalid range specifiers cause the
 * entire header field to be ignored.
 * it is syntactically incorrect for the second byte pos
 * to be smaller than the first byte pos
 */
static HRange*
mimeranges(Hlex *h, HRange *head)
{
	HRange *r, *rh, *tail;
	char *w;
	ulong start, stop;
	int suf;

	if(lex(h) != Word || strcmp(h->wordval, "bytes") != 0 || lex(h) != '=')
		return head;

	rh = nil;
	tail = nil;
	for(;;){
		while(lex(h) != Word){
			if(h->tok != ','){
				if(h->tok == '\n')
					goto breakout;
				return head;
			}
		}

		w = h->wordval;
		start = 0;
		suf = 1;
		if(w[0] != '-'){
			suf = 0;
			start = digtoul(w, &w);
			if(w[0] != '-')
				return head;
		}
		w++;
		stop = ~0UL;
		if(w[0] != '\0'){
			stop = digtoul(w, &w);
			if(w[0] != '\0')
				return head;
			if(!suf && stop < start)
				return head;
		}

		r = halloc(h->c, sizeof(HRange));
		r->suffix = suf;
		r->start = start;
		r->stop = stop;
		r->next = nil;
		if(rh == nil)
			rh = r;
		else
			tail->next = r;
		tail = r;

		if(lex(h) != ','){
			if(h->tok == '\n')
				break;
			return head;
		}
	}
breakout:;

	if(head == nil)
		return rh;

	for(tail = head; tail->next != nil; tail = tail->next)
		;
	tail->next = rh;
	return head;
}

static void
mimeaccept(Hlex *h, char *name)
{
	h->c->head.oktype = mimeok(h, name, 1, h->c->head.oktype);
}

static void
mimeacceptchar(Hlex *h, char *name)
{
	h->c->head.okchar = mimeok(h, name, 0, h->c->head.okchar);
}

static void
mimeacceptenc(Hlex *h, char *name)
{
	h->c->head.okencode = mimeok(h, name, 0, h->c->head.okencode);
}

static void
mimeacceptlang(Hlex *h, char *name)
{
	h->c->head.oklang = mimeok(h, name, 0, h->c->head.oklang);
}

static void
mimemodified(Hlex *h, char *)
{
	lexhead(h);
	h->c->head.ifmodsince = hdate2sec(h->wordval);
}

static void
mimeunmodified(Hlex *h, char *)
{
	lexhead(h);
	h->c->head.ifunmodsince = hdate2sec(h->wordval);
}

static void
mimematch(Hlex *h, char *)
{
	h->c->head.ifmatch = mimeetag(h, h->c->head.ifmatch);
}

static void
mimenomatch(Hlex *h, char *)
{
	h->c->head.ifnomatch = mimeetag(h, h->c->head.ifnomatch);
}

/*
 * argument is either etag or date
 */
static void
mimeifrange(Hlex *h, char *)
{
	int c, d, et;

	et = 0;
	c = getc(h);
	while(c == ' ' || c == '\t')
		c = getc(h);
	if(c == '"')
		et = 1;
	else if(c == 'W'){
		d = getc(h);
		if(d == '/')
			et = 1;
		ungetc(h);
	}
	ungetc(h);
	if(et){
		h->c->head.ifrangeetag = mimeetag(h, h->c->head.ifrangeetag);
	}else{
		lexhead(h);
		h->c->head.ifrangedate = hdate2sec(h->wordval);
	}
}

static void
mimerange(Hlex *h, char *)
{
	h->c->head.range = mimeranges(h, h->c->head.range);
}

/*
 * note: netscape and ie through versions 4.7 and 4
 * support only basic authorization, so that is all that is supported here
 *
 * "Authorization" ":" "Basic" base64-user-pass
 * where base64-user-pass is the base64 encoding of
 * username ":" password
 */
static void
mimeauthorization(Hlex *h, char *)
{
	char *up, *p;
	int n;

	if(lex(h) != Word || cistrcmp(h->wordval, "basic") != 0)
		return;

	n = lexbase64(h);
	if(!n)
		return;

	/*
	 * wipe out source for password, so it won't be logged.
	 * it is replaced by a single =,
	 * which is valid base64, but not ok for an auth reponse.
	 * therefore future parses of the header field will not overwrite
	 * authuser and authpass.
	 */
	memmove(h->c->hpos - (n - 1), h->c->hpos, h->c->hstop - h->c->hpos);
	h->c->hstop -= n - 1;
	*h->c->hstop = '\0';
	h->c->hpos -= n - 1;
	h->c->hpos[-1] = '=';

	up = halloc(h->c, n + 1);
	n = dec64((uchar*)up, n, h->wordval, n);
	up[n] = '\0';
	p = strchr(up, ':');
	if(p != nil){
		*p++ = '\0';
		h->c->head.authuser = hstrdup(h->c, up);
		h->c->head.authpass = hstrdup(h->c, p);
	}
}

static void
mimeagent(Hlex *h, char *)
{
	lexhead(h);
	h->c->head.client = hstrdup(h->c, h->wordval);
}

static void
mimefrom(Hlex *h, char *)
{
	lexhead(h);
}

static void
mimehost(Hlex *h, char *)
{
	char *hd;

	lexhead(h);
	for(hd = h->wordval; *hd == ' ' || *hd == '\t'; hd++)
		;
	h->c->head.host = hlower(hstrdup(h->c, hd));
}

/*
 * if present, implies that a message body follows the headers
 * "content-length" ":" digits
 */
static void
mimecontlen(Hlex *h, char *)
{
	char *e;
	ulong v;

	if(lex(h) != Word)
		return;
	e = h->wordval;
	v = digtoul(e, &e);
	if(v == ~0UL || *e != '\0')
		return;
	h->c->head.contlen = v;
}

/*
 * mimexpect	: "expect" ":" expects
 * expects	: | expects "," expect
 * expect	: "100-continue" | token | token "=" token expectparams | token "=" qstring expectparams
 * expectparams	: ";" token | ";" token "=" token | token "=" qstring
 * for now, we merely parse "100-continue" or anything else.
 */
static void
mimeexpect(Hlex *h, char *)
{
	if(lex(h) != Word || cistrcmp(h->wordval, "100-continue") != 0 || lex(h) != '\n')
		h->c->head.expectother = 1;
	h->c->head.expectcont = 1;
}

static void
mimetransenc(Hlex *h, char *)
{
	h->c->head.transenc = mimehfields(h);
}

static void
mimefresh(Hlex *h, char *)
{
	char *s;

	lexhead(h);
	for(s = h->wordval; *s && (*s==' ' || *s=='\t'); s++)
		;
	if(strncmp(s, "pathstat/", 9) == 0)
		h->c->head.fresh_thresh = atoi(s+9);
	else if(strncmp(s, "have/", 5) == 0)
		h->c->head.fresh_have = atoi(s+5);
}

static void
mimeignore(Hlex *h, char *)
{
	lexhead(h);
}

static void
parsejump(Hlex *h, char *k)
{
	int l, r, m;

	l = 1;
	r = nelem(mimehead) - 1;
	while(l <= r){
		m = (r + l) >> 1;
		if(cistrcmp(mimehead[m].name, k) <= 0)
			l = m + 1;
		else
			r = m - 1;
	}
	m = l - 1;
	if(cistrcmp(mimehead[m].name, k) == 0 && !mimehead[m].ignore){
		mimehead[m].seen = 1;
		(*mimehead[m].parse)(h, k);
	}else
		mimeignore(h, k);
}

static int
lex(Hlex *h)
{
	return h->tok = lex1(h, 0);
}

static int
lexbase64(Hlex *h)
{
	int c, n;

	n = 0;
	lex1(h, 1);

	while((c = getc(h)) >= 0){
		if(!(c >= 'A' && c <= 'Z'
		|| c >= 'a' && c <= 'z'
		|| c >= '0' && c <= '9'
		|| c == '+' || c == '/')){
			ungetc(h);
			break;
		}

		if(n < HMaxWord-1)
			h->wordval[n++] = c;
	}
	h->wordval[n] = '\0';
	return n;
}

/*
 * rfc 822/rfc 1521 lexical analyzer
 */
static int
lex1(Hlex *h, int skipwhite)
{
	int level, c;

	if(h->eol)
		return '\n';

top:
	c = getc(h);
	switch(c){
	case '(':
		level = 1;
		while((c = getc(h)) >= 0){
			if(c == '\\'){
				c = getc(h);
				if(c < 0)
					return '\n';
				continue;
			}
			if(c == '(')
				level++;
			else if(c == ')' && --level == 0)
				break;
			else if(c == '\n'){
				c = getc(h);
				if(c < 0)
					return '\n';
				if(c == ')' && --level == 0)
					break;
				if(c != ' ' && c != '\t'){
					ungetc(h);
					return '\n';
				}
			}
		}
		goto top;

	case ' ': case '\t':
		goto top;

	case '\r':
		c = getc(h);
		if(c != '\n'){
			ungetc(h);
			goto top;
		}

	case '\n':
		if(h->tok == '\n'){
			h->eol = 1;
			h->eoh = 1;
			return '\n';
		}
		c = getc(h);
		if(c < 0){
			h->eol = 1;
			return '\n';
		}
		if(c != ' ' && c != '\t'){
			ungetc(h);
			h->eol = 1;
			return '\n';
		}
		goto top;

	case ')':
	case '<': case '>':
	case '[': case ']':
	case '@': case '/':
	case ',': case ';': case ':': case '?': case '=':
		if(skipwhite){
			ungetc(h);
			return c;
		}
		return c;

	case '"':
		if(skipwhite){
			ungetc(h);
			return c;
		}
		word(h, "\"");
		getc(h);		/* skip the closing quote */
		return QString;

	default:
		ungetc(h);
		if(skipwhite)
			return c;
		word(h, "\"(){}<>@,;:/[]?=\r\n \t");
		if(h->wordval[0] == '\0'){
			h->c->head.closeit = 1;
			hfail(h->c, HSyntax);
			longjmp(h->jmp, -1);
		}
		return Word;
	}
	goto top;
	return 0;
}

/*
 * return the rest of an rfc 822, including \n
 * do not map to lower case
 */
static void
lexhead(Hlex *h)
{
	int c, n;

	n = 0;
	while((c = getc(h)) >= 0){
		if(c == '\r')
			c = wordcr(h);
		else if(c == '\n')
			c = wordnl(h);
		if(c == '\n')
			break;
		if(c == '\\'){
			c = getc(h);
			if(c < 0)
				break;
		}

		if(n < HMaxWord-1)
			h->wordval[n++] = c;
	}
	h->tok = '\n';
	h->eol = 1;
	h->wordval[n] = '\0';
}

static void
word(Hlex *h, char *stop)
{
	int c, n;

	n = 0;
	while((c = getc(h)) >= 0){
		if(c == '\r')
			c = wordcr(h);
		else if(c == '\n')
			c = wordnl(h);
		if(c == '\\'){
			c = getc(h);
			if(c < 0)
				break;
		}else if(c < 32 || strchr(stop, c) != nil){
			ungetc(h);
			break;
		}

		if(n < HMaxWord-1)
			h->wordval[n++] = c;
	}
	h->wordval[n] = '\0';
}

static int
wordcr(Hlex *h)
{
	int c;

	c = getc(h);
	if(c == '\n')
		return wordnl(h);
	ungetc(h);
	return ' ';
}

static int
wordnl(Hlex *h)
{
	int c;

	c = getc(h);
	if(c == ' ' || c == '\t')
		return c;
	ungetc(h);

	return '\n';
}

static int
getc(Hlex *h)
{
	if(h->eoh)
		return -1;
	if(h->c->hpos < h->c->hstop)
		return *h->c->hpos++;
	h->eoh = 1;
	h->eol = 1;
	return -1;
}

static void
ungetc(Hlex *h)
{
	if(h->eoh)
		return;
	h->c->hpos--;
}

static ulong
digtoul(char *s, char **e)
{
	ulong v;
	int c, ovfl;

	v = 0;
	ovfl = 0;
	for(;;){
		c = *s;
		if(c < '0' || c > '9')
			break;
		s++;
		c -= '0';
		if(v > UlongMax/10 || v == UlongMax/10 && c >= UlongMax%10)
			ovfl = 1;
		v = v * 10 + c;
	}

	if(e)
		*e = s;
	if(ovfl)
		return UlongMax;
	return v;
}

int
http11(HConnect *c)
{
	return c->req.vermaj > 1 || c->req.vermaj == 1 && c->req.vermin > 0;
}

char*
hmkmimeboundary(HConnect *c)
{
	char buf[32];
	int i;

	srand((time(0)<<16)|getpid());
	strcpy(buf, "upas-");
	for(i = 5; i < sizeof(buf)-1; i++)
		buf[i] = 'a' + nrand(26);
	buf[i] = 0;
	return hstrdup(c, buf);
}

HSPairs*
hmkspairs(HConnect *c, char *s, char *t, HSPairs *next)
{
	HSPairs *sp;

	sp = halloc(c, sizeof *sp);
	sp->s = s;
	sp->t = t;
	sp->next = next;
	return sp;
}

HSPairs*
hrevspairs(HSPairs *sp)
{
	HSPairs *last, *next;

	last = nil;
	for(; sp != nil; sp = next){
		next = sp->next;
		sp->next = last;
		last = sp;
	}
	return last;
}

HFields*
hmkhfields(HConnect *c, char *s, HSPairs *p, HFields *next)
{
	HFields *hf;

	hf = halloc(c, sizeof *hf);
	hf->s = s;
	hf->params = p;
	hf->next = next;
	return hf;
}

HFields*
hrevhfields(HFields *hf)
{
	HFields *last, *next;

	last = nil;
	for(; hf != nil; hf = next){
		next = hf->next;
		hf->next = last;
		last = hf;
	}
	return last;
}

HContent*
hmkcontent(HConnect *c, char *generic, char *specific, HContent *next)
{
	HContent *ct;

	ct = halloc(c, sizeof(HContent));
	ct->generic = generic;
	ct->specific = specific;
	ct->next = next;
	ct->q = 1;
	ct->mxb = 0;
	return ct;
}