xref: /plan9/sys/src/9/port/devbridge.c (revision f9e1cf08d3be51592e03e639fc848a68dc31a55e)
1 /*
2  * IPv4 Ethernet bridge
3  */
4 #include "u.h"
5 #include "../port/lib.h"
6 #include "mem.h"
7 #include "dat.h"
8 #include "fns.h"
9 #include "../ip/ip.h"
10 #include "../port/netif.h"
11 #include "../port/error.h"
12 
/* Forward typedefs so the structures defined below can be named without the struct keyword. */
typedef struct Bridge 	Bridge;
typedef struct Port 	Port;
typedef struct Centry	Centry;
typedef struct Iphdr	Iphdr;
typedef struct Tcphdr	Tcphdr;
18 
/*
 * Qid types of the files served by this device (kept in the low
 * byte of qid.path, see TYPE()), followed by sizing and tuning
 * constants for the bridge.
 */
enum
{
	Qtopdir=	1,		/* top level directory */

	Qbridgedir,			/* bridge* directory */
	Qbctl,
	Qstats,
	Qcache,
	Qlog,

	Qportdir,			/* per-port directory, named by port index */
	Qpctl,
	Qlocal,
	Qstatus,

	MaxQ,				/* one past the last qid type; sizes dirtab[] */

	Maxbridge=	4,		/* number of entries in bridgetab[] */
	Maxport=	128,		// power of 2, because PORT() masks with Maxport-1
	CacheHash=	257,		// prime
	CacheLook=	5,		// how many cache entries to examine
	CacheSize=	(CacheHash+CacheLook-1),	/* extra slots keep a probe from the last bucket in bounds */
	CacheTimeout=	5*60,		// timeout for cache entry in seconds

	TcpMssMax = 1300,		// max desirable Tcp MSS value
	TunnelMtu = 1400,		/* longer packets may be fragmented when written to a tunnel port */
};
46 
/*
 * Plain files listed by bridgegen in each bridge directory.
 * Initializer order is presumably name, qid, length, permissions;
 * Dirtab is declared elsewhere -- confirm against its definition.
 */
static Dirtab bridgedirtab[]={
	"ctl",		{Qbctl},	0,	0666,
	"stats",	{Qstats},	0,	0444,
	"cache",	{Qcache},	0,	0444,
	"log",		{Qlog},		0,	0666,
};
53 
/*
 * Plain files listed by bridgegen in each port directory.
 * Of these, only Qstatus has a case in bridgeread; Qpctl and Qlocal
 * fall into the default (Eperm) branch of bridgeread and bridgewrite.
 */
static Dirtab portdirtab[]={
	"ctl",		{Qpctl},	0,	0666,
	"local",	{Qlocal},	0,	0444,
	"status",	{Qstatus},	0,	0444,
};
59 
/* Log categories, used as the mask argument to log() and named in logflags[]. */
enum {
	Logcache=	(1<<0),		/* address cache activity */
	Logmcast=	(1<<1),		/* multicast/broadcast packets */
};
64 
// types of interfaces (stored in Port.type)
enum
{
	Tether,		/* bound with "bind ether ..." */
	Ttun,		/* bound with "bind tunnel ..."; writes may be fragmented */
};
71 
/* Names accepted on the log file, handed to logctl() by bridgewrite; nil-terminated. */
static Logflag logflags[] =
{
	{ "cache",	Logcache, },
	{ "multicast",	Logmcast, },
	{ nil,		0, },
};
78 
/* Dirtab entry for each plain-file qid type; filled in by bridgeinit, read by bridgegen. */
static Dirtab	*dirtab[MaxQ];
80 
/*
 * qid.path layout: low 8 bits hold the Q* type, the bits above
 * hold the port index (masked to Maxport-1 by PORT()).
 */
#define TYPE(x) 	(((ulong)(x).path) & 0xff)
#define PORT(x) 	((((ulong)(x).path) >> 8)&(Maxport-1))
#define QID(x, y) 	(((x)<<8) | (y))
84 
/* One learned station address in a bridge's forwarding cache. */
struct Centry
{
	uchar	d[Eaddrlen];	/* Ethernet address this entry describes */
	int	port;		/* id of the port the address was last seen on */
	long	expire;		// entry expires this many seconds after boot time; 0 means the slot is unused
	long	src;		/* incremented by cacheupdate each time the address is seen as a source */
	long	dst;		/* incremented by cachelookup each time the address matches as a destination */
};
93 
/* State of one bridge instance; bridgetab[] holds Maxbridge of these. */
struct Bridge
{
	QLock;			/* embedded lock, taken with qlock(b)/qunlock(b) */
	int	nport;		/* one more than the highest port index bound so far; never lowered */
	Port	*port[Maxport];	/* bound ports indexed by id; nil where unbound */
	Centry	cache[CacheSize];	/* address cache probed by cachelookup/cacheupdate */
	ulong	hit;		/* unicast packets forwarded using a cache entry */
	ulong	miss;		/* unicast packets flooded because the lookup failed */
	ulong	copy;		/* block copies made by ethermultiwrite */
	long	delay0;		// constant microsecond delay per packet
	long	delayn;		// microsecond delay per byte
	int	tcpmss;		// modify tcpmss value

	Log;			/* embedded log state used by logopen/logread/logctl/log */
};
109 
/* One interface attached to a bridge, created by portbind and released by portfree. */
struct Port
{
	int	id;		/* index of this port in bridge->port[] */
	Bridge	*bridge;
	int	ref;		/* one for the binding plus one for the reader process */
	int	closed;		/* set by portunbind; tells etherread to exit */

	Chan	*data[2];	// channel to data: [0] is read by etherread, [1] is written by etherwrite

	Proc	*readp;		// read proc

	// the following uniquely identifies the port
	int	type;
	char	name[KNAMELEN];

	// owner hash - avoids bind/unbind races
	ulong	ownhash;

	// various stats
	int	in;		// number of packets read
	int	inmulti;	// multicast or broadcast
	int	inunknown;	// unknown address
	int	out;		// number of packets written
	int	outmulti;	// multicast or broadcast
	int	outunknown;	// unknown address
	int	outfrag;	// fragmented the packet
	int	nentry;		// number of cache entries for this port (not updated anywhere in this file)
};
138 
/* Protocol constants used by tcpmsshack, fragment and etherwrite. */
enum {
	IP_TCPPROTO	= 6,
	EOLOPT		= 0,		/* TCP option kind: end of option list */
	NOOPOPT		= 1,		/* TCP option kind: one-byte padding */
	MSSOPT		= 2,		/* TCP option kind: segment size */
	MSS_LENGTH	= 4,		/* total length in bytes of the MSS option */
	SYN		= 0x02,		/* Pkt. is synchronise */
	IPHDR		= 20,		/* sizeof(Iphdr) */
};
148 
/* IPv4 header without options, as laid out on the wire; multi-byte fields are accessed with nhgets/hnputs. */
struct Iphdr
{
	uchar	vihl;		/* Version and header length */
	uchar	tos;		/* Type of service */
	uchar	length[2];	/* packet length */
	uchar	id[2];		/* ip->identification */
	uchar	frag[2];	/* Fragment information */
	uchar	ttl;		/* Time to live */
	uchar	proto;		/* Protocol */
	uchar	cksum[2];	/* Header checksum */
	uchar	src[4];		/* IP source */
	uchar	dst[4];		/* IP destination */
};
162 
/* Fixed part of a TCP header as laid out on the wire; options follow it. */
struct Tcphdr
{
	uchar	sport[2];
	uchar	dport[2];
	uchar	seq[4];
	uchar	ack[4];
	uchar	flag[2];	/* flag[0] high nibble: header length in 32-bit words; flag[1]: flag bits such as SYN */
	uchar	win[2];
	uchar	cksum[2];	/* adjusted in place by tcpmsshack when the MSS is rewritten */
	uchar	urg[2];
};
174 
/* All bridge instances; indexed by c->dev, which bridgeattach range-checks against Maxbridge. */
static Bridge bridgetab[Maxbridge];
176 
/*
 * Open mode to permission bits (4 read, 2 write).
 * Only OREAD, OWRITE and ORDWR have entries; bridgeopen looks the
 * value up but does not otherwise use it.
 */
static int m2p[] = {
	[OREAD]		4,
	[OWRITE]	2,
	[ORDWR]		6
};
182 
/* Forward declarations for functions used before their definitions below. */
static int	bridgegen(Chan *c, char*, Dirtab*, int, int s, Dir *dp);
static void	portbind(Bridge *b, int argc, char *argv[]);
static void	portunbind(Bridge *b, int argc, char *argv[]);
static void	etherread(void *a);
static char	*cachedump(Bridge *b);
static void	portfree(Port *port);
static void	cacheflushport(Bridge *b, int port);
static void	etherwrite(Port *port, Block *bp);

/* Defined outside this file; parseip is only referenced from commented-out code here. */
extern ulong	parseip(uchar*, char*);
extern ushort	ipcsum(uchar *addr);
194 
195 static void
196 bridgeinit(void)
197 {
198 	int i;
199 	Dirtab *dt;
200 
201 	// setup dirtab with non directory entries
202 	for(i=0; i<nelem(bridgedirtab); i++) {
203 		dt = bridgedirtab + i;
204 		dirtab[TYPE(dt->qid)] = dt;
205 	}
206 	for(i=0; i<nelem(portdirtab); i++) {
207 		dt = portdirtab + i;
208 		dirtab[TYPE(dt->qid)] = dt;
209 	}
210 }
211 
212 static Chan*
213 bridgeattach(char* spec)
214 {
215 	Chan *c;
216 	int dev;
217 
218 	dev = atoi(spec);
219 	if(dev<0 || dev >= Maxbridge)
220 		error("bad specification");
221 
222 	c = devattach('B', spec);
223 	mkqid(&c->qid, QID(0, Qtopdir), 0, QTDIR);
224 	c->dev = dev;
225 	return c;
226 }
227 
228 static Walkqid*
229 bridgewalk(Chan *c, Chan *nc, char **name, int nname)
230 {
231 	return devwalk(c, nc, name, nname, (Dirtab*)0, 0, bridgegen);
232 }
233 
234 static int
235 bridgestat(Chan* c, uchar* db, int n)
236 {
237 	return devstat(c, db, n, (Dirtab *)0, 0L, bridgegen);
238 }
239 
240 static Chan*
241 bridgeopen(Chan* c, int omode)
242 {
243 	int perm;
244 	Bridge *b;
245 
246 	omode &= 3;
247 	perm = m2p[omode];
248 	USED(perm);
249 
250 	b = bridgetab + c->dev;
251 	USED(b);
252 
253 	switch(TYPE(c->qid)) {
254 	default:
255 		break;
256 	case Qlog:
257 		logopen(b);
258 		break;
259 	case Qcache:
260 		c->aux = cachedump(b);
261 		break;
262 	}
263 	c->mode = openmode(omode);
264 	c->flag |= COPEN;
265 	c->offset = 0;
266 	return c;
267 }
268 
269 static void
270 bridgeclose(Chan* c)
271 {
272 	Bridge *b  = bridgetab + c->dev;
273 
274 	switch(TYPE(c->qid)) {
275 	case Qcache:
276 		if(c->flag & COPEN)
277 			free(c->aux);
278 		break;
279 	case Qlog:
280 		if(c->flag & COPEN)
281 			logclose(b);
282 		break;
283 	}
284 }
285 
286 static long
287 bridgeread(Chan *c, void *a, long n, vlong off)
288 {
289 	char buf[256];
290 	Bridge *b = bridgetab + c->dev;
291 	Port *port;
292 	int i, ingood, outgood;
293 
294 	USED(off);
295 	switch(TYPE(c->qid)) {
296 	default:
297 		error(Eperm);
298 	case Qtopdir:
299 	case Qbridgedir:
300 	case Qportdir:
301 		return devdirread(c, a, n, 0, 0, bridgegen);
302 	case Qlog:
303 		return logread(b, a, off, n);
304 	case Qstatus:
305 		qlock(b);
306 		port = b->port[PORT(c->qid)];
307 		if(port == 0)
308 			strcpy(buf, "unbound\n");
309 		else {
310 			i = 0;
311 			switch(port->type) {
312 			default:
313 				panic("bridgeread: unknown port type: %d",
314 					port->type);
315 			case Tether:
316 				i += snprint(buf+i, sizeof(buf)-i, "ether %s: ", port->name);
317 				break;
318 			case Ttun:
319 				i += snprint(buf+i, sizeof(buf)-i, "tunnel %s: ", port->name);
320 				break;
321 			}
322 			ingood = port->in - port->inmulti - port->inunknown;
323 			outgood = port->out - port->outmulti - port->outunknown;
324 			i += snprint(buf+i, sizeof(buf)-i,
325 				"in=%d(%d:%d:%d) out=%d(%d:%d:%d:%d)\n",
326 				port->in, ingood, port->inmulti, port->inunknown,
327 				port->out, outgood, port->outmulti,
328 				port->outunknown, port->outfrag);
329 			USED(i);
330 		}
331 		n = readstr(off, a, n, buf);
332 		qunlock(b);
333 		return n;
334 	case Qbctl:
335 		snprint(buf, sizeof(buf), "%s tcpmss\ndelay %ld %ld\n",
336 			b->tcpmss ? "set" : "clear", b->delay0, b->delayn);
337 		n = readstr(off, a, n, buf);
338 		return n;
339 	case Qcache:
340 		n = readstr(off, a, n, c->aux);
341 		return n;
342 	case Qstats:
343 		snprint(buf, sizeof(buf), "hit=%uld miss=%uld copy=%uld\n",
344 			b->hit, b->miss, b->copy);
345 		n = readstr(off, a, n, buf);
346 		return n;
347 	}
348 }
349 
350 static void
351 bridgeoption(Bridge *b, char *option, int value)
352 {
353 	if(strcmp(option, "tcpmss") == 0)
354 		b->tcpmss = value;
355 	else
356 		error("unknown bridge option");
357 }
358 
359 
/*
 * Write hook for the bridge ctl and log files; any other file raises Eperm.
 *
 * ctl commands (first word of the written text):
 *	bind ...		attach a port (see portbind)
 *	unbind ...		detach a port (see portunbind)
 *	cacheflush		zero the whole address cache
 *	set option		turn a bridge option on
 *	clear option		turn a bridge option off
 *	delay delay0 delayn	per-packet and per-byte delay in microseconds
 * Writes to the log file are passed to logctl with the logflags table.
 * Returns n on success.
 */
static long
bridgewrite(Chan *c, void *a, long n, vlong off)
{
	Bridge *b = bridgetab + c->dev;
	Cmdbuf *cb;
	char *arg0, *p;

	USED(off);
	switch(TYPE(c->qid)) {
	default:
		error(Eperm);
	case Qbctl:
		cb = parsecmd(a, n);
		qlock(b);
		/* from here on every error path must drop the lock and the parsed command */
		if(waserror()) {
			qunlock(b);
			free(cb);
			nexterror();
		}
		if(cb->nf == 0)
			error("short write");
		arg0 = cb->f[0];
		if(strcmp(arg0, "bind") == 0) {
			portbind(b, cb->nf-1, cb->f+1);
		} else if(strcmp(arg0, "unbind") == 0) {
			portunbind(b, cb->nf-1, cb->f+1);
		} else if(strcmp(arg0, "cacheflush") == 0) {
			log(b, Logcache, "cache flush\n");
			memset(b->cache, 0, CacheSize*sizeof(Centry));
		} else if(strcmp(arg0, "set") == 0) {
			if(cb->nf != 2)
				error("usage: set option");
			bridgeoption(b, cb->f[1], 1);
		} else if(strcmp(arg0, "clear") == 0) {
			if(cb->nf != 2)
				error("usage: clear option");
			bridgeoption(b, cb->f[1], 0);
		} else if(strcmp(arg0, "delay") == 0) {
			if(cb->nf != 3)
				error("usage: delay delay0 delayn");
			b->delay0 = strtol(cb->f[1], nil, 10);
			b->delayn = strtol(cb->f[2], nil, 10);
		} else
			error("unknown control request");
		poperror();
		qunlock(b);
		free(cb);
		return n;
	case Qlog:
		cb = parsecmd(a, n);
		p = logctl(b, cb->nf, cb->f, logflags);
		/* cb is released before the error is raised so it is not leaked */
		free(cb);
		if(p != nil)
			error(p);
		return n;
	}
}
417 
418 static int
419 bridgegen(Chan *c, char *, Dirtab*, int, int s, Dir *dp)
420 {
421 	Bridge *b = bridgetab + c->dev;
422 	int type = TYPE(c->qid);
423 	Dirtab *dt;
424 	Qid qid;
425 
426 	if(s  == DEVDOTDOT){
427 		switch(TYPE(c->qid)){
428 		case Qtopdir:
429 		case Qbridgedir:
430 			snprint(up->genbuf, sizeof(up->genbuf), "#B%ld", c->dev);
431 			mkqid(&qid, Qtopdir, 0, QTDIR);
432 			devdir(c, qid, up->genbuf, 0, eve, 0555, dp);
433 			break;
434 		case Qportdir:
435 			snprint(up->genbuf, sizeof(up->genbuf), "bridge%ld", c->dev);
436 			mkqid(&qid, Qbridgedir, 0, QTDIR);
437 			devdir(c, qid, up->genbuf, 0, eve, 0555, dp);
438 			break;
439 		default:
440 			panic("bridgewalk %llux", c->qid.path);
441 		}
442 		return 1;
443 	}
444 
445 	switch(type) {
446 	default:
447 		/* non-directory entries end up here */
448 		if(c->qid.type & QTDIR)
449 			panic("bridgegen: unexpected directory");
450 		if(s != 0)
451 			return -1;
452 		dt = dirtab[TYPE(c->qid)];
453 		if(dt == nil)
454 			panic("bridgegen: unknown type: %lud", TYPE(c->qid));
455 		devdir(c, c->qid, dt->name, dt->length, eve, dt->perm, dp);
456 		return 1;
457 	case Qtopdir:
458 		if(s != 0)
459 			return -1;
460 		snprint(up->genbuf, sizeof(up->genbuf), "bridge%ld", c->dev);
461 		mkqid(&qid, QID(0, Qbridgedir), 0, QTDIR);
462 		devdir(c, qid, up->genbuf, 0, eve, 0555, dp);
463 		return 1;
464 	case Qbridgedir:
465 		if(s<nelem(bridgedirtab)) {
466 			dt = bridgedirtab+s;
467 			devdir(c, dt->qid, dt->name, dt->length, eve, dt->perm, dp);
468 			return 1;
469 		}
470 		s -= nelem(bridgedirtab);
471 		if(s >= b->nport)
472 			return -1;
473 		mkqid(&qid, QID(s, Qportdir), 0, QTDIR);
474 		snprint(up->genbuf, sizeof(up->genbuf), "%d", s);
475 		devdir(c, qid, up->genbuf, 0, eve, 0555, dp);
476 		return 1;
477 	case Qportdir:
478 		if(s>=nelem(portdirtab))
479 			return -1;
480 		dt = portdirtab+s;
481 		mkqid(&qid, QID(PORT(c->qid),TYPE(dt->qid)), 0, QTFILE);
482 		devdir(c, qid, dt->name, dt->length, eve, dt->perm, dp);
483 		return 1;
484 	}
485 }
486 
487 // parse mac address; also in netif.c
488 static int
489 parseaddr(uchar *to, char *from, int alen)
490 {
491 	char nip[4];
492 	char *p;
493 	int i;
494 
495 	p = from;
496 	for(i = 0; i < alen; i++){
497 		if(*p == 0)
498 			return -1;
499 		nip[0] = *p++;
500 		if(*p == 0)
501 			return -1;
502 		nip[1] = *p++;
503 		nip[2] = 0;
504 		to[i] = strtoul(nip, 0, 16);
505 		if(*p == ':')
506 			p++;
507 	}
508 	return 0;
509 }
510 
511 // assumes b is locked
512 static void
513 portbind(Bridge *b, int argc, char *argv[])
514 {
515 	Port *port;
516 	Chan *ctl;
517 	int type = 0, i, n;
518 	ulong ownhash;
519 	char *dev, *dev2 = nil, *p;
520 	char buf[100], name[KNAMELEN], path[8*KNAMELEN];
521 	static char usage[] = "usage: bind ether|tunnel name ownhash dev [dev2]";
522 
523 	memset(name, 0, KNAMELEN);
524 	if(argc < 4)
525 		error(usage);
526 	if(strcmp(argv[0], "ether") == 0) {
527 		if(argc != 4)
528 			error(usage);
529 		type = Tether;
530 		strncpy(name, argv[1], KNAMELEN);
531 		name[KNAMELEN-1] = 0;
532 //		parseaddr(addr, argv[1], Eaddrlen);
533 	} else if(strcmp(argv[0], "tunnel") == 0) {
534 		if(argc != 5)
535 			error(usage);
536 		type = Ttun;
537 		strncpy(name, argv[1], KNAMELEN);
538 		name[KNAMELEN-1] = 0;
539 //		parseip(addr, argv[1]);
540 		dev2 = argv[4];
541 	} else
542 		error(usage);
543 	ownhash = atoi(argv[2]);
544 	dev = argv[3];
545 	for(i=0; i<b->nport; i++) {
546 		port = b->port[i];
547 		if(port != nil && port->type == type &&
548 		    memcmp(port->name, name, KNAMELEN) == 0)
549 			error("port in use");
550 	}
551 	for(i=0; i<Maxport; i++)
552 		if(b->port[i] == nil)
553 			break;
554 	if(i == Maxport)
555 		error("no more ports");
556 	port = smalloc(sizeof(Port));
557 	port->ref = 1;
558 	port->id = i;
559 	port->ownhash = ownhash;
560 
561 	if(waserror()) {
562 		portfree(port);
563 		nexterror();
564 	}
565 	port->type = type;
566 	memmove(port->name, name, KNAMELEN);
567 	switch(port->type) {
568 	default:
569 		panic("portbind: unknown port type: %d", type);
570 	case Tether:
571 		snprint(path, sizeof(path), "%s/clone", dev);
572 		ctl = namec(path, Aopen, ORDWR, 0);
573 		if(waserror()) {
574 			cclose(ctl);
575 			nexterror();
576 		}
577 		// check addr?
578 
579 		// get directory name
580 		n = devtab[ctl->type]->read(ctl, buf, sizeof(buf), 0);
581 		buf[n] = 0;
582 		for(p = buf; *p == ' '; p++)
583 			;
584 		snprint(path, sizeof(path), "%s/%lud/data", dev, strtoul(p, 0, 0));
585 
586 		// setup connection to be promiscuous
587 		snprint(buf, sizeof(buf), "connect -1");
588 		devtab[ctl->type]->write(ctl, buf, strlen(buf), 0);
589 		snprint(buf, sizeof(buf), "promiscuous");
590 		devtab[ctl->type]->write(ctl, buf, strlen(buf), 0);
591 		snprint(buf, sizeof(buf), "bridge");
592 		devtab[ctl->type]->write(ctl, buf, strlen(buf), 0);
593 
594 		// open data port
595 		port->data[0] = namec(path, Aopen, ORDWR, 0);
596 		// dup it
597 		incref(port->data[0]);
598 		port->data[1] = port->data[0];
599 
600 		poperror();
601 		cclose(ctl);
602 
603 		break;
604 	case Ttun:
605 		port->data[0] = namec(dev, Aopen, OREAD, 0);
606 		port->data[1] = namec(dev2, Aopen, OWRITE, 0);
607 		break;
608 	}
609 
610 	poperror();
611 
612 	/* committed to binding port */
613 	b->port[port->id] = port;
614 	port->bridge = b;
615 	if(b->nport <= port->id)
616 		b->nport = port->id+1;
617 
618 	// assumes kproc always succeeds
619 	kproc("etherread", etherread, port);	// poperror must be next
620 	port->ref++;
621 }
622 
623 // assumes b is locked
624 static void
625 portunbind(Bridge *b, int argc, char *argv[])
626 {
627 	int type = 0, i;
628 	char name[KNAMELEN];
629 	ulong ownhash;
630 	Port *port = nil;
631 	static char usage[] = "usage: unbind ether|tunnel addr [ownhash]";
632 
633 	memset(name, 0, KNAMELEN);
634 	if(argc < 2 || argc > 3)
635 		error(usage);
636 	if(strcmp(argv[0], "ether") == 0) {
637 		type = Tether;
638 		strncpy(name, argv[1], KNAMELEN);
639 		name[KNAMELEN-1] = 0;
640 //		parseaddr(addr, argv[1], Eaddrlen);
641 	} else if(strcmp(argv[0], "tunnel") == 0) {
642 		type = Ttun;
643 		strncpy(name, argv[1], KNAMELEN);
644 		name[KNAMELEN-1] = 0;
645 //		parseip(addr, argv[1]);
646 	} else
647 		error(usage);
648 	if(argc == 3)
649 		ownhash = atoi(argv[2]);
650 	else
651 		ownhash = 0;
652 	for(i=0; i<b->nport; i++) {
653 		port = b->port[i];
654 		if(port != nil && port->type == type &&
655 		    memcmp(port->name, name, KNAMELEN) == 0)
656 			break;
657 	}
658 	if(i == b->nport)
659 		error("port not found");
660 	if(ownhash != 0 && port->ownhash != 0 && ownhash != port->ownhash)
661 		error("bad owner hash");
662 
663 	port->closed = 1;
664 	b->port[i] = nil;	// port is now unbound
665 	cacheflushport(b, i);
666 
667 	// try and stop reader
668 	if(port->readp)
669 		postnote(port->readp, 1, "unbind", 0);
670 	portfree(port);
671 }
672 
673 // assumes b is locked
674 static Centry *
675 cachelookup(Bridge *b, uchar d[Eaddrlen])
676 {
677 	int i;
678 	uint h;
679 	Centry *p;
680 	long sec;
681 
682 	// dont cache multicast or broadcast
683 	if(d[0] & 1)
684 		return 0;
685 
686 	h = 0;
687 	for(i=0; i<Eaddrlen; i++) {
688 		h *= 7;
689 		h += d[i];
690 	}
691 	h %= CacheHash;
692 	p = b->cache + h;
693 	sec = TK2SEC(m->ticks);
694 	for(i=0; i<CacheLook; i++,p++) {
695 		if(memcmp(d, p->d, Eaddrlen) == 0) {
696 			p->dst++;
697 			if(sec >= p->expire) {
698 				log(b, Logcache, "expired cache entry: %E %d\n",
699 					d, p->port);
700 				return nil;
701 			}
702 			p->expire = sec + CacheTimeout;
703 			return p;
704 		}
705 	}
706 	log(b, Logcache, "cache miss: %E\n", d);
707 	return nil;
708 }
709 
710 // assumes b is locked
711 static void
712 cacheupdate(Bridge *b, uchar d[Eaddrlen], int port)
713 {
714 	int i;
715 	uint h;
716 	Centry *p, *pp;
717 	long sec;
718 
719 	// dont cache multicast or broadcast
720 	if(d[0] & 1) {
721 		log(b, Logcache, "bad source address: %E\n", d);
722 		return;
723 	}
724 
725 	h = 0;
726 	for(i=0; i<Eaddrlen; i++) {
727 		h *= 7;
728 		h += d[i];
729 	}
730 	h %= CacheHash;
731 	p = b->cache + h;
732 	pp = p;
733 	sec = p->expire;
734 
735 	// look for oldest entry
736 	for(i=0; i<CacheLook; i++,p++) {
737 		if(memcmp(p->d, d, Eaddrlen) == 0) {
738 			p->expire = TK2SEC(m->ticks) + CacheTimeout;
739 			if(p->port != port) {
740 				log(b, Logcache, "NIC changed port %d->%d: %E\n",
741 					p->port, port, d);
742 				p->port = port;
743 			}
744 			p->src++;
745 			return;
746 		}
747 		if(p->expire < sec) {
748 			sec = p->expire;
749 			pp = p;
750 		}
751 	}
752 	if(pp->expire != 0)
753 		log(b, Logcache, "bumping from cache: %E %d\n", pp->d, pp->port);
754 	pp->expire = TK2SEC(m->ticks) + CacheTimeout;
755 	memmove(pp->d, d, Eaddrlen);
756 	pp->port = port;
757 	pp->src = 1;
758 	pp->dst = 0;
759 	log(b, Logcache, "adding to cache: %E %d\n", pp->d, pp->port);
760 }
761 
762 // assumes b is locked
763 static void
764 cacheflushport(Bridge *b, int port)
765 {
766 	Centry *ce;
767 	int i;
768 
769 	ce = b->cache;
770 	for(i=0; i<CacheSize; i++,ce++) {
771 		if(ce->port != port)
772 			continue;
773 		memset(ce, 0, sizeof(Centry));
774 	}
775 }
776 
777 static char *
778 cachedump(Bridge *b)
779 {
780 	int i, n;
781 	long sec, off;
782 	char *buf, *p, *ep;
783 	Centry *ce;
784 	char c;
785 
786 	qlock(b);
787 	if(waserror()) {
788 		qunlock(b);
789 		nexterror();
790 	}
791 	sec = TK2SEC(m->ticks);
792 	n = 0;
793 	for(i=0; i<CacheSize; i++)
794 		if(b->cache[i].expire != 0)
795 			n++;
796 
797 	n *= 51;	// change if print format is changed
798 	n += 10;	// some slop at the end
799 	buf = malloc(n);
800 	p = buf;
801 	ep = buf + n;
802 	ce = b->cache;
803 	off = seconds() - sec;
804 	for(i=0; i<CacheSize; i++,ce++) {
805 		if(ce->expire == 0)
806 			continue;
807 		c = (sec < ce->expire)?'v':'e';
808 		p += snprint(p, ep-p, "%E %2d %10ld %10ld %10ld %c\n", ce->d,
809 			ce->port, ce->src, ce->dst, ce->expire+off, c);
810 	}
811 	*p = 0;
812 	poperror();
813 	qunlock(b);
814 
815 	return buf;
816 }
817 
818 
819 
// assumes b is locked
//
// Flood bp to every bound port except the one it arrived on.
// Used for multicast/broadcast packets and for unicast packets whose
// destination is not in the cache.  Ownership of bp passes to this
// function: it is either handed to the last etherwrite or freed.
// Errors from individual etherwrite calls are caught and ignored so
// that one failing port does not stop delivery to the others.
static void
ethermultiwrite(Bridge *b, Block *bp, Port *port)
{
	Port *oport;
	Block *bp2;
	Etherpkt *ep;
	int i, mcast;

	/* covers failures outside the per-port writes, e.g. in copyblock */
	if(waserror()) {
		if(bp)
			freeb(bp);
		nexterror();
	}

	ep = (Etherpkt*)bp->rp;
	mcast = ep->d[0] & 1;		/* multicast bit of ethernet address */

	oport = nil;
	for(i=0; i<b->nport; i++) {
		if(i == port->id || b->port[i] == nil)
			continue;
		/*
		 * we need to forward multicast packets for ipv6,
		 * so always do it.
		 */
		if(mcast)
			b->port[i]->outmulti++;
		else
			b->port[i]->outunknown++;

		// delay one so that the last write does not copy
		if(oport != nil) {
			b->copy++;
			bp2 = copyblock(bp, blocklen(bp));
			if(!waserror()) {
				etherwrite(oport, bp2);
				poperror();
			}
		}
		oport = b->port[i];
	}

	// last write free block
	if(oport) {
		/* clear bp first so the outer error handler cannot free it a second time */
		bp2 = bp; bp = nil; USED(bp);
		if(!waserror()) {
			etherwrite(oport, bp2);
			poperror();
		}
	} else
		freeb(bp);

	poperror();
}
875 
876 static void
877 tcpmsshack(Etherpkt *epkt, int n)
878 {
879 	int hl, optlen;
880 	Iphdr *iphdr;
881 	Tcphdr *tcphdr;
882 	ulong mss, cksum;
883 	uchar *optr;
884 
885 	/* ignore non-ipv4 packets */
886 	if(nhgets(epkt->type) != ETIP4)
887 		return;
888 	iphdr = (Iphdr*)(epkt->data);
889 	n -= ETHERHDRSIZE;
890 	if(n < IPHDR)
891 		return;
892 
893 	/* ignore bad packets */
894 	if(iphdr->vihl != (IP_VER4|IP_HLEN4)) {
895 		hl = (iphdr->vihl&0xF)<<2;
896 		if((iphdr->vihl&0xF0) != IP_VER4 || hl < (IP_HLEN4<<2))
897 			return;
898 	} else
899 		hl = IP_HLEN4<<2;
900 
901 	/* ignore non-tcp packets */
902 	if(iphdr->proto != IP_TCPPROTO)
903 		return;
904 	n -= hl;
905 	if(n < sizeof(Tcphdr))
906 		return;
907 	tcphdr = (Tcphdr*)((uchar*)(iphdr) + hl);
908 	// MSS can only appear in SYN packet
909 	if(!(tcphdr->flag[1] & SYN))
910 		return;
911 	hl = (tcphdr->flag[0] & 0xf0)>>2;
912 	if(n < hl)
913 		return;
914 
915 	// check for MSS option
916 	optr = (uchar*)tcphdr + sizeof(Tcphdr);
917 	n = hl - sizeof(Tcphdr);
918 	for(;;) {
919 		if(n <= 0 || *optr == EOLOPT)
920 			return;
921 		if(*optr == NOOPOPT) {
922 			n--;
923 			optr++;
924 			continue;
925 		}
926 		optlen = optr[1];
927 		if(optlen < 2 || optlen > n)
928 			return;
929 		if(*optr == MSSOPT && optlen == MSS_LENGTH)
930 			break;
931 		n -= optlen;
932 		optr += optlen;
933 	}
934 
935 	mss = nhgets(optr+2);
936 	if(mss <= TcpMssMax)
937 		return;
938 	// fit checksum
939 	cksum = nhgets(tcphdr->cksum);
940 	if(optr-(uchar*)tcphdr & 1) {
941 print("tcpmsshack: odd alignment!\n");
942 		// odd alignments are a pain
943 		cksum += nhgets(optr+1);
944 		cksum -= (optr[1]<<8)|(TcpMssMax>>8);
945 		cksum += (cksum>>16);
946 		cksum &= 0xffff;
947 		cksum += nhgets(optr+3);
948 		cksum -= ((TcpMssMax&0xff)<<8)|optr[4];
949 		cksum += (cksum>>16);
950 	} else {
951 		cksum += mss;
952 		cksum -= TcpMssMax;
953 		cksum += (cksum>>16);
954 	}
955 	hnputs(tcphdr->cksum, cksum);
956 	hnputs(optr+2, TcpMssMax);
957 }
958 
959 /*
960  *  process to read from the ethernet
961  */
962 static void
963 etherread(void *a)
964 {
965 	Port *port = a;
966 	Bridge *b = port->bridge;
967 	Block *bp, *bp2;
968 	Etherpkt *ep;
969 	Centry *ce;
970 	long md;
971 
972 	qlock(b);
973 	port->readp = up;	/* hide identity under a rock for unbind */
974 
975 	while(!port->closed){
976 		// release lock to read - error means it is time to quit
977 		qunlock(b);
978 		if(waserror()) {
979 			print("etherread read error: %s\n", up->errstr);
980 			qlock(b);
981 			break;
982 		}
983 		if(0)
984 			print("devbridge: etherread: reading\n");
985 		bp = devtab[port->data[0]->type]->bread(port->data[0],
986 			ETHERMAXTU, 0);
987 		if(0)
988 			print("devbridge: etherread: blocklen = %d\n",
989 				blocklen(bp));
990 		poperror();
991 		qlock(b);
992 		if(bp == nil || port->closed)
993 			break;
994 		if(waserror()) {
995 //			print("etherread bridge error\n");
996 			if(bp)
997 				freeb(bp);
998 			continue;
999 		}
1000 		if(blocklen(bp) < ETHERMINTU)
1001 			error("short packet");
1002 		port->in++;
1003 
1004 		ep = (Etherpkt*)bp->rp;
1005 		cacheupdate(b, ep->s, port->id);
1006 		if(b->tcpmss)
1007 			tcpmsshack(ep, BLEN(bp));
1008 
1009 		/*
1010 		 * delay packets to simulate a slow link
1011 		 */
1012 		if(b->delay0 || b->delayn){
1013 			md = b->delay0 + b->delayn * BLEN(bp);
1014 			if(md > 0)
1015 				microdelay(md);
1016 		}
1017 
1018 		if(ep->d[0] & 1) {
1019 			log(b, Logmcast, "multicast: port=%d src=%E dst=%E type=%#.4ux\n",
1020 				port->id, ep->s, ep->d, ep->type[0]<<8|ep->type[1]);
1021 			port->inmulti++;
1022 			bp2 = bp; bp = nil;
1023 			ethermultiwrite(b, bp2, port);
1024 		} else {
1025 			ce = cachelookup(b, ep->d);
1026 			if(ce == nil) {
1027 				b->miss++;
1028 				port->inunknown++;
1029 				bp2 = bp; bp = nil;
1030 				ethermultiwrite(b, bp2, port);
1031 			}else if(ce->port != port->id){
1032 				b->hit++;
1033 				bp2 = bp; bp = nil;
1034 				etherwrite(b->port[ce->port], bp2);
1035 			}
1036 		}
1037 
1038 		poperror();
1039 		if(bp)
1040 			freeb(bp);
1041 	}
1042 //	print("etherread: trying to exit\n");
1043 	port->readp = nil;
1044 	portfree(port);
1045 	qunlock(b);
1046 	pexit("hangup", 1);
1047 }
1048 
1049 static int
1050 fragment(Etherpkt *epkt, int n)
1051 {
1052 	Iphdr *iphdr;
1053 
1054 	if(n <= TunnelMtu)
1055 		return 0;
1056 
1057 	/* ignore non-ipv4 packets */
1058 	if(nhgets(epkt->type) != ETIP4)
1059 		return 0;
1060 	iphdr = (Iphdr*)(epkt->data);
1061 	n -= ETHERHDRSIZE;
1062 	/*
1063 	 * ignore: IP runt packets, bad packets (I don't handle IP
1064 	 * options for the moment), packets with don't-fragment set,
1065 	 * and short blocks.
1066 	 */
1067 	if(n < IPHDR || iphdr->vihl != (IP_VER4|IP_HLEN4) ||
1068 	    iphdr->frag[0] & (IP_DF>>8) || nhgets(iphdr->length) > n)
1069 		return 0;
1070 
1071 	return 1;
1072 }
1073 
1074 
/*
 * Write one packet to a port, taking ownership of bp.
 *
 * Ether ports, and tunnel packets that fragment() rejects, are passed
 * straight to the port's write channel.  Otherwise the IPv4 payload is
 * split into fragments whose data size is a multiple of 8 and which
 * fit in TunnelMtu; each fragment gets a copy of the original Ethernet
 * and IP headers with length, fragment offset/flags and header
 * checksum rewritten, and is written separately.  The original block
 * list is freed afterwards, also when an error is raised part-way.
 */
static void
etherwrite(Port *port, Block *bp)
{
	Iphdr *eh, *feh;
	Etherpkt *epkt;
	int n, lid, len, seglen, chunk, dlen, blklen, offset, mf;
	Block *xp, *nb;
	ushort fragoff, frag;

	port->out++;
	epkt = (Etherpkt*)bp->rp;
	n = blocklen(bp);
	if(port->type != Ttun || !fragment(epkt, n)) {
		devtab[port->data[1]->type]->bwrite(port->data[1], bp, 0);
		return;
	}
	port->outfrag++;
	if(waserror()){
		freeblist(bp);
		nexterror();
	}

	/* largest payload per fragment, rounded down to a multiple of 8 */
	seglen = (TunnelMtu - ETHERHDRSIZE - IPHDR) & ~7;
	eh = (Iphdr*)(epkt->data);
	len = nhgets(eh->length);
	frag = nhgets(eh->frag);
	mf = frag & IP_MF;
	/* frag is 16 bits wide: the shift drops the flag bits and turns the offset into bytes */
	frag <<= 3;
	dlen = len - IPHDR;
	xp = bp;
	lid = nhgets(eh->id);
	/* position xp/rp on the first payload byte, past the Ethernet and IP headers */
	offset = ETHERHDRSIZE+IPHDR;
	while(xp != nil && offset && offset >= BLEN(xp)) {
		offset -= BLEN(xp);
		xp = xp->next;
	}
	/* NOTE(review): xp is dereferenced without a nil check; presumably safe because fragment() verified the list holds the whole IP packet -- confirm */
	xp->rp += offset;

	if(0)
		print("seglen=%d, dlen=%d, mf=%x, frag=%d\n",
			seglen, dlen, mf, frag);
	for(fragoff = 0; fragoff < dlen; fragoff += seglen) {
		nb = allocb(ETHERHDRSIZE+IPHDR+seglen);

		feh = (Iphdr*)(nb->wp+ETHERHDRSIZE);

		memmove(nb->wp, epkt, ETHERHDRSIZE+IPHDR);
		nb->wp += ETHERHDRSIZE+IPHDR;

		if((fragoff + seglen) >= dlen) {
			/* last fragment: shorter, and it inherits the original more-fragments bit */
			seglen = dlen - fragoff;
			hnputs(feh->frag, (frag+fragoff)>>3 | mf);
		}
		else
			/* + binds tighter than >>, so this is (frag+fragoff)>>3 */
			hnputs(feh->frag, (frag+fragoff>>3) | IP_MF);

		hnputs(feh->length, seglen + IPHDR);
		hnputs(feh->id, lid);

		/* Copy up the data area */
		chunk = seglen;
		while(chunk) {
			blklen = chunk;
			if(BLEN(xp) < chunk)
				blklen = BLEN(xp);
			memmove(nb->wp, xp->rp, blklen);
			nb->wp += blklen;
			xp->rp += blklen;
			chunk -= blklen;
			if(xp->rp == xp->wp)
				xp = xp->next;
		}

		/* checksum is computed over the header with its checksum field zeroed */
		feh->cksum[0] = 0;
		feh->cksum[1] = 0;
		hnputs(feh->cksum, ipcsum(&feh->vihl));

		/* don't generate small packets */
		if(BLEN(nb) < ETHERMINTU)
			nb->wp = nb->rp + ETHERMINTU;
		devtab[port->data[1]->type]->bwrite(port->data[1], nb, 0);
	}
	poperror();
	freeblist(bp);
}
1160 
1161 // hold b lock
1162 static void
1163 portfree(Port *port)
1164 {
1165 	port->ref--;
1166 	if(port->ref < 0)
1167 		panic("portfree: bad ref");
1168 	if(port->ref > 0)
1169 		return;
1170 
1171 	if(port->data[0])
1172 		cclose(port->data[0]);
1173 	if(port->data[1])
1174 		cclose(port->data[1]);
1175 	memset(port, 0, sizeof(Port));
1176 	free(port);
1177 }
1178 
/*
 * Device switch entry for the bridge: device character 'B', name
 * "bridge".  Operations not implemented in this file use the generic
 * dev* routines.  The slot order follows the Dev structure, which is
 * declared elsewhere -- presumably reset, init, shutdown, attach, walk,
 * stat, open, create, close, read, bread, write, bwrite, remove, wstat;
 * confirm against its definition.
 */
Dev bridgedevtab = {
	'B',
	"bridge",

	devreset,
	bridgeinit,
	devshutdown,
	bridgeattach,
	bridgewalk,
	bridgestat,
	bridgeopen,
	devcreate,
	bridgeclose,
	bridgeread,
	devbread,
	bridgewrite,
	devbwrite,
	devremove,
	devwstat,
};
1199