xref: /inferno-os/os/port/devbridge.c (revision 4eb166cf184c1f102fb79e31b1465ea3e2021c39)
1 #include "u.h"
2 #include "../port/lib.h"
3 #include "mem.h"
4 #include "dat.h"
5 #include "fns.h"
6 #include "../port/netif.h"
7 #include "../port/error.h"
8 
/* short type names for the structures defined further down in this file */
typedef struct Bridge 	Bridge;
typedef struct Port 	Port;
typedef struct Centry	Centry;
typedef struct Iphdr	Iphdr;
typedef struct Tcphdr	Tcphdr;
14 
/*
 *  Qid types of the files and directories served by this device
 *  (kept in the low byte of the qid path, see TYPE()), followed by
 *  the sizing and tuning constants of the bridge.
 */
enum
{
	Qtopdir=	1,		/* top level directory */

	Qbridgedir,			/* bridge* directory */
	Qbctl,				/* bridge "ctl" file */
	Qstats,				/* bridge "stats" file */
	Qcache,				/* bridge "cache" file */
	Qlog,				/* bridge "log" file */

	Qportdir,			/* directory for one port */
	Qpctl,				/* port "ctl" file */
	Qlocal,				/* port "local" file */
	Qstatus,			/* port "status" file */

	MaxQ,				/* number of entries in dirtab[] */

	Maxbridge=	4,		// size of bridgetab[]
	Maxport=	128,		// power of 2: PORT() masks with Maxport-1
	CacheHash=	257,		// prime
	CacheLook=	5,		// how many cache entries to examine
	CacheSize=	(CacheHash+CacheLook-1),	// room for a probe that starts at the last hash slot
	CacheTimeout=	5*60,		// timeout for cache entry in seconds

	TcpMssMax = 1300,			// max desirable Tcp MSS value
	TunnelMtu = 1400,			// frames longer than this are candidates for fragmenting on tunnel ports
};
42 
/*
 *  plain files of a bridge directory; bridgegen lists them ahead of
 *  the per-port directories
 */
static Dirtab bridgedirtab[]={
	"ctl",		{Qbctl},	0,	0666,
	"stats",	{Qstats},	0,	0444,
	"cache",	{Qcache},	0,	0444,
	"log",		{Qlog},		0,	0666,
};
49 
/*
 *  plain files of a port directory; bridgegen combines the type of
 *  each entry with the port number to build the real qid
 */
static Dirtab portdirtab[]={
	"ctl",		{Qpctl},	0,	0666,
	"local",	{Qlocal},	0,	0444,
	"status",	{Qstatus},	0,	0444,
};
55 
/* log classes, passed to logb() to choose what gets logged */
enum {
	Logcache=	(1<<0),		// address cache activity
	Logmcast=	(1<<1),		// multicast and broadcast packets
};

// types of interfaces
enum
{
	Tether,		// "ether" port: one channel used for both reading and writing
	Ttun,		// "tunnel" port: separate read and write channels
};

/* names accepted by logctl() for the log classes above; nil ends the table */
static Logflag logflags[] =
{
	{ "cache",	Logcache, },
	{ "multicast",	Logmcast, },
	{ nil,		0, },
};
74 
/* plain-file Dirtab entries indexed by qid type; filled in by bridgeinit */
static Dirtab	*dirtab[MaxQ];

/*
 *  qid path layout: the low 8 bits hold the Q* type, the bits above
 *  hold the port number (masked to Maxport-1 when extracted)
 */
#define TYPE(x) 	(((ulong)(x).path) & 0xff)
#define PORT(x) 	((((ulong)(x).path) >> 8)&(Maxport-1))
#define QID(x, y) 	(((x)<<8) | (y))
80 
/*
 *  One slot of the address cache: the port a station address was
 *  last seen on.  A slot whose expire is 0 is treated as unused by
 *  cachedump.
 */
struct Centry
{
	uchar	d[Eaddrlen];	// station address
	int	port;		// id of the port the address was seen on
	long	expire;		// entry expires this number of seconds after boot time
	long	src;		// times the address was seen as a source (cacheupdate)
	long	dst;		// times the address was looked up as a destination (cachelookup)
};
89 
/*
 *  State of one bridge.  The embedded QLock protects the port table,
 *  the cache and the counters; the embedded Log backs the "log" file.
 */
struct Bridge
{
	QLock;
	int	nport;		// one more than the highest port id bound so far; never lowered in this file
	Port	*port[Maxport];	// bound ports indexed by id, nil when free
	Centry	cache[CacheSize];	// address cache, see cachelookup/cacheupdate
	ulong	hit;		// unicast packets forwarded to a cached port
	ulong	miss;		// unicast packets flooded because of a cache miss
	ulong	copy;		// block copies made while flooding
	long	delay0;		// constant microsecond delay per packet
	long	delayn;		// microsecond delay per byte
	int tcpmss;		// modify tcpmss value

	Log;
};
105 
/*
 *  One interface bound to a bridge.  Reference counted: portbind
 *  leaves one reference for the bridge table and one for the reader
 *  process; portfree drops them.
 */
struct Port
{
	int	id;		// index into Bridge.port[]
	Bridge	*bridge;
	int	ref;
	int	closed;		// set by portunbind; tells the reader to exit

	Chan	*data[2];	// channel to data: [0] is read, [1] is written

	int	mcast;		// send multi cast packets (not set anywhere in this file)

	Proc	*readp;		// read proc

	// the following uniquely identifies the port
	int	type;
	char	name[KNAMELEN];

	// owner hash - avoids bind/unbind races
	ulong	ownhash;

	// various stats
	int	in;		// number of packets read
	int	inmulti;	// multicast or broadcast
	int	inunknown;	// unknown address
	int	out;		// number of packets written
	int	outmulti;	// multicast or broadcast
	int	outunknown;	// unknown address
	int outfrag;	// fragmented the packet
	int	nentry;		// number of cache entries for this port (not maintained in this file)
};
136 
/* IP and TCP header constants used by tcpmsshack, fragment and etherwrite */
enum {
	IP_VER		= 0x40,		/* Using IP version 4 */
	IP_HLEN		= 0x05,		/* Header length in 4-byte words (no options) */
	IP_DF		= 0x4000,	/* Don't fragment */
	IP_MF		= 0x2000,	/* More fragments */
	IP_MAX		= (32*1024),	/* Maximum Internet packet size */
	IP_TCPPROTO = 6,		/* IP protocol number of TCP */
	EOLOPT		= 0,		/* TCP option kind: end of option list */
	NOOPOPT		= 1,		/* TCP option kind: no operation (one byte, no length) */
	MSSOPT		= 2,		/* TCP option kind: maximum segment size */
	MSS_LENGTH	= 4,		/* Length in bytes of the whole MSS option */
	SYN		= 0x02,		/* Pkt. is synchronise */
	IPHDR		= 20,		/* sizeof(Iphdr) */
};
151 
/*
 *  IPv4 header without options, laid out byte by byte; the
 *  multi-byte fields are accessed with nhgets/hnputs
 */
struct Iphdr
{
	uchar	vihl;		/* Version and header length */
	uchar	tos;		/* Type of service */
	uchar	length[2];	/* packet length */
	uchar	id[2];		/* ip->identification */
	uchar	frag[2];	/* Fragment information */
	uchar	ttl;		/* Time to live */
	uchar	proto;		/* Protocol */
	uchar	cksum[2];	/* Header checksum */
	uchar	src[4];		/* IP source */
	uchar	dst[4];		/* IP destination */
};
165 
/* fixed part of a TCP header; options, if any, follow it */
struct Tcphdr
{
	uchar	sport[2];	/* source port */
	uchar	dport[2];	/* destination port */
	uchar	seq[4];		/* sequence number */
	uchar	ack[4];		/* acknowledgement number */
	uchar	flag[2];	/* [0] high nibble: header length in 4-byte words; [1]: flag bits such as SYN */
	uchar	win[2];		/* window */
	uchar	cksum[2];	/* checksum, adjusted by tcpmsshack */
	uchar	urg[2];		/* urgent pointer */
};
177 
/* all bridges; indexed by the device number chosen at attach time */
static Bridge bridgetab[Maxbridge];

/*
 *  open mode to permission bits; only OREAD, OWRITE and ORDWR have
 *  entries, so the table must not be indexed with any other mode
 */
static int m2p[] = {
	[OREAD]		4,
	[OWRITE]	2,
	[ORDWR]		6
};
185 
/* forward declarations for functions used before their definition */
static int	bridgegen(Chan *c, char*, Dirtab*, int, int s, Dir *dp);
static void	portbind(Bridge *b, int argc, char *argv[]);
static void	portunbind(Bridge *b, int argc, char *argv[]);
static void	etherread(void *a);
static char	*cachedump(Bridge *b);
static void	portfree(Port *port);
static void	cacheflushport(Bridge *b, int port);
static void	etherwrite(Port *port, Block *bp);

/* defined outside this file; parseip is only referenced from a comment here */
extern ulong	parseip(uchar*, char*);
extern ushort	ipcsum(uchar *addr);
197 
198 static void
199 bridgeinit(void)
200 {
201 	int i;
202 	Dirtab *dt;
203 	// setup dirtab with non directory entries
204 	for(i=0; i<nelem(bridgedirtab); i++) {
205 		dt = bridgedirtab + i;
206 		dirtab[TYPE(dt->qid)] = dt;
207 	}
208 	for(i=0; i<nelem(portdirtab); i++) {
209 		dt = portdirtab + i;
210 		dirtab[TYPE(dt->qid)] = dt;
211 	}
212 }
213 
214 static Chan*
215 bridgeattach(char* spec)
216 {
217 	Chan *c;
218 	int dev;
219 
220 	dev = atoi(spec);
221 	if(dev<0 || dev >= Maxbridge)
222 		error("bad specification");
223 
224 	c = devattach('B', spec);
225 	mkqid(&c->qid, QID(0, Qtopdir), 0, QTDIR);
226 	c->dev = dev;
227 
228 	return c;
229 }
230 
231 static Walkqid*
232 bridgewalk(Chan *c, Chan *nc, char **name, int nname)
233 {
234 	return devwalk(c, nc, name, nname, (Dirtab*)0, 0, bridgegen);
235 }
236 
237 static int
238 bridgestat(Chan* c, uchar* db, int n)
239 {
240 	return devstat(c, db, n, (Dirtab *)0, 0L, bridgegen);
241 }
242 
243 static Chan*
244 bridgeopen(Chan* c, int omode)
245 {
246 	int perm;
247 	Bridge *b;
248 
249 	omode &= 3;
250 	perm = m2p[omode];
251 	USED(perm);
252 
253 	b = bridgetab + c->dev;
254 	USED(b);
255 
256 	switch(TYPE(c->qid)) {
257 	default:
258 		break;
259 	case Qlog:
260 		logopen(b);
261 		break;
262 	case Qcache:
263 		c->aux = cachedump(b);
264 		break;
265 	}
266 	c->mode = openmode(omode);
267 	c->flag |= COPEN;
268 	c->offset = 0;
269 	return c;
270 }
271 
272 static void
273 bridgeclose(Chan* c)
274 {
275 	Bridge *b  = bridgetab + c->dev;
276 
277 	switch(TYPE(c->qid)) {
278 	case Qcache:
279 		if(c->flag & COPEN)
280 			free(c->aux);
281 		break;
282 	case Qlog:
283 		if(c->flag & COPEN)
284 			logclose(b);
285 		break;
286 	}
287 }
288 
289 static long
290 bridgeread(Chan *c, void *a, long n, vlong off)
291 {
292 	char buf[256];
293 	Bridge *b = bridgetab + c->dev;
294 	Port *port;
295 	int i, ingood, outgood;
296 
297 	USED(off);
298 	switch(TYPE(c->qid)) {
299 	default:
300 		error(Eperm);
301 	case Qtopdir:
302 	case Qbridgedir:
303 	case Qportdir:
304 		return devdirread(c, a, n, 0, 0, bridgegen);
305 	case Qlog:
306 		return logread(b, a, off, n);
307 	case Qstatus:
308 		qlock(b);
309 		port = b->port[PORT(c->qid)];
310 		if(port == 0)
311 			strcpy(buf, "unbound\n");
312 		else {
313 			i = 0;
314 			switch(port->type) {
315 			default: panic("bridgeread: unknown port type: %d", port->type);
316 			case Tether:
317 				i += snprint(buf+i, sizeof(buf)-i, "ether %s: ", port->name);
318 				break;
319 			case Ttun:
320 				i += snprint(buf+i, sizeof(buf)-i, "tunnel %s: ", port->name);
321 				break;
322 			}
323 			ingood = port->in-port->inmulti-port->inunknown;
324 			outgood = port->out-port->outmulti-port->outunknown;
325 			i += snprint(buf+i, sizeof(buf)-i, "in=%d(%d:%d:%d) out=%d(%d:%d:%d:%d)\n",
326 				port->in, ingood, port->inmulti, port->inunknown,
327 				port->out, outgood, port->outmulti, port->outunknown, port->outfrag);
328 			USED(i);
329 		}
330 		n = readstr(off, a, n, buf);
331 		qunlock(b);
332 		return n;
333 	case Qbctl:
334 		snprint(buf, sizeof(buf), "%s tcpmss\ndelay %ld %ld\n", b->tcpmss ? "set" : "clear",
335 			b->delay0, b->delayn);
336 		n = readstr(off, a, n, buf);
337 		return n;
338 	case Qcache:
339 		n = readstr(off, a, n, c->aux);
340 		return n;
341 	case Qstats:
342 		snprint(buf, sizeof(buf), "hit=%uld miss=%uld copy=%uld\n",
343 			b->hit, b->miss, b->copy);
344 		n = readstr(off, a, n, buf);
345 		return n;
346 	}
347 }
348 
349 static void
350 bridgeoption(Bridge *b, char *option, int value)
351 {
352 	if(strcmp(option, "tcpmss") == 0)
353 		b->tcpmss = value;
354 	else
355 		error("unknown bridge option");
356 }
357 
358 
359 static long
360 bridgewrite(Chan *c, void *a, long n, vlong off)
361 {
362 	Bridge *b = bridgetab + c->dev;
363 	Cmdbuf *cb;
364 	char *arg0;
365 	char *p;
366 
367 	USED(off);
368 	switch(TYPE(c->qid)) {
369 	default:
370 		error(Eperm);
371 	case Qbctl:
372 		cb = parsecmd(a, n);
373 		qlock(b);
374 		if(waserror()) {
375 			qunlock(b);
376 			free(cb);
377 			nexterror();
378 		}
379 		if(cb->nf == 0)
380 			error("short write");
381 		arg0 = cb->f[0];
382 		if(strcmp(arg0, "bind") == 0) {
383 			portbind(b, cb->nf-1, cb->f+1);
384 		} else if(strcmp(arg0, "unbind") == 0) {
385 			portunbind(b, cb->nf-1, cb->f+1);
386 		} else if(strcmp(arg0, "cacheflush") == 0) {
387 			logb(b, Logcache, "cache flush\n");
388 			memset(b->cache, 0, CacheSize*sizeof(Centry));
389 		} else if(strcmp(arg0, "set") == 0) {
390 			if(cb->nf != 2)
391 				error("usage: set option");
392 			bridgeoption(b, cb->f[1], 1);
393 		} else if(strcmp(arg0, "clear") == 0) {
394 			if(cb->nf != 2)
395 				error("usage: clear option");
396 			bridgeoption(b, cb->f[1], 0);
397 		} else if(strcmp(arg0, "delay") == 0) {
398 			if(cb->nf != 3)
399 				error("usage: delay delay0 delayn");
400 			b->delay0 = strtol(cb->f[1], nil, 10);
401 			b->delayn = strtol(cb->f[2], nil, 10);
402 		} else
403 			error("unknown control request");
404 		poperror();
405 		qunlock(b);
406 		free(cb);
407 		return n;
408 	case Qlog:
409 		cb = parsecmd(a, n);
410 		p = logctl(b, cb->nf, cb->f, logflags);
411 		free(cb);
412 		if(p != nil)
413 			error(p);
414 		return n;
415 	}
416 }
417 
418 static int
419 bridgegen(Chan *c, char *, Dirtab*, int, int s, Dir *dp)
420 {
421 	Bridge *b = bridgetab + c->dev;
422 	int type = TYPE(c->qid);
423 	Dirtab *dt;
424 	Qid qid;
425 
426 	if(s  == DEVDOTDOT){
427 		switch(TYPE(c->qid)){
428 		case Qtopdir:
429 		case Qbridgedir:
430 			snprint(up->genbuf, sizeof(up->genbuf), "#B%ld", c->dev);
431 			mkqid(&qid, Qtopdir, 0, QTDIR);
432 			devdir(c, qid, up->genbuf, 0, eve, 0555, dp);
433 			break;
434 		case Qportdir:
435 			snprint(up->genbuf, sizeof(up->genbuf), "bridge%ld", c->dev);
436 			mkqid(&qid, Qbridgedir, 0, QTDIR);
437 			devdir(c, qid, up->genbuf, 0, eve, 0555, dp);
438 			break;
439 		default:
440 			panic("bridgewalk %llux", c->qid.path);
441 		}
442 		return 1;
443 	}
444 
445 	switch(type) {
446 	default:
447 		// non directory entries end up here
448 		if(c->qid.type & QTDIR)
449 			panic("bridgegen: unexpected directory");
450 		if(s != 0)
451 			return -1;
452 		dt = dirtab[TYPE(c->qid)];
453 		if(dt == nil)
454 			panic("bridgegen: unknown type: %lud", TYPE(c->qid));
455 		devdir(c, c->qid, dt->name, dt->length, eve, dt->perm, dp);
456 		return 1;
457 	case Qtopdir:
458 		if(s != 0)
459 			return -1;
460 		snprint(up->genbuf, sizeof(up->genbuf), "bridge%ld", c->dev);
461 		mkqid(&qid, QID(0, Qbridgedir), 0, QTDIR);
462 		devdir(c, qid, up->genbuf, 0, eve, 0555, dp);
463 		return 1;
464 	case Qbridgedir:
465 		if(s<nelem(bridgedirtab)) {
466 			dt = bridgedirtab+s;
467 			devdir(c, dt->qid, dt->name, dt->length, eve, dt->perm, dp);
468 			return 1;
469 		}
470 		s -= nelem(bridgedirtab);
471 		if(s >= b->nport)
472 			return -1;
473 		mkqid(&qid, QID(s, Qportdir), 0, QTDIR);
474 		snprint(up->genbuf, sizeof(up->genbuf), "%d", s);
475 		devdir(c, qid, up->genbuf, 0, eve, 0555, dp);
476 		return 1;
477 	case Qportdir:
478 		if(s>=nelem(portdirtab))
479 			return -1;
480 		dt = portdirtab+s;
481 		mkqid(&qid, QID(PORT(c->qid),TYPE(dt->qid)), 0, QTFILE);
482 		devdir(c, qid, dt->name, dt->length, eve, dt->perm, dp);
483 		return 1;
484 	}
485 }
486 
// also in netif.c
/*
 *  Convert alen bytes written as pairs of hex digits, each pair
 *  optionally followed by ':', from the string from into to.
 *  Returns 0 on success, or -1 when the string ends before alen
 *  bytes have been read (bytes converted so far stay in to).
 *  Characters that are not hex digits are not rejected; strtoul
 *  simply stops at them.
 */
static int
parseaddr(uchar *to, char *from, int alen)
{
	char digits[3];

	digits[2] = 0;
	while(alen-- > 0){
		/* every byte needs two characters */
		if(from[0] == 0 || from[1] == 0)
			return -1;
		digits[0] = from[0];
		digits[1] = from[1];
		from += 2;
		*to++ = strtoul(digits, 0, 16);
		if(*from == ':')
			from++;
	}
	return 0;
}
510 
511 // assumes b is locked
512 static void
513 portbind(Bridge *b, int argc, char *argv[])
514 {
515 	Port *port;
516 	char path[8*KNAMELEN];
517 	char buf[100];
518 	char *dev, *dev2=nil, *p;
519 	Chan *ctl;
520 	int type=0, i, n;
521 	char *usage = "usage: bind ether|tunnel name ownhash dev [dev2]";
522 	char name[KNAMELEN];
523 	ulong ownhash;
524 
525 	memset(name, 0, KNAMELEN);
526 	if(argc < 4)
527 		error(usage);
528 	if(strcmp(argv[0], "ether") == 0) {
529 		if(argc != 4)
530 			error(usage);
531 		type = Tether;
532 		strncpy(name, argv[1], KNAMELEN);
533 		name[KNAMELEN-1] = 0;
534 //		parseaddr(addr, argv[1], Eaddrlen);
535 	} else if(strcmp(argv[0], "tunnel") == 0) {
536 		if(argc != 5)
537 			error(usage);
538 		type = Ttun;
539 		strncpy(name, argv[1], KNAMELEN);
540 		name[KNAMELEN-1] = 0;
541 //		parseip(addr, argv[1]);
542 		dev2 = argv[4];
543 	} else
544 		error(usage);
545 	ownhash = atoi(argv[2]);
546 	dev = argv[3];
547 	for(i=0; i<b->nport; i++) {
548 		port = b->port[i];
549 		if(port != nil)
550 		if(port->type == type)
551 		if(memcmp(port->name, name, KNAMELEN) == 0)
552 			error("port in use");
553 	}
554 	for(i=0; i<Maxport; i++)
555 		if(b->port[i] == nil)
556 			break;
557 	if(i == Maxport)
558 		error("no more ports");
559 	port = smalloc(sizeof(Port));
560 	port->ref = 1;
561 	port->id = i;
562 	port->ownhash = ownhash;
563 
564 	if(waserror()) {
565 		portfree(port);
566 		nexterror();
567 	}
568 	port->type = type;
569 	memmove(port->name, name, KNAMELEN);
570 	switch(port->type) {
571 	default: panic("portbind: unknown port type: %d", type);
572 	case Tether:
573 		snprint(path, sizeof(path), "%s/clone", dev);
574 		ctl = namec(path, Aopen, ORDWR, 0);
575 		if(waserror()) {
576 			cclose(ctl);
577 			nexterror();
578 		}
579 		// check addr?
580 
581 		// get directory name
582 		n = devtab[ctl->type]->read(ctl, buf, sizeof(buf), 0);
583 		buf[n] = 0;
584 		for(p = buf; *p == ' '; p++)
585 			;
586 		snprint(path, sizeof(path), "%s/%lud/data", dev, strtoul(p, 0, 0));
587 
588 		// setup connection to be promiscuous
589 		snprint(buf, sizeof(buf), "connect -1");
590 		devtab[ctl->type]->write(ctl, buf, strlen(buf), 0);
591 		snprint(buf, sizeof(buf), "promiscuous");
592 		devtab[ctl->type]->write(ctl, buf, strlen(buf), 0);
593 		snprint(buf, sizeof(buf), "bridge");
594 		devtab[ctl->type]->write(ctl, buf, strlen(buf), 0);
595 
596 		// open data port
597 		port->data[0] = namec(path, Aopen, ORDWR, 0);
598 		// dup it
599 		incref(port->data[0]);
600 		port->data[1] = port->data[0];
601 
602 		poperror();
603 		cclose(ctl);
604 
605 		break;
606 	case Ttun:
607 		port->data[0] = namec(dev, Aopen, OREAD, 0);
608 		port->data[1] = namec(dev2, Aopen, OWRITE, 0);
609 		break;
610 	}
611 
612 	poperror();
613 
614 	// commited to binding port
615 	b->port[port->id] = port;
616 	port->bridge = b;
617 	if(b->nport <= port->id)
618 		b->nport = port->id+1;
619 
620 	// assumes kproc always succeeds
621 	kproc("etherread", etherread, port, 0);	// poperror must be next
622 	port->ref++;
623 }
624 
625 // assumes b is locked
626 static void
627 portunbind(Bridge *b, int argc, char *argv[])
628 {
629 	Port *port=nil;
630 	int type=0, i;
631 	char *usage = "usage: unbind ether|tunnel addr [ownhash]";
632 	char name[KNAMELEN];
633 	ulong ownhash;
634 
635 	memset(name, 0, KNAMELEN);
636 	if(argc < 2 || argc > 3)
637 		error(usage);
638 	if(strcmp(argv[0], "ether") == 0) {
639 		type = Tether;
640 		strncpy(name, argv[1], KNAMELEN);
641 		name[KNAMELEN-1] = 0;
642 //		parseaddr(addr, argv[1], Eaddrlen);
643 	} else if(strcmp(argv[0], "tunnel") == 0) {
644 		type = Ttun;
645 		strncpy(name, argv[1], KNAMELEN);
646 		name[KNAMELEN-1] = 0;
647 //		parseip(addr, argv[1]);
648 	} else
649 		error(usage);
650 	if(argc == 3)
651 		ownhash = atoi(argv[2]);
652 	else
653 		ownhash = 0;
654 	for(i=0; i<b->nport; i++) {
655 		port = b->port[i];
656 		if(port != nil)
657 		if(port->type == type)
658 		if(memcmp(port->name, name, KNAMELEN) == 0)
659 			break;
660 	}
661 	if(i == b->nport)
662 		error("port not found");
663 	if(ownhash != 0 && port->ownhash != 0 && ownhash != port->ownhash)
664 		error("bad owner hash");
665 
666 	port->closed = 1;
667 	b->port[i] = nil;	// port is now unbound
668 	cacheflushport(b, i);
669 
670 	// try and stop reader
671 	if(port->readp)
672 		postnote(port->readp, 1, "unbind", 0);
673 	portfree(port);
674 }
675 
676 // assumes b is locked
677 static Centry *
678 cachelookup(Bridge *b, uchar d[Eaddrlen])
679 {
680 	int i;
681 	uint h;
682 	Centry *p;
683 	long sec;
684 
685 	// dont cache multicast or broadcast
686 	if(d[0] & 1)
687 		return 0;
688 
689 	h = 0;
690 	for(i=0; i<Eaddrlen; i++) {
691 		h *= 7;
692 		h += d[i];
693 	}
694 	h %= CacheHash;
695 	p = b->cache + h;
696 	sec = TK2SEC(m->ticks);
697 	for(i=0; i<CacheLook; i++,p++) {
698 		if(memcmp(d, p->d, Eaddrlen) == 0) {
699 			p->dst++;
700 			if(sec >= p->expire) {
701 				logb(b, Logcache, "expired cache entry: %E %d\n",
702 					d, p->port);
703 				return nil;
704 			}
705 			p->expire = sec + CacheTimeout;
706 			return p;
707 		}
708 	}
709 	logb(b, Logcache, "cache miss: %E\n", d);
710 	return nil;
711 }
712 
713 // assumes b is locked
714 static void
715 cacheupdate(Bridge *b, uchar d[Eaddrlen], int port)
716 {
717 	int i;
718 	uint h;
719 	Centry *p, *pp;
720 	long sec;
721 
722 	// dont cache multicast or broadcast
723 	if(d[0] & 1) {
724 		logb(b, Logcache, "bad source address: %E\n", d);
725 		return;
726 	}
727 
728 	h = 0;
729 	for(i=0; i<Eaddrlen; i++) {
730 		h *= 7;
731 		h += d[i];
732 	}
733 	h %= CacheHash;
734 	p = b->cache + h;
735 	pp = p;
736 	sec = p->expire;
737 
738 	// look for oldest entry
739 	for(i=0; i<CacheLook; i++,p++) {
740 		if(memcmp(p->d, d, Eaddrlen) == 0) {
741 			p->expire = TK2SEC(m->ticks) + CacheTimeout;
742 			if(p->port != port) {
743 				logb(b, Logcache, "NIC changed port %d->%d: %E\n",
744 					p->port, port, d);
745 				p->port = port;
746 			}
747 			p->src++;
748 			return;
749 		}
750 		if(p->expire < sec) {
751 			sec = p->expire;
752 			pp = p;
753 		}
754 	}
755 	if(pp->expire != 0)
756 		logb(b, Logcache, "bumping from cache: %E %d\n", pp->d, pp->port);
757 	pp->expire = TK2SEC(m->ticks) + CacheTimeout;
758 	memmove(pp->d, d, Eaddrlen);
759 	pp->port = port;
760 	pp->src = 1;
761 	pp->dst = 0;
762 	logb(b, Logcache, "adding to cache: %E %d\n", pp->d, pp->port);
763 }
764 
765 // assumes b is locked
766 static void
767 cacheflushport(Bridge *b, int port)
768 {
769 	Centry *ce;
770 	int i;
771 
772 	ce = b->cache;
773 	for(i=0; i<CacheSize; i++,ce++) {
774 		if(ce->port != port)
775 			continue;
776 		memset(ce, 0, sizeof(Centry));
777 	}
778 }
779 
780 static char *
781 cachedump(Bridge *b)
782 {
783 	int i, n;
784 	long sec, off;
785 	char *buf, *p, *ep;
786 	Centry *ce;
787 	char c;
788 
789 	qlock(b);
790 	if(waserror()) {
791 		qunlock(b);
792 		nexterror();
793 	}
794 	sec = TK2SEC(m->ticks);
795 	n = 0;
796 	for(i=0; i<CacheSize; i++)
797 		if(b->cache[i].expire != 0)
798 			n++;
799 
800 	n *= 51;	// change if print format is changed
801 	n += 10;	// some slop at the end
802 	buf = malloc(n);
803 	p = buf;
804 	ep = buf + n;
805 	ce = b->cache;
806 	off = seconds() - sec;
807 	for(i=0; i<CacheSize; i++,ce++) {
808 		if(ce->expire == 0)
809 			continue;
810 		c = (sec < ce->expire)?'v':'e';
811 		p += snprint(p, ep-p, "%E %2d %10ld %10ld %10ld %c\n", ce->d,
812 			ce->port, ce->src, ce->dst, ce->expire+off, c);
813 	}
814 	*p = 0;
815 	poperror();
816 	qunlock(b);
817 
818 	return buf;
819 }
820 
821 
822 
823 // assumes b is locked
824 static void
825 ethermultiwrite(Bridge *b, Block *bp, Port *port)
826 {
827 	Port *oport;
828 	Block *bp2;
829 	Etherpkt *ep;
830 	int i, mcast, bcast;
831 	static uchar bcastaddr[Eaddrlen] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
832 
833 	if(waserror()) {
834 		if(bp)
835 			freeb(bp);
836 		nexterror();
837 	}
838 
839 	ep = (Etherpkt*)bp->rp;
840 	mcast = ep->d[0] & 1;
841 	if(mcast)
842 		bcast = memcmp(ep->d, bcastaddr, Eaddrlen) == 0;
843 	else
844 		bcast = 0;
845 
846 	oport = nil;
847 	for(i=0; i<b->nport; i++) {
848 		if(i == port->id || b->port[i] == nil)
849 			continue;
850 		if(mcast && !bcast && !b->port[i]->mcast)
851 			continue;
852 		if(mcast)
853 			b->port[i]->outmulti++;
854 		else
855 			b->port[i]->outunknown++;
856 
857 		// delay one so that the last write does not copy
858 		if(oport != nil) {
859 			b->copy++;
860 			bp2 = copyblock(bp, blocklen(bp));
861 			if(!waserror()) {
862 				etherwrite(oport, bp2);
863 				poperror();
864 			}
865 		}
866 		oport = b->port[i];
867 	}
868 
869 	// last write free block
870 	if(oport) {
871 		bp2 = bp; bp = nil; USED(bp);
872 		if(!waserror()) {
873 			etherwrite(oport, bp2);
874 			poperror();
875 		}
876 	} else
877 		freeb(bp);
878 
879 	poperror();
880 }
881 
882 static void
883 tcpmsshack(Etherpkt *epkt, int n)
884 {
885 	int hl;
886 	Iphdr *iphdr;
887 	Tcphdr *tcphdr;
888 	ulong mss;
889 	ulong cksum;
890 	int optlen;
891 	uchar *optr;
892 
893 	// check it is an ip packet
894 	if(nhgets(epkt->type) != 0x800)
895 		return;
896 	iphdr = (Iphdr*)(epkt->data);
897 	n -= ETHERHDRSIZE;
898 	if(n < IPHDR)
899 		return;
900 
901 	// check it is ok IP packet
902 	if(iphdr->vihl != (IP_VER|IP_HLEN)) {
903 		hl = (iphdr->vihl&0xF)<<2;
904 		if((iphdr->vihl&0xF0) != IP_VER || hl < (IP_HLEN<<2))
905 			return;
906 	} else
907 		hl = IP_HLEN<<2;
908 
909 	// check TCP
910 	if(iphdr->proto != IP_TCPPROTO)
911 		return;
912 	n -= hl;
913 	if(n < sizeof(Tcphdr))
914 		return;
915 	tcphdr = (Tcphdr*)((uchar*)(iphdr) + hl);
916 	// MSS can only appear in SYN packet
917 	if(!(tcphdr->flag[1] & SYN))
918 		return;
919 	hl = (tcphdr->flag[0] & 0xf0)>>2;
920 	if(n < hl)
921 		return;
922 
923 	// check for MSS option
924 	optr = (uchar*)(tcphdr) + sizeof(Tcphdr);
925 	n = hl - sizeof(Tcphdr);
926 	for(;;) {
927 		if(n <= 0 || *optr == EOLOPT)
928 			return;
929 		if(*optr == NOOPOPT) {
930 			n--;
931 			optr++;
932 			continue;
933 		}
934 		optlen = optr[1];
935 		if(optlen < 2 || optlen > n)
936 			return;
937 		if(*optr == MSSOPT && optlen == MSS_LENGTH)
938 			break;
939 		n -= optlen;
940 		optr += optlen;
941 	}
942 
943 	mss = nhgets(optr+2);
944 	if(mss <= TcpMssMax)
945 		return;
946 	// fit checksum
947 	cksum = nhgets(tcphdr->cksum);
948 	if(optr-(uchar*)tcphdr & 1) {
949 print("tcpmsshack: odd alignment!\n");
950 		// odd alignments are a pain
951 		cksum += nhgets(optr+1);
952 		cksum -= (optr[1]<<8)|(TcpMssMax>>8);
953 		cksum += (cksum>>16);
954 		cksum &= 0xffff;
955 		cksum += nhgets(optr+3);
956 		cksum -= ((TcpMssMax&0xff)<<8)|optr[4];
957 		cksum += (cksum>>16);
958 	} else {
959 		cksum += mss;
960 		cksum -= TcpMssMax;
961 		cksum += (cksum>>16);
962 	}
963 	hnputs(tcphdr->cksum, cksum);
964 	hnputs(optr+2, TcpMssMax);
965 }
966 
967 /*
968  *  process to read from the ethernet
969  */
970 static void
971 etherread(void *a)
972 {
973 	Port *port = a;
974 	Bridge *b = port->bridge;
975 	Block *bp, *bp2;
976 	Etherpkt *ep;
977 	Centry *ce;
978 	long md;
979 
980 	qlock(b);
981 	port->readp = up;	/* hide identity under a rock for unbind */
982 
983 	while(!port->closed){
984 		// release lock to read - error means it is time to quit
985 		qunlock(b);
986 		if(waserror()) {
987 print("etherread read error: %s\n", up->env->errstr);
988 			qlock(b);
989 			break;
990 		}
991 if(0)print("devbridge: etherread: reading\n");
992 		bp = devtab[port->data[0]->type]->bread(port->data[0], ETHERMAXTU, 0);
993 if(0)print("devbridge: etherread: blocklen = %d\n", blocklen(bp));
994 		poperror();
995 		qlock(b);
996 		if(bp == nil || port->closed)
997 			break;
998 		if(waserror()) {
999 //print("etherread bridge error\n");
1000 			if(bp)
1001 				freeb(bp);
1002 			continue;
1003 		}
1004 		if(blocklen(bp) < ETHERMINTU)
1005 			error("short packet");
1006 		port->in++;
1007 
1008 		ep = (Etherpkt*)bp->rp;
1009 		cacheupdate(b, ep->s, port->id);
1010 		if(b->tcpmss)
1011 			tcpmsshack(ep, BLEN(bp));
1012 
1013 		/*
1014 		 * delay packets to simulate a slow link
1015 		 */
1016 		if(b->delay0 || b->delayn){
1017 			md = b->delay0 + b->delayn * BLEN(bp);
1018 			if(md > 0)
1019 				microdelay(md);
1020 		}
1021 
1022 		if(ep->d[0] & 1) {
1023 			logb(b, Logmcast, "multicast: port=%d src=%E dst=%E type=%#.4ux\n",
1024 				port->id, ep->s, ep->d, (ep->type[0]<<8)|ep->type[1] );
1025 			port->inmulti++;
1026 			bp2 = bp; bp = nil;
1027 			ethermultiwrite(b, bp2, port);
1028 		} else {
1029 			ce = cachelookup(b, ep->d);
1030 			if(ce == nil) {
1031 				b->miss++;
1032 				port->inunknown++;
1033 				bp2 = bp; bp = nil;
1034 				ethermultiwrite(b, bp2, port);
1035 			}else if(ce->port != port->id){
1036 				b->hit++;
1037 				bp2 = bp; bp = nil;
1038 				etherwrite(b->port[ce->port], bp2);
1039 			}
1040 		}
1041 
1042 		poperror();
1043 		if(bp)
1044 			freeb(bp);
1045 	}
1046 //print("etherread: trying to exit\n");
1047 	port->readp = nil;
1048 	portfree(port);
1049 	qunlock(b);
1050 	pexit("hangup", 1);
1051 }
1052 
1053 static int
1054 fragment(Etherpkt *epkt, int n)
1055 {
1056 	Iphdr *iphdr;
1057 
1058 	if(n <= TunnelMtu)
1059 		return 0;
1060 
1061 	// check it is an ip packet
1062 	if(nhgets(epkt->type) != 0x800)
1063 		return 0;
1064 	iphdr = (Iphdr*)(epkt->data);
1065 	n -= ETHERHDRSIZE;
1066 	if(n < IPHDR)
1067 		return 0;
1068 
1069 	// check it is ok IP packet - I don't handle IP options for the momment
1070 	if(iphdr->vihl != (IP_VER|IP_HLEN))
1071 		return 0;
1072 
1073 	// check for don't fragment
1074 	if(iphdr->frag[0] & (IP_DF>>8))
1075 		return 0;
1076 
1077 	// check for short block
1078 	if(nhgets(iphdr->length) > n)
1079 		return 0;
1080 
1081 	return 1;
1082 }
1083 
1084 
1085 static void
1086 etherwrite(Port *port, Block *bp)
1087 {
1088 	Iphdr *eh, *feh;
1089 	Etherpkt *epkt;
1090 	int n, lid, len, seglen, chunk, dlen, blklen, offset, mf;
1091 	Block *xp, *nb;
1092 	ushort fragoff, frag;
1093 
1094 	port->out++;
1095 	epkt = (Etherpkt*)bp->rp;
1096 	n = blocklen(bp);
1097 	if(port->type != Ttun || !fragment(epkt, n)) {
1098 		devtab[port->data[1]->type]->bwrite(port->data[1], bp, 0);
1099 		return;
1100 	}
1101 	port->outfrag++;
1102 	if(waserror()){
1103 		freeblist(bp);
1104 		nexterror();
1105 	}
1106 
1107 	seglen = (TunnelMtu - ETHERHDRSIZE - IPHDR) & ~7;
1108 	eh = (Iphdr*)(epkt->data);
1109 	len = nhgets(eh->length);
1110 	frag = nhgets(eh->frag);
1111 	mf = frag & IP_MF;
1112 	frag <<= 3;
1113 	dlen = len - IPHDR;
1114 	xp = bp;
1115 	lid = nhgets(eh->id);
1116 	offset = ETHERHDRSIZE+IPHDR;
1117 	while(xp != nil && offset && offset >= BLEN(xp)) {
1118 		offset -= BLEN(xp);
1119 		xp = xp->next;
1120 	}
1121 	xp->rp += offset;
1122 
1123 if(0) print("seglen=%d, dlen=%d, mf=%x, frag=%d\n", seglen, dlen, mf, frag);
1124 	for(fragoff = 0; fragoff < dlen; fragoff += seglen) {
1125 		nb = allocb(ETHERHDRSIZE+IPHDR+seglen);
1126 
1127 		feh = (Iphdr*)(nb->wp+ETHERHDRSIZE);
1128 
1129 		memmove(nb->wp, epkt, ETHERHDRSIZE+IPHDR);
1130 		nb->wp += ETHERHDRSIZE+IPHDR;
1131 
1132 		if((fragoff + seglen) >= dlen) {
1133 			seglen = dlen - fragoff;
1134 			hnputs(feh->frag, (frag+fragoff)>>3 | mf);
1135 		}
1136 		else
1137 			hnputs(feh->frag, (frag+fragoff>>3) | IP_MF);
1138 
1139 		hnputs(feh->length, seglen + IPHDR);
1140 		hnputs(feh->id, lid);
1141 
1142 		/* Copy up the data area */
1143 		chunk = seglen;
1144 		while(chunk) {
1145 			blklen = chunk;
1146 			if(BLEN(xp) < chunk)
1147 				blklen = BLEN(xp);
1148 			memmove(nb->wp, xp->rp, blklen);
1149 			nb->wp += blklen;
1150 			xp->rp += blklen;
1151 			chunk -= blklen;
1152 			if(xp->rp == xp->wp)
1153 				xp = xp->next;
1154 		}
1155 
1156 		feh->cksum[0] = 0;
1157 		feh->cksum[1] = 0;
1158 		hnputs(feh->cksum, ipcsum(&feh->vihl));
1159 
1160 		// don't generate small packets
1161 		if(BLEN(nb) < ETHERMINTU)
1162 			nb->wp = nb->rp + ETHERMINTU;
1163 		devtab[port->data[1]->type]->bwrite(port->data[1], nb, 0);
1164 	}
1165 	poperror();
1166 	freeblist(bp);
1167 }
1168 
1169 // hold b lock
1170 static void
1171 portfree(Port *port)
1172 {
1173 	port->ref--;
1174 	if(port->ref < 0)
1175 		panic("portfree: bad ref");
1176 	if(port->ref > 0)
1177 		return;
1178 
1179 	if(port->data[0])
1180 		cclose(port->data[0]);
1181 	if(port->data[1])
1182 		cclose(port->data[1]);
1183 	memset(port, 0, sizeof(Port));
1184 	free(port);
1185 }
1186 
/*
 *  Device table entry for '#B'.  The initialiser is positional;
 *  the dev* entries are the generic defaults for operations this
 *  driver does not implement itself.
 *  NOTE(review): the order presumably follows struct Dev in dat.h
 *  (dc, name, reset, init, shutdown, attach, walk, stat, open,
 *  create, close, read, bread, write, bwrite, remove, wstat) - confirm.
 */
Dev bridgedevtab = {
	'B',
	"bridge",

	devreset,
	bridgeinit,
	devshutdown,
	bridgeattach,
	bridgewalk,
	bridgestat,
	bridgeopen,
	devcreate,
	bridgeclose,
	bridgeread,
	devbread,
	bridgewrite,
	devbwrite,
	devremove,
	devwstat,
};
1207