/*	$NetBSD: if_shmem.c,v 1.87 2024/08/20 16:49:10 riastradh Exp $	*/

/*
 * Copyright (c) 2009, 2010 Antti Kantee.  All Rights Reserved.
 *
 * Development of this software was supported by The Nokia Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: if_shmem.c,v 1.87 2024/08/20 16:49:10 riastradh Exp $");

#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/fcntl.h>
#include <sys/kmem.h>
#include <sys/kthread.h>
#include <sys/lock.h>
#include <sys/vmem.h>
#include <sys/cprng.h>

#include <net/bpf.h>
#include <net/if.h>
#include <net/if_dl.h>
#include <net/if_ether.h>
#include <net/if_media.h>
#include <net/ether_sw_offload.h>

#include <netinet/in.h>
#include <netinet/in_var.h>

#include <rump-sys/kern.h>
#include <rump-sys/net.h>

#include <rump/rump.h>
#include <rump/rumpuser.h>

#include "shmif_user.h"

static int shmif_clone(struct if_clone *, int);
static int shmif_unclone(struct ifnet *);

static int shmif_mediachange(struct ifnet *);
static void shmif_mediastatus(struct ifnet *, struct ifmediareq *);

struct if_clone shmif_cloner =
    IF_CLONE_INITIALIZER("shmif", shmif_clone, shmif_unclone);

/*
 * Do a r/w prefault of the backend pages when attaching the interface.
 * In theory this improves performance (although the pages are not
 * mlock()ed, so they may be paged out again).
 */
#define PREFAULT_RW

/*
 * A virtual ethernet interface which uses shared memory from a
 * memory mapped file as the bus.
 */

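/*
 * A rough sketch of the bus layout, as used by this file (the
 * authoritative definition of struct shmif_mem and the offset macros
 * lives in shmifvar.h):
 *
 *	shm_magic	SHMIF_MAGIC once the bus has been initialized
 *	shm_lock	spinlock word, see shmif_lockbus() below
 *	shm_gen		generation number, bumped whenever a writer
 *			wraps around the end of the ring
 *	shm_first	offset of the oldest packet still on the bus
 *	shm_last	offset of the most recently written packet
 *
 * The header is followed by BUSMEM_DATASIZE bytes of packet ring,
 * where each frame is a struct shmif_pkthdr immediately followed by
 * sp_len bytes of packet data (see shmif_snd() and shmif_rcv()).
 */
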
static int	shmif_init(struct ifnet *);
static int	shmif_ioctl(struct ifnet *, u_long, void *);
static void	shmif_start(struct ifnet *);
static void	shmif_snd(struct ifnet *, struct mbuf *);
static void	shmif_stop(struct ifnet *, int);

#include "shmifvar.h"

struct shmif_sc {
	struct ethercom sc_ec;		/* common ethernet state */
	struct ifmedia sc_im;
	struct shmif_mem *sc_busmem;	/* the mapped bus file */
	int sc_memfd;			/* bus file descriptor, -1 if none */
	int sc_kq;			/* kqueue watching the bus file */
	int sc_unit;

	char *sc_backfile;		/* path of the bus file */
	size_t sc_backfilelen;

	uint64_t sc_devgen;		/* bus generation we are reading */
	uint32_t sc_nextpacket;		/* bus offset of next packet to read */

	kmutex_t sc_mtx;
	kcondvar_t sc_cv;

	struct lwp *sc_rcvl;		/* receiver kthread */
	bool sc_dying;

	uint64_t sc_uid;		/* random id for filtering own packets */
};

static void shmif_rcv(void *);

/* values for the bus lock word; COOLDOWN is spins before a 1ms sleep */
#define LOCK_UNLOCKED	0
#define LOCK_LOCKED	1
#define LOCK_COOLDOWN	1001

vmem_t *shmif_units;

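/*
 * Wake up everyone sleeping on the bus.  Writing the protocol version
 * to the IFMEM_WAKEUP offset of the bus file is assumed to be turned
 * into a wakeup event for all watchers by the host-side hypercalls
 * (rumpcomp_shmif_watchsetup()/rumpcomp_shmif_watchwait()).
 */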
static void
dowakeup(struct shmif_sc *sc)
{
	struct rumpuser_iovec iov;
	uint32_t ver = SHMIF_VERSION;
	size_t n;

	iov.iov_base = &ver;
	iov.iov_len = sizeof(ver);
	rumpuser_iovwrite(sc->sc_memfd, &iov, 1, IFMEM_WAKEUP, &n);
}

/*
 * This locking needs work and will misbehave severely if:
 * 1) the backing memory has to be paged in
 * 2) some lockholder exits while holding the lock
 */
static void
shmif_lockbus(struct shmif_mem *busmem)
{
	int i = 0;

	while (__predict_false(atomic_cas_32(&busmem->shm_lock,
	    LOCK_UNLOCKED, LOCK_LOCKED) == LOCK_LOCKED)) {
		if (__predict_false(++i > LOCK_COOLDOWN)) {
			/* wait 1ms */
			rumpuser_clock_sleep(RUMPUSER_CLOCK_RELWALL,
			    0, 1000*1000);
			i = 0;
		}
	}
	membar_acquire();
}

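/*
 * Drop the bus lock.  The membar_release() here pairs with the
 * membar_acquire() in shmif_lockbus() above: stores made while
 * holding the lock become visible to the next lockholder before
 * it can observe the lock word as unlocked.
 */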
static void
shmif_unlockbus(struct shmif_mem *busmem)
{
	unsigned int old __diagused;

	membar_release();
	old = atomic_swap_32(&busmem->shm_lock, LOCK_UNLOCKED);
	KASSERT(old == LOCK_LOCKED);
}

static int
allocif(int unit, struct shmif_sc **scp)
{
	uint8_t enaddr[ETHER_ADDR_LEN] = { 0xb2, 0xa0, 0x00, 0x00, 0x00, 0x00 };
	struct shmif_sc *sc;
	struct ifnet *ifp;
	uint64_t randnum;
	int error = 0;

	randnum = cprng_strong64();
	memcpy(&enaddr[2], &randnum, 4);

	sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
	sc->sc_memfd = -1;
	sc->sc_unit = unit;
	sc->sc_uid = randnum;

	ifp = &sc->sc_ec.ec_if;

	ifmedia_init(&sc->sc_im, 0, shmif_mediachange, shmif_mediastatus);
	ifmedia_add(&sc->sc_im, IFM_ETHER|IFM_AUTO, 0, NULL);
	ifmedia_add(&sc->sc_im, IFM_ETHER|IFM_NONE, 0, NULL);
	ifmedia_set(&sc->sc_im, IFM_ETHER|IFM_AUTO);

	snprintf(ifp->if_xname, sizeof(ifp->if_xname), "shmif%d", unit);
	ifp->if_softc = sc;
	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
	ifp->if_init = shmif_init;
	ifp->if_ioctl = shmif_ioctl;
	ifp->if_start = shmif_start;
	ifp->if_stop = shmif_stop;
	ifp->if_mtu = ETHERMTU;
	ifp->if_dlt = DLT_EN10MB;
	ifp->if_capabilities = IFCAP_TSOv4 | IFCAP_TSOv6 |
	    IFCAP_CSUM_IPv4_Rx	| IFCAP_CSUM_IPv4_Tx |
	    IFCAP_CSUM_TCPv4_Rx	| IFCAP_CSUM_TCPv4_Tx |
	    IFCAP_CSUM_UDPv4_Rx	| IFCAP_CSUM_UDPv4_Tx |
	    IFCAP_CSUM_TCPv6_Rx	| IFCAP_CSUM_TCPv6_Tx |
	    IFCAP_CSUM_UDPv6_Rx	| IFCAP_CSUM_UDPv6_Tx;
	IFQ_SET_READY(&ifp->if_snd);

	mutex_init(&sc->sc_mtx, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&sc->sc_cv, "shmifcv");

	if_initialize(ifp);
#if 1
	char buf[256];

	if (rumpuser_getparam("RUMP_SHMIF_CAPENABLE", buf, sizeof(buf)) == 0) {
		uint64_t capen = strtoul(buf, NULL, 0);

		ifp->if_capenable = capen & ifp->if_capabilities;
	}
#endif
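	/*
	 * Note: the value above is parsed with strtoul(3) base 0, so
	 * both decimal and hex work; e.g. RUMP_SHMIF_CAPENABLE=0 in
	 * the host environment disables all offloads for new shmifs.
	 */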

	if_deferred_start_init(ifp, NULL);
	ether_ifattach(ifp, enaddr);
	if_register(ifp);

	aprint_verbose("shmif%d: Ethernet address %s\n",
	    unit, ether_sprintf(enaddr));

	if (scp)
		*scp = sc;

	if (rump_threads) {
		error = kthread_create(PRI_NONE,
		    KTHREAD_MPSAFE | KTHREAD_MUSTJOIN, NULL,
		    shmif_rcv, ifp, &sc->sc_rcvl, "shmif");
	} else {
		printf("WARNING: threads not enabled, shmif NOT working\n");
	}

	if (error) {
		shmif_unclone(ifp);
	}

	return error;
}

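/*
 * Attach a backing file to the interface: map BUSMEM_SIZE bytes of it,
 * validate (or, for the first attacher, initialize) the bus header and
 * set up the host kqueue watch used for receive wakeups.  On success
 * the softc owns the descriptor (sc_memfd); on failure it stays with
 * the caller.
 */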
static int
initbackend(struct shmif_sc *sc, int memfd)
{
	volatile uint8_t v;
	volatile uint8_t *p;
	void *mem;
	int error;

	error = rumpcomp_shmif_mmap(memfd, BUSMEM_SIZE, &mem);
	if (error)
		return error;
	sc->sc_busmem = mem;

	if (sc->sc_busmem->shm_magic
	    && sc->sc_busmem->shm_magic != SHMIF_MAGIC) {
		printf("bus is not magical\n");
		rumpuser_unmap(sc->sc_busmem, BUSMEM_SIZE);
		return ENOEXEC;
	}

	/*
	 * Prefault in pages to minimize runtime penalty with buslock.
	 * Use 512 instead of PAGE_SIZE to make sure we catch cases where
	 * rump kernel PAGE_SIZE > host page size.
	 */
	for (p = (uint8_t *)sc->sc_busmem;
	    p < (uint8_t *)sc->sc_busmem + BUSMEM_SIZE;
	    p += 512)
		v = *p;

	shmif_lockbus(sc->sc_busmem);
	/* we're first?  initialize bus */
	if (sc->sc_busmem->shm_magic == 0) {
		sc->sc_busmem->shm_magic = SHMIF_MAGIC;
		sc->sc_busmem->shm_first = BUSMEM_DATASIZE;
	}

	sc->sc_nextpacket = sc->sc_busmem->shm_last;
	sc->sc_devgen = sc->sc_busmem->shm_gen;

#ifdef PREFAULT_RW
	for (p = (uint8_t *)sc->sc_busmem;
	    p < (uint8_t *)sc->sc_busmem + BUSMEM_SIZE;
	    p += PAGE_SIZE) {
		v = *p;
		*p = v;
	}
#endif
	shmif_unlockbus(sc->sc_busmem);

	sc->sc_kq = -1;
	error = rumpcomp_shmif_watchsetup(&sc->sc_kq, memfd);
	if (error) {
		rumpuser_unmap(sc->sc_busmem, BUSMEM_SIZE);
		return error;
	}

	sc->sc_memfd = memfd;

	return error;
}

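/*
 * Detach the backing file: free the saved path, unmap the bus and
 * close the bus descriptor and its kqueue.  A no-op if no backend
 * is currently attached.
 */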
static void
finibackend(struct shmif_sc *sc)
{

	if (sc->sc_backfile == NULL)
		return;

	kmem_free(sc->sc_backfile, sc->sc_backfilelen);
	sc->sc_backfile = NULL;
	sc->sc_backfilelen = 0;

	rumpuser_unmap(sc->sc_busmem, BUSMEM_SIZE);
	rumpuser_close(sc->sc_memfd);
	rumpuser_close(sc->sc_kq);

	sc->sc_memfd = -1;
}

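/*
 * Programmatic interface creation, optionally with a bus attached.
 * For example, a rump client might do something like (sketch):
 *
 *	int ifnum;
 *	if (rump_shmif_create("busmem", &ifnum) == 0)
 *		printf("created shmif%d on bus file busmem\n", ifnum);
 */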
int
rump_shmif_create(const char *path, int *ifnum)
{
	struct shmif_sc *sc;
	vmem_addr_t t;
	int unit, error;
	int memfd = -1; /* XXXgcc */

	if (path) {
		error = rumpuser_open(path,
		    RUMPUSER_OPEN_RDWR | RUMPUSER_OPEN_CREATE, &memfd);
		if (error)
			return error;
	}

	error = vmem_xalloc(shmif_units, 1, 0, 0, 0,
	    VMEM_ADDR_MIN, VMEM_ADDR_MAX, VM_INSTANTFIT | VM_SLEEP, &t);

	if (error != 0) {
		if (path)
			rumpuser_close(memfd);
		return error;
	}

	unit = t - 1;

	if ((error = allocif(unit, &sc)) != 0) {
		if (path)
			rumpuser_close(memfd);
		return error;
	}

	if (!path)
		goto out;

	error = initbackend(sc, memfd);
	if (error) {
		/* initbackend() does not take ownership on failure */
		rumpuser_close(memfd);
		shmif_unclone(&sc->sc_ec.ec_if);
		return error;
	}

	sc->sc_backfilelen = strlen(path)+1;
	sc->sc_backfile = kmem_alloc(sc->sc_backfilelen, KM_SLEEP);
	strcpy(sc->sc_backfile, path);

 out:
	if (ifnum)
		*ifnum = unit;

	return 0;
}

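/*
 * if_clone entry point, i.e. what runs when an interface is created
 * administratively with something like "ifconfig shmif0 create".
 * The bus file is attached separately with SIOCSLINKSTR (see
 * shmif_ioctl() below).
 */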
static int
shmif_clone(struct if_clone *ifc, int unit)
{
	int rc __diagused;
	vmem_addr_t unit2;

	/*
	 * Ok, we know the unit number, but we must still reserve it.
	 * Otherwise the wildcard-side of things might get the same one.
	 * This is slightly offset-happy due to vmem: the range of unit
	 * numbers is offset by +1, since vmem cannot deal with ranges
	 * starting from 0.
	 */
	rc = vmem_xalloc(shmif_units, 1, 0, 0, 0, unit+1, unit+1,
	    VM_SLEEP | VM_INSTANTFIT, &unit2);
	KASSERT(rc == 0 && unit2-1 == unit);

	return allocif(unit, NULL);
}

static int
shmif_unclone(struct ifnet *ifp)
{
	struct shmif_sc *sc = ifp->if_softc;

	shmif_stop(ifp, 1);
	if_down(ifp);

	mutex_enter(&sc->sc_mtx);
	sc->sc_dying = true;
	cv_broadcast(&sc->sc_cv);
	mutex_exit(&sc->sc_mtx);

	if (sc->sc_rcvl)
		kthread_join(sc->sc_rcvl);
	sc->sc_rcvl = NULL;

	/*
	 * This must be called only after the kthread has exited;
	 * otherwise closing the kqueue (sc_kq) sometimes hangs,
	 * probably due to a race between the close and the kthread's
	 * kevent on the kqueue.
	 */
	finibackend(sc);

	vmem_xfree(shmif_units, sc->sc_unit+1, 1);

	ether_ifdetach(ifp);
	if_detach(ifp);

	cv_destroy(&sc->sc_cv);
	mutex_destroy(&sc->sc_mtx);

	kmem_free(sc, sizeof(*sc));

	return 0;
}

static int
shmif_init(struct ifnet *ifp)
{
	struct shmif_sc *sc = ifp->if_softc;
	int error = 0;

	if (sc->sc_memfd == -1)
		return ENXIO;
	KASSERT(sc->sc_busmem);

	ifp->if_flags |= IFF_RUNNING;

	mutex_enter(&sc->sc_mtx);
	sc->sc_nextpacket = sc->sc_busmem->shm_last;
	sc->sc_devgen = sc->sc_busmem->shm_gen;

	cv_broadcast(&sc->sc_cv);
	mutex_exit(&sc->sc_mtx);

	return error;
}

static int
shmif_mediachange(struct ifnet *ifp)
{
	struct shmif_sc *sc = ifp->if_softc;

	if (IFM_SUBTYPE(sc->sc_im.ifm_cur->ifm_media) == IFM_NONE &&
	    ifp->if_link_state != LINK_STATE_DOWN) {
		if_link_state_change(ifp, LINK_STATE_DOWN);
	} else if (IFM_SUBTYPE(sc->sc_im.ifm_cur->ifm_media) == IFM_AUTO &&
	    ifp->if_link_state != LINK_STATE_UP) {
		if_link_state_change(ifp, LINK_STATE_UP);
	}
	return 0;
}

static void
shmif_mediastatus(struct ifnet *ifp, struct ifmediareq *imr)
{
	struct shmif_sc *sc = ifp->if_softc;
	imr->ifm_active = sc->sc_im.ifm_cur->ifm_media;
}

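/*
 * Besides the standard ethernet ioctls, SIOCGLINKSTR/SIOCSLINKSTR
 * get and set the backing bus file at runtime.  From userland this
 * is reachable with something like "ifconfig shmif0 linkstr busmem".
 */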
static int
shmif_ioctl(struct ifnet *ifp, u_long cmd, void *data)
{
	struct shmif_sc *sc = ifp->if_softc;
	struct ifdrv *ifd;
	char *path;
	int s, rv, memfd;

	s = splnet();
	switch (cmd) {
	case SIOCGLINKSTR:
		ifd = data;

		if (sc->sc_backfilelen == 0) {
			rv = ENOENT;
			break;
		}

		ifd->ifd_len = sc->sc_backfilelen;
		if (ifd->ifd_cmd == IFLINKSTR_QUERYLEN) {
			rv = 0;
			break;
		}

		if (ifd->ifd_cmd != 0) {
			rv = EINVAL;
			break;
		}

		rv = copyoutstr(sc->sc_backfile, ifd->ifd_data,
		    MIN(sc->sc_backfilelen, ifd->ifd_len), NULL);
		break;
	case SIOCSLINKSTR:
		if (ifp->if_flags & IFF_UP) {
			rv = EBUSY;
			break;
		}

		ifd = data;
		if (ifd->ifd_cmd == IFLINKSTR_UNSET) {
			finibackend(sc);
			/* Back to the default just in case */
			ifp->if_link_state = LINK_STATE_UNKNOWN;
			rv = 0;
			break;
		} else if (ifd->ifd_cmd != 0) {
			rv = EINVAL;
			break;
		} else if (sc->sc_backfile) {
			rv = EBUSY;
			break;
		}

		if (ifd->ifd_len > MAXPATHLEN) {
			rv = E2BIG;
			break;
		} else if (ifd->ifd_len < 1) {
			rv = EINVAL;
			break;
		}

		path = kmem_alloc(ifd->ifd_len, KM_SLEEP);
		rv = copyinstr(ifd->ifd_data, path, ifd->ifd_len, NULL);
		if (rv) {
			kmem_free(path, ifd->ifd_len);
			break;
		}
		rv = rumpuser_open(path,
		    RUMPUSER_OPEN_RDWR | RUMPUSER_OPEN_CREATE, &memfd);
		if (rv) {
			kmem_free(path, ifd->ifd_len);
			break;
		}
		rv = initbackend(sc, memfd);
		if (rv) {
			kmem_free(path, ifd->ifd_len);
			rumpuser_close(memfd);
			break;
		}
		sc->sc_backfile = path;
		sc->sc_backfilelen = ifd->ifd_len;

		if_link_state_change(ifp, LINK_STATE_UP);
		break;

#ifdef OSIOCSIFMEDIA
	case OSIOCSIFMEDIA:
#endif
	case SIOCSIFMEDIA:
	case SIOCGIFMEDIA:
		rv = ifmedia_ioctl(ifp, data, &sc->sc_im, cmd);
		break;

	default:
		rv = ether_ioctl(ifp, cmd, data);
		if (rv == ENETRESET)
			rv = 0;
		break;
	}
	splx(s);

	return rv;
}

static void
shmif_start(struct ifnet *ifp)
{
	struct shmif_sc *sc = ifp->if_softc;
	struct mbuf *m, *n;
	bool wrote = false;

	ifp->if_flags |= IFF_OACTIVE;

	for (;;) {
		IFQ_DEQUEUE(&ifp->if_snd, m);
		if (m == NULL)
			break;

		m = ether_sw_offload_tx(ifp, m);
		if (m == NULL) {
			if_statinc(ifp, if_oerrors);
			break;
		}

		do {
			n = m->m_nextpkt;
			shmif_snd(ifp, m);
			m = n;
		} while (m != NULL);

		wrote = true;
	}

	ifp->if_flags &= ~IFF_OACTIVE;

	/* wake up the bus only if we actually put something on it */
	if (wrote) {
		dowakeup(sc);
	}
}

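/*
 * Layout of a frame on the bus, as written below: a struct
 * shmif_pkthdr (packet length, timestamp, sender id) immediately
 * followed by the packet data, both possibly wrapping around the
 * end of the ring.  A wrapping write bumps the bus generation.
 */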
/* send everything in-context since it's just a matter of mem-to-mem copy */
static void
shmif_snd(struct ifnet *ifp, struct mbuf *m0)
{
	struct shmif_sc *sc = ifp->if_softc;
	struct shmif_mem *busmem = sc->sc_busmem;
	struct shmif_pkthdr sp;
	struct timeval tv;
	struct mbuf *m;
	uint32_t dataoff;
	uint32_t pktsize, pktwrote;
	bool wrap;

	pktsize = 0;
	for (m = m0; m != NULL; m = m->m_next) {
		pktsize += m->m_len;
	}
	KASSERT(pktsize <= ETHERMTU + ETHER_HDR_LEN);

	getmicrouptime(&tv);
	sp.sp_len = pktsize;
	sp.sp_sec = tv.tv_sec;
	sp.sp_usec = tv.tv_usec;
	sp.sp_sender = sc->sc_uid;

	bpf_mtap(ifp, m0, BPF_D_OUT);

	/*
	 * Compare with DOWN to allow UNKNOWN (the default value),
	 * which is required by some ATF tests using rump servers
	 * written in C.
	 */
	if (ifp->if_link_state == LINK_STATE_DOWN)
		goto dontsend;

	shmif_lockbus(busmem);
	KASSERT(busmem->shm_magic == SHMIF_MAGIC);
	busmem->shm_last = shmif_nextpktoff(busmem, busmem->shm_last);

	wrap = false;
	dataoff =
	    shmif_buswrite(busmem, busmem->shm_last, &sp, sizeof(sp), &wrap);
	pktwrote = 0;
	for (m = m0; m != NULL; m = m->m_next) {
		pktwrote += m->m_len;
		dataoff = shmif_buswrite(busmem, dataoff, mtod(m, void *),
		    m->m_len, &wrap);
	}
	KASSERT(pktwrote == pktsize);
	if (wrap) {
		busmem->shm_gen++;
		DPRINTF(("bus generation now %" PRIu64 "\n", busmem->shm_gen));
	}
	shmif_unlockbus(busmem);

dontsend:
	m_freem(m0);
	if_statinc(ifp, if_opackets);

	DPRINTF(("shmif_snd: send %d bytes at off %d\n", pktsize,
	    busmem->shm_last));
}

static void
shmif_stop(struct ifnet *ifp, int disable)
{
	struct shmif_sc *sc = ifp->if_softc;

	ifp->if_flags &= ~IFF_RUNNING;
	/* make the cleared IFF_RUNNING visible before waking receivers */
	membar_producer();

	/*
	 * Wake up the receiver thread.  This will of course wake up
	 * all bus listeners, but that's life.
	 */
	if (sc->sc_memfd != -1) {
		dowakeup(sc);
	}
}


/*
 * Check if we have been sleeping too long.  Basically, our in-sc
 * nextpkt must satisfy first <= nextpkt <= last"+1".  We use the
 * fact that first is guaranteed to never overlap with the last
 * frame in the ring.
 */
static __inline bool
stillvalid_p(struct shmif_sc *sc)
{
	struct shmif_mem *busmem = sc->sc_busmem;
	unsigned gendiff = busmem->shm_gen - sc->sc_devgen;
	uint32_t lastoff, devoff;

	KASSERT(busmem->shm_first != busmem->shm_last);

	/* current read offset and one-past-newest write offset */
	devoff = sc->sc_nextpacket;
	lastoff = shmif_nextpktoff(busmem, busmem->shm_last);

	/* trivial case */
	if (gendiff > 1)
		return false;
	KASSERT(gendiff <= 1);

	/* normalize onto a 2x busmem chunk */
	if (busmem->shm_first >= lastoff) {
		lastoff += BUSMEM_DATASIZE;
		if (gendiff == 0)
			devoff += BUSMEM_DATASIZE;
	} else {
		if (gendiff)
			return false;
	}

	return devoff >= busmem->shm_first && devoff <= lastoff;
}
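
/*
 * A worked example of the check above, with invented numbers and an
 * assumed BUSMEM_DATASIZE of 16384: the writer has wrapped, so e.g.
 * shm_first = 12000 while lastoff = 4000.  Since first >= lastoff,
 * lastoff is normalized to 20384.  A reader one generation behind
 * (gendiff == 1) at devoff 14000 is still valid, as
 * 12000 <= 14000 <= 20384.  A reader that wrapped along with the
 * writer (gendiff == 0) at devoff 2000 is normalized to 18384 and is
 * also valid.  A reader one generation behind at devoff 8000 has
 * been overrun, since 8000 < 12000.
 */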

static void
shmif_rcv(void *arg)
{
	struct ifnet *ifp = arg;
	struct shmif_sc *sc = ifp->if_softc;
	struct shmif_mem *busmem;
	struct mbuf *m = NULL;
	struct ether_header *eth;
	uint32_t nextpkt;
	bool wrap, passup;
	int error;
	/*
	 * Padding so that the payload following the 14-byte ethernet
	 * header lands on an aligned boundary.
	 */
	const int align
	    = ALIGN(sizeof(struct ether_header)) - sizeof(struct ether_header);

 reup:
	mutex_enter(&sc->sc_mtx);
	while ((ifp->if_flags & IFF_RUNNING) == 0 && !sc->sc_dying)
		cv_wait(&sc->sc_cv, &sc->sc_mtx);
	mutex_exit(&sc->sc_mtx);

	busmem = sc->sc_busmem;

	while (ifp->if_flags & IFF_RUNNING) {
		struct shmif_pkthdr sp;

		if (m == NULL) {
			m = m_gethdr(M_WAIT, MT_DATA);
			MCLGET(m, M_WAIT);
			m->m_data += align;
		}

		DPRINTF(("waiting %d/%" PRIu64 "\n",
		    sc->sc_nextpacket, sc->sc_devgen));
		KASSERT(m->m_flags & M_EXT);

		shmif_lockbus(busmem);
		KASSERT(busmem->shm_magic == SHMIF_MAGIC);
		KASSERT(busmem->shm_gen >= sc->sc_devgen);

		/* need more data? */
		if (sc->sc_devgen == busmem->shm_gen &&
		    shmif_nextpktoff(busmem, busmem->shm_last)
		     == sc->sc_nextpacket) {
			shmif_unlockbus(busmem);
			error = rumpcomp_shmif_watchwait(sc->sc_kq);
			if (__predict_false(error))
				printf("shmif_rcv: wait failed %d\n", error);
			membar_consumer();
			continue;
		}

		if (stillvalid_p(sc)) {
			nextpkt = sc->sc_nextpacket;
		} else {
			KASSERT(busmem->shm_gen > 0);
			nextpkt = busmem->shm_first;
			if (busmem->shm_first > busmem->shm_last)
				sc->sc_devgen = busmem->shm_gen - 1;
			else
				sc->sc_devgen = busmem->shm_gen;
			DPRINTF(("dev %p overrun, new data: %d/%" PRIu64 "\n",
			    sc, nextpkt, sc->sc_devgen));
		}

		/*
		 * If our read pointer is ahead of the bus's last write,
		 * our generation must be one behind.
		 */
		KASSERT(!(nextpkt > busmem->shm_last
		    && sc->sc_devgen == busmem->shm_gen));

		wrap = false;
		nextpkt = shmif_busread(busmem, &sp,
		    nextpkt, sizeof(sp), &wrap);
		KASSERT(sp.sp_len <= ETHERMTU + ETHER_HDR_LEN);
		nextpkt = shmif_busread(busmem, mtod(m, void *),
		    nextpkt, sp.sp_len, &wrap);

		DPRINTF(("shmif_rcv: read packet of length %d at %d\n",
		    sp.sp_len, nextpkt));

		sc->sc_nextpacket = nextpkt;
		shmif_unlockbus(busmem);

		if (wrap) {
			sc->sc_devgen++;
			DPRINTF(("dev %p generation now %" PRIu64 "\n",
			    sc, sc->sc_devgen));
		}

		/*
		 * Ignore packets too short to possibly be valid.
		 * This is hit at least for the first frame on a new bus.
		 */
		if (__predict_false(sp.sp_len < ETHER_HDR_LEN)) {
			DPRINTF(("shmif read packet len %d < ETHER_HDR_LEN\n",
			    sp.sp_len));
			continue;
		}

		m->m_len = m->m_pkthdr.len = sp.sp_len;
		m_set_rcvif(m, ifp);

		/*
		 * Test if we want to pass the packet upwards
		 */
		eth = mtod(m, struct ether_header *);
		/*
		 * Compare with DOWN to allow UNKNOWN (the default value),
		 * which is required by some ATF tests using rump servers
		 * written in C.
		 */
		if (ifp->if_link_state == LINK_STATE_DOWN) {
			passup = false;
		} else if (sp.sp_sender == sc->sc_uid) {
			passup = false;
		} else if (memcmp(eth->ether_dhost, CLLADDR(ifp->if_sadl),
		    ETHER_ADDR_LEN) == 0) {
			passup = true;
		} else if (ETHER_IS_MULTICAST(eth->ether_dhost)) {
			passup = true;
		} else if (ifp->if_flags & IFF_PROMISC) {
			m->m_flags |= M_PROMISC;
			passup = true;
		} else {
			passup = false;
		}

		if (passup) {
			int bound;

			m = ether_sw_offload_rx(ifp, m);

			KERNEL_LOCK(1, NULL);
			/* Prevent LWP migrations between CPUs for psref(9) */
			bound = curlwp_bind();
			if_input(ifp, m);
			curlwp_bindx(bound);
			KERNEL_UNLOCK_ONE(NULL);

			m = NULL;
		}
		/* else: reuse mbuf for a future packet */
	}
	m_freem(m);
	m = NULL;

	if (!sc->sc_dying)
		goto reup;

	kthread_exit(0);
}