/*	$NetBSD: bpf.c,v 1.199 2016/06/20 06:46:37 knakahara Exp $	*/

/*
 * Copyright (c) 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from the Stanford/CMU enet packet filter,
 * (net/enet.c) distributed as part of 4.3BSD, and code contributed
 * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence
 * Berkeley Laboratory.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)bpf.c	8.4 (Berkeley) 1/9/95
 * static char rcsid[] =
 * "Header: bpf.c,v 1.67 96/09/26 22:00:52 leres Exp ";
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: bpf.c,v 1.199 2016/06/20 06:46:37 knakahara Exp $");

#if defined(_KERNEL_OPT)
#include "opt_bpf.h"
#include "sl.h"
#include "strip.h"
#endif

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/buf.h>
#include <sys/time.h>
#include <sys/proc.h>
#include <sys/ioctl.h>
#include <sys/conf.h>
#include <sys/vnode.h>
#include <sys/queue.h>
#include <sys/stat.h>
#include <sys/module.h>
#include <sys/once.h>
#include <sys/atomic.h>

#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/tty.h>
#include <sys/uio.h>

#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/poll.h>
#include <sys/sysctl.h>
#include <sys/kauth.h>

#include <net/if.h>
#include <net/slip.h>

#include <net/bpf.h>
#include <net/bpfdesc.h>
#include <net/bpfjit.h>

#include <net/if_arc.h>
#include <net/if_ether.h>

#include <netinet/in.h>
#include <netinet/if_inarp.h>


#include <compat/sys/sockio.h>

#ifndef BPF_BUFSIZE
/*
 * 4096 is too small for FDDI frames. 8192 is too small for gigabit Ethernet
 * jumbos (circa 9k), ATM, or Intel gig/10gig ethernet jumbos (16k).
 */
# define BPF_BUFSIZE 32768
#endif

#define PRINET  26			/* interruptible */

/*
 * The default read buffer size, and limit for BIOCSBLEN, is sysctl'able.
 * XXX the default values should be computed dynamically based
 * on available memory size and available mbuf clusters.
 */
int bpf_bufsize = BPF_BUFSIZE;
int bpf_maxbufsize = BPF_DFLTBUFSIZE;	/* XXX set dynamically, see above */
bool bpf_jit = false;

struct bpfjit_ops bpfjit_module_ops = {
	.bj_generate_code = NULL,
	.bj_free_code = NULL
};

/*
 * Global BPF statistics returned by net.bpf.stats sysctl.
 */
struct bpf_stat	bpf_gstats;

/*
 * Use a mutex to avoid a race condition between gathering the stats/peers
 * and opening/closing the device.
 */
static kmutex_t bpf_mtx;

/*
 *  bpf_iflist is the list of interfaces; each corresponds to an ifnet
 *  bpf_dtab holds the descriptors, indexed by minor device #
 */
struct bpf_if	*bpf_iflist;
LIST_HEAD(, bpf_d) bpf_list;

static int	bpf_allocbufs(struct bpf_d *);
static void	bpf_deliver(struct bpf_if *,
		            void *(*cpfn)(void *, const void *, size_t),
		            void *, u_int, u_int, const bool);
static void	bpf_freed(struct bpf_d *);
static void	bpf_ifname(struct ifnet *, struct ifreq *);
static void	*bpf_mcpy(void *, const void *, size_t);
static int	bpf_movein(struct uio *, int, uint64_t,
			        struct mbuf **, struct sockaddr *);
static void	bpf_attachd(struct bpf_d *, struct bpf_if *);
static void	bpf_detachd(struct bpf_d *);
static int	bpf_setif(struct bpf_d *, struct ifreq *);
static void	bpf_timed_out(void *);
static inline void
		bpf_wakeup(struct bpf_d *);
static int	bpf_hdrlen(struct bpf_d *);
static void	catchpacket(struct bpf_d *, u_char *, u_int, u_int,
    void *(*)(void *, const void *, size_t), struct timespec *);
static void	reset_d(struct bpf_d *);
static int	bpf_getdltlist(struct bpf_d *, struct bpf_dltlist *);
static int	bpf_setdlt(struct bpf_d *, u_int);

static int	bpf_read(struct file *, off_t *, struct uio *, kauth_cred_t,
    int);
static int	bpf_write(struct file *, off_t *, struct uio *, kauth_cred_t,
    int);
static int	bpf_ioctl(struct file *, u_long, void *);
static int	bpf_poll(struct file *, int);
static int	bpf_stat(struct file *, struct stat *);
static int	bpf_close(struct file *);
static int	bpf_kqfilter(struct file *, struct knote *);
static void	bpf_softintr(void *);

static const struct fileops bpf_fileops = {
	.fo_read = bpf_read,
	.fo_write = bpf_write,
	.fo_ioctl = bpf_ioctl,
	.fo_fcntl = fnullop_fcntl,
	.fo_poll = bpf_poll,
	.fo_stat = bpf_stat,
	.fo_close = bpf_close,
	.fo_kqfilter = bpf_kqfilter,
	.fo_restart = fnullop_restart,
};

dev_type_open(bpfopen);

const struct cdevsw bpf_cdevsw = {
	.d_open = bpfopen,
	.d_close = noclose,
	.d_read = noread,
	.d_write = nowrite,
	.d_ioctl = noioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_OTHER
};

bpfjit_func_t
bpf_jit_generate(bpf_ctx_t *bc, void *code, size_t size)
{

	membar_consumer();
	if (bpfjit_module_ops.bj_generate_code != NULL) {
		return bpfjit_module_ops.bj_generate_code(bc, code, size);
	}
	return NULL;
}
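
/*
 * Note (illustrative, an inference rather than documented protocol): the
 * membar_consumer() above is intended to pair with the producer-side
 * barrier taken when bpfjit_module_ops.bj_generate_code is published
 * (see the membar_sync() in sysctl_net_bpf_jit() below), so that a
 * non-NULL function pointer is only called once its code is visible
 * to this CPU.
 */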

void
bpf_jit_freecode(bpfjit_func_t jcode)
{
	KASSERT(bpfjit_module_ops.bj_free_code != NULL);
	bpfjit_module_ops.bj_free_code(jcode);
}

static int
bpf_movein(struct uio *uio, int linktype, uint64_t mtu, struct mbuf **mp,
	   struct sockaddr *sockp)
{
	struct mbuf *m;
	int error;
	size_t len;
	size_t hlen;
	size_t align;

	/*
	 * Build a sockaddr based on the data link layer type.
	 * We do this at this level because the ethernet header
	 * is copied directly into the data field of the sockaddr.
	 * In the case of SLIP, there is no header and the packet
	 * is forwarded as is.
	 * Also, we are careful to leave room at the front of the mbuf
	 * for the link level header.
	 */
	switch (linktype) {

	case DLT_SLIP:
		sockp->sa_family = AF_INET;
		hlen = 0;
		align = 0;
		break;

	case DLT_PPP:
		sockp->sa_family = AF_UNSPEC;
		hlen = 0;
		align = 0;
		break;

	case DLT_EN10MB:
		sockp->sa_family = AF_UNSPEC;
		/* XXX Would MAXLINKHDR be better? */
		/* 6(dst)+6(src)+2(type) */
		hlen = sizeof(struct ether_header);
		align = 2;
		break;

	case DLT_ARCNET:
		sockp->sa_family = AF_UNSPEC;
		hlen = ARC_HDRLEN;
		align = 5;
		break;

	case DLT_FDDI:
		sockp->sa_family = AF_LINK;
		/* XXX 4(FORMAC)+6(dst)+6(src) */
		hlen = 16;
		align = 0;
		break;

	case DLT_ECONET:
		sockp->sa_family = AF_UNSPEC;
		hlen = 6;
		align = 2;
		break;

	case DLT_NULL:
		sockp->sa_family = AF_UNSPEC;
		hlen = 0;
		align = 0;
		break;

	default:
		return (EIO);
	}

	len = uio->uio_resid;
	/*
	 * If there aren't enough bytes for a link level header or the
	 * packet length exceeds the interface mtu, return an error.
	 */
	if (len - hlen > mtu)
		return (EMSGSIZE);

	/*
	 * XXX Avoid complicated buffer chaining ---
	 * bail if it won't fit in a single mbuf.
	 * (Take into account possible alignment bytes)
	 */
	if (len + align > MCLBYTES)
		return (EIO);

	m = m_gethdr(M_WAIT, MT_DATA);
	m_reset_rcvif(m);
	m->m_pkthdr.len = (int)(len - hlen);
	if (len + align > MHLEN) {
		m_clget(m, M_WAIT);
		if ((m->m_flags & M_EXT) == 0) {
			error = ENOBUFS;
			goto bad;
		}
	}

	/* Ensure the data is properly aligned */
	if (align > 0) {
		m->m_data += align;
		m->m_len -= (int)align;
	}

	error = uiomove(mtod(m, void *), len, uio);
	if (error)
		goto bad;
	if (hlen != 0) {
		memcpy(sockp->sa_data, mtod(m, void *), hlen);
		m->m_data += hlen; /* XXX */
		len -= hlen;
	}
	m->m_len = (int)len;
	*mp = m;
	return (0);

bad:
	m_freem(m);
	return (error);
}
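
/*
 * Worked example (illustrative only): for a write(2) on a DLT_EN10MB
 * descriptor, hlen is 14 (6+6+2) and align is 2, so after m_data is
 * advanced by 2 the payload behind the Ethernet header ends up
 * longword aligned.  uiomove() copies the whole frame; the 14 header
 * bytes are then copied into sockp->sa_data and stripped from the
 * mbuf, so the output routine sees the header in the sockaddr and
 * only the payload in the mbuf.
 */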

/*
 * Attach file to the bpf interface, i.e. make d listen on bp.
 * Must be called at splnet.
 */
static void
bpf_attachd(struct bpf_d *d, struct bpf_if *bp)
{
	KASSERT(mutex_owned(&bpf_mtx));
	/*
	 * Point d at bp, and add d to the interface's list of listeners.
	 * Finally, point the driver's bpf cookie at the interface so
	 * it will divert packets to bpf.
	 */
	d->bd_bif = bp;
	d->bd_next = bp->bif_dlist;
	bp->bif_dlist = d;

	*bp->bif_driverp = bp;
}

/*
 * Detach a file from its interface.
 */
static void
bpf_detachd(struct bpf_d *d)
{
	struct bpf_d **p;
	struct bpf_if *bp;

	KASSERT(mutex_owned(&bpf_mtx));

	bp = d->bd_bif;
	/*
	 * Check if this descriptor had requested promiscuous mode.
	 * If so, turn it off.
	 */
	if (d->bd_promisc) {
		int error __diagused;

		d->bd_promisc = 0;
		/*
		 * Take the device out of promiscuous mode.  Since we
		 * were able to enter promiscuous mode, we should be
		 * able to turn it off.  But we can get an error if
		 * the interface was configured down, so just log the
		 * failure under DIAGNOSTIC rather than panic.
		 */
		error = ifpromisc(bp->bif_ifp, 0);
#ifdef DIAGNOSTIC
		if (error)
			printf("%s: ifpromisc failed: %d", __func__, error);
#endif
	}
	/* Remove d from the interface's descriptor list. */
	p = &bp->bif_dlist;
	while (*p != d) {
		p = &(*p)->bd_next;
		if (*p == NULL)
			panic("%s: descriptor not in list", __func__);
	}
	*p = (*p)->bd_next;
	if (bp->bif_dlist == NULL)
		/*
		 * Let the driver know that there are no more listeners.
		 */
		*d->bd_bif->bif_driverp = NULL;
	d->bd_bif = NULL;
}

static int
doinit(void)
{

	mutex_init(&bpf_mtx, MUTEX_DEFAULT, IPL_NONE);

	LIST_INIT(&bpf_list);

	bpf_gstats.bs_recv = 0;
	bpf_gstats.bs_drop = 0;
	bpf_gstats.bs_capt = 0;

	return 0;
}

/*
 * bpfilterattach() is called at boot time.
 */
/* ARGSUSED */
void
bpfilterattach(int n)
{
	static ONCE_DECL(control);

	RUN_ONCE(&control, doinit);
}

/*
 * Open the bpf device; each open clones a new descriptor.
 */
/* ARGSUSED */
int
bpfopen(dev_t dev, int flag, int mode, struct lwp *l)
{
	struct bpf_d *d;
	struct file *fp;
	int error, fd;

	/* falloc() will fill in the descriptor for us. */
	if ((error = fd_allocfile(&fp, &fd)) != 0)
		return error;

	d = malloc(sizeof(*d), M_DEVBUF, M_WAITOK|M_ZERO);
	d->bd_bufsize = bpf_bufsize;
	d->bd_seesent = 1;
	d->bd_feedback = 0;
	d->bd_pid = l->l_proc->p_pid;
#ifdef _LP64
	if (curproc->p_flag & PK_32)
		d->bd_compat32 = 1;
#endif
	getnanotime(&d->bd_btime);
	d->bd_atime = d->bd_mtime = d->bd_btime;
	callout_init(&d->bd_callout, 0);
	selinit(&d->bd_sel);
	d->bd_sih = softint_establish(SOFTINT_CLOCK, bpf_softintr, d);
	d->bd_jitcode = NULL;

	mutex_enter(&bpf_mtx);
	LIST_INSERT_HEAD(&bpf_list, d, bd_list);
	mutex_exit(&bpf_mtx);

	return fd_clone(fp, fd, flag, &bpf_fileops, d);
}

/*
 * Close the descriptor by detaching it from its interface,
 * deallocating its buffers, and marking it free.
 */
/* ARGSUSED */
static int
bpf_close(struct file *fp)
{
	struct bpf_d *d;
	int s;

	KERNEL_LOCK(1, NULL);
	mutex_enter(&bpf_mtx);

	if ((d = fp->f_bpf) == NULL) {
		mutex_exit(&bpf_mtx);
		KERNEL_UNLOCK_ONE(NULL);
		return 0;
	}

	/*
	 * Refresh the PID associated with this bpf file.
	 */
	d->bd_pid = curproc->p_pid;

	s = splnet();
	if (d->bd_state == BPF_WAITING)
		callout_stop(&d->bd_callout);
	d->bd_state = BPF_IDLE;
	if (d->bd_bif)
		bpf_detachd(d);
	splx(s);
	bpf_freed(d);
	LIST_REMOVE(d, bd_list);
	fp->f_bpf = NULL;

	mutex_exit(&bpf_mtx);
	KERNEL_UNLOCK_ONE(NULL);

	callout_destroy(&d->bd_callout);
	seldestroy(&d->bd_sel);
	softint_disestablish(d->bd_sih);
	free(d, M_DEVBUF);

	return (0);
}

/*
 * Rotate the packet buffers in descriptor d.  Move the store buffer
 * into the hold slot, and the free buffer into the store slot.
 * Zero the length of the new store buffer.
 */
#define ROTATE_BUFFERS(d) \
	(d)->bd_hbuf = (d)->bd_sbuf; \
	(d)->bd_hlen = (d)->bd_slen; \
	(d)->bd_sbuf = (d)->bd_fbuf; \
	(d)->bd_slen = 0; \
	(d)->bd_fbuf = NULL;
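
/*
 * Illustration: with the three buffer slots (store, hold, free), a
 * rotation transforms
 *
 *	sbuf = A (slen = n), hbuf = NULL,         fbuf = B
 * into
 *	sbuf = B (slen = 0), hbuf = A (hlen = n), fbuf = NULL
 *
 * The hold buffer is drained by bpf_read(), which moves it back into
 * the free slot once the uiomove() is done.
 */
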
/*
 *  bpfread - read next chunk of packets from buffers
 */
static int
bpf_read(struct file *fp, off_t *offp, struct uio *uio,
    kauth_cred_t cred, int flags)
{
	struct bpf_d *d = fp->f_bpf;
	int timed_out;
	int error;
	int s;

	getnanotime(&d->bd_atime);
	/*
	 * Restrict application to use a buffer the same size as
	 * the kernel buffers.
	 */
	if (uio->uio_resid != d->bd_bufsize)
		return (EINVAL);

	KERNEL_LOCK(1, NULL);
	s = splnet();
	if (d->bd_state == BPF_WAITING)
		callout_stop(&d->bd_callout);
	timed_out = (d->bd_state == BPF_TIMED_OUT);
	d->bd_state = BPF_IDLE;
	/*
	 * If the hold buffer is empty, then do a timed sleep, which
	 * ends when the timeout expires or when enough packets
	 * have arrived to fill the store buffer.
	 */
	while (d->bd_hbuf == NULL) {
		if (fp->f_flag & FNONBLOCK) {
			if (d->bd_slen == 0) {
				splx(s);
				KERNEL_UNLOCK_ONE(NULL);
				return (EWOULDBLOCK);
			}
			ROTATE_BUFFERS(d);
			break;
		}

		if ((d->bd_immediate || timed_out) && d->bd_slen != 0) {
			/*
			 * A packet(s) either arrived since the previous
			 * read or arrived while we were asleep.
			 * Rotate the buffers and return what's here.
			 */
			ROTATE_BUFFERS(d);
			break;
		}
		error = tsleep(d, PRINET|PCATCH, "bpf",
				d->bd_rtout);
		if (error == EINTR || error == ERESTART) {
			splx(s);
			KERNEL_UNLOCK_ONE(NULL);
			return (error);
		}
		if (error == EWOULDBLOCK) {
			/*
			 * On a timeout, return what's in the buffer,
			 * which may be nothing.  If there is something
			 * in the store buffer, we can rotate the buffers.
			 */
			if (d->bd_hbuf)
				/*
				 * We filled up the buffer in between
				 * getting the timeout and arriving
				 * here, so we don't need to rotate.
				 */
				break;

			if (d->bd_slen == 0) {
				splx(s);
				KERNEL_UNLOCK_ONE(NULL);
				return (0);
			}
			ROTATE_BUFFERS(d);
			break;
		}
		if (error != 0)
			goto done;
	}
	/*
	 * At this point, we know we have something in the hold slot.
	 */
	splx(s);

	/*
	 * Move data from hold buffer into user space.
	 * We know the entire buffer is transferred since
	 * we checked above that the read buffer is bpf_bufsize bytes.
	 */
	error = uiomove(d->bd_hbuf, d->bd_hlen, uio);

	s = splnet();
	d->bd_fbuf = d->bd_hbuf;
	d->bd_hbuf = NULL;
	d->bd_hlen = 0;
done:
	splx(s);
	KERNEL_UNLOCK_ONE(NULL);
	return (error);
}
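
/*
 * Example (userland, illustrative only): consuming what bpf_read()
 * returns.  A single read(2) yields zero or more records, each a
 * struct bpf_hdr followed by the captured bytes, padded so that the
 * next header is again word aligned:
 *
 *	char *p = buf, *end = buf + n;	// n = return value of read(2)
 *	while (p < end) {
 *		struct bpf_hdr *bh = (struct bpf_hdr *)p;
 *		handle(p + bh->bh_hdrlen, bh->bh_caplen);	// hypothetical consumer
 *		p += BPF_WORDALIGN(bh->bh_hdrlen + bh->bh_caplen);
 *	}
 */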

/*
 * If there are processes sleeping on this descriptor, wake them up.
 */
static inline void
bpf_wakeup(struct bpf_d *d)
{
	wakeup(d);
	if (d->bd_async)
		softint_schedule(d->bd_sih);
	selnotify(&d->bd_sel, 0, 0);
}

static void
bpf_softintr(void *cookie)
{
	struct bpf_d *d;

	d = cookie;
	if (d->bd_async)
		fownsignal(d->bd_pgid, SIGIO, 0, 0, NULL);
}

static void
bpf_timed_out(void *arg)
{
	struct bpf_d *d = arg;
	int s;

	s = splnet();
	if (d->bd_state == BPF_WAITING) {
		d->bd_state = BPF_TIMED_OUT;
		if (d->bd_slen != 0)
			bpf_wakeup(d);
	}
	splx(s);
}


static int
bpf_write(struct file *fp, off_t *offp, struct uio *uio,
    kauth_cred_t cred, int flags)
{
	struct bpf_d *d = fp->f_bpf;
	struct ifnet *ifp;
	struct mbuf *m, *mc;
	int error, s;
	static struct sockaddr_storage dst;

	m = NULL;	/* XXX gcc */

	KERNEL_LOCK(1, NULL);

	if (d->bd_bif == NULL) {
		KERNEL_UNLOCK_ONE(NULL);
		return (ENXIO);
	}
	getnanotime(&d->bd_mtime);

	ifp = d->bd_bif->bif_ifp;

	if (uio->uio_resid == 0) {
		KERNEL_UNLOCK_ONE(NULL);
		return (0);
	}

	error = bpf_movein(uio, (int)d->bd_bif->bif_dlt, ifp->if_mtu, &m,
		(struct sockaddr *) &dst);
	if (error) {
		KERNEL_UNLOCK_ONE(NULL);
		return (error);
	}

	if (m->m_pkthdr.len > ifp->if_mtu) {
		KERNEL_UNLOCK_ONE(NULL);
		m_freem(m);
		return (EMSGSIZE);
	}

	if (d->bd_hdrcmplt)
		dst.ss_family = pseudo_AF_HDRCMPLT;

	if (d->bd_feedback) {
		mc = m_dup(m, 0, M_COPYALL, M_NOWAIT);
		if (mc != NULL)
			m_set_rcvif(mc, ifp);
		/* Set M_PROMISC for outgoing packets to be discarded. */
		if (1 /*d->bd_direction == BPF_D_INOUT*/)
			m->m_flags |= M_PROMISC;
	} else
		mc = NULL;

	s = splsoftnet();
	error = if_output_lock(ifp, ifp, m, (struct sockaddr *) &dst, NULL);

	if (mc != NULL) {
		if (error == 0)
			ifp->_if_input(ifp, mc);
		else
			m_freem(mc);
	}
	splx(s);
	KERNEL_UNLOCK_ONE(NULL);
	/*
	 * The driver frees the mbuf.
	 */
	return (error);
}
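
/*
 * Example (userland, illustrative only): injecting a frame through the
 * path above.  Once the descriptor is bound with BIOCSETIF, a plain
 * write(2) of a complete link-level frame is handed to bpf_movein()
 * and then to the interface output routine:
 *
 *	struct ifreq ifr;
 *	memset(&ifr, 0, sizeof(ifr));
 *	strlcpy(ifr.ifr_name, "wm0", sizeof(ifr.ifr_name));	// hypothetical interface
 *	ioctl(fd, BIOCSETIF, &ifr);
 *	write(fd, frame, framelen);	// frame begins with the Ethernet header
 */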

/*
 * Reset a descriptor by flushing its packet buffer and clearing the
 * receive and drop counts.  Should be called at splnet.
 */
static void
reset_d(struct bpf_d *d)
{
	if (d->bd_hbuf) {
		/* Free the hold buffer. */
		d->bd_fbuf = d->bd_hbuf;
		d->bd_hbuf = NULL;
	}
	d->bd_slen = 0;
	d->bd_hlen = 0;
	d->bd_rcount = 0;
	d->bd_dcount = 0;
	d->bd_ccount = 0;
}

/*
 *  FIONREAD		Check for read packet available.
 *  BIOCGBLEN		Get buffer len [for read()].
 *  BIOCSETF		Set link layer read filter.
 *  BIOCFLUSH		Flush read packet buffer.
 *  BIOCPROMISC		Put interface into promiscuous mode.
 *  BIOCGDLT		Get link layer type.
 *  BIOCGETIF		Get interface name.
 *  BIOCSETIF		Set interface.
 *  BIOCSRTIMEOUT	Set read timeout.
 *  BIOCGRTIMEOUT	Get read timeout.
 *  BIOCGSTATS		Get packet stats.
 *  BIOCIMMEDIATE	Set immediate mode.
 *  BIOCVERSION		Get filter language version.
 *  BIOCGHDRCMPLT	Get "header already complete" flag.
 *  BIOCSHDRCMPLT	Set "header already complete" flag.
 *  BIOCSFEEDBACK	Set packet feedback mode.
 *  BIOCGFEEDBACK	Get packet feedback mode.
 *  BIOCGSEESENT	Get "see sent packets" mode.
 *  BIOCSSEESENT	Set "see sent packets" mode.
 */
/* ARGSUSED */
static int
bpf_ioctl(struct file *fp, u_long cmd, void *addr)
{
	struct bpf_d *d = fp->f_bpf;
	int s, error = 0;

	/*
	 * Refresh the PID associated with this bpf file.
	 */
	KERNEL_LOCK(1, NULL);
	d->bd_pid = curproc->p_pid;
#ifdef _LP64
	if (curproc->p_flag & PK_32)
		d->bd_compat32 = 1;
	else
		d->bd_compat32 = 0;
#endif

	s = splnet();
	if (d->bd_state == BPF_WAITING)
		callout_stop(&d->bd_callout);
	d->bd_state = BPF_IDLE;
	splx(s);

	switch (cmd) {

	default:
		error = EINVAL;
		break;

	/*
	 * Check for read packet available.
	 */
	case FIONREAD:
		{
			int n;

			s = splnet();
			n = d->bd_slen;
			if (d->bd_hbuf)
				n += d->bd_hlen;
			splx(s);

			*(int *)addr = n;
			break;
		}

	/*
	 * Get buffer len [for read()].
	 */
	case BIOCGBLEN:
		*(u_int *)addr = d->bd_bufsize;
		break;

	/*
	 * Set buffer length.
	 */
	case BIOCSBLEN:
		if (d->bd_bif != NULL)
			error = EINVAL;
		else {
			u_int size = *(u_int *)addr;

			if (size > bpf_maxbufsize)
				*(u_int *)addr = size = bpf_maxbufsize;
			else if (size < BPF_MINBUFSIZE)
				*(u_int *)addr = size = BPF_MINBUFSIZE;
			d->bd_bufsize = size;
		}
		break;

	/*
	 * Set link layer read filter.
	 */
	case BIOCSETF:
		error = bpf_setf(d, addr);
		break;

	/*
	 * Flush read packet buffer.
	 */
	case BIOCFLUSH:
		s = splnet();
		reset_d(d);
		splx(s);
		break;

	/*
	 * Put interface into promiscuous mode.
	 */
	case BIOCPROMISC:
		if (d->bd_bif == NULL) {
			/*
			 * No interface attached yet.
			 */
			error = EINVAL;
			break;
		}
		s = splnet();
		if (d->bd_promisc == 0) {
			error = ifpromisc(d->bd_bif->bif_ifp, 1);
			if (error == 0)
				d->bd_promisc = 1;
		}
		splx(s);
		break;

	/*
	 * Get device parameters.
	 */
	case BIOCGDLT:
		if (d->bd_bif == NULL)
			error = EINVAL;
		else
			*(u_int *)addr = d->bd_bif->bif_dlt;
		break;

	/*
	 * Get a list of supported device parameters.
	 */
	case BIOCGDLTLIST:
		if (d->bd_bif == NULL)
			error = EINVAL;
		else
			error = bpf_getdltlist(d, addr);
		break;

	/*
	 * Set device parameters.
	 */
	case BIOCSDLT:
		mutex_enter(&bpf_mtx);
		if (d->bd_bif == NULL)
			error = EINVAL;
		else
			error = bpf_setdlt(d, *(u_int *)addr);
		mutex_exit(&bpf_mtx);
		break;

	/*
	 * Get interface name.
	 */
#ifdef OBIOCGETIF
	case OBIOCGETIF:
#endif
	case BIOCGETIF:
		if (d->bd_bif == NULL)
			error = EINVAL;
		else
			bpf_ifname(d->bd_bif->bif_ifp, addr);
		break;

	/*
	 * Set interface.
	 */
#ifdef OBIOCSETIF
	case OBIOCSETIF:
#endif
	case BIOCSETIF:
		mutex_enter(&bpf_mtx);
		error = bpf_setif(d, addr);
		mutex_exit(&bpf_mtx);
		break;

	/*
	 * Set read timeout.
	 */
	case BIOCSRTIMEOUT:
		{
			struct timeval *tv = addr;

			/* Compute number of ticks. */
			d->bd_rtout = tv->tv_sec * hz + tv->tv_usec / tick;
			if ((d->bd_rtout == 0) && (tv->tv_usec != 0))
				d->bd_rtout = 1;
			break;
		}

#ifdef BIOCGORTIMEOUT
	/*
	 * Get read timeout.
	 */
	case BIOCGORTIMEOUT:
		{
			struct timeval50 *tv = addr;

			tv->tv_sec = d->bd_rtout / hz;
			tv->tv_usec = (d->bd_rtout % hz) * tick;
			break;
		}
#endif

#ifdef BIOCSORTIMEOUT
	/*
	 * Set read timeout.
	 */
	case BIOCSORTIMEOUT:
		{
			struct timeval50 *tv = addr;

			/* Compute number of ticks. */
			d->bd_rtout = tv->tv_sec * hz + tv->tv_usec / tick;
			if ((d->bd_rtout == 0) && (tv->tv_usec != 0))
				d->bd_rtout = 1;
			break;
		}
#endif

	/*
	 * Get read timeout.
	 */
	case BIOCGRTIMEOUT:
		{
			struct timeval *tv = addr;

			tv->tv_sec = d->bd_rtout / hz;
			tv->tv_usec = (d->bd_rtout % hz) * tick;
			break;
		}
	/*
	 * Get packet stats.
	 */
	case BIOCGSTATS:
		{
			struct bpf_stat *bs = addr;

			bs->bs_recv = d->bd_rcount;
			bs->bs_drop = d->bd_dcount;
			bs->bs_capt = d->bd_ccount;
			break;
		}

	case BIOCGSTATSOLD:
		{
			struct bpf_stat_old *bs = addr;

			bs->bs_recv = d->bd_rcount;
			bs->bs_drop = d->bd_dcount;
			break;
		}

	/*
	 * Set immediate mode.
	 */
	case BIOCIMMEDIATE:
		d->bd_immediate = *(u_int *)addr;
		break;

	case BIOCVERSION:
		{
			struct bpf_version *bv = addr;

			bv->bv_major = BPF_MAJOR_VERSION;
			bv->bv_minor = BPF_MINOR_VERSION;
			break;
		}

	case BIOCGHDRCMPLT:	/* get "header already complete" flag */
		*(u_int *)addr = d->bd_hdrcmplt;
		break;

	case BIOCSHDRCMPLT:	/* set "header already complete" flag */
		d->bd_hdrcmplt = *(u_int *)addr ? 1 : 0;
		break;

	/*
	 * Get "see sent packets" flag
	 */
	case BIOCGSEESENT:
		*(u_int *)addr = d->bd_seesent;
		break;

	/*
	 * Set "see sent packets" flag
	 */
	case BIOCSSEESENT:
		d->bd_seesent = *(u_int *)addr;
		break;

	/*
	 * Set "feed packets from bpf back to input" mode
	 */
	case BIOCSFEEDBACK:
		d->bd_feedback = *(u_int *)addr;
		break;

	/*
	 * Get "feed packets from bpf back to input" mode
	 */
	case BIOCGFEEDBACK:
		*(u_int *)addr = d->bd_feedback;
		break;

	case FIONBIO:		/* Non-blocking I/O */
		/*
		 * No need to do anything special as we check FNONBLOCK
		 * in bpf_read() as an indication of whether or not to
		 * block the read.
		 */
		break;

	case FIOASYNC:		/* Send signal on receive packets */
		d->bd_async = *(int *)addr;
		break;

	case TIOCSPGRP:		/* Process or group to send signals to */
	case FIOSETOWN:
		error = fsetown(&d->bd_pgid, cmd, addr);
		break;

	case TIOCGPGRP:
	case FIOGETOWN:
		error = fgetown(d->bd_pgid, cmd, addr);
		break;
	}
	KERNEL_UNLOCK_ONE(NULL);
	return (error);
}
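
/*
 * Example (userland, illustrative only): a minimal capture setup built
 * from the ioctls handled above:
 *
 *	u_int blen, one = 1;
 *	ioctl(fd, BIOCSETIF, &ifr);	// bind to an interface first
 *	ioctl(fd, BIOCGBLEN, &blen);	// size the read(2) buffer to match
 *	ioctl(fd, BIOCIMMEDIATE, &one);	// deliver packets as they arrive
 *	ioctl(fd, BIOCPROMISC, NULL);	// optional: promiscuous mode
 */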

/*
 * Set d's packet filter program to fp.  If this file already has a filter,
 * free it and replace it.  Returns EINVAL for bogus requests.
 */
int
bpf_setf(struct bpf_d *d, struct bpf_program *fp)
{
	struct bpf_insn *fcode, *old;
	bpfjit_func_t jcode, oldj;
	size_t flen, size;
	int s;

	jcode = NULL;
	flen = fp->bf_len;

	if ((fp->bf_insns == NULL && flen) || flen > BPF_MAXINSNS) {
		return EINVAL;
	}

	if (flen) {
		/*
		 * Allocate the buffer, copy the byte-code from
		 * userspace and validate it.
		 */
		size = flen * sizeof(*fp->bf_insns);
		fcode = malloc(size, M_DEVBUF, M_WAITOK);
		if (copyin(fp->bf_insns, fcode, size) != 0 ||
		    !bpf_validate(fcode, (int)flen)) {
			free(fcode, M_DEVBUF);
			return EINVAL;
		}
		membar_consumer();
		if (bpf_jit)
			jcode = bpf_jit_generate(NULL, fcode, flen);
	} else {
		fcode = NULL;
	}

	s = splnet();
	old = d->bd_filter;
	d->bd_filter = fcode;
	oldj = d->bd_jitcode;
	d->bd_jitcode = jcode;
	reset_d(d);
	splx(s);

	if (old) {
		free(old, M_DEVBUF);
	}
	if (oldj) {
		bpf_jit_freecode(oldj);
	}

	return 0;
}
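
/*
 * Example (userland, illustrative only): the program validated above is
 * an array of struct bpf_insn.  The simplest useful filter accepts
 * every packet, capturing at most the given number of bytes:
 *
 *	struct bpf_insn insns[] = {
 *		BPF_STMT(BPF_RET+BPF_K, 96),	// accept; snapshot 96 bytes
 *	};
 *	struct bpf_program prog = { 1, insns };
 *	ioctl(fd, BIOCSETF, &prog);
 */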

/*
 * Detach a file from its current interface (if attached at all) and attach
 * to the interface indicated by the name stored in ifr.
 * Return an errno or 0.
 */
static int
bpf_setif(struct bpf_d *d, struct ifreq *ifr)
{
	struct bpf_if *bp;
	char *cp;
	int unit_seen, i, s, error;

	KASSERT(mutex_owned(&bpf_mtx));
	/*
	 * Make sure the provided name has a unit number, and default
	 * it to '0' if not specified.
	 * XXX This is ugly ... do this differently?
	 */
	unit_seen = 0;
	cp = ifr->ifr_name;
	cp[sizeof(ifr->ifr_name) - 1] = '\0';	/* sanity */
	while (*cp++)
		if (*cp >= '0' && *cp <= '9')
			unit_seen = 1;
	if (!unit_seen) {
		/* Make sure to leave room for the '\0'. */
		for (i = 0; i < (IFNAMSIZ - 1); ++i) {
			if ((ifr->ifr_name[i] >= 'a' &&
			     ifr->ifr_name[i] <= 'z') ||
			    (ifr->ifr_name[i] >= 'A' &&
			     ifr->ifr_name[i] <= 'Z'))
				continue;
			ifr->ifr_name[i] = '0';
		}
	}

	/*
	 * Look through attached interfaces for the named one.
	 */
	for (bp = bpf_iflist; bp != NULL; bp = bp->bif_next) {
		struct ifnet *ifp = bp->bif_ifp;

		if (ifp == NULL ||
		    strcmp(ifp->if_xname, ifr->ifr_name) != 0)
			continue;
		/* skip additional entry */
		if (bp->bif_driverp != &ifp->if_bpf)
			continue;
		/*
		 * We found the requested interface.
		 * Allocate the packet buffers if we need to.
		 * If we're already attached to requested interface,
		 * just flush the buffer.
		 */
		if (d->bd_sbuf == NULL) {
			error = bpf_allocbufs(d);
			if (error != 0)
				return (error);
		}
		s = splnet();
		if (bp != d->bd_bif) {
			if (d->bd_bif)
				/*
				 * Detach if attached to something else.
				 */
				bpf_detachd(d);

			bpf_attachd(d, bp);
		}
		reset_d(d);
		splx(s);
		return (0);
	}
	/* Not found. */
	return (ENXIO);
}

/*
 * Copy the interface name to the ifreq.
 */
static void
bpf_ifname(struct ifnet *ifp, struct ifreq *ifr)
{
	memcpy(ifr->ifr_name, ifp->if_xname, IFNAMSIZ);
}

static int
bpf_stat(struct file *fp, struct stat *st)
{
	struct bpf_d *d = fp->f_bpf;

	(void)memset(st, 0, sizeof(*st));
	KERNEL_LOCK(1, NULL);
	st->st_dev = makedev(cdevsw_lookup_major(&bpf_cdevsw), d->bd_pid);
	st->st_atimespec = d->bd_atime;
	st->st_mtimespec = d->bd_mtime;
	st->st_ctimespec = st->st_birthtimespec = d->bd_btime;
	st->st_uid = kauth_cred_geteuid(fp->f_cred);
	st->st_gid = kauth_cred_getegid(fp->f_cred);
	st->st_mode = S_IFCHR;
	KERNEL_UNLOCK_ONE(NULL);
	return 0;
}

/*
 * Support for poll() system call
 *
 * Return true iff the specific operation will not block indefinitely - with
 * the assumption that it is safe to positively acknowledge a request for the
 * ability to write to the BPF device.
 * Otherwise, return false but make a note that a selnotify() must be done.
 */
static int
bpf_poll(struct file *fp, int events)
{
	struct bpf_d *d = fp->f_bpf;
	int s = splnet();
	int revents;

	/*
	 * Refresh the PID associated with this bpf file.
	 */
	KERNEL_LOCK(1, NULL);
	d->bd_pid = curproc->p_pid;

	revents = events & (POLLOUT | POLLWRNORM);
	if (events & (POLLIN | POLLRDNORM)) {
		/*
		 * An imitation of the FIONREAD ioctl code.
		 */
		if (d->bd_hlen != 0 ||
		    ((d->bd_immediate || d->bd_state == BPF_TIMED_OUT) &&
		     d->bd_slen != 0)) {
			revents |= events & (POLLIN | POLLRDNORM);
		} else {
			selrecord(curlwp, &d->bd_sel);
			/* Start the read timeout if necessary */
			if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) {
				callout_reset(&d->bd_callout, d->bd_rtout,
					      bpf_timed_out, d);
				d->bd_state = BPF_WAITING;
			}
		}
	}

	KERNEL_UNLOCK_ONE(NULL);
	splx(s);
	return (revents);
}

static void
filt_bpfrdetach(struct knote *kn)
{
	struct bpf_d *d = kn->kn_hook;
	int s;

	KERNEL_LOCK(1, NULL);
	s = splnet();
	SLIST_REMOVE(&d->bd_sel.sel_klist, kn, knote, kn_selnext);
	splx(s);
	KERNEL_UNLOCK_ONE(NULL);
}

static int
filt_bpfread(struct knote *kn, long hint)
{
	struct bpf_d *d = kn->kn_hook;
	int rv;

	KERNEL_LOCK(1, NULL);
	kn->kn_data = d->bd_hlen;
	if (d->bd_immediate)
		kn->kn_data += d->bd_slen;
	rv = (kn->kn_data > 0);
	KERNEL_UNLOCK_ONE(NULL);
	return rv;
}

static const struct filterops bpfread_filtops =
	{ 1, NULL, filt_bpfrdetach, filt_bpfread };

static int
bpf_kqfilter(struct file *fp, struct knote *kn)
{
	struct bpf_d *d = fp->f_bpf;
	struct klist *klist;
	int s;

	KERNEL_LOCK(1, NULL);

	switch (kn->kn_filter) {
	case EVFILT_READ:
		klist = &d->bd_sel.sel_klist;
		kn->kn_fop = &bpfread_filtops;
		break;

	default:
		KERNEL_UNLOCK_ONE(NULL);
		return (EINVAL);
	}

	kn->kn_hook = d;

	s = splnet();
	SLIST_INSERT_HEAD(klist, kn, kn_selnext);
	splx(s);
	KERNEL_UNLOCK_ONE(NULL);

	return (0);
}
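
/*
 * Example (userland, illustrative only): the EVFILT_READ filter
 * registered above makes a bpf descriptor usable with kqueue(2):
 *
 *	struct kevent ev;
 *	EV_SET(&ev, fd, EVFILT_READ, EV_ADD, 0, 0, 0);
 *	kevent(kq, &ev, 1, NULL, 0, NULL);
 *	// on return, ev.data carries the byte count from filt_bpfread()
 */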

/*
 * Copy data from an mbuf chain into a buffer.  This code is derived
 * from m_copydata in sys/uipc_mbuf.c.
 */
static void *
bpf_mcpy(void *dst_arg, const void *src_arg, size_t len)
{
	const struct mbuf *m;
	u_int count;
	u_char *dst;

	m = src_arg;
	dst = dst_arg;
	while (len > 0) {
		if (m == NULL)
			panic("bpf_mcpy");
		count = min(m->m_len, len);
		memcpy(dst, mtod(m, const void *), count);
		m = m->m_next;
		dst += count;
		len -= count;
	}
	return dst_arg;
}

/*
 * Dispatch a packet to all the listeners on interface bp.
 *
 * pkt     pointer to the packet, either a data buffer or an mbuf chain
 * buflen  buffer length, if pkt is a data buffer
 * cpfn    a function that can copy pkt into the listener's buffer
 * pktlen  length of the packet
 * rcv     true if packet came in
 */
static inline void
bpf_deliver(struct bpf_if *bp, void *(*cpfn)(void *, const void *, size_t),
    void *pkt, u_int pktlen, u_int buflen, const bool rcv)
{
	uint32_t mem[BPF_MEMWORDS];
	bpf_args_t args = {
		.pkt = (const uint8_t *)pkt,
		.wirelen = pktlen,
		.buflen = buflen,
		.mem = mem,
		.arg = NULL
	};
	bool gottime = false;
	struct timespec ts;

	/*
	 * Note that the IPL does not have to be raised at this point.
	 * The only problem that could arise would be if two different
	 * interfaces shared any data, which is not the case.
	 */
	for (struct bpf_d *d = bp->bif_dlist; d != NULL; d = d->bd_next) {
		u_int slen;

		if (!d->bd_seesent && !rcv) {
			continue;
		}
		d->bd_rcount++;
		bpf_gstats.bs_recv++;

		if (d->bd_jitcode)
			slen = d->bd_jitcode(NULL, &args);
		else
			slen = bpf_filter_ext(NULL, d->bd_filter, &args);

		if (!slen) {
			continue;
		}
		if (!gottime) {
			gottime = true;
			nanotime(&ts);
		}
		catchpacket(d, pkt, pktlen, slen, cpfn, &ts);
	}
}

/*
 * Incoming linkage from device drivers.  Process the packet pkt, of length
 * pktlen, which is stored in a contiguous buffer.  The packet is parsed
 * by each process' filter, and if accepted, stashed into the corresponding
 * buffer.
 */
static void
_bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen)
{

	bpf_deliver(bp, memcpy, pkt, pktlen, pktlen, true);
}

/*
 * Incoming linkage from device drivers, when the head of the packet is in
 * a buffer, and the tail is in an mbuf chain.
 */
static void
_bpf_mtap2(struct bpf_if *bp, void *data, u_int dlen, struct mbuf *m)
{
	u_int pktlen;
	struct mbuf mb;

	/* Skip outgoing duplicate packets. */
	if ((m->m_flags & M_PROMISC) != 0 && m->m_pkthdr.rcvif_index == 0) {
		m->m_flags &= ~M_PROMISC;
		return;
	}

	pktlen = m_length(m) + dlen;

	/*
	 * Craft on-stack mbuf suitable for passing to bpf_filter.
	 * Note that we cut corners here; we only setup what's
	 * absolutely needed--this mbuf should never go anywhere else.
	 */
	(void)memset(&mb, 0, sizeof(mb));
	mb.m_next = m;
	mb.m_data = data;
	mb.m_len = dlen;

	bpf_deliver(bp, bpf_mcpy, &mb, pktlen, 0, m->m_pkthdr.rcvif_index != 0);
}

/*
 * Incoming linkage from device drivers, when packet is in an mbuf chain.
 */
static void
_bpf_mtap(struct bpf_if *bp, struct mbuf *m)
{
	void *(*cpfn)(void *, const void *, size_t);
	u_int pktlen, buflen;
	void *marg;

	/* Skip outgoing duplicate packets. */
	if ((m->m_flags & M_PROMISC) != 0 && m->m_pkthdr.rcvif_index == 0) {
		m->m_flags &= ~M_PROMISC;
		return;
	}

	pktlen = m_length(m);

	if (pktlen == m->m_len) {
		cpfn = (void *)memcpy;
		marg = mtod(m, void *);
		buflen = pktlen;
	} else {
		cpfn = bpf_mcpy;
		marg = m;
		buflen = 0;
	}

	bpf_deliver(bp, cpfn, marg, pktlen, buflen, m->m_pkthdr.rcvif_index != 0);
}

/*
 * We need to prepend the address family as
 * a four byte field.  Cons up a dummy header
 * to pacify bpf.  This is safe because bpf
 * will only read from the mbuf (i.e., it won't
 * try to free it or keep a pointer to it).
 */
static void
_bpf_mtap_af(struct bpf_if *bp, uint32_t af, struct mbuf *m)
{
	struct mbuf m0;

	m0.m_flags = 0;
	m0.m_next = m;
	m0.m_len = 4;
	m0.m_data = (char *)&af;

	_bpf_mtap(bp, &m0);
}

/*
 * Put the SLIP pseudo-"link header" in place.
 * Note this M_PREPEND() should never fail,
 * since we know we always have enough space
 * in the input buffer.
 */
static void
_bpf_mtap_sl_in(struct bpf_if *bp, u_char *chdr, struct mbuf **m)
{
	int s;
	u_char *hp;

	M_PREPEND(*m, SLIP_HDRLEN, M_DONTWAIT);
	if (*m == NULL)
		return;

	hp = mtod(*m, u_char *);
	hp[SLX_DIR] = SLIPDIR_IN;
	(void)memcpy(&hp[SLX_CHDR], chdr, CHDR_LEN);

	s = splnet();
	_bpf_mtap(bp, *m);
	splx(s);

	m_adj(*m, SLIP_HDRLEN);
}

/*
 * Put the SLIP pseudo-"link header" in
 * place.  The compressed header is now
 * at the beginning of the mbuf.
 */
static void
_bpf_mtap_sl_out(struct bpf_if *bp, u_char *chdr, struct mbuf *m)
{
	struct mbuf m0;
	u_char *hp;
	int s;

	m0.m_flags = 0;
	m0.m_next = m;
	m0.m_data = m0.m_dat;
	m0.m_len = SLIP_HDRLEN;

	hp = mtod(&m0, u_char *);

	hp[SLX_DIR] = SLIPDIR_OUT;
	(void)memcpy(&hp[SLX_CHDR], chdr, CHDR_LEN);

	s = splnet();
	_bpf_mtap(bp, &m0);
	splx(s);
	m_freem(m);
}

static int
bpf_hdrlen(struct bpf_d *d)
{
	int hdrlen = d->bd_bif->bif_hdrlen;
	/*
	 * Compute the length of the bpf header.  This is not necessarily
	 * equal to SIZEOF_BPF_HDR because we want to insert spacing such
	 * that the network layer header begins on a longword boundary (for
	 * performance reasons and to alleviate alignment restrictions).
	 */
#ifdef _LP64
	if (d->bd_compat32)
		return (BPF_WORDALIGN32(hdrlen + SIZEOF_BPF_HDR32) - hdrlen);
	else
#endif
		return (BPF_WORDALIGN(hdrlen + SIZEOF_BPF_HDR) - hdrlen);
}
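
/*
 * Worked example (illustrative, assuming a 32-bit platform where
 * SIZEOF_BPF_HDR is 18 and BPF_WORDALIGN() rounds to 4 bytes): for
 * DLT_EN10MB, bif_hdrlen is 14, so BPF_WORDALIGN(14 + 18) = 32 and
 * bpf_hdrlen() returns 32 - 14 = 18.  Placing the bpf header 18 bytes
 * before the frame makes the network layer header land on a longword
 * boundary.
 */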

/*
 * Move the packet data from interface memory (pkt) into the
 * store buffer. Call the wakeup functions if it's time to wakeup
 * a listener (buffer full), "cpfn" is the routine called to do the
 * actual data transfer. memcpy is passed in to copy contiguous chunks,
 * while bpf_mcpy is passed in to copy mbuf chains.  In the latter case,
 * pkt is really an mbuf.
 */
static void
catchpacket(struct bpf_d *d, u_char *pkt, u_int pktlen, u_int snaplen,
    void *(*cpfn)(void *, const void *, size_t), struct timespec *ts)
{
	char *h;
	int totlen, curlen, caplen;
	int hdrlen = bpf_hdrlen(d);
	int do_wakeup = 0;

	++d->bd_ccount;
	++bpf_gstats.bs_capt;
	/*
	 * Figure out how many bytes to move.  If the packet is
	 * greater or equal to the snapshot length, transfer that
	 * much.  Otherwise, transfer the whole packet (unless
	 * we hit the buffer size limit).
	 */
	totlen = hdrlen + min(snaplen, pktlen);
	if (totlen > d->bd_bufsize)
		totlen = d->bd_bufsize;
	/*
	 * If we adjusted totlen to fit the bufsize, it could be that
	 * totlen is smaller than hdrlen because of the link layer header.
	 */
	caplen = totlen - hdrlen;
	if (caplen < 0)
		caplen = 0;

	/*
	 * Round up the end of the previous packet to the next longword.
	 */
#ifdef _LP64
	if (d->bd_compat32)
		curlen = BPF_WORDALIGN32(d->bd_slen);
	else
#endif
		curlen = BPF_WORDALIGN(d->bd_slen);
	if (curlen + totlen > d->bd_bufsize) {
		/*
		 * This packet will overflow the storage buffer.
		 * Rotate the buffers if we can, then wakeup any
		 * pending reads.
		 */
		if (d->bd_fbuf == NULL) {
			/*
			 * We haven't completed the previous read yet,
			 * so drop the packet.
			 */
			++d->bd_dcount;
			++bpf_gstats.bs_drop;
			return;
		}
		ROTATE_BUFFERS(d);
		do_wakeup = 1;
		curlen = 0;
	} else if (d->bd_immediate || d->bd_state == BPF_TIMED_OUT) {
		/*
		 * Immediate mode is set, or the read timeout has
		 * already expired during a select call.  A packet
		 * arrived, so the reader should be woken up.
		 */
		do_wakeup = 1;
	}

	/*
	 * Append the bpf header.
	 */
	h = (char *)d->bd_sbuf + curlen;
#ifdef _LP64
	if (d->bd_compat32) {
		struct bpf_hdr32 *hp32;

		hp32 = (struct bpf_hdr32 *)h;
		hp32->bh_tstamp.tv_sec = ts->tv_sec;
		hp32->bh_tstamp.tv_usec = ts->tv_nsec / 1000;
		hp32->bh_datalen = pktlen;
		hp32->bh_hdrlen = hdrlen;
		hp32->bh_caplen = caplen;
	} else
#endif
	{
		struct bpf_hdr *hp;

		hp = (struct bpf_hdr *)h;
		hp->bh_tstamp.tv_sec = ts->tv_sec;
		hp->bh_tstamp.tv_usec = ts->tv_nsec / 1000;
		hp->bh_datalen = pktlen;
		hp->bh_hdrlen = hdrlen;
		hp->bh_caplen = caplen;
	}

	/*
	 * Copy the packet data into the store buffer and update its length.
	 */
	(*cpfn)(h + hdrlen, pkt, caplen);
	d->bd_slen = curlen + totlen;

	/*
	 * Call bpf_wakeup after bd_slen has been updated so that kevent(2)
	 * will cause filt_bpfread() to be called with it adjusted.
	 */
	if (do_wakeup)
		bpf_wakeup(d);
}

/*
 * Initialize all nonzero fields of a descriptor.
 */
static int
bpf_allocbufs(struct bpf_d *d)
{

	d->bd_fbuf = malloc(d->bd_bufsize, M_DEVBUF, M_NOWAIT);
	if (!d->bd_fbuf)
		return (ENOBUFS);
	d->bd_sbuf = malloc(d->bd_bufsize, M_DEVBUF, M_NOWAIT);
	if (!d->bd_sbuf) {
		free(d->bd_fbuf, M_DEVBUF);
		return (ENOBUFS);
	}
	d->bd_slen = 0;
	d->bd_hlen = 0;
	return (0);
}

/*
 * Free buffers currently in use by a descriptor.
 * Called on close.
 */
static void
bpf_freed(struct bpf_d *d)
{
	/*
	 * We don't need to lock out interrupts since this descriptor has
	 * been detached from its interface and it hasn't yet been marked
	 * free.
	 */
	if (d->bd_sbuf != NULL) {
		free(d->bd_sbuf, M_DEVBUF);
		if (d->bd_hbuf != NULL)
			free(d->bd_hbuf, M_DEVBUF);
		if (d->bd_fbuf != NULL)
			free(d->bd_fbuf, M_DEVBUF);
	}
	if (d->bd_filter)
		free(d->bd_filter, M_DEVBUF);

	if (d->bd_jitcode != NULL) {
		bpf_jit_freecode(d->bd_jitcode);
	}
}

/*
 * Attach an interface to bpf.  dlt is the link layer type;
 * hdrlen is the fixed size of the link header for the specified dlt
 * (variable length headers not yet supported).
 */
static void
_bpfattach(struct ifnet *ifp, u_int dlt, u_int hdrlen, struct bpf_if **driverp)
{
	struct bpf_if *bp;
	bp = malloc(sizeof(*bp), M_DEVBUF, M_DONTWAIT);
	if (bp == NULL)
		panic("bpfattach");

	mutex_enter(&bpf_mtx);
	bp->bif_dlist = NULL;
	bp->bif_driverp = driverp;
	bp->bif_ifp = ifp;
	bp->bif_dlt = dlt;

	bp->bif_next = bpf_iflist;
	bpf_iflist = bp;

	*bp->bif_driverp = NULL;

	bp->bif_hdrlen = hdrlen;
	mutex_exit(&bpf_mtx);
#if 0
	printf("bpf: %s attached\n", ifp->if_xname);
#endif
}

/*
 * Remove an interface from bpf.
 */
static void
_bpfdetach(struct ifnet *ifp)
{
	struct bpf_if *bp, **pbp;
	struct bpf_d *d;
	int s;

	mutex_enter(&bpf_mtx);
	/* Nuke the vnodes for any open instances */
	LIST_FOREACH(d, &bpf_list, bd_list) {
		if (d->bd_bif != NULL && d->bd_bif->bif_ifp == ifp) {
			/*
			 * Detach the descriptor from the interface now.
			 * It will be freed later by the close routine.
			 */
			s = splnet();
			d->bd_promisc = 0;	/* we can't touch device. */
			bpf_detachd(d);
			splx(s);
		}
	}

 again:
	for (bp = bpf_iflist, pbp = &bpf_iflist;
	     bp != NULL; pbp = &bp->bif_next, bp = bp->bif_next) {
		if (bp->bif_ifp == ifp) {
			*pbp = bp->bif_next;
			free(bp, M_DEVBUF);
			goto again;
		}
	}
	mutex_exit(&bpf_mtx);
}

/*
 * Change the data link type of an interface.
 */
static void
_bpf_change_type(struct ifnet *ifp, u_int dlt, u_int hdrlen)
{
	struct bpf_if *bp;

	for (bp = bpf_iflist; bp != NULL; bp = bp->bif_next) {
		if (bp->bif_driverp == &ifp->if_bpf)
			break;
	}
	if (bp == NULL)
		panic("bpf_change_type");

	bp->bif_dlt = dlt;

	bp->bif_hdrlen = hdrlen;
}

/*
 * Get the list of available data link types for the interface.
 */
static int
bpf_getdltlist(struct bpf_d *d, struct bpf_dltlist *bfl)
{
	int n, error;
	struct ifnet *ifp;
	struct bpf_if *bp;

	ifp = d->bd_bif->bif_ifp;
	n = 0;
	error = 0;
	for (bp = bpf_iflist; bp != NULL; bp = bp->bif_next) {
		if (bp->bif_ifp != ifp)
			continue;
		if (bfl->bfl_list != NULL) {
			if (n >= bfl->bfl_len)
				return ENOMEM;
			error = copyout(&bp->bif_dlt,
			    bfl->bfl_list + n, sizeof(u_int));
		}
		n++;
	}
	bfl->bfl_len = n;
	return error;
}

/*
 * Set the data link type of a BPF instance.
 */
static int
bpf_setdlt(struct bpf_d *d, u_int dlt)
{
	int s, error, opromisc;
	struct ifnet *ifp;
	struct bpf_if *bp;

	KASSERT(mutex_owned(&bpf_mtx));

	if (d->bd_bif->bif_dlt == dlt)
		return 0;
	ifp = d->bd_bif->bif_ifp;
	for (bp = bpf_iflist; bp != NULL; bp = bp->bif_next) {
		if (bp->bif_ifp == ifp && bp->bif_dlt == dlt)
			break;
	}
	if (bp == NULL)
		return EINVAL;
	s = splnet();
	opromisc = d->bd_promisc;
	bpf_detachd(d);
	bpf_attachd(d, bp);
	reset_d(d);
	if (opromisc) {
		error = ifpromisc(bp->bif_ifp, 1);
		if (error)
			printf("%s: bpf_setdlt: ifpromisc failed (%d)\n",
			    bp->bif_ifp->if_xname, error);
		else
			d->bd_promisc = 1;
	}
	splx(s);
	return 0;
}

static int
sysctl_net_bpf_maxbufsize(SYSCTLFN_ARGS)
{
	int newsize, error;
	struct sysctlnode node;

	node = *rnode;
	node.sysctl_data = &newsize;
	newsize = bpf_maxbufsize;
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return (error);

	if (newsize < BPF_MINBUFSIZE || newsize > BPF_MAXBUFSIZE)
		return (EINVAL);

	bpf_maxbufsize = newsize;

	return (0);
}

#if defined(MODULAR) || defined(BPFJIT)
static int
sysctl_net_bpf_jit(SYSCTLFN_ARGS)
{
	bool newval;
	int error;
	struct sysctlnode node;

	node = *rnode;
	node.sysctl_data = &newval;
	newval = bpf_jit;
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error != 0 || newp == NULL)
		return error;

	bpf_jit = newval;

	/*
	 * Do a full sync to publish new bpf_jit value and
	 * update bpfjit_module_ops.bj_generate_code variable.
	 */
	membar_sync();

	if (newval && bpfjit_module_ops.bj_generate_code == NULL) {
		printf("JIT compilation is postponed "
		    "until after bpfjit module is loaded\n");
	}

	return 0;
}
#endif

static int
sysctl_net_bpf_peers(SYSCTLFN_ARGS)
{
	int    error, elem_count;
	struct bpf_d	 *dp;
	struct bpf_d_ext  dpe;
	size_t len, needed, elem_size, out_size;
	char   *sp;

	if (namelen == 1 && name[0] == CTL_QUERY)
		return (sysctl_query(SYSCTLFN_CALL(rnode)));

	if (namelen != 2)
		return (EINVAL);

	/* BPF peers is privileged information. */
	error = kauth_authorize_network(l->l_cred, KAUTH_NETWORK_INTERFACE,
	    KAUTH_REQ_NETWORK_INTERFACE_GETPRIV, NULL, NULL, NULL);
	if (error)
		return (EPERM);

	len = (oldp != NULL) ? *oldlenp : 0;
	sp = oldp;
	elem_size = name[0];
	elem_count = name[1];
	out_size = MIN(sizeof(dpe), elem_size);
	needed = 0;

	if (elem_size < 1 || elem_count < 0)
		return (EINVAL);

	mutex_enter(&bpf_mtx);
	LIST_FOREACH(dp, &bpf_list, bd_list) {
		if (len >= elem_size && elem_count > 0) {
#define BPF_EXT(field)	dpe.bde_ ## field = dp->bd_ ## field
			BPF_EXT(bufsize);
			BPF_EXT(promisc);
			BPF_EXT(state);
			BPF_EXT(immediate);
			BPF_EXT(hdrcmplt);
			BPF_EXT(seesent);
			BPF_EXT(pid);
			BPF_EXT(rcount);
			BPF_EXT(dcount);
			BPF_EXT(ccount);
#undef BPF_EXT
			if (dp->bd_bif)
				(void)strlcpy(dpe.bde_ifname,
				    dp->bd_bif->bif_ifp->if_xname,
				    IFNAMSIZ - 1);
			else
				dpe.bde_ifname[0] = '\0';

			error = copyout(&dpe, sp, out_size);
			if (error)
				break;
			sp += elem_size;
			len -= elem_size;
		}
		needed += elem_size;
		if (elem_count > 0 && elem_count != INT_MAX)
			elem_count--;
	}
	mutex_exit(&bpf_mtx);

	*oldlenp = needed;

	return (error);
}

static struct sysctllog *bpf_sysctllog;
static void
sysctl_net_bpf_setup(void)
{
	const struct sysctlnode *node;

	node = NULL;
	sysctl_createv(&bpf_sysctllog, 0, NULL, &node,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_NODE, "bpf",
		       SYSCTL_DESCR("BPF options"),
		       NULL, 0, NULL, 0,
		       CTL_NET, CTL_CREATE, CTL_EOL);
	if (node != NULL) {
#if defined(MODULAR) || defined(BPFJIT)
		sysctl_createv(&bpf_sysctllog, 0, NULL, NULL,
			CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
			CTLTYPE_BOOL, "jit",
			SYSCTL_DESCR("Toggle Just-In-Time compilation"),
			sysctl_net_bpf_jit, 0, &bpf_jit, 0,
			CTL_NET, node->sysctl_num, CTL_CREATE, CTL_EOL);
#endif
		sysctl_createv(&bpf_sysctllog, 0, NULL, NULL,
			CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
			CTLTYPE_INT, "maxbufsize",
			SYSCTL_DESCR("Maximum size for data capture buffer"),
			sysctl_net_bpf_maxbufsize, 0, &bpf_maxbufsize, 0,
			CTL_NET, node->sysctl_num, CTL_CREATE, CTL_EOL);
		sysctl_createv(&bpf_sysctllog, 0, NULL, NULL,
			CTLFLAG_PERMANENT,
			CTLTYPE_STRUCT, "stats",
			SYSCTL_DESCR("BPF stats"),
			NULL, 0, &bpf_gstats, sizeof(bpf_gstats),
			CTL_NET, node->sysctl_num, CTL_CREATE, CTL_EOL);
		sysctl_createv(&bpf_sysctllog, 0, NULL, NULL,
			CTLFLAG_PERMANENT,
			CTLTYPE_STRUCT, "peers",
			SYSCTL_DESCR("BPF peers"),
			sysctl_net_bpf_peers, 0, NULL, 0,
			CTL_NET, node->sysctl_num, CTL_CREATE, CTL_EOL);
	}
}
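
/*
 * Example (illustrative only): the nodes created above appear under
 * "net.bpf", e.g.:
 *
 *	$ sysctl net.bpf.maxbufsize
 *	$ sysctl -w net.bpf.jit=1	# takes effect once bpfjit is loaded
 */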

struct bpf_ops bpf_ops_kernel = {
	.bpf_attach =		_bpfattach,
	.bpf_detach =		_bpfdetach,
	.bpf_change_type =	_bpf_change_type,

	.bpf_tap =		_bpf_tap,
	.bpf_mtap =		_bpf_mtap,
	.bpf_mtap2 =		_bpf_mtap2,
	.bpf_mtap_af =		_bpf_mtap_af,
	.bpf_mtap_sl_in =	_bpf_mtap_sl_in,
	.bpf_mtap_sl_out =	_bpf_mtap_sl_out,
};

MODULE(MODULE_CLASS_DRIVER, bpf, "bpf_filter");

static int
bpf_modcmd(modcmd_t cmd, void *arg)
{
	devmajor_t bmajor, cmajor;
	int error;

	bmajor = cmajor = NODEVMAJOR;

	switch (cmd) {
	case MODULE_CMD_INIT:
		bpfilterattach(0);
		error = devsw_attach("bpf", NULL, &bmajor,
		    &bpf_cdevsw, &cmajor);
		if (error == EEXIST)
			error = 0; /* maybe built-in ... improve eventually */
		if (error)
			break;

		bpf_ops_handover_enter(&bpf_ops_kernel);
		atomic_swap_ptr(&bpf_ops, &bpf_ops_kernel);
		bpf_ops_handover_exit();
		sysctl_net_bpf_setup();
		break;

	case MODULE_CMD_FINI:
		/*
		 * While there is no reference counting for bpf callers,
		 * unload could at least in theory be done similarly to
		 * system call disestablishment.  This should even be
		 * a little simpler:
		 *
		 * 1) replace op vector with stubs
		 * 2) post update to all cpus with xc
		 * 3) check that nobody is in bpf anymore
		 *    (it's doubtful we'd want something like l_sysent,
		 *     but we could do something like *signed* percpu
		 *     counters.  if the sum is 0, we're good).
		 * 4) if fail, unroll changes
		 *
		 * NOTE: change won't be atomic to the outside.  some
		 * packets may not be captured even if the unload is
		 * not successful.  I think packet capture not working
		 * is a perfectly logical consequence of trying to
		 * disable packet capture.
		 */
		error = EOPNOTSUPP;
		/* insert sysctl teardown */
		break;

	default:
		error = ENOTTY;
		break;
	}

	return error;
}