/*	$OpenBSD: bpf.c,v 1.177 2019/06/13 21:14:53 mpi Exp $	*/
/*	$NetBSD: bpf.c,v 1.33 1997/02/21 23:59:35 thorpej Exp $	*/

/*
 * Copyright (c) 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * Copyright (c) 2010, 2014 Henning Brauer <henning@openbsd.org>
 *
 * This code is derived from the Stanford/CMU enet packet filter,
 * (net/enet.c) distributed as part of 4.3BSD, and code contributed
 * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence
 * Berkeley Laboratory.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)bpf.c	8.2 (Berkeley) 3/28/94
 */

#include "bpfilter.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/ioctl.h>
#include <sys/conf.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/socket.h>
#include <sys/poll.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/rwlock.h>
#include <sys/atomic.h>
#include <sys/smr.h>
#include <sys/specdev.h>
#include <sys/selinfo.h>
#include <sys/task.h>

#include <net/if.h>
#include <net/bpf.h>
#include <net/bpfdesc.h>

#include <netinet/in.h>
#include <netinet/if_ether.h>

#include "vlan.h"
#if NVLAN > 0
#include <net/if_vlan_var.h>
#endif

#define BPF_BUFSIZE 32768

#define PRINET  26			/* interruptible */

/* from kern/kern_clock.c; incremented each clock tick. */
extern int ticks;

/*
 * The default read buffer size is patchable.
 */
int bpf_bufsize = BPF_BUFSIZE;
int bpf_maxbufsize = BPF_MAXBUFSIZE;
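
/*
 * Both limits can be tuned at run time through the net.bpf sysctl
 * branch (handled by bpf_sysctl() below).  For illustration, a hedged
 * sketch assuming the usual sysctl names; the values are assumptions:
 *
 *	# sysctl net.bpf.bufsize=65536
 *	# sysctl net.bpf.maxbufsize=4194304
 */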

/*
 *  bpf_iflist is the list of interfaces; each corresponds to an ifnet
 *  bpf_d_list is the list of descriptors
 */
struct bpf_if	*bpf_iflist;
LIST_HEAD(, bpf_d) bpf_d_list;

int	bpf_allocbufs(struct bpf_d *);
void	bpf_ifname(struct bpf_if*, struct ifreq *);
int	_bpf_mtap(caddr_t, const struct mbuf *, u_int,
	    void (*)(const void *, void *, size_t));
void	bpf_mcopy(const void *, void *, size_t);
int	bpf_movein(struct uio *, struct bpf_d *, struct mbuf **,
	    struct sockaddr *);
int	bpf_setif(struct bpf_d *, struct ifreq *);
int	bpfpoll(dev_t, int, struct proc *);
int	bpfkqfilter(dev_t, struct knote *);
void	bpf_wakeup(struct bpf_d *);
void	bpf_wakeup_cb(void *);
void	bpf_catchpacket(struct bpf_d *, u_char *, size_t, size_t,
	    void (*)(const void *, void *, size_t), struct timeval *);
int	bpf_getdltlist(struct bpf_d *, struct bpf_dltlist *);
int	bpf_setdlt(struct bpf_d *, u_int);

void	filt_bpfrdetach(struct knote *);
int	filt_bpfread(struct knote *, long);

int	bpf_sysctl_locked(int *, u_int, void *, size_t *, void *, size_t);

struct bpf_d *bpfilter_lookup(int);

/*
 * Called holding ``bd_mtx''.
 */
void	bpf_attachd(struct bpf_d *, struct bpf_if *);
void	bpf_detachd(struct bpf_d *);
void	bpf_resetd(struct bpf_d *);

void	bpf_prog_smr(void *);
void	bpf_d_smr(void *);

struct rwlock bpf_sysctl_lk = RWLOCK_INITIALIZER("bpfsz");

int
bpf_movein(struct uio *uio, struct bpf_d *d, struct mbuf **mp,
    struct sockaddr *sockp)
{
	struct bpf_program_smr *bps;
	struct bpf_insn *fcode = NULL;
	struct mbuf *m;
	struct m_tag *mtag;
	int error;
	u_int hlen;
	u_int len;
	u_int linktype;
	u_int slen;

	/*
	 * Build a sockaddr based on the data link layer type.
	 * We do this at this level because the ethernet header
	 * is copied directly into the data field of the sockaddr.
	 * In the case of SLIP, there is no header and the packet
	 * is forwarded as is.
	 * Also, we are careful to leave room at the front of the mbuf
	 * for the link level header.
	 */
	linktype = d->bd_bif->bif_dlt;
	switch (linktype) {

	case DLT_SLIP:
		sockp->sa_family = AF_INET;
		hlen = 0;
		break;

	case DLT_PPP:
		sockp->sa_family = AF_UNSPEC;
		hlen = 0;
		break;

	case DLT_EN10MB:
		sockp->sa_family = AF_UNSPEC;
		/* XXX Would MAXLINKHDR be better? */
		hlen = ETHER_HDR_LEN;
		break;

	case DLT_IEEE802_11:
	case DLT_IEEE802_11_RADIO:
		sockp->sa_family = AF_UNSPEC;
		hlen = 0;
		break;

	case DLT_RAW:
	case DLT_NULL:
		sockp->sa_family = AF_UNSPEC;
		hlen = 0;
		break;

	case DLT_LOOP:
		sockp->sa_family = AF_UNSPEC;
		hlen = sizeof(u_int32_t);
		break;

	default:
		return (EIO);
	}

	if (uio->uio_resid > MAXMCLBYTES)
		return (EIO);
	len = uio->uio_resid;

	MGETHDR(m, M_WAIT, MT_DATA);
	m->m_pkthdr.ph_ifidx = 0;
	m->m_pkthdr.len = len - hlen;

	if (len > MHLEN) {
		MCLGETI(m, M_WAIT, NULL, len);
		if ((m->m_flags & M_EXT) == 0) {
			error = ENOBUFS;
			goto bad;
		}
	}
	m->m_len = len;
	*mp = m;

	error = uiomove(mtod(m, caddr_t), len, uio);
	if (error)
		goto bad;

	smr_read_enter();
	bps = SMR_PTR_GET(&d->bd_wfilter);
	if (bps != NULL)
		fcode = bps->bps_bf.bf_insns;
	slen = bpf_filter(fcode, mtod(m, u_char *), len, len);
	smr_read_leave();

	if (slen < len) {
		error = EPERM;
		goto bad;
	}

	if (m->m_len < hlen) {
		error = EPERM;
		goto bad;
	}
	/*
	 * Make room for link header, and copy it to sockaddr
	 */
	if (hlen != 0) {
		if (linktype == DLT_LOOP) {
			u_int32_t af;

			/* the link header indicates the address family */
			KASSERT(hlen == sizeof(u_int32_t));
			memcpy(&af, m->m_data, hlen);
			sockp->sa_family = ntohl(af);
		} else
			memcpy(sockp->sa_data, m->m_data, hlen);
		m->m_len -= hlen;
		m->m_data += hlen; /* XXX */
	}

	/*
	 * Prepend the data link type as a mbuf tag
	 */
	mtag = m_tag_get(PACKET_TAG_DLT, sizeof(u_int), M_WAIT);
	*(u_int *)(mtag + 1) = linktype;
	m_tag_prepend(m, mtag);

	return (0);
 bad:
	m_freem(m);
	return (error);
}
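
/*
 * For illustration, the userland half of this path: a hedged sketch of
 * injecting a single raw frame with write(2); the descriptor fd and the
 * frame contents are assumptions:
 *
 *	unsigned char pkt[ETHER_HDR_LEN + 64];
 *	(build the ethernet header and payload in pkt)
 *	write(fd, pkt, sizeof(pkt));
 */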

/*
 * Attach file to the bpf interface, i.e. make d listen on bp.
 */
void
bpf_attachd(struct bpf_d *d, struct bpf_if *bp)
{
	MUTEX_ASSERT_LOCKED(&d->bd_mtx);

	/*
	 * Point d at bp, and add d to the interface's list of listeners.
	 * Finally, point the driver's bpf cookie at the interface so
	 * it will divert packets to bpf.
	 */

	d->bd_bif = bp;

	KERNEL_ASSERT_LOCKED();
	SMR_SLIST_INSERT_HEAD_LOCKED(&bp->bif_dlist, d, bd_next);

	*bp->bif_driverp = bp;
}

/*
 * Detach a file from its interface.
 */
void
bpf_detachd(struct bpf_d *d)
{
	struct bpf_if *bp;

	MUTEX_ASSERT_LOCKED(&d->bd_mtx);

	bp = d->bd_bif;
	/* Not attached. */
	if (bp == NULL)
		return;

	/* Remove ``d'' from the interface's descriptor list. */
	KERNEL_ASSERT_LOCKED();
	SMR_SLIST_REMOVE_LOCKED(&bp->bif_dlist, d, bpf_d, bd_next);

	if (SMR_SLIST_EMPTY_LOCKED(&bp->bif_dlist)) {
		/*
		 * Let the driver know that there are no more listeners.
		 */
		*bp->bif_driverp = NULL;
	}

	d->bd_bif = NULL;

	/*
	 * Check if this descriptor had requested promiscuous mode.
	 * If so, turn it off.
	 */
	if (d->bd_promisc) {
		int error;

		KASSERT(bp->bif_ifp != NULL);

		d->bd_promisc = 0;

		mtx_leave(&d->bd_mtx);
		NET_LOCK();
		error = ifpromisc(bp->bif_ifp, 0);
		NET_UNLOCK();
		mtx_enter(&d->bd_mtx);

		if (error && !(error == EINVAL || error == ENODEV ||
		    error == ENXIO))
			/*
			 * Something is really wrong if we were able to put
			 * the driver into promiscuous mode, but can't
			 * take it out.
			 */
			panic("bpf: ifpromisc failed");
	}
}

void
bpfilterattach(int n)
{
	LIST_INIT(&bpf_d_list);
}

/*
 * Open the bpf device.  Returns ENXIO for an illegal minor device number,
 * EBUSY if no memory is available for a new descriptor.
 */
int
bpfopen(dev_t dev, int flag, int mode, struct proc *p)
{
	struct bpf_d *bd;
	int unit = minor(dev);

	if (unit & ((1 << CLONE_SHIFT) - 1))
		return (ENXIO);

	KASSERT(bpfilter_lookup(unit) == NULL);

	/* create on demand */
	if ((bd = malloc(sizeof(*bd), M_DEVBUF, M_NOWAIT|M_ZERO)) == NULL)
		return (EBUSY);

	/* Mark "free" and do most initialization. */
	bd->bd_unit = unit;
	bd->bd_bufsize = bpf_bufsize;
	bd->bd_sig = SIGIO;
	mtx_init(&bd->bd_mtx, IPL_NET);
	task_set(&bd->bd_wake_task, bpf_wakeup_cb, bd);
	smr_init(&bd->bd_smr);

	if (flag & FNONBLOCK)
		bd->bd_rtout = -1;

	LIST_INSERT_HEAD(&bpf_d_list, bd, bd_list);

	return (0);
}

/*
 * Close the descriptor by detaching it from its interface,
 * deallocating its buffers, and marking it free.
 */
int
bpfclose(dev_t dev, int flag, int mode, struct proc *p)
{
	struct bpf_d *d;

	d = bpfilter_lookup(minor(dev));
	mtx_enter(&d->bd_mtx);
	bpf_detachd(d);
	bpf_wakeup(d);
	LIST_REMOVE(d, bd_list);
	mtx_leave(&d->bd_mtx);

	/*
	 * Wait for the task to finish here, before proceeding to garbage
	 * collection.
	 */
	taskq_barrier(systq);
	smr_call(&d->bd_smr, bpf_d_smr, d);

	return (0);
}

/*
 * Rotate the packet buffers in descriptor d.  Move the store buffer
 * into the hold slot, and the free buffer into the store slot.
 * Zero the length of the new store buffer.
 */
#define ROTATE_BUFFERS(d) \
	do { \
		KASSERT((d)->bd_in_uiomove == 0); \
		MUTEX_ASSERT_LOCKED(&(d)->bd_mtx); \
		(d)->bd_hbuf = (d)->bd_sbuf; \
		(d)->bd_hlen = (d)->bd_slen; \
		(d)->bd_sbuf = (d)->bd_fbuf; \
		(d)->bd_slen = 0; \
		(d)->bd_fbuf = NULL; \
	} while (0)
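
/*
 * The three buffers of a descriptor thus cycle through fixed roles:
 * packets accumulate in the store buffer (bd_sbuf); once full it is
 * parked in the hold slot (bd_hbuf) until read(2) drains it; the free
 * buffer (bd_fbuf) stands by to become the next store buffer.
 */
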
/*
 *  bpfread - read next chunk of packets from buffers
 */
int
bpfread(dev_t dev, struct uio *uio, int ioflag)
{
	struct bpf_d *d;
	caddr_t hbuf;
	int hlen, error;

	KERNEL_ASSERT_LOCKED();

	d = bpfilter_lookup(minor(dev));
	if (d->bd_bif == NULL)
		return (ENXIO);

	mtx_enter(&d->bd_mtx);

	/*
	 * Restrict application to use a buffer the same size as
	 * the kernel buffers.
	 */
	if (uio->uio_resid != d->bd_bufsize) {
		error = EINVAL;
		goto out;
	}

	/*
	 * If there's a timeout, bd_rdStart is tagged when we start the read.
	 * We can then figure out when we're done reading.
	 */
	if (d->bd_rtout != -1 && d->bd_rdStart == 0)
		d->bd_rdStart = ticks;
	else
		d->bd_rdStart = 0;

	/*
	 * If the hold buffer is empty, then do a timed sleep, which
	 * ends when the timeout expires or when enough packets
	 * have arrived to fill the store buffer.
	 */
	while (d->bd_hbuf == NULL) {
		if (d->bd_bif == NULL) {
			/* interface is gone */
			if (d->bd_slen == 0) {
				error = EIO;
				goto out;
			}
			ROTATE_BUFFERS(d);
			break;
		}
		if (d->bd_immediate && d->bd_slen != 0) {
			/*
			 * One or more packets arrived either since the
			 * previous read or while we were asleep.
			 * Rotate the buffers and return what's here.
			 */
			ROTATE_BUFFERS(d);
			break;
		}
		if (d->bd_rtout == -1) {
			/* User requested non-blocking I/O */
			error = EWOULDBLOCK;
		} else {
			if (d->bd_rdStart <= ULONG_MAX - d->bd_rtout &&
			    d->bd_rdStart + d->bd_rtout < ticks) {
				error = msleep(d, &d->bd_mtx, PRINET|PCATCH,
				    "bpf", d->bd_rtout);
			} else
				error = EWOULDBLOCK;
		}
		if (error == EINTR || error == ERESTART)
			goto out;
		if (error == EWOULDBLOCK) {
			/*
			 * On a timeout, return what's in the buffer,
			 * which may be nothing.  If there is something
			 * in the store buffer, we can rotate the buffers.
			 */
			if (d->bd_hbuf != NULL)
				/*
				 * We filled up the buffer in between
				 * getting the timeout and arriving
				 * here, so we don't need to rotate.
				 */
				break;

			if (d->bd_slen == 0) {
				error = 0;
				goto out;
			}
			ROTATE_BUFFERS(d);
			break;
		}
	}
	/*
	 * At this point, we know we have something in the hold slot.
	 */
	hbuf = d->bd_hbuf;
	hlen = d->bd_hlen;
	d->bd_hbuf = NULL;
	d->bd_hlen = 0;
	d->bd_fbuf = NULL;
	d->bd_in_uiomove = 1;

	/*
	 * Move data from hold buffer into user space.
	 * We know the entire buffer is transferred since
	 * we checked above that the read buffer is bpf_bufsize bytes.
	 */
	mtx_leave(&d->bd_mtx);
	error = uiomove(hbuf, hlen, uio);
	mtx_enter(&d->bd_mtx);

	/* Ensure that bpf_resetd() or ROTATE_BUFFERS() haven't been called. */
	KASSERT(d->bd_fbuf == NULL);
	KASSERT(d->bd_hbuf == NULL);
	d->bd_fbuf = hbuf;
	d->bd_in_uiomove = 0;
out:
	mtx_leave(&d->bd_mtx);

	return (error);
}

/*
 * If there are processes sleeping on this descriptor, wake them up.
 */
void
bpf_wakeup(struct bpf_d *d)
{
	MUTEX_ASSERT_LOCKED(&d->bd_mtx);

	/*
	 * As long as csignal() and selwakeup() need to be protected
	 * by the KERNEL_LOCK() we have to delay the wakeup to
	 * another context to keep the hot path KERNEL_LOCK()-free.
	 */
	task_add(systq, &d->bd_wake_task);
}

void
bpf_wakeup_cb(void *xd)
{
	struct bpf_d *d = xd;

	KERNEL_ASSERT_LOCKED();

	wakeup(d);
	if (d->bd_async && d->bd_sig)
		csignal(d->bd_pgid, d->bd_sig, d->bd_siguid, d->bd_sigeuid);

	selwakeup(&d->bd_sel);
}

int
bpfwrite(dev_t dev, struct uio *uio, int ioflag)
{
	struct bpf_d *d;
	struct ifnet *ifp;
	struct mbuf *m;
	int error;
	struct sockaddr_storage dst;

	KERNEL_ASSERT_LOCKED();

	d = bpfilter_lookup(minor(dev));
	if (d->bd_bif == NULL)
		return (ENXIO);

	ifp = d->bd_bif->bif_ifp;

	if (ifp == NULL || (ifp->if_flags & IFF_UP) == 0) {
		error = ENETDOWN;
		goto out;
	}

	if (uio->uio_resid == 0) {
		error = 0;
		goto out;
	}

	error = bpf_movein(uio, d, &m, sstosa(&dst));
	if (error)
		goto out;

	if (m->m_pkthdr.len > ifp->if_mtu) {
		m_freem(m);
		error = EMSGSIZE;
		goto out;
	}

	m->m_pkthdr.ph_rtableid = ifp->if_rdomain;
	m->m_pkthdr.pf.prio = ifp->if_llprio;

	if (d->bd_hdrcmplt && dst.ss_family == AF_UNSPEC)
		dst.ss_family = pseudo_AF_HDRCMPLT;

	NET_LOCK();
	error = ifp->if_output(ifp, m, sstosa(&dst), NULL);
	NET_UNLOCK();

out:
	return (error);
}

/*
 * Reset a descriptor by flushing its packet buffer and clearing the
 * receive and drop counts.
 */
void
bpf_resetd(struct bpf_d *d)
{
	MUTEX_ASSERT_LOCKED(&d->bd_mtx);
	KASSERT(d->bd_in_uiomove == 0);

	if (d->bd_hbuf != NULL) {
		/* Free the hold buffer. */
		d->bd_fbuf = d->bd_hbuf;
		d->bd_hbuf = NULL;
	}
	d->bd_slen = 0;
	d->bd_hlen = 0;
	d->bd_rcount = 0;
	d->bd_dcount = 0;
}

/*
 *  FIONREAD		Check for read packet available.
 *  BIOCGBLEN		Get buffer len [for read()].
 *  BIOCSETF		Set link layer read filter.
 *  BIOCSETWF		Set link layer write filter.
 *  BIOCFLUSH		Flush read packet buffer.
 *  BIOCPROMISC		Put interface into promiscuous mode.
 *  BIOCGDLTLIST	Get supported link layer types.
 *  BIOCGDLT		Get link layer type.
 *  BIOCSDLT		Set link layer type.
 *  BIOCGETIF		Get interface name.
 *  BIOCSETIF		Set interface.
 *  BIOCSRTIMEOUT	Set read timeout.
 *  BIOCGRTIMEOUT	Get read timeout.
 *  BIOCGSTATS		Get packet stats.
 *  BIOCIMMEDIATE	Set immediate mode.
 *  BIOCVERSION		Get filter language version.
 *  BIOCGHDRCMPLT	Get "header already complete" flag
 *  BIOCSHDRCMPLT	Set "header already complete" flag
 */
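/*
 * For illustration, a hedged sketch of the usual userland setup
 * sequence for these ioctls; the device path and interface name are
 * assumptions:
 *
 *	int fd = open("/dev/bpf0", O_RDWR);
 *	struct ifreq ifr;
 *	u_int yes = 1, blen;
 *
 *	strlcpy(ifr.ifr_name, "em0", sizeof(ifr.ifr_name));
 *	ioctl(fd, BIOCSETIF, &ifr);
 *	ioctl(fd, BIOCIMMEDIATE, &yes);
 *	ioctl(fd, BIOCGBLEN, &blen);	(read(2) must use exactly blen)
 */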
int
bpfioctl(dev_t dev, u_long cmd, caddr_t addr, int flag, struct proc *p)
{
	struct bpf_d *d;
	int error = 0;

	d = bpfilter_lookup(minor(dev));
	if (d->bd_locked && suser(p) != 0) {
		/* list of allowed ioctls when locked and not root */
		switch (cmd) {
		case BIOCGBLEN:
		case BIOCFLUSH:
		case BIOCGDLT:
		case BIOCGDLTLIST:
		case BIOCGETIF:
		case BIOCGRTIMEOUT:
		case BIOCGSTATS:
		case BIOCVERSION:
		case BIOCGRSIG:
		case BIOCGHDRCMPLT:
		case FIONREAD:
		case BIOCLOCK:
		case BIOCSRTIMEOUT:
		case BIOCIMMEDIATE:
		case TIOCGPGRP:
		case BIOCGDIRFILT:
			break;
		default:
			return (EPERM);
		}
	}

	switch (cmd) {
	default:
		error = EINVAL;
		break;

	/*
	 * Check for read packet available.
	 */
	case FIONREAD:
		{
			int n;

			mtx_enter(&d->bd_mtx);
			n = d->bd_slen;
			if (d->bd_hbuf != NULL)
				n += d->bd_hlen;
			mtx_leave(&d->bd_mtx);

			*(int *)addr = n;
			break;
		}

	/*
	 * Get buffer len [for read()].
	 */
	case BIOCGBLEN:
		*(u_int *)addr = d->bd_bufsize;
		break;

	/*
	 * Set buffer length.
	 */
	case BIOCSBLEN:
		if (d->bd_bif != NULL)
			error = EINVAL;
		else {
			u_int size = *(u_int *)addr;

			if (size > bpf_maxbufsize)
				*(u_int *)addr = size = bpf_maxbufsize;
			else if (size < BPF_MINBUFSIZE)
				*(u_int *)addr = size = BPF_MINBUFSIZE;
			mtx_enter(&d->bd_mtx);
			d->bd_bufsize = size;
			mtx_leave(&d->bd_mtx);
		}
		break;

	/*
	 * Set link layer read filter.
	 */
	case BIOCSETF:
		error = bpf_setf(d, (struct bpf_program *)addr, 0);
		break;

	/*
	 * Set link layer write filter.
	 */
	case BIOCSETWF:
		error = bpf_setf(d, (struct bpf_program *)addr, 1);
		break;

	/*
	 * Flush read packet buffer.
	 */
	case BIOCFLUSH:
		mtx_enter(&d->bd_mtx);
		bpf_resetd(d);
		mtx_leave(&d->bd_mtx);
		break;

	/*
	 * Put interface into promiscuous mode.
	 */
	case BIOCPROMISC:
		if (d->bd_bif == NULL) {
			/*
			 * No interface attached yet.
			 */
			error = EINVAL;
		} else if (d->bd_bif->bif_ifp != NULL) {
			if (d->bd_promisc == 0) {
				MUTEX_ASSERT_UNLOCKED(&d->bd_mtx);
				NET_LOCK();
				error = ifpromisc(d->bd_bif->bif_ifp, 1);
				NET_UNLOCK();
				if (error == 0)
					d->bd_promisc = 1;
			}
		}
		break;

	/*
	 * Get a list of supported device parameters.
	 */
	case BIOCGDLTLIST:
		if (d->bd_bif == NULL)
			error = EINVAL;
		else
			error = bpf_getdltlist(d, (struct bpf_dltlist *)addr);
		break;

	/*
	 * Get device parameters.
	 */
	case BIOCGDLT:
		if (d->bd_bif == NULL)
			error = EINVAL;
		else
			*(u_int *)addr = d->bd_bif->bif_dlt;
		break;

	/*
	 * Set device parameters.
	 */
	case BIOCSDLT:
		if (d->bd_bif == NULL)
			error = EINVAL;
		else {
			mtx_enter(&d->bd_mtx);
			error = bpf_setdlt(d, *(u_int *)addr);
			mtx_leave(&d->bd_mtx);
		}
		break;
	/*
	 * Get interface name.
	 */
	case BIOCGETIF:
		if (d->bd_bif == NULL)
			error = EINVAL;
		else
			bpf_ifname(d->bd_bif, (struct ifreq *)addr);
		break;

	/*
	 * Set interface.
	 */
	case BIOCSETIF:
		error = bpf_setif(d, (struct ifreq *)addr);
		break;

	/*
	 * Set read timeout.
	 */
	case BIOCSRTIMEOUT:
		{
			struct timeval *tv = (struct timeval *)addr;
			u_long rtout;

			/* Compute number of ticks. */
			if (tv->tv_sec < 0 || !timerisvalid(tv)) {
				error = EINVAL;
				break;
			}
			if (tv->tv_sec > INT_MAX / hz) {
				error = EOVERFLOW;
				break;
			}
			rtout = tv->tv_sec * hz;
			if (tv->tv_usec / tick > INT_MAX - rtout) {
				error = EOVERFLOW;
				break;
			}
			rtout += tv->tv_usec / tick;
			d->bd_rtout = rtout;
			if (d->bd_rtout == 0 && tv->tv_usec != 0)
				d->bd_rtout = 1;
			break;
		}

	/*
	 * Get read timeout.
	 */
	case BIOCGRTIMEOUT:
		{
			struct timeval *tv = (struct timeval *)addr;

			tv->tv_sec = d->bd_rtout / hz;
			tv->tv_usec = (d->bd_rtout % hz) * tick;
			break;
		}

	/*
	 * Get packet stats.
	 */
	case BIOCGSTATS:
		{
			struct bpf_stat *bs = (struct bpf_stat *)addr;

			bs->bs_recv = d->bd_rcount;
			bs->bs_drop = d->bd_dcount;
			break;
		}

	/*
	 * Set immediate mode.
	 */
	case BIOCIMMEDIATE:
		d->bd_immediate = *(u_int *)addr;
		break;

	case BIOCVERSION:
		{
			struct bpf_version *bv = (struct bpf_version *)addr;

			bv->bv_major = BPF_MAJOR_VERSION;
			bv->bv_minor = BPF_MINOR_VERSION;
			break;
		}

	case BIOCGHDRCMPLT:	/* get "header already complete" flag */
		*(u_int *)addr = d->bd_hdrcmplt;
		break;

	case BIOCSHDRCMPLT:	/* set "header already complete" flag */
		d->bd_hdrcmplt = *(u_int *)addr ? 1 : 0;
		break;

	case BIOCLOCK:		/* set "locked" flag (no reset) */
		d->bd_locked = 1;
		break;

	case BIOCGFILDROP:	/* get "filter-drop" flag */
		*(u_int *)addr = d->bd_fildrop;
		break;

	case BIOCSFILDROP: {	/* set "filter-drop" flag */
		unsigned int fildrop = *(u_int *)addr;
		switch (fildrop) {
		case BPF_FILDROP_PASS:
		case BPF_FILDROP_CAPTURE:
		case BPF_FILDROP_DROP:
			d->bd_fildrop = fildrop;
			break;
		default:
			error = EINVAL;
			break;
		}
		break;
	}

	case BIOCGDIRFILT:	/* get direction filter */
		*(u_int *)addr = d->bd_dirfilt;
		break;

	case BIOCSDIRFILT:	/* set direction filter */
		d->bd_dirfilt = (*(u_int *)addr) &
		    (BPF_DIRECTION_IN|BPF_DIRECTION_OUT);
		break;

	case FIONBIO:		/* Non-blocking I/O */
		if (*(int *)addr)
			d->bd_rtout = -1;
		else
			d->bd_rtout = 0;
		break;

	case FIOASYNC:		/* Send signal on receive packets */
		d->bd_async = *(int *)addr;
		break;

	/*
	 * N.B.  ioctl (FIOSETOWN) and fcntl (F_SETOWN) both end up doing
	 * the equivalent of a TIOCSPGRP and hence end up here.  *However*
	 * TIOCSPGRP's arg is a process group if it's positive and a process
	 * id if it's negative.  This is exactly the opposite of what the
	 * other two functions want!  Therefore there is code in ioctl and
	 * fcntl to negate the arg before calling here.
	 */
	case TIOCSPGRP:		/* Process or group to send signals to */
		d->bd_pgid = *(int *)addr;
		d->bd_siguid = p->p_ucred->cr_ruid;
		d->bd_sigeuid = p->p_ucred->cr_uid;
		break;

	case TIOCGPGRP:
		*(int *)addr = d->bd_pgid;
		break;

	case BIOCSRSIG:		/* Set receive signal */
		{
			u_int sig;

			sig = *(u_int *)addr;

			if (sig >= NSIG)
				error = EINVAL;
			else
				d->bd_sig = sig;
			break;
		}
	case BIOCGRSIG:
		*(u_int *)addr = d->bd_sig;
		break;
	}

	return (error);
}

/*
 * Set d's packet filter program to fp.  If this file already has a filter,
 * free it and replace it.  Returns EINVAL for bogus requests.
 */
int
bpf_setf(struct bpf_d *d, struct bpf_program *fp, int wf)
{
	struct bpf_program_smr *bps, *old_bps;
	struct bpf_insn *fcode;
	u_int flen, size;

	KERNEL_ASSERT_LOCKED();

	if (fp->bf_insns == NULL) {
		if (fp->bf_len != 0)
			return (EINVAL);
		bps = NULL;
	} else {
		flen = fp->bf_len;
		if (flen > BPF_MAXINSNS)
			return (EINVAL);

		fcode = mallocarray(flen, sizeof(*fp->bf_insns), M_DEVBUF,
		    M_WAITOK | M_CANFAIL);
		if (fcode == NULL)
			return (ENOMEM);

		size = flen * sizeof(*fp->bf_insns);
		if (copyin(fp->bf_insns, fcode, size) != 0 ||
		    bpf_validate(fcode, (int)flen) == 0) {
			free(fcode, M_DEVBUF, size);
			return (EINVAL);
		}

		bps = malloc(sizeof(*bps), M_DEVBUF, M_WAITOK);
		smr_init(&bps->bps_smr);
		bps->bps_bf.bf_len = flen;
		bps->bps_bf.bf_insns = fcode;
	}

	if (wf == 0) {
		old_bps = SMR_PTR_GET_LOCKED(&d->bd_rfilter);
		SMR_PTR_SET_LOCKED(&d->bd_rfilter, bps);
	} else {
		old_bps = SMR_PTR_GET_LOCKED(&d->bd_wfilter);
		SMR_PTR_SET_LOCKED(&d->bd_wfilter, bps);
	}

	mtx_enter(&d->bd_mtx);
	bpf_resetd(d);
	mtx_leave(&d->bd_mtx);
	if (old_bps != NULL)
		smr_call(&old_bps->bps_smr, bpf_prog_smr, old_bps);

	return (0);
}
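
/*
 * For illustration, a hedged sketch of a program a caller might hand
 * to BIOCSETF: with DLT_EN10MB it accepts whole IP packets and rejects
 * everything else (the descriptor fd is an assumption):
 *
 *	struct bpf_insn insns[] = {
 *		BPF_STMT(BPF_LD+BPF_H+BPF_ABS, 12),
 *		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, ETHERTYPE_IP, 0, 1),
 *		BPF_STMT(BPF_RET+BPF_K, (u_int)-1),
 *		BPF_STMT(BPF_RET+BPF_K, 0),
 *	};
 *	struct bpf_program prog = { nitems(insns), insns };
 *	ioctl(fd, BIOCSETF, &prog);
 */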

/*
 * Detach a file from its current interface (if attached at all) and attach
 * to the interface indicated by the name stored in ifr.
 * Return an errno or 0.
 */
int
bpf_setif(struct bpf_d *d, struct ifreq *ifr)
{
	struct bpf_if *bp, *candidate = NULL;
	int error = 0;

	/*
	 * Look through attached interfaces for the named one.
	 */
	for (bp = bpf_iflist; bp != NULL; bp = bp->bif_next) {
		if (strcmp(bp->bif_name, ifr->ifr_name) != 0)
			continue;

		if (candidate == NULL || candidate->bif_dlt > bp->bif_dlt)
			candidate = bp;
	}

	/* Not found. */
	if (candidate == NULL)
		return (ENXIO);

	/*
	 * Allocate the packet buffers if we need to.
	 * If we're already attached to requested interface,
	 * just flush the buffer.
	 */
	mtx_enter(&d->bd_mtx);
	if (d->bd_sbuf == NULL) {
		if ((error = bpf_allocbufs(d)))
			goto out;
	}
	if (candidate != d->bd_bif) {
		/*
		 * Detach if attached to something else.
		 */
		bpf_detachd(d);
		bpf_attachd(d, candidate);
	}
	bpf_resetd(d);
out:
	mtx_leave(&d->bd_mtx);
	return (error);
}

/*
 * Copy the interface name to the ifreq.
 */
void
bpf_ifname(struct bpf_if *bif, struct ifreq *ifr)
{
	bcopy(bif->bif_name, ifr->ifr_name, sizeof(ifr->ifr_name));
}

/*
 * Support for poll() system call
 */
int
bpfpoll(dev_t dev, int events, struct proc *p)
{
	struct bpf_d *d;
	int revents;

	KERNEL_ASSERT_LOCKED();

	/*
	 * An imitation of the FIONREAD ioctl code.
	 */
	d = bpfilter_lookup(minor(dev));

	/*
	 * XXX The USB stack can trigger a race condition that makes
	 * bpfilter_lookup() return NULL when a USB device is detached
	 * while it is up and has an open bpf handler (e.g. dhclient).
	 * We should still investigate and fix the root cause of this
	 * issue.
	 */
	if (d == NULL)
		return (POLLERR);

	/* Always ready to write data */
	revents = events & (POLLOUT | POLLWRNORM);

	if (events & (POLLIN | POLLRDNORM)) {
		mtx_enter(&d->bd_mtx);
		if (d->bd_hlen != 0 || (d->bd_immediate && d->bd_slen != 0))
			revents |= events & (POLLIN | POLLRDNORM);
		else {
			/*
			 * if there's a timeout, mark the time we
			 * started waiting.
			 */
			if (d->bd_rtout != -1 && d->bd_rdStart == 0)
				d->bd_rdStart = ticks;
			selrecord(p, &d->bd_sel);
		}
		mtx_leave(&d->bd_mtx);
	}
	return (revents);
}

struct filterops bpfread_filtops =
	{ 1, NULL, filt_bpfrdetach, filt_bpfread };

int
bpfkqfilter(dev_t dev, struct knote *kn)
{
	struct bpf_d *d;
	struct klist *klist;

	KERNEL_ASSERT_LOCKED();

	d = bpfilter_lookup(minor(dev));

	switch (kn->kn_filter) {
	case EVFILT_READ:
		klist = &d->bd_sel.si_note;
		kn->kn_fop = &bpfread_filtops;
		break;
	default:
		return (EINVAL);
	}

	kn->kn_hook = d;
	SLIST_INSERT_HEAD(klist, kn, kn_selnext);

	mtx_enter(&d->bd_mtx);
	if (d->bd_rtout != -1 && d->bd_rdStart == 0)
		d->bd_rdStart = ticks;
	mtx_leave(&d->bd_mtx);

	return (0);
}

void
filt_bpfrdetach(struct knote *kn)
{
	struct bpf_d *d = kn->kn_hook;

	KERNEL_ASSERT_LOCKED();

	SLIST_REMOVE(&d->bd_sel.si_note, kn, knote, kn_selnext);
}

int
filt_bpfread(struct knote *kn, long hint)
{
	struct bpf_d *d = kn->kn_hook;

	KERNEL_ASSERT_LOCKED();

	mtx_enter(&d->bd_mtx);
	kn->kn_data = d->bd_hlen;
	if (d->bd_immediate)
		kn->kn_data += d->bd_slen;
	mtx_leave(&d->bd_mtx);

	return (kn->kn_data > 0);
}

/*
 * Copy data from an mbuf chain into a buffer.  This code is derived
 * from m_copydata in sys/uipc_mbuf.c.
 */
void
bpf_mcopy(const void *src_arg, void *dst_arg, size_t len)
{
	const struct mbuf *m;
	u_int count;
	u_char *dst;

	m = src_arg;
	dst = dst_arg;
	while (len > 0) {
		if (m == NULL)
			panic("bpf_mcopy");
		count = min(m->m_len, len);
		bcopy(mtod(m, caddr_t), (caddr_t)dst, count);
		m = m->m_next;
		dst += count;
		len -= count;
	}
}

/*
 * Like bpf_mtap(), but an explicit copy function can be given.
 * Used by the various bpf_mtap*() wrappers.
 */
int
_bpf_mtap(caddr_t arg, const struct mbuf *m, u_int direction,
    void (*cpfn)(const void *, void *, size_t))
{
	struct bpf_if *bp = (struct bpf_if *)arg;
	struct bpf_d *d;
	size_t pktlen, slen;
	const struct mbuf *m0;
	struct timeval tv;
	int gottime = 0;
	int drop = 0;

	if (m == NULL)
		return (0);

	if (cpfn == NULL)
		cpfn = bpf_mcopy;

	if (bp == NULL)
		return (0);

	pktlen = 0;
	for (m0 = m; m0 != NULL; m0 = m0->m_next)
		pktlen += m0->m_len;

	smr_read_enter();
	SMR_SLIST_FOREACH(d, &bp->bif_dlist, bd_next) {
		struct bpf_program_smr *bps;
		struct bpf_insn *fcode = NULL;

		atomic_inc_long(&d->bd_rcount);

		if (ISSET(d->bd_dirfilt, direction))
			continue;

		bps = SMR_PTR_GET(&d->bd_rfilter);
		if (bps != NULL)
			fcode = bps->bps_bf.bf_insns;
		slen = bpf_mfilter(fcode, m, pktlen);

		if (slen == 0)
			continue;
		if (d->bd_fildrop != BPF_FILDROP_PASS)
			drop = 1;
		if (d->bd_fildrop != BPF_FILDROP_DROP) {
			if (!gottime) {
				if (ISSET(m->m_flags, M_PKTHDR))
					m_microtime(m, &tv);
				else
					microtime(&tv);

				gottime = 1;
			}

			mtx_enter(&d->bd_mtx);
			bpf_catchpacket(d, (u_char *)m, pktlen, slen, cpfn,
			    &tv);
			mtx_leave(&d->bd_mtx);
		}
	}
	smr_read_leave();

	return (drop);
}

/*
 * Incoming linkage from device drivers, where a data buffer should be
 * prepended by an arbitrary header.  In this situation we already have a
 * way of representing a chain of memory buffers, i.e., mbufs, so reuse
 * the existing functionality by attaching the buffers to mbufs.
 *
 * Con up a minimal mbuf chain to pacify bpf by allocating (only) a
 * struct m_hdr each for the header and data on the stack.
 */
int
bpf_tap_hdr(caddr_t arg, const void *hdr, unsigned int hdrlen,
    const void *buf, unsigned int buflen, u_int direction)
{
	struct m_hdr mh, md;
	struct mbuf *m0 = NULL;
	struct mbuf **mp = &m0;

	if (hdr != NULL) {
		mh.mh_flags = 0;
		mh.mh_next = NULL;
		mh.mh_len = hdrlen;
		mh.mh_data = (void *)hdr;

		*mp = (struct mbuf *)&mh;
		mp = &mh.mh_next;
	}

	if (buf != NULL) {
		md.mh_flags = 0;
		md.mh_next = NULL;
		md.mh_len = buflen;
		md.mh_data = (void *)buf;

		*mp = (struct mbuf *)&md;
	}

	return _bpf_mtap(arg, m0, direction, bpf_mcopy);
}

/*
 * Incoming linkage from device drivers, when packet is in an mbuf chain.
 */
int
bpf_mtap(caddr_t arg, const struct mbuf *m, u_int direction)
{
	return _bpf_mtap(arg, m, direction, NULL);
}
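
/*
 * A typical driver receive path calls this through the if_bpf cookie,
 * e.g. (a sketch of the common idiom, with the surrounding driver
 * context assumed):
 *
 *	#if NBPFILTER > 0
 *	if (ifp->if_bpf)
 *		bpf_mtap(ifp->if_bpf, m, BPF_DIRECTION_IN);
 *	#endif
 */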

/*
 * Incoming linkage from device drivers, where we have a mbuf chain
 * but need to prepend some arbitrary header from a linear buffer.
 *
 * Con up a minimal dummy header to pacify bpf.  Allocate (only) a
 * struct m_hdr on the stack.  This is safe as bpf only reads from the
 * fields in this header that we initialize, and will not try to free
 * it or keep a pointer to it.
 */
int
bpf_mtap_hdr(caddr_t arg, caddr_t data, u_int dlen, const struct mbuf *m,
    u_int direction, void (*cpfn)(const void *, void *, size_t))
{
	struct m_hdr mh;
	const struct mbuf *m0;

	if (dlen > 0) {
		mh.mh_flags = 0;
		mh.mh_next = (struct mbuf *)m;
		mh.mh_len = dlen;
		mh.mh_data = data;
		m0 = (struct mbuf *)&mh;
	} else
		m0 = m;

	return _bpf_mtap(arg, m0, direction, cpfn);
}

/*
 * Incoming linkage from device drivers, where we have a mbuf chain
 * but need to prepend the address family.
 *
 * Con up a minimal dummy header to pacify bpf.  We allocate (only) a
 * struct m_hdr on the stack.  This is safe as bpf only reads from the
 * fields in this header that we initialize, and will not try to free
 * it or keep a pointer to it.
 */
int
bpf_mtap_af(caddr_t arg, u_int32_t af, const struct mbuf *m, u_int direction)
{
	u_int32_t    afh;

	afh = htonl(af);

	return bpf_mtap_hdr(arg, (caddr_t)&afh, sizeof(afh),
	    m, direction, NULL);
}

/*
 * Incoming linkage from device drivers, where we have a mbuf chain
 * but need to prepend a VLAN encapsulation header.
 *
 * Con up a minimal dummy header to pacify bpf.  Allocate (only) a
 * struct m_hdr on the stack.  This is safe as bpf only reads from the
 * fields in this header that we initialize, and will not try to free
 * it or keep a pointer to it.
 */
int
bpf_mtap_ether(caddr_t arg, const struct mbuf *m, u_int direction)
{
#if NVLAN > 0
	struct ether_vlan_header evh;
	struct m_hdr mh;
	uint8_t prio;

	if ((m->m_flags & M_VLANTAG) == 0)
#endif
	{
		return bpf_mtap(arg, m, direction);
	}

#if NVLAN > 0
	KASSERT(m->m_len >= ETHER_HDR_LEN);

	prio = m->m_pkthdr.pf.prio;
	if (prio <= 1)
		prio = !prio;

	memcpy(&evh, mtod(m, char *), ETHER_HDR_LEN);
	evh.evl_proto = evh.evl_encap_proto;
	evh.evl_encap_proto = htons(ETHERTYPE_VLAN);
	evh.evl_tag = htons(m->m_pkthdr.ether_vtag |
	    (prio << EVL_PRIO_BITS));

	mh.mh_flags = 0;
	mh.mh_data = m->m_data + ETHER_HDR_LEN;
	mh.mh_len = m->m_len - ETHER_HDR_LEN;
	mh.mh_next = m->m_next;

	return bpf_mtap_hdr(arg, (caddr_t)&evh, sizeof(evh),
	    (struct mbuf *)&mh, direction, NULL);
#endif
}

/*
 * Move the packet data from interface memory (pkt) into the
 * store buffer.  Wake up listeners if needed.
 * "copy" is the routine called to do the actual data
 * transfer.  bcopy is passed in to copy contiguous chunks, while
 * bpf_mcopy is passed in to copy mbuf chains.  In the latter case,
 * pkt is really an mbuf.
 */
void
bpf_catchpacket(struct bpf_d *d, u_char *pkt, size_t pktlen, size_t snaplen,
    void (*cpfn)(const void *, void *, size_t), struct timeval *tv)
{
	struct bpf_hdr *hp;
	int totlen, curlen;
	int hdrlen, do_wakeup = 0;

	MUTEX_ASSERT_LOCKED(&d->bd_mtx);
	if (d->bd_bif == NULL)
		return;

	hdrlen = d->bd_bif->bif_hdrlen;

	/*
	 * Figure out how many bytes to move.  If the packet is
	 * greater or equal to the snapshot length, transfer that
	 * much.  Otherwise, transfer the whole packet (unless
	 * we hit the buffer size limit).
	 */
	totlen = hdrlen + min(snaplen, pktlen);
	if (totlen > d->bd_bufsize)
		totlen = d->bd_bufsize;

	/*
	 * Round up the end of the previous packet to the next longword.
	 */
	curlen = BPF_WORDALIGN(d->bd_slen);
	if (curlen + totlen > d->bd_bufsize) {
		/*
		 * This packet will overflow the storage buffer.
		 * Rotate the buffers if we can, then wakeup any
		 * pending reads.
		 */
		if (d->bd_fbuf == NULL) {
			/*
			 * We haven't completed the previous read yet,
			 * so drop the packet.
			 */
			++d->bd_dcount;
			return;
		}
		ROTATE_BUFFERS(d);
		do_wakeup = 1;
		curlen = 0;
	}

	/*
	 * Append the bpf header.
	 */
	hp = (struct bpf_hdr *)(d->bd_sbuf + curlen);
	hp->bh_tstamp.tv_sec = tv->tv_sec;
	hp->bh_tstamp.tv_usec = tv->tv_usec;
	hp->bh_datalen = pktlen;
	hp->bh_hdrlen = hdrlen;
	/*
	 * Copy the packet data into the store buffer and update its length.
	 */
	(*cpfn)(pkt, (u_char *)hp + hdrlen, (hp->bh_caplen = totlen - hdrlen));
	d->bd_slen = curlen + totlen;

	if (d->bd_immediate) {
		/*
		 * Immediate mode is set.  A packet arrived so any
		 * reads should be woken up.
		 */
		do_wakeup = 1;
	}

	if (d->bd_rdStart && d->bd_rdStart <= ULONG_MAX - d->bd_rtout &&
	    d->bd_rdStart + d->bd_rtout < ticks) {
		/*
		 * We could be selecting on the bpf and may have a
		 * timeout set.  We got here by receiving a packet,
		 * so wake up the reader.
		 */
		if (d->bd_fbuf != NULL) {
			d->bd_rdStart = 0;
			ROTATE_BUFFERS(d);
			do_wakeup = 1;
		}
	}

	if (do_wakeup)
		bpf_wakeup(d);
}

/*
 * Allocate the packet buffers of a descriptor and reset their lengths.
 */
int
bpf_allocbufs(struct bpf_d *d)
{
	MUTEX_ASSERT_LOCKED(&d->bd_mtx);

	d->bd_fbuf = malloc(d->bd_bufsize, M_DEVBUF, M_NOWAIT);
	if (d->bd_fbuf == NULL)
		return (ENOMEM);

	d->bd_sbuf = malloc(d->bd_bufsize, M_DEVBUF, M_NOWAIT);
	if (d->bd_sbuf == NULL) {
		free(d->bd_fbuf, M_DEVBUF, d->bd_bufsize);
		return (ENOMEM);
	}

	d->bd_slen = 0;
	d->bd_hlen = 0;

	return (0);
}

void
bpf_prog_smr(void *bps_arg)
{
	struct bpf_program_smr *bps = bps_arg;

	free(bps->bps_bf.bf_insns, M_DEVBUF,
	    bps->bps_bf.bf_len * sizeof(struct bpf_insn));
	free(bps, M_DEVBUF, sizeof(struct bpf_program_smr));
}

void
bpf_d_smr(void *smr)
{
	struct bpf_d	*bd = smr;

	free(bd->bd_sbuf, M_DEVBUF, bd->bd_bufsize);
	free(bd->bd_hbuf, M_DEVBUF, bd->bd_bufsize);
	free(bd->bd_fbuf, M_DEVBUF, bd->bd_bufsize);

	if (bd->bd_rfilter != NULL)
		bpf_prog_smr(bd->bd_rfilter);
	if (bd->bd_wfilter != NULL)
		bpf_prog_smr(bd->bd_wfilter);

	free(bd, M_DEVBUF, sizeof(*bd));
}

void *
bpfsattach(caddr_t *bpfp, const char *name, u_int dlt, u_int hdrlen)
{
	struct bpf_if *bp;

	if ((bp = malloc(sizeof(*bp), M_DEVBUF, M_NOWAIT)) == NULL)
		panic("bpfattach");
	SMR_SLIST_INIT(&bp->bif_dlist);
	bp->bif_driverp = (struct bpf_if **)bpfp;
	bp->bif_name = name;
	bp->bif_ifp = NULL;
	bp->bif_dlt = dlt;

	bp->bif_next = bpf_iflist;
	bpf_iflist = bp;

	*bp->bif_driverp = NULL;

	/*
	 * Compute the length of the bpf header.  This is not necessarily
	 * equal to SIZEOF_BPF_HDR because we want to insert spacing such
	 * that the network layer header begins on a longword boundary (for
	 * performance reasons and to alleviate alignment restrictions).
	 */
	bp->bif_hdrlen = BPF_WORDALIGN(hdrlen + SIZEOF_BPF_HDR) - hdrlen;
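
	/*
	 * For example, assuming SIZEOF_BPF_HDR is 18: with DLT_EN10MB
	 * (hdrlen 14), BPF_WORDALIGN(14 + 18) - 14 = 18, so the captured
	 * link header ends at offset 18 + 14 = 32 and the network layer
	 * header that follows it starts longword aligned.
	 */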

	return (bp);
}

void
bpfattach(caddr_t *driverp, struct ifnet *ifp, u_int dlt, u_int hdrlen)
{
	struct bpf_if *bp;

	bp = bpfsattach(driverp, ifp->if_xname, dlt, hdrlen);
	bp->bif_ifp = ifp;
}
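
/*
 * Ethernet drivers do not normally call bpfattach() themselves;
 * ether_ifattach() does it on their behalf, roughly as in this sketch:
 *
 *	bpfattach(&ifp->if_bpf, ifp, DLT_EN10MB, ETHER_HDR_LEN);
 */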

/* Detach an interface from its attached bpf device.  */
void
bpfdetach(struct ifnet *ifp)
{
	struct bpf_if *bp, *nbp, **pbp = &bpf_iflist;

	KERNEL_ASSERT_LOCKED();

	for (bp = bpf_iflist; bp; bp = nbp) {
		nbp = bp->bif_next;
		if (bp->bif_ifp == ifp) {
			*pbp = nbp;

			bpfsdetach(bp);
		} else
			pbp = &bp->bif_next;
	}
	ifp->if_bpf = NULL;
}

void
bpfsdetach(void *p)
{
	struct bpf_if *bp = p;
	struct bpf_d *bd;
	int maj;

	/* Locate the major number. */
	for (maj = 0; maj < nchrdev; maj++)
		if (cdevsw[maj].d_open == bpfopen)
			break;

	while ((bd = SMR_SLIST_FIRST_LOCKED(&bp->bif_dlist)))
		vdevgone(maj, bd->bd_unit, bd->bd_unit, VCHR);

	free(bp, M_DEVBUF, sizeof(*bp));
}

int
bpf_sysctl_locked(int *name, u_int namelen, void *oldp, size_t *oldlenp,
    void *newp, size_t newlen)
{
	int newval;
	int error;

	switch (name[0]) {
	case NET_BPF_BUFSIZE:
		newval = bpf_bufsize;
		error = sysctl_int(oldp, oldlenp, newp, newlen, &newval);
		if (error)
			return (error);
		if (newval < BPF_MINBUFSIZE || newval > bpf_maxbufsize)
			return (EINVAL);
		bpf_bufsize = newval;
		break;
	case NET_BPF_MAXBUFSIZE:
		newval = bpf_maxbufsize;
		error = sysctl_int(oldp, oldlenp, newp, newlen, &newval);
		if (error)
			return (error);
		if (newval < BPF_MINBUFSIZE)
			return (EINVAL);
		bpf_maxbufsize = newval;
		break;
	default:
		return (EOPNOTSUPP);
	}
	return (0);
}

int
bpf_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
    size_t newlen)
{
	int flags = RW_INTR;
	int error;

	if (namelen != 1)
		return (ENOTDIR);

	flags |= (newp == NULL) ? RW_READ : RW_WRITE;

	error = rw_enter(&bpf_sysctl_lk, flags);
	if (error != 0)
		return (error);

	error = bpf_sysctl_locked(name, namelen, oldp, oldlenp, newp, newlen);

	rw_exit(&bpf_sysctl_lk);

	return (error);
}

struct bpf_d *
bpfilter_lookup(int unit)
{
	struct bpf_d *bd;

	KERNEL_ASSERT_LOCKED();

	LIST_FOREACH(bd, &bpf_d_list, bd_list)
		if (bd->bd_unit == unit)
			return (bd);
	return (NULL);
}

/*
 * Get the list of available data link types for the interface.
 */
int
bpf_getdltlist(struct bpf_d *d, struct bpf_dltlist *bfl)
{
	int n, error;
	struct bpf_if *bp;
	const char *name;

	name = d->bd_bif->bif_name;
	n = 0;
	error = 0;
	for (bp = bpf_iflist; bp != NULL; bp = bp->bif_next) {
		if (strcmp(name, bp->bif_name) != 0)
			continue;
		if (bfl->bfl_list != NULL) {
			if (n >= bfl->bfl_len)
				return (ENOMEM);
			error = copyout(&bp->bif_dlt,
			    bfl->bfl_list + n, sizeof(u_int));
			if (error)
				break;
		}
		n++;
	}

	bfl->bfl_len = n;
	return (error);
}

/*
 * Set the data link type of a BPF instance.
 */
int
bpf_setdlt(struct bpf_d *d, u_int dlt)
{
	const char *name;
	struct bpf_if *bp;

	MUTEX_ASSERT_LOCKED(&d->bd_mtx);
	if (d->bd_bif->bif_dlt == dlt)
		return (0);
	name = d->bd_bif->bif_name;
	for (bp = bpf_iflist; bp != NULL; bp = bp->bif_next) {
		if (strcmp(name, bp->bif_name) != 0)
			continue;
		if (bp->bif_dlt == dlt)
			break;
	}
	if (bp == NULL)
		return (EINVAL);
	bpf_detachd(d);
	bpf_attachd(d, bp);
	bpf_resetd(d);
	return (0);
}

u_int32_t	bpf_mbuf_ldw(const void *, u_int32_t, int *);
u_int32_t	bpf_mbuf_ldh(const void *, u_int32_t, int *);
u_int32_t	bpf_mbuf_ldb(const void *, u_int32_t, int *);

int		bpf_mbuf_copy(const struct mbuf *, u_int32_t,
		    void *, u_int32_t);

const struct bpf_ops bpf_mbuf_ops = {
	bpf_mbuf_ldw,
	bpf_mbuf_ldh,
	bpf_mbuf_ldb,
};

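/*
 * Copy "len" bytes from offset "off" in an mbuf chain into the flat
 * buffer "buf".  Returns 0 on success, or -1 if the chain is shorter
 * than off + len.
 */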
1816 bpf_mbuf_copy(const struct mbuf *m, u_int32_t off, void *buf, u_int32_t len)
1817 {
1818 	u_int8_t *cp = buf;
1819 	u_int32_t count;
1820 
1821 	while (off >= m->m_len) {
1822 		off -= m->m_len;
1823 
1824 		m = m->m_next;
1825 		if (m == NULL)
1826 			return (-1);
1827 	}
1828 
1829 	for (;;) {
1830 		count = min(m->m_len - off, len);
1831 
1832 		memcpy(cp, m->m_data + off, count);
1833 		len -= count;
1834 
1835 		if (len == 0)
1836 			return (0);
1837 
1838 		m = m->m_next;
1839 		if (m == NULL)
1840 			break;
1841 
1842 		cp += count;
1843 		off = 0;
1844 	}
1845 
1846 	return (-1);
1847 }
1848 
1849 u_int32_t
1850 bpf_mbuf_ldw(const void *m0, u_int32_t k, int *err)
1851 {
1852 	u_int32_t v;
1853 
1854 	if (bpf_mbuf_copy(m0, k, &v, sizeof(v)) != 0) {
1855 		*err = 1;
1856 		return (0);
1857 	}
1858 
1859 	*err = 0;
1860 	return ntohl(v);
1861 }
1862 
1863 u_int32_t
1864 bpf_mbuf_ldh(const void *m0, u_int32_t k, int *err)
1865 {
1866 	u_int16_t v;
1867 
1868 	if (bpf_mbuf_copy(m0, k, &v, sizeof(v)) != 0) {
1869 		*err = 1;
1870 		return (0);
1871 	}
1872 
1873 	*err = 0;
1874 	return ntohs(v);
1875 }
1876 
1877 u_int32_t
1878 bpf_mbuf_ldb(const void *m0, u_int32_t k, int *err)
1879 {
1880 	const struct mbuf *m = m0;
1881 	u_int8_t v;
1882 
1883 	while (k >= m->m_len) {
1884 		k -= m->m_len;
1885 
1886 		m = m->m_next;
1887 		if (m == NULL) {
1888 			*err = 1;
1889 			return (0);
1890 		}
1891 	}
1892 	v = m->m_data[k];
1893 
1894 	*err = 0;
1895 	return v;
1896 }
1897 
1898 u_int
1899 bpf_mfilter(const struct bpf_insn *pc, const struct mbuf *m, u_int wirelen)
1900 {
1901 	return _bpf_filter(pc, &bpf_mbuf_ops, m, wirelen);
1902 }
1903