1 /*	$OpenBSD: bpf.c,v 1.192 2020/06/18 23:32:00 dlg Exp $	*/
2 /*	$NetBSD: bpf.c,v 1.33 1997/02/21 23:59:35 thorpej Exp $	*/
3 
4 /*
5  * Copyright (c) 1990, 1991, 1993
6  *	The Regents of the University of California.  All rights reserved.
7  * Copyright (c) 2010, 2014 Henning Brauer <henning@openbsd.org>
8  *
9  * This code is derived from the Stanford/CMU enet packet filter,
10  * (net/enet.c) distributed as part of 4.3BSD, and code contributed
11  * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence
12  * Berkeley Laboratory.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  * 3. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)bpf.c	8.2 (Berkeley) 3/28/94
39  */
40 
41 #include "bpfilter.h"
42 
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/mbuf.h>
46 #include <sys/proc.h>
47 #include <sys/signalvar.h>
48 #include <sys/ioctl.h>
49 #include <sys/conf.h>
50 #include <sys/vnode.h>
51 #include <sys/fcntl.h>
52 #include <sys/socket.h>
53 #include <sys/poll.h>
54 #include <sys/kernel.h>
55 #include <sys/sysctl.h>
56 #include <sys/rwlock.h>
57 #include <sys/atomic.h>
58 #include <sys/smr.h>
59 #include <sys/specdev.h>
60 #include <sys/selinfo.h>
61 #include <sys/sigio.h>
62 #include <sys/task.h>
63 
64 #include <net/if.h>
65 #include <net/bpf.h>
66 #include <net/bpfdesc.h>
67 
68 #include <netinet/in.h>
69 #include <netinet/if_ether.h>
70 
71 #include "vlan.h"
72 #if NVLAN > 0
73 #include <net/if_vlan_var.h>
74 #endif
75 
76 #define BPF_BUFSIZE 32768
77 
78 #define PRINET  26			/* interruptible */
79 
80 /* from kern/kern_clock.c; incremented each clock tick. */
81 extern int ticks;
82 
83 /*
84  * The default read buffer size is patchable.
85  * The default read buffer size is tunable via the net.bpf.bufsize sysctl.
86 int bpf_bufsize = BPF_BUFSIZE;
87 int bpf_maxbufsize = BPF_MAXBUFSIZE;
88 
89 /*
90  *  bpf_iflist is the list of interfaces; each corresponds to an ifnet
91  *  bpf_d_list is the list of descriptors
92  */
93 struct bpf_if	*bpf_iflist;
94 LIST_HEAD(, bpf_d) bpf_d_list;
95 
96 int	bpf_allocbufs(struct bpf_d *);
97 void	bpf_ifname(struct bpf_if*, struct ifreq *);
98 void	bpf_mcopy(const void *, void *, size_t);
99 int	bpf_movein(struct uio *, struct bpf_d *, struct mbuf **,
100 	    struct sockaddr *);
101 int	bpf_setif(struct bpf_d *, struct ifreq *);
102 int	bpfpoll(dev_t, int, struct proc *);
103 int	bpfkqfilter(dev_t, struct knote *);
104 void	bpf_wakeup(struct bpf_d *);
105 void	bpf_wakeup_cb(void *);
106 int	_bpf_mtap(caddr_t, const struct mbuf *, const struct mbuf *, u_int);
107 void	bpf_catchpacket(struct bpf_d *, u_char *, size_t, size_t,
108 	    const struct bpf_hdr *);
109 int	bpf_getdltlist(struct bpf_d *, struct bpf_dltlist *);
110 int	bpf_setdlt(struct bpf_d *, u_int);
111 
112 void	filt_bpfrdetach(struct knote *);
113 int	filt_bpfread(struct knote *, long);
114 
115 int	bpf_sysctl_locked(int *, u_int, void *, size_t *, void *, size_t);
116 
117 struct bpf_d *bpfilter_lookup(int);
118 
119 /*
120  * Called holding ``bd_mtx''.
121  */
122 void	bpf_attachd(struct bpf_d *, struct bpf_if *);
123 void	bpf_detachd(struct bpf_d *);
124 void	bpf_resetd(struct bpf_d *);
125 
126 void	bpf_prog_smr(void *);
127 void	bpf_d_smr(void *);
128 
129 /*
130  * Reference count access to descriptor buffers
131  */
132 void	bpf_get(struct bpf_d *);
133 void	bpf_put(struct bpf_d *);
134 
135 
136 struct rwlock bpf_sysctl_lk = RWLOCK_INITIALIZER("bpfsz");
137 
138 int
139 bpf_movein(struct uio *uio, struct bpf_d *d, struct mbuf **mp,
140     struct sockaddr *sockp)
141 {
142 	struct bpf_program_smr *bps;
143 	struct bpf_insn *fcode = NULL;
144 	struct mbuf *m;
145 	struct m_tag *mtag;
146 	int error;
147 	u_int hlen;
148 	u_int len;
149 	u_int linktype;
150 	u_int slen;
151 
152 	/*
153 	 * Build a sockaddr based on the data link layer type.
154 	 * We do this at this level because the ethernet header
155 	 * is copied directly into the data field of the sockaddr.
156 	 * In the case of SLIP, there is no header and the packet
157 	 * is forwarded as is.
158 	 * Also, we are careful to leave room at the front of the mbuf
159 	 * for the link level header.
160 	 */
161 	linktype = d->bd_bif->bif_dlt;
162 	switch (linktype) {
163 
164 	case DLT_SLIP:
165 		sockp->sa_family = AF_INET;
166 		hlen = 0;
167 		break;
168 
169 	case DLT_PPP:
170 		sockp->sa_family = AF_UNSPEC;
171 		hlen = 0;
172 		break;
173 
174 	case DLT_EN10MB:
175 		sockp->sa_family = AF_UNSPEC;
176 		/* XXX Would MAXLINKHDR be better? */
177 		hlen = ETHER_HDR_LEN;
178 		break;
179 
180 	case DLT_IEEE802_11:
181 	case DLT_IEEE802_11_RADIO:
182 		sockp->sa_family = AF_UNSPEC;
183 		hlen = 0;
184 		break;
185 
186 	case DLT_RAW:
187 	case DLT_NULL:
188 		sockp->sa_family = AF_UNSPEC;
189 		hlen = 0;
190 		break;
191 
192 	case DLT_LOOP:
193 		sockp->sa_family = AF_UNSPEC;
194 		hlen = sizeof(u_int32_t);
195 		break;
196 
197 	default:
198 		return (EIO);
199 	}
200 
201 	if (uio->uio_resid > MAXMCLBYTES)
202 		return (EIO);
203 	len = uio->uio_resid;
204 
205 	MGETHDR(m, M_WAIT, MT_DATA);
206 	m->m_pkthdr.ph_ifidx = 0;
207 	m->m_pkthdr.len = len - hlen;
208 
209 	if (len > MHLEN) {
210 		MCLGETI(m, M_WAIT, NULL, len);
211 		if ((m->m_flags & M_EXT) == 0) {
212 			error = ENOBUFS;
213 			goto bad;
214 		}
215 	}
216 	m->m_len = len;
217 	*mp = m;
218 
219 	error = uiomove(mtod(m, caddr_t), len, uio);
220 	if (error)
221 		goto bad;
222 
223 	smr_read_enter();
224 	bps = SMR_PTR_GET(&d->bd_wfilter);
225 	if (bps != NULL)
226 		fcode = bps->bps_bf.bf_insns;
227 	slen = bpf_filter(fcode, mtod(m, u_char *), len, len);
228 	smr_read_leave();
229 
230 	if (slen < len) {
231 		error = EPERM;
232 		goto bad;
233 	}
234 
235 	if (m->m_len < hlen) {
236 		error = EPERM;
237 		goto bad;
238 	}
239 	/*
240 	 * Copy the link-level header into the sockaddr and strip it from the mbuf.
241 	 */
242 	if (hlen != 0) {
243 		if (linktype == DLT_LOOP) {
244 			u_int32_t af;
245 
246 			/* the link header indicates the address family */
247 			KASSERT(hlen == sizeof(u_int32_t));
248 			memcpy(&af, m->m_data, hlen);
249 			sockp->sa_family = ntohl(af);
250 		} else
251 			memcpy(sockp->sa_data, m->m_data, hlen);
252 		m->m_len -= hlen;
253 		m->m_data += hlen; /* XXX */
254 	}
255 
256 	/*
257 	 * Prepend the data link type as a mbuf tag
258 	 */
259 	mtag = m_tag_get(PACKET_TAG_DLT, sizeof(u_int), M_WAIT);
260 	*(u_int *)(mtag + 1) = linktype;
261 	m_tag_prepend(m, mtag);
262 
263 	return (0);
264  bad:
265 	m_freem(m);
266 	return (error);
267 }
268 
269 /*
270  * Attach file to the bpf interface, i.e. make d listen on bp.
271  */
272 void
273 bpf_attachd(struct bpf_d *d, struct bpf_if *bp)
274 {
275 	MUTEX_ASSERT_LOCKED(&d->bd_mtx);
276 
277 	/*
278 	 * Point d at bp, and add d to the interface's list of listeners.
279 	 * Finally, point the driver's bpf cookie at the interface so
280 	 * it will divert packets to bpf.
281 	 */
282 
283 	d->bd_bif = bp;
284 
285 	KERNEL_ASSERT_LOCKED();
286 	SMR_SLIST_INSERT_HEAD_LOCKED(&bp->bif_dlist, d, bd_next);
287 
288 	*bp->bif_driverp = bp;
289 }
290 
291 /*
292  * Detach a file from its interface.
293  */
294 void
295 bpf_detachd(struct bpf_d *d)
296 {
297 	struct bpf_if *bp;
298 
299 	MUTEX_ASSERT_LOCKED(&d->bd_mtx);
300 
301 	bp = d->bd_bif;
302 	/* Not attached. */
303 	if (bp == NULL)
304 		return;
305 
306 	/* Remove ``d'' from the interface's descriptor list. */
307 	KERNEL_ASSERT_LOCKED();
308 	SMR_SLIST_REMOVE_LOCKED(&bp->bif_dlist, d, bpf_d, bd_next);
309 
310 	if (SMR_SLIST_EMPTY_LOCKED(&bp->bif_dlist)) {
311 		/*
312 		 * Let the driver know that there are no more listeners.
313 		 */
314 		*bp->bif_driverp = NULL;
315 	}
316 
317 	d->bd_bif = NULL;
318 
319 	/*
320 	 * Check if this descriptor had requested promiscuous mode.
321 	 * If so, turn it off.
322 	 */
323 	if (d->bd_promisc) {
324 		int error;
325 
326 		KASSERT(bp->bif_ifp != NULL);
327 
328 		d->bd_promisc = 0;
329 
330 		bpf_get(d);
331 		mtx_leave(&d->bd_mtx);
332 		NET_LOCK();
333 		error = ifpromisc(bp->bif_ifp, 0);
334 		NET_UNLOCK();
335 		mtx_enter(&d->bd_mtx);
336 		bpf_put(d);
337 
338 		if (error && !(error == EINVAL || error == ENODEV ||
339 		    error == ENXIO))
340 			/*
341 			 * Something is really wrong if we were able to put
342 			 * the driver into promiscuous mode, but can't
343 			 * take it out.
344 			 */
345 			panic("bpf: ifpromisc failed");
346 	}
347 }
348 
349 void
350 bpfilterattach(int n)
351 {
352 	LIST_INIT(&bpf_d_list);
353 }
354 
355 /*
356  * Open the bpf device.  Returns ENXIO for an illegal minor device number,
357  * EBUSY if a new descriptor cannot be allocated.
358  */
359 int
360 bpfopen(dev_t dev, int flag, int mode, struct proc *p)
361 {
362 	struct bpf_d *bd;
363 	int unit = minor(dev);
364 
365 	if (unit & ((1 << CLONE_SHIFT) - 1))
366 		return (ENXIO);
367 
368 	KASSERT(bpfilter_lookup(unit) == NULL);
369 
370 	/* create on demand */
371 	if ((bd = malloc(sizeof(*bd), M_DEVBUF, M_NOWAIT|M_ZERO)) == NULL)
372 		return (EBUSY);
373 
374 	/* Do most of the initialization. */
375 	bd->bd_unit = unit;
376 	bd->bd_bufsize = bpf_bufsize;
377 	bd->bd_sig = SIGIO;
378 	mtx_init(&bd->bd_mtx, IPL_NET);
379 	task_set(&bd->bd_wake_task, bpf_wakeup_cb, bd);
380 	smr_init(&bd->bd_smr);
381 	sigio_init(&bd->bd_sigio);
382 
383 	bd->bd_rtout = 0;	/* no timeout by default */
384 	bd->bd_rnonblock = ISSET(flag, FNONBLOCK);
385 
386 	bpf_get(bd);
387 	LIST_INSERT_HEAD(&bpf_d_list, bd, bd_list);
388 
389 	return (0);
390 }
391 
392 /*
393  * Close the descriptor by detaching it from its interface,
394  * deallocating its buffers, and marking it free.
395  */
396 int
397 bpfclose(dev_t dev, int flag, int mode, struct proc *p)
398 {
399 	struct bpf_d *d;
400 
401 	d = bpfilter_lookup(minor(dev));
402 	mtx_enter(&d->bd_mtx);
403 	bpf_detachd(d);
404 	bpf_wakeup(d);
405 	LIST_REMOVE(d, bd_list);
406 	mtx_leave(&d->bd_mtx);
407 	bpf_put(d);
408 
409 	return (0);
410 }
411 
412 /*
413  * Rotate the packet buffers in descriptor d.  Move the store buffer
414  * into the hold slot, and the free buffer into the store slot.
415  * Zero the length of the new store buffer.
416  */
417 #define ROTATE_BUFFERS(d) \
418 	KASSERT(d->bd_in_uiomove == 0); \
419 	MUTEX_ASSERT_LOCKED(&d->bd_mtx); \
420 	(d)->bd_hbuf = (d)->bd_sbuf; \
421 	(d)->bd_hlen = (d)->bd_slen; \
422 	(d)->bd_sbuf = (d)->bd_fbuf; \
423 	(d)->bd_slen = 0; \
424 	(d)->bd_fbuf = NULL;
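/*
 * Illustrative note (not part of the original source): bpf keeps three
 * buffers per descriptor.  The store buffer (sbuf) is being filled by
 * bpf_catchpacket(), the hold buffer (hbuf) is what bpfread() copies out
 * to userland, and the free buffer (fbuf) stands by.  A rotation turns the
 * filled store buffer into the hold buffer and starts filling what used to
 * be the free buffer; bpfread() later returns the hold buffer to the free
 * slot once the uiomove() is done.
 */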
425 /*
426  *  bpfread - read next chunk of packets from buffers
427  */
428 int
429 bpfread(dev_t dev, struct uio *uio, int ioflag)
430 {
431 	struct bpf_d *d;
432 	caddr_t hbuf;
433 	int hlen, error;
434 
435 	KERNEL_ASSERT_LOCKED();
436 
437 	d = bpfilter_lookup(minor(dev));
438 	if (d->bd_bif == NULL)
439 		return (ENXIO);
440 
441 	bpf_get(d);
442 	mtx_enter(&d->bd_mtx);
443 
444 	/*
445 	 * Restrict application to use a buffer the same size as
446 	 * the kernel buffers.
447 	 */
448 	if (uio->uio_resid != d->bd_bufsize) {
449 		error = EINVAL;
450 		goto out;
451 	}
452 
453 	/*
454 	 * If there's a timeout, bd_rdStart is tagged when we start the read.
455 	 * We can then figure out when we're done reading.
456 	 */
457 	if (d->bd_rnonblock == 0 && d->bd_rdStart == 0)
458 		d->bd_rdStart = ticks;
459 	else
460 		d->bd_rdStart = 0;
461 
462 	/*
463 	 * If the hold buffer is empty, then do a timed sleep, which
464 	 * ends when the timeout expires or when enough packets
465 	 * have arrived to fill the store buffer.
466 	 */
467 	while (d->bd_hbuf == NULL) {
468 		if (d->bd_bif == NULL) {
469 			/* interface is gone */
470 			if (d->bd_slen == 0) {
471 				error = EIO;
472 				goto out;
473 			}
474 			ROTATE_BUFFERS(d);
475 			break;
476 		}
477 		if (d->bd_immediate && d->bd_slen != 0) {
478 			/*
479 			 * One or more packets arrived either since the
480 			 * previous read or while we were asleep.
481 			 * Rotate the buffers and return what's here.
482 			 */
483 			ROTATE_BUFFERS(d);
484 			break;
485 		}
486 		if (d->bd_rnonblock) {
487 			/* User requested non-blocking I/O */
488 			error = EWOULDBLOCK;
489 		} else {
490 			if (d->bd_rdStart <= ULONG_MAX - d->bd_rtout &&
491 			    d->bd_rdStart + d->bd_rtout < ticks) {
492 				error = msleep(d, &d->bd_mtx, PRINET|PCATCH,
493 				    "bpf", d->bd_rtout);
494 			} else
495 				error = EWOULDBLOCK;
496 		}
497 		if (error == EINTR || error == ERESTART)
498 			goto out;
499 		if (error == EWOULDBLOCK) {
500 			/*
501 			 * On a timeout, return what's in the buffer,
502 			 * which may be nothing.  If there is something
503 			 * in the store buffer, we can rotate the buffers.
504 			 */
505 			if (d->bd_hbuf != NULL)
506 				/*
507 				 * We filled up the buffer in between
508 				 * getting the timeout and arriving
509 				 * here, so we don't need to rotate.
510 				 */
511 				break;
512 
513 			if (d->bd_slen == 0) {
514 				error = 0;
515 				goto out;
516 			}
517 			ROTATE_BUFFERS(d);
518 			break;
519 		}
520 	}
521 	/*
522 	 * At this point, we know we have something in the hold slot.
523 	 */
524 	hbuf = d->bd_hbuf;
525 	hlen = d->bd_hlen;
526 	d->bd_hbuf = NULL;
527 	d->bd_hlen = 0;
528 	d->bd_fbuf = NULL;
529 	d->bd_in_uiomove = 1;
530 
531 	/*
532 	 * Move data from hold buffer into user space.
533 	 * We know the entire buffer is transferred since
534 	 * we checked above that the read buffer is bpf_bufsize bytes.
535 	 */
536 	mtx_leave(&d->bd_mtx);
537 	error = uiomove(hbuf, hlen, uio);
538 	mtx_enter(&d->bd_mtx);
539 
540 	/* Ensure that bpf_resetd() or ROTATE_BUFFERS() haven't been called. */
541 	KASSERT(d->bd_fbuf == NULL);
542 	KASSERT(d->bd_hbuf == NULL);
543 	d->bd_fbuf = hbuf;
544 	d->bd_in_uiomove = 0;
545 out:
546 	mtx_leave(&d->bd_mtx);
547 	bpf_put(d);
548 
549 	return (error);
550 }
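/*
 * Illustrative userland sketch (an assumption for documentation, not part
 * of this file): a reader must use a buffer of exactly the size reported
 * by BIOCGBLEN and walk the bpf_hdr records returned by a single read(),
 * advancing by BPF_WORDALIGN(bh_hdrlen + bh_caplen) per packet.
 * handle_packet() stands for whatever the application does with the data;
 * <net/bpf.h>, <sys/ioctl.h>, <err.h>, <stdlib.h> and <unistd.h> are
 * assumed.
 *
 *	u_int blen;
 *	char *buf, *p;
 *	ssize_t n;
 *
 *	if (ioctl(fd, BIOCGBLEN, &blen) == -1)
 *		err(1, "BIOCGBLEN");
 *	if ((buf = malloc(blen)) == NULL)
 *		err(1, "malloc");
 *	n = read(fd, buf, blen);
 *	for (p = buf; p < buf + n;) {
 *		struct bpf_hdr *bh = (struct bpf_hdr *)p;
 *
 *		handle_packet(p + bh->bh_hdrlen, bh->bh_caplen);
 *		p += BPF_WORDALIGN(bh->bh_hdrlen + bh->bh_caplen);
 *	}
 */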
551 
552 
553 /*
554  * If there are processes sleeping on this descriptor, wake them up.
555  */
556 void
557 bpf_wakeup(struct bpf_d *d)
558 {
559 	MUTEX_ASSERT_LOCKED(&d->bd_mtx);
560 
561 	/*
562 	 * As long as pgsigio() and selwakeup() need to be protected
563 	 * by the KERNEL_LOCK() we have to delay the wakeup to
564 	 * another context to keep the hot path KERNEL_LOCK()-free.
565 	 */
566 	bpf_get(d);
567 	if (!task_add(systq, &d->bd_wake_task))
568 		bpf_put(d);
569 }
570 
571 void
572 bpf_wakeup_cb(void *xd)
573 {
574 	struct bpf_d *d = xd;
575 
576 	wakeup(d);
577 	if (d->bd_async && d->bd_sig)
578 		pgsigio(&d->bd_sigio, d->bd_sig, 0);
579 
580 	selwakeup(&d->bd_sel);
581 	bpf_put(d);
582 }
583 
584 int
585 bpfwrite(dev_t dev, struct uio *uio, int ioflag)
586 {
587 	struct bpf_d *d;
588 	struct ifnet *ifp;
589 	struct mbuf *m;
590 	int error;
591 	struct sockaddr_storage dst;
592 
593 	KERNEL_ASSERT_LOCKED();
594 
595 	d = bpfilter_lookup(minor(dev));
596 	if (d->bd_bif == NULL)
597 		return (ENXIO);
598 
599 	bpf_get(d);
600 	ifp = d->bd_bif->bif_ifp;
601 
602 	if (ifp == NULL || (ifp->if_flags & IFF_UP) == 0) {
603 		error = ENETDOWN;
604 		goto out;
605 	}
606 
607 	if (uio->uio_resid == 0) {
608 		error = 0;
609 		goto out;
610 	}
611 
612 	error = bpf_movein(uio, d, &m, sstosa(&dst));
613 	if (error)
614 		goto out;
615 
616 	if (m->m_pkthdr.len > ifp->if_mtu) {
617 		m_freem(m);
618 		error = EMSGSIZE;
619 		goto out;
620 	}
621 
622 	m->m_pkthdr.ph_rtableid = ifp->if_rdomain;
623 	m->m_pkthdr.pf.prio = ifp->if_llprio;
624 
625 	if (d->bd_hdrcmplt && dst.ss_family == AF_UNSPEC)
626 		dst.ss_family = pseudo_AF_HDRCMPLT;
627 
628 	NET_LOCK();
629 	error = ifp->if_output(ifp, m, sstosa(&dst), NULL);
630 	NET_UNLOCK();
631 
632 out:
633 	bpf_put(d);
634 	return (error);
635 }
636 
637 /*
638  * Reset a descriptor by flushing its packet buffer and clearing the
639  * receive and drop counts.
640  */
641 void
642 bpf_resetd(struct bpf_d *d)
643 {
644 	MUTEX_ASSERT_LOCKED(&d->bd_mtx);
645 	KASSERT(d->bd_in_uiomove == 0);
646 
647 	if (d->bd_hbuf != NULL) {
648 		/* Free the hold buffer. */
649 		d->bd_fbuf = d->bd_hbuf;
650 		d->bd_hbuf = NULL;
651 	}
652 	d->bd_slen = 0;
653 	d->bd_hlen = 0;
654 	d->bd_rcount = 0;
655 	d->bd_dcount = 0;
656 }
657 
658 /*
659  *  FIONREAD		Check for read packet available.
660  *  BIOCGBLEN		Get buffer len [for read()].
661  *  BIOCSETF		Set ethernet read filter.
662  *  BIOCFLUSH		Flush read packet buffer.
663  *  BIOCPROMISC		Put interface into promiscuous mode.
664  *  BIOCGDLTLIST	Get supported link layer types.
665  *  BIOCGDLT		Get link layer type.
666  *  BIOCSDLT		Set link layer type.
667  *  BIOCGETIF		Get interface name.
668  *  BIOCSETIF		Set interface.
669  *  BIOCSRTIMEOUT	Set read timeout.
670  *  BIOCGRTIMEOUT	Get read timeout.
671  *  BIOCGSTATS		Get packet stats.
672  *  BIOCIMMEDIATE	Set immediate mode.
673  *  BIOCVERSION		Get filter language version.
674  *  BIOCGHDRCMPLT	Get "header already complete" flag
675  *  BIOCSHDRCMPLT	Set "header already complete" flag
676  */
677 int
678 bpfioctl(dev_t dev, u_long cmd, caddr_t addr, int flag, struct proc *p)
679 {
680 	struct bpf_d *d;
681 	int error = 0;
682 
683 	d = bpfilter_lookup(minor(dev));
684 	if (d->bd_locked && suser(p) != 0) {
685 		/* list of allowed ioctls when locked and not root */
686 		switch (cmd) {
687 		case BIOCGBLEN:
688 		case BIOCFLUSH:
689 		case BIOCGDLT:
690 		case BIOCGDLTLIST:
691 		case BIOCGETIF:
692 		case BIOCGRTIMEOUT:
693 		case BIOCGSTATS:
694 		case BIOCVERSION:
695 		case BIOCGRSIG:
696 		case BIOCGHDRCMPLT:
697 		case FIONREAD:
698 		case BIOCLOCK:
699 		case BIOCSRTIMEOUT:
700 		case BIOCIMMEDIATE:
701 		case TIOCGPGRP:
702 		case BIOCGDIRFILT:
703 			break;
704 		default:
705 			return (EPERM);
706 		}
707 	}
708 
709 	bpf_get(d);
710 
711 	switch (cmd) {
712 	default:
713 		error = EINVAL;
714 		break;
715 
716 	/*
717 	 * Check for read packet available.
718 	 */
719 	case FIONREAD:
720 		{
721 			int n;
722 
723 			mtx_enter(&d->bd_mtx);
724 			n = d->bd_slen;
725 			if (d->bd_hbuf != NULL)
726 				n += d->bd_hlen;
727 			mtx_leave(&d->bd_mtx);
728 
729 			*(int *)addr = n;
730 			break;
731 		}
732 
733 	/*
734 	 * Get buffer len [for read()].
735 	 */
736 	case BIOCGBLEN:
737 		*(u_int *)addr = d->bd_bufsize;
738 		break;
739 
740 	/*
741 	 * Set buffer length.
742 	 */
743 	case BIOCSBLEN:
744 		if (d->bd_bif != NULL)
745 			error = EINVAL;
746 		else {
747 			u_int size = *(u_int *)addr;
748 
749 			if (size > bpf_maxbufsize)
750 				*(u_int *)addr = size = bpf_maxbufsize;
751 			else if (size < BPF_MINBUFSIZE)
752 				*(u_int *)addr = size = BPF_MINBUFSIZE;
753 			mtx_enter(&d->bd_mtx);
754 			d->bd_bufsize = size;
755 			mtx_leave(&d->bd_mtx);
756 		}
757 		break;
758 
759 	/*
760 	 * Set link layer read filter.
761 	 */
762 	case BIOCSETF:
763 		error = bpf_setf(d, (struct bpf_program *)addr, 0);
764 		break;
765 
766 	/*
767 	 * Set link layer write filter.
768 	 */
769 	case BIOCSETWF:
770 		error = bpf_setf(d, (struct bpf_program *)addr, 1);
771 		break;
772 
773 	/*
774 	 * Flush read packet buffer.
775 	 */
776 	case BIOCFLUSH:
777 		mtx_enter(&d->bd_mtx);
778 		bpf_resetd(d);
779 		mtx_leave(&d->bd_mtx);
780 		break;
781 
782 	/*
783 	 * Put interface into promiscuous mode.
784 	 */
785 	case BIOCPROMISC:
786 		if (d->bd_bif == NULL) {
787 			/*
788 			 * No interface attached yet.
789 			 */
790 			error = EINVAL;
791 		} else if (d->bd_bif->bif_ifp != NULL) {
792 			if (d->bd_promisc == 0) {
793 				MUTEX_ASSERT_UNLOCKED(&d->bd_mtx);
794 				NET_LOCK();
795 				error = ifpromisc(d->bd_bif->bif_ifp, 1);
796 				NET_UNLOCK();
797 				if (error == 0)
798 					d->bd_promisc = 1;
799 			}
800 		}
801 		break;
802 
803 	/*
804 	 * Get a list of supported link layer types.
805 	 */
806 	case BIOCGDLTLIST:
807 		if (d->bd_bif == NULL)
808 			error = EINVAL;
809 		else
810 			error = bpf_getdltlist(d, (struct bpf_dltlist *)addr);
811 		break;
812 
813 	/*
814 	 * Get the link layer type.
815 	 */
816 	case BIOCGDLT:
817 		if (d->bd_bif == NULL)
818 			error = EINVAL;
819 		else
820 			*(u_int *)addr = d->bd_bif->bif_dlt;
821 		break;
822 
823 	/*
824 	 * Set the link layer type.
825 	 */
826 	case BIOCSDLT:
827 		if (d->bd_bif == NULL)
828 			error = EINVAL;
829 		else {
830 			mtx_enter(&d->bd_mtx);
831 			error = bpf_setdlt(d, *(u_int *)addr);
832 			mtx_leave(&d->bd_mtx);
833 		}
834 		break;
835 
836 	/*
837 	 * Get interface name.
838 	 */
839 	case BIOCGETIF:
840 		if (d->bd_bif == NULL)
841 			error = EINVAL;
842 		else
843 			bpf_ifname(d->bd_bif, (struct ifreq *)addr);
844 		break;
845 
846 	/*
847 	 * Set interface.
848 	 */
849 	case BIOCSETIF:
850 		error = bpf_setif(d, (struct ifreq *)addr);
851 		break;
852 
853 	/*
854 	 * Set read timeout.
855 	 */
856 	case BIOCSRTIMEOUT:
857 		{
858 			struct timeval *tv = (struct timeval *)addr;
859 			u_long rtout;
860 
861 			/* Compute number of ticks. */
862 			if (tv->tv_sec < 0 || !timerisvalid(tv)) {
863 				error = EINVAL;
864 				break;
865 			}
866 			if (tv->tv_sec > INT_MAX / hz) {
867 				error = EOVERFLOW;
868 				break;
869 			}
870 			rtout = tv->tv_sec * hz;
871 			if (tv->tv_usec / tick > INT_MAX - rtout) {
872 				error = EOVERFLOW;
873 				break;
874 			}
875 			rtout += tv->tv_usec / tick;
876 			d->bd_rtout = rtout;
877 			if (d->bd_rtout == 0 && tv->tv_usec != 0)
878 				d->bd_rtout = 1;
879 			break;
880 		}
881 
882 	/*
883 	 * Get read timeout.
884 	 */
885 	case BIOCGRTIMEOUT:
886 		{
887 			struct timeval *tv = (struct timeval *)addr;
888 
889 			tv->tv_sec = d->bd_rtout / hz;
890 			tv->tv_usec = (d->bd_rtout % hz) * tick;
891 			break;
892 		}
893 
894 	/*
895 	 * Get packet stats.
896 	 */
897 	case BIOCGSTATS:
898 		{
899 			struct bpf_stat *bs = (struct bpf_stat *)addr;
900 
901 			bs->bs_recv = d->bd_rcount;
902 			bs->bs_drop = d->bd_dcount;
903 			break;
904 		}
905 
906 	/*
907 	 * Set immediate mode.
908 	 */
909 	case BIOCIMMEDIATE:
910 		d->bd_immediate = *(u_int *)addr;
911 		break;
912 
913 	case BIOCVERSION:
914 		{
915 			struct bpf_version *bv = (struct bpf_version *)addr;
916 
917 			bv->bv_major = BPF_MAJOR_VERSION;
918 			bv->bv_minor = BPF_MINOR_VERSION;
919 			break;
920 		}
921 
922 	case BIOCGHDRCMPLT:	/* get "header already complete" flag */
923 		*(u_int *)addr = d->bd_hdrcmplt;
924 		break;
925 
926 	case BIOCSHDRCMPLT:	/* set "header already complete" flag */
927 		d->bd_hdrcmplt = *(u_int *)addr ? 1 : 0;
928 		break;
929 
930 	case BIOCLOCK:		/* set "locked" flag (no reset) */
931 		d->bd_locked = 1;
932 		break;
933 
934 	case BIOCGFILDROP:	/* get "filter-drop" flag */
935 		*(u_int *)addr = d->bd_fildrop;
936 		break;
937 
938 	case BIOCSFILDROP: {	/* set "filter-drop" flag */
939 		unsigned int fildrop = *(u_int *)addr;
940 		switch (fildrop) {
941 		case BPF_FILDROP_PASS:
942 		case BPF_FILDROP_CAPTURE:
943 		case BPF_FILDROP_DROP:
944 			d->bd_fildrop = fildrop;
945 			break;
946 		default:
947 			error = EINVAL;
948 			break;
949 		}
950 		break;
951 	}
952 
953 	case BIOCGDIRFILT:	/* get direction filter */
954 		*(u_int *)addr = d->bd_dirfilt;
955 		break;
956 
957 	case BIOCSDIRFILT:	/* set direction filter */
958 		d->bd_dirfilt = (*(u_int *)addr) &
959 		    (BPF_DIRECTION_IN|BPF_DIRECTION_OUT);
960 		break;
961 
962 	case FIONBIO:		/* Non-blocking I/O */
963 		if (*(int *)addr)
964 			d->bd_rnonblock = 1;
965 		else
966 			d->bd_rnonblock = 0;
967 		break;
968 
969 	case FIOASYNC:		/* Send signal on receive packets */
970 		d->bd_async = *(int *)addr;
971 		break;
972 
973 	case FIOSETOWN:		/* Process or group to send signals to */
974 	case TIOCSPGRP:
975 		error = sigio_setown(&d->bd_sigio, cmd, addr);
976 		break;
977 
978 	case FIOGETOWN:
979 	case TIOCGPGRP:
980 		sigio_getown(&d->bd_sigio, cmd, addr);
981 		break;
982 
983 	case BIOCSRSIG:		/* Set receive signal */
984 		{
985 			u_int sig;
986 
987 			sig = *(u_int *)addr;
988 
989 			if (sig >= NSIG)
990 				error = EINVAL;
991 			else
992 				d->bd_sig = sig;
993 			break;
994 		}
995 	case BIOCGRSIG:
996 		*(u_int *)addr = d->bd_sig;
997 		break;
998 	}
999 
1000 	bpf_put(d);
1001 	return (error);
1002 }
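/*
 * Illustrative userland sketch (an assumption, not part of this file) of
 * the usual setup sequence before reading: bind the descriptor to an
 * interface with BIOCSETIF and enable immediate mode so read() returns as
 * soon as a packet is captured.  "em0" and "/dev/bpf0" are example names.
 *
 *	#include <sys/ioctl.h>
 *	#include <sys/socket.h>
 *	#include <net/if.h>
 *	#include <net/bpf.h>
 *	#include <fcntl.h>
 *	#include <string.h>
 *
 *	int fd = open("/dev/bpf0", O_RDWR);
 *	struct ifreq ifr;
 *	u_int on = 1;
 *
 *	memset(&ifr, 0, sizeof(ifr));
 *	strlcpy(ifr.ifr_name, "em0", sizeof(ifr.ifr_name));
 *	ioctl(fd, BIOCSETIF, &ifr);
 *	ioctl(fd, BIOCIMMEDIATE, &on);
 */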
1003 
1004 /*
1005  * Set d's packet filter program to fp.  If this file already has a filter,
1006  * free it and replace it.  Returns EINVAL for bogus requests.
1007  */
1008 int
1009 bpf_setf(struct bpf_d *d, struct bpf_program *fp, int wf)
1010 {
1011 	struct bpf_program_smr *bps, *old_bps;
1012 	struct bpf_insn *fcode;
1013 	u_int flen, size;
1014 
1015 	KERNEL_ASSERT_LOCKED();
1016 
1017 	if (fp->bf_insns == 0) {
1018 		if (fp->bf_len != 0)
1019 			return (EINVAL);
1020 		bps = NULL;
1021 	} else {
1022 		flen = fp->bf_len;
1023 		if (flen > BPF_MAXINSNS)
1024 			return (EINVAL);
1025 
1026 		fcode = mallocarray(flen, sizeof(*fp->bf_insns), M_DEVBUF,
1027 		    M_WAITOK | M_CANFAIL);
1028 		if (fcode == NULL)
1029 			return (ENOMEM);
1030 
1031 		size = flen * sizeof(*fp->bf_insns);
1032 		if (copyin(fp->bf_insns, fcode, size) != 0 ||
1033 		    bpf_validate(fcode, (int)flen) == 0) {
1034 			free(fcode, M_DEVBUF, size);
1035 			return (EINVAL);
1036 		}
1037 
1038 		bps = malloc(sizeof(*bps), M_DEVBUF, M_WAITOK);
1039 		smr_init(&bps->bps_smr);
1040 		bps->bps_bf.bf_len = flen;
1041 		bps->bps_bf.bf_insns = fcode;
1042 	}
1043 
1044 	if (wf == 0) {
1045 		old_bps = SMR_PTR_GET_LOCKED(&d->bd_rfilter);
1046 		SMR_PTR_SET_LOCKED(&d->bd_rfilter, bps);
1047 	} else {
1048 		old_bps = SMR_PTR_GET_LOCKED(&d->bd_wfilter);
1049 		SMR_PTR_SET_LOCKED(&d->bd_wfilter, bps);
1050 	}
1051 
1052 	mtx_enter(&d->bd_mtx);
1053 	bpf_resetd(d);
1054 	mtx_leave(&d->bd_mtx);
1055 	if (old_bps != NULL)
1056 		smr_call(&old_bps->bps_smr, bpf_prog_smr, old_bps);
1057 
1058 	return (0);
1059 }
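/*
 * Illustrative userland sketch (an assumption, not part of this file): a
 * classic BPF program as accepted by BIOCSETF/BIOCSETWF.  This one keeps
 * only IPv4-over-Ethernet frames (ethertype 0x0800 at offset 12) and
 * truncates them to 96 bytes; everything else returns 0 and is not
 * captured.
 *
 *	struct bpf_insn insns[] = {
 *		BPF_STMT(BPF_LD+BPF_H+BPF_ABS, 12),
 *		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, 0x0800, 0, 1),
 *		BPF_STMT(BPF_RET+BPF_K, 96),
 *		BPF_STMT(BPF_RET+BPF_K, 0),
 *	};
 *	struct bpf_program prog = {
 *		.bf_len = sizeof(insns) / sizeof(insns[0]),
 *		.bf_insns = insns,
 *	};
 *
 *	if (ioctl(fd, BIOCSETF, &prog) == -1)
 *		err(1, "BIOCSETF");
 */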
1060 
1061 /*
1062  * Detach a file from its current interface (if attached at all) and attach
1063  * to the interface indicated by the name stored in ifr.
1064  * Return an errno or 0.
1065  */
1066 int
1067 bpf_setif(struct bpf_d *d, struct ifreq *ifr)
1068 {
1069 	struct bpf_if *bp, *candidate = NULL;
1070 	int error = 0;
1071 
1072 	/*
1073 	 * Look through attached interfaces for the named one.
1074 	 */
1075 	for (bp = bpf_iflist; bp != NULL; bp = bp->bif_next) {
1076 		if (strcmp(bp->bif_name, ifr->ifr_name) != 0)
1077 			continue;
1078 
1079 		if (candidate == NULL || candidate->bif_dlt > bp->bif_dlt)
1080 			candidate = bp;
1081 	}
1082 
1083 	/* Not found. */
1084 	if (candidate == NULL)
1085 		return (ENXIO);
1086 
1087 	/*
1088 	 * Allocate the packet buffers if we need to.
1089 	 * If we're already attached to the requested interface,
1090 	 * just flush the buffer.
1091 	 */
1092 	mtx_enter(&d->bd_mtx);
1093 	if (d->bd_sbuf == NULL) {
1094 		if ((error = bpf_allocbufs(d)))
1095 			goto out;
1096 	}
1097 	if (candidate != d->bd_bif) {
1098 		/*
1099 		 * Detach if attached to something else.
1100 		 */
1101 		bpf_detachd(d);
1102 		bpf_attachd(d, candidate);
1103 	}
1104 	bpf_resetd(d);
1105 out:
1106 	mtx_leave(&d->bd_mtx);
1107 	return (error);
1108 }
1109 
1110 /*
1111  * Copy the interface name to the ifreq.
1112  */
1113 void
1114 bpf_ifname(struct bpf_if *bif, struct ifreq *ifr)
1115 {
1116 	bcopy(bif->bif_name, ifr->ifr_name, sizeof(ifr->ifr_name));
1117 }
1118 
1119 /*
1120  * Support for poll() system call
1121  */
1122 int
1123 bpfpoll(dev_t dev, int events, struct proc *p)
1124 {
1125 	struct bpf_d *d;
1126 	int revents;
1127 
1128 	KERNEL_ASSERT_LOCKED();
1129 
1130 	/*
1131 	 * An imitation of the FIONREAD ioctl code.
1132 	 */
1133 	d = bpfilter_lookup(minor(dev));
1134 
1135 	/*
1136 	 * XXX The USB stack can trigger a race condition
1137 	 * which causes bpfilter_lookup to return NULL when a USB device
1138 	 * gets detached while it is up and has an open bpf handler (e.g.
1139 	 * dhclient).  We should still check whether we can fix the root
1140 	 * cause of this issue.
1141 	 */
1142 	if (d == NULL)
1143 		return (POLLERR);
1144 
1145 	/* Always ready to write data */
1146 	revents = events & (POLLOUT | POLLWRNORM);
1147 
1148 	if (events & (POLLIN | POLLRDNORM)) {
1149 		mtx_enter(&d->bd_mtx);
1150 		if (d->bd_hlen != 0 || (d->bd_immediate && d->bd_slen != 0))
1151 			revents |= events & (POLLIN | POLLRDNORM);
1152 		else {
1153 			/*
1154 			 * if there's a timeout, mark the time we
1155 			 * started waiting.
1156 			 */
1157 			if (d->bd_rnonblock == 0 && d->bd_rdStart == 0)
1158 				d->bd_rdStart = ticks;
1159 			selrecord(p, &d->bd_sel);
1160 		}
1161 		mtx_leave(&d->bd_mtx);
1162 	}
1163 	return (revents);
1164 }
1165 
1166 const struct filterops bpfread_filtops = {
1167 	.f_flags	= FILTEROP_ISFD,
1168 	.f_attach	= NULL,
1169 	.f_detach	= filt_bpfrdetach,
1170 	.f_event	= filt_bpfread,
1171 };
1172 
1173 int
1174 bpfkqfilter(dev_t dev, struct knote *kn)
1175 {
1176 	struct bpf_d *d;
1177 	struct klist *klist;
1178 
1179 	KERNEL_ASSERT_LOCKED();
1180 
1181 	d = bpfilter_lookup(minor(dev));
1182 
1183 	switch (kn->kn_filter) {
1184 	case EVFILT_READ:
1185 		klist = &d->bd_sel.si_note;
1186 		kn->kn_fop = &bpfread_filtops;
1187 		break;
1188 	default:
1189 		return (EINVAL);
1190 	}
1191 
1192 	bpf_get(d);
1193 	kn->kn_hook = d;
1194 	klist_insert(klist, kn);
1195 
1196 	mtx_enter(&d->bd_mtx);
1197 	if (d->bd_rnonblock == 0 && d->bd_rdStart == 0)
1198 		d->bd_rdStart = ticks;
1199 	mtx_leave(&d->bd_mtx);
1200 
1201 	return (0);
1202 }
1203 
1204 void
1205 filt_bpfrdetach(struct knote *kn)
1206 {
1207 	struct bpf_d *d = kn->kn_hook;
1208 
1209 	KERNEL_ASSERT_LOCKED();
1210 
1211 	klist_remove(&d->bd_sel.si_note, kn);
1212 	bpf_put(d);
1213 }
1214 
1215 int
1216 filt_bpfread(struct knote *kn, long hint)
1217 {
1218 	struct bpf_d *d = kn->kn_hook;
1219 
1220 	KERNEL_ASSERT_LOCKED();
1221 
1222 	mtx_enter(&d->bd_mtx);
1223 	kn->kn_data = d->bd_hlen;
1224 	if (d->bd_immediate)
1225 		kn->kn_data += d->bd_slen;
1226 	mtx_leave(&d->bd_mtx);
1227 
1228 	return (kn->kn_data > 0);
1229 }
1230 
1231 /*
1232  * Copy data from an mbuf chain into a buffer.  This code is derived
1233  * from m_copydata in sys/uipc_mbuf.c.
1234  */
1235 void
1236 bpf_mcopy(const void *src_arg, void *dst_arg, size_t len)
1237 {
1238 	const struct mbuf *m;
1239 	u_int count;
1240 	u_char *dst;
1241 
1242 	m = src_arg;
1243 	dst = dst_arg;
1244 	while (len > 0) {
1245 		if (m == NULL)
1246 			panic("bpf_mcopy");
1247 		count = min(m->m_len, len);
1248 		bcopy(mtod(m, caddr_t), (caddr_t)dst, count);
1249 		m = m->m_next;
1250 		dst += count;
1251 		len -= count;
1252 	}
1253 }
1254 
1255 int
1256 bpf_mtap(caddr_t arg, const struct mbuf *m, u_int direction)
1257 {
1258 	return _bpf_mtap(arg, m, m, direction);
1259 }
1260 
1261 int
1262 _bpf_mtap(caddr_t arg, const struct mbuf *mp, const struct mbuf *m,
1263     u_int direction)
1264 {
1265 	struct bpf_if *bp = (struct bpf_if *)arg;
1266 	struct bpf_d *d;
1267 	size_t pktlen, slen;
1268 	const struct mbuf *m0;
1269 	struct bpf_hdr tbh;
1270 	int gothdr = 0;
1271 	int drop = 0;
1272 
1273 	if (m == NULL)
1274 		return (0);
1275 
1276 	if (bp == NULL)
1277 		return (0);
1278 
1279 	pktlen = 0;
1280 	for (m0 = m; m0 != NULL; m0 = m0->m_next)
1281 		pktlen += m0->m_len;
1282 
1283 	smr_read_enter();
1284 	SMR_SLIST_FOREACH(d, &bp->bif_dlist, bd_next) {
1285 		struct bpf_program_smr *bps;
1286 		struct bpf_insn *fcode = NULL;
1287 
1288 		atomic_inc_long(&d->bd_rcount);
1289 
1290 		if (ISSET(d->bd_dirfilt, direction))
1291 			continue;
1292 
1293 		bps = SMR_PTR_GET(&d->bd_rfilter);
1294 		if (bps != NULL)
1295 			fcode = bps->bps_bf.bf_insns;
1296 		slen = bpf_mfilter(fcode, m, pktlen);
1297 
1298 		if (slen == 0)
1299 			continue;
1300 		if (d->bd_fildrop != BPF_FILDROP_PASS)
1301 			drop = 1;
1302 		if (d->bd_fildrop != BPF_FILDROP_DROP) {
1303 			if (!gothdr) {
1304 				struct timeval tv;
1305 				memset(&tbh, 0, sizeof(tbh));
1306 
1307 				if (ISSET(mp->m_flags, M_PKTHDR)) {
1308 					tbh.bh_ifidx = mp->m_pkthdr.ph_ifidx;
1309 					tbh.bh_flowid = mp->m_pkthdr.ph_flowid;
1310 					tbh.bh_flags = mp->m_pkthdr.pf.prio;
1311 					if (ISSET(mp->m_pkthdr.csum_flags,
1312 					    M_FLOWID))
1313 						SET(tbh.bh_flags, BPF_F_FLOWID);
1314 
1315 					m_microtime(m, &tv);
1316 				} else
1317 					microtime(&tv);
1318 
1319 				tbh.bh_tstamp.tv_sec = tv.tv_sec;
1320 				tbh.bh_tstamp.tv_usec = tv.tv_usec;
1321 				SET(tbh.bh_flags, direction << BPF_F_DIR_SHIFT);
1322 
1323 				gothdr = 1;
1324 			}
1325 
1326 			mtx_enter(&d->bd_mtx);
1327 			bpf_catchpacket(d, (u_char *)m, pktlen, slen, &tbh);
1328 			mtx_leave(&d->bd_mtx);
1329 		}
1330 	}
1331 	smr_read_leave();
1332 
1333 	return (drop);
1334 }
1335 
1336 /*
1337  * Incoming linkage from device drivers, where a data buffer should be
1338  * prepended by an arbitrary header. In this situation we already have a
1339  * way of representing a chain of memory buffers, ie, mbufs, so reuse
1340  * the existing functionality by attaching the buffers to mbufs.
1341  *
1342  * Con up a minimal mbuf chain to pacify bpf by allocating (only) a
1343  * struct m_hdr each for the header and data on the stack.
1344  */
1345 int
1346 bpf_tap_hdr(caddr_t arg, const void *hdr, unsigned int hdrlen,
1347     const void *buf, unsigned int buflen, u_int direction)
1348 {
1349 	struct m_hdr mh, md;
1350 	struct mbuf *m0 = NULL;
1351 	struct mbuf **mp = &m0;
1352 
1353 	if (hdr != NULL) {
1354 		mh.mh_flags = 0;
1355 		mh.mh_next = NULL;
1356 		mh.mh_len = hdrlen;
1357 		mh.mh_data = (void *)hdr;
1358 
1359 		*mp = (struct mbuf *)&mh;
1360 		mp = &mh.mh_next;
1361 	}
1362 
1363 	if (buf != NULL) {
1364 		md.mh_flags = 0;
1365 		md.mh_next = NULL;
1366 		md.mh_len = buflen;
1367 		md.mh_data = (void *)buf;
1368 
1369 		*mp = (struct mbuf *)&md;
1370 	}
1371 
1372 	return bpf_mtap(arg, m0, direction);
1373 }
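/*
 * Illustrative driver-side sketch (an assumption, not part of this file):
 * a driver whose received frame lives in a flat DMA buffer rather than an
 * mbuf can hand it to any attached listeners like this, where "hdr" and
 * "buf" are the driver's own header and payload pointers.
 *
 *	#if NBPFILTER > 0
 *		if (ifp->if_bpf)
 *			bpf_tap_hdr(ifp->if_bpf, hdr, hdrlen, buf, buflen,
 *			    BPF_DIRECTION_IN);
 *	#endif
 */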
1374 
1375 /*
1376  * Incoming linkage from device drivers, where we have a mbuf chain
1377  * but need to prepend some arbitrary header from a linear buffer.
1378  *
1379  * Con up a minimal dummy header to pacify bpf.  Allocate (only) a
1380  * struct m_hdr on the stack.  This is safe as bpf only reads from the
1381  * fields in this header that we initialize, and will not try to free
1382  * it or keep a pointer to it.
1383  */
1384 int
1385 bpf_mtap_hdr(caddr_t arg, const void *data, u_int dlen, const struct mbuf *m,
1386     u_int direction)
1387 {
1388 	struct m_hdr mh;
1389 	const struct mbuf *m0;
1390 
1391 	if (dlen > 0) {
1392 		mh.mh_flags = 0;
1393 		mh.mh_next = (struct mbuf *)m;
1394 		mh.mh_len = dlen;
1395 		mh.mh_data = (void *)data;
1396 		m0 = (struct mbuf *)&mh;
1397 	} else
1398 		m0 = m;
1399 
1400 	return _bpf_mtap(arg, m, m0, direction);
1401 }
1402 
1403 /*
1404  * Incoming linkage from device drivers, where we have a mbuf chain
1405  * but need to prepend the address family.
1406  *
1407  * Con up a minimal dummy header to pacify bpf.  We allocate (only) a
1408  * struct m_hdr on the stack.  This is safe as bpf only reads from the
1409  * fields in this header that we initialize, and will not try to free
1410  * it or keep a pointer to it.
1411  */
1412 int
1413 bpf_mtap_af(caddr_t arg, u_int32_t af, const struct mbuf *m, u_int direction)
1414 {
1415 	u_int32_t    afh;
1416 
1417 	afh = htonl(af);
1418 
1419 	return bpf_mtap_hdr(arg, &afh, sizeof(afh), m, direction);
1420 }
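/*
 * Illustrative caller sketch (an assumption, not part of this file): a
 * tunnel or point-to-point driver using DLT_LOOP framing would typically
 * tap an outgoing mbuf like this before transmitting it.
 *
 *	#if NBPFILTER > 0
 *		if (ifp->if_bpf)
 *			bpf_mtap_af(ifp->if_bpf, AF_INET6, m,
 *			    BPF_DIRECTION_OUT);
 *	#endif
 */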
1421 
1422 /*
1423  * Incoming linkage from device drivers, where we have a mbuf chain
1424  * but need to prepend a VLAN encapsulation header.
1425  *
1426  * Con up a minimal dummy header to pacify bpf.  Allocate (only) a
1427  * struct m_hdr on the stack.  This is safe as bpf only reads from the
1428  * fields in this header that we initialize, and will not try to free
1429  * it or keep a pointer to it.
1430  */
1431 int
1432 bpf_mtap_ether(caddr_t arg, const struct mbuf *m, u_int direction)
1433 {
1434 #if NVLAN > 0
1435 	struct ether_vlan_header evh;
1436 	struct m_hdr mh;
1437 	uint8_t prio;
1438 
1439 	if ((m->m_flags & M_VLANTAG) == 0)
1440 #endif
1441 	{
1442 		return bpf_mtap(arg, m, direction);
1443 	}
1444 
1445 #if NVLAN > 0
1446 	KASSERT(m->m_len >= ETHER_HDR_LEN);
1447 
1448 	prio = m->m_pkthdr.pf.prio;
1449 	if (prio <= 1)
1450 		prio = !prio;
1451 
1452 	memcpy(&evh, mtod(m, char *), ETHER_HDR_LEN);
1453 	evh.evl_proto = evh.evl_encap_proto;
1454 	evh.evl_encap_proto = htons(ETHERTYPE_VLAN);
1455 	evh.evl_tag = htons(m->m_pkthdr.ether_vtag |
1456 	    (prio << EVL_PRIO_BITS));
1457 
1458 	mh.mh_flags = 0;
1459 	mh.mh_data = m->m_data + ETHER_HDR_LEN;
1460 	mh.mh_len = m->m_len - ETHER_HDR_LEN;
1461 	mh.mh_next = m->m_next;
1462 
1463 	return bpf_mtap_hdr(arg, &evh, sizeof(evh),
1464 	    (struct mbuf *)&mh, direction);
1465 #endif
1466 }
1467 
1468 /*
1469  * Move the packet data from interface memory (pkt) into the
1470  * store buffer.  Wake up listeners if needed.
1471  * The data transfer is done by bpf_mcopy(), which copies from an
1472  * mbuf chain, so "pkt" is really an mbuf.  "tbh" carries the
1473  * timestamp and flags assembled by the caller before taking the
1474  * descriptor mutex.
1475  */
1476 void
1477 bpf_catchpacket(struct bpf_d *d, u_char *pkt, size_t pktlen, size_t snaplen,
1478     const struct bpf_hdr *tbh)
1479 {
1480 	struct bpf_hdr *bh;
1481 	int totlen, curlen;
1482 	int hdrlen, do_wakeup = 0;
1483 
1484 	MUTEX_ASSERT_LOCKED(&d->bd_mtx);
1485 	if (d->bd_bif == NULL)
1486 		return;
1487 
1488 	hdrlen = d->bd_bif->bif_hdrlen;
1489 
1490 	/*
1491 	 * Figure out how many bytes to move.  If the packet is
1492 	 * greater or equal to the snapshot length, transfer that
1493 	 * much.  Otherwise, transfer the whole packet (unless
1494 	 * we hit the buffer size limit).
1495 	 */
1496 	totlen = hdrlen + min(snaplen, pktlen);
1497 	if (totlen > d->bd_bufsize)
1498 		totlen = d->bd_bufsize;
1499 
1500 	/*
1501 	 * Round up the end of the previous packet to the next longword.
1502 	 */
1503 	curlen = BPF_WORDALIGN(d->bd_slen);
1504 	if (curlen + totlen > d->bd_bufsize) {
1505 		/*
1506 		 * This packet will overflow the storage buffer.
1507 		 * Rotate the buffers if we can, then wakeup any
1508 		 * pending reads.
1509 		 */
1510 		if (d->bd_fbuf == NULL) {
1511 			/*
1512 			 * We haven't completed the previous read yet,
1513 			 * so drop the packet.
1514 			 */
1515 			++d->bd_dcount;
1516 			return;
1517 		}
1518 		ROTATE_BUFFERS(d);
1519 		do_wakeup = 1;
1520 		curlen = 0;
1521 	}
1522 
1523 	/*
1524 	 * Append the bpf header.
1525 	 */
1526 	bh = (struct bpf_hdr *)(d->bd_sbuf + curlen);
1527 	*bh = *tbh;
1528 	bh->bh_datalen = pktlen;
1529 	bh->bh_hdrlen = hdrlen;
1530 	bh->bh_caplen = totlen - hdrlen;
1531 
1532 	/*
1533 	 * Copy the packet data into the store buffer and update its length.
1534 	 */
1535 	bpf_mcopy(pkt, (u_char *)bh + hdrlen, bh->bh_caplen);
1536 	d->bd_slen = curlen + totlen;
1537 
1538 	if (d->bd_immediate) {
1539 		/*
1540 		 * Immediate mode is set.  A packet arrived so any
1541 		 * reads should be woken up.
1542 		 */
1543 		do_wakeup = 1;
1544 	}
1545 
1546 	if (d->bd_rdStart && d->bd_rdStart <= ULONG_MAX - d->bd_rtout &&
1547 	    d->bd_rdStart + d->bd_rtout < ticks) {
1548 		/*
1549 		 * We could be selecting on the bpf, and we
1550 		 * may have timeouts set.  We got here by getting
1551 		 * a packet, so wake up the reader.
1552 		 */
1553 		if (d->bd_fbuf != NULL) {
1554 			d->bd_rdStart = 0;
1555 			ROTATE_BUFFERS(d);
1556 			do_wakeup = 1;
1557 		}
1558 	}
1559 
1560 	if (do_wakeup)
1561 		bpf_wakeup(d);
1562 }
1563 
1564 /*
1565  * Allocate the packet buffers for a descriptor.
1566  */
1567 int
1568 bpf_allocbufs(struct bpf_d *d)
1569 {
1570 	MUTEX_ASSERT_LOCKED(&d->bd_mtx);
1571 
1572 	d->bd_fbuf = malloc(d->bd_bufsize, M_DEVBUF, M_NOWAIT);
1573 	if (d->bd_fbuf == NULL)
1574 		return (ENOMEM);
1575 
1576 	d->bd_sbuf = malloc(d->bd_bufsize, M_DEVBUF, M_NOWAIT);
1577 	if (d->bd_sbuf == NULL) {
1578 		free(d->bd_fbuf, M_DEVBUF, d->bd_bufsize);
1579 		return (ENOMEM);
1580 	}
1581 
1582 	d->bd_slen = 0;
1583 	d->bd_hlen = 0;
1584 
1585 	return (0);
1586 }
1587 
1588 void
1589 bpf_prog_smr(void *bps_arg)
1590 {
1591 	struct bpf_program_smr *bps = bps_arg;
1592 
1593 	free(bps->bps_bf.bf_insns, M_DEVBUF,
1594 	    bps->bps_bf.bf_len * sizeof(struct bpf_insn));
1595 	free(bps, M_DEVBUF, sizeof(struct bpf_program_smr));
1596 }
1597 
1598 void
1599 bpf_d_smr(void *smr)
1600 {
1601 	struct bpf_d	*bd = smr;
1602 
1603 	sigio_free(&bd->bd_sigio);
1604 	free(bd->bd_sbuf, M_DEVBUF, bd->bd_bufsize);
1605 	free(bd->bd_hbuf, M_DEVBUF, bd->bd_bufsize);
1606 	free(bd->bd_fbuf, M_DEVBUF, bd->bd_bufsize);
1607 
1608 	if (bd->bd_rfilter != NULL)
1609 		bpf_prog_smr(bd->bd_rfilter);
1610 	if (bd->bd_wfilter != NULL)
1611 		bpf_prog_smr(bd->bd_wfilter);
1612 
1613 	free(bd, M_DEVBUF, sizeof(*bd));
1614 }
1615 
1616 void
1617 bpf_get(struct bpf_d *bd)
1618 {
1619 	atomic_inc_int(&bd->bd_ref);
1620 }
1621 
1622 /*
1623  * Free the descriptor and its buffers (deferred via SMR)
1624  * when the reference count drops to zero.
1625  */
1626 void
1627 bpf_put(struct bpf_d *bd)
1628 {
1629 	if (atomic_dec_int_nv(&bd->bd_ref) > 0)
1630 		return;
1631 
1632 	smr_call(&bd->bd_smr, bpf_d_smr, bd);
1633 }
1634 
1635 void *
1636 bpfsattach(caddr_t *bpfp, const char *name, u_int dlt, u_int hdrlen)
1637 {
1638 	struct bpf_if *bp;
1639 
1640 	if ((bp = malloc(sizeof(*bp), M_DEVBUF, M_NOWAIT)) == NULL)
1641 		panic("bpfattach");
1642 	SMR_SLIST_INIT(&bp->bif_dlist);
1643 	bp->bif_driverp = (struct bpf_if **)bpfp;
1644 	bp->bif_name = name;
1645 	bp->bif_ifp = NULL;
1646 	bp->bif_dlt = dlt;
1647 
1648 	bp->bif_next = bpf_iflist;
1649 	bpf_iflist = bp;
1650 
1651 	*bp->bif_driverp = NULL;
1652 
1653 	/*
1654 	 * Compute the length of the bpf header.  This is not necessarily
1655 	 * equal to SIZEOF_BPF_HDR because we want to insert spacing such
1656 	 * that the network layer header begins on a longword boundary (for
1657 	 * performance reasons and to alleviate alignment restrictions).
1658 	 */
1659 	bp->bif_hdrlen = BPF_WORDALIGN(hdrlen + SIZEOF_BPF_HDR) - hdrlen;
1660 
1661 	return (bp);
1662 }
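/*
 * Worked example of the computation above (illustrative numbers only): if
 * the link header were 14 bytes and SIZEOF_BPF_HDR were 24, then
 * BPF_WORDALIGN(14 + 24) = 40 with 8-byte alignment, so bif_hdrlen would
 * be 40 - 14 = 26.  A captured record is then 26 bytes of bpf header
 * followed by the 14-byte link header, leaving the network layer header at
 * offset 40, a longword boundary.
 */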
1663 
1664 void
1665 bpfattach(caddr_t *driverp, struct ifnet *ifp, u_int dlt, u_int hdrlen)
1666 {
1667 	struct bpf_if *bp;
1668 
1669 	bp = bpfsattach(driverp, ifp->if_xname, dlt, hdrlen);
1670 	bp->bif_ifp = ifp;
1671 }
1672 
1673 /* Detach an interface from its attached bpf device.  */
1674 void
1675 bpfdetach(struct ifnet *ifp)
1676 {
1677 	struct bpf_if *bp, *nbp;
1678 
1679 	KERNEL_ASSERT_LOCKED();
1680 
1681 	for (bp = bpf_iflist; bp; bp = nbp) {
1682 		nbp = bp->bif_next;
1683 		if (bp->bif_ifp == ifp)
1684 			bpfsdetach(bp);
1685 	}
1686 	ifp->if_bpf = NULL;
1687 }
1688 
1689 void
1690 bpfsdetach(void *p)
1691 {
1692 	struct bpf_if *bp = p, *tbp;
1693 	struct bpf_d *bd;
1694 	int maj;
1695 
1696 	KERNEL_ASSERT_LOCKED();
1697 
1698 	/* Locate the major number. */
1699 	for (maj = 0; maj < nchrdev; maj++)
1700 		if (cdevsw[maj].d_open == bpfopen)
1701 			break;
1702 
1703 	while ((bd = SMR_SLIST_FIRST_LOCKED(&bp->bif_dlist)))
1704 		vdevgone(maj, bd->bd_unit, bd->bd_unit, VCHR);
1705 
1706 	for (tbp = bpf_iflist; tbp; tbp = tbp->bif_next) {
1707 		if (tbp->bif_next == bp) {
1708 			tbp->bif_next = bp->bif_next;
1709 			break;
1710 		}
1711 	}
1712 
1713 	if (bpf_iflist == bp)
1714 		bpf_iflist = bp->bif_next;
1715 
1716 	free(bp, M_DEVBUF, sizeof(*bp));
1717 }
1718 
1719 int
1720 bpf_sysctl_locked(int *name, u_int namelen, void *oldp, size_t *oldlenp,
1721     void *newp, size_t newlen)
1722 {
1723 	int newval;
1724 	int error;
1725 
1726 	switch (name[0]) {
1727 	case NET_BPF_BUFSIZE:
1728 		newval = bpf_bufsize;
1729 		error = sysctl_int(oldp, oldlenp, newp, newlen, &newval);
1730 		if (error)
1731 			return (error);
1732 		if (newval < BPF_MINBUFSIZE || newval > bpf_maxbufsize)
1733 			return (EINVAL);
1734 		bpf_bufsize = newval;
1735 		break;
1736 	case NET_BPF_MAXBUFSIZE:
1737 		newval = bpf_maxbufsize;
1738 		error = sysctl_int(oldp, oldlenp, newp, newlen, &newval);
1739 		if (error)
1740 			return (error);
1741 		if (newval < BPF_MINBUFSIZE)
1742 			return (EINVAL);
1743 		bpf_maxbufsize = newval;
1744 		break;
1745 	default:
1746 		return (EOPNOTSUPP);
1747 	}
1748 	return (0);
1749 }
1750 
1751 int
1752 bpf_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
1753     size_t newlen)
1754 {
1755 	int flags = RW_INTR;
1756 	int error;
1757 
1758 	if (namelen != 1)
1759 		return (ENOTDIR);
1760 
1761 	flags |= (newp == NULL) ? RW_READ : RW_WRITE;
1762 
1763 	error = rw_enter(&bpf_sysctl_lk, flags);
1764 	if (error != 0)
1765 		return (error);
1766 
1767 	error = bpf_sysctl_locked(name, namelen, oldp, oldlenp, newp, newlen);
1768 
1769 	rw_exit(&bpf_sysctl_lk);
1770 
1771 	return (error);
1772 }
1773 
1774 struct bpf_d *
1775 bpfilter_lookup(int unit)
1776 {
1777 	struct bpf_d *bd;
1778 
1779 	KERNEL_ASSERT_LOCKED();
1780 
1781 	LIST_FOREACH(bd, &bpf_d_list, bd_list)
1782 		if (bd->bd_unit == unit)
1783 			return (bd);
1784 	return (NULL);
1785 }
1786 
1787 /*
1788  * Get a list of the available data link types of the interface.
1789  */
1790 int
1791 bpf_getdltlist(struct bpf_d *d, struct bpf_dltlist *bfl)
1792 {
1793 	int n, error;
1794 	struct bpf_if *bp;
1795 	const char *name;
1796 
1797 	name = d->bd_bif->bif_name;
1798 	n = 0;
1799 	error = 0;
1800 	for (bp = bpf_iflist; bp != NULL; bp = bp->bif_next) {
1801 		if (strcmp(name, bp->bif_name) != 0)
1802 			continue;
1803 		if (bfl->bfl_list != NULL) {
1804 			if (n >= bfl->bfl_len)
1805 				return (ENOMEM);
1806 			error = copyout(&bp->bif_dlt,
1807 			    bfl->bfl_list + n, sizeof(u_int));
1808 			if (error)
1809 				break;
1810 		}
1811 		n++;
1812 	}
1813 
1814 	bfl->bfl_len = n;
1815 	return (error);
1816 }
1817 
1818 /*
1819  * Set the data link type of a BPF instance.
1820  */
1821 int
1822 bpf_setdlt(struct bpf_d *d, u_int dlt)
1823 {
1824 	const char *name;
1825 	struct bpf_if *bp;
1826 
1827 	MUTEX_ASSERT_LOCKED(&d->bd_mtx);
1828 	if (d->bd_bif->bif_dlt == dlt)
1829 		return (0);
1830 	name = d->bd_bif->bif_name;
1831 	for (bp = bpf_iflist; bp != NULL; bp = bp->bif_next) {
1832 		if (strcmp(name, bp->bif_name) != 0)
1833 			continue;
1834 		if (bp->bif_dlt == dlt)
1835 			break;
1836 	}
1837 	if (bp == NULL)
1838 		return (EINVAL);
1839 	bpf_detachd(d);
1840 	bpf_attachd(d, bp);
1841 	bpf_resetd(d);
1842 	return (0);
1843 }
1844 
1845 u_int32_t	bpf_mbuf_ldw(const void *, u_int32_t, int *);
1846 u_int32_t	bpf_mbuf_ldh(const void *, u_int32_t, int *);
1847 u_int32_t	bpf_mbuf_ldb(const void *, u_int32_t, int *);
1848 
1849 int		bpf_mbuf_copy(const struct mbuf *, u_int32_t,
1850 		    void *, u_int32_t);
1851 
1852 const struct bpf_ops bpf_mbuf_ops = {
1853 	bpf_mbuf_ldw,
1854 	bpf_mbuf_ldh,
1855 	bpf_mbuf_ldb,
1856 };
1857 
1858 int
1859 bpf_mbuf_copy(const struct mbuf *m, u_int32_t off, void *buf, u_int32_t len)
1860 {
1861 	u_int8_t *cp = buf;
1862 	u_int32_t count;
1863 
1864 	while (off >= m->m_len) {
1865 		off -= m->m_len;
1866 
1867 		m = m->m_next;
1868 		if (m == NULL)
1869 			return (-1);
1870 	}
1871 
1872 	for (;;) {
1873 		count = min(m->m_len - off, len);
1874 
1875 		memcpy(cp, m->m_data + off, count);
1876 		len -= count;
1877 
1878 		if (len == 0)
1879 			return (0);
1880 
1881 		m = m->m_next;
1882 		if (m == NULL)
1883 			break;
1884 
1885 		cp += count;
1886 		off = 0;
1887 	}
1888 
1889 	return (-1);
1890 }
1891 
1892 u_int32_t
1893 bpf_mbuf_ldw(const void *m0, u_int32_t k, int *err)
1894 {
1895 	u_int32_t v;
1896 
1897 	if (bpf_mbuf_copy(m0, k, &v, sizeof(v)) != 0) {
1898 		*err = 1;
1899 		return (0);
1900 	}
1901 
1902 	*err = 0;
1903 	return ntohl(v);
1904 }
1905 
1906 u_int32_t
1907 bpf_mbuf_ldh(const void *m0, u_int32_t k, int *err)
1908 {
1909 	u_int16_t v;
1910 
1911 	if (bpf_mbuf_copy(m0, k, &v, sizeof(v)) != 0) {
1912 		*err = 1;
1913 		return (0);
1914 	}
1915 
1916 	*err = 0;
1917 	return ntohs(v);
1918 }
1919 
1920 u_int32_t
1921 bpf_mbuf_ldb(const void *m0, u_int32_t k, int *err)
1922 {
1923 	const struct mbuf *m = m0;
1924 	u_int8_t v;
1925 
1926 	while (k >= m->m_len) {
1927 		k -= m->m_len;
1928 
1929 		m = m->m_next;
1930 		if (m == NULL) {
1931 			*err = 1;
1932 			return (0);
1933 		}
1934 	}
1935 	v = m->m_data[k];
1936 
1937 	*err = 0;
1938 	return v;
1939 }
1940 
1941 u_int
1942 bpf_mfilter(const struct bpf_insn *pc, const struct mbuf *m, u_int wirelen)
1943 {
1944 	return _bpf_filter(pc, &bpf_mbuf_ops, m, wirelen);
1945 }
1946