/*	$OpenBSD: bpf.c,v 1.216 2022/03/17 14:22:03 visa Exp $	*/
/*	$NetBSD: bpf.c,v 1.33 1997/02/21 23:59:35 thorpej Exp $	*/

/*
 * Copyright (c) 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * Copyright (c) 2010, 2014 Henning Brauer <henning@openbsd.org>
 *
 * This code is derived from the Stanford/CMU enet packet filter,
 * (net/enet.c) distributed as part of 4.3BSD, and code contributed
 * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence
 * Berkeley Laboratory.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)bpf.c	8.2 (Berkeley) 3/28/94
 */

#include "bpfilter.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/ioctl.h>
#include <sys/conf.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/socket.h>
#include <sys/poll.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/rwlock.h>
#include <sys/atomic.h>
#include <sys/refcnt.h>
#include <sys/smr.h>
#include <sys/specdev.h>
#include <sys/selinfo.h>
#include <sys/sigio.h>
#include <sys/task.h>
#include <sys/time.h>

#include <net/if.h>
#include <net/bpf.h>
#include <net/bpfdesc.h>

#include <netinet/in.h>
#include <netinet/if_ether.h>

#include "vlan.h"
#if NVLAN > 0
#include <net/if_vlan_var.h>
#endif

#define BPF_BUFSIZE 32768

#define PRINET  26			/* interruptible */

/*
 * The default read buffer size is patchable.
 */
int bpf_bufsize = BPF_BUFSIZE;
int bpf_maxbufsize = BPF_MAXBUFSIZE;

/*
 *  bpf_iflist is the list of interfaces; each corresponds to an ifnet
 *  bpf_d_list is the list of descriptors
 */
struct bpf_if	*bpf_iflist;
LIST_HEAD(, bpf_d) bpf_d_list;

int	bpf_allocbufs(struct bpf_d *);
void	bpf_ifname(struct bpf_if*, struct ifreq *);
void	bpf_mcopy(const void *, void *, size_t);
int	bpf_movein(struct uio *, struct bpf_d *, struct mbuf **,
	    struct sockaddr *);
int	bpf_setif(struct bpf_d *, struct ifreq *);
int	bpfpoll(dev_t, int, struct proc *);
int	bpfkqfilter(dev_t, struct knote *);
void	bpf_wakeup(struct bpf_d *);
void	bpf_wakeup_cb(void *);
int	_bpf_mtap(caddr_t, const struct mbuf *, const struct mbuf *, u_int);
void	bpf_catchpacket(struct bpf_d *, u_char *, size_t, size_t,
	    const struct bpf_hdr *);
int	bpf_getdltlist(struct bpf_d *, struct bpf_dltlist *);
int	bpf_setdlt(struct bpf_d *, u_int);

void	filt_bpfrdetach(struct knote *);
int	filt_bpfread(struct knote *, long);
int	filt_bpfreadmodify(struct kevent *, struct knote *);
int	filt_bpfreadprocess(struct knote *, struct kevent *);

int	bpf_sysctl_locked(int *, u_int, void *, size_t *, void *, size_t);

struct bpf_d *bpfilter_lookup(int);

/*
 * Called holding ``bd_mtx''.
 */
void	bpf_attachd(struct bpf_d *, struct bpf_if *);
void	bpf_detachd(struct bpf_d *);
void	bpf_resetd(struct bpf_d *);

void	bpf_prog_smr(void *);
void	bpf_d_smr(void *);

/*
 * Reference count access to descriptor buffers
 */
void	bpf_get(struct bpf_d *);
void	bpf_put(struct bpf_d *);


struct rwlock bpf_sysctl_lk = RWLOCK_INITIALIZER("bpfsz");

int
bpf_movein(struct uio *uio, struct bpf_d *d, struct mbuf **mp,
    struct sockaddr *sockp)
{
	struct bpf_program_smr *bps;
	struct bpf_insn *fcode = NULL;
	struct mbuf *m;
	struct m_tag *mtag;
	int error;
	u_int hlen, alen, mlen;
	u_int len;
	u_int linktype;
	u_int slen;

	/*
	 * Build a sockaddr based on the data link layer type.
	 * We do this at this level because the ethernet header
	 * is copied directly into the data field of the sockaddr.
	 * In the case of SLIP, there is no header and the packet
	 * is forwarded as is.
	 * Also, we are careful to leave room at the front of the mbuf
	 * for the link level header.
	 */
	linktype = d->bd_bif->bif_dlt;
	switch (linktype) {

	case DLT_SLIP:
		sockp->sa_family = AF_INET;
		hlen = 0;
		break;

	case DLT_PPP:
		sockp->sa_family = AF_UNSPEC;
		hlen = 0;
		break;

	case DLT_EN10MB:
		sockp->sa_family = AF_UNSPEC;
		/* XXX Would MAXLINKHDR be better? */
		hlen = ETHER_HDR_LEN;
		break;

	case DLT_IEEE802_11:
	case DLT_IEEE802_11_RADIO:
		sockp->sa_family = AF_UNSPEC;
		hlen = 0;
		break;

	case DLT_RAW:
	case DLT_NULL:
		sockp->sa_family = AF_UNSPEC;
		hlen = 0;
		break;

	case DLT_LOOP:
		sockp->sa_family = AF_UNSPEC;
		hlen = sizeof(u_int32_t);
		break;

	default:
		return (EIO);
	}

	if (uio->uio_resid > MAXMCLBYTES)
		return (EMSGSIZE);
	len = uio->uio_resid;
	if (len < hlen)
		return (EINVAL);

	/*
	 * Get the length of the payload so we can align it properly.
	 */
	alen = len - hlen;

	/*
	 * Allocate enough space for headers and the aligned payload.
	 */
	mlen = max(max_linkhdr, hlen) + roundup(alen, sizeof(long));
	if (mlen > MAXMCLBYTES)
		return (EMSGSIZE);

	MGETHDR(m, M_WAIT, MT_DATA);
	if (mlen > MHLEN) {
		MCLGETL(m, M_WAIT, mlen);
		if ((m->m_flags & M_EXT) == 0) {
			error = ENOBUFS;
			goto bad;
		}
	}

	m_align(m, alen); /* Align the payload. */
	m->m_data -= hlen;

	m->m_pkthdr.ph_ifidx = 0;
	m->m_pkthdr.len = len;
	m->m_len = len;

	error = uiomove(mtod(m, caddr_t), len, uio);
	if (error)
		goto bad;

	smr_read_enter();
	bps = SMR_PTR_GET(&d->bd_wfilter);
	if (bps != NULL)
		fcode = bps->bps_bf.bf_insns;
	slen = bpf_filter(fcode, mtod(m, u_char *), len, len);
	smr_read_leave();

	if (slen < len) {
		error = EPERM;
		goto bad;
	}

	/*
	 * Make room for link header, and copy it to sockaddr
	 */
	if (hlen != 0) {
		if (linktype == DLT_LOOP) {
			u_int32_t af;

			/* the link header indicates the address family */
			KASSERT(hlen == sizeof(u_int32_t));
			memcpy(&af, m->m_data, hlen);
			sockp->sa_family = ntohl(af);
		} else
			memcpy(sockp->sa_data, m->m_data, hlen);

		m->m_pkthdr.len -= hlen;
		m->m_len -= hlen;
		m->m_data += hlen;
	}

	/*
	 * Prepend the data link type as an mbuf tag
	 */
	mtag = m_tag_get(PACKET_TAG_DLT, sizeof(u_int), M_WAIT);
	*(u_int *)(mtag + 1) = linktype;
	m_tag_prepend(m, mtag);

	*mp = m;
	return (0);
 bad:
	m_freem(m);
	return (error);
}
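
/*
 * Example (illustrative userland sketch, not kernel code): on a
 * DLT_EN10MB descriptor a writer hands in a complete frame.
 * bpf_movein() copies the first ETHER_HDR_LEN bytes into the
 * sockaddr, leaves the payload in the mbuf, and fails with EPERM
 * if an installed write filter rejects the packet:
 *
 *	unsigned char frame[ETHER_HDR_LEN + 64];
 *
 *	// fill in dst/src MAC, ethertype and payload ...
 *	if (write(bpf_fd, frame, sizeof(frame)) == -1)
 *		err(1, "write");	// e.g. EPERM from the write filter
 */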

/*
 * Attach file to the bpf interface, i.e. make d listen on bp.
 */
void
bpf_attachd(struct bpf_d *d, struct bpf_if *bp)
{
	MUTEX_ASSERT_LOCKED(&d->bd_mtx);

	/*
	 * Point d at bp, and add d to the interface's list of listeners.
	 * Finally, point the driver's bpf cookie at the interface so
	 * it will divert packets to bpf.
	 */

	d->bd_bif = bp;

	KERNEL_ASSERT_LOCKED();
	SMR_SLIST_INSERT_HEAD_LOCKED(&bp->bif_dlist, d, bd_next);

	*bp->bif_driverp = bp;
}

/*
 * Detach a file from its interface.
 */
void
bpf_detachd(struct bpf_d *d)
{
	struct bpf_if *bp;

	MUTEX_ASSERT_LOCKED(&d->bd_mtx);

	bp = d->bd_bif;
	/* Not attached. */
	if (bp == NULL)
		return;

	/* Remove ``d'' from the interface's descriptor list. */
	KERNEL_ASSERT_LOCKED();
	SMR_SLIST_REMOVE_LOCKED(&bp->bif_dlist, d, bpf_d, bd_next);

	if (SMR_SLIST_EMPTY_LOCKED(&bp->bif_dlist)) {
		/*
		 * Let the driver know that there are no more listeners.
		 */
		*bp->bif_driverp = NULL;
	}

	d->bd_bif = NULL;

	/*
	 * Check if this descriptor had requested promiscuous mode.
	 * If so, turn it off.
	 */
	if (d->bd_promisc) {
		int error;

		KASSERT(bp->bif_ifp != NULL);

		d->bd_promisc = 0;

		bpf_get(d);
		mtx_leave(&d->bd_mtx);
		NET_LOCK();
		error = ifpromisc(bp->bif_ifp, 0);
		NET_UNLOCK();
		mtx_enter(&d->bd_mtx);
		bpf_put(d);

		if (error && !(error == EINVAL || error == ENODEV ||
		    error == ENXIO))
			/*
			 * Something is really wrong if we were able to put
			 * the driver into promiscuous mode, but can't
			 * take it out.
			 */
			panic("bpf: ifpromisc failed");
	}
}

void
bpfilterattach(int n)
{
	LIST_INIT(&bpf_d_list);
}

/*
 * Open the bpf device.  Returns ENXIO for an illegal minor device
 * number, EBUSY if the descriptor could not be allocated.
 */
int
bpfopen(dev_t dev, int flag, int mode, struct proc *p)
{
	struct bpf_d *bd;
	int unit = minor(dev);

	if (unit & ((1 << CLONE_SHIFT) - 1))
		return (ENXIO);

	KASSERT(bpfilter_lookup(unit) == NULL);

	/* create on demand */
	if ((bd = malloc(sizeof(*bd), M_DEVBUF, M_NOWAIT|M_ZERO)) == NULL)
		return (EBUSY);

	/* Initialize the descriptor. */
	bd->bd_unit = unit;
	bd->bd_bufsize = bpf_bufsize;
	bd->bd_sig = SIGIO;
	mtx_init(&bd->bd_mtx, IPL_NET);
	task_set(&bd->bd_wake_task, bpf_wakeup_cb, bd);
	smr_init(&bd->bd_smr);
	sigio_init(&bd->bd_sigio);
	klist_init_mutex(&bd->bd_sel.si_note, &bd->bd_mtx);

	bd->bd_rtout = 0;	/* no timeout by default */

	refcnt_init(&bd->bd_refcnt);
	LIST_INSERT_HEAD(&bpf_d_list, bd, bd_list);

	return (0);
}

/*
 * Close the descriptor by detaching it from its interface,
 * deallocating its buffers, and marking it free.
 */
int
bpfclose(dev_t dev, int flag, int mode, struct proc *p)
{
	struct bpf_d *d;

	d = bpfilter_lookup(minor(dev));
	mtx_enter(&d->bd_mtx);
	bpf_detachd(d);
	bpf_wakeup(d);
	LIST_REMOVE(d, bd_list);
	mtx_leave(&d->bd_mtx);
	bpf_put(d);

	return (0);
}

/*
 * Rotate the packet buffers in descriptor d.  Move the store buffer
 * into the hold slot, and the free buffer into the store slot.
 * Zero the length of the new store buffer.
 */
#define ROTATE_BUFFERS(d) \
	KASSERT(d->bd_in_uiomove == 0); \
	MUTEX_ASSERT_LOCKED(&d->bd_mtx); \
	(d)->bd_hbuf = (d)->bd_sbuf; \
	(d)->bd_hlen = (d)->bd_slen; \
	(d)->bd_sbuf = (d)->bd_fbuf; \
	(d)->bd_slen = 0; \
	(d)->bd_fbuf = NULL;
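
/*
 * Illustration (assumed steady state, not code): starting from freshly
 * allocated buffers A and B, the cycle looks like
 *
 *	before rotate:	sbuf = A (filling)  hbuf = NULL  fbuf = B
 *	after rotate:	sbuf = B (filling)  hbuf = A     fbuf = NULL
 *	after read:	sbuf = B (filling)  hbuf = NULL  fbuf = A
 *
 * While the hold buffer is occupied (fbuf == NULL), a store buffer
 * overflow can only drop packets; see bpf_catchpacket().
 */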

/*
 *  bpfread - read next chunk of packets from buffers
 */
int
bpfread(dev_t dev, struct uio *uio, int ioflag)
{
	uint64_t end, now;
	struct bpf_d *d;
	caddr_t hbuf;
	int error, hlen;

	KERNEL_ASSERT_LOCKED();

	d = bpfilter_lookup(minor(dev));
	if (d->bd_bif == NULL)
		return (ENXIO);

	bpf_get(d);
	mtx_enter(&d->bd_mtx);

	/*
	 * Restrict application to use a buffer the same size as
	 * the kernel buffers.
	 */
	if (uio->uio_resid != d->bd_bufsize) {
		error = EINVAL;
		goto out;
	}

	/*
	 * If there's a timeout, mark when the read should end.
	 */
	if (d->bd_rtout != 0) {
		now = nsecuptime();
		end = now + d->bd_rtout;
		if (end < now)
			end = UINT64_MAX;
	}

	/*
	 * If the hold buffer is empty, then do a timed sleep, which
	 * ends when the timeout expires or when enough packets
	 * have arrived to fill the store buffer.
	 */
	while (d->bd_hbuf == NULL) {
		if (d->bd_bif == NULL) {
			/* interface is gone */
			if (d->bd_slen == 0) {
				error = EIO;
				goto out;
			}
			ROTATE_BUFFERS(d);
			break;
		}
		if (d->bd_immediate && d->bd_slen != 0) {
			/*
			 * One or more packets arrived since the previous
			 * read, or while we were asleep.
			 * Rotate the buffers and return what's here.
			 */
			ROTATE_BUFFERS(d);
			break;
		}
		if (ISSET(ioflag, IO_NDELAY)) {
			/* User requested non-blocking I/O */
			error = EWOULDBLOCK;
		} else if (d->bd_rtout == 0) {
			/* No read timeout set. */
			d->bd_nreaders++;
			error = msleep_nsec(d, &d->bd_mtx, PRINET|PCATCH,
			    "bpf", INFSLP);
			d->bd_nreaders--;
		} else if ((now = nsecuptime()) < end) {
			/* Read timeout has not expired yet. */
			d->bd_nreaders++;
			error = msleep_nsec(d, &d->bd_mtx, PRINET|PCATCH,
			    "bpf", end - now);
			d->bd_nreaders--;
		} else {
			/* Read timeout has expired. */
			error = EWOULDBLOCK;
		}
		if (error == EINTR || error == ERESTART)
			goto out;
		if (error == EWOULDBLOCK) {
			/*
			 * On a timeout, return what's in the buffer,
			 * which may be nothing.  If there is something
			 * in the store buffer, we can rotate the buffers.
			 */
			if (d->bd_hbuf != NULL)
				/*
				 * We filled up the buffer in between
				 * getting the timeout and arriving
				 * here, so we don't need to rotate.
				 */
				break;

			if (d->bd_slen == 0) {
				error = 0;
				goto out;
			}
			ROTATE_BUFFERS(d);
			break;
		}
	}
	/*
	 * At this point, we know we have something in the hold slot.
	 */
	hbuf = d->bd_hbuf;
	hlen = d->bd_hlen;
	d->bd_hbuf = NULL;
	d->bd_hlen = 0;
	d->bd_fbuf = NULL;
	d->bd_in_uiomove = 1;

	/*
	 * Move data from hold buffer into user space.
	 * We know the entire buffer is transferred since
	 * we checked above that the read buffer is bpf_bufsize bytes.
	 */
	mtx_leave(&d->bd_mtx);
	error = uiomove(hbuf, hlen, uio);
	mtx_enter(&d->bd_mtx);

	/* Ensure that bpf_resetd() or ROTATE_BUFFERS() haven't been called. */
	KASSERT(d->bd_fbuf == NULL);
	KASSERT(d->bd_hbuf == NULL);
	d->bd_fbuf = hbuf;
	d->bd_in_uiomove = 0;
out:
	mtx_leave(&d->bd_mtx);
	bpf_put(d);

	return (error);
}
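
/*
 * Example (illustrative userland sketch): a reader must supply a
 * buffer of exactly the kernel buffer size (see the EINVAL check
 * above) and walk the returned records with BPF_WORDALIGN():
 *
 *	u_int blen;
 *	char *buf, *p;
 *	ssize_t n;
 *
 *	if (ioctl(bpf_fd, BIOCGBLEN, &blen) == -1)
 *		err(1, "BIOCGBLEN");
 *	if ((buf = malloc(blen)) == NULL)
 *		err(1, NULL);
 *	if ((n = read(bpf_fd, buf, blen)) == -1)
 *		err(1, "read");
 *	for (p = buf; p < buf + n;) {
 *		struct bpf_hdr *bh = (struct bpf_hdr *)p;
 *
 *		// captured data: p + bh->bh_hdrlen, bh->bh_caplen bytes
 *		p += BPF_WORDALIGN(bh->bh_hdrlen + bh->bh_caplen);
 *	}
 */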

/*
 * If there are processes sleeping on this descriptor, wake them up.
 */
void
bpf_wakeup(struct bpf_d *d)
{
	MUTEX_ASSERT_LOCKED(&d->bd_mtx);

	if (d->bd_nreaders)
		wakeup(d);

	KNOTE(&d->bd_sel.si_note, 0);

	/*
	 * As long as pgsigio() and selwakeup() need to be protected
	 * by the KERNEL_LOCK() we have to delay the wakeup to
	 * another context to keep the hot path KERNEL_LOCK()-free.
	 */
	if ((d->bd_async && d->bd_sig) || d->bd_sel.si_seltid != 0) {
		bpf_get(d);
		if (!task_add(systq, &d->bd_wake_task))
			bpf_put(d);
	}
}

void
bpf_wakeup_cb(void *xd)
{
	struct bpf_d *d = xd;

	if (d->bd_async && d->bd_sig)
		pgsigio(&d->bd_sigio, d->bd_sig, 0);

	mtx_enter(&d->bd_mtx);
	selwakeup(&d->bd_sel);
	mtx_leave(&d->bd_mtx);
	bpf_put(d);
}

int
bpfwrite(dev_t dev, struct uio *uio, int ioflag)
{
	struct bpf_d *d;
	struct ifnet *ifp;
	struct mbuf *m;
	int error;
	struct sockaddr_storage dst;

	KERNEL_ASSERT_LOCKED();

	d = bpfilter_lookup(minor(dev));
	if (d->bd_bif == NULL)
		return (ENXIO);

	bpf_get(d);
	ifp = d->bd_bif->bif_ifp;

	if (ifp == NULL || (ifp->if_flags & IFF_UP) == 0) {
		error = ENETDOWN;
		goto out;
	}

	if (uio->uio_resid == 0) {
		error = 0;
		goto out;
	}

	error = bpf_movein(uio, d, &m, sstosa(&dst));
	if (error)
		goto out;

	if (m->m_pkthdr.len > ifp->if_mtu) {
		m_freem(m);
		error = EMSGSIZE;
		goto out;
	}

	m->m_pkthdr.ph_rtableid = ifp->if_rdomain;
	m->m_pkthdr.pf.prio = ifp->if_llprio;

	if (d->bd_hdrcmplt && dst.ss_family == AF_UNSPEC)
		dst.ss_family = pseudo_AF_HDRCMPLT;

	NET_LOCK();
	error = ifp->if_output(ifp, m, sstosa(&dst), NULL);
	NET_UNLOCK();

out:
	bpf_put(d);
	return (error);
}
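
/*
 * Usage note (illustrative sketch): a writer that supplies its own
 * link-level source address should set the "header complete" flag
 * first; the AF_UNSPEC destination above then becomes
 * pseudo_AF_HDRCMPLT, telling the output routine not to overwrite
 * the source address:
 *
 *	u_int one = 1;
 *
 *	if (ioctl(bpf_fd, BIOCSHDRCMPLT, &one) == -1)
 *		err(1, "BIOCSHDRCMPLT");
 */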

/*
 * Reset a descriptor by flushing its packet buffer and clearing the
 * receive and drop counts.
 */
void
bpf_resetd(struct bpf_d *d)
{
	MUTEX_ASSERT_LOCKED(&d->bd_mtx);
	KASSERT(d->bd_in_uiomove == 0);

	if (d->bd_hbuf != NULL) {
		/* Free the hold buffer. */
		d->bd_fbuf = d->bd_hbuf;
		d->bd_hbuf = NULL;
	}
	d->bd_slen = 0;
	d->bd_hlen = 0;
	d->bd_rcount = 0;
	d->bd_dcount = 0;
}

/*
 *  FIONREAD		Check for read packet available.
 *  BIOCGBLEN		Get buffer len [for read()].
 *  BIOCSBLEN		Set buffer length.
 *  BIOCSETF		Set read filter.
 *  BIOCSETWF		Set write filter.
 *  BIOCFLUSH		Flush read packet buffer.
 *  BIOCPROMISC		Put interface into promiscuous mode.
 *  BIOCGDLTLIST	Get supported link layer types.
 *  BIOCGDLT		Get link layer type.
 *  BIOCSDLT		Set link layer type.
 *  BIOCGETIF		Get interface name.
 *  BIOCSETIF		Set interface.
 *  BIOCSRTIMEOUT	Set read timeout.
 *  BIOCGRTIMEOUT	Get read timeout.
 *  BIOCGSTATS		Get packet stats.
 *  BIOCIMMEDIATE	Set immediate mode.
 *  BIOCVERSION		Get filter language version.
 *  BIOCGHDRCMPLT	Get "header already complete" flag
 *  BIOCSHDRCMPLT	Set "header already complete" flag
 */
int
bpfioctl(dev_t dev, u_long cmd, caddr_t addr, int flag, struct proc *p)
{
	struct bpf_d *d;
	int error = 0;

	d = bpfilter_lookup(minor(dev));
	if (d->bd_locked && suser(p) != 0) {
		/* list of allowed ioctls when locked and not root */
		switch (cmd) {
		case BIOCGBLEN:
		case BIOCFLUSH:
		case BIOCGDLT:
		case BIOCGDLTLIST:
		case BIOCGETIF:
		case BIOCGRTIMEOUT:
		case BIOCGSTATS:
		case BIOCVERSION:
		case BIOCGRSIG:
		case BIOCGHDRCMPLT:
		case FIONREAD:
		case BIOCLOCK:
		case BIOCSRTIMEOUT:
		case BIOCIMMEDIATE:
		case TIOCGPGRP:
		case BIOCGDIRFILT:
			break;
		default:
			return (EPERM);
		}
	}

	bpf_get(d);

	switch (cmd) {
	default:
		error = EINVAL;
		break;

	/*
	 * Check for read packet available.
	 */
	case FIONREAD:
		{
			int n;

			mtx_enter(&d->bd_mtx);
			n = d->bd_slen;
			if (d->bd_hbuf != NULL)
				n += d->bd_hlen;
			mtx_leave(&d->bd_mtx);

			*(int *)addr = n;
			break;
		}

	/*
	 * Get buffer len [for read()].
	 */
	case BIOCGBLEN:
		*(u_int *)addr = d->bd_bufsize;
		break;

	/*
	 * Set buffer length.
	 */
	case BIOCSBLEN:
		if (d->bd_bif != NULL)
			error = EINVAL;
		else {
			u_int size = *(u_int *)addr;

			if (size > bpf_maxbufsize)
				*(u_int *)addr = size = bpf_maxbufsize;
			else if (size < BPF_MINBUFSIZE)
				*(u_int *)addr = size = BPF_MINBUFSIZE;
			mtx_enter(&d->bd_mtx);
			d->bd_bufsize = size;
			mtx_leave(&d->bd_mtx);
		}
		break;

	/*
	 * Set link layer read filter.
	 */
	case BIOCSETF:
		error = bpf_setf(d, (struct bpf_program *)addr, 0);
		break;

	/*
	 * Set link layer write filter.
	 */
	case BIOCSETWF:
		error = bpf_setf(d, (struct bpf_program *)addr, 1);
		break;

	/*
	 * Flush read packet buffer.
	 */
	case BIOCFLUSH:
		mtx_enter(&d->bd_mtx);
		bpf_resetd(d);
		mtx_leave(&d->bd_mtx);
		break;

	/*
	 * Put interface into promiscuous mode.
	 */
	case BIOCPROMISC:
		if (d->bd_bif == NULL) {
			/*
			 * No interface attached yet.
			 */
			error = EINVAL;
		} else if (d->bd_bif->bif_ifp != NULL) {
			if (d->bd_promisc == 0) {
				MUTEX_ASSERT_UNLOCKED(&d->bd_mtx);
				NET_LOCK();
				error = ifpromisc(d->bd_bif->bif_ifp, 1);
				NET_UNLOCK();
				if (error == 0)
					d->bd_promisc = 1;
			}
		}
		break;

	/*
	 * Get a list of supported data link types.
	 */
	case BIOCGDLTLIST:
		if (d->bd_bif == NULL)
			error = EINVAL;
		else
			error = bpf_getdltlist(d, (struct bpf_dltlist *)addr);
		break;

	/*
	 * Get data link type.
	 */
	case BIOCGDLT:
		if (d->bd_bif == NULL)
			error = EINVAL;
		else
			*(u_int *)addr = d->bd_bif->bif_dlt;
		break;

	/*
	 * Set data link type.
	 */
	case BIOCSDLT:
		if (d->bd_bif == NULL)
			error = EINVAL;
		else {
			mtx_enter(&d->bd_mtx);
			error = bpf_setdlt(d, *(u_int *)addr);
			mtx_leave(&d->bd_mtx);
		}
		break;

	/*
	 * Get interface name.
	 */
	case BIOCGETIF:
		if (d->bd_bif == NULL)
			error = EINVAL;
		else
			bpf_ifname(d->bd_bif, (struct ifreq *)addr);
		break;

	/*
	 * Set interface.
	 */
	case BIOCSETIF:
		error = bpf_setif(d, (struct ifreq *)addr);
		break;

	/*
	 * Set read timeout.
	 */
	case BIOCSRTIMEOUT:
		{
			struct timeval *tv = (struct timeval *)addr;
			uint64_t rtout;

			if (tv->tv_sec < 0 || !timerisvalid(tv)) {
				error = EINVAL;
				break;
			}
			rtout = TIMEVAL_TO_NSEC(tv);
			if (rtout > MAXTSLP) {
				error = EOVERFLOW;
				break;
			}
			mtx_enter(&d->bd_mtx);
			d->bd_rtout = rtout;
			mtx_leave(&d->bd_mtx);
			break;
		}

	/*
	 * Get read timeout.
	 */
	case BIOCGRTIMEOUT:
		{
			struct timeval *tv = (struct timeval *)addr;

			memset(tv, 0, sizeof(*tv));
			mtx_enter(&d->bd_mtx);
			NSEC_TO_TIMEVAL(d->bd_rtout, tv);
			mtx_leave(&d->bd_mtx);
			break;
		}

	/*
	 * Get packet stats.
	 */
	case BIOCGSTATS:
		{
			struct bpf_stat *bs = (struct bpf_stat *)addr;

			bs->bs_recv = d->bd_rcount;
			bs->bs_drop = d->bd_dcount;
			break;
		}

	/*
	 * Set immediate mode.
	 */
	case BIOCIMMEDIATE:
		d->bd_immediate = *(u_int *)addr;
		break;

	case BIOCVERSION:
		{
			struct bpf_version *bv = (struct bpf_version *)addr;

			bv->bv_major = BPF_MAJOR_VERSION;
			bv->bv_minor = BPF_MINOR_VERSION;
			break;
		}

	case BIOCGHDRCMPLT:	/* get "header already complete" flag */
		*(u_int *)addr = d->bd_hdrcmplt;
		break;

	case BIOCSHDRCMPLT:	/* set "header already complete" flag */
		d->bd_hdrcmplt = *(u_int *)addr ? 1 : 0;
		break;

	case BIOCLOCK:		/* set "locked" flag (no reset) */
		d->bd_locked = 1;
		break;

	case BIOCGFILDROP:	/* get "filter-drop" flag */
		*(u_int *)addr = d->bd_fildrop;
		break;

	case BIOCSFILDROP: {	/* set "filter-drop" flag */
		unsigned int fildrop = *(u_int *)addr;
		switch (fildrop) {
		case BPF_FILDROP_PASS:
		case BPF_FILDROP_CAPTURE:
		case BPF_FILDROP_DROP:
			d->bd_fildrop = fildrop;
			break;
		default:
			error = EINVAL;
			break;
		}
		break;
	}

	case BIOCGDIRFILT:	/* get direction filter */
		*(u_int *)addr = d->bd_dirfilt;
		break;

	case BIOCSDIRFILT:	/* set direction filter */
		d->bd_dirfilt = (*(u_int *)addr) &
		    (BPF_DIRECTION_IN|BPF_DIRECTION_OUT);
		break;

	case FIONBIO:		/* Non-blocking I/O */
		/* let vfs keep track of this */
		break;

	case FIOASYNC:		/* Send signal on received packets */
		d->bd_async = *(int *)addr;
		break;

	case FIOSETOWN:		/* Process or group to send signals to */
	case TIOCSPGRP:
		error = sigio_setown(&d->bd_sigio, cmd, addr);
		break;

	case FIOGETOWN:
	case TIOCGPGRP:
		sigio_getown(&d->bd_sigio, cmd, addr);
		break;

	case BIOCSRSIG:		/* Set receive signal */
		{
			u_int sig;

			sig = *(u_int *)addr;

			if (sig >= NSIG)
				error = EINVAL;
			else
				d->bd_sig = sig;
			break;
		}
	case BIOCGRSIG:
		*(u_int *)addr = d->bd_sig;
		break;
	}

	bpf_put(d);
	return (error);
}

/*
 * Set d's packet filter program to fp.  If this file already has a filter,
 * free it and replace it.  Returns EINVAL for bogus requests.
 */
int
bpf_setf(struct bpf_d *d, struct bpf_program *fp, int wf)
{
	struct bpf_program_smr *bps, *old_bps;
	struct bpf_insn *fcode;
	u_int flen, size;

	KERNEL_ASSERT_LOCKED();

	if (fp->bf_insns == NULL) {
		if (fp->bf_len != 0)
			return (EINVAL);
		bps = NULL;
	} else {
		flen = fp->bf_len;
		if (flen > BPF_MAXINSNS)
			return (EINVAL);

		fcode = mallocarray(flen, sizeof(*fp->bf_insns), M_DEVBUF,
		    M_WAITOK | M_CANFAIL);
		if (fcode == NULL)
			return (ENOMEM);

		size = flen * sizeof(*fp->bf_insns);
		if (copyin(fp->bf_insns, fcode, size) != 0 ||
		    bpf_validate(fcode, (int)flen) == 0) {
			free(fcode, M_DEVBUF, size);
			return (EINVAL);
		}

		bps = malloc(sizeof(*bps), M_DEVBUF, M_WAITOK);
		smr_init(&bps->bps_smr);
		bps->bps_bf.bf_len = flen;
		bps->bps_bf.bf_insns = fcode;
	}

	if (wf == 0) {
		old_bps = SMR_PTR_GET_LOCKED(&d->bd_rfilter);
		SMR_PTR_SET_LOCKED(&d->bd_rfilter, bps);
	} else {
		old_bps = SMR_PTR_GET_LOCKED(&d->bd_wfilter);
		SMR_PTR_SET_LOCKED(&d->bd_wfilter, bps);
	}

	mtx_enter(&d->bd_mtx);
	bpf_resetd(d);
	mtx_leave(&d->bd_mtx);
	if (old_bps != NULL)
		smr_call(&old_bps->bps_smr, bpf_prog_smr, old_bps);

	return (0);
}
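
/*
 * Example (illustrative userland sketch): installing a trivial
 * accept-all read filter with BIOCSETF; bpf_validate() above rejects
 * malformed programs with EINVAL:
 *
 *	struct bpf_insn insns[] = {
 *		BPF_STMT(BPF_RET+BPF_K, (u_int)-1),	// accept whole packet
 *	};
 *	struct bpf_program prog = { 1, insns };
 *
 *	if (ioctl(bpf_fd, BIOCSETF, &prog) == -1)
 *		err(1, "BIOCSETF");
 */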

/*
 * Detach a file from its current interface (if attached at all) and attach
 * to the interface indicated by the name stored in ifr.
 * Return an errno or 0.
 */
int
bpf_setif(struct bpf_d *d, struct ifreq *ifr)
{
	struct bpf_if *bp, *candidate = NULL;
	int error = 0;

	/*
	 * Look through attached interfaces for the named one.
	 */
	for (bp = bpf_iflist; bp != NULL; bp = bp->bif_next) {
		if (strcmp(bp->bif_name, ifr->ifr_name) != 0)
			continue;

		if (candidate == NULL || candidate->bif_dlt > bp->bif_dlt)
			candidate = bp;
	}

	/* Not found. */
	if (candidate == NULL)
		return (ENXIO);

	/*
	 * Allocate the packet buffers if we need to.
	 * If we're already attached to the requested interface,
	 * just flush the buffer.
	 */
	mtx_enter(&d->bd_mtx);
	if (d->bd_sbuf == NULL) {
		if ((error = bpf_allocbufs(d)))
			goto out;
	}
	if (candidate != d->bd_bif) {
		/*
		 * Detach if attached to something else.
		 */
		bpf_detachd(d);
		bpf_attachd(d, candidate);
	}
	bpf_resetd(d);
out:
	mtx_leave(&d->bd_mtx);
	return (error);
}

/*
 * Copy the interface name to the ifreq.
 */
void
bpf_ifname(struct bpf_if *bif, struct ifreq *ifr)
{
	bcopy(bif->bif_name, ifr->ifr_name, sizeof(ifr->ifr_name));
}

/*
 * Support for poll() system call
 */
int
bpfpoll(dev_t dev, int events, struct proc *p)
{
	struct bpf_d *d;
	int revents;

	KERNEL_ASSERT_LOCKED();

	/*
	 * An imitation of the FIONREAD ioctl code.
	 */
	d = bpfilter_lookup(minor(dev));

	/*
	 * XXX The USB stack manages to trigger a race condition that
	 * causes bpfilter_lookup to return NULL when a USB device
	 * gets detached while it is up and has an open bpf handler (e.g.
	 * dhclient).  We should still check whether the root cause of
	 * this issue can be fixed.
	 */
	if (d == NULL)
		return (POLLERR);

	/* Always ready to write data */
	revents = events & (POLLOUT | POLLWRNORM);

	if (events & (POLLIN | POLLRDNORM)) {
		mtx_enter(&d->bd_mtx);
		if (d->bd_hlen != 0 || (d->bd_immediate && d->bd_slen != 0))
			revents |= events & (POLLIN | POLLRDNORM);
		else
			selrecord(p, &d->bd_sel);
		mtx_leave(&d->bd_mtx);
	}
	return (revents);
}

const struct filterops bpfread_filtops = {
	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach	= NULL,
	.f_detach	= filt_bpfrdetach,
	.f_event	= filt_bpfread,
	.f_modify	= filt_bpfreadmodify,
	.f_process	= filt_bpfreadprocess,
};

int
bpfkqfilter(dev_t dev, struct knote *kn)
{
	struct bpf_d *d;
	struct klist *klist;

	KERNEL_ASSERT_LOCKED();

	d = bpfilter_lookup(minor(dev));
	if (d == NULL)
		return (ENXIO);

	switch (kn->kn_filter) {
	case EVFILT_READ:
		klist = &d->bd_sel.si_note;
		kn->kn_fop = &bpfread_filtops;
		break;
	default:
		return (EINVAL);
	}

	bpf_get(d);
	kn->kn_hook = d;
	klist_insert(klist, kn);

	return (0);
}
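
/*
 * Example (illustrative userland sketch): waiting for captured
 * packets with kqueue; EVFILT_READ is the only filter accepted above:
 *
 *	struct kevent kev;
 *	int kq;
 *
 *	if ((kq = kqueue()) == -1)
 *		err(1, "kqueue");
 *	EV_SET(&kev, bpf_fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
 *		err(1, "kevent");
 *	// a later kevent() call reports readiness once
 *	// filt_bpfread() sees buffered data
 */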

void
filt_bpfrdetach(struct knote *kn)
{
	struct bpf_d *d = kn->kn_hook;

	klist_remove(&d->bd_sel.si_note, kn);
	bpf_put(d);
}

int
filt_bpfread(struct knote *kn, long hint)
{
	struct bpf_d *d = kn->kn_hook;

	if (hint == NOTE_SUBMIT) /* ignore activation from selwakeup */
		return (0);

	MUTEX_ASSERT_LOCKED(&d->bd_mtx);

	kn->kn_data = d->bd_hlen;
	if (d->bd_immediate)
		kn->kn_data += d->bd_slen;

	return (kn->kn_data > 0);
}

int
filt_bpfreadmodify(struct kevent *kev, struct knote *kn)
{
	struct bpf_d *d = kn->kn_hook;
	int active;

	mtx_enter(&d->bd_mtx);
	active = knote_modify_fn(kev, kn, filt_bpfread);
	mtx_leave(&d->bd_mtx);

	return (active);
}

int
filt_bpfreadprocess(struct knote *kn, struct kevent *kev)
{
	struct bpf_d *d = kn->kn_hook;
	int active;

	mtx_enter(&d->bd_mtx);
	active = knote_process_fn(kn, kev, filt_bpfread);
	mtx_leave(&d->bd_mtx);

	return (active);
}

/*
 * Copy data from an mbuf chain into a buffer.  This code is derived
 * from m_copydata in sys/uipc_mbuf.c.
 */
void
bpf_mcopy(const void *src_arg, void *dst_arg, size_t len)
{
	const struct mbuf *m;
	u_int count;
	u_char *dst;

	m = src_arg;
	dst = dst_arg;
	while (len > 0) {
		if (m == NULL)
			panic("bpf_mcopy");
		count = min(m->m_len, len);
		bcopy(mtod(m, caddr_t), (caddr_t)dst, count);
		m = m->m_next;
		dst += count;
		len -= count;
	}
}

int
bpf_mtap(caddr_t arg, const struct mbuf *m, u_int direction)
{
	return _bpf_mtap(arg, m, m, direction);
}

int
_bpf_mtap(caddr_t arg, const struct mbuf *mp, const struct mbuf *m,
    u_int direction)
{
	struct bpf_if *bp = (struct bpf_if *)arg;
	struct bpf_d *d;
	size_t pktlen, slen;
	const struct mbuf *m0;
	struct bpf_hdr tbh;
	int gothdr = 0;
	int drop = 0;

	if (m == NULL)
		return (0);

	if (bp == NULL)
		return (0);

	pktlen = 0;
	for (m0 = m; m0 != NULL; m0 = m0->m_next)
		pktlen += m0->m_len;

	smr_read_enter();
	SMR_SLIST_FOREACH(d, &bp->bif_dlist, bd_next) {
		struct bpf_program_smr *bps;
		struct bpf_insn *fcode = NULL;

		atomic_inc_long(&d->bd_rcount);

		if (ISSET(d->bd_dirfilt, direction))
			continue;

		bps = SMR_PTR_GET(&d->bd_rfilter);
		if (bps != NULL)
			fcode = bps->bps_bf.bf_insns;
		slen = bpf_mfilter(fcode, m, pktlen);

		if (slen == 0)
			continue;
		if (d->bd_fildrop != BPF_FILDROP_PASS)
			drop = 1;
		if (d->bd_fildrop != BPF_FILDROP_DROP) {
			if (!gothdr) {
				struct timeval tv;
				memset(&tbh, 0, sizeof(tbh));

				if (ISSET(mp->m_flags, M_PKTHDR)) {
					tbh.bh_ifidx = mp->m_pkthdr.ph_ifidx;
					tbh.bh_flowid = mp->m_pkthdr.ph_flowid;
					tbh.bh_flags = mp->m_pkthdr.pf.prio;
					if (ISSET(mp->m_pkthdr.csum_flags,
					    M_FLOWID))
						SET(tbh.bh_flags, BPF_F_FLOWID);

					m_microtime(mp, &tv);
				} else
					microtime(&tv);

				tbh.bh_tstamp.tv_sec = tv.tv_sec;
				tbh.bh_tstamp.tv_usec = tv.tv_usec;
				SET(tbh.bh_flags, direction << BPF_F_DIR_SHIFT);

				gothdr = 1;
			}

			mtx_enter(&d->bd_mtx);
			bpf_catchpacket(d, (u_char *)m, pktlen, slen, &tbh);
			mtx_leave(&d->bd_mtx);
		}
	}
	smr_read_leave();

	return (drop);
}
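
/*
 * Typical caller (sketch of the common driver pattern): a driver
 * tests its bpf cookie, which bpf_attachd()/bpf_detachd() maintain,
 * before paying for the tap:
 *
 *	#if NBPFILTER > 0
 *		if (ifp->if_bpf)
 *			bpf_mtap(ifp->if_bpf, m, BPF_DIRECTION_IN);
 *	#endif
 */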

/*
 * Incoming linkage from device drivers, where a data buffer should be
 * prepended by an arbitrary header. In this situation we already have a
 * way of representing a chain of memory buffers, ie, mbufs, so reuse
 * the existing functionality by attaching the buffers to mbufs.
 *
 * Con up a minimal mbuf chain to pacify bpf by allocating (only) a
 * struct m_hdr each for the header and data on the stack.
 */
int
bpf_tap_hdr(caddr_t arg, const void *hdr, unsigned int hdrlen,
    const void *buf, unsigned int buflen, u_int direction)
{
	struct m_hdr mh, md;
	struct mbuf *m0 = NULL;
	struct mbuf **mp = &m0;

	if (hdr != NULL) {
		mh.mh_flags = 0;
		mh.mh_next = NULL;
		mh.mh_len = hdrlen;
		mh.mh_data = (void *)hdr;

		*mp = (struct mbuf *)&mh;
		mp = &mh.mh_next;
	}

	if (buf != NULL) {
		md.mh_flags = 0;
		md.mh_next = NULL;
		md.mh_len = buflen;
		md.mh_data = (void *)buf;

		*mp = (struct mbuf *)&md;
	}

	return bpf_mtap(arg, m0, direction);
}

/*
 * Incoming linkage from device drivers, where we have a mbuf chain
 * but need to prepend some arbitrary header from a linear buffer.
 *
 * Con up a minimal dummy header to pacify bpf.  Allocate (only) a
 * struct m_hdr on the stack.  This is safe as bpf only reads from the
 * fields in this header that we initialize, and will not try to free
 * it or keep a pointer to it.
 */
int
bpf_mtap_hdr(caddr_t arg, const void *data, u_int dlen, const struct mbuf *m,
    u_int direction)
{
	struct m_hdr mh;
	const struct mbuf *m0;

	if (dlen > 0) {
		mh.mh_flags = 0;
		mh.mh_next = (struct mbuf *)m;
		mh.mh_len = dlen;
		mh.mh_data = (void *)data;
		m0 = (struct mbuf *)&mh;
	} else
		m0 = m;

	return _bpf_mtap(arg, m, m0, direction);
}

/*
 * Incoming linkage from device drivers, where we have a mbuf chain
 * but need to prepend the address family.
 *
 * Con up a minimal dummy header to pacify bpf.  We allocate (only) a
 * struct m_hdr on the stack.  This is safe as bpf only reads from the
 * fields in this header that we initialize, and will not try to free
 * it or keep a pointer to it.
 */
int
bpf_mtap_af(caddr_t arg, u_int32_t af, const struct mbuf *m, u_int direction)
{
	u_int32_t    afh;

	afh = htonl(af);

	return bpf_mtap_hdr(arg, &afh, sizeof(afh), m, direction);
}
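
/*
 * Typical caller (sketch): pseudo-interfaces that carry bare network
 * packets prepend the address family for DLT_LOOP listeners:
 *
 *	#if NBPFILTER > 0
 *		if (ifp->if_bpf)
 *			bpf_mtap_af(ifp->if_bpf, dst->sa_family, m,
 *			    BPF_DIRECTION_OUT);
 *	#endif
 */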

/*
 * Incoming linkage from device drivers, where we have a mbuf chain
 * but need to prepend a VLAN encapsulation header.
 *
 * Con up a minimal dummy header to pacify bpf.  Allocate (only) a
 * struct m_hdr on the stack.  This is safe as bpf only reads from the
 * fields in this header that we initialize, and will not try to free
 * it or keep a pointer to it.
 */
int
bpf_mtap_ether(caddr_t arg, const struct mbuf *m, u_int direction)
{
#if NVLAN > 0
	struct ether_vlan_header evh;
	struct m_hdr mh, md;

	if ((m->m_flags & M_VLANTAG) == 0)
#endif
	{
		return _bpf_mtap(arg, m, m, direction);
	}

#if NVLAN > 0
	KASSERT(m->m_len >= ETHER_HDR_LEN);

	memcpy(&evh, mtod(m, char *), ETHER_HDR_LEN);
	evh.evl_proto = evh.evl_encap_proto;
	evh.evl_encap_proto = htons(ETHERTYPE_VLAN);
	evh.evl_tag = htons(m->m_pkthdr.ether_vtag);

	mh.mh_flags = 0;
	mh.mh_data = (caddr_t)&evh;
	mh.mh_len = sizeof(evh);
	mh.mh_next = (struct mbuf *)&md;

	md.mh_flags = 0;
	md.mh_data = m->m_data + ETHER_HDR_LEN;
	md.mh_len = m->m_len - ETHER_HDR_LEN;
	md.mh_next = m->m_next;

	return _bpf_mtap(arg, m, (struct mbuf *)&mh, direction);
#endif
}

/*
 * Move the packet data from interface memory (pkt) into the
 * store buffer.  Wake up listeners if needed.
 * The data is copied with bpf_mcopy(), which walks an mbuf chain,
 * so pkt is really an mbuf.
 */
void
bpf_catchpacket(struct bpf_d *d, u_char *pkt, size_t pktlen, size_t snaplen,
    const struct bpf_hdr *tbh)
{
	struct bpf_hdr *bh;
	int totlen, curlen;
	int hdrlen, do_wakeup = 0;

	MUTEX_ASSERT_LOCKED(&d->bd_mtx);
	if (d->bd_bif == NULL)
		return;

	hdrlen = d->bd_bif->bif_hdrlen;

	/*
	 * Figure out how many bytes to move.  If the packet is
	 * greater than or equal to the snapshot length, transfer that
	 * much.  Otherwise, transfer the whole packet (unless
	 * we hit the buffer size limit).
	 */
	totlen = hdrlen + min(snaplen, pktlen);
	if (totlen > d->bd_bufsize)
		totlen = d->bd_bufsize;

	/*
	 * Round up the end of the previous packet to the next longword.
	 */
	curlen = BPF_WORDALIGN(d->bd_slen);
	if (curlen + totlen > d->bd_bufsize) {
		/*
		 * This packet will overflow the storage buffer.
		 * Rotate the buffers if we can, then wakeup any
		 * pending reads.
		 */
		if (d->bd_fbuf == NULL) {
			/*
			 * We haven't completed the previous read yet,
			 * so drop the packet.
			 */
			++d->bd_dcount;
			return;
		}
		ROTATE_BUFFERS(d);
		do_wakeup = 1;
		curlen = 0;
	}

	/*
	 * Append the bpf header.
	 */
	bh = (struct bpf_hdr *)(d->bd_sbuf + curlen);
	*bh = *tbh;
	bh->bh_datalen = pktlen;
	bh->bh_hdrlen = hdrlen;
	bh->bh_caplen = totlen - hdrlen;

	/*
	 * Copy the packet data into the store buffer and update its length.
	 */
	bpf_mcopy(pkt, (u_char *)bh + hdrlen, bh->bh_caplen);
	d->bd_slen = curlen + totlen;

	if (d->bd_immediate) {
		/*
		 * Immediate mode is set.  A packet arrived so any
		 * reads should be woken up.
		 */
		do_wakeup = 1;
	}

	if (do_wakeup)
		bpf_wakeup(d);
}

/*
 * Allocate the packet buffers of a descriptor.
 */
int
bpf_allocbufs(struct bpf_d *d)
{
	MUTEX_ASSERT_LOCKED(&d->bd_mtx);

	d->bd_fbuf = malloc(d->bd_bufsize, M_DEVBUF, M_NOWAIT);
	if (d->bd_fbuf == NULL)
		return (ENOMEM);

	d->bd_sbuf = malloc(d->bd_bufsize, M_DEVBUF, M_NOWAIT);
	if (d->bd_sbuf == NULL) {
		free(d->bd_fbuf, M_DEVBUF, d->bd_bufsize);
		d->bd_fbuf = NULL;
		return (ENOMEM);
	}

	d->bd_slen = 0;
	d->bd_hlen = 0;

	return (0);
}

void
bpf_prog_smr(void *bps_arg)
{
	struct bpf_program_smr *bps = bps_arg;

	free(bps->bps_bf.bf_insns, M_DEVBUF,
	    bps->bps_bf.bf_len * sizeof(struct bpf_insn));
	free(bps, M_DEVBUF, sizeof(struct bpf_program_smr));
}

void
bpf_d_smr(void *smr)
{
	struct bpf_d	*bd = smr;

	sigio_free(&bd->bd_sigio);
	free(bd->bd_sbuf, M_DEVBUF, bd->bd_bufsize);
	free(bd->bd_hbuf, M_DEVBUF, bd->bd_bufsize);
	free(bd->bd_fbuf, M_DEVBUF, bd->bd_bufsize);

	if (bd->bd_rfilter != NULL)
		bpf_prog_smr(bd->bd_rfilter);
	if (bd->bd_wfilter != NULL)
		bpf_prog_smr(bd->bd_wfilter);

	klist_free(&bd->bd_sel.si_note);
	free(bd, M_DEVBUF, sizeof(*bd));
}

void
bpf_get(struct bpf_d *bd)
{
	refcnt_take(&bd->bd_refcnt);
}

/*
 * Free buffers currently in use by a descriptor
 * when the reference count drops to zero.
 */
void
bpf_put(struct bpf_d *bd)
{
	if (refcnt_rele(&bd->bd_refcnt) == 0)
		return;

	smr_call(&bd->bd_smr, bpf_d_smr, bd);
}

void *
bpfsattach(caddr_t *bpfp, const char *name, u_int dlt, u_int hdrlen)
{
	struct bpf_if *bp;

	if ((bp = malloc(sizeof(*bp), M_DEVBUF, M_NOWAIT)) == NULL)
		panic("bpfattach");
	SMR_SLIST_INIT(&bp->bif_dlist);
	bp->bif_driverp = (struct bpf_if **)bpfp;
	bp->bif_name = name;
	bp->bif_ifp = NULL;
	bp->bif_dlt = dlt;

	bp->bif_next = bpf_iflist;
	bpf_iflist = bp;

	*bp->bif_driverp = NULL;

	/*
	 * Compute the length of the bpf header.  This is not necessarily
	 * equal to SIZEOF_BPF_HDR because we want to insert spacing such
	 * that the network layer header begins on a longword boundary (for
	 * performance reasons and to alleviate alignment restrictions).
	 */
	bp->bif_hdrlen = BPF_WORDALIGN(hdrlen + SIZEOF_BPF_HDR) - hdrlen;

	return (bp);
}

void
bpfattach(caddr_t *driverp, struct ifnet *ifp, u_int dlt, u_int hdrlen)
{
	struct bpf_if *bp;

	bp = bpfsattach(driverp, ifp->if_xname, dlt, hdrlen);
	bp->bif_ifp = ifp;
}
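
/*
 * Typical caller (sketch): an ethernet driver attaches its tap point
 * at interface attach time, e.g. from ether_ifattach():
 *
 *	#if NBPFILTER > 0
 *		bpfattach(&ifp->if_bpf, ifp, DLT_EN10MB, ETHER_HDR_LEN);
 *	#endif
 */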

/* Detach an interface from its attached bpf device.  */
void
bpfdetach(struct ifnet *ifp)
{
	struct bpf_if *bp, *nbp;

	KERNEL_ASSERT_LOCKED();

	for (bp = bpf_iflist; bp; bp = nbp) {
		nbp = bp->bif_next;
		if (bp->bif_ifp == ifp)
			bpfsdetach(bp);
	}
	ifp->if_bpf = NULL;
}

void
bpfsdetach(void *p)
{
	struct bpf_if *bp = p, *tbp;
	struct bpf_d *bd;
	int maj;

	KERNEL_ASSERT_LOCKED();

	/* Locate the major number. */
	for (maj = 0; maj < nchrdev; maj++)
		if (cdevsw[maj].d_open == bpfopen)
			break;

	while ((bd = SMR_SLIST_FIRST_LOCKED(&bp->bif_dlist))) {
		vdevgone(maj, bd->bd_unit, bd->bd_unit, VCHR);
		klist_invalidate(&bd->bd_sel.si_note);
	}

	for (tbp = bpf_iflist; tbp; tbp = tbp->bif_next) {
		if (tbp->bif_next == bp) {
			tbp->bif_next = bp->bif_next;
			break;
		}
	}

	if (bpf_iflist == bp)
		bpf_iflist = bp->bif_next;

	free(bp, M_DEVBUF, sizeof(*bp));
}

int
bpf_sysctl_locked(int *name, u_int namelen, void *oldp, size_t *oldlenp,
    void *newp, size_t newlen)
{
	switch (name[0]) {
	case NET_BPF_BUFSIZE:
		return sysctl_int_bounded(oldp, oldlenp, newp, newlen,
		    &bpf_bufsize, BPF_MINBUFSIZE, bpf_maxbufsize);
	case NET_BPF_MAXBUFSIZE:
		return sysctl_int_bounded(oldp, oldlenp, newp, newlen,
		    &bpf_maxbufsize, BPF_MINBUFSIZE, INT_MAX);
	default:
		return (EOPNOTSUPP);
	}
}

int
bpf_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
    size_t newlen)
{
	int flags = RW_INTR;
	int error;

	if (namelen != 1)
		return (ENOTDIR);

	flags |= (newp == NULL) ? RW_READ : RW_WRITE;

	error = rw_enter(&bpf_sysctl_lk, flags);
	if (error != 0)
		return (error);

	error = bpf_sysctl_locked(name, namelen, oldp, oldlenp, newp, newlen);

	rw_exit(&bpf_sysctl_lk);

	return (error);
}

struct bpf_d *
bpfilter_lookup(int unit)
{
	struct bpf_d *bd;

	KERNEL_ASSERT_LOCKED();

	LIST_FOREACH(bd, &bpf_d_list, bd_list)
		if (bd->bd_unit == unit)
			return (bd);
	return (NULL);
}

/*
 * Get the list of data link types available for the interface.
 */
int
bpf_getdltlist(struct bpf_d *d, struct bpf_dltlist *bfl)
{
	int n, error;
	struct bpf_if *bp;
	const char *name;

	name = d->bd_bif->bif_name;
	n = 0;
	error = 0;
	for (bp = bpf_iflist; bp != NULL; bp = bp->bif_next) {
		if (strcmp(name, bp->bif_name) != 0)
			continue;
		if (bfl->bfl_list != NULL) {
			if (n >= bfl->bfl_len)
				return (ENOMEM);
			error = copyout(&bp->bif_dlt,
			    bfl->bfl_list + n, sizeof(u_int));
			if (error)
				break;
		}
		n++;
	}

	bfl->bfl_len = n;
	return (error);
}

/*
 * Set the data link type of a BPF instance.
 */
int
bpf_setdlt(struct bpf_d *d, u_int dlt)
{
	const char *name;
	struct bpf_if *bp;

	MUTEX_ASSERT_LOCKED(&d->bd_mtx);
	if (d->bd_bif->bif_dlt == dlt)
		return (0);
	name = d->bd_bif->bif_name;
	for (bp = bpf_iflist; bp != NULL; bp = bp->bif_next) {
		if (strcmp(name, bp->bif_name) != 0)
			continue;
		if (bp->bif_dlt == dlt)
			break;
	}
	if (bp == NULL)
		return (EINVAL);
	bpf_detachd(d);
	bpf_attachd(d, bp);
	bpf_resetd(d);
	return (0);
}
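
/*
 * Example (illustrative userland sketch): enumerating the available
 * data link types with BIOCGDLTLIST and switching with BIOCSDLT:
 *
 *	u_int dlts[16], i;
 *	struct bpf_dltlist bfl = { 16, dlts };
 *
 *	if (ioctl(bpf_fd, BIOCGDLTLIST, &bfl) == -1)
 *		err(1, "BIOCGDLTLIST");
 *	for (i = 0; i < bfl.bfl_len; i++) {
 *		if (dlts[i] == DLT_IEEE802_11_RADIO &&
 *		    ioctl(bpf_fd, BIOCSDLT, &dlts[i]) == -1)
 *			err(1, "BIOCSDLT");
 *	}
 */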

u_int32_t	bpf_mbuf_ldw(const void *, u_int32_t, int *);
u_int32_t	bpf_mbuf_ldh(const void *, u_int32_t, int *);
u_int32_t	bpf_mbuf_ldb(const void *, u_int32_t, int *);

int		bpf_mbuf_copy(const struct mbuf *, u_int32_t,
		    void *, u_int32_t);

const struct bpf_ops bpf_mbuf_ops = {
	bpf_mbuf_ldw,
	bpf_mbuf_ldh,
	bpf_mbuf_ldb,
};

int
bpf_mbuf_copy(const struct mbuf *m, u_int32_t off, void *buf, u_int32_t len)
{
	u_int8_t *cp = buf;
	u_int32_t count;

	while (off >= m->m_len) {
		off -= m->m_len;

		m = m->m_next;
		if (m == NULL)
			return (-1);
	}

	for (;;) {
		count = min(m->m_len - off, len);

		memcpy(cp, m->m_data + off, count);
		len -= count;

		if (len == 0)
			return (0);

		m = m->m_next;
		if (m == NULL)
			break;

		cp += count;
		off = 0;
	}

	return (-1);
}

u_int32_t
bpf_mbuf_ldw(const void *m0, u_int32_t k, int *err)
{
	u_int32_t v;

	if (bpf_mbuf_copy(m0, k, &v, sizeof(v)) != 0) {
		*err = 1;
		return (0);
	}

	*err = 0;
	return ntohl(v);
}

u_int32_t
bpf_mbuf_ldh(const void *m0, u_int32_t k, int *err)
{
	u_int16_t v;

	if (bpf_mbuf_copy(m0, k, &v, sizeof(v)) != 0) {
		*err = 1;
		return (0);
	}

	*err = 0;
	return ntohs(v);
}

u_int32_t
bpf_mbuf_ldb(const void *m0, u_int32_t k, int *err)
{
	const struct mbuf *m = m0;
	u_int8_t v;

	while (k >= m->m_len) {
		k -= m->m_len;

		m = m->m_next;
		if (m == NULL) {
			*err = 1;
			return (0);
		}
	}
	v = m->m_data[k];

	*err = 0;
	return v;
}

u_int
bpf_mfilter(const struct bpf_insn *pc, const struct mbuf *m, u_int wirelen)
{
	return _bpf_filter(pc, &bpf_mbuf_ops, m, wirelen);
}
1962