1 /*	$OpenBSD: bpf.c,v 1.221 2023/03/09 05:56:58 dlg Exp $	*/
2 /*	$NetBSD: bpf.c,v 1.33 1997/02/21 23:59:35 thorpej Exp $	*/
3 
4 /*
5  * Copyright (c) 1990, 1991, 1993
6  *	The Regents of the University of California.  All rights reserved.
7  * Copyright (c) 2010, 2014 Henning Brauer <henning@openbsd.org>
8  *
9  * This code is derived from the Stanford/CMU enet packet filter,
10  * (net/enet.c) distributed as part of 4.3BSD, and code contributed
11  * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence
12  * Berkeley Laboratory.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  * 3. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)bpf.c	8.2 (Berkeley) 3/28/94
39  */
40 
41 #include "bpfilter.h"
42 
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/mbuf.h>
46 #include <sys/proc.h>
47 #include <sys/signalvar.h>
48 #include <sys/ioctl.h>
49 #include <sys/conf.h>
50 #include <sys/vnode.h>
51 #include <sys/fcntl.h>
52 #include <sys/socket.h>
53 #include <sys/kernel.h>
54 #include <sys/sysctl.h>
55 #include <sys/rwlock.h>
56 #include <sys/atomic.h>
57 #include <sys/event.h>
58 #include <sys/mutex.h>
59 #include <sys/refcnt.h>
60 #include <sys/smr.h>
61 #include <sys/specdev.h>
62 #include <sys/sigio.h>
63 #include <sys/task.h>
64 #include <sys/time.h>
65 
66 #include <net/if.h>
67 #include <net/bpf.h>
68 #include <net/bpfdesc.h>
69 
70 #include <netinet/in.h>
71 #include <netinet/if_ether.h>
72 
73 #include "vlan.h"
74 #if NVLAN > 0
75 #include <net/if_vlan_var.h>
76 #endif
77 
78 #define BPF_BUFSIZE 32768
79 
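/*
 * States of the store buffer with respect to the wait timeout
 * (BIOCSWTIMEOUT): IDLE means no captured packet is pending, WAIT
 * means a packet arrived and bd_wait_tmo has been scheduled, DONE
 * means the timeout fired (or immediate mode matched a packet) and a
 * pending read may rotate the buffers and return.
 */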
80 #define BPF_S_IDLE	0
81 #define BPF_S_WAIT	1
82 #define BPF_S_DONE	2
83 
84 #define PRINET  26			/* interruptible */
85 
86 /*
87  * The default read buffer size is patchable.
88  */
89 int bpf_bufsize = BPF_BUFSIZE;
90 int bpf_maxbufsize = BPF_MAXBUFSIZE;
91 
92 /*
93  *  bpf_iflist is the list of interfaces; each corresponds to an ifnet
94  *  bpf_d_list is the list of descriptors
95  */
96 struct bpf_if	*bpf_iflist;
97 LIST_HEAD(, bpf_d) bpf_d_list;
98 
99 int	bpf_allocbufs(struct bpf_d *);
100 void	bpf_ifname(struct bpf_if*, struct ifreq *);
101 void	bpf_mcopy(const void *, void *, size_t);
102 int	bpf_movein(struct uio *, struct bpf_d *, struct mbuf **,
103 	    struct sockaddr *);
104 int	bpf_setif(struct bpf_d *, struct ifreq *);
105 int	bpfkqfilter(dev_t, struct knote *);
106 void	bpf_wakeup(struct bpf_d *);
107 void	bpf_wakeup_cb(void *);
108 void	bpf_wait_cb(void *);
109 int	_bpf_mtap(caddr_t, const struct mbuf *, const struct mbuf *, u_int);
110 void	bpf_catchpacket(struct bpf_d *, u_char *, size_t, size_t,
111 	    const struct bpf_hdr *);
112 int	bpf_getdltlist(struct bpf_d *, struct bpf_dltlist *);
113 int	bpf_setdlt(struct bpf_d *, u_int);
114 
115 void	filt_bpfrdetach(struct knote *);
116 int	filt_bpfread(struct knote *, long);
117 int	filt_bpfreadmodify(struct kevent *, struct knote *);
118 int	filt_bpfreadprocess(struct knote *, struct kevent *);
119 
120 int	bpf_sysctl_locked(int *, u_int, void *, size_t *, void *, size_t);
121 
122 struct bpf_d *bpfilter_lookup(int);
123 
124 /*
125  * Called holding ``bd_mtx''.
126  */
127 void	bpf_attachd(struct bpf_d *, struct bpf_if *);
128 void	bpf_detachd(struct bpf_d *);
129 void	bpf_resetd(struct bpf_d *);
130 
131 void	bpf_prog_smr(void *);
132 void	bpf_d_smr(void *);
133 
134 /*
135  * Reference count access to descriptor buffers
136  */
137 void	bpf_get(struct bpf_d *);
138 void	bpf_put(struct bpf_d *);
139 
140 
141 struct rwlock bpf_sysctl_lk = RWLOCK_INITIALIZER("bpfsz");
142 
143 int
144 bpf_movein(struct uio *uio, struct bpf_d *d, struct mbuf **mp,
145     struct sockaddr *sockp)
146 {
147 	struct bpf_program_smr *bps;
148 	struct bpf_insn *fcode = NULL;
149 	struct mbuf *m;
150 	struct m_tag *mtag;
151 	int error;
152 	u_int hlen, alen, mlen;
153 	u_int len;
154 	u_int linktype;
155 	u_int slen;
156 
157 	/*
158 	 * Build a sockaddr based on the data link layer type.
159 	 * We do this at this level because the ethernet header
160 	 * is copied directly into the data field of the sockaddr.
161 	 * In the case of SLIP, there is no header and the packet
162 	 * is forwarded as is.
163 	 * Also, we are careful to leave room at the front of the mbuf
164 	 * for the link level header.
165 	 */
166 	linktype = d->bd_bif->bif_dlt;
167 	switch (linktype) {
168 
169 	case DLT_SLIP:
170 		sockp->sa_family = AF_INET;
171 		hlen = 0;
172 		break;
173 
174 	case DLT_PPP:
175 		sockp->sa_family = AF_UNSPEC;
176 		hlen = 0;
177 		break;
178 
179 	case DLT_EN10MB:
180 		sockp->sa_family = AF_UNSPEC;
181 		/* XXX Would MAXLINKHDR be better? */
182 		hlen = ETHER_HDR_LEN;
183 		break;
184 
185 	case DLT_IEEE802_11:
186 	case DLT_IEEE802_11_RADIO:
187 		sockp->sa_family = AF_UNSPEC;
188 		hlen = 0;
189 		break;
190 
191 	case DLT_RAW:
192 	case DLT_NULL:
193 		sockp->sa_family = AF_UNSPEC;
194 		hlen = 0;
195 		break;
196 
197 	case DLT_LOOP:
198 		sockp->sa_family = AF_UNSPEC;
199 		hlen = sizeof(u_int32_t);
200 		break;
201 
202 	default:
203 		return (EIO);
204 	}
205 
206 	if (uio->uio_resid > MAXMCLBYTES)
207 		return (EMSGSIZE);
208 	len = uio->uio_resid;
209 	if (len < hlen)
210 		return (EINVAL);
211 
212 	/*
213 	 * Get the length of the payload so we can align it properly.
214 	 */
215 	alen = len - hlen;
216 
217 	/*
218 	 * Allocate enough space for headers and the aligned payload.
219 	 */
220 	mlen = max(max_linkhdr, hlen) + roundup(alen, sizeof(long));
221 	if (mlen > MAXMCLBYTES)
222 		return (EMSGSIZE);
223 
224 	MGETHDR(m, M_WAIT, MT_DATA);
225 	if (mlen > MHLEN) {
226 		MCLGETL(m, M_WAIT, mlen);
227 		if ((m->m_flags & M_EXT) == 0) {
228 			error = ENOBUFS;
229 			goto bad;
230 		}
231 	}
232 
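	/*
	 * m_align() places the payload at the end of the mbuf storage on
	 * a long boundary; pulling m_data back by hlen then leaves room
	 * for the link-level header directly in front of the aligned
	 * payload.
	 */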
233 	m_align(m, alen); /* Align the payload. */
234 	m->m_data -= hlen;
235 
236 	m->m_pkthdr.ph_ifidx = 0;
237 	m->m_pkthdr.len = len;
238 	m->m_len = len;
239 
240 	error = uiomove(mtod(m, caddr_t), len, uio);
241 	if (error)
242 		goto bad;
243 
244 	smr_read_enter();
245 	bps = SMR_PTR_GET(&d->bd_wfilter);
246 	if (bps != NULL)
247 		fcode = bps->bps_bf.bf_insns;
248 	slen = bpf_filter(fcode, mtod(m, u_char *), len, len);
249 	smr_read_leave();
250 
251 	if (slen < len) {
252 		error = EPERM;
253 		goto bad;
254 	}
255 
256 	/*
257 	 * Make room for link header, and copy it to sockaddr
258 	 */
259 	if (hlen != 0) {
260 		if (linktype == DLT_LOOP) {
261 			u_int32_t af;
262 
263 			/* the link header indicates the address family */
264 			KASSERT(hlen == sizeof(u_int32_t));
265 			memcpy(&af, m->m_data, hlen);
266 			sockp->sa_family = ntohl(af);
267 		} else
268 			memcpy(sockp->sa_data, m->m_data, hlen);
269 
270 		m->m_pkthdr.len -= hlen;
271 		m->m_len -= hlen;
272 		m->m_data += hlen;
273 	}
274 
275 	/*
276 	 * Prepend the data link type as a mbuf tag
277 	 */
278 	mtag = m_tag_get(PACKET_TAG_DLT, sizeof(u_int), M_WAIT);
279 	*(u_int *)(mtag + 1) = linktype;
280 	m_tag_prepend(m, mtag);
281 
282 	*mp = m;
283 	return (0);
284  bad:
285 	m_freem(m);
286 	return (error);
287 }
288 
289 /*
290  * Attach file to the bpf interface, i.e. make d listen on bp.
291  */
292 void
293 bpf_attachd(struct bpf_d *d, struct bpf_if *bp)
294 {
295 	MUTEX_ASSERT_LOCKED(&d->bd_mtx);
296 
297 	/*
298 	 * Point d at bp, and add d to the interface's list of listeners.
299 	 * Finally, point the driver's bpf cookie at the interface so
300 	 * it will divert packets to bpf.
301 	 */
302 
303 	d->bd_bif = bp;
304 
305 	KERNEL_ASSERT_LOCKED();
306 	SMR_SLIST_INSERT_HEAD_LOCKED(&bp->bif_dlist, d, bd_next);
307 
308 	*bp->bif_driverp = bp;
309 }
310 
311 /*
312  * Detach a file from its interface.
313  */
314 void
315 bpf_detachd(struct bpf_d *d)
316 {
317 	struct bpf_if *bp;
318 
319 	MUTEX_ASSERT_LOCKED(&d->bd_mtx);
320 
321 	bp = d->bd_bif;
322 	/* Not attached. */
323 	if (bp == NULL)
324 		return;
325 
326 	/* Remove ``d'' from the interface's descriptor list. */
327 	KERNEL_ASSERT_LOCKED();
328 	SMR_SLIST_REMOVE_LOCKED(&bp->bif_dlist, d, bpf_d, bd_next);
329 
330 	if (SMR_SLIST_EMPTY_LOCKED(&bp->bif_dlist)) {
331 		/*
332 		 * Let the driver know that there are no more listeners.
333 		 */
334 		*bp->bif_driverp = NULL;
335 	}
336 
337 	d->bd_bif = NULL;
338 
339 	/*
340 	 * Check if this descriptor had requested promiscuous mode.
341 	 * If so, turn it off.
342 	 */
343 	if (d->bd_promisc) {
344 		int error;
345 
346 		KASSERT(bp->bif_ifp != NULL);
347 
348 		d->bd_promisc = 0;
349 
350 		bpf_get(d);
351 		mtx_leave(&d->bd_mtx);
352 		NET_LOCK();
353 		error = ifpromisc(bp->bif_ifp, 0);
354 		NET_UNLOCK();
355 		mtx_enter(&d->bd_mtx);
356 		bpf_put(d);
357 
358 		if (error && !(error == EINVAL || error == ENODEV ||
359 		    error == ENXIO))
360 			/*
361 			 * Something is really wrong if we were able to put
362 			 * the driver into promiscuous mode, but can't
363 			 * take it out.
364 			 */
365 			panic("bpf: ifpromisc failed");
366 	}
367 }
368 
369 void
370 bpfilterattach(int n)
371 {
372 	LIST_INIT(&bpf_d_list);
373 }
374 
375 /*
376  * Open the bpf device; a fresh descriptor is created on every open.
377  * Returns ENXIO for an illegal minor number, EBUSY if allocation fails.
378  */
379 int
380 bpfopen(dev_t dev, int flag, int mode, struct proc *p)
381 {
382 	struct bpf_d *bd;
383 	int unit = minor(dev);
384 
385 	if (unit & ((1 << CLONE_SHIFT) - 1))
386 		return (ENXIO);
387 
388 	KASSERT(bpfilter_lookup(unit) == NULL);
389 
390 	/* create on demand */
391 	if ((bd = malloc(sizeof(*bd), M_DEVBUF, M_NOWAIT|M_ZERO)) == NULL)
392 		return (EBUSY);
393 
394 	/* Mark "free" and do most initialization. */
395 	bd->bd_unit = unit;
396 	bd->bd_bufsize = bpf_bufsize;
397 	bd->bd_sig = SIGIO;
398 	mtx_init(&bd->bd_mtx, IPL_NET);
399 	task_set(&bd->bd_wake_task, bpf_wakeup_cb, bd);
400 	timeout_set(&bd->bd_wait_tmo, bpf_wait_cb, bd);
401 	smr_init(&bd->bd_smr);
402 	sigio_init(&bd->bd_sigio);
403 	klist_init_mutex(&bd->bd_klist, &bd->bd_mtx);
404 
405 	bd->bd_rtout = 0;	/* no timeout by default */
406 	bd->bd_wtout = INFSLP;	/* wait for the buffer to fill by default */
407 
408 	refcnt_init(&bd->bd_refcnt);
409 	LIST_INSERT_HEAD(&bpf_d_list, bd, bd_list);
410 
411 	return (0);
412 }
413 
414 /*
415  * Close the descriptor by detaching it from its interface,
416  * deallocating its buffers, and marking it free.
417  */
418 int
419 bpfclose(dev_t dev, int flag, int mode, struct proc *p)
420 {
421 	struct bpf_d *d;
422 
423 	d = bpfilter_lookup(minor(dev));
424 	mtx_enter(&d->bd_mtx);
425 	bpf_detachd(d);
426 	bpf_wakeup(d);
427 	LIST_REMOVE(d, bd_list);
428 	mtx_leave(&d->bd_mtx);
429 	bpf_put(d);
430 
431 	return (0);
432 }
433 
434 /*
435  * Rotate the packet buffers in descriptor d.  Move the store buffer
436  * into the hold slot, and the free buffer into the store slot.
437  * Zero the length of the new store buffer.
438  */
439 #define ROTATE_BUFFERS(d) \
440 	KASSERT(d->bd_in_uiomove == 0); \
441 	MUTEX_ASSERT_LOCKED(&d->bd_mtx); \
442 	(d)->bd_hbuf = (d)->bd_sbuf; \
443 	(d)->bd_hlen = (d)->bd_slen; \
444 	(d)->bd_sbuf = (d)->bd_fbuf; \
445 	(d)->bd_state = BPF_S_IDLE; \
446 	(d)->bd_slen = 0; \
447 	(d)->bd_fbuf = NULL;
448 
449 /*
450  *  bpfread - read next chunk of packets from buffers
451  */
452 int
453 bpfread(dev_t dev, struct uio *uio, int ioflag)
454 {
455 	uint64_t end, now;
456 	struct bpf_d *d;
457 	caddr_t hbuf;
458 	int error, hlen;
459 
460 	KERNEL_ASSERT_LOCKED();
461 
462 	d = bpfilter_lookup(minor(dev));
463 	if (d->bd_bif == NULL)
464 		return (ENXIO);
465 
466 	bpf_get(d);
467 	mtx_enter(&d->bd_mtx);
468 
469 	/*
470 	 * Restrict the application to use a buffer the same size as
471 	 * the kernel buffers.
472 	 */
473 	if (uio->uio_resid != d->bd_bufsize) {
474 		error = EINVAL;
475 		goto out;
476 	}
477 
478 	/*
479 	 * If there's a timeout, mark when the read should end.
480 	 */
481 	if (d->bd_rtout != 0) {
482 		now = nsecuptime();
483 		end = now + d->bd_rtout;
484 		if (end < now)
485 			end = UINT64_MAX;
486 	}
487 
488 	/*
489 	 * If the hold buffer is empty, then do a timed sleep, which
490 	 * ends when the timeout expires or when enough packets
491 	 * have arrived to fill the store buffer.
492 	 */
493 	while (d->bd_hbuf == NULL) {
494 		if (d->bd_bif == NULL) {
495 			/* interface is gone */
496 			if (d->bd_slen == 0) {
497 				error = EIO;
498 				goto out;
499 			}
500 			ROTATE_BUFFERS(d);
501 			break;
502 		}
503 		if (d->bd_state == BPF_S_DONE) {
504 			/*
505 			 * A packet(s) either arrived since the previous
506 			 * One or more packets either arrived since the previous
507 			 * Rotate the buffers and return what's here.
508 			 */
509 			ROTATE_BUFFERS(d);
510 			break;
511 		}
512 		if (ISSET(ioflag, IO_NDELAY)) {
513 			/* User requested non-blocking I/O */
514 			error = EWOULDBLOCK;
515 		} else if (d->bd_rtout == 0) {
516 			/* No read timeout set. */
517 			d->bd_nreaders++;
518 			error = msleep_nsec(d, &d->bd_mtx, PRINET|PCATCH,
519 			    "bpf", INFSLP);
520 			d->bd_nreaders--;
521 		} else if ((now = nsecuptime()) < end) {
522 			/* Read timeout has not expired yet. */
523 			d->bd_nreaders++;
524 			error = msleep_nsec(d, &d->bd_mtx, PRINET|PCATCH,
525 			    "bpf", end - now);
526 			d->bd_nreaders--;
527 		} else {
528 			/* Read timeout has expired. */
529 			error = EWOULDBLOCK;
530 		}
531 		if (error == EINTR || error == ERESTART)
532 			goto out;
533 		if (error == EWOULDBLOCK) {
534 			/*
535 			 * On a timeout, return what's in the buffer,
536 			 * which may be nothing.  If there is something
537 			 * in the store buffer, we can rotate the buffers.
538 			 */
539 			if (d->bd_hbuf != NULL)
540 				/*
541 				 * We filled up the buffer in between
542 				 * getting the timeout and arriving
543 				 * here, so we don't need to rotate.
544 				 */
545 				break;
546 
547 			if (d->bd_slen == 0) {
548 				error = 0;
549 				goto out;
550 			}
551 			ROTATE_BUFFERS(d);
552 			break;
553 		}
554 	}
555 	/*
556 	 * At this point, we know we have something in the hold slot.
557 	 */
558 	hbuf = d->bd_hbuf;
559 	hlen = d->bd_hlen;
560 	d->bd_hbuf = NULL;
561 	d->bd_hlen = 0;
562 	d->bd_fbuf = NULL;
563 	d->bd_in_uiomove = 1;
564 
565 	/*
566 	 * Move data from hold buffer into user space.
567 	 * We know the entire buffer is transferred since
568 	 * we checked above that the read buffer is bpf_bufsize bytes.
569 	 */
570 	mtx_leave(&d->bd_mtx);
571 	error = uiomove(hbuf, hlen, uio);
572 	mtx_enter(&d->bd_mtx);
573 
574 	/* Ensure that bpf_resetd() or ROTATE_BUFFERS() haven't been called. */
575 	KASSERT(d->bd_fbuf == NULL);
576 	KASSERT(d->bd_hbuf == NULL);
577 	d->bd_fbuf = hbuf;
578 	d->bd_in_uiomove = 0;
579 out:
580 	mtx_leave(&d->bd_mtx);
581 	bpf_put(d);
582 
583 	return (error);
584 }
585 
586 /*
587  * If there are processes sleeping on this descriptor, wake them up.
588  */
589 void
590 bpf_wakeup(struct bpf_d *d)
591 {
592 	MUTEX_ASSERT_LOCKED(&d->bd_mtx);
593 
594 	if (d->bd_nreaders)
595 		wakeup(d);
596 
597 	knote_locked(&d->bd_klist, 0);
598 
599 	/*
600 	 * As long as pgsigio() needs to be protected
601 	 * by the KERNEL_LOCK() we have to delay the wakeup to
602 	 * another context to keep the hot path KERNEL_LOCK()-free.
603 	 */
604 	if (d->bd_async && d->bd_sig) {
605 		bpf_get(d);
606 		if (!task_add(systq, &d->bd_wake_task))
607 			bpf_put(d);
608 	}
609 }
610 
611 void
612 bpf_wakeup_cb(void *xd)
613 {
614 	struct bpf_d *d = xd;
615 
616 	if (d->bd_async && d->bd_sig)
617 		pgsigio(&d->bd_sigio, d->bd_sig, 0);
618 
619 	bpf_put(d);
620 }
621 
622 void
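/*
 * bd_wait_tmo handler: the wait timeout (BIOCSWTIMEOUT) has expired,
 * so mark the store buffer as ready and wake up any waiting readers.
 */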
623 bpf_wait_cb(void *xd)
624 {
625 	struct bpf_d *d = xd;
626 
627 	mtx_enter(&d->bd_mtx);
628 	if (d->bd_state == BPF_S_WAIT) {
629 		d->bd_state = BPF_S_DONE;
630 		bpf_wakeup(d);
631 	}
632 	mtx_leave(&d->bd_mtx);
633 
634 	bpf_put(d);
635 }
636 
637 int
638 bpfwrite(dev_t dev, struct uio *uio, int ioflag)
639 {
640 	struct bpf_d *d;
641 	struct ifnet *ifp;
642 	struct mbuf *m;
643 	int error;
644 	struct sockaddr_storage dst;
645 
646 	KERNEL_ASSERT_LOCKED();
647 
648 	d = bpfilter_lookup(minor(dev));
649 	if (d->bd_bif == NULL)
650 		return (ENXIO);
651 
652 	bpf_get(d);
653 	ifp = d->bd_bif->bif_ifp;
654 
655 	if (ifp == NULL || (ifp->if_flags & IFF_UP) == 0) {
656 		error = ENETDOWN;
657 		goto out;
658 	}
659 
660 	if (uio->uio_resid == 0) {
661 		error = 0;
662 		goto out;
663 	}
664 
665 	error = bpf_movein(uio, d, &m, sstosa(&dst));
666 	if (error)
667 		goto out;
668 
669 	if (m->m_pkthdr.len > ifp->if_mtu) {
670 		m_freem(m);
671 		error = EMSGSIZE;
672 		goto out;
673 	}
674 
675 	m->m_pkthdr.ph_rtableid = ifp->if_rdomain;
676 	m->m_pkthdr.pf.prio = ifp->if_llprio;
677 
678 	if (d->bd_hdrcmplt && dst.ss_family == AF_UNSPEC)
679 		dst.ss_family = pseudo_AF_HDRCMPLT;
680 
681 	NET_LOCK();
682 	error = ifp->if_output(ifp, m, sstosa(&dst), NULL);
683 	NET_UNLOCK();
684 
685 out:
686 	bpf_put(d);
687 	return (error);
688 }
689 
690 /*
691  * Reset a descriptor by flushing its packet buffer and clearing the
692  * receive and drop counts.
693  */
694 void
695 bpf_resetd(struct bpf_d *d)
696 {
697 	MUTEX_ASSERT_LOCKED(&d->bd_mtx);
698 	KASSERT(d->bd_in_uiomove == 0);
699 
700 	if (timeout_del(&d->bd_wait_tmo))
701 		bpf_put(d);
702 
703 	if (d->bd_hbuf != NULL) {
704 		/* Free the hold buffer. */
705 		d->bd_fbuf = d->bd_hbuf;
706 		d->bd_hbuf = NULL;
707 	}
708 	d->bd_state = BPF_S_IDLE;
709 	d->bd_slen = 0;
710 	d->bd_hlen = 0;
711 	d->bd_rcount = 0;
712 	d->bd_dcount = 0;
713 }
714 
715 static int
716 bpf_set_wtout(struct bpf_d *d, uint64_t wtout)
717 {
718 	mtx_enter(&d->bd_mtx);
719 	d->bd_wtout = wtout;
720 	mtx_leave(&d->bd_mtx);
721 
722 	return (0);
723 }
724 
725 static int
726 bpf_set_wtimeout(struct bpf_d *d, const struct timeval *tv)
727 {
728 	uint64_t nsec;
729 
730 	if (tv->tv_sec < 0 || !timerisvalid(tv))
731 		return (EINVAL);
732 
733 	nsec = TIMEVAL_TO_NSEC(tv);
734 	if (nsec > MAXTSLP)
735 		return (EOVERFLOW);
736 
737 	return (bpf_set_wtout(d, nsec));
738 }
739 
740 static int
741 bpf_get_wtimeout(struct bpf_d *d, struct timeval *tv)
742 {
743 	uint64_t nsec;
744 
745 	mtx_enter(&d->bd_mtx);
746 	nsec = d->bd_wtout;
747 	mtx_leave(&d->bd_mtx);
748 
749 	if (nsec == INFSLP)
750 		return (ENXIO);
751 
752 	memset(tv, 0, sizeof(*tv));
753 	NSEC_TO_TIMEVAL(nsec, tv);
754 
755 	return (0);
756 }
757 
758 /*
759  *  FIONREAD		Check for read packet available.
760  *  BIOCGBLEN		Get buffer len [for read()].
761  *  BIOCSETF		Set link layer read filter.
762  *  BIOCFLUSH		Flush read packet buffer.
763  *  BIOCPROMISC		Put interface into promiscuous mode.
764  *  BIOCGDLTLIST	Get supported link layer types.
765  *  BIOCGDLT		Get link layer type.
766  *  BIOCSDLT		Set link layer type.
767  *  BIOCGETIF		Get interface name.
768  *  BIOCSETIF		Set interface.
769  *  BIOCSRTIMEOUT	Set read timeout.
770  *  BIOCGRTIMEOUT	Get read timeout.
771  *  BIOCSWTIMEOUT	Set wait timeout.
772  *  BIOCGWTIMEOUT	Get wait timeout.
773  *  BIOCDWTIMEOUT	Del wait timeout.
774  *  BIOCGSTATS		Get packet stats.
775  *  BIOCIMMEDIATE	Set immediate mode.
776  *  BIOCVERSION		Get filter language version.
777  *  BIOCGHDRCMPLT	Get "header already complete" flag
778  *  BIOCSHDRCMPLT	Set "header already complete" flag
779  */
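/*
 * Illustrative userland sketch of the usual open/attach/read setup
 * (an assumption for documentation only, not part of this file; the
 * device path and interface name are examples):
 *
 *	int fd = open("/dev/bpf0", O_RDWR);
 *	struct ifreq ifr;
 *	u_int blen;
 *
 *	strlcpy(ifr.ifr_name, "em0", sizeof(ifr.ifr_name));
 *	ioctl(fd, BIOCSETIF, &ifr);	attach to the interface
 *	ioctl(fd, BIOCGBLEN, &blen);	read() must use exactly blen bytes
 */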
780 int
781 bpfioctl(dev_t dev, u_long cmd, caddr_t addr, int flag, struct proc *p)
782 {
783 	struct bpf_d *d;
784 	int error = 0;
785 
786 	d = bpfilter_lookup(minor(dev));
787 	if (d->bd_locked && suser(p) != 0) {
788 		/* list of allowed ioctls when locked and not root */
789 		switch (cmd) {
790 		case BIOCGBLEN:
791 		case BIOCFLUSH:
792 		case BIOCGDLT:
793 		case BIOCGDLTLIST:
794 		case BIOCGETIF:
795 		case BIOCGRTIMEOUT:
796 		case BIOCGWTIMEOUT:
797 		case BIOCGSTATS:
798 		case BIOCVERSION:
799 		case BIOCGRSIG:
800 		case BIOCGHDRCMPLT:
801 		case FIONREAD:
802 		case BIOCLOCK:
803 		case BIOCSRTIMEOUT:
804 		case BIOCSWTIMEOUT:
805 		case BIOCDWTIMEOUT:
806 		case BIOCIMMEDIATE:
807 		case TIOCGPGRP:
808 		case BIOCGDIRFILT:
809 			break;
810 		default:
811 			return (EPERM);
812 		}
813 	}
814 
815 	bpf_get(d);
816 
817 	switch (cmd) {
818 	default:
819 		error = EINVAL;
820 		break;
821 
822 	/*
823 	 * Check for read packet available.
824 	 */
825 	case FIONREAD:
826 		{
827 			int n;
828 
829 			mtx_enter(&d->bd_mtx);
830 			n = d->bd_slen;
831 			if (d->bd_hbuf != NULL)
832 				n += d->bd_hlen;
833 			mtx_leave(&d->bd_mtx);
834 
835 			*(int *)addr = n;
836 			break;
837 		}
838 
839 	/*
840 	 * Get buffer len [for read()].
841 	 */
842 	case BIOCGBLEN:
843 		*(u_int *)addr = d->bd_bufsize;
844 		break;
845 
846 	/*
847 	 * Set buffer length.
848 	 */
849 	case BIOCSBLEN:
850 		if (d->bd_bif != NULL)
851 			error = EINVAL;
852 		else {
853 			u_int size = *(u_int *)addr;
854 
855 			if (size > bpf_maxbufsize)
856 				*(u_int *)addr = size = bpf_maxbufsize;
857 			else if (size < BPF_MINBUFSIZE)
858 				*(u_int *)addr = size = BPF_MINBUFSIZE;
859 			mtx_enter(&d->bd_mtx);
860 			d->bd_bufsize = size;
861 			mtx_leave(&d->bd_mtx);
862 		}
863 		break;
864 
865 	/*
866 	 * Set link layer read filter.
867 	 */
868 	case BIOCSETF:
869 		error = bpf_setf(d, (struct bpf_program *)addr, 0);
870 		break;
871 
872 	/*
873 	 * Set link layer write filter.
874 	 */
875 	case BIOCSETWF:
876 		error = bpf_setf(d, (struct bpf_program *)addr, 1);
877 		break;
878 
879 	/*
880 	 * Flush read packet buffer.
881 	 */
882 	case BIOCFLUSH:
883 		mtx_enter(&d->bd_mtx);
884 		bpf_resetd(d);
885 		mtx_leave(&d->bd_mtx);
886 		break;
887 
888 	/*
889 	 * Put interface into promiscuous mode.
890 	 */
891 	case BIOCPROMISC:
892 		if (d->bd_bif == NULL) {
893 			/*
894 			 * No interface attached yet.
895 			 */
896 			error = EINVAL;
897 		} else if (d->bd_bif->bif_ifp != NULL) {
898 			if (d->bd_promisc == 0) {
899 				MUTEX_ASSERT_UNLOCKED(&d->bd_mtx);
900 				NET_LOCK();
901 				error = ifpromisc(d->bd_bif->bif_ifp, 1);
902 				NET_UNLOCK();
903 				if (error == 0)
904 					d->bd_promisc = 1;
905 			}
906 		}
907 		break;
908 
909 	/*
910 	 * Get a list of supported device parameters.
911 	 */
912 	case BIOCGDLTLIST:
913 		if (d->bd_bif == NULL)
914 			error = EINVAL;
915 		else
916 			error = bpf_getdltlist(d, (struct bpf_dltlist *)addr);
917 		break;
918 
919 	/*
920 	 * Get device parameters.
921 	 */
922 	case BIOCGDLT:
923 		if (d->bd_bif == NULL)
924 			error = EINVAL;
925 		else
926 			*(u_int *)addr = d->bd_bif->bif_dlt;
927 		break;
928 
929 	/*
930 	 * Set device parameters.
931 	 */
932 	case BIOCSDLT:
933 		if (d->bd_bif == NULL)
934 			error = EINVAL;
935 		else {
936 			mtx_enter(&d->bd_mtx);
937 			error = bpf_setdlt(d, *(u_int *)addr);
938 			mtx_leave(&d->bd_mtx);
939 		}
940 		break;
941 
942 	/*
943 	 * Get interface name.
944 	 */
945 	case BIOCGETIF:
946 		if (d->bd_bif == NULL)
947 			error = EINVAL;
948 		else
949 			bpf_ifname(d->bd_bif, (struct ifreq *)addr);
950 		break;
951 
952 	/*
953 	 * Set interface.
954 	 */
955 	case BIOCSETIF:
956 		error = bpf_setif(d, (struct ifreq *)addr);
957 		break;
958 
959 	/*
960 	 * Set read timeout.
961 	 */
962 	case BIOCSRTIMEOUT:
963 		{
964 			struct timeval *tv = (struct timeval *)addr;
965 			uint64_t rtout;
966 
967 			if (tv->tv_sec < 0 || !timerisvalid(tv)) {
968 				error = EINVAL;
969 				break;
970 			}
971 			rtout = TIMEVAL_TO_NSEC(tv);
972 			if (rtout > MAXTSLP) {
973 				error = EOVERFLOW;
974 				break;
975 			}
976 			mtx_enter(&d->bd_mtx);
977 			d->bd_rtout = rtout;
978 			mtx_leave(&d->bd_mtx);
979 			break;
980 		}
981 
982 	/*
983 	 * Get read timeout.
984 	 */
985 	case BIOCGRTIMEOUT:
986 		{
987 			struct timeval *tv = (struct timeval *)addr;
988 
989 			memset(tv, 0, sizeof(*tv));
990 			mtx_enter(&d->bd_mtx);
991 			NSEC_TO_TIMEVAL(d->bd_rtout, tv);
992 			mtx_leave(&d->bd_mtx);
993 			break;
994 		}
995 
996 	/*
997 	 * Get packet stats.
998 	 */
999 	case BIOCGSTATS:
1000 		{
1001 			struct bpf_stat *bs = (struct bpf_stat *)addr;
1002 
1003 			bs->bs_recv = d->bd_rcount;
1004 			bs->bs_drop = d->bd_dcount;
1005 			break;
1006 		}
1007 
1008 	/*
1009 	 * Set immediate mode.
1010 	 */
1011 	case BIOCIMMEDIATE:
1012 		error = bpf_set_wtout(d, *(int *)addr ? 0 : INFSLP);
1013 		break;
1014 
1015 	/*
1016 	 * Wait timeout.
1017 	 */
1018 	case BIOCSWTIMEOUT:
1019 		error = bpf_set_wtimeout(d, (const struct timeval *)addr);
1020 		break;
1021 	case BIOCGWTIMEOUT:
1022 		error = bpf_get_wtimeout(d, (struct timeval *)addr);
1023 		break;
1024 	case BIOCDWTIMEOUT:
1025 		error = bpf_set_wtout(d, INFSLP);
1026 		break;
1027 
1028 	case BIOCVERSION:
1029 		{
1030 			struct bpf_version *bv = (struct bpf_version *)addr;
1031 
1032 			bv->bv_major = BPF_MAJOR_VERSION;
1033 			bv->bv_minor = BPF_MINOR_VERSION;
1034 			break;
1035 		}
1036 
1037 	case BIOCGHDRCMPLT:	/* get "header already complete" flag */
1038 		*(u_int *)addr = d->bd_hdrcmplt;
1039 		break;
1040 
1041 	case BIOCSHDRCMPLT:	/* set "header already complete" flag */
1042 		d->bd_hdrcmplt = *(u_int *)addr ? 1 : 0;
1043 		break;
1044 
1045 	case BIOCLOCK:		/* set "locked" flag (no reset) */
1046 		d->bd_locked = 1;
1047 		break;
1048 
1049 	case BIOCGFILDROP:	/* get "filter-drop" flag */
1050 		*(u_int *)addr = d->bd_fildrop;
1051 		break;
1052 
1053 	case BIOCSFILDROP: {	/* set "filter-drop" flag */
1054 		unsigned int fildrop = *(u_int *)addr;
1055 		switch (fildrop) {
1056 		case BPF_FILDROP_PASS:
1057 		case BPF_FILDROP_CAPTURE:
1058 		case BPF_FILDROP_DROP:
1059 			d->bd_fildrop = fildrop;
1060 			break;
1061 		default:
1062 			error = EINVAL;
1063 			break;
1064 		}
1065 		break;
1066 	}
1067 
1068 	case BIOCGDIRFILT:	/* get direction filter */
1069 		*(u_int *)addr = d->bd_dirfilt;
1070 		break;
1071 
1072 	case BIOCSDIRFILT:	/* set direction filter */
1073 		d->bd_dirfilt = (*(u_int *)addr) &
1074 		    (BPF_DIRECTION_IN|BPF_DIRECTION_OUT);
1075 		break;
1076 
1077 	case FIONBIO:		/* Non-blocking I/O */
1078 		/* let vfs keep track of this */
1079 		break;
1080 
1081 	case FIOASYNC:		/* Send signal on receive packets */
1082 		d->bd_async = *(int *)addr;
1083 		break;
1084 
1085 	case FIOSETOWN:		/* Process or group to send signals to */
1086 	case TIOCSPGRP:
1087 		error = sigio_setown(&d->bd_sigio, cmd, addr);
1088 		break;
1089 
1090 	case FIOGETOWN:
1091 	case TIOCGPGRP:
1092 		sigio_getown(&d->bd_sigio, cmd, addr);
1093 		break;
1094 
1095 	case BIOCSRSIG:		/* Set receive signal */
1096 		{
1097 			u_int sig;
1098 
1099 			sig = *(u_int *)addr;
1100 
1101 			if (sig >= NSIG)
1102 				error = EINVAL;
1103 			else
1104 				d->bd_sig = sig;
1105 			break;
1106 		}
1107 	case BIOCGRSIG:
1108 		*(u_int *)addr = d->bd_sig;
1109 		break;
1110 	}
1111 
1112 	bpf_put(d);
1113 	return (error);
1114 }
1115 
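/*
 * Illustrative userland sketch of installing an accept-all read filter
 * with BIOCSETF (an assumption for documentation only, not part of
 * this file):
 *
 *	struct bpf_insn insns[] = {
 *		BPF_STMT(BPF_RET+BPF_K, (u_int)-1),
 *	};
 *	struct bpf_program prog = { 1, insns };
 *	ioctl(fd, BIOCSETF, &prog);
 */
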
1116 /*
1117  * Set d's packet filter program to fp.  If this file already has a filter,
1118  * free it and replace it.  Returns EINVAL for bogus requests.
1119  */
1120 int
1121 bpf_setf(struct bpf_d *d, struct bpf_program *fp, int wf)
1122 {
1123 	struct bpf_program_smr *bps, *old_bps;
1124 	struct bpf_insn *fcode;
1125 	u_int flen, size;
1126 
1127 	KERNEL_ASSERT_LOCKED();
1128 
1129 	if (fp->bf_insns == 0) {
1130 		if (fp->bf_len != 0)
1131 			return (EINVAL);
1132 		bps = NULL;
1133 	} else {
1134 		flen = fp->bf_len;
1135 		if (flen > BPF_MAXINSNS)
1136 			return (EINVAL);
1137 
1138 		fcode = mallocarray(flen, sizeof(*fp->bf_insns), M_DEVBUF,
1139 		    M_WAITOK | M_CANFAIL);
1140 		if (fcode == NULL)
1141 			return (ENOMEM);
1142 
1143 		size = flen * sizeof(*fp->bf_insns);
1144 		if (copyin(fp->bf_insns, fcode, size) != 0 ||
1145 		    bpf_validate(fcode, (int)flen) == 0) {
1146 			free(fcode, M_DEVBUF, size);
1147 			return (EINVAL);
1148 		}
1149 
1150 		bps = malloc(sizeof(*bps), M_DEVBUF, M_WAITOK);
1151 		smr_init(&bps->bps_smr);
1152 		bps->bps_bf.bf_len = flen;
1153 		bps->bps_bf.bf_insns = fcode;
1154 	}
1155 
1156 	if (wf == 0) {
1157 		old_bps = SMR_PTR_GET_LOCKED(&d->bd_rfilter);
1158 		SMR_PTR_SET_LOCKED(&d->bd_rfilter, bps);
1159 	} else {
1160 		old_bps = SMR_PTR_GET_LOCKED(&d->bd_wfilter);
1161 		SMR_PTR_SET_LOCKED(&d->bd_wfilter, bps);
1162 	}
1163 
1164 	mtx_enter(&d->bd_mtx);
1165 	bpf_resetd(d);
1166 	mtx_leave(&d->bd_mtx);
1167 	if (old_bps != NULL)
1168 		smr_call(&old_bps->bps_smr, bpf_prog_smr, old_bps);
1169 
1170 	return (0);
1171 }
1172 
1173 /*
1174  * Detach a file from its current interface (if attached at all) and attach
1175  * to the interface indicated by the name stored in ifr.
1176  * Return an errno or 0.
1177  */
1178 int
1179 bpf_setif(struct bpf_d *d, struct ifreq *ifr)
1180 {
1181 	struct bpf_if *bp, *candidate = NULL;
1182 	int error = 0;
1183 
1184 	/*
1185 	 * Look through attached interfaces for the named one.
1186 	 */
1187 	for (bp = bpf_iflist; bp != NULL; bp = bp->bif_next) {
1188 		if (strcmp(bp->bif_name, ifr->ifr_name) != 0)
1189 			continue;
1190 
1191 		if (candidate == NULL || candidate->bif_dlt > bp->bif_dlt)
1192 			candidate = bp;
1193 	}
1194 
1195 	/* Not found. */
1196 	if (candidate == NULL)
1197 		return (ENXIO);
1198 
1199 	/*
1200 	 * Allocate the packet buffers if we need to.
1201 	 * If we're already attached to the requested interface,
1202 	 * just flush the buffer.
1203 	 */
1204 	mtx_enter(&d->bd_mtx);
1205 	if (d->bd_sbuf == NULL) {
1206 		if ((error = bpf_allocbufs(d)))
1207 			goto out;
1208 	}
1209 	if (candidate != d->bd_bif) {
1210 		/*
1211 		 * Detach if attached to something else.
1212 		 */
1213 		bpf_detachd(d);
1214 		bpf_attachd(d, candidate);
1215 	}
1216 	bpf_resetd(d);
1217 out:
1218 	mtx_leave(&d->bd_mtx);
1219 	return (error);
1220 }
1221 
1222 /*
1223  * Copy the interface name to the ifreq.
1224  */
1225 void
1226 bpf_ifname(struct bpf_if *bif, struct ifreq *ifr)
1227 {
1228 	bcopy(bif->bif_name, ifr->ifr_name, sizeof(ifr->ifr_name));
1229 }
1230 
1231 const struct filterops bpfread_filtops = {
1232 	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
1233 	.f_attach	= NULL,
1234 	.f_detach	= filt_bpfrdetach,
1235 	.f_event	= filt_bpfread,
1236 	.f_modify	= filt_bpfreadmodify,
1237 	.f_process	= filt_bpfreadprocess,
1238 };
1239 
1240 int
1241 bpfkqfilter(dev_t dev, struct knote *kn)
1242 {
1243 	struct bpf_d *d;
1244 	struct klist *klist;
1245 
1246 	KERNEL_ASSERT_LOCKED();
1247 
1248 	d = bpfilter_lookup(minor(dev));
1249 	if (d == NULL)
1250 		return (ENXIO);
1251 
1252 	switch (kn->kn_filter) {
1253 	case EVFILT_READ:
1254 		klist = &d->bd_klist;
1255 		kn->kn_fop = &bpfread_filtops;
1256 		break;
1257 	default:
1258 		return (EINVAL);
1259 	}
1260 
1261 	bpf_get(d);
1262 	kn->kn_hook = d;
1263 	klist_insert(klist, kn);
1264 
1265 	return (0);
1266 }
1267 
1268 void
1269 filt_bpfrdetach(struct knote *kn)
1270 {
1271 	struct bpf_d *d = kn->kn_hook;
1272 
1273 	klist_remove(&d->bd_klist, kn);
1274 	bpf_put(d);
1275 }
1276 
1277 int
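/*
 * kqueue read filter: data is available when the hold buffer has
 * data; in immediate mode (bd_wtout == 0) the store buffer counts
 * as well.
 */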
1278 filt_bpfread(struct knote *kn, long hint)
1279 {
1280 	struct bpf_d *d = kn->kn_hook;
1281 
1282 	MUTEX_ASSERT_LOCKED(&d->bd_mtx);
1283 
1284 	kn->kn_data = d->bd_hlen;
1285 	if (d->bd_wtout == 0)
1286 		kn->kn_data += d->bd_slen;
1287 
1288 	return (kn->kn_data > 0);
1289 }
1290 
1291 int
1292 filt_bpfreadmodify(struct kevent *kev, struct knote *kn)
1293 {
1294 	struct bpf_d *d = kn->kn_hook;
1295 	int active;
1296 
1297 	mtx_enter(&d->bd_mtx);
1298 	active = knote_modify_fn(kev, kn, filt_bpfread);
1299 	mtx_leave(&d->bd_mtx);
1300 
1301 	return (active);
1302 }
1303 
1304 int
1305 filt_bpfreadprocess(struct knote *kn, struct kevent *kev)
1306 {
1307 	struct bpf_d *d = kn->kn_hook;
1308 	int active;
1309 
1310 	mtx_enter(&d->bd_mtx);
1311 	active = knote_process_fn(kn, kev, filt_bpfread);
1312 	mtx_leave(&d->bd_mtx);
1313 
1314 	return (active);
1315 }
1316 
1317 /*
1318  * Copy data from an mbuf chain into a buffer.  This code is derived
1319  * from m_copydata in sys/uipc_mbuf.c.
1320  */
1321 void
1322 bpf_mcopy(const void *src_arg, void *dst_arg, size_t len)
1323 {
1324 	const struct mbuf *m;
1325 	u_int count;
1326 	u_char *dst;
1327 
1328 	m = src_arg;
1329 	dst = dst_arg;
1330 	while (len > 0) {
1331 		if (m == NULL)
1332 			panic("bpf_mcopy");
1333 		count = min(m->m_len, len);
1334 		bcopy(mtod(m, caddr_t), (caddr_t)dst, count);
1335 		m = m->m_next;
1336 		dst += count;
1337 		len -= count;
1338 	}
1339 }
1340 
1341 int
1342 bpf_mtap(caddr_t arg, const struct mbuf *m, u_int direction)
1343 {
1344 	return _bpf_mtap(arg, m, m, direction);
1345 }
1346 
1347 int
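/*
 * Common tap routine.  ``mp'' is the original packet and supplies the
 * pkthdr metadata (interface, flow id, priority, timestamp); ``m'' is
 * the chain the filter runs over and that gets captured, which callers
 * may have extended with a dummy header without touching ``mp''.
 */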
1348 _bpf_mtap(caddr_t arg, const struct mbuf *mp, const struct mbuf *m,
1349     u_int direction)
1350 {
1351 	struct bpf_if *bp = (struct bpf_if *)arg;
1352 	struct bpf_d *d;
1353 	size_t pktlen, slen;
1354 	const struct mbuf *m0;
1355 	struct bpf_hdr tbh;
1356 	int gothdr = 0;
1357 	int drop = 0;
1358 
1359 	if (m == NULL)
1360 		return (0);
1361 
1362 	if (bp == NULL)
1363 		return (0);
1364 
1365 	pktlen = 0;
1366 	for (m0 = m; m0 != NULL; m0 = m0->m_next)
1367 		pktlen += m0->m_len;
1368 
1369 	smr_read_enter();
1370 	SMR_SLIST_FOREACH(d, &bp->bif_dlist, bd_next) {
1371 		struct bpf_program_smr *bps;
1372 		struct bpf_insn *fcode = NULL;
1373 
1374 		atomic_inc_long(&d->bd_rcount);
1375 
1376 		if (ISSET(d->bd_dirfilt, direction))
1377 			continue;
1378 
1379 		bps = SMR_PTR_GET(&d->bd_rfilter);
1380 		if (bps != NULL)
1381 			fcode = bps->bps_bf.bf_insns;
1382 		slen = bpf_mfilter(fcode, m, pktlen);
1383 
1384 		if (slen == 0)
1385 			continue;
1386 		if (d->bd_fildrop != BPF_FILDROP_PASS)
1387 			drop = 1;
1388 		if (d->bd_fildrop != BPF_FILDROP_DROP) {
1389 			if (!gothdr) {
1390 				struct timeval tv;
1391 				memset(&tbh, 0, sizeof(tbh));
1392 
1393 				if (ISSET(mp->m_flags, M_PKTHDR)) {
1394 					tbh.bh_ifidx = mp->m_pkthdr.ph_ifidx;
1395 					tbh.bh_flowid = mp->m_pkthdr.ph_flowid;
1396 					tbh.bh_flags = mp->m_pkthdr.pf.prio;
1397 					if (ISSET(mp->m_pkthdr.csum_flags,
1398 					    M_FLOWID))
1399 						SET(tbh.bh_flags, BPF_F_FLOWID);
1400 
1401 					m_microtime(mp, &tv);
1402 				} else
1403 					microtime(&tv);
1404 
1405 				tbh.bh_tstamp.tv_sec = tv.tv_sec;
1406 				tbh.bh_tstamp.tv_usec = tv.tv_usec;
1407 				SET(tbh.bh_flags, direction << BPF_F_DIR_SHIFT);
1408 
1409 				gothdr = 1;
1410 			}
1411 
1412 			mtx_enter(&d->bd_mtx);
1413 			bpf_catchpacket(d, (u_char *)m, pktlen, slen, &tbh);
1414 			mtx_leave(&d->bd_mtx);
1415 		}
1416 	}
1417 	smr_read_leave();
1418 
1419 	return (drop);
1420 }
1421 
1422 /*
1423  * Incoming linkage from device drivers, where a data buffer should be
1424  * prepended by an arbitrary header. In this situation we already have a
1425  * way of representing a chain of memory buffers, ie, mbufs, so reuse
1426  * the existing functionality by attaching the buffers to mbufs.
1427  *
1428  * Con up a minimal mbuf chain to pacify bpf by allocating (only) a
1429  * struct m_hdr each for the header and data on the stack.
1430  */
1431 int
1432 bpf_tap_hdr(caddr_t arg, const void *hdr, unsigned int hdrlen,
1433     const void *buf, unsigned int buflen, u_int direction)
1434 {
1435 	struct m_hdr mh, md;
1436 	struct mbuf *m0 = NULL;
1437 	struct mbuf **mp = &m0;
1438 
1439 	if (hdr != NULL) {
1440 		mh.mh_flags = 0;
1441 		mh.mh_next = NULL;
1442 		mh.mh_len = hdrlen;
1443 		mh.mh_data = (void *)hdr;
1444 
1445 		*mp = (struct mbuf *)&mh;
1446 		mp = &mh.mh_next;
1447 	}
1448 
1449 	if (buf != NULL) {
1450 		md.mh_flags = 0;
1451 		md.mh_next = NULL;
1452 		md.mh_len = buflen;
1453 		md.mh_data = (void *)buf;
1454 
1455 		*mp = (struct mbuf *)&md;
1456 	}
1457 
1458 	return bpf_mtap(arg, m0, direction);
1459 }
1460 
1461 /*
1462  * Incoming linkage from device drivers, where we have a mbuf chain
1463  * but need to prepend some arbitrary header from a linear buffer.
1464  *
1465  * Con up a minimal dummy header to pacify bpf.  Allocate (only) a
1466  * struct m_hdr on the stack.  This is safe as bpf only reads from the
1467  * fields in this header that we initialize, and will not try to free
1468  * it or keep a pointer to it.
1469  */
1470 int
1471 bpf_mtap_hdr(caddr_t arg, const void *data, u_int dlen, const struct mbuf *m,
1472     u_int direction)
1473 {
1474 	struct m_hdr mh;
1475 	const struct mbuf *m0;
1476 
1477 	if (dlen > 0) {
1478 		mh.mh_flags = 0;
1479 		mh.mh_next = (struct mbuf *)m;
1480 		mh.mh_len = dlen;
1481 		mh.mh_data = (void *)data;
1482 		m0 = (struct mbuf *)&mh;
1483 	} else
1484 		m0 = m;
1485 
1486 	return _bpf_mtap(arg, m, m0, direction);
1487 }
1488 
1489 /*
1490  * Incoming linkage from device drivers, where we have a mbuf chain
1491  * but need to prepend the address family.
1492  *
1493  * Con up a minimal dummy header to pacify bpf.  We allocate (only) a
1494  * struct m_hdr on the stack.  This is safe as bpf only reads from the
1495  * fields in this header that we initialize, and will not try to free
1496  * it or keep a pointer to it.
1497  */
1498 int
1499 bpf_mtap_af(caddr_t arg, u_int32_t af, const struct mbuf *m, u_int direction)
1500 {
1501 	u_int32_t    afh;
1502 
1503 	afh = htonl(af);
1504 
1505 	return bpf_mtap_hdr(arg, &afh, sizeof(afh), m, direction);
1506 }
1507 
1508 /*
1509  * Incoming linkage from device drivers, where we have a mbuf chain
1510  * but need to prepend a VLAN encapsulation header.
1511  *
1512  * Con up a minimal dummy header to pacify bpf.  Allocate (only) two
1513  * struct m_hdrs on the stack.  This is safe as bpf only reads from the
1514  * fields in these headers that we initialize, and will not try to free
1515  * them or keep a pointer to them.
1516  */
1517 int
1518 bpf_mtap_ether(caddr_t arg, const struct mbuf *m, u_int direction)
1519 {
1520 #if NVLAN > 0
1521 	struct ether_vlan_header evh;
1522 	struct m_hdr mh, md;
1523 
1524 	if ((m->m_flags & M_VLANTAG) == 0)
1525 #endif
1526 	{
1527 		return _bpf_mtap(arg, m, m, direction);
1528 	}
1529 
1530 #if NVLAN > 0
1531 	KASSERT(m->m_len >= ETHER_HDR_LEN);
1532 
1533 	memcpy(&evh, mtod(m, char *), ETHER_HDR_LEN);
1534 	evh.evl_proto = evh.evl_encap_proto;
1535 	evh.evl_encap_proto = htons(ETHERTYPE_VLAN);
1536 	evh.evl_tag = htons(m->m_pkthdr.ether_vtag);
1537 
1538 	mh.mh_flags = 0;
1539 	mh.mh_data = (caddr_t)&evh;
1540 	mh.mh_len = sizeof(evh);
1541 	mh.mh_next = (struct mbuf *)&md;
1542 
1543 	md.mh_flags = 0;
1544 	md.mh_data = m->m_data + ETHER_HDR_LEN;
1545 	md.mh_len = m->m_len - ETHER_HDR_LEN;
1546 	md.mh_next = m->m_next;
1547 
1548 	return _bpf_mtap(arg, m, (struct mbuf *)&mh, direction);
1549 #endif
1550 }
1551 
1552 /*
1553  * Move the packet data from interface memory (pkt) into the
1554  * store buffer.  Wake up listeners if needed.
1555  * The data is copied out of the mbuf chain with bpf_mcopy(), so
1556  * pkt is really an mbuf rather than a contiguous buffer.
1559  */
1560 void
1561 bpf_catchpacket(struct bpf_d *d, u_char *pkt, size_t pktlen, size_t snaplen,
1562     const struct bpf_hdr *tbh)
1563 {
1564 	struct bpf_hdr *bh;
1565 	int totlen, curlen;
1566 	int hdrlen, do_wakeup = 0;
1567 
1568 	MUTEX_ASSERT_LOCKED(&d->bd_mtx);
1569 	if (d->bd_bif == NULL)
1570 		return;
1571 
1572 	hdrlen = d->bd_bif->bif_hdrlen;
1573 
1574 	/*
1575 	 * Figure out how many bytes to move.  If the packet is
1576 	 * greater or equal to the snapshot length, transfer that
1577 	 * much.  Otherwise, transfer the whole packet (unless
1578 	 * we hit the buffer size limit).
1579 	 */
1580 	totlen = hdrlen + min(snaplen, pktlen);
1581 	if (totlen > d->bd_bufsize)
1582 		totlen = d->bd_bufsize;
1583 
1584 	/*
1585 	 * Round up the end of the previous packet to the next longword.
1586 	 */
1587 	curlen = BPF_WORDALIGN(d->bd_slen);
1588 	if (curlen + totlen > d->bd_bufsize) {
1589 		/*
1590 		 * This packet will overflow the storage buffer.
1591 		 * Rotate the buffers if we can, then wakeup any
1592 		 * pending reads.
1593 		 */
1594 		if (d->bd_fbuf == NULL) {
1595 			/*
1596 			 * We haven't completed the previous read yet,
1597 			 * so drop the packet.
1598 			 */
1599 			++d->bd_dcount;
1600 			return;
1601 		}
1602 
1603 		/* cancel pending wtime */
1604 		if (timeout_del(&d->bd_wait_tmo))
1605 			bpf_put(d);
1606 
1607 		ROTATE_BUFFERS(d);
1608 		do_wakeup = 1;
1609 		curlen = 0;
1610 	}
1611 
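	/*
	 * Each record in the store buffer is a bpf_hdr followed by the
	 * captured bytes, padded so that the next record starts on a
	 * BPF_WORDALIGN boundary; readers walk the buffer by advancing
	 * BPF_WORDALIGN(bh_hdrlen + bh_caplen) per record.
	 */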
1612 	/*
1613 	 * Append the bpf header.
1614 	 */
1615 	bh = (struct bpf_hdr *)(d->bd_sbuf + curlen);
1616 	*bh = *tbh;
1617 	bh->bh_datalen = pktlen;
1618 	bh->bh_hdrlen = hdrlen;
1619 	bh->bh_caplen = totlen - hdrlen;
1620 
1621 	/*
1622 	 * Copy the packet data into the store buffer and update its length.
1623 	 */
1624 	bpf_mcopy(pkt, (u_char *)bh + hdrlen, bh->bh_caplen);
1625 	d->bd_slen = curlen + totlen;
1626 
1627 	switch (d->bd_wtout) {
1628 	case 0:
1629 		/*
1630 		 * Immediate mode is set.  A packet arrived so any
1631 		 * reads should be woken up.
1632 		 */
1633 		if (d->bd_state == BPF_S_IDLE)
1634 			d->bd_state = BPF_S_DONE;
1635 		do_wakeup = 1;
1636 		break;
1637 	case INFSLP:
1638 		break;
1639 	default:
1640 		if (d->bd_state == BPF_S_IDLE) {
1641 			d->bd_state = BPF_S_WAIT;
1642 
1643 			bpf_get(d);
1644 			if (!timeout_add_nsec(&d->bd_wait_tmo, d->bd_wtout))
1645 				bpf_put(d);
1646 		}
1647 		break;
1648 	}
1649 
1650 	if (do_wakeup)
1651 		bpf_wakeup(d);
1652 }
1653 
1654 /*
1655  * Allocate the store and free packet buffers for a descriptor.
1656  */
1657 int
1658 bpf_allocbufs(struct bpf_d *d)
1659 {
1660 	MUTEX_ASSERT_LOCKED(&d->bd_mtx);
1661 
1662 	d->bd_fbuf = malloc(d->bd_bufsize, M_DEVBUF, M_NOWAIT);
1663 	if (d->bd_fbuf == NULL)
1664 		return (ENOMEM);
1665 
1666 	d->bd_sbuf = malloc(d->bd_bufsize, M_DEVBUF, M_NOWAIT);
1667 	if (d->bd_sbuf == NULL) {
1668 		free(d->bd_fbuf, M_DEVBUF, d->bd_bufsize);
1669 		d->bd_fbuf = NULL;
1670 		return (ENOMEM);
1671 	}
1672 
1673 	d->bd_slen = 0;
1674 	d->bd_hlen = 0;
1675 
1676 	return (0);
1677 }
1678 
1679 void
1680 bpf_prog_smr(void *bps_arg)
1681 {
1682 	struct bpf_program_smr *bps = bps_arg;
1683 
1684 	free(bps->bps_bf.bf_insns, M_DEVBUF,
1685 	    bps->bps_bf.bf_len * sizeof(struct bpf_insn));
1686 	free(bps, M_DEVBUF, sizeof(struct bpf_program_smr));
1687 }
1688 
1689 void
1690 bpf_d_smr(void *smr)
1691 {
1692 	struct bpf_d	*bd = smr;
1693 
1694 	sigio_free(&bd->bd_sigio);
1695 	free(bd->bd_sbuf, M_DEVBUF, bd->bd_bufsize);
1696 	free(bd->bd_hbuf, M_DEVBUF, bd->bd_bufsize);
1697 	free(bd->bd_fbuf, M_DEVBUF, bd->bd_bufsize);
1698 
1699 	if (bd->bd_rfilter != NULL)
1700 		bpf_prog_smr(bd->bd_rfilter);
1701 	if (bd->bd_wfilter != NULL)
1702 		bpf_prog_smr(bd->bd_wfilter);
1703 
1704 	klist_free(&bd->bd_klist);
1705 	free(bd, M_DEVBUF, sizeof(*bd));
1706 }
1707 
1708 void
1709 bpf_get(struct bpf_d *bd)
1710 {
1711 	refcnt_take(&bd->bd_refcnt);
1712 }
1713 
1714 /*
1715  * Free buffers currently in use by a descriptor
1716  * when the reference count drops to zero.
1717  */
1718 void
1719 bpf_put(struct bpf_d *bd)
1720 {
1721 	if (refcnt_rele(&bd->bd_refcnt) == 0)
1722 		return;
1723 
1724 	smr_call(&bd->bd_smr, bpf_d_smr, bd);
1725 }
1726 
1727 void *
1728 bpfsattach(caddr_t *bpfp, const char *name, u_int dlt, u_int hdrlen)
1729 {
1730 	struct bpf_if *bp;
1731 
1732 	if ((bp = malloc(sizeof(*bp), M_DEVBUF, M_NOWAIT)) == NULL)
1733 		panic("bpfattach");
1734 	SMR_SLIST_INIT(&bp->bif_dlist);
1735 	bp->bif_driverp = (struct bpf_if **)bpfp;
1736 	bp->bif_name = name;
1737 	bp->bif_ifp = NULL;
1738 	bp->bif_dlt = dlt;
1739 
1740 	bp->bif_next = bpf_iflist;
1741 	bpf_iflist = bp;
1742 
1743 	*bp->bif_driverp = NULL;
1744 
1745 	/*
1746 	 * Compute the length of the bpf header.  This is not necessarily
1747 	 * equal to SIZEOF_BPF_HDR because we want to insert spacing such
1748 	 * that the network layer header begins on a longword boundary (for
1749 	 * performance reasons and to alleviate alignment restrictions).
1750 	 */
1751 	bp->bif_hdrlen = BPF_WORDALIGN(hdrlen + SIZEOF_BPF_HDR) - hdrlen;
1752 
1753 	return (bp);
1754 }
1755 
1756 void
1757 bpfattach(caddr_t *driverp, struct ifnet *ifp, u_int dlt, u_int hdrlen)
1758 {
1759 	struct bpf_if *bp;
1760 
1761 	bp = bpfsattach(driverp, ifp->if_xname, dlt, hdrlen);
1762 	bp->bif_ifp = ifp;
1763 }
1764 
1765 /* Detach an interface from its attached bpf device.  */
1766 void
1767 bpfdetach(struct ifnet *ifp)
1768 {
1769 	struct bpf_if *bp, *nbp;
1770 
1771 	KERNEL_ASSERT_LOCKED();
1772 
1773 	for (bp = bpf_iflist; bp; bp = nbp) {
1774 		nbp = bp->bif_next;
1775 		if (bp->bif_ifp == ifp)
1776 			bpfsdetach(bp);
1777 	}
1778 	ifp->if_bpf = NULL;
1779 }
1780 
1781 void
1782 bpfsdetach(void *p)
1783 {
1784 	struct bpf_if *bp = p, *tbp;
1785 	struct bpf_d *bd;
1786 	int maj;
1787 
1788 	KERNEL_ASSERT_LOCKED();
1789 
1790 	/* Locate the major number. */
1791 	for (maj = 0; maj < nchrdev; maj++)
1792 		if (cdevsw[maj].d_open == bpfopen)
1793 			break;
1794 
1795 	while ((bd = SMR_SLIST_FIRST_LOCKED(&bp->bif_dlist))) {
1796 		vdevgone(maj, bd->bd_unit, bd->bd_unit, VCHR);
1797 		klist_invalidate(&bd->bd_klist);
1798 	}
1799 
1800 	for (tbp = bpf_iflist; tbp; tbp = tbp->bif_next) {
1801 		if (tbp->bif_next == bp) {
1802 			tbp->bif_next = bp->bif_next;
1803 			break;
1804 		}
1805 	}
1806 
1807 	if (bpf_iflist == bp)
1808 		bpf_iflist = bp->bif_next;
1809 
1810 	free(bp, M_DEVBUF, sizeof(*bp));
1811 }
1812 
1813 int
1814 bpf_sysctl_locked(int *name, u_int namelen, void *oldp, size_t *oldlenp,
1815     void *newp, size_t newlen)
1816 {
1817 	switch (name[0]) {
1818 	case NET_BPF_BUFSIZE:
1819 		return sysctl_int_bounded(oldp, oldlenp, newp, newlen,
1820 		    &bpf_bufsize, BPF_MINBUFSIZE, bpf_maxbufsize);
1821 	case NET_BPF_MAXBUFSIZE:
1822 		return sysctl_int_bounded(oldp, oldlenp, newp, newlen,
1823 		    &bpf_maxbufsize, BPF_MINBUFSIZE, INT_MAX);
1824 	default:
1825 		return (EOPNOTSUPP);
1826 	}
1827 }
1828 
1829 int
1830 bpf_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
1831     size_t newlen)
1832 {
1833 	int flags = RW_INTR;
1834 	int error;
1835 
1836 	if (namelen != 1)
1837 		return (ENOTDIR);
1838 
1839 	flags |= (newp == NULL) ? RW_READ : RW_WRITE;
1840 
1841 	error = rw_enter(&bpf_sysctl_lk, flags);
1842 	if (error != 0)
1843 		return (error);
1844 
1845 	error = bpf_sysctl_locked(name, namelen, oldp, oldlenp, newp, newlen);
1846 
1847 	rw_exit(&bpf_sysctl_lk);
1848 
1849 	return (error);
1850 }
1851 
1852 struct bpf_d *
1853 bpfilter_lookup(int unit)
1854 {
1855 	struct bpf_d *bd;
1856 
1857 	KERNEL_ASSERT_LOCKED();
1858 
1859 	LIST_FOREACH(bd, &bpf_d_list, bd_list)
1860 		if (bd->bd_unit == unit)
1861 			return (bd);
1862 	return (NULL);
1863 }
1864 
1865 /*
1866  * Get the list of data link types available on the interface.
1867  */
1868 int
1869 bpf_getdltlist(struct bpf_d *d, struct bpf_dltlist *bfl)
1870 {
1871 	int n, error;
1872 	struct bpf_if *bp;
1873 	const char *name;
1874 
1875 	name = d->bd_bif->bif_name;
1876 	n = 0;
1877 	error = 0;
1878 	for (bp = bpf_iflist; bp != NULL; bp = bp->bif_next) {
1879 		if (strcmp(name, bp->bif_name) != 0)
1880 			continue;
1881 		if (bfl->bfl_list != NULL) {
1882 			if (n >= bfl->bfl_len)
1883 				return (ENOMEM);
1884 			error = copyout(&bp->bif_dlt,
1885 			    bfl->bfl_list + n, sizeof(u_int));
1886 			if (error)
1887 				break;
1888 		}
1889 		n++;
1890 	}
1891 
1892 	bfl->bfl_len = n;
1893 	return (error);
1894 }
1895 
1896 /*
1897  * Set the data link type of a BPF instance.
1898  */
1899 int
1900 bpf_setdlt(struct bpf_d *d, u_int dlt)
1901 {
1902 	const char *name;
1903 	struct bpf_if *bp;
1904 
1905 	MUTEX_ASSERT_LOCKED(&d->bd_mtx);
1906 	if (d->bd_bif->bif_dlt == dlt)
1907 		return (0);
1908 	name = d->bd_bif->bif_name;
1909 	for (bp = bpf_iflist; bp != NULL; bp = bp->bif_next) {
1910 		if (strcmp(name, bp->bif_name) != 0)
1911 			continue;
1912 		if (bp->bif_dlt == dlt)
1913 			break;
1914 	}
1915 	if (bp == NULL)
1916 		return (EINVAL);
1917 	bpf_detachd(d);
1918 	bpf_attachd(d, bp);
1919 	bpf_resetd(d);
1920 	return (0);
1921 }
1922 
1923 u_int32_t	bpf_mbuf_ldw(const void *, u_int32_t, int *);
1924 u_int32_t	bpf_mbuf_ldh(const void *, u_int32_t, int *);
1925 u_int32_t	bpf_mbuf_ldb(const void *, u_int32_t, int *);
1926 
1927 int		bpf_mbuf_copy(const struct mbuf *, u_int32_t,
1928 		    void *, u_int32_t);
1929 
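/*
 * Load helpers used by _bpf_filter() to fetch big-endian 32, 16 and
 * 8 bit values straight out of an mbuf chain; a load that runs past
 * the end of the chain sets *err and returns 0.
 */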
1930 const struct bpf_ops bpf_mbuf_ops = {
1931 	bpf_mbuf_ldw,
1932 	bpf_mbuf_ldh,
1933 	bpf_mbuf_ldb,
1934 };
1935 
1936 int
1937 bpf_mbuf_copy(const struct mbuf *m, u_int32_t off, void *buf, u_int32_t len)
1938 {
1939 	u_int8_t *cp = buf;
1940 	u_int32_t count;
1941 
1942 	while (off >= m->m_len) {
1943 		off -= m->m_len;
1944 
1945 		m = m->m_next;
1946 		if (m == NULL)
1947 			return (-1);
1948 	}
1949 
1950 	for (;;) {
1951 		count = min(m->m_len - off, len);
1952 
1953 		memcpy(cp, m->m_data + off, count);
1954 		len -= count;
1955 
1956 		if (len == 0)
1957 			return (0);
1958 
1959 		m = m->m_next;
1960 		if (m == NULL)
1961 			break;
1962 
1963 		cp += count;
1964 		off = 0;
1965 	}
1966 
1967 	return (-1);
1968 }
1969 
1970 u_int32_t
1971 bpf_mbuf_ldw(const void *m0, u_int32_t k, int *err)
1972 {
1973 	u_int32_t v;
1974 
1975 	if (bpf_mbuf_copy(m0, k, &v, sizeof(v)) != 0) {
1976 		*err = 1;
1977 		return (0);
1978 	}
1979 
1980 	*err = 0;
1981 	return ntohl(v);
1982 }
1983 
1984 u_int32_t
1985 bpf_mbuf_ldh(const void *m0, u_int32_t k, int *err)
1986 {
1987 	u_int16_t v;
1988 
1989 	if (bpf_mbuf_copy(m0, k, &v, sizeof(v)) != 0) {
1990 		*err = 1;
1991 		return (0);
1992 	}
1993 
1994 	*err = 0;
1995 	return ntohs(v);
1996 }
1997 
1998 u_int32_t
1999 bpf_mbuf_ldb(const void *m0, u_int32_t k, int *err)
2000 {
2001 	const struct mbuf *m = m0;
2002 	u_int8_t v;
2003 
2004 	while (k >= m->m_len) {
2005 		k -= m->m_len;
2006 
2007 		m = m->m_next;
2008 		if (m == NULL) {
2009 			*err = 1;
2010 			return (0);
2011 		}
2012 	}
2013 	v = m->m_data[k];
2014 
2015 	*err = 0;
2016 	return v;
2017 }
2018 
2019 u_int
2020 bpf_mfilter(const struct bpf_insn *pc, const struct mbuf *m, u_int wirelen)
2021 {
2022 	return _bpf_filter(pc, &bpf_mbuf_ops, m, wirelen);
2023 }
2024