/*	$OpenBSD: bpf.c,v 1.170 2018/07/13 08:51:15 bluhm Exp $	*/
/*	$NetBSD: bpf.c,v 1.33 1997/02/21 23:59:35 thorpej Exp $	*/

/*
 * Copyright (c) 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * Copyright (c) 2010, 2014 Henning Brauer <henning@openbsd.org>
 *
 * This code is derived from the Stanford/CMU enet packet filter,
 * (net/enet.c) distributed as part of 4.3BSD, and code contributed
 * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence
 * Berkeley Laboratory.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)bpf.c	8.2 (Berkeley) 3/28/94
 */

#include "bpfilter.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/ioctl.h>
#include <sys/conf.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/socket.h>
#include <sys/poll.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/rwlock.h>
#include <sys/atomic.h>
#include <sys/srp.h>
#include <sys/specdev.h>
#include <sys/selinfo.h>
#include <sys/task.h>

#include <net/if.h>
#include <net/bpf.h>
#include <net/bpfdesc.h>

#include <netinet/in.h>
#include <netinet/if_ether.h>

#include "vlan.h"
#if NVLAN > 0
#include <net/if_vlan_var.h>
#endif

#define BPF_BUFSIZE 32768

#define PRINET  26			/* interruptible */

/* from kern/kern_clock.c; incremented each clock tick. */
extern int ticks;

/*
 * The default read buffer size is patchable.
 */
int bpf_bufsize = BPF_BUFSIZE;
int bpf_maxbufsize = BPF_MAXBUFSIZE;

/*
 *  bpf_iflist is the list of interfaces; each corresponds to an ifnet
 *  bpf_d_list is the list of descriptors
 */
struct bpf_if	*bpf_iflist;
LIST_HEAD(, bpf_d) bpf_d_list;

int	bpf_allocbufs(struct bpf_d *);
void	bpf_ifname(struct bpf_if*, struct ifreq *);
int	_bpf_mtap(caddr_t, const struct mbuf *, u_int,
	    void (*)(const void *, void *, size_t));
void	bpf_mcopy(const void *, void *, size_t);
int	bpf_movein(struct uio *, u_int, struct mbuf **,
	    struct sockaddr *, struct bpf_insn *);
int	bpf_setif(struct bpf_d *, struct ifreq *);
int	bpfpoll(dev_t, int, struct proc *);
int	bpfkqfilter(dev_t, struct knote *);
void	bpf_wakeup(struct bpf_d *);
void	bpf_wakeup_cb(void *);
void	bpf_catchpacket(struct bpf_d *, u_char *, size_t, size_t,
	    void (*)(const void *, void *, size_t), struct timeval *);
int	bpf_getdltlist(struct bpf_d *, struct bpf_dltlist *);
int	bpf_setdlt(struct bpf_d *, u_int);

void	filt_bpfrdetach(struct knote *);
int	filt_bpfread(struct knote *, long);

int	bpf_sysctl_locked(int *, u_int, void *, size_t *, void *, size_t);

struct bpf_d *bpfilter_lookup(int);

/*
 * Called holding ``bd_mtx''.
 */
void	bpf_attachd(struct bpf_d *, struct bpf_if *);
void	bpf_detachd(struct bpf_d *);
void	bpf_resetd(struct bpf_d *);

/*
 * Reference count access to descriptor buffers
 */
void	bpf_get(struct bpf_d *);
void	bpf_put(struct bpf_d *);

/*
 * garbage collector srps
 */

void bpf_d_ref(void *, void *);
void bpf_d_unref(void *, void *);
struct srpl_rc bpf_d_rc = SRPL_RC_INITIALIZER(bpf_d_ref, bpf_d_unref, NULL);

void bpf_insn_dtor(void *, void *);
struct srp_gc bpf_insn_gc = SRP_GC_INITIALIZER(bpf_insn_dtor, NULL);

struct rwlock bpf_sysctl_lk = RWLOCK_INITIALIZER("bpfsz");

int
bpf_movein(struct uio *uio, u_int linktype, struct mbuf **mp,
    struct sockaddr *sockp, struct bpf_insn *filter)
{
	struct mbuf *m;
	struct m_tag *mtag;
	int error;
	u_int hlen;
	u_int len;
	u_int slen;

	/*
	 * Build a sockaddr based on the data link layer type.
	 * We do this at this level because the ethernet header
	 * is copied directly into the data field of the sockaddr.
	 * In the case of SLIP, there is no header and the packet
	 * is forwarded as is.
	 * Also, we are careful to leave room at the front of the mbuf
	 * for the link level header.
	 */
	switch (linktype) {

	case DLT_SLIP:
		sockp->sa_family = AF_INET;
		hlen = 0;
		break;

	case DLT_PPP:
		sockp->sa_family = AF_UNSPEC;
		hlen = 0;
		break;

	case DLT_EN10MB:
		sockp->sa_family = AF_UNSPEC;
		/* XXX Would MAXLINKHDR be better? */
		hlen = ETHER_HDR_LEN;
		break;

	case DLT_IEEE802_11:
	case DLT_IEEE802_11_RADIO:
		sockp->sa_family = AF_UNSPEC;
		hlen = 0;
		break;

	case DLT_RAW:
	case DLT_NULL:
		sockp->sa_family = AF_UNSPEC;
		hlen = 0;
		break;

	case DLT_LOOP:
		sockp->sa_family = AF_UNSPEC;
		hlen = sizeof(u_int32_t);
		break;

	default:
		return (EIO);
	}

	if (uio->uio_resid > MAXMCLBYTES)
		return (EIO);
	len = uio->uio_resid;

	MGETHDR(m, M_WAIT, MT_DATA);
	m->m_pkthdr.ph_ifidx = 0;
	m->m_pkthdr.len = len - hlen;

	if (len > MHLEN) {
		MCLGETI(m, M_WAIT, NULL, len);
		if ((m->m_flags & M_EXT) == 0) {
			error = ENOBUFS;
			goto bad;
		}
	}
	m->m_len = len;
	*mp = m;

	error = uiomove(mtod(m, caddr_t), len, uio);
	if (error)
		goto bad;

	slen = bpf_filter(filter, mtod(m, u_char *), len, len);
	if (slen < len) {
		error = EPERM;
		goto bad;
	}

	if (m->m_len < hlen) {
		error = EPERM;
		goto bad;
	}
	/*
	 * Make room for link header, and copy it to sockaddr
	 */
	if (hlen != 0) {
		if (linktype == DLT_LOOP) {
			u_int32_t af;

			/* the link header indicates the address family */
			KASSERT(hlen == sizeof(u_int32_t));
			memcpy(&af, m->m_data, hlen);
			sockp->sa_family = ntohl(af);
		} else
			memcpy(sockp->sa_data, m->m_data, hlen);
		m->m_len -= hlen;
		m->m_data += hlen; /* XXX */
	}

	/*
	 * Prepend the data link type as a mbuf tag
	 */
	mtag = m_tag_get(PACKET_TAG_DLT, sizeof(u_int), M_WAIT);
	*(u_int *)(mtag + 1) = linktype;
	m_tag_prepend(m, mtag);

	return (0);
 bad:
	m_freem(m);
	return (error);
}

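/*
 * Userland sketch of the path into bpf_movein() above: writing a
 * frame to a descriptor bound to an Ethernet interface (DLT_EN10MB).
 * The device path and interface name are examples only.
 *
 *	#include <sys/ioctl.h>
 *	#include <net/if.h>
 *	#include <net/bpf.h>
 *	#include <fcntl.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *
 *	int
 *	inject(const unsigned char *frame, size_t len)
 *	{
 *		struct ifreq ifr;
 *		int fd;
 *
 *		if ((fd = open("/dev/bpf0", O_RDWR)) == -1)
 *			return (-1);
 *		memset(&ifr, 0, sizeof(ifr));
 *		strlcpy(ifr.ifr_name, "em0", sizeof(ifr.ifr_name));
 *		if (ioctl(fd, BIOCSETIF, &ifr) == -1 ||
 *		    write(fd, frame, len) != (ssize_t)len) {
 *			close(fd);
 *			return (-1);
 *		}
 *		close(fd);
 *		return (0);
 *	}
 *
 * bpf_movein() copies the 14-byte Ethernet header into the sockaddr
 * and hands the remainder to the interface output routine.
 */
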
/*
 * Attach file to the bpf interface, i.e. make d listen on bp.
 */
void
bpf_attachd(struct bpf_d *d, struct bpf_if *bp)
{
	MUTEX_ASSERT_LOCKED(&d->bd_mtx);

	/*
	 * Point d at bp, and add d to the interface's list of listeners.
	 * Finally, point the driver's bpf cookie at the interface so
	 * it will divert packets to bpf.
	 */

	d->bd_bif = bp;

	KERNEL_ASSERT_LOCKED();
	SRPL_INSERT_HEAD_LOCKED(&bpf_d_rc, &bp->bif_dlist, d, bd_next);

	*bp->bif_driverp = bp;
}

/*
 * Detach a file from its interface.
 */
void
bpf_detachd(struct bpf_d *d)
{
	struct bpf_if *bp;

	MUTEX_ASSERT_LOCKED(&d->bd_mtx);

	bp = d->bd_bif;
	/* Not attached. */
	if (bp == NULL)
		return;

	/* Remove ``d'' from the interface's descriptor list. */
	KERNEL_ASSERT_LOCKED();
	SRPL_REMOVE_LOCKED(&bpf_d_rc, &bp->bif_dlist, d, bpf_d, bd_next);

	if (SRPL_EMPTY_LOCKED(&bp->bif_dlist)) {
		/*
		 * Let the driver know that there are no more listeners.
		 */
		*bp->bif_driverp = NULL;
	}

	d->bd_bif = NULL;

	/*
	 * Check if this descriptor had requested promiscuous mode.
	 * If so, turn it off.
	 */
	if (d->bd_promisc) {
		int error;

		KASSERT(bp->bif_ifp != NULL);

		d->bd_promisc = 0;

		bpf_get(d);
		mtx_leave(&d->bd_mtx);
		NET_LOCK();
		error = ifpromisc(bp->bif_ifp, 0);
		NET_UNLOCK();
		mtx_enter(&d->bd_mtx);
		bpf_put(d);

		if (error && !(error == EINVAL || error == ENODEV ||
		    error == ENXIO))
			/*
			 * Something is really wrong if we were able to put
			 * the driver into promiscuous mode, but can't
			 * take it out.
			 */
			panic("bpf: ifpromisc failed");
	}
}

void
bpfilterattach(int n)
{
	LIST_INIT(&bpf_d_list);
}

/*
 * Open bpf device.  Returns ENXIO for an illegal minor device number,
 * EBUSY if the descriptor cannot be allocated.
 */
int
bpfopen(dev_t dev, int flag, int mode, struct proc *p)
{
	struct bpf_d *bd;
	int unit = minor(dev);

	if (unit & ((1 << CLONE_SHIFT) - 1))
		return (ENXIO);

	KASSERT(bpfilter_lookup(unit) == NULL);

	/* create on demand */
	if ((bd = malloc(sizeof(*bd), M_DEVBUF, M_NOWAIT|M_ZERO)) == NULL)
		return (EBUSY);

	/* Mark "free" and do most initialization. */
	bd->bd_unit = unit;
	bd->bd_bufsize = bpf_bufsize;
	bd->bd_sig = SIGIO;
	mtx_init(&bd->bd_mtx, IPL_NET);
	task_set(&bd->bd_wake_task, bpf_wakeup_cb, bd);

	if (flag & FNONBLOCK)
		bd->bd_rtout = -1;

	bpf_get(bd);
	LIST_INSERT_HEAD(&bpf_d_list, bd, bd_list);

	return (0);
}

/*
 * Close the descriptor by detaching it from its interface,
 * deallocating its buffers, and marking it free.
 */
int
bpfclose(dev_t dev, int flag, int mode, struct proc *p)
{
	struct bpf_d *d;

	d = bpfilter_lookup(minor(dev));
	mtx_enter(&d->bd_mtx);
	bpf_detachd(d);
	bpf_wakeup(d);
	LIST_REMOVE(d, bd_list);
	mtx_leave(&d->bd_mtx);
	bpf_put(d);

	return (0);
}

/*
 * Rotate the packet buffers in descriptor d.  Move the store buffer
 * into the hold slot, and the free buffer into the store slot.
 * Zero the length of the new store buffer.
 */
#define ROTATE_BUFFERS(d) \
	KASSERT(d->bd_in_uiomove == 0); \
	MUTEX_ASSERT_LOCKED(&d->bd_mtx); \
	(d)->bd_hbuf = (d)->bd_sbuf; \
	(d)->bd_hlen = (d)->bd_slen; \
	(d)->bd_sbuf = (d)->bd_fbuf; \
	(d)->bd_slen = 0; \
	(d)->bd_fbuf = NULL;
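
/*
 * A worked example of the rotation, for a descriptor whose store
 * buffer A just filled while the hold slot is empty (a sketch; the
 * buffer names are illustrative):
 *
 *	before:	hbuf = NULL          sbuf = A (slen = n)  fbuf = B
 *	after:	hbuf = A (hlen = n)  sbuf = B (slen = 0)  fbuf = NULL
 *
 * The reader consumes hbuf and, once uiomove() is done, hands the
 * buffer back as fbuf, making it available for the next rotation.
 */
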
/*
 *  bpfread - read next chunk of packets from buffers
 */
int
bpfread(dev_t dev, struct uio *uio, int ioflag)
{
	struct bpf_d *d;
	caddr_t hbuf;
	int hlen, error;

	KERNEL_ASSERT_LOCKED();

	d = bpfilter_lookup(minor(dev));
	if (d->bd_bif == NULL)
		return (ENXIO);

	bpf_get(d);
	mtx_enter(&d->bd_mtx);

	/*
	 * Restrict application to use a buffer the same size as
	 * the kernel buffers.
	 */
	if (uio->uio_resid != d->bd_bufsize) {
		error = EINVAL;
		goto out;
	}

	/*
	 * If there's a timeout, bd_rdStart is tagged when we start the read.
	 * We can then figure out when we're done reading.
	 */
	if (d->bd_rtout != -1 && d->bd_rdStart == 0)
		d->bd_rdStart = ticks;
	else
		d->bd_rdStart = 0;

	/*
	 * If the hold buffer is empty, then do a timed sleep, which
	 * ends when the timeout expires or when enough packets
	 * have arrived to fill the store buffer.
	 */
	while (d->bd_hbuf == NULL) {
		if (d->bd_bif == NULL) {
			/* interface is gone */
			if (d->bd_slen == 0) {
				error = EIO;
				goto out;
			}
			ROTATE_BUFFERS(d);
			break;
		}
		if (d->bd_immediate && d->bd_slen != 0) {
			/*
			 * One or more packets arrived since the previous
			 * read or while we were asleep.
			 * Rotate the buffers and return what's here.
			 */
			ROTATE_BUFFERS(d);
			break;
		}
		if (d->bd_rtout == -1) {
			/* User requested non-blocking I/O */
			error = EWOULDBLOCK;
		} else {
			if ((d->bd_rdStart + d->bd_rtout) < ticks) {
				error = msleep(d, &d->bd_mtx, PRINET|PCATCH,
				    "bpf", d->bd_rtout);
			} else
				error = EWOULDBLOCK;
		}
		if (error == EINTR || error == ERESTART)
			goto out;
		if (error == EWOULDBLOCK) {
			/*
			 * On a timeout, return what's in the buffer,
			 * which may be nothing.  If there is something
			 * in the store buffer, we can rotate the buffers.
			 */
			if (d->bd_hbuf != NULL)
				/*
				 * We filled up the buffer in between
				 * getting the timeout and arriving
				 * here, so we don't need to rotate.
				 */
				break;

			if (d->bd_slen == 0) {
				error = 0;
				goto out;
			}
			ROTATE_BUFFERS(d);
			break;
		}
	}
	/*
	 * At this point, we know we have something in the hold slot.
	 */
	hbuf = d->bd_hbuf;
	hlen = d->bd_hlen;
	d->bd_hbuf = NULL;
	d->bd_hlen = 0;
	d->bd_fbuf = NULL;
	d->bd_in_uiomove = 1;

	/*
	 * Move data from hold buffer into user space.
	 * We know the entire buffer is transferred since
	 * we checked above that the read buffer is bpf_bufsize bytes.
	 */
	mtx_leave(&d->bd_mtx);
	error = uiomove(hbuf, hlen, uio);
	mtx_enter(&d->bd_mtx);

	/* Ensure that bpf_resetd() or ROTATE_BUFFERS() haven't been called. */
	KASSERT(d->bd_fbuf == NULL);
	KASSERT(d->bd_hbuf == NULL);
	d->bd_fbuf = hbuf;
	d->bd_in_uiomove = 0;
out:
	mtx_leave(&d->bd_mtx);
	bpf_put(d);

	return (error);
}

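/*
 * Usage sketch for the timed sleep above: a reader arms the timeout
 * with BIOCSRTIMEOUT and must pass a buffer of exactly the size
 * reported by BIOCGBLEN.  On a timeout with no traffic, read(2)
 * succeeds with zero bytes rather than failing.
 *
 *	struct timeval tv = { 2, 0 };
 *	u_int blen;
 *	char *buf;
 *	ssize_t n;
 *
 *	ioctl(fd, BIOCGBLEN, &blen);
 *	ioctl(fd, BIOCSRTIMEOUT, &tv);
 *	buf = malloc(blen);
 *	n = read(fd, buf, blen);
 */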

/*
 * If there are processes sleeping on this descriptor, wake them up.
 */
void
bpf_wakeup(struct bpf_d *d)
{
	MUTEX_ASSERT_LOCKED(&d->bd_mtx);

	/*
	 * As long as csignal() and selwakeup() need to be protected
	 * by the KERNEL_LOCK() we have to delay the wakeup to
	 * another context to keep the hot path KERNEL_LOCK()-free.
	 */
	bpf_get(d);
	if (!task_add(systq, &d->bd_wake_task))
		bpf_put(d);
}

void
bpf_wakeup_cb(void *xd)
{
	struct bpf_d *d = xd;

	KERNEL_ASSERT_LOCKED();

	wakeup(d);
	if (d->bd_async && d->bd_sig)
		csignal(d->bd_pgid, d->bd_sig, d->bd_siguid, d->bd_sigeuid);

	selwakeup(&d->bd_sel);
	bpf_put(d);
}

int
bpfwrite(dev_t dev, struct uio *uio, int ioflag)
{
	struct bpf_d *d;
	struct ifnet *ifp;
	struct mbuf *m;
	struct bpf_program *bf;
	struct bpf_insn *fcode = NULL;
	int error;
	struct sockaddr_storage dst;
	u_int dlt;

	KERNEL_ASSERT_LOCKED();

	d = bpfilter_lookup(minor(dev));
	if (d->bd_bif == NULL)
		return (ENXIO);

	bpf_get(d);
	ifp = d->bd_bif->bif_ifp;

	if (ifp == NULL || (ifp->if_flags & IFF_UP) == 0) {
		error = ENETDOWN;
		goto out;
	}

	if (uio->uio_resid == 0) {
		error = 0;
		goto out;
	}

	KERNEL_ASSERT_LOCKED(); /* for accessing bd_wfilter */
	bf = srp_get_locked(&d->bd_wfilter);
	if (bf != NULL)
		fcode = bf->bf_insns;

	dlt = d->bd_bif->bif_dlt;

	error = bpf_movein(uio, dlt, &m, sstosa(&dst), fcode);
	if (error)
		goto out;

	if (m->m_pkthdr.len > ifp->if_mtu) {
		m_freem(m);
		error = EMSGSIZE;
		goto out;
	}

	m->m_pkthdr.ph_rtableid = ifp->if_rdomain;
	m->m_pkthdr.pf.prio = ifp->if_llprio;

	if (d->bd_hdrcmplt && dst.ss_family == AF_UNSPEC)
		dst.ss_family = pseudo_AF_HDRCMPLT;

	NET_LOCK();
	error = ifp->if_output(ifp, m, sstosa(&dst), NULL);
	NET_UNLOCK();

out:
	bpf_put(d);
	return (error);
}

/*
 * Reset a descriptor by flushing its packet buffer and clearing the
 * receive and drop counts.
 */
void
bpf_resetd(struct bpf_d *d)
{
	MUTEX_ASSERT_LOCKED(&d->bd_mtx);
	KASSERT(d->bd_in_uiomove == 0);

	if (d->bd_hbuf != NULL) {
		/* Free the hold buffer. */
		d->bd_fbuf = d->bd_hbuf;
		d->bd_hbuf = NULL;
	}
	d->bd_slen = 0;
	d->bd_hlen = 0;
	d->bd_rcount = 0;
	d->bd_dcount = 0;
}

/*
 *  FIONREAD		Check for read packet available.
 *  BIOCGBLEN		Get buffer len [for read()].
 *  BIOCSETF		Set ethernet read filter.
 *  BIOCFLUSH		Flush read packet buffer.
 *  BIOCPROMISC		Put interface into promiscuous mode.
 *  BIOCGDLTLIST	Get supported link layer types.
 *  BIOCGDLT		Get link layer type.
 *  BIOCSDLT		Set link layer type.
 *  BIOCGETIF		Get interface name.
 *  BIOCSETIF		Set interface.
 *  BIOCSRTIMEOUT	Set read timeout.
 *  BIOCGRTIMEOUT	Get read timeout.
 *  BIOCGSTATS		Get packet stats.
 *  BIOCIMMEDIATE	Set immediate mode.
 *  BIOCVERSION		Get filter language version.
 *  BIOCGHDRCMPLT	Get "header already complete" flag
 *  BIOCSHDRCMPLT	Set "header already complete" flag
 */
int
bpfioctl(dev_t dev, u_long cmd, caddr_t addr, int flag, struct proc *p)
{
	struct bpf_d *d;
	int error = 0;

	d = bpfilter_lookup(minor(dev));
	if (d->bd_locked && suser(p) != 0) {
		/* list of allowed ioctls when locked and not root */
		switch (cmd) {
		case BIOCGBLEN:
		case BIOCFLUSH:
		case BIOCGDLT:
		case BIOCGDLTLIST:
		case BIOCGETIF:
		case BIOCGRTIMEOUT:
		case BIOCGSTATS:
		case BIOCVERSION:
		case BIOCGRSIG:
		case BIOCGHDRCMPLT:
		case FIONREAD:
		case BIOCLOCK:
		case BIOCSRTIMEOUT:
		case BIOCIMMEDIATE:
		case TIOCGPGRP:
		case BIOCGDIRFILT:
			break;
		default:
			return (EPERM);
		}
	}

	bpf_get(d);

	switch (cmd) {
	default:
		error = EINVAL;
		break;

	/*
	 * Check for read packet available.
	 */
	case FIONREAD:
		{
			int n;

			mtx_enter(&d->bd_mtx);
			n = d->bd_slen;
			if (d->bd_hbuf != NULL)
				n += d->bd_hlen;
			mtx_leave(&d->bd_mtx);

			*(int *)addr = n;
			break;
		}

	/*
	 * Get buffer len [for read()].
	 */
	case BIOCGBLEN:
		*(u_int *)addr = d->bd_bufsize;
		break;

	/*
	 * Set buffer length.
	 */
	case BIOCSBLEN:
		if (d->bd_bif != NULL)
			error = EINVAL;
		else {
			u_int size = *(u_int *)addr;

			if (size > bpf_maxbufsize)
				*(u_int *)addr = size = bpf_maxbufsize;
			else if (size < BPF_MINBUFSIZE)
				*(u_int *)addr = size = BPF_MINBUFSIZE;
			mtx_enter(&d->bd_mtx);
			d->bd_bufsize = size;
			mtx_leave(&d->bd_mtx);
		}
		break;

	/*
	 * Set link layer read filter.
	 */
	case BIOCSETF:
		error = bpf_setf(d, (struct bpf_program *)addr, 0);
		break;

	/*
	 * Set link layer write filter.
	 */
	case BIOCSETWF:
		error = bpf_setf(d, (struct bpf_program *)addr, 1);
		break;

	/*
	 * Flush read packet buffer.
	 */
	case BIOCFLUSH:
		mtx_enter(&d->bd_mtx);
		bpf_resetd(d);
		mtx_leave(&d->bd_mtx);
		break;

	/*
	 * Put interface into promiscuous mode.
	 */
	case BIOCPROMISC:
		if (d->bd_bif == NULL) {
			/*
			 * No interface attached yet.
			 */
			error = EINVAL;
		} else if (d->bd_bif->bif_ifp != NULL) {
			if (d->bd_promisc == 0) {
				MUTEX_ASSERT_UNLOCKED(&d->bd_mtx);
				NET_LOCK();
				error = ifpromisc(d->bd_bif->bif_ifp, 1);
				NET_UNLOCK();
				if (error == 0)
					d->bd_promisc = 1;
			}
		}
		break;

	/*
	 * Get a list of supported device parameters.
	 */
	case BIOCGDLTLIST:
		if (d->bd_bif == NULL)
			error = EINVAL;
		else
			error = bpf_getdltlist(d, (struct bpf_dltlist *)addr);
		break;

	/*
	 * Get device parameters.
	 */
	case BIOCGDLT:
		if (d->bd_bif == NULL)
			error = EINVAL;
		else
			*(u_int *)addr = d->bd_bif->bif_dlt;
		break;

	/*
	 * Set device parameters.
	 */
	case BIOCSDLT:
		if (d->bd_bif == NULL)
			error = EINVAL;
		else {
			mtx_enter(&d->bd_mtx);
			error = bpf_setdlt(d, *(u_int *)addr);
			mtx_leave(&d->bd_mtx);
		}
		break;

	/*
	 * Get interface name.
	 */
	case BIOCGETIF:
		if (d->bd_bif == NULL)
			error = EINVAL;
		else
			bpf_ifname(d->bd_bif, (struct ifreq *)addr);
		break;

	/*
	 * Set interface.
	 */
	case BIOCSETIF:
		error = bpf_setif(d, (struct ifreq *)addr);
		break;

	/*
	 * Set read timeout.
	 */
	case BIOCSRTIMEOUT:
		{
			struct timeval *tv = (struct timeval *)addr;

			/* Compute number of ticks. */
			d->bd_rtout = tv->tv_sec * hz + tv->tv_usec / tick;
			if (d->bd_rtout == 0 && tv->tv_usec != 0)
				d->bd_rtout = 1;
			break;
		}

	/*
	 * Get read timeout.
	 */
	case BIOCGRTIMEOUT:
		{
			struct timeval *tv = (struct timeval *)addr;

			tv->tv_sec = d->bd_rtout / hz;
			tv->tv_usec = (d->bd_rtout % hz) * tick;
			break;
		}

	/*
	 * Get packet stats.
	 */
	case BIOCGSTATS:
		{
			struct bpf_stat *bs = (struct bpf_stat *)addr;

			bs->bs_recv = d->bd_rcount;
			bs->bs_drop = d->bd_dcount;
			break;
		}

	/*
	 * Set immediate mode.
	 */
	case BIOCIMMEDIATE:
		d->bd_immediate = *(u_int *)addr;
		break;

	case BIOCVERSION:
		{
			struct bpf_version *bv = (struct bpf_version *)addr;

			bv->bv_major = BPF_MAJOR_VERSION;
			bv->bv_minor = BPF_MINOR_VERSION;
			break;
		}

	case BIOCGHDRCMPLT:	/* get "header already complete" flag */
		*(u_int *)addr = d->bd_hdrcmplt;
		break;

	case BIOCSHDRCMPLT:	/* set "header already complete" flag */
		d->bd_hdrcmplt = *(u_int *)addr ? 1 : 0;
		break;

	case BIOCLOCK:		/* set "locked" flag (no reset) */
		d->bd_locked = 1;
		break;

	case BIOCGFILDROP:	/* get "filter-drop" flag */
		*(u_int *)addr = d->bd_fildrop;
		break;

	case BIOCSFILDROP:	/* set "filter-drop" flag */
		d->bd_fildrop = *(u_int *)addr ? 1 : 0;
		break;

	case BIOCGDIRFILT:	/* get direction filter */
		*(u_int *)addr = d->bd_dirfilt;
		break;

	case BIOCSDIRFILT:	/* set direction filter */
		d->bd_dirfilt = (*(u_int *)addr) &
		    (BPF_DIRECTION_IN|BPF_DIRECTION_OUT);
		break;

	case FIONBIO:		/* Non-blocking I/O */
		if (*(int *)addr)
			d->bd_rtout = -1;
		else
			d->bd_rtout = 0;
		break;

	case FIOASYNC:		/* Send signal on receive packets */
		d->bd_async = *(int *)addr;
		break;

	/*
	 * N.B.  ioctl (FIOSETOWN) and fcntl (F_SETOWN) both end up doing
	 * the equivalent of a TIOCSPGRP and hence end up here.  *However*
	 * TIOCSPGRP's arg is a process group if it's positive and a process
	 * id if it's negative.  This is exactly the opposite of what the
	 * other two functions want!  Therefore there is code in ioctl and
	 * fcntl to negate the arg before calling here.
	 */
	case TIOCSPGRP:		/* Process or group to send signals to */
		d->bd_pgid = *(int *)addr;
		d->bd_siguid = p->p_ucred->cr_ruid;
		d->bd_sigeuid = p->p_ucred->cr_uid;
		break;

	case TIOCGPGRP:
		*(int *)addr = d->bd_pgid;
		break;

	case BIOCSRSIG:		/* Set receive signal */
		{
			u_int sig;

			sig = *(u_int *)addr;

			if (sig >= NSIG)
				error = EINVAL;
			else
				d->bd_sig = sig;
			break;
		}
	case BIOCGRSIG:
		*(u_int *)addr = d->bd_sig;
		break;
	}

	bpf_put(d);
	return (error);
}

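/*
 * A typical capture setup against the ioctls handled above (userland
 * sketch; "em0" is an example interface):
 *
 *	struct ifreq ifr;
 *	u_int blen, yes = 1;
 *	int fd;
 *
 *	fd = open("/dev/bpf0", O_RDONLY);
 *	strlcpy(ifr.ifr_name, "em0", sizeof(ifr.ifr_name));
 *	ioctl(fd, BIOCSETIF, &ifr);
 *	ioctl(fd, BIOCIMMEDIATE, &yes);
 *	ioctl(fd, BIOCPROMISC, NULL);
 *	ioctl(fd, BIOCGBLEN, &blen);
 *
 * BIOCSETIF has to come first: requests that touch the interface,
 * such as BIOCPROMISC, BIOCGDLT and BIOCGETIF, fail with EINVAL
 * until the descriptor is bound.
 */
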
/*
 * Set d's packet filter program to fp.  If this file already has a filter,
 * free it and replace it.  Returns EINVAL for bogus requests.
 */
int
bpf_setf(struct bpf_d *d, struct bpf_program *fp, int wf)
{
	struct bpf_program *bf;
	struct srp *filter;
	struct bpf_insn *fcode;
	u_int flen, size;

	KERNEL_ASSERT_LOCKED();
	filter = wf ? &d->bd_wfilter : &d->bd_rfilter;

	if (fp->bf_insns == 0) {
		if (fp->bf_len != 0)
			return (EINVAL);
		srp_update_locked(&bpf_insn_gc, filter, NULL);
		mtx_enter(&d->bd_mtx);
		bpf_resetd(d);
		mtx_leave(&d->bd_mtx);
		return (0);
	}
	flen = fp->bf_len;
	if (flen > BPF_MAXINSNS)
		return (EINVAL);

	fcode = mallocarray(flen, sizeof(*fp->bf_insns), M_DEVBUF,
	    M_WAITOK | M_CANFAIL);
	if (fcode == NULL)
		return (ENOMEM);

	size = flen * sizeof(*fp->bf_insns);
	if (copyin(fp->bf_insns, fcode, size) != 0 ||
	    bpf_validate(fcode, (int)flen) == 0) {
		free(fcode, M_DEVBUF, size);
		return (EINVAL);
	}

	bf = malloc(sizeof(*bf), M_DEVBUF, M_WAITOK);
	bf->bf_len = flen;
	bf->bf_insns = fcode;

	srp_update_locked(&bpf_insn_gc, filter, bf);

	mtx_enter(&d->bd_mtx);
	bpf_resetd(d);
	mtx_leave(&d->bd_mtx);
	return (0);
}

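/*
 * Illustration of what arrives through BIOCSETF/BIOCSETWF: a minimal
 * userland filter built from the classic bpf_insn macros that accepts
 * IPv4 packets in full and rejects everything else on a DLT_EN10MB
 * descriptor (a sketch):
 *
 *	struct bpf_insn insns[] = {
 *		BPF_STMT(BPF_LD+BPF_H+BPF_ABS, 12),
 *		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, ETHERTYPE_IP, 0, 1),
 *		BPF_STMT(BPF_RET+BPF_K, (u_int)-1),
 *		BPF_STMT(BPF_RET+BPF_K, 0),
 *	};
 *	struct bpf_program prog = { 4, insns };
 *
 *	ioctl(fd, BIOCSETF, &prog);
 *
 * The filter's return value is the snapshot length: (u_int)-1 keeps
 * the whole packet, 0 rejects it.  bpf_movein() applies the same
 * convention to the write filter.
 */
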
/*
 * Detach a file from its current interface (if attached at all) and attach
 * to the interface indicated by the name stored in ifr.
 * Return an errno or 0.
 */
int
bpf_setif(struct bpf_d *d, struct ifreq *ifr)
{
	struct bpf_if *bp, *candidate = NULL;
	int error = 0;

	/*
	 * Look through attached interfaces for the named one.
	 */
	for (bp = bpf_iflist; bp != NULL; bp = bp->bif_next) {
		if (strcmp(bp->bif_name, ifr->ifr_name) != 0)
			continue;

		if (candidate == NULL || candidate->bif_dlt > bp->bif_dlt)
			candidate = bp;
	}

	/* Not found. */
	if (candidate == NULL)
		return (ENXIO);

	/*
	 * Allocate the packet buffers if we need to.
	 * If we're already attached to the requested interface,
	 * just flush the buffer.
	 */
	mtx_enter(&d->bd_mtx);
	if (d->bd_sbuf == NULL) {
		if ((error = bpf_allocbufs(d)))
			goto out;
	}
	if (candidate != d->bd_bif) {
		/*
		 * Detach if attached to something else.
		 */
		bpf_detachd(d);
		bpf_attachd(d, candidate);
	}
	bpf_resetd(d);
out:
	mtx_leave(&d->bd_mtx);
	return (error);
}

/*
 * Copy the interface name to the ifreq.
 */
void
bpf_ifname(struct bpf_if *bif, struct ifreq *ifr)
{
	bcopy(bif->bif_name, ifr->ifr_name, sizeof(ifr->ifr_name));
}

/*
 * Support for poll() system call
 */
int
bpfpoll(dev_t dev, int events, struct proc *p)
{
	struct bpf_d *d;
	int revents;

	KERNEL_ASSERT_LOCKED();

	/*
	 * An imitation of the FIONREAD ioctl code.
	 */
	d = bpfilter_lookup(minor(dev));

	/*
	 * XXX The USB stack manages to trigger a race condition
	 * which causes bpfilter_lookup to return NULL when a USB device
	 * gets detached while it is up and has an open bpf handler (e.g.
	 * dhclient).  We should still check whether the root cause of
	 * this issue can be fixed.
	 */
	if (d == NULL)
		return (POLLERR);

	/* Always ready to write data */
	revents = events & (POLLOUT | POLLWRNORM);

	if (events & (POLLIN | POLLRDNORM)) {
		mtx_enter(&d->bd_mtx);
		if (d->bd_hlen != 0 || (d->bd_immediate && d->bd_slen != 0))
			revents |= events & (POLLIN | POLLRDNORM);
		else {
			/*
			 * if there's a timeout, mark the time we
			 * started waiting.
			 */
			if (d->bd_rtout != -1 && d->bd_rdStart == 0)
				d->bd_rdStart = ticks;
			selrecord(p, &d->bd_sel);
		}
		mtx_leave(&d->bd_mtx);
	}
	return (revents);
}

struct filterops bpfread_filtops =
	{ 1, NULL, filt_bpfrdetach, filt_bpfread };

int
bpfkqfilter(dev_t dev, struct knote *kn)
{
	struct bpf_d *d;
	struct klist *klist;

	KERNEL_ASSERT_LOCKED();

	d = bpfilter_lookup(minor(dev));

	switch (kn->kn_filter) {
	case EVFILT_READ:
		klist = &d->bd_sel.si_note;
		kn->kn_fop = &bpfread_filtops;
		break;
	default:
		return (EINVAL);
	}

	bpf_get(d);
	kn->kn_hook = d;
	SLIST_INSERT_HEAD(klist, kn, kn_selnext);

	mtx_enter(&d->bd_mtx);
	if (d->bd_rtout != -1 && d->bd_rdStart == 0)
		d->bd_rdStart = ticks;
	mtx_leave(&d->bd_mtx);

	return (0);
}

void
filt_bpfrdetach(struct knote *kn)
{
	struct bpf_d *d = kn->kn_hook;

	KERNEL_ASSERT_LOCKED();

	SLIST_REMOVE(&d->bd_sel.si_note, kn, knote, kn_selnext);
	bpf_put(d);
}

int
filt_bpfread(struct knote *kn, long hint)
{
	struct bpf_d *d = kn->kn_hook;

	KERNEL_ASSERT_LOCKED();

	mtx_enter(&d->bd_mtx);
	kn->kn_data = d->bd_hlen;
	if (d->bd_immediate)
		kn->kn_data += d->bd_slen;
	mtx_leave(&d->bd_mtx);

	return (kn->kn_data > 0);
}

/*
 * Copy data from an mbuf chain into a buffer.  This code is derived
 * from m_copydata in sys/uipc_mbuf.c.
 */
void
bpf_mcopy(const void *src_arg, void *dst_arg, size_t len)
{
	const struct mbuf *m;
	u_int count;
	u_char *dst;

	m = src_arg;
	dst = dst_arg;
	while (len > 0) {
		if (m == NULL)
			panic("bpf_mcopy");
		count = min(m->m_len, len);
		bcopy(mtod(m, caddr_t), (caddr_t)dst, count);
		m = m->m_next;
		dst += count;
		len -= count;
	}
}

/*
 * like bpf_mtap, but copy fn can be given. used by various bpf_mtap*
 */
int
_bpf_mtap(caddr_t arg, const struct mbuf *m, u_int direction,
    void (*cpfn)(const void *, void *, size_t))
{
	struct bpf_if *bp = (struct bpf_if *)arg;
	struct srp_ref sr;
	struct bpf_d *d;
	size_t pktlen, slen;
	const struct mbuf *m0;
	struct timeval tv;
	int gottime = 0;
	int drop = 0;

	if (m == NULL)
		return (0);

	if (cpfn == NULL)
		cpfn = bpf_mcopy;

	if (bp == NULL)
		return (0);

	pktlen = 0;
	for (m0 = m; m0 != NULL; m0 = m0->m_next)
		pktlen += m0->m_len;

	SRPL_FOREACH(d, &sr, &bp->bif_dlist, bd_next) {
		atomic_inc_long(&d->bd_rcount);

		if ((direction & d->bd_dirfilt) != 0)
			slen = 0;
		else {
			struct srp_ref bsr;
			struct bpf_program *bf;
			struct bpf_insn *fcode = NULL;

			bf = srp_enter(&bsr, &d->bd_rfilter);
			if (bf != NULL)
				fcode = bf->bf_insns;
			slen = bpf_mfilter(fcode, m, pktlen);
			srp_leave(&bsr);
		}

		if (slen > 0) {
			if (!gottime++)
				microtime(&tv);

			mtx_enter(&d->bd_mtx);
			bpf_catchpacket(d, (u_char *)m, pktlen, slen, cpfn,
			    &tv);
			mtx_leave(&d->bd_mtx);

			if (d->bd_fildrop)
				drop = 1;
		}
	}
	SRPL_LEAVE(&sr);

	return (drop);
}

/*
 * Incoming linkage from device drivers, where a data buffer should be
 * prepended by an arbitrary header. In this situation we already have a
 * way of representing a chain of memory buffers, ie, mbufs, so reuse
 * the existing functionality by attaching the buffers to mbufs.
 *
 * Con up a minimal mbuf chain to pacify bpf by allocating (only) a
 * struct m_hdr each for the header and data on the stack.
 */
int
bpf_tap_hdr(caddr_t arg, const void *hdr, unsigned int hdrlen,
    const void *buf, unsigned int buflen, u_int direction)
{
	struct m_hdr mh, md;
	struct mbuf *m0 = NULL;
	struct mbuf **mp = &m0;

	if (hdr != NULL) {
		mh.mh_flags = 0;
		mh.mh_next = NULL;
		mh.mh_len = hdrlen;
		mh.mh_data = (void *)hdr;

		*mp = (struct mbuf *)&mh;
		mp = &mh.mh_next;
	}

	if (buf != NULL) {
		md.mh_flags = 0;
		md.mh_next = NULL;
		md.mh_len = buflen;
		md.mh_data = (void *)buf;

		*mp = (struct mbuf *)&md;
	}

	return _bpf_mtap(arg, m0, direction, bpf_mcopy);
}

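/*
 * Caller-side sketch: a driver whose packet still lives in flat
 * buffers can tap it without building a real mbuf chain; the stack
 * mbufs constructed above never leave the call.  hdr/hdrlen and
 * buf/buflen stand for whatever buffers the driver holds:
 *
 *	#if NBPFILTER > 0
 *		if (ifp->if_bpf)
 *			bpf_tap_hdr(ifp->if_bpf, hdr, hdrlen,
 *			    buf, buflen, BPF_DIRECTION_IN);
 *	#endif
 */
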
/*
 * Incoming linkage from device drivers, when packet is in an mbuf chain.
 */
int
bpf_mtap(caddr_t arg, const struct mbuf *m, u_int direction)
{
	return _bpf_mtap(arg, m, direction, NULL);
}

/*
 * Incoming linkage from device drivers, where we have a mbuf chain
 * but need to prepend some arbitrary header from a linear buffer.
 *
 * Con up a minimal dummy header to pacify bpf.  Allocate (only) a
 * struct m_hdr on the stack.  This is safe as bpf only reads from the
 * fields in this header that we initialize, and will not try to free
 * it or keep a pointer to it.
 */
int
bpf_mtap_hdr(caddr_t arg, caddr_t data, u_int dlen, const struct mbuf *m,
    u_int direction, void (*cpfn)(const void *, void *, size_t))
{
	struct m_hdr mh;
	const struct mbuf *m0;

	if (dlen > 0) {
		mh.mh_flags = 0;
		mh.mh_next = (struct mbuf *)m;
		mh.mh_len = dlen;
		mh.mh_data = data;
		m0 = (struct mbuf *)&mh;
	} else
		m0 = m;

	return _bpf_mtap(arg, m0, direction, cpfn);
}

/*
 * Incoming linkage from device drivers, where we have a mbuf chain
 * but need to prepend the address family.
 *
 * Con up a minimal dummy header to pacify bpf.  We allocate (only) a
 * struct m_hdr on the stack.  This is safe as bpf only reads from the
 * fields in this header that we initialize, and will not try to free
 * it or keep a pointer to it.
 */
int
bpf_mtap_af(caddr_t arg, u_int32_t af, const struct mbuf *m, u_int direction)
{
	u_int32_t    afh;

	afh = htonl(af);

	return bpf_mtap_hdr(arg, (caddr_t)&afh, sizeof(afh),
	    m, direction, NULL);
}

/*
 * Incoming linkage from device drivers, where we have a mbuf chain
 * but need to prepend a VLAN encapsulation header.
 *
 * Con up a minimal dummy header to pacify bpf.  Allocate (only) a
 * struct m_hdr on the stack.  This is safe as bpf only reads from the
 * fields in this header that we initialize, and will not try to free
 * it or keep a pointer to it.
 */
int
bpf_mtap_ether(caddr_t arg, const struct mbuf *m, u_int direction)
{
#if NVLAN > 0
	struct ether_vlan_header evh;
	struct m_hdr mh;
	uint8_t prio;

	if ((m->m_flags & M_VLANTAG) == 0)
#endif
	{
		return bpf_mtap(arg, m, direction);
	}

#if NVLAN > 0
	KASSERT(m->m_len >= ETHER_HDR_LEN);

	prio = m->m_pkthdr.pf.prio;
	if (prio <= 1)
		prio = !prio;

	memcpy(&evh, mtod(m, char *), ETHER_HDR_LEN);
	evh.evl_proto = evh.evl_encap_proto;
	evh.evl_encap_proto = htons(ETHERTYPE_VLAN);
	evh.evl_tag = htons(m->m_pkthdr.ether_vtag |
	    (prio << EVL_PRIO_BITS));

	mh.mh_flags = 0;
	mh.mh_data = m->m_data + ETHER_HDR_LEN;
	mh.mh_len = m->m_len - ETHER_HDR_LEN;
	mh.mh_next = m->m_next;

	return bpf_mtap_hdr(arg, (caddr_t)&evh, sizeof(evh),
	    (struct mbuf *)&mh, direction, NULL);
#endif
}

/*
 * Move the packet data from interface memory (pkt) into the
 * store buffer.  Wake up listeners if needed.
 * "copy" is the routine called to do the actual data
 * transfer.  bcopy is passed in to copy contiguous chunks, while
 * bpf_mcopy is passed in to copy mbuf chains.  In the latter case,
 * pkt is really an mbuf.
 */
void
bpf_catchpacket(struct bpf_d *d, u_char *pkt, size_t pktlen, size_t snaplen,
    void (*cpfn)(const void *, void *, size_t), struct timeval *tv)
{
	struct bpf_hdr *hp;
	int totlen, curlen;
	int hdrlen, do_wakeup = 0;

	MUTEX_ASSERT_LOCKED(&d->bd_mtx);
	if (d->bd_bif == NULL)
		return;

	hdrlen = d->bd_bif->bif_hdrlen;

	/*
	 * Figure out how many bytes to move.  If the packet is
	 * greater or equal to the snapshot length, transfer that
	 * much.  Otherwise, transfer the whole packet (unless
	 * we hit the buffer size limit).
	 */
	totlen = hdrlen + min(snaplen, pktlen);
	if (totlen > d->bd_bufsize)
		totlen = d->bd_bufsize;

	/*
	 * Round up the end of the previous packet to the next longword.
	 */
	curlen = BPF_WORDALIGN(d->bd_slen);
	if (curlen + totlen > d->bd_bufsize) {
		/*
		 * This packet will overflow the storage buffer.
		 * Rotate the buffers if we can, then wakeup any
		 * pending reads.
		 */
		if (d->bd_fbuf == NULL) {
			/*
			 * We haven't completed the previous read yet,
			 * so drop the packet.
			 */
			++d->bd_dcount;
			return;
		}
		ROTATE_BUFFERS(d);
		do_wakeup = 1;
		curlen = 0;
	}

	/*
	 * Append the bpf header.
	 */
	hp = (struct bpf_hdr *)(d->bd_sbuf + curlen);
	hp->bh_tstamp.tv_sec = tv->tv_sec;
	hp->bh_tstamp.tv_usec = tv->tv_usec;
	hp->bh_datalen = pktlen;
	hp->bh_hdrlen = hdrlen;
	/*
	 * Copy the packet data into the store buffer and update its length.
	 */
	(*cpfn)(pkt, (u_char *)hp + hdrlen, (hp->bh_caplen = totlen - hdrlen));
	d->bd_slen = curlen + totlen;

	if (d->bd_immediate) {
		/*
		 * Immediate mode is set.  A packet arrived so any
		 * reads should be woken up.
		 */
		do_wakeup = 1;
	}

	if (d->bd_rdStart && (d->bd_rtout + d->bd_rdStart < ticks)) {
		/*
		 * we could be selecting on the bpf, and we
		 * may have timeouts set.  We got here by getting
		 * a packet, so wake up the reader.
		 */
		if (d->bd_fbuf != NULL) {
			d->bd_rdStart = 0;
			ROTATE_BUFFERS(d);
			do_wakeup = 1;
		}
	}

	if (do_wakeup)
		bpf_wakeup(d);
}

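/*
 * The store buffer layout produced above is what readers must walk:
 * each record is a struct bpf_hdr followed by bh_caplen bytes of
 * packet data, and the next record starts at the following
 * BPF_WORDALIGN boundary.  A sketch of the canonical loop over one
 * read(2)'s worth of data, where handle() is a placeholder for the
 * consumer's own code:
 *
 *	u_char *p = buf;
 *
 *	while (p < buf + n) {
 *		struct bpf_hdr *bh = (struct bpf_hdr *)p;
 *
 *		handle(p + bh->bh_hdrlen, bh->bh_caplen);
 *		p += BPF_WORDALIGN(bh->bh_hdrlen + bh->bh_caplen);
 *	}
 */
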
/*
 * Allocate the packet buffers of a descriptor.
 */
int
bpf_allocbufs(struct bpf_d *d)
{
	MUTEX_ASSERT_LOCKED(&d->bd_mtx);

	d->bd_fbuf = malloc(d->bd_bufsize, M_DEVBUF, M_NOWAIT);
	if (d->bd_fbuf == NULL)
		return (ENOMEM);

	d->bd_sbuf = malloc(d->bd_bufsize, M_DEVBUF, M_NOWAIT);
	if (d->bd_sbuf == NULL) {
		free(d->bd_fbuf, M_DEVBUF, d->bd_bufsize);
		return (ENOMEM);
	}

	d->bd_slen = 0;
	d->bd_hlen = 0;

	return (0);
}

void
bpf_get(struct bpf_d *bd)
{
	atomic_inc_int(&bd->bd_ref);
}

/*
 * Free buffers currently in use by a descriptor
 * when the reference count drops to zero.
 */
void
bpf_put(struct bpf_d *bd)
{
	if (atomic_dec_int_nv(&bd->bd_ref) > 0)
		return;

	free(bd->bd_sbuf, M_DEVBUF, 0);
	free(bd->bd_hbuf, M_DEVBUF, 0);
	free(bd->bd_fbuf, M_DEVBUF, 0);
	KERNEL_ASSERT_LOCKED();
	srp_update_locked(&bpf_insn_gc, &bd->bd_rfilter, NULL);
	srp_update_locked(&bpf_insn_gc, &bd->bd_wfilter, NULL);

	free(bd, M_DEVBUF, sizeof(*bd));
}

void *
bpfsattach(caddr_t *bpfp, const char *name, u_int dlt, u_int hdrlen)
{
	struct bpf_if *bp;

	if ((bp = malloc(sizeof(*bp), M_DEVBUF, M_NOWAIT)) == NULL)
		panic("bpfattach");
	SRPL_INIT(&bp->bif_dlist);
	bp->bif_driverp = (struct bpf_if **)bpfp;
	bp->bif_name = name;
	bp->bif_ifp = NULL;
	bp->bif_dlt = dlt;

	bp->bif_next = bpf_iflist;
	bpf_iflist = bp;

	*bp->bif_driverp = NULL;

	/*
	 * Compute the length of the bpf header.  This is not necessarily
	 * equal to SIZEOF_BPF_HDR because we want to insert spacing such
	 * that the network layer header begins on a longword boundary (for
	 * performance reasons and to alleviate alignment restrictions).
	 */
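	/*
	 * A worked example, assuming SIZEOF_BPF_HDR is 18 (the packed
	 * size of struct bpf_hdr): with Ethernet's hdrlen of 14,
	 * BPF_WORDALIGN(14 + 18) = 32, so bif_hdrlen becomes
	 * 32 - 14 = 18 and a captured record's network layer header
	 * lands at offset 32, a longword boundary.
	 */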
	bp->bif_hdrlen = BPF_WORDALIGN(hdrlen + SIZEOF_BPF_HDR) - hdrlen;

	return (bp);
}

void
bpfattach(caddr_t *driverp, struct ifnet *ifp, u_int dlt, u_int hdrlen)
{
	struct bpf_if *bp;

	bp = bpfsattach(driverp, ifp->if_xname, dlt, hdrlen);
	bp->bif_ifp = ifp;
}

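/*
 * Driver-side sketch of this linkage: an Ethernet driver (via
 * ether_ifattach()) registers with bpf roughly like this, after
 * which bpf_mtap() sees packets whenever a listener attaches and
 * sets ifp->if_bpf:
 *
 *	#if NBPFILTER > 0
 *		bpfattach(&ifp->if_bpf, ifp, DLT_EN10MB,
 *		    sizeof(struct ether_header));
 *	#endif
 */
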
/* Detach an interface from its attached bpf device.  */
void
bpfdetach(struct ifnet *ifp)
{
	struct bpf_if *bp, *nbp, **pbp = &bpf_iflist;

	KERNEL_ASSERT_LOCKED();

	for (bp = bpf_iflist; bp; bp = nbp) {
		nbp = bp->bif_next;
		if (bp->bif_ifp == ifp) {
			*pbp = nbp;

			bpfsdetach(bp);
		} else
			pbp = &bp->bif_next;
	}
	ifp->if_bpf = NULL;
}

void
bpfsdetach(void *p)
{
	struct bpf_if *bp = p;
	struct bpf_d *bd;
	int maj;

	/* Locate the major number. */
	for (maj = 0; maj < nchrdev; maj++)
		if (cdevsw[maj].d_open == bpfopen)
			break;

	while ((bd = SRPL_FIRST_LOCKED(&bp->bif_dlist)))
		vdevgone(maj, bd->bd_unit, bd->bd_unit, VCHR);

	free(bp, M_DEVBUF, sizeof *bp);
}

int
bpf_sysctl_locked(int *name, u_int namelen, void *oldp, size_t *oldlenp,
    void *newp, size_t newlen)
{
	int newval;
	int error;

	switch (name[0]) {
	case NET_BPF_BUFSIZE:
		newval = bpf_bufsize;
		error = sysctl_int(oldp, oldlenp, newp, newlen, &newval);
		if (error)
			return (error);
		if (newval < BPF_MINBUFSIZE || newval > bpf_maxbufsize)
			return (EINVAL);
		bpf_bufsize = newval;
		break;
	case NET_BPF_MAXBUFSIZE:
		newval = bpf_maxbufsize;
		error = sysctl_int(oldp, oldlenp, newp, newlen, &newval);
		if (error)
			return (error);
		if (newval < BPF_MINBUFSIZE)
			return (EINVAL);
		bpf_maxbufsize = newval;
		break;
	default:
		return (EOPNOTSUPP);
	}
	return (0);
}

int
bpf_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
    size_t newlen)
{
	int flags = RW_INTR;
	int error;

	if (namelen != 1)
		return (ENOTDIR);

	flags |= (newp == NULL) ? RW_READ : RW_WRITE;

	error = rw_enter(&bpf_sysctl_lk, flags);
	if (error != 0)
		return (error);

	error = bpf_sysctl_locked(name, namelen, oldp, oldlenp, newp, newlen);

	rw_exit(&bpf_sysctl_lk);

	return (error);
}

struct bpf_d *
bpfilter_lookup(int unit)
{
	struct bpf_d *bd;

	KERNEL_ASSERT_LOCKED();

	LIST_FOREACH(bd, &bpf_d_list, bd_list)
		if (bd->bd_unit == unit)
			return (bd);
	return (NULL);
}

/*
 * Get a list of the available data link types of the interface.
 */
int
bpf_getdltlist(struct bpf_d *d, struct bpf_dltlist *bfl)
{
	int n, error;
	struct bpf_if *bp;
	const char *name;

	name = d->bd_bif->bif_name;
	n = 0;
	error = 0;
	for (bp = bpf_iflist; bp != NULL; bp = bp->bif_next) {
		if (strcmp(name, bp->bif_name) != 0)
			continue;
		if (bfl->bfl_list != NULL) {
			if (n >= bfl->bfl_len)
				return (ENOMEM);
			error = copyout(&bp->bif_dlt,
			    bfl->bfl_list + n, sizeof(u_int));
			if (error)
				break;
		}
		n++;
	}

	bfl->bfl_len = n;
	return (error);
}

/*
 * Set the data link type of a BPF instance.
 */
int
bpf_setdlt(struct bpf_d *d, u_int dlt)
{
	const char *name;
	struct bpf_if *bp;

	MUTEX_ASSERT_LOCKED(&d->bd_mtx);
	if (d->bd_bif->bif_dlt == dlt)
		return (0);
	name = d->bd_bif->bif_name;
	for (bp = bpf_iflist; bp != NULL; bp = bp->bif_next) {
		if (strcmp(name, bp->bif_name) != 0)
			continue;
		if (bp->bif_dlt == dlt)
			break;
	}
	if (bp == NULL)
		return (EINVAL);
	bpf_detachd(d);
	bpf_attachd(d, bp);
	bpf_resetd(d);
	return (0);
}

void
bpf_d_ref(void *null, void *d)
{
	bpf_get(d);
}

void
bpf_d_unref(void *null, void *d)
{
	bpf_put(d);
}

void
bpf_insn_dtor(void *null, void *f)
{
	struct bpf_program *bf = f;
	struct bpf_insn *insns = bf->bf_insns;

	free(insns, M_DEVBUF, bf->bf_len * sizeof(*insns));
	free(bf, M_DEVBUF, sizeof(*bf));
}

u_int32_t	bpf_mbuf_ldw(const void *, u_int32_t, int *);
u_int32_t	bpf_mbuf_ldh(const void *, u_int32_t, int *);
u_int32_t	bpf_mbuf_ldb(const void *, u_int32_t, int *);

int		bpf_mbuf_copy(const struct mbuf *, u_int32_t,
		    void *, u_int32_t);

const struct bpf_ops bpf_mbuf_ops = {
	bpf_mbuf_ldw,
	bpf_mbuf_ldh,
	bpf_mbuf_ldb,
};

int
bpf_mbuf_copy(const struct mbuf *m, u_int32_t off, void *buf, u_int32_t len)
{
	u_int8_t *cp = buf;
	u_int32_t count;

	while (off >= m->m_len) {
		off -= m->m_len;

		m = m->m_next;
		if (m == NULL)
			return (-1);
	}

	for (;;) {
		count = min(m->m_len - off, len);

		memcpy(cp, m->m_data + off, count);
		len -= count;

		if (len == 0)
			return (0);

		m = m->m_next;
		if (m == NULL)
			break;

		cp += count;
		off = 0;
	}

	return (-1);
}

u_int32_t
bpf_mbuf_ldw(const void *m0, u_int32_t k, int *err)
{
	u_int32_t v;

	if (bpf_mbuf_copy(m0, k, &v, sizeof(v)) != 0) {
		*err = 1;
		return (0);
	}

	*err = 0;
	return ntohl(v);
}

u_int32_t
bpf_mbuf_ldh(const void *m0, u_int32_t k, int *err)
{
	u_int16_t v;

	if (bpf_mbuf_copy(m0, k, &v, sizeof(v)) != 0) {
		*err = 1;
		return (0);
	}

	*err = 0;
	return ntohs(v);
}

u_int32_t
bpf_mbuf_ldb(const void *m0, u_int32_t k, int *err)
{
	const struct mbuf *m = m0;
	u_int8_t v;

	while (k >= m->m_len) {
		k -= m->m_len;

		m = m->m_next;
		if (m == NULL) {
			*err = 1;
			return (0);
		}
	}
	v = m->m_data[k];

	*err = 0;
	return v;
}

u_int
bpf_mfilter(const struct bpf_insn *pc, const struct mbuf *m, u_int wirelen)
{
	return _bpf_filter(pc, &bpf_mbuf_ops, m, wirelen);
}
1909