xref: /netbsd-src/sys/netinet/ip_mroute.c (revision e6c7e151de239c49d2e38720a061ed9d1fa99309)
1 /*	$NetBSD: ip_mroute.c,v 1.163 2018/09/14 05:09:51 maxv Exp $	*/
2 
3 /*
4  * Copyright (c) 1992, 1993
5  *      The Regents of the University of California.  All rights reserved.
6  *
7  * This code is derived from software contributed to Berkeley by
8  * Stephen Deering of Stanford University.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  *      @(#)ip_mroute.c 8.2 (Berkeley) 11/15/93
35  */
36 
37 /*
38  * Copyright (c) 1989 Stephen Deering
39  *
40  * This code is derived from software contributed to Berkeley by
41  * Stephen Deering of Stanford University.
42  *
43  * Redistribution and use in source and binary forms, with or without
44  * modification, are permitted provided that the following conditions
45  * are met:
46  * 1. Redistributions of source code must retain the above copyright
47  *    notice, this list of conditions and the following disclaimer.
48  * 2. Redistributions in binary form must reproduce the above copyright
49  *    notice, this list of conditions and the following disclaimer in the
50  *    documentation and/or other materials provided with the distribution.
51  * 3. All advertising materials mentioning features or use of this software
52  *    must display the following acknowledgement:
53  *      This product includes software developed by the University of
54  *      California, Berkeley and its contributors.
55  * 4. Neither the name of the University nor the names of its contributors
56  *    may be used to endorse or promote products derived from this software
57  *    without specific prior written permission.
58  *
59  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
60  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
61  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
62  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
63  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
64  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
65  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
66  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
67  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
68  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
69  * SUCH DAMAGE.
70  *
71  *      @(#)ip_mroute.c 8.2 (Berkeley) 11/15/93
72  */
73 
74 /*
75  * IP multicast forwarding procedures
76  *
77  * Written by David Waitzman, BBN Labs, August 1988.
78  * Modified by Steve Deering, Stanford, February 1989.
79  * Modified by Mark J. Steiglitz, Stanford, May, 1991
80  * Modified by Van Jacobson, LBL, January 1993
81  * Modified by Ajit Thyagarajan, PARC, August 1993
82  * Modified by Bill Fenner, PARC, April 1994
83  * Modified by Charles M. Hannum, NetBSD, May 1995.
84  * Modified by Ahmed Helmy, SGI, June 1996
85  * Modified by George Edmond Eddy (Rusty), ISI, February 1998
86  * Modified by Pavlin Radoslavov, USC/ISI, May 1998, August 1999, October 2000
87  * Modified by Hitoshi Asaeda, WIDE, August 2000
88  * Modified by Pavlin Radoslavov, ICSI, October 2002
89  *
90  * MROUTING Revision: 1.2
91  * and PIM-SMv2 and PIM-DM support, advanced API support,
92  * bandwidth metering and signaling
93  */
94 
95 #include <sys/cdefs.h>
96 __KERNEL_RCSID(0, "$NetBSD: ip_mroute.c,v 1.163 2018/09/14 05:09:51 maxv Exp $");
97 
98 #ifdef _KERNEL_OPT
99 #include "opt_inet.h"
100 #include "opt_ipsec.h"
101 #include "opt_pim.h"
102 #endif
103 
104 #ifdef PIM
105 #define _PIM_VT 1
106 #endif
107 
108 #include <sys/param.h>
109 #include <sys/systm.h>
110 #include <sys/callout.h>
111 #include <sys/mbuf.h>
112 #include <sys/socket.h>
113 #include <sys/socketvar.h>
114 #include <sys/errno.h>
115 #include <sys/time.h>
116 #include <sys/kernel.h>
117 #include <sys/kmem.h>
118 #include <sys/ioctl.h>
119 #include <sys/syslog.h>
120 
121 #include <net/if.h>
122 #include <net/raw_cb.h>
123 
124 #include <netinet/in.h>
125 #include <netinet/in_var.h>
126 #include <netinet/in_systm.h>
127 #include <netinet/in_offload.h>
128 #include <netinet/ip.h>
129 #include <netinet/ip_var.h>
130 #include <netinet/in_pcb.h>
131 #include <netinet/udp.h>
132 #include <netinet/igmp.h>
133 #include <netinet/igmp_var.h>
134 #include <netinet/ip_mroute.h>
135 #ifdef PIM
136 #include <netinet/pim.h>
137 #include <netinet/pim_var.h>
138 #endif
139 #include <netinet/ip_encap.h>
140 
141 #ifdef IPSEC
142 #include <netipsec/ipsec.h>
143 #include <netipsec/key.h>
144 #endif
145 
146 #define IP_MULTICASTOPTS 0
147 #define	M_PULLUP(m, len)						 \
148 	do {								 \
149 		if ((m) && ((m)->m_flags & M_EXT || (m)->m_len < (len))) \
150 			(m) = m_pullup((m), (len));			 \
151 	} while (/*CONSTCOND*/ 0)
152 
153 /*
154  * Globals.  All but ip_mrouter and ip_mrtproto could be static,
155  * except for netstat or debugging purposes.
156  */
157 struct socket  *ip_mrouter  = NULL;
158 int		ip_mrtproto = IGMP_DVMRP;    /* for netstat only */
159 
160 #define	MFCHASH(a, g)							\
161 	((((a).s_addr >> 20) ^ ((a).s_addr >> 10) ^ (a).s_addr ^	\
162 	  ((g).s_addr >> 20) ^ ((g).s_addr >> 10) ^ (g).s_addr) & mfchash)
163 LIST_HEAD(mfchashhdr, mfc) *mfchashtbl;
164 u_long	mfchash;
165 
166 u_char		nexpire[MFCTBLSIZ];
167 struct vif	viftable[MAXVIFS];
168 struct mrtstat	mrtstat;
169 u_int		mrtdebug = 0;	/* debug level */
170 #define		DEBUG_MFC	0x02
171 #define		DEBUG_FORWARD	0x04
172 #define		DEBUG_EXPIRE	0x08
173 #define		DEBUG_XMIT	0x10
174 #define		DEBUG_PIM	0x20
175 
176 #define		VIFI_INVALID	((vifi_t) -1)
177 
178 u_int tbfdebug = 0;	/* tbf debug level */
179 
180 /* vif attachment using sys/netinet/ip_encap.c */
181 static void vif_input(struct mbuf *, int, int, void *);
182 static int vif_encapcheck(struct mbuf *, int, int, void *);
183 
184 static const struct encapsw vif_encapsw = {
185 	.encapsw4 = {
186 		.pr_input	= vif_input,
187 		.pr_ctlinput	= NULL,
188 	}
189 };
190 
191 #define		EXPIRE_TIMEOUT	(hz / 4)	/* 4x / second */
192 #define		UPCALL_EXPIRE	6		/* number of timeouts */
193 
194 /*
195  * Define the token bucket filter structures
196  */
197 
198 #define		TBF_REPROCESS	(hz / 100)	/* 100x / second */
199 
200 static int get_sg_cnt(struct sioc_sg_req *);
201 static int get_vif_cnt(struct sioc_vif_req *);
202 static int ip_mrouter_init(struct socket *, int);
203 static int set_assert(int);
204 static int add_vif(struct vifctl *);
205 static int del_vif(vifi_t *);
206 static void update_mfc_params(struct mfc *, struct mfcctl2 *);
207 static void init_mfc_params(struct mfc *, struct mfcctl2 *);
208 static void expire_mfc(struct mfc *);
209 static int add_mfc(struct sockopt *);
210 #ifdef UPCALL_TIMING
211 static void collate(struct timeval *);
212 #endif
213 static int del_mfc(struct sockopt *);
214 static int set_api_config(struct sockopt *); /* chose API capabilities */
215 static int socket_send(struct socket *, struct mbuf *, struct sockaddr_in *);
216 static void expire_upcalls(void *);
217 static int ip_mdq(struct mbuf *, struct ifnet *, struct mfc *);
218 static void phyint_send(struct ip *, struct vif *, struct mbuf *);
219 static void encap_send(struct ip *, struct vif *, struct mbuf *);
220 static void tbf_control(struct vif *, struct mbuf *, struct ip *, u_int32_t);
221 static void tbf_queue(struct vif *, struct mbuf *);
222 static void tbf_process_q(struct vif *);
223 static void tbf_reprocess_q(void *);
224 static int tbf_dq_sel(struct vif *, struct ip *);
225 static void tbf_send_packet(struct vif *, struct mbuf *);
226 static void tbf_update_tokens(struct vif *);
227 static int priority(struct vif *, struct ip *);
228 
229 /*
230  * Bandwidth monitoring
231  */
232 static void free_bw_list(struct bw_meter *);
233 static int add_bw_upcall(struct bw_upcall *);
234 static int del_bw_upcall(struct bw_upcall *);
235 static void bw_meter_receive_packet(struct bw_meter *, int , struct timeval *);
236 static void bw_meter_prepare_upcall(struct bw_meter *, struct timeval *);
237 static void bw_upcalls_send(void);
238 static void schedule_bw_meter(struct bw_meter *, struct timeval *);
239 static void unschedule_bw_meter(struct bw_meter *);
240 static void bw_meter_process(void);
241 static void expire_bw_upcalls_send(void *);
242 static void expire_bw_meter_process(void *);
243 
244 #ifdef PIM
245 static int pim_register_send(struct ip *, struct vif *,
246     struct mbuf *, struct mfc *);
247 static int pim_register_send_rp(struct ip *, struct vif *,
248     struct mbuf *, struct mfc *);
249 static int pim_register_send_upcall(struct ip *, struct vif *,
250     struct mbuf *, struct mfc *);
251 static struct mbuf *pim_register_prepare(struct ip *, struct mbuf *);
252 #endif
253 
254 #define	ENCAP_TTL	64
255 #define	ENCAP_PROTO	IPPROTO_IPIP
256 
257 /* prototype IP hdr for encapsulated packets */
258 static const struct ip multicast_encap_iphdr = {
259 	.ip_hl = sizeof(struct ip) >> 2,
260 	.ip_v = IPVERSION,
261 	.ip_len = sizeof(struct ip),
262 	.ip_ttl = ENCAP_TTL,
263 	.ip_p = ENCAP_PROTO,
264 };
265 
266 /*
267  * Bandwidth meter variables and constants
268  */
269 
270 /*
271  * Pending timeouts are stored in a hash table, the key being the
272  * expiration time. Periodically, the entries are analysed and processed.
273  */
274 #define BW_METER_BUCKETS	1024
275 static struct bw_meter *bw_meter_timers[BW_METER_BUCKETS];
276 struct callout bw_meter_ch;
277 #define BW_METER_PERIOD (hz)		/* periodical handling of bw meters */
278 
279 /*
280  * Pending upcalls are stored in a vector which is flushed when
281  * full, or periodically
282  */
283 static struct bw_upcall	bw_upcalls[BW_UPCALLS_MAX];
284 static u_int	bw_upcalls_n; /* # of pending upcalls */
285 struct callout	bw_upcalls_ch;
286 #define BW_UPCALLS_PERIOD (hz)		/* periodical flush of bw upcalls */
287 
288 #ifdef PIM
289 struct pimstat pimstat;
290 
291 /*
292  * Note: the PIM Register encapsulation adds the following in front of a
293  * data packet:
294  *
295  * struct pim_encap_hdr {
296  *     struct ip ip;
297  *     struct pim_encap_pimhdr  pim;
298  * }
299  */
300 
301 struct pim_encap_pimhdr {
302 	struct pim pim;
303 	uint32_t   flags;
304 };
305 
306 static struct ip pim_encap_iphdr = {
307 	.ip_v = IPVERSION,
308 	.ip_hl = sizeof(struct ip) >> 2,
309 	.ip_len = sizeof(struct ip),
310 	.ip_ttl = ENCAP_TTL,
311 	.ip_p = IPPROTO_PIM,
312 };
313 
314 static struct pim_encap_pimhdr pim_encap_pimhdr = {
315     {
316 	PIM_MAKE_VT(PIM_VERSION, PIM_REGISTER), /* PIM vers and message type */
317 	0,			/* reserved */
318 	0,			/* checksum */
319     },
320     0				/* flags */
321 };
322 
323 static struct ifnet multicast_register_if;
324 static vifi_t reg_vif_num = VIFI_INVALID;
325 #endif /* PIM */
326 
327 
328 /*
329  * Private variables.
330  */
331 static vifi_t	   numvifs = 0;
332 
333 static struct callout expire_upcalls_ch;
334 
335 /*
336  * whether or not special PIM assert processing is enabled.
337  */
338 static int pim_assert;
339 /*
340  * Rate limit for assert notification messages, in usec
341  */
342 #define ASSERT_MSG_TIME		3000000
343 
344 /*
345  * Kernel multicast routing API capabilities and setup.
346  * If more API capabilities are added to the kernel, they should be
347  * recorded in `mrt_api_support'.
348  */
349 static const u_int32_t mrt_api_support = (MRT_MFC_FLAGS_DISABLE_WRONGVIF |
350 					  MRT_MFC_FLAGS_BORDER_VIF |
351 					  MRT_MFC_RP |
352 					  MRT_MFC_BW_UPCALL);
353 static u_int32_t mrt_api_config = 0;
354 
355 /*
356  * Find a route for a given origin IP address and Multicast group address
357  * Type of service parameter to be added in the future!!!
358  * Statistics are updated by the caller if needed
359  * (mrtstat.mrts_mfc_lookups and mrtstat.mrts_mfc_misses)
360  */
361 static struct mfc *
362 mfc_find(struct in_addr *o, struct in_addr *g)
363 {
364 	struct mfc *rt;
365 
366 	LIST_FOREACH(rt, &mfchashtbl[MFCHASH(*o, *g)], mfc_hash) {
367 		if (in_hosteq(rt->mfc_origin, *o) &&
368 		    in_hosteq(rt->mfc_mcastgrp, *g) &&
369 		    (rt->mfc_stall == NULL))
370 			break;
371 	}
372 
373 	return rt;
374 }
375 
376 /*
377  * Macros to compute elapsed time efficiently
378  * Borrowed from Van Jacobson's scheduling code
379  */
380 #define TV_DELTA(a, b, delta) do {					\
381 	int xxs;							\
382 	delta = (a).tv_usec - (b).tv_usec;				\
383 	xxs = (a).tv_sec - (b).tv_sec;					\
384 	switch (xxs) {							\
385 	case 2:								\
386 		delta += 1000000;					\
387 		/* fall through */					\
388 	case 1:								\
389 		delta += 1000000;					\
390 		/* fall through */					\
391 	case 0:								\
392 		break;							\
393 	default:							\
394 		delta += (1000000 * xxs);				\
395 		break;							\
396 	}								\
397 } while (/*CONSTCOND*/ 0)
398 
399 #ifdef UPCALL_TIMING
400 u_int32_t upcall_data[51];
401 #endif /* UPCALL_TIMING */
402 
403 /*
404  * Handle MRT setsockopt commands to modify the multicast routing tables.
405  */
406 int
407 ip_mrouter_set(struct socket *so, struct sockopt *sopt)
408 {
409 	int error;
410 	int optval;
411 	struct vifctl vifc;
412 	vifi_t vifi;
413 	struct bw_upcall bwuc;
414 
415 	if (sopt->sopt_name != MRT_INIT && so != ip_mrouter)
416 		error = ENOPROTOOPT;
417 	else {
418 		switch (sopt->sopt_name) {
419 		case MRT_INIT:
420 			error = sockopt_getint(sopt, &optval);
421 			if (error)
422 				break;
423 
424 			error = ip_mrouter_init(so, optval);
425 			break;
426 		case MRT_DONE:
427 			error = ip_mrouter_done();
428 			break;
429 		case MRT_ADD_VIF:
430 			error = sockopt_get(sopt, &vifc, sizeof(vifc));
431 			if (error)
432 				break;
433 			error = add_vif(&vifc);
434 			break;
435 		case MRT_DEL_VIF:
436 			error = sockopt_get(sopt, &vifi, sizeof(vifi));
437 			if (error)
438 				break;
439 			error = del_vif(&vifi);
440 			break;
441 		case MRT_ADD_MFC:
442 			error = add_mfc(sopt);
443 			break;
444 		case MRT_DEL_MFC:
445 			error = del_mfc(sopt);
446 			break;
447 		case MRT_ASSERT:
448 			error = sockopt_getint(sopt, &optval);
449 			if (error)
450 				break;
451 			error = set_assert(optval);
452 			break;
453 		case MRT_API_CONFIG:
454 			error = set_api_config(sopt);
455 			break;
456 		case MRT_ADD_BW_UPCALL:
457 			error = sockopt_get(sopt, &bwuc, sizeof(bwuc));
458 			if (error)
459 				break;
460 			error = add_bw_upcall(&bwuc);
461 			break;
462 		case MRT_DEL_BW_UPCALL:
463 			error = sockopt_get(sopt, &bwuc, sizeof(bwuc));
464 			if (error)
465 				break;
466 			error = del_bw_upcall(&bwuc);
467 			break;
468 		default:
469 			error = ENOPROTOOPT;
470 			break;
471 		}
472 	}
473 	return error;
474 }
475 
476 /*
477  * Handle MRT getsockopt commands
478  */
479 int
480 ip_mrouter_get(struct socket *so, struct sockopt *sopt)
481 {
482 	int error;
483 
484 	if (so != ip_mrouter)
485 		error = ENOPROTOOPT;
486 	else {
487 		switch (sopt->sopt_name) {
488 		case MRT_VERSION:
489 			error = sockopt_setint(sopt, 0x0305); /* XXX !!!! */
490 			break;
491 		case MRT_ASSERT:
492 			error = sockopt_setint(sopt, pim_assert);
493 			break;
494 		case MRT_API_SUPPORT:
495 			error = sockopt_set(sopt, &mrt_api_support,
496 			    sizeof(mrt_api_support));
497 			break;
498 		case MRT_API_CONFIG:
499 			error = sockopt_set(sopt, &mrt_api_config,
500 			    sizeof(mrt_api_config));
501 			break;
502 		default:
503 			error = ENOPROTOOPT;
504 			break;
505 		}
506 	}
507 	return error;
508 }
509 
510 /*
511  * Handle ioctl commands to obtain information from the cache
512  */
513 int
514 mrt_ioctl(struct socket *so, u_long cmd, void *data)
515 {
516 	int error;
517 
518 	if (so != ip_mrouter)
519 		error = EINVAL;
520 	else
521 		switch (cmd) {
522 		case SIOCGETVIFCNT:
523 			error = get_vif_cnt((struct sioc_vif_req *)data);
524 			break;
525 		case SIOCGETSGCNT:
526 			error = get_sg_cnt((struct sioc_sg_req *)data);
527 			break;
528 		default:
529 			error = EINVAL;
530 			break;
531 		}
532 
533 	return error;
534 }
535 
536 /*
537  * returns the packet, byte, rpf-failure count for the source group provided
538  */
539 static int
540 get_sg_cnt(struct sioc_sg_req *req)
541 {
542 	int s;
543 	struct mfc *rt;
544 
545 	s = splsoftnet();
546 	rt = mfc_find(&req->src, &req->grp);
547 	if (rt == NULL) {
548 		splx(s);
549 		req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffff;
550 		return EADDRNOTAVAIL;
551 	}
552 	req->pktcnt = rt->mfc_pkt_cnt;
553 	req->bytecnt = rt->mfc_byte_cnt;
554 	req->wrong_if = rt->mfc_wrong_if;
555 	splx(s);
556 
557 	return 0;
558 }
559 
560 /*
561  * returns the input and output packet and byte counts on the vif provided
562  */
563 static int
564 get_vif_cnt(struct sioc_vif_req *req)
565 {
566 	vifi_t vifi = req->vifi;
567 
568 	if (vifi >= numvifs)
569 		return EINVAL;
570 
571 	req->icount = viftable[vifi].v_pkt_in;
572 	req->ocount = viftable[vifi].v_pkt_out;
573 	req->ibytes = viftable[vifi].v_bytes_in;
574 	req->obytes = viftable[vifi].v_bytes_out;
575 
576 	return 0;
577 }
578 
/*
 * Enable multicast routing: register "so" as the single mrouted
 * control socket, build an empty MFC hash table and start the three
 * periodic callouts (upcall expiry, bw upcall flush, bw meter scan).
 * "v" is the API version requested by the daemon; only 1 is accepted.
 */
static int
ip_mrouter_init(struct socket *so, int v)
{
	if (mrtdebug)
		log(LOG_DEBUG,
		    "ip_mrouter_init: so_type = %d, pr_protocol = %d\n",
		    so->so_type, so->so_proto->pr_protocol);

	/* Only a raw IGMP socket may become the mrouted socket. */
	if (so->so_type != SOCK_RAW ||
	    so->so_proto->pr_protocol != IPPROTO_IGMP)
		return EOPNOTSUPP;

	if (v != 1)
		return EINVAL;

	/* Only one multicast routing daemon may be active at a time. */
	if (ip_mrouter != NULL)
		return EADDRINUSE;

	ip_mrouter = so;

	/* Fresh, empty multicast forwarding cache. */
	mfchashtbl = hashinit(MFCTBLSIZ, HASH_LIST, true, &mfchash);
	memset((void *)nexpire, 0, sizeof(nexpire));

	pim_assert = 0;

	/* Periodic cleanup of unresolved upcall entries. */
	callout_init(&expire_upcalls_ch, 0);
	callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT,
		      expire_upcalls, NULL);

	/* Periodic flush of pending bandwidth upcalls. */
	callout_init(&bw_upcalls_ch, 0);
	callout_reset(&bw_upcalls_ch, BW_UPCALLS_PERIOD,
		      expire_bw_upcalls_send, NULL);

	/* Periodic processing of the bandwidth meter timers. */
	callout_init(&bw_meter_ch, 0);
	callout_reset(&bw_meter_ch, BW_METER_PERIOD,
		      expire_bw_meter_process, NULL);

	if (mrtdebug)
		log(LOG_DEBUG, "ip_mrouter_init\n");

	return 0;
}
624 
/*
 * Disable multicast routing: undo everything ip_mrouter_init() and the
 * daemon set up — tear down all vifs, stop the periodic callouts, free
 * the whole MFC table, and release the mrouted socket registration.
 */
int
ip_mrouter_done(void)
{
	vifi_t vifi;
	struct vif *vifp;
	int i;
	int s;

	s = splsoftnet();

	/* Clear out all the vifs currently in use. */
	for (vifi = 0; vifi < numvifs; vifi++) {
		vifp = &viftable[vifi];
		if (!in_nullhost(vifp->v_lcl_addr))
			reset_vif(vifp);
	}

	numvifs = 0;
	pim_assert = 0;
	mrt_api_config = 0;

	/* Stop the periodic timers started by ip_mrouter_init(). */
	callout_stop(&expire_upcalls_ch);
	callout_stop(&bw_upcalls_ch);
	callout_stop(&bw_meter_ch);

	/*
	 * Free all multicast forwarding cache entries.
	 */
	for (i = 0; i < MFCTBLSIZ; i++) {
		struct mfc *rt, *nrt;

		/* expire_mfc() unlinks rt, so fetch the successor first. */
		for (rt = LIST_FIRST(&mfchashtbl[i]); rt; rt = nrt) {
			nrt = LIST_NEXT(rt, mfc_hash);

			expire_mfc(rt);
		}
	}

	memset((void *)nexpire, 0, sizeof(nexpire));
	hashdone(mfchashtbl, HASH_LIST, mfchash);
	mfchashtbl = NULL;

	bw_upcalls_n = 0;
	memset(bw_meter_timers, 0, sizeof(bw_meter_timers));

	/* Reset de-encapsulation cache. */

	ip_mrouter = NULL;

	splx(s);

	if (mrtdebug)
		log(LOG_DEBUG, "ip_mrouter_done\n");

	return 0;
}
684 
685 void
686 ip_mrouter_detach(struct ifnet *ifp)
687 {
688 	int vifi, i;
689 	struct vif *vifp;
690 	struct mfc *rt;
691 	struct rtdetq *rte;
692 
693 	/* XXX not sure about side effect to userland routing daemon */
694 	for (vifi = 0; vifi < numvifs; vifi++) {
695 		vifp = &viftable[vifi];
696 		if (vifp->v_ifp == ifp)
697 			reset_vif(vifp);
698 	}
699 	for (i = 0; i < MFCTBLSIZ; i++) {
700 		if (nexpire[i] == 0)
701 			continue;
702 		LIST_FOREACH(rt, &mfchashtbl[i], mfc_hash) {
703 			for (rte = rt->mfc_stall; rte; rte = rte->next) {
704 				if (rte->ifp == ifp)
705 					rte->ifp = NULL;
706 			}
707 		}
708 	}
709 }
710 
711 /*
712  * Set PIM assert processing global
713  */
714 static int
715 set_assert(int i)
716 {
717 	pim_assert = !!i;
718 	return 0;
719 }
720 
721 /*
722  * Configure API capabilities
723  */
724 static int
725 set_api_config(struct sockopt *sopt)
726 {
727 	u_int32_t apival;
728 	int i, error;
729 
730 	/*
731 	 * We can set the API capabilities only if it is the first operation
732 	 * after MRT_INIT. I.e.:
733 	 *  - there are no vifs installed
734 	 *  - pim_assert is not enabled
735 	 *  - the MFC table is empty
736 	 */
737 	error = sockopt_get(sopt, &apival, sizeof(apival));
738 	if (error)
739 		return error;
740 	if (numvifs > 0)
741 		return EPERM;
742 	if (pim_assert)
743 		return EPERM;
744 	for (i = 0; i < MFCTBLSIZ; i++) {
745 		if (LIST_FIRST(&mfchashtbl[i]) != NULL)
746 			return EPERM;
747 	}
748 
749 	mrt_api_config = apival & mrt_api_support;
750 	return 0;
751 }
752 
/*
 * Add a vif to the vif table at the slot requested by the daemon.
 * Three flavours: VIFF_TUNNEL (IP-in-IP decapsulating tunnel with a
 * fake ifnet), VIFF_REGISTER (PIM register pseudo-interface, PIM only),
 * and a plain physical multicast-capable interface.
 */
static int
add_vif(struct vifctl *vifcp)
{
	struct vif *vifp;
	struct ifnet *ifp;
	int error, s;
	struct sockaddr_in sin;

	if (vifcp->vifc_vifi >= MAXVIFS)
		return EINVAL;
	if (in_nullhost(vifcp->vifc_lcl_addr))
		return EADDRNOTAVAIL;

	/* The requested slot must not already be occupied. */
	vifp = &viftable[vifcp->vifc_vifi];
	if (!in_nullhost(vifp->v_lcl_addr))
		return EADDRINUSE;

	/* Find the interface with an address in AF_INET family. */
#ifdef PIM
	if (vifcp->vifc_flags & VIFF_REGISTER) {
		/*
		 * XXX: Because VIFF_REGISTER does not really need a valid
		 * local interface (e.g. it could be 127.0.0.2), we don't
		 * check its address.
		 */
		ifp = NULL;
	} else
#endif
	{
		struct ifaddr *ifa;

		sockaddr_in_init(&sin, &vifcp->vifc_lcl_addr, 0);
		s = pserialize_read_enter();
		ifa = ifa_ifwithaddr(sintosa(&sin));
		if (ifa == NULL) {
			pserialize_read_exit(s);
			return EADDRNOTAVAIL;
		}
		ifp = ifa->ifa_ifp;
		/* FIXME NOMPSAFE */
		pserialize_read_exit(s);
	}

	if (vifcp->vifc_flags & VIFF_TUNNEL) {
		if (vifcp->vifc_flags & VIFF_SRCRT) {
			log(LOG_ERR, "source routed tunnels not supported\n");
			return EOPNOTSUPP;
		}

		/* attach this vif to decapsulator dispatch table */
		/*
		 * XXX Use addresses in registration so that matching
		 * can be done with radix tree in decapsulator.  But,
		 * we need to check inner header for multicast, so
		 * this requires both radix tree lookup and then a
		 * function to check, and this is not supported yet.
		 */
		error = encap_lock_enter();
		if (error)
			return error;
		vifp->v_encap_cookie = encap_attach_func(AF_INET, IPPROTO_IPV4,
		    vif_encapcheck, &vif_encapsw, vifp);
		encap_lock_exit();
		if (!vifp->v_encap_cookie)
			return EINVAL;

		/* Create a fake encapsulation interface. */
		ifp = malloc(sizeof(*ifp), M_MRTABLE, M_WAITOK|M_ZERO);
		snprintf(ifp->if_xname, sizeof(ifp->if_xname),
			 "mdecap%d", vifcp->vifc_vifi);

		/* Prepare cached route entry. */
		memset(&vifp->v_route, 0, sizeof(vifp->v_route));
#ifdef PIM
	} else if (vifcp->vifc_flags & VIFF_REGISTER) {
		ifp = &multicast_register_if;
		if (mrtdebug)
			log(LOG_DEBUG, "Adding a register vif, ifp: %p\n",
			    (void *)ifp);
		/* Lazily set up the single shared register pseudo-ifnet. */
		if (reg_vif_num == VIFI_INVALID) {
			memset(ifp, 0, sizeof(*ifp));
			snprintf(ifp->if_xname, sizeof(ifp->if_xname),
				 "register_vif");
			ifp->if_flags = IFF_LOOPBACK;
			memset(&vifp->v_route, 0, sizeof(vifp->v_route));
			reg_vif_num = vifcp->vifc_vifi;
		}
#endif
	} else {
		/* Make sure the interface supports multicast. */
		if ((ifp->if_flags & IFF_MULTICAST) == 0)
			return EOPNOTSUPP;

		/* Enable promiscuous reception of all IP multicasts. */
		sockaddr_in_init(&sin, &zeroin_addr, 0);
		error = if_mcast_op(ifp, SIOCADDMULTI, sintosa(&sin));
		if (error)
			return error;
	}

	s = splsoftnet();

	/* Define parameters for the tbf structure. */
	vifp->tbf_q = NULL;
	vifp->tbf_t = &vifp->tbf_q;
	microtime(&vifp->tbf_last_pkt_t);
	vifp->tbf_n_tok = 0;
	vifp->tbf_q_len = 0;
	vifp->tbf_max_q_len = MAXQSIZE;

	vifp->v_flags = vifcp->vifc_flags;
	vifp->v_threshold = vifcp->vifc_threshold;
	/* scaling up here allows division by 1024 in critical code */
	vifp->v_rate_limit = vifcp->vifc_rate_limit * 1024 / 1000;
	vifp->v_lcl_addr = vifcp->vifc_lcl_addr;
	vifp->v_rmt_addr = vifcp->vifc_rmt_addr;
	vifp->v_ifp = ifp;
	/* Initialize per vif pkt counters. */
	vifp->v_pkt_in = 0;
	vifp->v_pkt_out = 0;
	vifp->v_bytes_in = 0;
	vifp->v_bytes_out = 0;

	callout_init(&vifp->v_repq_ch, 0);

	splx(s);

	/* Adjust numvifs up if the vifi is higher than numvifs. */
	if (numvifs <= vifcp->vifc_vifi)
		numvifs = vifcp->vifc_vifi + 1;

	if (mrtdebug)
		log(LOG_DEBUG, "add_vif #%d, lcladdr %x, %s %x, thresh %x, rate %d\n",
		    vifcp->vifc_vifi,
		    ntohl(vifcp->vifc_lcl_addr.s_addr),
		    (vifcp->vifc_flags & VIFF_TUNNEL) ? "rmtaddr" : "mask",
		    ntohl(vifcp->vifc_rmt_addr.s_addr),
		    vifcp->vifc_threshold,
		    vifcp->vifc_rate_limit);

	return 0;
}
898 
/*
 * Release all state associated with a vif: stop its reprocess-queue
 * callout, detach the decapsulation hook, drain the token bucket
 * queue, undo add_vif()'s per-flavour setup, and zero the slot.
 */
void
reset_vif(struct vif *vifp)
{
	struct mbuf *m, *n;
	struct ifnet *ifp;
	struct sockaddr_in sin;

	callout_stop(&vifp->v_repq_ch);

	/* detach this vif from decapsulator dispatch table */
	encap_lock_enter();
	encap_detach(vifp->v_encap_cookie);
	encap_lock_exit();
	vifp->v_encap_cookie = NULL;

	/*
	 * Free packets queued at the interface
	 */
	for (m = vifp->tbf_q; m != NULL; m = n) {
		n = m->m_nextpkt;
		m_freem(m);
	}

	if (vifp->v_flags & VIFF_TUNNEL)
		/* add_vif() malloc'ed a fake decap ifnet for tunnels */
		free(vifp->v_ifp, M_MRTABLE);
	else if (vifp->v_flags & VIFF_REGISTER) {
#ifdef PIM
		reg_vif_num = VIFI_INVALID;
#endif
	} else {
		/* Undo the SIOCADDMULTI performed by add_vif(). */
		sockaddr_in_init(&sin, &zeroin_addr, 0);
		ifp = vifp->v_ifp;
		if_mcast_op(ifp, SIOCDELMULTI, sintosa(&sin));
	}
	memset((void *)vifp, 0, sizeof(*vifp));
}
935 
936 /*
937  * Delete a vif from the vif table
938  */
939 static int
940 del_vif(vifi_t *vifip)
941 {
942 	struct vif *vifp;
943 	vifi_t vifi;
944 	int s;
945 
946 	if (*vifip >= numvifs)
947 		return EINVAL;
948 
949 	vifp = &viftable[*vifip];
950 	if (in_nullhost(vifp->v_lcl_addr))
951 		return EADDRNOTAVAIL;
952 
953 	s = splsoftnet();
954 
955 	reset_vif(vifp);
956 
957 	/* Adjust numvifs down */
958 	for (vifi = numvifs; vifi > 0; vifi--)
959 		if (!in_nullhost(viftable[vifi - 1].v_lcl_addr))
960 			break;
961 	numvifs = vifi;
962 
963 	splx(s);
964 
965 	if (mrtdebug)
966 		log(LOG_DEBUG, "del_vif %d, numvifs %d\n", *vifip, numvifs);
967 
968 	return 0;
969 }
970 
971 /*
972  * update an mfc entry without resetting counters and S,G addresses.
973  */
974 static void
975 update_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp)
976 {
977 	int i;
978 
979 	rt->mfc_parent = mfccp->mfcc_parent;
980 	for (i = 0; i < numvifs; i++) {
981 		rt->mfc_ttls[i] = mfccp->mfcc_ttls[i];
982 		rt->mfc_flags[i] = mfccp->mfcc_flags[i] & mrt_api_config &
983 			MRT_MFC_FLAGS_ALL;
984 	}
985 	/* set the RP address */
986 	if (mrt_api_config & MRT_MFC_RP)
987 		rt->mfc_rp = mfccp->mfcc_rp;
988 	else
989 		rt->mfc_rp = zeroin_addr;
990 }
991 
992 /*
993  * fully initialize an mfc entry from the parameter.
994  */
995 static void
996 init_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp)
997 {
998 	rt->mfc_origin     = mfccp->mfcc_origin;
999 	rt->mfc_mcastgrp   = mfccp->mfcc_mcastgrp;
1000 
1001 	update_mfc_params(rt, mfccp);
1002 
1003 	/* initialize pkt counters per src-grp */
1004 	rt->mfc_pkt_cnt    = 0;
1005 	rt->mfc_byte_cnt   = 0;
1006 	rt->mfc_wrong_if   = 0;
1007 	timerclear(&rt->mfc_last_assert);
1008 }
1009 
1010 static void
1011 expire_mfc(struct mfc *rt)
1012 {
1013 	struct rtdetq *rte, *nrte;
1014 
1015 	free_bw_list(rt->mfc_bw_meter);
1016 
1017 	for (rte = rt->mfc_stall; rte != NULL; rte = nrte) {
1018 		nrte = rte->next;
1019 		m_freem(rte->m);
1020 		free(rte, M_MRTABLE);
1021 	}
1022 
1023 	LIST_REMOVE(rt, mfc_hash);
1024 	free(rt, M_MRTABLE);
1025 }
1026 
/*
 * Add an mfc entry (MRT_ADD_MFC).  Three cases: an already-resolved
 * entry is simply updated; a stalled entry (pending upcall) is
 * resolved and its queued packets forwarded; otherwise a brand-new
 * entry is created.
 */
static int
add_mfc(struct sockopt *sopt)
{
	struct mfcctl2 mfcctl2;
	struct mfcctl2 *mfccp;
	struct mfc *rt;
	u_int32_t hash = 0;
	struct rtdetq *rte, *nrte;
	u_short nstl;
	int s;
	int error;

	/*
	 * select data size depending on API version.
	 * With the advanced API the daemon passes the larger mfcctl2;
	 * otherwise only a plain mfcctl is copied in and the extra
	 * fields stay zeroed from the memset below.
	 */
	mfccp = &mfcctl2;
	memset(&mfcctl2, 0, sizeof(mfcctl2));

	if (mrt_api_config & MRT_API_FLAGS_ALL)
		error = sockopt_get(sopt, mfccp, sizeof(struct mfcctl2));
	else
		error = sockopt_get(sopt, mfccp, sizeof(struct mfcctl));

	if (error)
		return error;

	s = splsoftnet();
	rt = mfc_find(&mfccp->mfcc_origin, &mfccp->mfcc_mcastgrp);

	/* If an entry already exists, just update the fields */
	if (rt) {
		if (mrtdebug & DEBUG_MFC)
			log(LOG_DEBUG, "add_mfc update o %x g %x p %x\n",
			    ntohl(mfccp->mfcc_origin.s_addr),
			    ntohl(mfccp->mfcc_mcastgrp.s_addr),
			    mfccp->mfcc_parent);

		update_mfc_params(rt, mfccp);

		splx(s);
		return 0;
	}

	/*
	 * Find the entry for which the upcall was made and update
	 */
	nstl = 0;
	hash = MFCHASH(mfccp->mfcc_origin, mfccp->mfcc_mcastgrp);
	LIST_FOREACH(rt, &mfchashtbl[hash], mfc_hash) {
		if (in_hosteq(rt->mfc_origin, mfccp->mfcc_origin) &&
		    in_hosteq(rt->mfc_mcastgrp, mfccp->mfcc_mcastgrp) &&
		    rt->mfc_stall != NULL) {
			/* A given S,G should have at most one stalled entry. */
			if (nstl++)
				log(LOG_ERR, "add_mfc %s o %x g %x p %x dbx %p\n",
				    "multiple kernel entries",
				    ntohl(mfccp->mfcc_origin.s_addr),
				    ntohl(mfccp->mfcc_mcastgrp.s_addr),
				    mfccp->mfcc_parent, rt->mfc_stall);

			if (mrtdebug & DEBUG_MFC)
				log(LOG_DEBUG, "add_mfc o %x g %x p %x dbg %p\n",
				    ntohl(mfccp->mfcc_origin.s_addr),
				    ntohl(mfccp->mfcc_mcastgrp.s_addr),
				    mfccp->mfcc_parent, rt->mfc_stall);

			rte = rt->mfc_stall;
			init_mfc_params(rt, mfccp);
			rt->mfc_stall = NULL;

			rt->mfc_expire = 0; /* Don't clean this guy up */
			nexpire[hash]--;

			/* free packets Qed at the end of this entry */
			for (; rte != NULL; rte = nrte) {
				nrte = rte->next;
				if (rte->ifp) {
					/* Forward the queued packet now
					 * that the route is resolved. */
					ip_mdq(rte->m, rte->ifp, rt);
				}
				m_freem(rte->m);
#ifdef UPCALL_TIMING
				collate(&rte->t);
#endif /* UPCALL_TIMING */
				free(rte, M_MRTABLE);
			}
		}
	}

	/*
	 * It is possible that an entry is being inserted without an upcall
	 */
	if (nstl == 0) {
		/*
		 * No mfc; make a new one
		 */
		if (mrtdebug & DEBUG_MFC)
			log(LOG_DEBUG, "add_mfc no upcall o %x g %x p %x\n",
			    ntohl(mfccp->mfcc_origin.s_addr),
			    ntohl(mfccp->mfcc_mcastgrp.s_addr),
			    mfccp->mfcc_parent);

		/* Reuse a matching entry on the chain if one exists. */
		LIST_FOREACH(rt, &mfchashtbl[hash], mfc_hash) {
			if (in_hosteq(rt->mfc_origin, mfccp->mfcc_origin) &&
			    in_hosteq(rt->mfc_mcastgrp, mfccp->mfcc_mcastgrp)) {
				init_mfc_params(rt, mfccp);
				if (rt->mfc_expire)
					nexpire[hash]--;
				rt->mfc_expire = 0;
				break; /* XXX */
			}
		}
		if (rt == NULL) {	/* no upcall, so make a new entry */
			rt = malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT);
			if (rt == NULL) {
				splx(s);
				return ENOBUFS;
			}

			init_mfc_params(rt, mfccp);
			rt->mfc_expire	= 0;
			rt->mfc_stall	= NULL;
			rt->mfc_bw_meter = NULL;

			/* insert new entry at head of hash chain */
			LIST_INSERT_HEAD(&mfchashtbl[hash], rt, mfc_hash);
		}
	}

	splx(s);
	return 0;
}
1160 
#ifdef UPCALL_TIMING
/*
 * Record the delay of one answered upcall in the upcall_data histogram.
 * The bucket index is the delay shifted down by 10 bits (~millisecond
 * granularity), saturated at bucket 50.
 */
static void
collate(struct timeval *t)
{
	struct timeval now;
	u_int32_t delta;
	u_int32_t bucket;

	microtime(&now);

	/* Only count upcalls whose timestamp lies in the past. */
	if (!timercmp(t, &now, <))
		return;

	TV_DELTA(now, *t, delta);

	bucket = delta >> 10;
	if (bucket > 50)
		bucket = 50;

	++upcall_data[bucket];
}
#endif /* UPCALL_TIMING */
1185 
1186 /*
1187  * Delete an mfc entry
1188  */
1189 static int
1190 del_mfc(struct sockopt *sopt)
1191 {
1192 	struct mfcctl2 mfcctl2;
1193 	struct mfcctl2 *mfccp;
1194 	struct mfc *rt;
1195 	int s;
1196 	int error;
1197 
1198 	/*
1199 	 * XXX: for deleting MFC entries the information in entries
1200 	 * of size "struct mfcctl" is sufficient.
1201 	 */
1202 
1203 	mfccp = &mfcctl2;
1204 	memset(&mfcctl2, 0, sizeof(mfcctl2));
1205 
1206 	error = sockopt_get(sopt, mfccp, sizeof(struct mfcctl));
1207 	if (error) {
1208 		/* Try with the size of mfcctl2. */
1209 		error = sockopt_get(sopt, mfccp, sizeof(struct mfcctl2));
1210 		if (error)
1211 			return error;
1212 	}
1213 
1214 	if (mrtdebug & DEBUG_MFC)
1215 		log(LOG_DEBUG, "del_mfc origin %x mcastgrp %x\n",
1216 		    ntohl(mfccp->mfcc_origin.s_addr),
1217 		    ntohl(mfccp->mfcc_mcastgrp.s_addr));
1218 
1219 	s = splsoftnet();
1220 
1221 	rt = mfc_find(&mfccp->mfcc_origin, &mfccp->mfcc_mcastgrp);
1222 	if (rt == NULL) {
1223 		splx(s);
1224 		return EADDRNOTAVAIL;
1225 	}
1226 
1227 	/*
1228 	 * free the bw_meter entries
1229 	 */
1230 	free_bw_list(rt->mfc_bw_meter);
1231 	rt->mfc_bw_meter = NULL;
1232 
1233 	LIST_REMOVE(rt, mfc_hash);
1234 	free(rt, M_MRTABLE);
1235 
1236 	splx(s);
1237 	return 0;
1238 }
1239 
1240 static int
1241 socket_send(struct socket *s, struct mbuf *mm, struct sockaddr_in *src)
1242 {
1243 	if (s) {
1244 		if (sbappendaddr(&s->so_rcv, sintosa(src), mm, NULL) != 0) {
1245 			sorwakeup(s);
1246 			return 0;
1247 		}
1248 		soroverflow(s);
1249 	}
1250 	m_freem(mm);
1251 	return -1;
1252 }
1253 
1254 /*
1255  * IP multicast forwarding function. This function assumes that the packet
1256  * pointed to by "ip" has arrived on (or is about to be sent to) the interface
1257  * pointed to by "ifp", and the packet is to be relayed to other networks
1258  * that have members of the packet's destination IP multicast group.
1259  *
1260  * The packet is returned unscathed to the caller, unless it is
1261  * erroneous, in which case a non-zero return value tells the caller to
1262  * discard it.
1263  */
1264 
1265 #define IP_HDR_LEN  20	/* # bytes of fixed IP header (excluding options) */
1266 #define TUNNEL_LEN  12  /* # bytes of IP option for tunnel encapsulation  */
1267 
1268 int
1269 ip_mforward(struct mbuf *m, struct ifnet *ifp)
1270 {
1271 	struct ip *ip = mtod(m, struct ip *);
1272 	struct mfc *rt;
1273 	static int srctun = 0;
1274 	struct mbuf *mm;
1275 	struct sockaddr_in sin;
1276 	int s;
1277 	vifi_t vifi;
1278 
1279 	if (mrtdebug & DEBUG_FORWARD)
1280 		log(LOG_DEBUG, "ip_mforward: src %x, dst %x, ifp %p\n",
1281 		    ntohl(ip->ip_src.s_addr), ntohl(ip->ip_dst.s_addr), ifp);
1282 
1283 	/*
1284 	 * XXX XXX: Why do we check [1] against IPOPT_LSRR? Because we
1285 	 * expect [0] to be IPOPT_NOP, maybe? In all cases that doesn't
1286 	 * make a lot of sense, a forged packet can just put two IPOPT_NOPs
1287 	 * followed by one IPOPT_LSRR, and bypass the check.
1288 	 */
1289 	if (ip->ip_hl < (IP_HDR_LEN + TUNNEL_LEN) >> 2 ||
1290 	    ((u_char *)(ip + 1))[1] != IPOPT_LSRR) {
1291 		/*
1292 		 * Packet arrived via a physical interface or
1293 		 * an encapsulated tunnel or a register_vif.
1294 		 */
1295 	} else {
1296 		/*
1297 		 * Packet arrived through a source-route tunnel.
1298 		 * Source-route tunnels are no longer supported.
1299 		 */
1300 		if ((srctun++ % 1000) == 0)
1301 			log(LOG_ERR,
1302 			    "ip_mforward: received source-routed packet from %x\n",
1303 			    ntohl(ip->ip_src.s_addr));
1304 		return EOPNOTSUPP;
1305 	}
1306 
1307 	/*
1308 	 * Clear any in-bound checksum flags for this packet.
1309 	 */
1310 	m->m_pkthdr.csum_flags = 0;
1311 
1312 	/*
1313 	 * Don't forward a packet with time-to-live of zero or one,
1314 	 * or a packet destined to a local-only group.
1315 	 */
1316 	if (ip->ip_ttl <= 1 || IN_LOCAL_GROUP(ip->ip_dst.s_addr))
1317 		return 0;
1318 
1319 	/*
1320 	 * Determine forwarding vifs from the forwarding cache table
1321 	 */
1322 	s = splsoftnet();
1323 	++mrtstat.mrts_mfc_lookups;
1324 	rt = mfc_find(&ip->ip_src, &ip->ip_dst);
1325 
1326 	/* Entry exists, so forward if necessary */
1327 	if (rt != NULL) {
1328 		splx(s);
1329 		return ip_mdq(m, ifp, rt);
1330 	} else {
1331 		/*
1332 		 * If we don't have a route for packet's origin, make a copy
1333 		 * of the packet and send message to routing daemon.
1334 		 */
1335 
1336 		struct mbuf *mb0;
1337 		struct rtdetq *rte;
1338 		u_int32_t hash;
1339 		const int hlen = ip->ip_hl << 2;
1340 #ifdef UPCALL_TIMING
1341 		struct timeval tp;
1342 		microtime(&tp);
1343 #endif
1344 
1345 		++mrtstat.mrts_mfc_misses;
1346 
1347 		mrtstat.mrts_no_route++;
1348 		if (mrtdebug & (DEBUG_FORWARD | DEBUG_MFC))
1349 			log(LOG_DEBUG, "ip_mforward: no rte s %x g %x\n",
1350 			    ntohl(ip->ip_src.s_addr),
1351 			    ntohl(ip->ip_dst.s_addr));
1352 
1353 		/*
1354 		 * Allocate mbufs early so that we don't do extra work if we are
1355 		 * just going to fail anyway.  Make sure to pullup the header so
1356 		 * that other people can't step on it.
1357 		 */
1358 		rte = malloc(sizeof(*rte), M_MRTABLE, M_NOWAIT);
1359 		if (rte == NULL) {
1360 			splx(s);
1361 			return ENOBUFS;
1362 		}
1363 		mb0 = m_copypacket(m, M_DONTWAIT);
1364 		M_PULLUP(mb0, hlen);
1365 		if (mb0 == NULL) {
1366 			free(rte, M_MRTABLE);
1367 			splx(s);
1368 			return ENOBUFS;
1369 		}
1370 
1371 		/* is there an upcall waiting for this flow? */
1372 		hash = MFCHASH(ip->ip_src, ip->ip_dst);
1373 		LIST_FOREACH(rt, &mfchashtbl[hash], mfc_hash) {
1374 			if (in_hosteq(ip->ip_src, rt->mfc_origin) &&
1375 			    in_hosteq(ip->ip_dst, rt->mfc_mcastgrp) &&
1376 			    rt->mfc_stall != NULL)
1377 				break;
1378 		}
1379 
1380 		if (rt == NULL) {
1381 			int i;
1382 			struct igmpmsg *im;
1383 
1384 			/*
1385 			 * Locate the vifi for the incoming interface for
1386 			 * this packet.
1387 			 * If none found, drop packet.
1388 			 */
1389 			for (vifi = 0; vifi < numvifs &&
1390 				 viftable[vifi].v_ifp != ifp; vifi++)
1391 				;
1392 			if (vifi >= numvifs) /* vif not found, drop packet */
1393 				goto non_fatal;
1394 
1395 			/* no upcall, so make a new entry */
1396 			rt = malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT);
1397 			if (rt == NULL)
1398 				goto fail;
1399 
1400 			/*
1401 			 * Make a copy of the header to send to the user level
1402 			 * process
1403 			 */
1404 			mm = m_copym(m, 0, hlen, M_DONTWAIT);
1405 			M_PULLUP(mm, hlen);
1406 			if (mm == NULL)
1407 				goto fail1;
1408 
1409 			/*
1410 			 * Send message to routing daemon to install
1411 			 * a route into the kernel table
1412 			 */
1413 
1414 			im = mtod(mm, struct igmpmsg *);
1415 			im->im_msgtype = IGMPMSG_NOCACHE;
1416 			im->im_mbz = 0;
1417 			im->im_vif = vifi;
1418 
1419 			mrtstat.mrts_upcalls++;
1420 
1421 			sockaddr_in_init(&sin, &ip->ip_src, 0);
1422 			if (socket_send(ip_mrouter, mm, &sin) < 0) {
1423 				log(LOG_WARNING,
1424 				    "ip_mforward: ip_mrouter socket queue full\n");
1425 				++mrtstat.mrts_upq_sockfull;
1426 			fail1:
1427 				free(rt, M_MRTABLE);
1428 			fail:
1429 				free(rte, M_MRTABLE);
1430 				m_freem(mb0);
1431 				splx(s);
1432 				return ENOBUFS;
1433 			}
1434 
1435 			/* insert new entry at head of hash chain */
1436 			rt->mfc_origin = ip->ip_src;
1437 			rt->mfc_mcastgrp = ip->ip_dst;
1438 			rt->mfc_pkt_cnt = 0;
1439 			rt->mfc_byte_cnt = 0;
1440 			rt->mfc_wrong_if = 0;
1441 			rt->mfc_expire = UPCALL_EXPIRE;
1442 			nexpire[hash]++;
1443 			for (i = 0; i < numvifs; i++) {
1444 				rt->mfc_ttls[i] = 0;
1445 				rt->mfc_flags[i] = 0;
1446 			}
1447 			rt->mfc_parent = -1;
1448 
1449 			/* clear the RP address */
1450 			rt->mfc_rp = zeroin_addr;
1451 
1452 			rt->mfc_bw_meter = NULL;
1453 
1454 			/* link into table */
1455 			LIST_INSERT_HEAD(&mfchashtbl[hash], rt, mfc_hash);
1456 			/* Add this entry to the end of the queue */
1457 			rt->mfc_stall = rte;
1458 		} else {
1459 			/* determine if q has overflowed */
1460 			struct rtdetq **p;
1461 			int npkts = 0;
1462 
1463 			/*
1464 			 * XXX ouch! we need to append to the list, but we
1465 			 * only have a pointer to the front, so we have to
1466 			 * scan the entire list every time.
1467 			 */
1468 			for (p = &rt->mfc_stall; *p != NULL; p = &(*p)->next)
1469 				if (++npkts > MAX_UPQ) {
1470 					mrtstat.mrts_upq_ovflw++;
1471 				non_fatal:
1472 					free(rte, M_MRTABLE);
1473 					m_freem(mb0);
1474 					splx(s);
1475 					return 0;
1476 				}
1477 
1478 			/* Add this entry to the end of the queue */
1479 			*p = rte;
1480 		}
1481 
1482 		rte->next = NULL;
1483 		rte->m = mb0;
1484 		rte->ifp = ifp;
1485 #ifdef UPCALL_TIMING
1486 		rte->t = tp;
1487 #endif
1488 
1489 		splx(s);
1490 
1491 		return 0;
1492 	}
1493 }
1494 
/*ARGSUSED*/
/*
 * Periodic callout: walk the MFC hash table and tear down entries whose
 * upcall to the routing daemon was never answered (mfc_expire counted
 * down to zero).  Re-arms itself every EXPIRE_TIMEOUT ticks.
 */
static void
expire_upcalls(void *v)
{
	int i;

	/* XXX NOMPSAFE still need softnet_lock */
	mutex_enter(softnet_lock);
	KERNEL_LOCK(1, NULL);

	for (i = 0; i < MFCTBLSIZ; i++) {
		struct mfc *rt, *nrt;

		/* nexpire[i] counts pending-upcall entries on this chain. */
		if (nexpire[i] == 0)
			continue;

		for (rt = LIST_FIRST(&mfchashtbl[i]); rt; rt = nrt) {
			/* Fetch the successor first: rt may be freed below. */
			nrt = LIST_NEXT(rt, mfc_hash);

			/* mfc_expire == 0 means resolved; else tick down. */
			if (rt->mfc_expire == 0 || --rt->mfc_expire > 0)
				continue;
			nexpire[i]--;

			/*
			 * free the bw_meter entries
			 */
			while (rt->mfc_bw_meter != NULL) {
				struct bw_meter *x = rt->mfc_bw_meter;

				rt->mfc_bw_meter = x->bm_mfc_next;
				kmem_intr_free(x, sizeof(*x));
			}

			++mrtstat.mrts_cache_cleanups;
			if (mrtdebug & DEBUG_EXPIRE)
				log(LOG_DEBUG,
				    "expire_upcalls: expiring (%x %x)\n",
				    ntohl(rt->mfc_origin.s_addr),
				    ntohl(rt->mfc_mcastgrp.s_addr));

			/* Drops queued packets and frees the entry itself. */
			expire_mfc(rt);
		}
	}

	callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT,
	    expire_upcalls, NULL);

	KERNEL_UNLOCK_ONE(NULL);
	mutex_exit(softnet_lock);
}
1545 
1546 /*
1547  * Macro to send packet on vif.
1548  */
1549 #define MC_SEND(ip, vifp, m) do {					\
1550 	if ((vifp)->v_flags & VIFF_TUNNEL)				\
1551 		encap_send((ip), (vifp), (m));				\
1552 	else								\
1553 		phyint_send((ip), (vifp), (m));				\
1554 } while (/*CONSTCOND*/ 0)
1555 
1556 /*
1557  * Packet forwarding routine once entry in the cache is made
1558  */
1559 static int
1560 ip_mdq(struct mbuf *m, struct ifnet *ifp, struct mfc *rt)
1561 {
1562 	struct ip *ip = mtod(m, struct ip *);
1563 	vifi_t vifi;
1564 	struct vif *vifp;
1565 	struct sockaddr_in sin;
1566 	const int plen = ntohs(ip->ip_len) - (ip->ip_hl << 2);
1567 
1568 	/*
1569 	 * Don't forward if it didn't arrive from the parent vif for its origin.
1570 	 */
1571 	vifi = rt->mfc_parent;
1572 	if ((vifi >= numvifs) || (viftable[vifi].v_ifp != ifp)) {
1573 		/* came in the wrong interface */
1574 		if (mrtdebug & DEBUG_FORWARD)
1575 			log(LOG_DEBUG, "wrong if: ifp %p vifi %d vififp %p\n",
1576 			    ifp, vifi,
1577 			    vifi >= numvifs ? 0 : viftable[vifi].v_ifp);
1578 		++mrtstat.mrts_wrong_if;
1579 		++rt->mfc_wrong_if;
1580 
1581 		/*
1582 		 * If we are doing PIM assert processing, send a message
1583 		 * to the routing daemon.
1584 		 *
1585 		 * XXX: A PIM-SM router needs the WRONGVIF detection so it
1586 		 * can complete the SPT switch, regardless of the type
1587 		 * of the iif (broadcast media, GRE tunnel, etc).
1588 		 */
1589 		if (pim_assert && (vifi < numvifs) && viftable[vifi].v_ifp) {
1590 			struct timeval now;
1591 			u_int32_t delta;
1592 
1593 #ifdef PIM
1594 			if (ifp == &multicast_register_if)
1595 				pimstat.pims_rcv_registers_wrongiif++;
1596 #endif
1597 
1598 			/* Get vifi for the incoming packet */
1599 			for (vifi = 0;
1600 			     vifi < numvifs && viftable[vifi].v_ifp != ifp;
1601 			     vifi++)
1602 			    ;
1603 			if (vifi >= numvifs) {
1604 				/* The iif is not found: ignore the packet. */
1605 				return 0;
1606 			}
1607 
1608 			if (rt->mfc_flags[vifi] &
1609 			    MRT_MFC_FLAGS_DISABLE_WRONGVIF) {
1610 				/* WRONGVIF disabled: ignore the packet */
1611 				return 0;
1612 			}
1613 
1614 			microtime(&now);
1615 
1616 			TV_DELTA(rt->mfc_last_assert, now, delta);
1617 
1618 			if (delta > ASSERT_MSG_TIME) {
1619 				struct igmpmsg *im;
1620 				const int hlen = ip->ip_hl << 2;
1621 				struct mbuf *mm =
1622 				    m_copym(m, 0, hlen, M_DONTWAIT);
1623 
1624 				M_PULLUP(mm, hlen);
1625 				if (mm == NULL)
1626 					return ENOBUFS;
1627 
1628 				rt->mfc_last_assert = now;
1629 
1630 				im = mtod(mm, struct igmpmsg *);
1631 				im->im_msgtype	= IGMPMSG_WRONGVIF;
1632 				im->im_mbz	= 0;
1633 				im->im_vif	= vifi;
1634 
1635 				mrtstat.mrts_upcalls++;
1636 
1637 				sockaddr_in_init(&sin, &im->im_src, 0);
1638 				if (socket_send(ip_mrouter, mm, &sin) < 0) {
1639 					log(LOG_WARNING,
1640 					    "ip_mforward: ip_mrouter socket queue full\n");
1641 					++mrtstat.mrts_upq_sockfull;
1642 					return ENOBUFS;
1643 				}
1644 			}
1645 		}
1646 		return 0;
1647 	}
1648 
1649 	/* If I sourced this packet, it counts as output, else it was input. */
1650 	if (in_hosteq(ip->ip_src, viftable[vifi].v_lcl_addr)) {
1651 		viftable[vifi].v_pkt_out++;
1652 		viftable[vifi].v_bytes_out += plen;
1653 	} else {
1654 		viftable[vifi].v_pkt_in++;
1655 		viftable[vifi].v_bytes_in += plen;
1656 	}
1657 	rt->mfc_pkt_cnt++;
1658 	rt->mfc_byte_cnt += plen;
1659 
1660 	/*
1661 	 * For each vif, decide if a copy of the packet should be forwarded.
1662 	 * Forward if:
1663 	 *  - the ttl exceeds the vif's threshold
1664 	 *  - there are group members downstream on interface
1665 	 */
1666 	for (vifp = viftable, vifi = 0; vifi < numvifs; vifp++, vifi++) {
1667 		if ((rt->mfc_ttls[vifi] > 0) &&
1668 			(ip->ip_ttl > rt->mfc_ttls[vifi])) {
1669 			vifp->v_pkt_out++;
1670 			vifp->v_bytes_out += plen;
1671 #ifdef PIM
1672 			if (vifp->v_flags & VIFF_REGISTER)
1673 				pim_register_send(ip, vifp, m, rt);
1674 			else
1675 #endif
1676 			MC_SEND(ip, vifp, m);
1677 		}
1678 	}
1679 
1680 	/*
1681 	 * Perform upcall-related bw measuring.
1682 	 */
1683 	if (rt->mfc_bw_meter != NULL) {
1684 		struct bw_meter *x;
1685 		struct timeval now;
1686 
1687 		microtime(&now);
1688 		for (x = rt->mfc_bw_meter; x != NULL; x = x->bm_mfc_next)
1689 			bw_meter_receive_packet(x, plen, &now);
1690 	}
1691 
1692 	return 0;
1693 }
1694 
1695 static void
1696 phyint_send(struct ip *ip, struct vif *vifp, struct mbuf *m)
1697 {
1698 	struct mbuf *mb_copy;
1699 	const int hlen = ip->ip_hl << 2;
1700 
1701 	/*
1702 	 * Make a new reference to the packet; make sure that
1703 	 * the IP header is actually copied, not just referenced,
1704 	 * so that ip_output() only scribbles on the copy.
1705 	 */
1706 	mb_copy = m_copypacket(m, M_DONTWAIT);
1707 	M_PULLUP(mb_copy, hlen);
1708 	if (mb_copy == NULL)
1709 		return;
1710 
1711 	if (vifp->v_rate_limit <= 0)
1712 		tbf_send_packet(vifp, mb_copy);
1713 	else
1714 		tbf_control(vifp, mb_copy, mtod(mb_copy, struct ip *),
1715 		    ntohs(ip->ip_len));
1716 }
1717 
/*
 * Send a packet over a tunnel vif: prepend a multicast encapsulation
 * IP header addressed from the vif's local address to its remote
 * endpoint, then hand the result to the token bucket filter.
 */
static void
encap_send(struct ip *ip, struct vif *vifp, struct mbuf *m)
{
	struct mbuf *mb_copy;
	struct ip *ip_copy;
	/* len is the total encapsulated length: outer header + inner packet. */
	int i, len = ntohs(ip->ip_len) + sizeof(multicast_encap_iphdr);

	/* Take care of delayed checksums */
	if (m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) {
		in_undefer_cksum_tcpudp(m);
		m->m_pkthdr.csum_flags &= ~(M_CSUM_TCPv4|M_CSUM_UDPv4);
	}

	/*
	 * copy the old packet & pullup its IP header into the
	 * new mbuf so we can modify it.  Try to fill the new
	 * mbuf since if we don't the ethernet driver will.
	 */
	MGETHDR(mb_copy, M_DONTWAIT, MT_DATA);
	if (mb_copy == NULL)
		return;
	mb_copy->m_data += max_linkhdr;
	mb_copy->m_pkthdr.len = len;
	mb_copy->m_len = sizeof(multicast_encap_iphdr);

	if ((mb_copy->m_next = m_copypacket(m, M_DONTWAIT)) == NULL) {
		m_freem(mb_copy);
		return;
	}
	/* Pull up as much as fits in the first mbuf, capped at len. */
	i = MHLEN - max_linkhdr;
	if (i > len)
		i = len;
	mb_copy = m_pullup(mb_copy, i);
	if (mb_copy == NULL)
		return;

	/*
	 * fill in the encapsulating IP header.
	 */
	ip_copy = mtod(mb_copy, struct ip *);
	*ip_copy = multicast_encap_iphdr;
	if (len < IP_MINFRAGSIZE)
		ip_copy->ip_id = 0;
	else
		ip_copy->ip_id = ip_newid(NULL);
	ip_copy->ip_len = htons(len);
	ip_copy->ip_src = vifp->v_lcl_addr;
	ip_copy->ip_dst = vifp->v_rmt_addr;

	/*
	 * turn the encapsulated IP header back into a valid one.
	 * m_data is temporarily advanced past the outer header so that
	 * in_cksum() sums only the inner IP header.
	 */
	ip = (struct ip *)((char *)ip_copy + sizeof(multicast_encap_iphdr));
	--ip->ip_ttl;
	ip->ip_sum = 0;
	mb_copy->m_data += sizeof(multicast_encap_iphdr);
	ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2);
	mb_copy->m_data -= sizeof(multicast_encap_iphdr);

	if (vifp->v_rate_limit <= 0)
		tbf_send_packet(vifp, mb_copy);
	else
		tbf_control(vifp, mb_copy, ip, ntohs(ip_copy->ip_len));
}
1782 
1783 /*
1784  * De-encapsulate a packet and feed it back through ip input.
1785  */
1786 static void
1787 vif_input(struct mbuf *m, int off, int proto, void *eparg)
1788 {
1789 	struct vif *vifp = eparg;
1790 
1791 	KASSERT(vifp != NULL);
1792 
1793 	if (proto != ENCAP_PROTO) {
1794 		m_freem(m);
1795 		mrtstat.mrts_bad_tunnel++;
1796 		return;
1797 	}
1798 
1799 	m_adj(m, off);
1800 	m_set_rcvif(m, vifp->v_ifp);
1801 
1802 	if (__predict_false(!pktq_enqueue(ip_pktq, m, 0))) {
1803 		m_freem(m);
1804 	}
1805 }
1806 
1807 /*
1808  * Check if the packet should be received on the vif denoted by arg.
1809  * (The encap selection code will call this once per vif since each is
1810  * registered separately.)
1811  */
1812 static int
1813 vif_encapcheck(struct mbuf *m, int off, int proto, void *arg)
1814 {
1815 	struct vif *vifp;
1816 	struct ip ip;
1817 
1818 #ifdef DIAGNOSTIC
1819 	if (!arg || proto != IPPROTO_IPV4)
1820 		panic("unexpected arg in vif_encapcheck");
1821 #endif
1822 
1823 	/*
1824 	 * Accept the packet only if the inner heaader is multicast
1825 	 * and the outer header matches a tunnel-mode vif.  Order
1826 	 * checks in the hope that common non-matching packets will be
1827 	 * rejected quickly.  Assume that unicast IPv4 traffic in a
1828 	 * parallel tunnel (e.g. gif(4)) is unlikely.
1829 	 */
1830 
1831 	/* Obtain the outer IP header and the vif pointer. */
1832 	m_copydata(m, 0, sizeof(ip), (void *)&ip);
1833 	vifp = (struct vif *)arg;
1834 
1835 	/*
1836 	 * The outer source must match the vif's remote peer address.
1837 	 * For a multicast router with several tunnels, this is the
1838 	 * only check that will fail on packets in other tunnels,
1839 	 * assuming the local address is the same.
1840 	 */
1841 	if (!in_hosteq(vifp->v_rmt_addr, ip.ip_src))
1842 		return 0;
1843 
1844 	/* The outer destination must match the vif's local address. */
1845 	if (!in_hosteq(vifp->v_lcl_addr, ip.ip_dst))
1846 		return 0;
1847 
1848 	/* The vif must be of tunnel type. */
1849 	if ((vifp->v_flags & VIFF_TUNNEL) == 0)
1850 		return 0;
1851 
1852 	/* Check that the inner destination is multicast. */
1853 	if (off + sizeof(ip) > m->m_pkthdr.len)
1854 		return 0;
1855 	m_copydata(m, off, sizeof(ip), (void *)&ip);
1856 	if (!IN_MULTICAST(ip.ip_dst.s_addr))
1857 		return 0;
1858 
1859 	/*
1860 	 * We have checked that both the outer src and dst addresses
1861 	 * match the vif, and that the inner destination is multicast
1862 	 * (224/5).  By claiming more than 64, we intend to
1863 	 * preferentially take packets that also match a parallel
1864 	 * gif(4).
1865 	 */
1866 	return 32 + 32 + 5;
1867 }
1868 
1869 /*
1870  * Token bucket filter module
1871  */
1872 static void
1873 tbf_control(struct vif *vifp, struct mbuf *m, struct ip *ip, u_int32_t len)
1874 {
1875 
1876 	if (len > MAX_BKT_SIZE) {
1877 		/* drop if packet is too large */
1878 		mrtstat.mrts_pkt2large++;
1879 		m_freem(m);
1880 		return;
1881 	}
1882 
1883 	tbf_update_tokens(vifp);
1884 
1885 	/*
1886 	 * If there are enough tokens, and the queue is empty, send this packet
1887 	 * out immediately.  Otherwise, try to insert it on this vif's queue.
1888 	 */
1889 	if (vifp->tbf_q_len == 0) {
1890 		if (len <= vifp->tbf_n_tok) {
1891 			vifp->tbf_n_tok -= len;
1892 			tbf_send_packet(vifp, m);
1893 		} else {
1894 			/* queue packet and timeout till later */
1895 			tbf_queue(vifp, m);
1896 			callout_reset(&vifp->v_repq_ch, TBF_REPROCESS,
1897 			    tbf_reprocess_q, vifp);
1898 		}
1899 	} else {
1900 		if (vifp->tbf_q_len >= vifp->tbf_max_q_len &&
1901 		    !tbf_dq_sel(vifp, ip)) {
1902 			/* queue full, and couldn't make room */
1903 			mrtstat.mrts_q_overflow++;
1904 			m_freem(m);
1905 		} else {
1906 			/* queue length low enough, or made room */
1907 			tbf_queue(vifp, m);
1908 			tbf_process_q(vifp);
1909 		}
1910 	}
1911 }
1912 
1913 /*
1914  * adds a packet to the queue at the interface
1915  */
1916 static void
1917 tbf_queue(struct vif *vifp, struct mbuf *m)
1918 {
1919 	int s = splsoftnet();
1920 
1921 	/* insert at tail */
1922 	*vifp->tbf_t = m;
1923 	vifp->tbf_t = &m->m_nextpkt;
1924 	vifp->tbf_q_len++;
1925 
1926 	splx(s);
1927 }
1928 
1929 /*
1930  * processes the queue at the interface
1931  */
1932 static void
1933 tbf_process_q(struct vif *vifp)
1934 {
1935 	struct mbuf *m;
1936 	int len;
1937 	int s = splsoftnet();
1938 
1939 	/*
1940 	 * Loop through the queue at the interface and send as many packets
1941 	 * as possible.
1942 	 */
1943 	for (m = vifp->tbf_q; m != NULL; m = vifp->tbf_q) {
1944 		len = ntohs(mtod(m, struct ip *)->ip_len);
1945 
1946 		/* determine if the packet can be sent */
1947 		if (len <= vifp->tbf_n_tok) {
1948 			/* if so,
1949 			 * reduce no of tokens, dequeue the packet,
1950 			 * send the packet.
1951 			 */
1952 			if ((vifp->tbf_q = m->m_nextpkt) == NULL)
1953 				vifp->tbf_t = &vifp->tbf_q;
1954 			--vifp->tbf_q_len;
1955 
1956 			m->m_nextpkt = NULL;
1957 			vifp->tbf_n_tok -= len;
1958 			tbf_send_packet(vifp, m);
1959 		} else
1960 			break;
1961 	}
1962 	splx(s);
1963 }
1964 
1965 static void
1966 tbf_reprocess_q(void *arg)
1967 {
1968 	struct vif *vifp = arg;
1969 
1970 	if (ip_mrouter == NULL)
1971 		return;
1972 
1973 	tbf_update_tokens(vifp);
1974 	tbf_process_q(vifp);
1975 
1976 	if (vifp->tbf_q_len != 0)
1977 		callout_reset(&vifp->v_repq_ch, TBF_REPROCESS,
1978 		    tbf_reprocess_q, vifp);
1979 }
1980 
1981 /* function that will selectively discard a member of the queue
1982  * based on the precedence value and the priority
1983  */
1984 static int
1985 tbf_dq_sel(struct vif *vifp, struct ip *ip)
1986 {
1987 	u_int p;
1988 	struct mbuf **mp, *m;
1989 	int s = splsoftnet();
1990 
1991 	p = priority(vifp, ip);
1992 
1993 	for (mp = &vifp->tbf_q, m = *mp;
1994 	    m != NULL;
1995 	    mp = &m->m_nextpkt, m = *mp) {
1996 		if (p > priority(vifp, mtod(m, struct ip *))) {
1997 			if ((*mp = m->m_nextpkt) == NULL)
1998 				vifp->tbf_t = mp;
1999 			--vifp->tbf_q_len;
2000 
2001 			m_freem(m);
2002 			mrtstat.mrts_drop_sel++;
2003 			splx(s);
2004 			return 1;
2005 		}
2006 	}
2007 	splx(s);
2008 	return 0;
2009 }
2010 
2011 static void
2012 tbf_send_packet(struct vif *vifp, struct mbuf *m)
2013 {
2014 	int error;
2015 	int s = splsoftnet();
2016 
2017 	if (vifp->v_flags & VIFF_TUNNEL) {
2018 		/* If tunnel options */
2019 		ip_output(m, NULL, &vifp->v_route, IP_FORWARDING, NULL, NULL);
2020 	} else {
2021 		/* if physical interface option, extract the options and then send */
2022 		struct ip_moptions imo;
2023 
2024 		imo.imo_multicast_if_index = if_get_index(vifp->v_ifp);
2025 		imo.imo_multicast_ttl = mtod(m, struct ip *)->ip_ttl - 1;
2026 		imo.imo_multicast_loop = 1;
2027 
2028 		error = ip_output(m, NULL, NULL, IP_FORWARDING|IP_MULTICASTOPTS,
2029 		    &imo, NULL);
2030 
2031 		if (mrtdebug & DEBUG_XMIT)
2032 			log(LOG_DEBUG, "phyint_send on vif %ld err %d\n",
2033 			    (long)(vifp - viftable), error);
2034 	}
2035 	splx(s);
2036 }
2037 
2038 /* determine the current time and then
2039  * the elapsed time (between the last time and time now)
2040  * in milliseconds & update the no. of tokens in the bucket
2041  */
2042 static void
2043 tbf_update_tokens(struct vif *vifp)
2044 {
2045 	struct timeval tp;
2046 	u_int32_t tm;
2047 	int s = splsoftnet();
2048 
2049 	microtime(&tp);
2050 
2051 	TV_DELTA(tp, vifp->tbf_last_pkt_t, tm);
2052 
2053 	/*
2054 	 * This formula is actually
2055 	 * "time in seconds" * "bytes/second".
2056 	 *
2057 	 * (tm / 1000000) * (v_rate_limit * 1000 * (1000/1024) / 8)
2058 	 *
2059 	 * The (1000/1024) was introduced in add_vif to optimize
2060 	 * this divide into a shift.
2061 	 */
2062 	vifp->tbf_n_tok += tm * vifp->v_rate_limit / 8192;
2063 	vifp->tbf_last_pkt_t = tp;
2064 
2065 	if (vifp->tbf_n_tok > MAX_BKT_SIZE)
2066 		vifp->tbf_n_tok = MAX_BKT_SIZE;
2067 
2068 	splx(s);
2069 }
2070 
2071 static int
2072 priority(struct vif *vifp, struct ip *ip)
2073 {
2074 	int prio = 50;	/* the lowest priority -- default case */
2075 
2076 	/* temporary hack; may add general packet classifier some day */
2077 
2078 	/*
2079 	 * XXX XXX: We're reading the UDP header, but we didn't ensure
2080 	 * it was present in the packet.
2081 	 */
2082 
2083 	/*
2084 	 * The UDP port space is divided up into four priority ranges:
2085 	 * [0, 16384)     : unclassified - lowest priority
2086 	 * [16384, 32768) : audio - highest priority
2087 	 * [32768, 49152) : whiteboard - medium priority
2088 	 * [49152, 65536) : video - low priority
2089 	 */
2090 	if (ip->ip_p == IPPROTO_UDP) {
2091 		struct udphdr *udp = (struct udphdr *)(((char *)ip) + (ip->ip_hl << 2));
2092 
2093 		switch (ntohs(udp->uh_dport) & 0xc000) {
2094 		case 0x4000:
2095 			prio = 70;
2096 			break;
2097 		case 0x8000:
2098 			prio = 60;
2099 			break;
2100 		case 0xc000:
2101 			prio = 55;
2102 			break;
2103 		}
2104 
2105 		if (tbfdebug > 1)
2106 			log(LOG_DEBUG, "port %x prio %d\n",
2107 			    ntohs(udp->uh_dport), prio);
2108 	}
2109 
2110 	return prio;
2111 }
2112 
2113 /*
2114  * Code for bandwidth monitors
2115  */
2116 
2117 /*
2118  * Define common interface for timeval-related methods
2119  */
2120 #define	BW_TIMEVALCMP(tvp, uvp, cmp) timercmp((tvp), (uvp), cmp)
2121 #define	BW_TIMEVALDECR(vvp, uvp) timersub((vvp), (uvp), (vvp))
2122 #define	BW_TIMEVALADD(vvp, uvp) timeradd((vvp), (uvp), (vvp))
2123 
2124 static uint32_t
2125 compute_bw_meter_flags(struct bw_upcall *req)
2126 {
2127 	uint32_t flags = 0;
2128 
2129 	if (req->bu_flags & BW_UPCALL_UNIT_PACKETS)
2130 		flags |= BW_METER_UNIT_PACKETS;
2131 	if (req->bu_flags & BW_UPCALL_UNIT_BYTES)
2132 		flags |= BW_METER_UNIT_BYTES;
2133 	if (req->bu_flags & BW_UPCALL_GEQ)
2134 		flags |= BW_METER_GEQ;
2135 	if (req->bu_flags & BW_UPCALL_LEQ)
2136 		flags |= BW_METER_LEQ;
2137 
2138 	return flags;
2139 }
2140 
2141 /*
2142  * Add a bw_meter entry
2143  */
2144 static int
2145 add_bw_upcall(struct bw_upcall *req)
2146 {
2147 	int s;
2148 	struct mfc *mfc;
2149 	struct timeval delta = { BW_UPCALL_THRESHOLD_INTERVAL_MIN_SEC,
2150 		BW_UPCALL_THRESHOLD_INTERVAL_MIN_USEC };
2151 	struct timeval now;
2152 	struct bw_meter *x;
2153 	uint32_t flags;
2154 
2155 	if (!(mrt_api_config & MRT_MFC_BW_UPCALL))
2156 		return EOPNOTSUPP;
2157 
2158 	/* Test if the flags are valid */
2159 	if (!(req->bu_flags & (BW_UPCALL_UNIT_PACKETS | BW_UPCALL_UNIT_BYTES)))
2160 		return EINVAL;
2161 	if (!(req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ)))
2162 		return EINVAL;
2163 	if ((req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ))
2164 	    == (BW_UPCALL_GEQ | BW_UPCALL_LEQ))
2165 		return EINVAL;
2166 
2167 	/* Test if the threshold time interval is valid */
2168 	if (BW_TIMEVALCMP(&req->bu_threshold.b_time, &delta, <))
2169 		return EINVAL;
2170 
2171 	flags = compute_bw_meter_flags(req);
2172 
2173 	/*
2174 	 * Find if we have already same bw_meter entry
2175 	 */
2176 	s = splsoftnet();
2177 	mfc = mfc_find(&req->bu_src, &req->bu_dst);
2178 	if (mfc == NULL) {
2179 		splx(s);
2180 		return EADDRNOTAVAIL;
2181 	}
2182 	for (x = mfc->mfc_bw_meter; x != NULL; x = x->bm_mfc_next) {
2183 		if ((BW_TIMEVALCMP(&x->bm_threshold.b_time,
2184 		    &req->bu_threshold.b_time, ==)) &&
2185 		    (x->bm_threshold.b_packets == req->bu_threshold.b_packets) &&
2186 		    (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) &&
2187 		    (x->bm_flags & BW_METER_USER_FLAGS) == flags)  {
2188 			splx(s);
2189 			return 0;		/* XXX Already installed */
2190 		}
2191 	}
2192 
2193 	/* Allocate the new bw_meter entry */
2194 	x = kmem_intr_alloc(sizeof(*x), KM_NOSLEEP);
2195 	if (x == NULL) {
2196 		splx(s);
2197 		return ENOBUFS;
2198 	}
2199 
2200 	/* Set the new bw_meter entry */
2201 	x->bm_threshold.b_time = req->bu_threshold.b_time;
2202 	microtime(&now);
2203 	x->bm_start_time = now;
2204 	x->bm_threshold.b_packets = req->bu_threshold.b_packets;
2205 	x->bm_threshold.b_bytes = req->bu_threshold.b_bytes;
2206 	x->bm_measured.b_packets = 0;
2207 	x->bm_measured.b_bytes = 0;
2208 	x->bm_flags = flags;
2209 	x->bm_time_next = NULL;
2210 	x->bm_time_hash = BW_METER_BUCKETS;
2211 
2212 	/* Add the new bw_meter entry to the front of entries for this MFC */
2213 	x->bm_mfc = mfc;
2214 	x->bm_mfc_next = mfc->mfc_bw_meter;
2215 	mfc->mfc_bw_meter = x;
2216 	schedule_bw_meter(x, &now);
2217 	splx(s);
2218 
2219 	return 0;
2220 }
2221 
2222 static void
2223 free_bw_list(struct bw_meter *list)
2224 {
2225 	while (list != NULL) {
2226 		struct bw_meter *x = list;
2227 
2228 		list = list->bm_mfc_next;
2229 		unschedule_bw_meter(x);
2230 		kmem_intr_free(x, sizeof(*x));
2231 	}
2232 }
2233 
2234 /*
2235  * Delete one or multiple bw_meter entries
2236  */
2237 static int
2238 del_bw_upcall(struct bw_upcall *req)
2239 {
2240 	int s;
2241 	struct mfc *mfc;
2242 	struct bw_meter *x;
2243 
2244 	if (!(mrt_api_config & MRT_MFC_BW_UPCALL))
2245 		return EOPNOTSUPP;
2246 
2247 	s = splsoftnet();
2248 	/* Find the corresponding MFC entry */
2249 	mfc = mfc_find(&req->bu_src, &req->bu_dst);
2250 	if (mfc == NULL) {
2251 		splx(s);
2252 		return EADDRNOTAVAIL;
2253 	} else if (req->bu_flags & BW_UPCALL_DELETE_ALL) {
2254 		/*
2255 		 * Delete all bw_meter entries for this mfc
2256 		 */
2257 		struct bw_meter *list;
2258 
2259 		list = mfc->mfc_bw_meter;
2260 		mfc->mfc_bw_meter = NULL;
2261 		free_bw_list(list);
2262 		splx(s);
2263 		return 0;
2264 	} else {			/* Delete a single bw_meter entry */
2265 		struct bw_meter *prev;
2266 		uint32_t flags = 0;
2267 
2268 		flags = compute_bw_meter_flags(req);
2269 
2270 		/* Find the bw_meter entry to delete */
2271 		for (prev = NULL, x = mfc->mfc_bw_meter; x != NULL;
2272 		     prev = x, x = x->bm_mfc_next) {
2273 			if ((BW_TIMEVALCMP(&x->bm_threshold.b_time,
2274 			    &req->bu_threshold.b_time, ==)) &&
2275 			    (x->bm_threshold.b_packets == req->bu_threshold.b_packets) &&
2276 			    (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) &&
2277 			    (x->bm_flags & BW_METER_USER_FLAGS) == flags)
2278 				break;
2279 		}
2280 		if (x != NULL) { /* Delete entry from the list for this MFC */
2281 			if (prev != NULL)
2282 				prev->bm_mfc_next = x->bm_mfc_next;	/* remove from middle*/
2283 			else
2284 				x->bm_mfc->mfc_bw_meter = x->bm_mfc_next;/* new head of list */
2285 
2286 			unschedule_bw_meter(x);
2287 			splx(s);
2288 			/* Free the bw_meter entry */
2289 			kmem_intr_free(x, sizeof(*x));
2290 			return 0;
2291 		} else {
2292 			splx(s);
2293 			return EINVAL;
2294 		}
2295 	}
2296 	/* NOTREACHED */
2297 }
2298 
2299 /*
2300  * Perform bandwidth measurement processing that may result in an upcall
2301  */
2302 static void
2303 bw_meter_receive_packet(struct bw_meter *x, int plen, struct timeval *nowp)
2304 {
2305 	struct timeval delta;
2306 
2307 	delta = *nowp;
2308 	BW_TIMEVALDECR(&delta, &x->bm_start_time);
2309 
2310 	if (x->bm_flags & BW_METER_GEQ) {
2311 		/*
2312 		 * Processing for ">=" type of bw_meter entry
2313 		 */
2314 		if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) {
2315 			/* Reset the bw_meter entry */
2316 			x->bm_start_time = *nowp;
2317 			x->bm_measured.b_packets = 0;
2318 			x->bm_measured.b_bytes = 0;
2319 			x->bm_flags &= ~BW_METER_UPCALL_DELIVERED;
2320 		}
2321 
2322 		/* Record that a packet is received */
2323 		x->bm_measured.b_packets++;
2324 		x->bm_measured.b_bytes += plen;
2325 
2326 		/*
2327 		 * Test if we should deliver an upcall
2328 		 */
2329 		if (!(x->bm_flags & BW_METER_UPCALL_DELIVERED)) {
2330 			if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
2331 				 (x->bm_measured.b_packets >= x->bm_threshold.b_packets)) ||
2332 				((x->bm_flags & BW_METER_UNIT_BYTES) &&
2333 				 (x->bm_measured.b_bytes >= x->bm_threshold.b_bytes))) {
2334 				/* Prepare an upcall for delivery */
2335 				bw_meter_prepare_upcall(x, nowp);
2336 				x->bm_flags |= BW_METER_UPCALL_DELIVERED;
2337 			}
2338 		}
2339 	} else if (x->bm_flags & BW_METER_LEQ) {
2340 		/*
2341 		 * Processing for "<=" type of bw_meter entry
2342 		 */
2343 		if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) {
2344 			/*
2345 			 * We are behind time with the multicast forwarding table
2346 			 * scanning for "<=" type of bw_meter entries, so test now
2347 			 * if we should deliver an upcall.
2348 			 */
2349 			if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
2350 				 (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) ||
2351 				((x->bm_flags & BW_METER_UNIT_BYTES) &&
2352 				 (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) {
2353 				/* Prepare an upcall for delivery */
2354 				bw_meter_prepare_upcall(x, nowp);
2355 			}
2356 			/* Reschedule the bw_meter entry */
2357 			unschedule_bw_meter(x);
2358 			schedule_bw_meter(x, nowp);
2359 		}
2360 
2361 		/* Record that a packet is received */
2362 		x->bm_measured.b_packets++;
2363 		x->bm_measured.b_bytes += plen;
2364 
2365 		/*
2366 		 * Test if we should restart the measuring interval
2367 		 */
2368 		if ((x->bm_flags & BW_METER_UNIT_PACKETS &&
2369 		     x->bm_measured.b_packets <= x->bm_threshold.b_packets) ||
2370 		    (x->bm_flags & BW_METER_UNIT_BYTES &&
2371 		     x->bm_measured.b_bytes <= x->bm_threshold.b_bytes)) {
2372 			/* Don't restart the measuring interval */
2373 		} else {
2374 			/* Do restart the measuring interval */
2375 			/*
2376 			 * XXX: note that we don't unschedule and schedule, because this
2377 			 * might be too much overhead per packet. Instead, when we process
2378 			 * all entries for a given timer hash bin, we check whether it is
2379 			 * really a timeout. If not, we reschedule at that time.
2380 			 */
2381 			x->bm_start_time = *nowp;
2382 			x->bm_measured.b_packets = 0;
2383 			x->bm_measured.b_bytes = 0;
2384 			x->bm_flags &= ~BW_METER_UPCALL_DELIVERED;
2385 		}
2386 	}
2387 }
2388 
2389 /*
2390  * Prepare a bandwidth-related upcall
2391  */
2392 static void
2393 bw_meter_prepare_upcall(struct bw_meter *x, struct timeval *nowp)
2394 {
2395 	struct timeval delta;
2396 	struct bw_upcall *u;
2397 
2398 	/*
2399 	 * Compute the measured time interval
2400 	 */
2401 	delta = *nowp;
2402 	BW_TIMEVALDECR(&delta, &x->bm_start_time);
2403 
2404 	/*
2405 	 * If there are too many pending upcalls, deliver them now
2406 	 */
2407 	if (bw_upcalls_n >= BW_UPCALLS_MAX)
2408 		bw_upcalls_send();
2409 
2410 	/*
2411 	 * Set the bw_upcall entry
2412 	 */
2413 	u = &bw_upcalls[bw_upcalls_n++];
2414 	u->bu_src = x->bm_mfc->mfc_origin;
2415 	u->bu_dst = x->bm_mfc->mfc_mcastgrp;
2416 	u->bu_threshold.b_time = x->bm_threshold.b_time;
2417 	u->bu_threshold.b_packets = x->bm_threshold.b_packets;
2418 	u->bu_threshold.b_bytes = x->bm_threshold.b_bytes;
2419 	u->bu_measured.b_time = delta;
2420 	u->bu_measured.b_packets = x->bm_measured.b_packets;
2421 	u->bu_measured.b_bytes = x->bm_measured.b_bytes;
2422 	u->bu_flags = 0;
2423 	if (x->bm_flags & BW_METER_UNIT_PACKETS)
2424 		u->bu_flags |= BW_UPCALL_UNIT_PACKETS;
2425 	if (x->bm_flags & BW_METER_UNIT_BYTES)
2426 		u->bu_flags |= BW_UPCALL_UNIT_BYTES;
2427 	if (x->bm_flags & BW_METER_GEQ)
2428 		u->bu_flags |= BW_UPCALL_GEQ;
2429 	if (x->bm_flags & BW_METER_LEQ)
2430 		u->bu_flags |= BW_UPCALL_LEQ;
2431 }
2432 
2433 /*
2434  * Send the pending bandwidth-related upcalls
2435  */
2436 static void
2437 bw_upcalls_send(void)
2438 {
2439 	struct mbuf *m;
2440 	int len = bw_upcalls_n * sizeof(bw_upcalls[0]);
2441 	struct sockaddr_in k_igmpsrc = {
2442 		.sin_len = sizeof(k_igmpsrc),
2443 		.sin_family = AF_INET,
2444 	};
2445 	static struct igmpmsg igmpmsg = {
2446 		0,		/* unused1 */
2447 		0,		/* unused2 */
2448 		IGMPMSG_BW_UPCALL,/* im_msgtype */
2449 		0,		/* im_mbz */
2450 		0,		/* im_vif */
2451 		0,		/* unused3 */
2452 		{ 0 },		/* im_src */
2453 		{ 0 }		/* im_dst */
2454 	};
2455 
2456 	if (bw_upcalls_n == 0)
2457 		return;			/* No pending upcalls */
2458 
2459 	bw_upcalls_n = 0;
2460 
2461 	/*
2462 	 * Allocate a new mbuf, initialize it with the header and
2463 	 * the payload for the pending calls.
2464 	 */
2465 	MGETHDR(m, M_DONTWAIT, MT_HEADER);
2466 	if (m == NULL) {
2467 		log(LOG_WARNING, "bw_upcalls_send: cannot allocate mbuf\n");
2468 		return;
2469 	}
2470 
2471 	m->m_len = m->m_pkthdr.len = 0;
2472 	m_copyback(m, 0, sizeof(struct igmpmsg), (void *)&igmpmsg);
2473 	m_copyback(m, sizeof(struct igmpmsg), len, (void *)&bw_upcalls[0]);
2474 
2475 	/*
2476 	 * Send the upcalls
2477 	 * XXX do we need to set the address in k_igmpsrc ?
2478 	 */
2479 	mrtstat.mrts_upcalls++;
2480 	if (socket_send(ip_mrouter, m, &k_igmpsrc) < 0) {
2481 		log(LOG_WARNING, "bw_upcalls_send: ip_mrouter socket queue full\n");
2482 		++mrtstat.mrts_upq_sockfull;
2483 	}
2484 }
2485 
2486 /*
2487  * Compute the timeout hash value for the bw_meter entries
2488  */
2489 #define	BW_METER_TIMEHASH(bw_meter, hash)				\
2490     do {								\
2491 	struct timeval next_timeval = (bw_meter)->bm_start_time;	\
2492 	BW_TIMEVALADD(&next_timeval, &(bw_meter)->bm_threshold.b_time);	\
2493 	(hash) = next_timeval.tv_sec;					\
2494 	if (next_timeval.tv_usec)					\
2495 		(hash)++; /* XXX: make sure we don't timeout early */	\
2496 	(hash) %= BW_METER_BUCKETS;					\
2497     } while (/*CONSTCOND*/ 0)
2498 
2499 /*
2500  * Schedule a timer to process periodically bw_meter entry of type "<="
2501  * by linking the entry in the proper hash bucket.
2502  */
2503 static void
2504 schedule_bw_meter(struct bw_meter *x, struct timeval *nowp)
2505 {
2506 	int time_hash;
2507 
2508 	if (!(x->bm_flags & BW_METER_LEQ))
2509 		return;		/* XXX: we schedule timers only for "<=" entries */
2510 
2511 	/*
2512 	 * Reset the bw_meter entry
2513 	 */
2514 	x->bm_start_time = *nowp;
2515 	x->bm_measured.b_packets = 0;
2516 	x->bm_measured.b_bytes = 0;
2517 	x->bm_flags &= ~BW_METER_UPCALL_DELIVERED;
2518 
2519 	/*
2520 	 * Compute the timeout hash value and insert the entry
2521 	 */
2522 	BW_METER_TIMEHASH(x, time_hash);
2523 	x->bm_time_next = bw_meter_timers[time_hash];
2524 	bw_meter_timers[time_hash] = x;
2525 	x->bm_time_hash = time_hash;
2526 }
2527 
2528 /*
2529  * Unschedule the periodic timer that processes bw_meter entry of type "<="
2530  * by removing the entry from the proper hash bucket.
2531  */
2532 static void
2533 unschedule_bw_meter(struct bw_meter *x)
2534 {
2535 	int time_hash;
2536 	struct bw_meter *prev, *tmp;
2537 
2538 	if (!(x->bm_flags & BW_METER_LEQ))
2539 		return;		/* XXX: we schedule timers only for "<=" entries */
2540 
2541 	/*
2542 	 * Compute the timeout hash value and delete the entry
2543 	 */
2544 	time_hash = x->bm_time_hash;
2545 	if (time_hash >= BW_METER_BUCKETS)
2546 		return;		/* Entry was not scheduled */
2547 
2548 	for (prev = NULL, tmp = bw_meter_timers[time_hash];
2549 	     tmp != NULL; prev = tmp, tmp = tmp->bm_time_next)
2550 		if (tmp == x)
2551 			break;
2552 
2553 	if (tmp == NULL)
2554 		panic("unschedule_bw_meter: bw_meter entry not found");
2555 
2556 	if (prev != NULL)
2557 		prev->bm_time_next = x->bm_time_next;
2558 	else
2559 		bw_meter_timers[time_hash] = x->bm_time_next;
2560 
2561 	x->bm_time_next = NULL;
2562 	x->bm_time_hash = BW_METER_BUCKETS;
2563 }
2564 
2565 /*
2566  * Process all "<=" type of bw_meter that should be processed now,
2567  * and for each entry prepare an upcall if necessary. Each processed
2568  * entry is rescheduled again for the (periodic) processing.
2569  *
2570  * This is run periodically (once per second normally). On each round,
2571  * all the potentially matching entries are in the hash slot that we are
2572  * looking at.
2573  */
2574 static void
2575 bw_meter_process(void)
2576 {
2577 	int s;
2578 	static uint32_t last_tv_sec;	/* last time we processed this */
2579 
2580 	uint32_t loops;
2581 	int i;
2582 	struct timeval now, process_endtime;
2583 
2584 	microtime(&now);
2585 	if (last_tv_sec == now.tv_sec)
2586 		return;		/* nothing to do */
2587 
2588 	loops = now.tv_sec - last_tv_sec;
2589 	last_tv_sec = now.tv_sec;
2590 	if (loops > BW_METER_BUCKETS)
2591 		loops = BW_METER_BUCKETS;
2592 
2593 	s = splsoftnet();
2594 	/*
2595 	 * Process all bins of bw_meter entries from the one after the last
2596 	 * processed to the current one. On entry, i points to the last bucket
2597 	 * visited, so we need to increment i at the beginning of the loop.
2598 	 */
2599 	for (i = (now.tv_sec - loops) % BW_METER_BUCKETS; loops > 0; loops--) {
2600 		struct bw_meter *x, *tmp_list;
2601 
2602 		if (++i >= BW_METER_BUCKETS)
2603 			i = 0;
2604 
2605 		/* Disconnect the list of bw_meter entries from the bin */
2606 		tmp_list = bw_meter_timers[i];
2607 		bw_meter_timers[i] = NULL;
2608 
2609 		/* Process the list of bw_meter entries */
2610 		while (tmp_list != NULL) {
2611 			x = tmp_list;
2612 			tmp_list = tmp_list->bm_time_next;
2613 
2614 			/* Test if the time interval is over */
2615 			process_endtime = x->bm_start_time;
2616 			BW_TIMEVALADD(&process_endtime, &x->bm_threshold.b_time);
2617 			if (BW_TIMEVALCMP(&process_endtime, &now, >)) {
2618 				/* Not yet: reschedule, but don't reset */
2619 				int time_hash;
2620 
2621 				BW_METER_TIMEHASH(x, time_hash);
2622 				if (time_hash == i && process_endtime.tv_sec == now.tv_sec) {
2623 					/*
2624 					 * XXX: somehow the bin processing is a bit ahead of time.
2625 					 * Put the entry in the next bin.
2626 					 */
2627 					if (++time_hash >= BW_METER_BUCKETS)
2628 						time_hash = 0;
2629 				}
2630 				x->bm_time_next = bw_meter_timers[time_hash];
2631 				bw_meter_timers[time_hash] = x;
2632 				x->bm_time_hash = time_hash;
2633 
2634 				continue;
2635 			}
2636 
2637 			/*
2638 			 * Test if we should deliver an upcall
2639 			 */
2640 			if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
2641 			    (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) ||
2642 			    ((x->bm_flags & BW_METER_UNIT_BYTES) &&
2643 			    (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) {
2644 				/* Prepare an upcall for delivery */
2645 				bw_meter_prepare_upcall(x, &now);
2646 			}
2647 
2648 			/*
2649 			  * Reschedule for next processing
2650 			 */
2651 			schedule_bw_meter(x, &now);
2652 		}
2653 	}
2654 
2655 	/* Send all upcalls that are pending delivery */
2656 	bw_upcalls_send();
2657 
2658 	splx(s);
2659 }
2660 
2661 /*
2662  * A periodic function for sending all upcalls that are pending delivery
2663  */
2664 static void
2665 expire_bw_upcalls_send(void *unused)
2666 {
2667 	int s;
2668 
2669 	s = splsoftnet();
2670 	bw_upcalls_send();
2671 	splx(s);
2672 
2673 	callout_reset(&bw_upcalls_ch, BW_UPCALLS_PERIOD,
2674 	    expire_bw_upcalls_send, NULL);
2675 }
2676 
2677 /*
2678  * A periodic function for periodic scanning of the multicast forwarding
2679  * table for processing all "<=" bw_meter entries.
2680  */
2681 static void
2682 expire_bw_meter_process(void *unused)
2683 {
2684 	if (mrt_api_config & MRT_MFC_BW_UPCALL)
2685 		bw_meter_process();
2686 
2687 	callout_reset(&bw_meter_ch, BW_METER_PERIOD,
2688 	    expire_bw_meter_process, NULL);
2689 }
2690 
2691 /*
2692  * End of bandwidth monitoring code
2693  */
2694 
2695 #ifdef PIM
2696 /*
2697  * Send the packet up to the user daemon, or eventually do kernel encapsulation
2698  */
2699 static int
2700 pim_register_send(struct ip *ip, struct vif *vifp, struct mbuf *m,
2701     struct mfc *rt)
2702 {
2703 	struct mbuf *mb_copy, *mm;
2704 
2705 	if (mrtdebug & DEBUG_PIM)
2706 		log(LOG_DEBUG, "pim_register_send: \n");
2707 
2708 	mb_copy = pim_register_prepare(ip, m);
2709 	if (mb_copy == NULL)
2710 		return ENOBUFS;
2711 
2712 	/*
2713 	 * Send all the fragments. Note that the mbuf for each fragment
2714 	 * is freed by the sending machinery.
2715 	 */
2716 	for (mm = mb_copy; mm; mm = mb_copy) {
2717 		mb_copy = mm->m_nextpkt;
2718 		mm->m_nextpkt = NULL;
2719 		mm = m_pullup(mm, sizeof(struct ip));
2720 		if (mm != NULL) {
2721 			ip = mtod(mm, struct ip *);
2722 			if ((mrt_api_config & MRT_MFC_RP) &&
2723 			    !in_nullhost(rt->mfc_rp)) {
2724 				pim_register_send_rp(ip, vifp, mm, rt);
2725 			} else {
2726 				pim_register_send_upcall(ip, vifp, mm, rt);
2727 			}
2728 		}
2729 	}
2730 
2731 	return 0;
2732 }
2733 
2734 /*
2735  * Return a copy of the data packet that is ready for PIM Register
2736  * encapsulation.
2737  * XXX: Note that in the returned copy the IP header is a valid one.
2738  */
2739 static struct mbuf *
2740 pim_register_prepare(struct ip *ip, struct mbuf *m)
2741 {
2742 	struct mbuf *mb_copy = NULL;
2743 	int mtu;
2744 
2745 	/* Take care of delayed checksums */
2746 	if (m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) {
2747 		in_undefer_cksum_tcpudp(m);
2748 		m->m_pkthdr.csum_flags &= ~(M_CSUM_TCPv4|M_CSUM_UDPv4);
2749 	}
2750 
2751 	/*
2752 	 * Copy the old packet & pullup its IP header into the
2753 	 * new mbuf so we can modify it.
2754 	 */
2755 	mb_copy = m_copypacket(m, M_DONTWAIT);
2756 	if (mb_copy == NULL)
2757 		return NULL;
2758 	mb_copy = m_pullup(mb_copy, ip->ip_hl << 2);
2759 	if (mb_copy == NULL)
2760 		return NULL;
2761 
2762 	/* take care of the TTL */
2763 	ip = mtod(mb_copy, struct ip *);
2764 	--ip->ip_ttl;
2765 
2766 	/* Compute the MTU after the PIM Register encapsulation */
2767 	mtu = 0xffff - sizeof(pim_encap_iphdr) - sizeof(pim_encap_pimhdr);
2768 
2769 	if (ntohs(ip->ip_len) <= mtu) {
2770 		/* Turn the IP header into a valid one */
2771 		ip->ip_sum = 0;
2772 		ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2);
2773 	} else {
2774 		/* Fragment the packet */
2775 		if (ip_fragment(mb_copy, NULL, mtu) != 0) {
2776 			/* XXX: mb_copy was freed by ip_fragment() */
2777 			return NULL;
2778 		}
2779 	}
2780 	return mb_copy;
2781 }
2782 
2783 /*
2784  * Send an upcall with the data packet to the user-level process.
2785  */
2786 static int
2787 pim_register_send_upcall(struct ip *ip, struct vif *vifp,
2788     struct mbuf *mb_copy, struct mfc *rt)
2789 {
2790 	struct mbuf *mb_first;
2791 	int len = ntohs(ip->ip_len);
2792 	struct igmpmsg *im;
2793 	struct sockaddr_in k_igmpsrc = {
2794 		.sin_len = sizeof(k_igmpsrc),
2795 		.sin_family = AF_INET,
2796 	};
2797 
2798 	/*
2799 	 * Add a new mbuf with an upcall header
2800 	 */
2801 	MGETHDR(mb_first, M_DONTWAIT, MT_HEADER);
2802 	if (mb_first == NULL) {
2803 		m_freem(mb_copy);
2804 		return ENOBUFS;
2805 	}
2806 	mb_first->m_data += max_linkhdr;
2807 	mb_first->m_pkthdr.len = len + sizeof(struct igmpmsg);
2808 	mb_first->m_len = sizeof(struct igmpmsg);
2809 	mb_first->m_next = mb_copy;
2810 
2811 	/* Send message to routing daemon */
2812 	im = mtod(mb_first, struct igmpmsg *);
2813 	im->im_msgtype	= IGMPMSG_WHOLEPKT;
2814 	im->im_mbz	= 0;
2815 	im->im_vif	= vifp - viftable;
2816 	im->im_src	= ip->ip_src;
2817 	im->im_dst	= ip->ip_dst;
2818 
2819 	k_igmpsrc.sin_addr	= ip->ip_src;
2820 
2821 	mrtstat.mrts_upcalls++;
2822 
2823 	if (socket_send(ip_mrouter, mb_first, &k_igmpsrc) < 0) {
2824 		if (mrtdebug & DEBUG_PIM)
2825 			log(LOG_WARNING,
2826 			    "mcast: pim_register_send_upcall: ip_mrouter socket queue full\n");
2827 		++mrtstat.mrts_upq_sockfull;
2828 		return ENOBUFS;
2829 	}
2830 
2831 	/* Keep statistics */
2832 	pimstat.pims_snd_registers_msgs++;
2833 	pimstat.pims_snd_registers_bytes += len;
2834 
2835 	return 0;
2836 }
2837 
2838 /*
2839  * Encapsulate the data packet in PIM Register message and send it to the RP.
2840  */
2841 static int
2842 pim_register_send_rp(struct ip *ip, struct vif *vifp,
2843     struct mbuf *mb_copy, struct mfc *rt)
2844 {
2845 	struct mbuf *mb_first;
2846 	struct ip *ip_outer;
2847 	struct pim_encap_pimhdr *pimhdr;
2848 	int len = ntohs(ip->ip_len);
2849 	vifi_t vifi = rt->mfc_parent;
2850 
2851 	if ((vifi >= numvifs) || in_nullhost(viftable[vifi].v_lcl_addr)) {
2852 		m_freem(mb_copy);
2853 		return EADDRNOTAVAIL;		/* The iif vif is invalid */
2854 	}
2855 
2856 	/*
2857 	 * Add a new mbuf with the encapsulating header
2858 	 */
2859 	MGETHDR(mb_first, M_DONTWAIT, MT_HEADER);
2860 	if (mb_first == NULL) {
2861 		m_freem(mb_copy);
2862 		return ENOBUFS;
2863 	}
2864 	mb_first->m_data += max_linkhdr;
2865 	mb_first->m_len = sizeof(pim_encap_iphdr) + sizeof(pim_encap_pimhdr);
2866 	mb_first->m_next = mb_copy;
2867 
2868 	mb_first->m_pkthdr.len = len + mb_first->m_len;
2869 
2870 	/*
2871 	 * Fill in the encapsulating IP and PIM header
2872 	 */
2873 	ip_outer = mtod(mb_first, struct ip *);
2874 	*ip_outer = pim_encap_iphdr;
2875 	if (mb_first->m_pkthdr.len < IP_MINFRAGSIZE)
2876 		ip_outer->ip_id = 0;
2877 	else
2878 		ip_outer->ip_id = ip_newid(NULL);
2879 	ip_outer->ip_len = htons(len + sizeof(pim_encap_iphdr) +
2880 	    sizeof(pim_encap_pimhdr));
2881 	ip_outer->ip_src = viftable[vifi].v_lcl_addr;
2882 	ip_outer->ip_dst = rt->mfc_rp;
2883 	/*
2884 	 * Copy the inner header TOS to the outer header, and take care of the
2885 	 * IP_DF bit.
2886 	 */
2887 	ip_outer->ip_tos = ip->ip_tos;
2888 	if (ntohs(ip->ip_off) & IP_DF)
2889 		ip_outer->ip_off |= htons(IP_DF);
2890 	pimhdr = (struct pim_encap_pimhdr *)((char *)ip_outer
2891 	    + sizeof(pim_encap_iphdr));
2892 	*pimhdr = pim_encap_pimhdr;
2893 	/* If the iif crosses a border, set the Border-bit */
2894 	if (rt->mfc_flags[vifi] & MRT_MFC_FLAGS_BORDER_VIF & mrt_api_config)
2895 		pimhdr->flags |= htonl(PIM_BORDER_REGISTER);
2896 
2897 	mb_first->m_data += sizeof(pim_encap_iphdr);
2898 	pimhdr->pim.pim_cksum = in_cksum(mb_first, sizeof(pim_encap_pimhdr));
2899 	mb_first->m_data -= sizeof(pim_encap_iphdr);
2900 
2901 	if (vifp->v_rate_limit == 0)
2902 		tbf_send_packet(vifp, mb_first);
2903 	else
2904 		tbf_control(vifp, mb_first, ip, ntohs(ip_outer->ip_len));
2905 
2906 	/* Keep statistics */
2907 	pimstat.pims_snd_registers_msgs++;
2908 	pimstat.pims_snd_registers_bytes += len;
2909 
2910 	return 0;
2911 }
2912 
2913 /*
2914  * PIM-SMv2 and PIM-DM messages processing.
2915  * Receives and verifies the PIM control messages, and passes them
2916  * up to the listening socket, using rip_input().
2917  * The only message with special processing is the PIM_REGISTER message
2918  * (used by PIM-SM): the PIM header is stripped off, and the inner packet
2919  * is passed to if_simloop().
2920  */
2921 void
2922 pim_input(struct mbuf *m, int off, int proto)
2923 {
2924 	struct ip *ip = mtod(m, struct ip *);
2925 	struct pim *pim;
2926 	int minlen;
2927 	int datalen;
2928 	int ip_tos;
2929 	int iphlen;
2930 
2931 	iphlen = off;
2932 	datalen = ntohs(ip->ip_len) - iphlen;
2933 
2934 	/* Keep statistics */
2935 	pimstat.pims_rcv_total_msgs++;
2936 	pimstat.pims_rcv_total_bytes += datalen;
2937 
2938 	/*
2939 	 * Validate lengths
2940 	 */
2941 	if (datalen < PIM_MINLEN) {
2942 		pimstat.pims_rcv_tooshort++;
2943 		log(LOG_ERR, "pim_input: packet size too small %d from %lx\n",
2944 		    datalen, (u_long)ip->ip_src.s_addr);
2945 		m_freem(m);
2946 		return;
2947 	}
2948 
2949 	/*
2950 	 * If the packet is at least as big as a REGISTER, go ahead
2951 	 * and grab the PIM REGISTER header size, to avoid another
2952 	 * possible m_pullup() later.
2953 	 *
2954 	 * PIM_MINLEN       == pimhdr + u_int32_t == 4 + 4 = 8
2955 	 * PIM_REG_MINLEN   == pimhdr + reghdr + encap_iphdr == 4 + 4 + 20 = 28
2956 	 */
2957 	minlen = iphlen + (datalen >= PIM_REG_MINLEN ? PIM_REG_MINLEN : PIM_MINLEN);
2958 
2959 	/*
2960 	 * Get the IP and PIM headers in contiguous memory, and
2961 	 * possibly the PIM REGISTER header.
2962 	 */
2963 	if ((m->m_flags & M_EXT || m->m_len < minlen) &&
2964 	    (m = m_pullup(m, minlen)) == NULL) {
2965 		log(LOG_ERR, "pim_input: m_pullup failure\n");
2966 		return;
2967 	}
2968 	ip = mtod(m, struct ip *);
2969 	ip_tos = ip->ip_tos;
2970 
2971 	/* adjust mbuf to point to the PIM header */
2972 	m->m_data += iphlen;
2973 	m->m_len  -= iphlen;
2974 	pim = mtod(m, struct pim *);
2975 
2976 	/*
2977 	 * Validate checksum. If PIM REGISTER, exclude the data packet.
2978 	 *
2979 	 * XXX: some older PIMv2 implementations don't make this distinction,
2980 	 * so for compatibility reason perform the checksum over part of the
2981 	 * message, and if error, then over the whole message.
2982 	 */
2983 	if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER && in_cksum(m, PIM_MINLEN) == 0) {
2984 		/* do nothing, checksum okay */
2985 	} else if (in_cksum(m, datalen)) {
2986 		pimstat.pims_rcv_badsum++;
2987 		if (mrtdebug & DEBUG_PIM)
2988 			log(LOG_DEBUG, "pim_input: invalid checksum\n");
2989 		m_freem(m);
2990 		return;
2991 	}
2992 
2993 	/* PIM version check */
2994 	if (PIM_VT_V(pim->pim_vt) < PIM_VERSION) {
2995 		pimstat.pims_rcv_badversion++;
2996 		log(LOG_ERR, "pim_input: incorrect version %d, expecting %d\n",
2997 		    PIM_VT_V(pim->pim_vt), PIM_VERSION);
2998 		m_freem(m);
2999 		return;
3000 	}
3001 
3002 	/* restore mbuf back to the outer IP */
3003 	m->m_data -= iphlen;
3004 	m->m_len  += iphlen;
3005 
3006 	if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER) {
3007 		/*
3008 		 * Since this is a REGISTER, we'll make a copy of the register
3009 		 * headers ip + pim + u_int32 + encap_ip, to be passed up to the
3010 		 * routing daemon.
3011 		 */
3012 		int s;
3013 		struct sockaddr_in dst = {
3014 			.sin_len = sizeof(dst),
3015 			.sin_family = AF_INET,
3016 		};
3017 		struct mbuf *mcp;
3018 		struct ip *encap_ip;
3019 		u_int32_t *reghdr;
3020 		struct ifnet *vifp;
3021 
3022 		s = splsoftnet();
3023 		if ((reg_vif_num >= numvifs) || (reg_vif_num == VIFI_INVALID)) {
3024 			splx(s);
3025 			if (mrtdebug & DEBUG_PIM)
3026 				log(LOG_DEBUG,
3027 				    "pim_input: register vif not set: %d\n", reg_vif_num);
3028 			m_freem(m);
3029 			return;
3030 		}
3031 		/* XXX need refcnt? */
3032 		vifp = viftable[reg_vif_num].v_ifp;
3033 		splx(s);
3034 
3035 		/*
3036 		 * Validate length
3037 		 */
3038 		if (datalen < PIM_REG_MINLEN) {
3039 			pimstat.pims_rcv_tooshort++;
3040 			pimstat.pims_rcv_badregisters++;
3041 			log(LOG_ERR,
3042 			    "pim_input: register packet size too small %d from %lx\n",
3043 			    datalen, (u_long)ip->ip_src.s_addr);
3044 			m_freem(m);
3045 			return;
3046 		}
3047 
3048 		reghdr = (u_int32_t *)(pim + 1);
3049 		encap_ip = (struct ip *)(reghdr + 1);
3050 
3051 		if (mrtdebug & DEBUG_PIM) {
3052 			log(LOG_DEBUG,
3053 			    "pim_input[register], encap_ip: %lx -> %lx, encap_ip len %d\n",
3054 			    (u_long)ntohl(encap_ip->ip_src.s_addr),
3055 			    (u_long)ntohl(encap_ip->ip_dst.s_addr),
3056 			    ntohs(encap_ip->ip_len));
3057 		}
3058 
3059 		/* verify the version number of the inner packet */
3060 		if (encap_ip->ip_v != IPVERSION) {
3061 			pimstat.pims_rcv_badregisters++;
3062 			if (mrtdebug & DEBUG_PIM) {
3063 				log(LOG_DEBUG, "pim_input: invalid IP version (%d) "
3064 				    "of the inner packet\n", encap_ip->ip_v);
3065 			}
3066 			m_freem(m);
3067 			return;
3068 		}
3069 
3070 		/* verify the inner packet doesn't have options */
3071 		if (encap_ip->ip_hl != (sizeof(struct ip) >> 2)) {
3072 			pimstat.pims_rcv_badregisters++;
3073 			m_freem(m);
3074 			return;
3075 		}
3076 
3077 		/* verify the inner packet is destined to a mcast group */
3078 		if (!IN_MULTICAST(encap_ip->ip_dst.s_addr)) {
3079 			pimstat.pims_rcv_badregisters++;
3080 			 if (mrtdebug & DEBUG_PIM)
3081 				log(LOG_DEBUG,
3082 				    "pim_input: inner packet of register is not "
3083 				    "multicast %lx\n",
3084 				    (u_long)ntohl(encap_ip->ip_dst.s_addr));
3085 			m_freem(m);
3086 			return;
3087 		}
3088 
3089 		/* If a NULL_REGISTER, pass it to the daemon */
3090 		if ((ntohl(*reghdr) & PIM_NULL_REGISTER))
3091 			goto pim_input_to_daemon;
3092 
3093 		/*
3094 		 * Copy the TOS from the outer IP header to the inner IP header.
3095 		 */
3096 		if (encap_ip->ip_tos != ip_tos) {
3097 			/* Outer TOS -> inner TOS */
3098 			encap_ip->ip_tos = ip_tos;
3099 			/* Recompute the inner header checksum. Sigh... */
3100 
3101 			/* adjust mbuf to point to the inner IP header */
3102 			m->m_data += (iphlen + PIM_MINLEN);
3103 			m->m_len  -= (iphlen + PIM_MINLEN);
3104 
3105 			encap_ip->ip_sum = 0;
3106 			encap_ip->ip_sum = in_cksum(m, encap_ip->ip_hl << 2);
3107 
3108 			/* restore mbuf to point back to the outer IP header */
3109 			m->m_data -= (iphlen + PIM_MINLEN);
3110 			m->m_len  += (iphlen + PIM_MINLEN);
3111 		}
3112 
3113 		/*
3114 		 * Decapsulate the inner IP packet and loopback to forward it
3115 		 * as a normal multicast packet. Also, make a copy of the
3116 		 *     outer_iphdr + pimhdr + reghdr + encap_iphdr
3117 		 * to pass to the daemon later, so it can take the appropriate
3118 		 * actions (e.g., send back PIM_REGISTER_STOP).
3119 		 * XXX: here m->m_data points to the outer IP header.
3120 		 */
3121 		mcp = m_copym(m, 0, iphlen + PIM_REG_MINLEN, M_DONTWAIT);
3122 		if (mcp == NULL) {
3123 			log(LOG_ERR,
3124 			    "pim_input: pim register: could not copy register head\n");
3125 			m_freem(m);
3126 			return;
3127 		}
3128 
3129 		/* Keep statistics */
3130 		/* XXX: registers_bytes include only the encap. mcast pkt */
3131 		pimstat.pims_rcv_registers_msgs++;
3132 		pimstat.pims_rcv_registers_bytes += ntohs(encap_ip->ip_len);
3133 
3134 		/*
3135 		 * forward the inner ip packet; point m_data at the inner ip.
3136 		 */
3137 		m_adj(m, iphlen + PIM_MINLEN);
3138 
3139 		if (mrtdebug & DEBUG_PIM) {
3140 			log(LOG_DEBUG,
3141 			    "pim_input: forwarding decapsulated register: "
3142 			    "src %lx, dst %lx, vif %d\n",
3143 			    (u_long)ntohl(encap_ip->ip_src.s_addr),
3144 			    (u_long)ntohl(encap_ip->ip_dst.s_addr),
3145 			    reg_vif_num);
3146 		}
3147 		/* NB: vifp was collected above; can it change on us? */
3148 		looutput(vifp, m, (struct sockaddr *)&dst, NULL);
3149 
3150 		/* prepare the register head to send to the mrouting daemon */
3151 		m = mcp;
3152 	}
3153 
3154 pim_input_to_daemon:
3155 	/*
3156 	 * Pass the PIM message up to the daemon; if it is a Register message,
3157 	 * pass the 'head' only up to the daemon. This includes the
3158 	 * outer IP header, PIM header, PIM-Register header and the
3159 	 * inner IP header.
3160 	 * XXX: the outer IP header pkt size of a Register is not adjust to
3161 	 * reflect the fact that the inner multicast data is truncated.
3162 	 */
3163 	/*
3164 	 * Currently, pim_input() is always called holding softnet_lock
3165 	 * by ipintr()(!NET_MPSAFE) or PR_INPUT_WRAP()(NET_MPSAFE).
3166 	 */
3167 	KASSERT(mutex_owned(softnet_lock));
3168 	rip_input(m, iphlen, proto);
3169 
3170 	return;
3171 }
3172 #endif /* PIM */
3173