1 /*	$OpenBSD: ip_mroute.c,v 1.67 2014/07/12 18:44:23 tedu Exp $	*/
2 /*	$NetBSD: ip_mroute.c,v 1.85 2004/04/26 01:31:57 matt Exp $	*/
3 
4 /*
5  * Copyright (c) 1989 Stephen Deering
6  * Copyright (c) 1992, 1993
7  *      The Regents of the University of California.  All rights reserved.
8  *
9  * This code is derived from software contributed to Berkeley by
10  * Stephen Deering of Stanford University.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  * 3. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *      @(#)ip_mroute.c 8.2 (Berkeley) 11/15/93
37  */
38 
39 /*
40  * IP multicast forwarding procedures
41  *
42  * Written by David Waitzman, BBN Labs, August 1988.
43  * Modified by Steve Deering, Stanford, February 1989.
44  * Modified by Mark J. Steiglitz, Stanford, May, 1991
45  * Modified by Van Jacobson, LBL, January 1993
46  * Modified by Ajit Thyagarajan, PARC, August 1993
47  * Modified by Bill Fenner, PARC, April 1994
48  * Modified by Charles M. Hannum, NetBSD, May 1995.
49  * Modified by Ahmed Helmy, SGI, June 1996
50  * Modified by George Edmond Eddy (Rusty), ISI, February 1998
51  * Modified by Pavlin Radoslavov, USC/ISI, May 1998, August 1999, October 2000
52  * Modified by Hitoshi Asaeda, WIDE, August 2000
53  * Modified by Pavlin Radoslavov, ICSI, October 2002
54  *
55  * MROUTING Revision: 1.2
56  * and PIM-SMv2 and PIM-DM support, advanced API support,
57  * bandwidth metering and signaling
58  */
59 
60 #ifdef PIM
61 #define _PIM_VT 1
62 #endif
63 
64 #include <sys/param.h>
65 #include <sys/systm.h>
66 #include <sys/mbuf.h>
67 #include <sys/socket.h>
68 #include <sys/socketvar.h>
69 #include <sys/protosw.h>
70 #include <sys/errno.h>
71 #include <sys/time.h>
72 #include <sys/kernel.h>
73 #include <sys/ioctl.h>
74 #include <sys/syslog.h>
75 #include <sys/timeout.h>
76 
77 #include <net/if.h>
78 #include <net/route.h>
79 #include <net/raw_cb.h>
80 
81 #include <netinet/in.h>
82 #include <netinet/in_systm.h>
83 #include <netinet/ip.h>
84 #include <netinet/ip_var.h>
85 #include <netinet/in_pcb.h>
86 #include <netinet/udp.h>
87 #include <netinet/igmp.h>
88 #include <netinet/igmp_var.h>
89 #include <netinet/ip_mroute.h>
90 #ifdef PIM
91 #include <netinet/pim.h>
92 #include <netinet/pim_var.h>
93 #endif
94 
95 #include <sys/stdarg.h>
96 
97 #define IP_MULTICASTOPTS 0
98 #define	M_PULLUP(m, len)						 \
99 	do {								 \
100 		if ((m) && ((m)->m_flags & M_EXT || (m)->m_len < (len))) \
101 			(m) = m_pullup((m), (len));			 \
102 	} while (/*CONSTCOND*/ 0)
103 
104 /*
105  * Globals.  All but ip_mrouter and ip_mrtproto could be static,
106  * except for netstat or debugging purposes.
107  */
108 struct socket  *ip_mrouter  = NULL;
109 int		ip_mrtproto = IGMP_DVMRP;    /* for netstat only */
110 
111 #define NO_RTE_FOUND	0x1
112 #define RTE_FOUND	0x2
113 
114 #define	MFCHASH(a, g)							\
115 	((((a).s_addr >> 20) ^ ((a).s_addr >> 10) ^ (a).s_addr ^	\
116 	    ((g).s_addr >> 20) ^ ((g).s_addr >> 10) ^ (g).s_addr) & mfchash)
117 LIST_HEAD(mfchashhdr, mfc) *mfchashtbl;
118 u_long	mfchash;
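
/*
 * The MFC is kept as a hash table keyed on the (origin, group) pair:
 * MFCHASH() folds both addresses together and masks the result with
 * mfchash, the size mask handed back by hashinit() in ip_mrouter_init().
 */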
119 
120 u_char		nexpire[MFCTBLSIZ];
121 struct vif	viftable[MAXVIFS];
122 struct mrtstat	mrtstat;
123 u_int		mrtdebug = 0;	  /* debug level */
124 #define		DEBUG_MFC	0x02
125 #define		DEBUG_FORWARD	0x04
126 #define		DEBUG_EXPIRE	0x08
127 #define		DEBUG_XMIT	0x10
128 #define		DEBUG_PIM	0x20
129 
130 #define		VIFI_INVALID	((vifi_t) -1)
131 
132 #define		EXPIRE_TIMEOUT	250		/* 4x / second */
133 #define		UPCALL_EXPIRE	6		/* number of timeouts */
134 struct timeout	expire_upcalls_ch;
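
/*
 * With the values above, an unresolved upcall entry is kept for
 * UPCALL_EXPIRE ticks of EXPIRE_TIMEOUT ms each, i.e. about 1.5 seconds,
 * before expire_upcalls() tears it down.
 */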
135 
136 static int get_sg_cnt(struct sioc_sg_req *);
137 static int get_vif_cnt(struct sioc_vif_req *);
138 static int ip_mrouter_init(struct socket *, struct mbuf *);
139 static int get_version(struct mbuf *);
140 static int set_assert(struct mbuf *);
141 static int get_assert(struct mbuf *);
142 static int add_vif(struct mbuf *);
143 static int del_vif(struct mbuf *);
144 static void update_mfc_params(struct mfc *, struct mfcctl2 *);
145 static void init_mfc_params(struct mfc *, struct mfcctl2 *);
146 static void expire_mfc(struct mfc *);
147 static int add_mfc(struct mbuf *);
148 static int del_mfc(struct mbuf *);
149 static int set_api_config(struct mbuf *); /* choose API capabilities */
150 static int get_api_support(struct mbuf *);
151 static int get_api_config(struct mbuf *);
152 static int socket_send(struct socket *, struct mbuf *,
153 			    struct sockaddr_in *);
154 static void expire_upcalls(void *);
155 static int ip_mdq(struct mbuf *, struct ifnet *, struct mfc *);
156 static void phyint_send(struct ip *, struct vif *, struct mbuf *);
157 static void encap_send(struct ip *, struct vif *, struct mbuf *);
158 static void send_packet(struct vif *, struct mbuf *);
159 
160 /*
161  * Bandwidth monitoring
162  */
163 static void free_bw_list(struct bw_meter *);
164 static int add_bw_upcall(struct mbuf *);
165 static int del_bw_upcall(struct mbuf *);
166 static void bw_meter_receive_packet(struct bw_meter *, int , struct timeval *);
167 static void bw_meter_prepare_upcall(struct bw_meter *, struct timeval *);
168 static void bw_upcalls_send(void);
169 static void schedule_bw_meter(struct bw_meter *, struct timeval *);
170 static void unschedule_bw_meter(struct bw_meter *);
171 static void bw_meter_process(void);
172 static void expire_bw_upcalls_send(void *);
173 static void expire_bw_meter_process(void *);
174 
175 #ifdef PIM
176 static int pim_register_send(struct ip *, struct vif *,
177 		struct mbuf *, struct mfc *);
178 static int pim_register_send_rp(struct ip *, struct vif *,
179 		struct mbuf *, struct mfc *);
180 static int pim_register_send_upcall(struct ip *, struct vif *,
181 		struct mbuf *, struct mfc *);
182 static struct mbuf *pim_register_prepare(struct ip *, struct mbuf *);
183 #endif
184 
185 /*
186  * 'Interfaces' associated with decapsulator (so we can tell
187  * packets that went through it from ones that get reflected
188  * by a broken gateway).  These interfaces are never linked into
189  * the system ifnet list & no routes point to them.  I.e., packets
190  * can't be sent this way.  They only exist as a placeholder for
191  * multicast source verification.
192  */
193 #if 0
194 struct ifnet multicast_decap_if[MAXVIFS];
195 #endif
196 
197 #define	ENCAP_TTL	64
198 #define	ENCAP_PROTO	IPPROTO_IPIP	/* 4 */
199 
200 /* prototype IP hdr for encapsulated packets */
201 struct ip multicast_encap_iphdr = {
202 #if BYTE_ORDER == LITTLE_ENDIAN
203 	sizeof(struct ip) >> 2, IPVERSION,
204 #else
205 	IPVERSION, sizeof(struct ip) >> 2,
206 #endif
207 	0,				/* tos */
208 	sizeof(struct ip),		/* total length */
209 	0,				/* id */
210 	0,				/* frag offset */
211 	ENCAP_TTL, ENCAP_PROTO,
212 	0,				/* checksum */
213 };
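
/*
 * ip_src, ip_dst and ip_id are left zero here; encap_send() fills in the
 * tunnel endpoints from the vif's local and remote addresses and assigns
 * a fresh random ip_id for every packet it encapsulates.
 */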
214 
215 /*
216  * Bandwidth meter variables and constants
217  */
218 
219 /*
220  * Pending timeouts are stored in a hash table, the key being the
221  * expiration time. Periodically, the entries are analysed and processed.
222  */
223 #define BW_METER_BUCKETS	1024
224 static struct bw_meter *bw_meter_timers[BW_METER_BUCKETS];
225 struct timeout bw_meter_ch;
226 #define BW_METER_PERIOD 1000	/* periodical handling of bw meters (in ms) */
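
/*
 * The bw_meter_ch timeout, armed in ip_mrouter_init(), fires every
 * BW_METER_PERIOD ms and drives this periodic processing through
 * expire_bw_meter_process().
 */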
227 
228 /*
229  * Pending upcalls are stored in a vector which is flushed when
230  * full, or periodically.
231  */
232 static struct bw_upcall	bw_upcalls[BW_UPCALLS_MAX];
233 static u_int	bw_upcalls_n; /* # of pending upcalls */
234 struct timeout	bw_upcalls_ch;
235 #define BW_UPCALLS_PERIOD 1000	/* periodical flush of bw upcalls (in ms) */
236 
237 #ifdef PIM
238 struct pimstat pimstat;
239 
240 /*
241  * Note: the PIM Register encapsulation adds the following in front of a
242  * data packet:
243  *
244  * struct pim_encap_hdr {
245  *    struct ip ip;
246  *    struct pim_encap_pimhdr  pim;
247  * }
248  *
249  */
250 
251 struct pim_encap_pimhdr {
252 	struct pim pim;
253 	uint32_t   flags;
254 };
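
/*
 * Together with the encapsulating IP header below, this adds
 * sizeof(struct ip) + sizeof(struct pim_encap_pimhdr) bytes
 * (28 on the usual layout) in front of each data packet sent to the RP.
 */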
255 
256 static struct ip pim_encap_iphdr = {
257 #if BYTE_ORDER == LITTLE_ENDIAN
258 	sizeof(struct ip) >> 2,
259 	IPVERSION,
260 #else
261 	IPVERSION,
262 	sizeof(struct ip) >> 2,
263 #endif
264 	0,			/* tos */
265 	sizeof(struct ip),	/* total length */
266 	0,			/* id */
267 	0,			/* frag offset */
268 	ENCAP_TTL,
269 	IPPROTO_PIM,
270 	0,			/* checksum */
271 };
272 
273 static struct pim_encap_pimhdr pim_encap_pimhdr = {
274     {
275 	PIM_MAKE_VT(PIM_VERSION, PIM_REGISTER), /* PIM vers and message type */
276 	0,			/* reserved */
277 	0,			/* checksum */
278     },
279     0				/* flags */
280 };
281 
282 static struct ifnet multicast_register_if;
283 static vifi_t reg_vif_num = VIFI_INVALID;
284 #endif /* PIM */
285 
286 
287 /*
288  * Private variables.
289  */
290 static vifi_t	   numvifs = 0;
291 static int have_encap_tunnel = 0;
292 
293 /*
294  * whether or not special PIM assert processing is enabled.
295  */
296 static int pim_assert;
297 /*
298  * Rate limit for assert notification messages, in usec
299  */
300 #define ASSERT_MSG_TIME		3000000
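
/* i.e. at most one IGMPMSG_WRONGVIF upcall per (S,G) entry every 3 seconds */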
301 
302 /*
303  * Kernel multicast routing API capabilities and setup.
304  * If more API capabilities are added to the kernel, they should be
305  * recorded in `mrt_api_support'.
306  */
307 static const u_int32_t mrt_api_support = (MRT_MFC_FLAGS_DISABLE_WRONGVIF |
308 					  MRT_MFC_FLAGS_BORDER_VIF |
309 					  MRT_MFC_RP |
310 					  MRT_MFC_BW_UPCALL);
311 static u_int32_t mrt_api_config = 0;
312 
313 /*
314  * Find a route for a given origin IP address and multicast group address.
315  * A type-of-service parameter may be added in the future.
316  * Statistics are updated by the caller if needed
317  * (mrtstat.mrts_mfc_lookups and mrtstat.mrts_mfc_misses)
318  */
319 static struct mfc *
320 mfc_find(struct in_addr *o, struct in_addr *g)
321 {
322 	struct mfc *rt;
323 
324 	LIST_FOREACH(rt, &mfchashtbl[MFCHASH(*o, *g)], mfc_hash) {
325 		if (in_hosteq(rt->mfc_origin, *o) &&
326 		    in_hosteq(rt->mfc_mcastgrp, *g) &&
327 		    (rt->mfc_stall == NULL))
328 			break;
329 	}
330 
331 	return (rt);
332 }
333 
334 /*
335  * Macros to compute elapsed time efficiently
336  * Borrowed from Van Jacobson's scheduling code
337  */
338 #define TV_DELTA(a, b, delta) do {					\
339 	int xxs;							\
340 	delta = (a).tv_usec - (b).tv_usec;				\
341 	xxs = (a).tv_sec - (b).tv_sec;					\
342 	switch (xxs) {							\
343 	case 2:								\
344 		delta += 1000000;					\
345 		/* FALLTHROUGH */					\
346 	case 1:								\
347 		delta += 1000000;					\
348 		/* FALLTHROUGH */					\
349 	case 0:								\
350 		break;							\
351 	default:							\
352 		delta += (1000000 * xxs);				\
353 		break;							\
354 	}								\
355 } while (/*CONSTCOND*/ 0)
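
/*
 * TV_DELTA(a, b, delta) leaves (a - b) in delta, expressed in microseconds;
 * the switch avoids a multiplication for differences of a few seconds or less.
 */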
356 
357 /*
358  * Handle MRT setsockopt commands to modify the multicast routing tables.
359  */
360 int
361 ip_mrouter_set(struct socket *so, int optname, struct mbuf **m)
362 {
363 	int error;
364 
365 	if (optname != MRT_INIT && so != ip_mrouter)
366 		error = ENOPROTOOPT;
367 	else
368 		switch (optname) {
369 		case MRT_INIT:
370 			error = ip_mrouter_init(so, *m);
371 			break;
372 		case MRT_DONE:
373 			error = ip_mrouter_done();
374 			break;
375 		case MRT_ADD_VIF:
376 			error = add_vif(*m);
377 			break;
378 		case MRT_DEL_VIF:
379 			error = del_vif(*m);
380 			break;
381 		case MRT_ADD_MFC:
382 			error = add_mfc(*m);
383 			break;
384 		case MRT_DEL_MFC:
385 			error = del_mfc(*m);
386 			break;
387 		case MRT_ASSERT:
388 			error = set_assert(*m);
389 			break;
390 		case MRT_API_CONFIG:
391 			error = set_api_config(*m);
392 			break;
393 		case MRT_ADD_BW_UPCALL:
394 			error = add_bw_upcall(*m);
395 			break;
396 		case MRT_DEL_BW_UPCALL:
397 			error = del_bw_upcall(*m);
398 			break;
399 		default:
400 			error = ENOPROTOOPT;
401 			break;
402 		}
403 
404 	if (*m)
405 		m_free(*m);
406 	return (error);
407 }
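
/*
 * Illustrative sketch (not part of the kernel): a userland routing daemon
 * would typically drive the interface above through setsockopt(2) on a raw
 * IGMP socket, roughly as follows.  The addresses and vif numbers are made
 * up and error handling is omitted.
 *
 *	int one = 1;
 *	struct vifctl vc;
 *	int sock = socket(AF_INET, SOCK_RAW, IPPROTO_IGMP);
 *
 *	setsockopt(sock, IPPROTO_IP, MRT_INIT, &one, sizeof(one));
 *
 *	memset(&vc, 0, sizeof(vc));
 *	vc.vifc_vifi = 0;
 *	vc.vifc_threshold = 1;
 *	vc.vifc_lcl_addr.s_addr = inet_addr("192.0.2.1");
 *	setsockopt(sock, IPPROTO_IP, MRT_ADD_VIF, &vc, sizeof(vc));
 *
 *	...	(MRT_ADD_MFC etc., see add_mfc() below)
 *
 *	setsockopt(sock, IPPROTO_IP, MRT_DONE, NULL, 0);
 */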
408 
409 /*
410  * Handle MRT getsockopt commands
411  */
412 int
413 ip_mrouter_get(struct socket *so, int optname, struct mbuf **m)
414 {
415 	int error;
416 
417 	if (so != ip_mrouter)
418 		error = ENOPROTOOPT;
419 	else {
420 		*m = m_get(M_WAIT, MT_SOOPTS);
421 
422 		switch (optname) {
423 		case MRT_VERSION:
424 			error = get_version(*m);
425 			break;
426 		case MRT_ASSERT:
427 			error = get_assert(*m);
428 			break;
429 		case MRT_API_SUPPORT:
430 			error = get_api_support(*m);
431 			break;
432 		case MRT_API_CONFIG:
433 			error = get_api_config(*m);
434 			break;
435 		default:
436 			error = ENOPROTOOPT;
437 			break;
438 		}
439 
440 		if (error)
441 			m_free(*m);
442 	}
443 
444 	return (error);
445 }
446 
447 /*
448  * Handle ioctl commands to obtain information from the cache
449  */
450 int
451 mrt_ioctl(struct socket *so, u_long cmd, caddr_t data)
452 {
453 	int error;
454 
455 	if (so != ip_mrouter)
456 		error = EINVAL;
457 	else
458 		switch (cmd) {
459 		case SIOCGETVIFCNT:
460 			error = get_vif_cnt((struct sioc_vif_req *)data);
461 			break;
462 		case SIOCGETSGCNT:
463 			error = get_sg_cnt((struct sioc_sg_req *)data);
464 			break;
465 		default:
466 			error = ENOTTY;
467 			break;
468 		}
469 
470 	return (error);
471 }
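
/*
 * Sketch of the matching userland query (hypothetical addresses):
 *
 *	struct sioc_sg_req sg;
 *	memset(&sg, 0, sizeof(sg));
 *	sg.src.s_addr = inet_addr("192.0.2.1");
 *	sg.grp.s_addr = inet_addr("233.252.0.1");
 *	ioctl(sock, SIOCGETSGCNT, &sg);
 *	(sg.pktcnt, sg.bytecnt and sg.wrong_if then hold the counters)
 */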
472 
473 /*
474  * returns the packet, byte, and rpf-failure counts for the (source, group) pair provided
475  */
476 static int
477 get_sg_cnt(struct sioc_sg_req *req)
478 {
479 	int s;
480 	struct mfc *rt;
481 
482 	s = splsoftnet();
483 	rt = mfc_find(&req->src, &req->grp);
484 	if (rt == NULL) {
485 		splx(s);
486 		req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffff;
487 		return (EADDRNOTAVAIL);
488 	}
489 	req->pktcnt = rt->mfc_pkt_cnt;
490 	req->bytecnt = rt->mfc_byte_cnt;
491 	req->wrong_if = rt->mfc_wrong_if;
492 	splx(s);
493 
494 	return (0);
495 }
496 
497 /*
498  * returns the input and output packet and byte counts on the vif provided
499  */
500 static int
501 get_vif_cnt(struct sioc_vif_req *req)
502 {
503 	vifi_t vifi = req->vifi;
504 
505 	if (vifi >= numvifs)
506 		return (EINVAL);
507 
508 	req->icount = viftable[vifi].v_pkt_in;
509 	req->ocount = viftable[vifi].v_pkt_out;
510 	req->ibytes = viftable[vifi].v_bytes_in;
511 	req->obytes = viftable[vifi].v_bytes_out;
512 
513 	return (0);
514 }
515 
516 /*
517  * Enable multicast routing
518  */
519 static int
520 ip_mrouter_init(struct socket *so, struct mbuf *m)
521 {
522 	int *v;
523 
524 	if (mrtdebug)
525 		log(LOG_DEBUG,
526 		    "ip_mrouter_init: so_type = %d, pr_protocol = %d\n",
527 		    so->so_type, so->so_proto->pr_protocol);
528 
529 	if (so->so_type != SOCK_RAW ||
530 	    so->so_proto->pr_protocol != IPPROTO_IGMP)
531 		return (EOPNOTSUPP);
532 
533 	if (m == NULL || m->m_len < sizeof(int))
534 		return (EINVAL);
535 
536 	v = mtod(m, int *);
537 	if (*v != 1)
538 		return (EINVAL);
539 
540 	if (ip_mrouter != NULL)
541 		return (EADDRINUSE);
542 
543 	ip_mrouter = so;
544 
545 	mfchashtbl = hashinit(MFCTBLSIZ, M_MRTABLE, M_WAITOK, &mfchash);
546 	memset(nexpire, 0, sizeof(nexpire));
547 
548 	pim_assert = 0;
549 
550 	timeout_set(&expire_upcalls_ch, expire_upcalls, NULL);
551 	timeout_add_msec(&expire_upcalls_ch, EXPIRE_TIMEOUT);
552 
553 	timeout_set(&bw_upcalls_ch, expire_bw_upcalls_send, NULL);
554 	timeout_add_msec(&bw_upcalls_ch, BW_UPCALLS_PERIOD);
555 
556 	timeout_set(&bw_meter_ch, expire_bw_meter_process, NULL);
557 	timeout_add_msec(&bw_meter_ch, BW_METER_PERIOD);
558 
559 	if (mrtdebug)
560 		log(LOG_DEBUG, "ip_mrouter_init\n");
561 
562 	return (0);
563 }
564 
565 /*
566  * Disable multicast routing
567  */
568 int
569 ip_mrouter_done()
570 {
571 	vifi_t vifi;
572 	struct vif *vifp;
573 	int i;
574 	int s;
575 
576 	s = splsoftnet();
577 
578 	/* Clear out all the vifs currently in use. */
579 	for (vifi = 0; vifi < numvifs; vifi++) {
580 		vifp = &viftable[vifi];
581 		if (!in_nullhost(vifp->v_lcl_addr))
582 			reset_vif(vifp);
583 	}
584 
585 	numvifs = 0;
586 	pim_assert = 0;
587 	mrt_api_config = 0;
588 
589 	timeout_del(&expire_upcalls_ch);
590 	timeout_del(&bw_upcalls_ch);
591 	timeout_del(&bw_meter_ch);
592 
593 	/*
594 	 * Free all multicast forwarding cache entries.
595 	 */
596 	for (i = 0; i < MFCTBLSIZ; i++) {
597 		struct mfc *rt, *nrt;
598 
599 		for (rt = LIST_FIRST(&mfchashtbl[i]); rt; rt = nrt) {
600 			nrt = LIST_NEXT(rt, mfc_hash);
601 
602 			expire_mfc(rt);
603 		}
604 	}
605 
606 	memset(nexpire, 0, sizeof(nexpire));
607 	free(mfchashtbl, M_MRTABLE, 0);
608 	mfchashtbl = NULL;
609 
610 	bw_upcalls_n = 0;
611 	memset(bw_meter_timers, 0, sizeof(bw_meter_timers));
612 
613 	/* Reset de-encapsulation cache. */
614 	have_encap_tunnel = 0;
615 
616 	ip_mrouter = NULL;
617 
618 	splx(s);
619 
620 	if (mrtdebug)
621 		log(LOG_DEBUG, "ip_mrouter_done\n");
622 
623 	return (0);
624 }
625 
626 void
627 ip_mrouter_detach(struct ifnet *ifp)
628 {
629 	int vifi, i;
630 	struct vif *vifp;
631 	struct mfc *rt;
632 	struct rtdetq *rte;
633 
634 	/* XXX not sure about side effects on the userland routing daemon */
635 	for (vifi = 0; vifi < numvifs; vifi++) {
636 		vifp = &viftable[vifi];
637 		if (vifp->v_ifp == ifp)
638 			reset_vif(vifp);
639 	}
640 	for (i = 0; i < MFCTBLSIZ; i++) {
641 		if (nexpire[i] == 0)
642 			continue;
643 		LIST_FOREACH(rt, &mfchashtbl[i], mfc_hash) {
644 			for (rte = rt->mfc_stall; rte; rte = rte->next) {
645 				if (rte->ifp == ifp)
646 					rte->ifp = NULL;
647 			}
648 		}
649 	}
650 }
651 
652 static int
653 get_version(struct mbuf *m)
654 {
655 	int *v = mtod(m, int *);
656 
657 	*v = 0x0305;	/* XXX !!!! */
658 	m->m_len = sizeof(int);
659 	return (0);
660 }
661 
662 /*
663  * Set PIM assert processing global
664  */
665 static int
666 set_assert(struct mbuf *m)
667 {
668 	int *i;
669 
670 	if (m == NULL || m->m_len < sizeof(int))
671 		return (EINVAL);
672 
673 	i = mtod(m, int *);
674 	pim_assert = !!*i;
675 	return (0);
676 }
677 
678 /*
679  * Get PIM assert processing global
680  */
681 static int
682 get_assert(struct mbuf *m)
683 {
684 	int *i = mtod(m, int *);
685 
686 	*i = pim_assert;
687 	m->m_len = sizeof(int);
688 	return (0);
689 }
690 
691 /*
692  * Configure API capabilities
693  */
694 static int
695 set_api_config(struct mbuf *m)
696 {
697 	int i;
698 	u_int32_t *apival;
699 
700 	if (m == NULL || m->m_len < sizeof(u_int32_t))
701 		return (EINVAL);
702 
703 	apival = mtod(m, u_int32_t *);
704 
705 	/*
706 	 * We can set the API capabilities only if it is the first operation
707 	 * after MRT_INIT. I.e.:
708 	 *  - there are no vifs installed
709 	 *  - pim_assert is not enabled
710 	 *  - the MFC table is empty
711 	 */
712 	if (numvifs > 0) {
713 		*apival = 0;
714 		return (EPERM);
715 	}
716 	if (pim_assert) {
717 		*apival = 0;
718 		return (EPERM);
719 	}
720 	for (i = 0; i < MFCTBLSIZ; i++) {
721 		if (LIST_FIRST(&mfchashtbl[i]) != NULL) {
722 			*apival = 0;
723 			return (EPERM);
724 		}
725 	}
726 
727 	mrt_api_config = *apival & mrt_api_support;
728 	*apival = mrt_api_config;
729 
730 	return (0);
731 }
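
/*
 * Sketch of how a daemon would enable an optional capability right after
 * MRT_INIT (assumed userland code); the subset actually enabled can then be
 * read back with getsockopt(MRT_API_CONFIG):
 *
 *	uint32_t v = MRT_MFC_BW_UPCALL;
 *	setsockopt(sock, IPPROTO_IP, MRT_API_CONFIG, &v, sizeof(v));
 */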
732 
733 /*
734  * Get API capabilities
735  */
736 static int
737 get_api_support(struct mbuf *m)
738 {
739 	u_int32_t *apival;
740 
741 	if (m == NULL || m->m_len < sizeof(u_int32_t))
742 		return (EINVAL);
743 
744 	apival = mtod(m, u_int32_t *);
745 
746 	*apival = mrt_api_support;
747 
748 	return (0);
749 }
750 
751 /*
752  * Get API configured capabilities
753  */
754 static int
755 get_api_config(struct mbuf *m)
756 {
757 	u_int32_t *apival;
758 
759 	if (m == NULL || m->m_len < sizeof(u_int32_t))
760 		return (EINVAL);
761 
762 	apival = mtod(m, u_int32_t *);
763 
764 	*apival = mrt_api_config;
765 
766 	return (0);
767 }
768 
769 static struct sockaddr_in sin = { sizeof(sin), AF_INET };
770 
771 /*
772  * Add a vif to the vif table
773  */
774 static int
775 add_vif(struct mbuf *m)
776 {
777 	struct vifctl *vifcp;
778 	struct vif *vifp;
779 	struct ifaddr *ifa;
780 	struct ifnet *ifp;
781 	struct ifreq ifr;
782 	int error, s;
783 
784 	if (m == NULL || m->m_len < sizeof(struct vifctl))
785 		return (EINVAL);
786 
787 	vifcp = mtod(m, struct vifctl *);
788 	if (vifcp->vifc_vifi >= MAXVIFS)
789 		return (EINVAL);
790 	if (in_nullhost(vifcp->vifc_lcl_addr))
791 		return (EADDRNOTAVAIL);
792 
793 	vifp = &viftable[vifcp->vifc_vifi];
794 	if (!in_nullhost(vifp->v_lcl_addr))
795 		return (EADDRINUSE);
796 
797 	/* Find the interface with an address in AF_INET family. */
798 #ifdef PIM
799 	if (vifcp->vifc_flags & VIFF_REGISTER) {
800 		/*
801 		 * XXX: Because VIFF_REGISTER does not really need a valid
802 		 * local interface (e.g. it could be 127.0.0.2), we don't
803 		 * check its address.
804 		 */
805 	} else
806 #endif
807 	{
808 		sin.sin_addr = vifcp->vifc_lcl_addr;
809 		ifa = ifa_ifwithaddr(sintosa(&sin), /* XXX */ 0);
810 		if (ifa == NULL)
811 			return (EADDRNOTAVAIL);
812 	}
813 
814 	if (vifcp->vifc_flags & VIFF_TUNNEL) {
815 		/* tunnels are no longer supported; use gif(4) instead */
816 		return (EOPNOTSUPP);
817 #ifdef PIM
818 	} else if (vifcp->vifc_flags & VIFF_REGISTER) {
819 		ifp = &multicast_register_if;
820 		if (mrtdebug)
821 			log(LOG_DEBUG, "Adding a register vif, ifp: %p\n",
822 			    (void *)ifp);
823 		if (reg_vif_num == VIFI_INVALID) {
824 			memset(ifp, 0, sizeof(*ifp));
825 			snprintf(ifp->if_xname, sizeof ifp->if_xname,
826 				 "register_vif");
827 			ifp->if_flags = IFF_LOOPBACK;
828 			memset(&vifp->v_route, 0, sizeof(vifp->v_route));
829 			reg_vif_num = vifcp->vifc_vifi;
830 		}
831 #endif
832 	} else {
833 		/* Use the physical interface associated with the address. */
834 		ifp = ifa->ifa_ifp;
835 
836 		/* Make sure the interface supports multicast. */
837 		if ((ifp->if_flags & IFF_MULTICAST) == 0)
838 			return (EOPNOTSUPP);
839 
840 		/* Enable promiscuous reception of all IP multicasts. */
841 		satosin(&ifr.ifr_addr)->sin_len = sizeof(struct sockaddr_in);
842 		satosin(&ifr.ifr_addr)->sin_family = AF_INET;
843 		satosin(&ifr.ifr_addr)->sin_addr = zeroin_addr;
844 		error = (*ifp->if_ioctl)(ifp, SIOCADDMULTI, (caddr_t)&ifr);
845 		if (error)
846 			return (error);
847 	}
848 
849 	s = splsoftnet();
850 
851 	vifp->v_flags = vifcp->vifc_flags;
852 	vifp->v_threshold = vifcp->vifc_threshold;
853 	vifp->v_lcl_addr = vifcp->vifc_lcl_addr;
854 	vifp->v_rmt_addr = vifcp->vifc_rmt_addr;
855 	vifp->v_ifp = ifp;
856 	/* Initialize per vif pkt counters. */
857 	vifp->v_pkt_in = 0;
858 	vifp->v_pkt_out = 0;
859 	vifp->v_bytes_in = 0;
860 	vifp->v_bytes_out = 0;
861 
862 	timeout_del(&vifp->v_repq_ch);
863 
864 	splx(s);
865 
866 	/* Adjust numvifs up if the vifi is higher than numvifs. */
867 	if (numvifs <= vifcp->vifc_vifi)
868 		numvifs = vifcp->vifc_vifi + 1;
869 
870 	if (mrtdebug)
871 		log(LOG_DEBUG, "add_vif #%d, lcladdr %x, %s %x, "
872 		    "thresh %x\n",
873 		    vifcp->vifc_vifi,
874 		    ntohl(vifcp->vifc_lcl_addr.s_addr),
875 		    (vifcp->vifc_flags & VIFF_TUNNEL) ? "rmtaddr" : "mask",
876 		    ntohl(vifcp->vifc_rmt_addr.s_addr),
877 		    vifcp->vifc_threshold);
878 
879 	return (0);
880 }
881 
882 void
883 reset_vif(struct vif *vifp)
884 {
885 	struct ifnet *ifp;
886 	struct ifreq ifr;
887 
888 	if (vifp->v_flags & VIFF_TUNNEL) {
889 		/* empty */
890 	} else if (vifp->v_flags & VIFF_REGISTER) {
891 #ifdef PIM
892 		reg_vif_num = VIFI_INVALID;
893 #endif
894 	} else {
895 		satosin(&ifr.ifr_addr)->sin_len = sizeof(struct sockaddr_in);
896 		satosin(&ifr.ifr_addr)->sin_family = AF_INET;
897 		satosin(&ifr.ifr_addr)->sin_addr = zeroin_addr;
898 		ifp = vifp->v_ifp;
899 		(*ifp->if_ioctl)(ifp, SIOCDELMULTI, (caddr_t)&ifr);
900 	}
901 	memset(vifp, 0, sizeof(*vifp));
902 }
903 
904 /*
905  * Delete a vif from the vif table
906  */
907 static int
908 del_vif(struct mbuf *m)
909 {
910 	vifi_t *vifip;
911 	struct vif *vifp;
912 	vifi_t vifi;
913 	int s;
914 
915 	if (m == NULL || m->m_len < sizeof(vifi_t))
916 		return (EINVAL);
917 
918 	vifip = mtod(m, vifi_t *);
919 	if (*vifip >= numvifs)
920 		return (EINVAL);
921 
922 	vifp = &viftable[*vifip];
923 	if (in_nullhost(vifp->v_lcl_addr))
924 		return (EADDRNOTAVAIL);
925 
926 	s = splsoftnet();
927 
928 	reset_vif(vifp);
929 
930 	/* Adjust numvifs down */
931 	for (vifi = numvifs; vifi > 0; vifi--)
932 		if (!in_nullhost(viftable[vifi - 1].v_lcl_addr))
933 			break;
934 	numvifs = vifi;
935 
936 	splx(s);
937 
938 	if (mrtdebug)
939 		log(LOG_DEBUG, "del_vif %d, numvifs %d\n", *vifip, numvifs);
940 
941 	return (0);
942 }
943 
944 void
945 vif_delete(struct ifnet *ifp)
946 {
947 	int i;
948 	struct vif *vifp;
949 	struct mfc *rt;
950 	struct rtdetq *rte;
951 
952 	for (i = 0; i < numvifs; i++) {
953 		vifp = &viftable[i];
954 		if (vifp->v_ifp == ifp)
955 			memset(vifp, 0, sizeof(*vifp));
956 	}
957 
958 	for (i = numvifs; i > 0; i--)
959 		if (!in_nullhost(viftable[i - 1].v_lcl_addr))
960 			break;
961 	numvifs = i;
962 
963 	for (i = 0; i < MFCTBLSIZ; i++) {
964 		if (nexpire[i] == 0)
965 			continue;
966 		LIST_FOREACH(rt, &mfchashtbl[i], mfc_hash) {
967 			for (rte = rt->mfc_stall; rte; rte = rte->next) {
968 				if (rte->ifp == ifp)
969 					rte->ifp = NULL;
970 			}
971 		}
972 	}
973 }
974 
975 /*
976  * update an mfc entry without resetting counters and S,G addresses.
977  */
978 static void
979 update_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp)
980 {
981 	int i;
982 
983 	rt->mfc_parent = mfccp->mfcc_parent;
984 	for (i = 0; i < numvifs; i++) {
985 		rt->mfc_ttls[i] = mfccp->mfcc_ttls[i];
986 		rt->mfc_flags[i] = mfccp->mfcc_flags[i] & mrt_api_config &
987 		    MRT_MFC_FLAGS_ALL;
988 	}
989 	/* set the RP address */
990 	if (mrt_api_config & MRT_MFC_RP)
991 		rt->mfc_rp = mfccp->mfcc_rp;
992 	else
993 		rt->mfc_rp = zeroin_addr;
994 }
995 
996 /*
997  * fully initialize an mfc entry from the parameter.
998  */
999 static void
1000 init_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp)
1001 {
1002 	rt->mfc_origin     = mfccp->mfcc_origin;
1003 	rt->mfc_mcastgrp   = mfccp->mfcc_mcastgrp;
1004 
1005 	update_mfc_params(rt, mfccp);
1006 
1007 	/* initialize pkt counters per src-grp */
1008 	rt->mfc_pkt_cnt    = 0;
1009 	rt->mfc_byte_cnt   = 0;
1010 	rt->mfc_wrong_if   = 0;
1011 	timerclear(&rt->mfc_last_assert);
1012 }
1013 
1014 static void
1015 expire_mfc(struct mfc *rt)
1016 {
1017 	struct rtdetq *rte, *nrte;
1018 
1019 	free_bw_list(rt->mfc_bw_meter);
1020 
1021 	for (rte = rt->mfc_stall; rte != NULL; rte = nrte) {
1022 		nrte = rte->next;
1023 		m_freem(rte->m);
1024 		free(rte, M_MRTABLE, 0);
1025 	}
1026 
1027 	LIST_REMOVE(rt, mfc_hash);
1028 	free(rt, M_MRTABLE, 0);
1029 }
1030 
1031 /*
1032  * Add an mfc entry
1033  */
1034 static int
1035 add_mfc(struct mbuf *m)
1036 {
1037 	struct mfcctl2 mfcctl2;
1038 	struct mfcctl2 *mfccp;
1039 	struct mfc *rt;
1040 	u_int32_t hash = 0;
1041 	struct rtdetq *rte, *nrte;
1042 	u_short nstl;
1043 	int s;
1044 	int mfcctl_size = sizeof(struct mfcctl);
1045 
1046 	if (mrt_api_config & MRT_API_FLAGS_ALL)
1047 		mfcctl_size = sizeof(struct mfcctl2);
1048 
1049 	if (m == NULL || m->m_len < mfcctl_size)
1050 		return (EINVAL);
1051 
1052 	/*
1053 	 * select data size depending on API version.
1054 	 */
1055 	if (mrt_api_config & MRT_API_FLAGS_ALL) {
1056 		struct mfcctl2 *mp2 = mtod(m, struct mfcctl2 *);
1057 		bcopy(mp2, (caddr_t)&mfcctl2, sizeof(*mp2));
1058 	} else {
1059 		struct mfcctl *mp = mtod(m, struct mfcctl *);
1060 		bcopy(mp, (caddr_t)&mfcctl2, sizeof(*mp));
1061 		memset((caddr_t)&mfcctl2 + sizeof(struct mfcctl), 0,
1062 		    sizeof(mfcctl2) - sizeof(struct mfcctl));
1063 	}
1064 	mfccp = &mfcctl2;
1065 
1066 	s = splsoftnet();
1067 	rt = mfc_find(&mfccp->mfcc_origin, &mfccp->mfcc_mcastgrp);
1068 
1069 	/* If an entry already exists, just update the fields */
1070 	if (rt) {
1071 		if (mrtdebug & DEBUG_MFC)
1072 			log(LOG_DEBUG, "add_mfc update o %x g %x p %x\n",
1073 			    ntohl(mfccp->mfcc_origin.s_addr),
1074 			    ntohl(mfccp->mfcc_mcastgrp.s_addr),
1075 			    mfccp->mfcc_parent);
1076 
1077 		update_mfc_params(rt, mfccp);
1078 
1079 		splx(s);
1080 		return (0);
1081 	}
1082 
1083 	/*
1084 	 * Find the entry for which the upcall was made and update
1085 	 */
1086 	nstl = 0;
1087 	hash = MFCHASH(mfccp->mfcc_origin, mfccp->mfcc_mcastgrp);
1088 	LIST_FOREACH(rt, &mfchashtbl[hash], mfc_hash) {
1089 		if (in_hosteq(rt->mfc_origin, mfccp->mfcc_origin) &&
1090 		    in_hosteq(rt->mfc_mcastgrp, mfccp->mfcc_mcastgrp) &&
1091 		    rt->mfc_stall != NULL) {
1092 			if (nstl++)
1093 				log(LOG_ERR, "add_mfc %s o %x g %x "
1094 				    "p %x dbx %p\n",
1095 				    "multiple kernel entries",
1096 				    ntohl(mfccp->mfcc_origin.s_addr),
1097 				    ntohl(mfccp->mfcc_mcastgrp.s_addr),
1098 				    mfccp->mfcc_parent, rt->mfc_stall);
1099 
1100 			if (mrtdebug & DEBUG_MFC)
1101 				log(LOG_DEBUG, "add_mfc o %x g %x "
1102 				    "p %x dbg %p\n",
1103 				    ntohl(mfccp->mfcc_origin.s_addr),
1104 				    ntohl(mfccp->mfcc_mcastgrp.s_addr),
1105 				    mfccp->mfcc_parent, rt->mfc_stall);
1106 
1107 			rte = rt->mfc_stall;
1108 			init_mfc_params(rt, mfccp);
1109 			rt->mfc_stall = NULL;
1110 
1111 			rt->mfc_expire = 0; /* Don't clean this guy up */
1112 			nexpire[hash]--;
1113 
1114 			/* forward (via ip_mdq) and free the packets queued on this entry */
1115 			for (; rte != NULL; rte = nrte) {
1116 				nrte = rte->next;
1117 				if (rte->ifp) {
1118 					ip_mdq(rte->m, rte->ifp, rt);
1119 				}
1120 				m_freem(rte->m);
1121 				free(rte, M_MRTABLE, 0);
1122 			}
1123 		}
1124 	}
1125 
1126 	/*
1127 	 * It is possible that an entry is being inserted without an upcall
1128 	 */
1129 	if (nstl == 0) {
1130 		/*
1131 		 * No mfc; make a new one
1132 		 */
1133 		if (mrtdebug & DEBUG_MFC)
1134 			log(LOG_DEBUG, "add_mfc no upcall o %x g %x p %x\n",
1135 			    ntohl(mfccp->mfcc_origin.s_addr),
1136 			    ntohl(mfccp->mfcc_mcastgrp.s_addr),
1137 			    mfccp->mfcc_parent);
1138 
1139 		LIST_FOREACH(rt, &mfchashtbl[hash], mfc_hash) {
1140 			if (in_hosteq(rt->mfc_origin, mfccp->mfcc_origin) &&
1141 			    in_hosteq(rt->mfc_mcastgrp, mfccp->mfcc_mcastgrp)) {
1142 				init_mfc_params(rt, mfccp);
1143 				if (rt->mfc_expire)
1144 					nexpire[hash]--;
1145 				rt->mfc_expire = 0;
1146 				break; /* XXX */
1147 			}
1148 		}
1149 		if (rt == NULL) {	/* no upcall, so make a new entry */
1150 			rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE,
1151 			    M_NOWAIT);
1152 			if (rt == NULL) {
1153 				splx(s);
1154 				return (ENOBUFS);
1155 			}
1156 
1157 			init_mfc_params(rt, mfccp);
1158 			rt->mfc_expire	= 0;
1159 			rt->mfc_stall	= NULL;
1160 			rt->mfc_bw_meter = NULL;
1161 
1162 			/* insert new entry at head of hash chain */
1163 			LIST_INSERT_HEAD(&mfchashtbl[hash], rt, mfc_hash);
1164 		}
1165 	}
1166 
1167 	splx(s);
1168 	return (0);
1169 }
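
/*
 * Sketch of the matching userland request (hypothetical addresses and vifs);
 * the plain struct mfcctl form shown here applies when no extended API flags
 * have been configured:
 *
 *	struct mfcctl mc;
 *	memset(&mc, 0, sizeof(mc));
 *	mc.mfcc_origin.s_addr = inet_addr("192.0.2.1");
 *	mc.mfcc_mcastgrp.s_addr = inet_addr("233.252.0.1");
 *	mc.mfcc_parent = 0;		-- incoming (parent) vif
 *	mc.mfcc_ttls[1] = 1;		-- forward on vif 1 if ttl > 1
 *	setsockopt(sock, IPPROTO_IP, MRT_ADD_MFC, &mc, sizeof(mc));
 */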
1170 
1171 /*
1172  * Delete an mfc entry
1173  */
1174 static int
1175 del_mfc(struct mbuf *m)
1176 {
1177 	struct mfcctl2 mfcctl2;
1178 	struct mfcctl2 *mfccp;
1179 	struct mfc *rt;
1180 	int s;
1181 	int mfcctl_size = sizeof(struct mfcctl);
1182 	struct mfcctl *mp;
1183 
1184 	/*
1185 	 * XXX: for deleting MFC entries the information in entries
1186 	 * of size "struct mfcctl" is sufficient.
1187 	 */
1188 
1189 	if (m == NULL || m->m_len < mfcctl_size)
1190 		return (EINVAL);
1191 	mp = mtod(m, struct mfcctl *);
1192 	bcopy(mp, (caddr_t)&mfcctl2, sizeof(*mp));
1193 	memset((caddr_t)&mfcctl2 + sizeof(struct mfcctl), 0,
1194 	    sizeof(mfcctl2) - sizeof(struct mfcctl));
1195 
1196 	mfccp = &mfcctl2;
1197 
1198 	if (mrtdebug & DEBUG_MFC)
1199 		log(LOG_DEBUG, "del_mfc origin %x mcastgrp %x\n",
1200 		    ntohl(mfccp->mfcc_origin.s_addr),
1201 		    ntohl(mfccp->mfcc_mcastgrp.s_addr));
1202 
1203 	s = splsoftnet();
1204 
1205 	rt = mfc_find(&mfccp->mfcc_origin, &mfccp->mfcc_mcastgrp);
1206 	if (rt == NULL) {
1207 		splx(s);
1208 		return (EADDRNOTAVAIL);
1209 	}
1210 
1211 	/*
1212 	 * free the bw_meter entries
1213 	 */
1214 	free_bw_list(rt->mfc_bw_meter);
1215 	rt->mfc_bw_meter = NULL;
1216 
1217 	LIST_REMOVE(rt, mfc_hash);
1218 	free(rt, M_MRTABLE, 0);
1219 
1220 	splx(s);
1221 	return (0);
1222 }
1223 
1224 static int
1225 socket_send(struct socket *s, struct mbuf *mm, struct sockaddr_in *src)
1226 {
1227 	if (s != NULL) {
1228 		if (sbappendaddr(&s->so_rcv, sintosa(src), mm, NULL) != 0) {
1229 			sorwakeup(s);
1230 			return (0);
1231 		}
1232 	}
1233 	m_freem(mm);
1234 	return (-1);
1235 }
1236 
1237 /*
1238  * IP multicast forwarding function. This function assumes that the packet
1239  * pointed to by "ip" has arrived on (or is about to be sent to) the interface
1240  * pointed to by "ifp", and the packet is to be relayed to other networks
1241  * that have members of the packet's destination IP multicast group.
1242  *
1243  * The packet is returned unscathed to the caller, unless it is
1244  * erroneous, in which case a non-zero return value tells the caller to
1245  * discard it.
1246  */
1247 
1248 #define IP_HDR_LEN  20	/* # bytes of fixed IP header (excluding options) */
1249 #define TUNNEL_LEN  12  /* # bytes of IP option for tunnel encapsulation  */
1250 
1251 int
1252 ip_mforward(struct mbuf *m, struct ifnet *ifp)
1253 {
1254 	struct ip *ip = mtod(m, struct ip *);
1255 	struct mfc *rt;
1256 	static int srctun = 0;
1257 	struct mbuf *mm;
1258 	int s;
1259 	vifi_t vifi;
1260 
1261 	if (mrtdebug & DEBUG_FORWARD)
1262 		log(LOG_DEBUG, "ip_mforward: src %x, dst %x, ifp %p\n",
1263 		    ntohl(ip->ip_src.s_addr), ntohl(ip->ip_dst.s_addr), ifp);
1264 
1265 	if (ip->ip_hl < (IP_HDR_LEN + TUNNEL_LEN) >> 2 ||
1266 	    ((u_char *)(ip + 1))[1] != IPOPT_LSRR) {
1267 		/*
1268 		 * Packet arrived via a physical interface or
1269 		 * an encapsulated tunnel or a register_vif.
1270 		 */
1271 	} else {
1272 		/*
1273 		 * Packet arrived through a source-route tunnel.
1274 		 * Source-route tunnels are no longer supported.
1275 		 */
1276 		if ((srctun++ % 1000) == 0)
1277 			log(LOG_ERR, "ip_mforward: received source-routed "
1278 			    "packet from %x\n", ntohl(ip->ip_src.s_addr));
1279 
1280 		return (1);
1281 	}
1282 
1283 	/*
1284 	 * Don't forward a packet with time-to-live of zero or one,
1285 	 * or a packet destined to a local-only group.
1286 	 */
1287 	if (ip->ip_ttl <= 1 || IN_LOCAL_GROUP(ip->ip_dst.s_addr))
1288 		return (0);
1289 
1290 	/*
1291 	 * Determine forwarding vifs from the forwarding cache table
1292 	 */
1293 	s = splsoftnet();
1294 	++mrtstat.mrts_mfc_lookups;
1295 	rt = mfc_find(&ip->ip_src, &ip->ip_dst);
1296 
1297 	/* Entry exists, so forward if necessary */
1298 	if (rt != NULL) {
1299 		splx(s);
1300 		return (ip_mdq(m, ifp, rt));
1301 	} else {
1302 		/*
1303 		 * If we don't have a route for the packet's origin,
1304 		 * make a copy of the packet and send a message to the routing daemon.
1305 		 */
1306 
1307 		struct mbuf *mb0;
1308 		struct rtdetq *rte;
1309 		u_int32_t hash;
1310 		int hlen = ip->ip_hl << 2;
1311 
1312 		++mrtstat.mrts_mfc_misses;
1313 
1314 		mrtstat.mrts_no_route++;
1315 		if (mrtdebug & (DEBUG_FORWARD | DEBUG_MFC))
1316 			log(LOG_DEBUG, "ip_mforward: no rte s %x g %x\n",
1317 			    ntohl(ip->ip_src.s_addr),
1318 			    ntohl(ip->ip_dst.s_addr));
1319 
1320 		/*
1321 		 * Allocate mbufs early so that we don't do extra work if we are
1322 		 * just going to fail anyway.  Make sure to pullup the header so
1323 		 * that other people can't step on it.
1324 		 */
1325 		rte = (struct rtdetq *)malloc(sizeof(*rte),
1326 		    M_MRTABLE, M_NOWAIT);
1327 		if (rte == NULL) {
1328 			splx(s);
1329 			return (ENOBUFS);
1330 		}
1331 		mb0 = m_copy(m, 0, M_COPYALL);
1332 		M_PULLUP(mb0, hlen);
1333 		if (mb0 == NULL) {
1334 			free(rte, M_MRTABLE, 0);
1335 			splx(s);
1336 			return (ENOBUFS);
1337 		}
1338 
1339 		/* is there an upcall waiting for this flow? */
1340 		hash = MFCHASH(ip->ip_src, ip->ip_dst);
1341 		LIST_FOREACH(rt, &mfchashtbl[hash], mfc_hash) {
1342 			if (in_hosteq(ip->ip_src, rt->mfc_origin) &&
1343 			    in_hosteq(ip->ip_dst, rt->mfc_mcastgrp) &&
1344 			    rt->mfc_stall != NULL)
1345 				break;
1346 		}
1347 
1348 		if (rt == NULL) {
1349 			int i;
1350 			struct igmpmsg *im;
1351 
1352 			/*
1353 			 * Locate the vifi for the incoming interface for
1354 			 * this packet.
1355 			 * If none found, drop packet.
1356 			 */
1357 			for (vifi = 0; vifi < numvifs &&
1358 				 viftable[vifi].v_ifp != ifp; vifi++)
1359 				;
1360 			if (vifi >= numvifs) /* vif not found, drop packet */
1361 				goto non_fatal;
1362 
1363 			/* no upcall, so make a new entry */
1364 			rt = (struct mfc *)malloc(sizeof(*rt),
1365 			    M_MRTABLE, M_NOWAIT);
1366 			if (rt == NULL)
1367 				goto fail;
1368 			/*
1369 			 * Make a copy of the header to send to the user level
1370 			 * process
1371 			 */
1372 			mm = m_copy(m, 0, hlen);
1373 			M_PULLUP(mm, hlen);
1374 			if (mm == NULL)
1375 				goto fail1;
1376 
1377 			/*
1378 			 * Send message to routing daemon to install
1379 			 * a route into the kernel table
1380 			 */
1381 
1382 			im = mtod(mm, struct igmpmsg *);
1383 			im->im_msgtype = IGMPMSG_NOCACHE;
1384 			im->im_mbz = 0;
1385 			im->im_vif = vifi;
1386 
1387 			mrtstat.mrts_upcalls++;
1388 
1389 			sin.sin_addr = ip->ip_src;
1390 			if (socket_send(ip_mrouter, mm, &sin) < 0) {
1391 				log(LOG_WARNING, "ip_mforward: ip_mrouter "
1392 				    "socket queue full\n");
1393 				++mrtstat.mrts_upq_sockfull;
1394 			fail1:
1395 				free(rt, M_MRTABLE, 0);
1396 			fail:
1397 				free(rte, M_MRTABLE, 0);
1398 				m_freem(mb0);
1399 				splx(s);
1400 				return (ENOBUFS);
1401 			}
1402 
1403 			/* insert new entry at head of hash chain */
1404 			rt->mfc_origin = ip->ip_src;
1405 			rt->mfc_mcastgrp = ip->ip_dst;
1406 			rt->mfc_pkt_cnt = 0;
1407 			rt->mfc_byte_cnt = 0;
1408 			rt->mfc_wrong_if = 0;
1409 			rt->mfc_expire = UPCALL_EXPIRE;
1410 			nexpire[hash]++;
1411 			for (i = 0; i < numvifs; i++) {
1412 				rt->mfc_ttls[i] = 0;
1413 				rt->mfc_flags[i] = 0;
1414 			}
1415 			rt->mfc_parent = -1;
1416 
1417 			/* clear the RP address */
1418 			rt->mfc_rp = zeroin_addr;
1419 
1420 			rt->mfc_bw_meter = NULL;
1421 
1422 			/* link into table */
1423 			LIST_INSERT_HEAD(&mfchashtbl[hash], rt, mfc_hash);
1424 			/* Add this entry to the end of the queue */
1425 			rt->mfc_stall = rte;
1426 		} else {
1427 			/* determine if q has overflowed */
1428 			struct rtdetq **p;
1429 			int npkts = 0;
1430 
1431 			/*
1432 			 * XXX ouch! we need to append to the list, but we
1433 			 * only have a pointer to the front, so we have to
1434 			 * scan the entire list every time.
1435 			 */
1436 			for (p = &rt->mfc_stall; *p != NULL; p = &(*p)->next)
1437 				if (++npkts > MAX_UPQ) {
1438 					mrtstat.mrts_upq_ovflw++;
1439 				non_fatal:
1440 					free(rte, M_MRTABLE, 0);
1441 					m_freem(mb0);
1442 					splx(s);
1443 					return (0);
1444 				}
1445 
1446 			/* Add this entry to the end of the queue */
1447 			*p = rte;
1448 		}
1449 
1450 		rte->next = NULL;
1451 		rte->m = mb0;
1452 		rte->ifp = ifp;
1453 
1454 		splx(s);
1455 
1456 		return (0);
1457 	}
1458 }
1459 
1460 
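/*
 * Periodic cleanup, run every EXPIRE_TIMEOUT ms: walk the MFC hash chains
 * that still carry unresolved upcall entries (nexpire[] is the per-bucket
 * count) and tear down any entry whose mfc_expire countdown has run out.
 */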
1461 /*ARGSUSED*/
1462 static void
1463 expire_upcalls(void *v)
1464 {
1465 	int i;
1466 	int s;
1467 
1468 	s = splsoftnet();
1469 
1470 	for (i = 0; i < MFCTBLSIZ; i++) {
1471 		struct mfc *rt, *nrt;
1472 
1473 		if (nexpire[i] == 0)
1474 			continue;
1475 
1476 		for (rt = LIST_FIRST(&mfchashtbl[i]); rt; rt = nrt) {
1477 			nrt = LIST_NEXT(rt, mfc_hash);
1478 
1479 			if (rt->mfc_expire == 0 || --rt->mfc_expire > 0)
1480 				continue;
1481 			nexpire[i]--;
1482 
1483 			/*
1484 			 * free the bw_meter entries
1485 			 */
1486 			while (rt->mfc_bw_meter != NULL) {
1487 				struct bw_meter *x = rt->mfc_bw_meter;
1488 
1489 				rt->mfc_bw_meter = x->bm_mfc_next;
1490 				free(x, M_BWMETER, 0);
1491 			}
1492 
1493 			++mrtstat.mrts_cache_cleanups;
1494 			if (mrtdebug & DEBUG_EXPIRE)
1495 				log(LOG_DEBUG,
1496 				    "expire_upcalls: expiring (%x %x)\n",
1497 				    ntohl(rt->mfc_origin.s_addr),
1498 				    ntohl(rt->mfc_mcastgrp.s_addr));
1499 
1500 			expire_mfc(rt);
1501 		}
1502 	}
1503 
1504 	splx(s);
1505 	timeout_add_msec(&expire_upcalls_ch, EXPIRE_TIMEOUT);
1506 }
1507 
1508 /*
1509  * Packet forwarding routine, called once an entry in the cache has been made
1510  */
1511 static int
1512 ip_mdq(struct mbuf *m, struct ifnet *ifp, struct mfc *rt)
1513 {
1514 	struct ip  *ip = mtod(m, struct ip *);
1515 	vifi_t vifi;
1516 	struct vif *vifp;
1517 	int plen = ntohs(ip->ip_len) - (ip->ip_hl << 2);
1518 
1519 /*
1520  * Macro to send packet on vif.
1521  */
1522 #define MC_SEND(ip, vifp, m) do {					\
1523 	if ((vifp)->v_flags & VIFF_TUNNEL)				\
1524 		encap_send((ip), (vifp), (m));				\
1525 	else								\
1526 		phyint_send((ip), (vifp), (m));				\
1527 } while (/*CONSTCOND*/ 0)
1528 
1529 	/*
1530 	 * Don't forward if it didn't arrive from the parent vif for its origin.
1531 	 */
1532 	vifi = rt->mfc_parent;
1533 	if ((vifi >= numvifs) || (viftable[vifi].v_ifp != ifp)) {
1534 		/* came in the wrong interface */
1535 		if (mrtdebug & DEBUG_FORWARD)
1536 			log(LOG_DEBUG, "wrong if: ifp %p vifi %d vififp %p\n",
1537 			    ifp, vifi,
1538 			    vifi >= numvifs ? 0 : viftable[vifi].v_ifp);
1539 		++mrtstat.mrts_wrong_if;
1540 		++rt->mfc_wrong_if;
1541 		/*
1542 		 * If we are doing PIM assert processing, send a message
1543 		 * to the routing daemon.
1544 		 *
1545 		 * XXX: A PIM-SM router needs the WRONGVIF detection so it
1546 		 * can complete the SPT switch, regardless of the type
1547 		 * of interface (broadcast media, GRE tunnel, etc).
1548 		 */
1549 		if (pim_assert && (vifi < numvifs) && viftable[vifi].v_ifp) {
1550 			struct timeval now;
1551 			u_int32_t delta;
1552 
1553 #ifdef PIM
1554 			if (ifp == &multicast_register_if)
1555 				pimstat.pims_rcv_registers_wrongiif++;
1556 #endif
1557 
1558 			/* Get vifi for the incoming packet */
1559 			for (vifi = 0;
1560 			     vifi < numvifs && viftable[vifi].v_ifp != ifp;
1561 			     vifi++)
1562 			    ;
1563 			if (vifi >= numvifs) {
1564 				/* The iif is not found: ignore the packet. */
1565 				return (0);
1566 			}
1567 
1568 			if (rt->mfc_flags[vifi] &
1569 			    MRT_MFC_FLAGS_DISABLE_WRONGVIF) {
1570 				/* WRONGVIF disabled: ignore the packet */
1571 				return (0);
1572 			}
1573 
1574 			microtime(&now);
1575 
1576 			TV_DELTA(rt->mfc_last_assert, now, delta);
1577 
1578 			if (delta > ASSERT_MSG_TIME) {
1579 				struct igmpmsg *im;
1580 				int hlen = ip->ip_hl << 2;
1581 				struct mbuf *mm = m_copy(m, 0, hlen);
1582 
1583 				M_PULLUP(mm, hlen);
1584 				if (mm == NULL)
1585 					return (ENOBUFS);
1586 
1587 				rt->mfc_last_assert = now;
1588 
1589 				im = mtod(mm, struct igmpmsg *);
1590 				im->im_msgtype	= IGMPMSG_WRONGVIF;
1591 				im->im_mbz	= 0;
1592 				im->im_vif	= vifi;
1593 
1594 				mrtstat.mrts_upcalls++;
1595 
1596 				sin.sin_addr = im->im_src;
1597 				if (socket_send(ip_mrouter, mm, &sin) < 0) {
1598 					log(LOG_WARNING, "ip_mforward: "
1599 					    "ip_mrouter socket queue full\n");
1600 					++mrtstat.mrts_upq_sockfull;
1601 					return (ENOBUFS);
1602 				}
1603 			}
1604 		}
1605 		return (0);
1606 	}
1607 
1608 	/* If I sourced this packet, it counts as output, else it was input. */
1609 	if (in_hosteq(ip->ip_src, viftable[vifi].v_lcl_addr)) {
1610 		viftable[vifi].v_pkt_out++;
1611 		viftable[vifi].v_bytes_out += plen;
1612 	} else {
1613 		viftable[vifi].v_pkt_in++;
1614 		viftable[vifi].v_bytes_in += plen;
1615 	}
1616 	rt->mfc_pkt_cnt++;
1617 	rt->mfc_byte_cnt += plen;
1618 
1619 	/*
1620 	 * For each vif, decide if a copy of the packet should be forwarded.
1621 	 * Forward if:
1622 	 *		- the ttl exceeds the vif's threshold
1623 	 *		- there are group members downstream on the interface
1624 	 */
1625 	for (vifp = viftable, vifi = 0; vifi < numvifs; vifp++, vifi++)
1626 		if ((rt->mfc_ttls[vifi] > 0) &&
1627 			(ip->ip_ttl > rt->mfc_ttls[vifi])) {
1628 			vifp->v_pkt_out++;
1629 			vifp->v_bytes_out += plen;
1630 #ifdef PIM
1631 			if (vifp->v_flags & VIFF_REGISTER)
1632 				pim_register_send(ip, vifp, m, rt);
1633 			else
1634 #endif
1635 			MC_SEND(ip, vifp, m);
1636 		}
1637 
1638 	/*
1639 	 * Perform upcall-related bw measuring.
1640 	 */
1641 	if (rt->mfc_bw_meter != NULL) {
1642 		struct bw_meter *x;
1643 		struct timeval now;
1644 
1645 		microtime(&now);
1646 		for (x = rt->mfc_bw_meter; x != NULL; x = x->bm_mfc_next)
1647 			bw_meter_receive_packet(x, plen, &now);
1648 	}
1649 
1650 	return (0);
1651 }
1652 
1653 static void
1654 phyint_send(struct ip *ip, struct vif *vifp, struct mbuf *m)
1655 {
1656 	struct mbuf *mb_copy;
1657 	int hlen = ip->ip_hl << 2;
1658 
1659 	/*
1660 	 * Make a new reference to the packet; make sure that
1661 	 * the IP header is actually copied, not just referenced,
1662 	 * so that ip_output() only scribbles on the copy.
1663 	 */
1664 	mb_copy = m_copy(m, 0, M_COPYALL);
1665 	M_PULLUP(mb_copy, hlen);
1666 	if (mb_copy == NULL)
1667 		return;
1668 
1669 	send_packet(vifp, mb_copy);
1670 }
1671 
1672 static void
1673 encap_send(struct ip *ip, struct vif *vifp, struct mbuf *m)
1674 {
1675 	struct mbuf *mb_copy;
1676 	struct ip *ip_copy;
1677 	int i, len = ntohs(ip->ip_len) + sizeof(multicast_encap_iphdr);
1678 
1679 	in_proto_cksum_out(m, NULL);
1680 
1681 	/*
1682 	 * copy the old packet & pullup its IP header into the
1683 	 * Copy the old packet and pull up its IP header into the
1684 	 * new mbuf so we can modify it.  Try to fill the new
1685 	 * mbuf, since if we don't the Ethernet driver will.
1686 	MGETHDR(mb_copy, M_DONTWAIT, MT_DATA);
1687 	if (mb_copy == NULL)
1688 		return;
1689 	mb_copy->m_data += max_linkhdr;
1690 	mb_copy->m_pkthdr.len = len;
1691 	mb_copy->m_len = sizeof(multicast_encap_iphdr);
1692 
1693 	if ((mb_copy->m_next = m_copy(m, 0, M_COPYALL)) == NULL) {
1694 		m_freem(mb_copy);
1695 		return;
1696 	}
1697 	i = MHLEN - max_linkhdr;
1698 	if (i > len)
1699 		i = len;
1700 	mb_copy = m_pullup(mb_copy, i);
1701 	if (mb_copy == NULL)
1702 		return;
1703 
1704 	/*
1705 	 * fill in the encapsulating IP header.
1706 	 */
1707 	ip_copy = mtod(mb_copy, struct ip *);
1708 	*ip_copy = multicast_encap_iphdr;
1709 	ip_copy->ip_id = htons(ip_randomid());
1710 	ip_copy->ip_len = htons(len);
1711 	ip_copy->ip_src = vifp->v_lcl_addr;
1712 	ip_copy->ip_dst = vifp->v_rmt_addr;
1713 
1714 	/*
1715 	 * turn the encapsulated IP header back into a valid one.
1716 	 */
1717 	ip = (struct ip *)((caddr_t)ip_copy + sizeof(multicast_encap_iphdr));
1718 	--ip->ip_ttl;
1719 	ip->ip_sum = 0;
1720 	mb_copy->m_data += sizeof(multicast_encap_iphdr);
1721 	ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2);
1722 	mb_copy->m_data -= sizeof(multicast_encap_iphdr);
1723 
1724 	send_packet(vifp, mb_copy);
1725 }
1726 
1727 static void
1728 send_packet(struct vif *vifp, struct mbuf *m)
1729 {
1730 	int error;
1731 	int s = splsoftnet();
1732 
1733 	if (vifp->v_flags & VIFF_TUNNEL) {
1734 		/* Tunnel vif: the packet was already encapsulated by encap_send(). */
1735 		ip_output(m, NULL, &vifp->v_route, IP_FORWARDING, NULL, NULL,
1736 		    0);
1737 	} else {
1738 		/*
1739 		 * Physical interface: set up the multicast options
1740 		 * and then send.
1741 		 */
1742 		struct ip_moptions imo;
1743 
1744 		imo.imo_multicast_ifp = vifp->v_ifp;
1745 		imo.imo_multicast_ttl = mtod(m, struct ip *)->ip_ttl - IPTTLDEC;
1746 		imo.imo_multicast_loop = 1;
1747 
1748 		error = ip_output(m, NULL, NULL,
1749 		    IP_FORWARDING | IP_MULTICASTOPTS, &imo, NULL, 0);
1750 
1751 		if (mrtdebug & DEBUG_XMIT)
1752 			log(LOG_DEBUG, "phyint_send on vif %ld err %d\n",
1753 			    (long)(vifp - viftable), error);
1754 	}
1755 	splx(s);
1756 }
1757 
1758 /*
1759  * Code for bandwidth monitors
1760  */
1761 
1762 /*
1763  * Define common interface for timeval-related methods
1764  */
1765 #define	BW_TIMEVALCMP(tvp, uvp, cmp) timercmp((tvp), (uvp), cmp)
1766 #define	BW_TIMEVALDECR(vvp, uvp) timersub((vvp), (uvp), (vvp))
1767 #define	BW_TIMEVALADD(vvp, uvp) timeradd((vvp), (uvp), (vvp))
1768 
1769 static uint32_t
1770 compute_bw_meter_flags(struct bw_upcall *req)
1771 {
1772 	uint32_t flags = 0;
1773 
1774 	if (req->bu_flags & BW_UPCALL_UNIT_PACKETS)
1775 		flags |= BW_METER_UNIT_PACKETS;
1776 	if (req->bu_flags & BW_UPCALL_UNIT_BYTES)
1777 		flags |= BW_METER_UNIT_BYTES;
1778 	if (req->bu_flags & BW_UPCALL_GEQ)
1779 		flags |= BW_METER_GEQ;
1780 	if (req->bu_flags & BW_UPCALL_LEQ)
1781 		flags |= BW_METER_LEQ;
1782 
1783 	return (flags);
1784 }
1785 
1786 /*
1787  * Add a bw_meter entry
1788  */
1789 static int
1790 add_bw_upcall(struct mbuf *m)
1791 {
1792 	int s;
1793 	struct mfc *mfc;
1794 	struct timeval delta = { BW_UPCALL_THRESHOLD_INTERVAL_MIN_SEC,
1795 	    BW_UPCALL_THRESHOLD_INTERVAL_MIN_USEC };
1796 	struct timeval now;
1797 	struct bw_meter *x;
1798 	uint32_t flags;
1799 	struct bw_upcall *req;
1800 
1801 	if (m == NULL || m->m_len < sizeof(struct bw_upcall))
1802 		return (EINVAL);
1803 
1804 	req = mtod(m, struct bw_upcall *);
1805 
1806 	if (!(mrt_api_config & MRT_MFC_BW_UPCALL))
1807 		return (EOPNOTSUPP);
1808 
1809 	/* Test if the flags are valid */
1810 	if (!(req->bu_flags & (BW_UPCALL_UNIT_PACKETS | BW_UPCALL_UNIT_BYTES)))
1811 		return (EINVAL);
1812 	if (!(req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ)))
1813 		return (EINVAL);
1814 	if ((req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ))
1815 	    == (BW_UPCALL_GEQ | BW_UPCALL_LEQ))
1816 		return (EINVAL);
1817 
1818 	/* Test if the threshold time interval is valid */
1819 	if (BW_TIMEVALCMP(&req->bu_threshold.b_time, &delta, <))
1820 		return (EINVAL);
1821 
1822 	flags = compute_bw_meter_flags(req);
1823 
1824 	/* Check whether the same bw_meter entry is already installed */
1825 	s = splsoftnet();
1826 	mfc = mfc_find(&req->bu_src, &req->bu_dst);
1827 	if (mfc == NULL) {
1828 		splx(s);
1829 		return (EADDRNOTAVAIL);
1830 	}
1831 	for (x = mfc->mfc_bw_meter; x != NULL; x = x->bm_mfc_next) {
1832 		if ((BW_TIMEVALCMP(&x->bm_threshold.b_time,
1833 		    &req->bu_threshold.b_time, ==)) &&
1834 		    (x->bm_threshold.b_packets ==
1835 		    req->bu_threshold.b_packets) &&
1836 		    (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) &&
1837 		    (x->bm_flags & BW_METER_USER_FLAGS) == flags)  {
1838 			splx(s);
1839 			return (0);	/* XXX Already installed */
1840 		}
1841 	}
1842 
1843 	/* Allocate the new bw_meter entry */
1844 	x = (struct bw_meter *)malloc(sizeof(*x), M_BWMETER, M_NOWAIT);
1845 	if (x == NULL) {
1846 		splx(s);
1847 		return (ENOBUFS);
1848 	}
1849 
1850 	/* Set the new bw_meter entry */
1851 	x->bm_threshold.b_time = req->bu_threshold.b_time;
1852 	microtime(&now);
1853 	x->bm_start_time = now;
1854 	x->bm_threshold.b_packets = req->bu_threshold.b_packets;
1855 	x->bm_threshold.b_bytes = req->bu_threshold.b_bytes;
1856 	x->bm_measured.b_packets = 0;
1857 	x->bm_measured.b_bytes = 0;
1858 	x->bm_flags = flags;
1859 	x->bm_time_next = NULL;
1860 	x->bm_time_hash = BW_METER_BUCKETS;
1861 
1862 	/* Add the new bw_meter entry to the front of entries for this MFC */
1863 	x->bm_mfc = mfc;
1864 	x->bm_mfc_next = mfc->mfc_bw_meter;
1865 	mfc->mfc_bw_meter = x;
1866 	schedule_bw_meter(x, &now);
1867 	splx(s);
1868 
1869 	return (0);
1870 }
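
/*
 * Sketch of the matching userland request (hypothetical values): ask for an
 * upcall once at least 10000 packets for (S,G) are seen within 5 seconds.
 * The MFC entry for (S,G) must already exist, and MRT_MFC_BW_UPCALL must
 * have been enabled with MRT_API_CONFIG.
 *
 *	struct bw_upcall bu;
 *	memset(&bu, 0, sizeof(bu));
 *	bu.bu_src.s_addr = inet_addr("192.0.2.1");
 *	bu.bu_dst.s_addr = inet_addr("233.252.0.1");
 *	bu.bu_flags = BW_UPCALL_UNIT_PACKETS | BW_UPCALL_GEQ;
 *	bu.bu_threshold.b_time.tv_sec = 5;
 *	bu.bu_threshold.b_packets = 10000;
 *	setsockopt(sock, IPPROTO_IP, MRT_ADD_BW_UPCALL, &bu, sizeof(bu));
 */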
1871 
1872 static void
1873 free_bw_list(struct bw_meter *list)
1874 {
1875 	while (list != NULL) {
1876 		struct bw_meter *x = list;
1877 
1878 		list = list->bm_mfc_next;
1879 		unschedule_bw_meter(x);
1880 		free(x, M_BWMETER, 0);
1881 	}
1882 }
1883 
1884 /*
1885  * Delete one or multiple bw_meter entries
1886  */
1887 static int
1888 del_bw_upcall(struct mbuf *m)
1889 {
1890 	int s;
1891 	struct mfc *mfc;
1892 	struct bw_meter *x;
1893 	struct bw_upcall *req;
1894 
1895 	if (m == NULL || m->m_len < sizeof(struct bw_upcall))
1896 		return (EINVAL);
1897 
1898 	req = mtod(m, struct bw_upcall *);
1899 
1900 	if (!(mrt_api_config & MRT_MFC_BW_UPCALL))
1901 		return (EOPNOTSUPP);
1902 
1903 	s = splsoftnet();
1904 	/* Find the corresponding MFC entry */
1905 	mfc = mfc_find(&req->bu_src, &req->bu_dst);
1906 	if (mfc == NULL) {
1907 		splx(s);
1908 		return (EADDRNOTAVAIL);
1909 	} else if (req->bu_flags & BW_UPCALL_DELETE_ALL) {
1910 		/* Delete all bw_meter entries for this mfc */
1911 		struct bw_meter *list;
1912 
1913 		list = mfc->mfc_bw_meter;
1914 		mfc->mfc_bw_meter = NULL;
1915 		free_bw_list(list);
1916 		splx(s);
1917 		return (0);
1918 	} else {	/* Delete a single bw_meter entry */
1919 		struct bw_meter *prev;
1920 		uint32_t flags = 0;
1921 
1922 		flags = compute_bw_meter_flags(req);
1923 
1924 		/* Find the bw_meter entry to delete */
1925 		for (prev = NULL, x = mfc->mfc_bw_meter; x != NULL;
1926 		    prev = x, x = x->bm_mfc_next) {
1927 			if ((BW_TIMEVALCMP(&x->bm_threshold.b_time,
1928 			    &req->bu_threshold.b_time, ==)) &&
1929 			    (x->bm_threshold.b_packets ==
1930 			    req->bu_threshold.b_packets) &&
1931 			    (x->bm_threshold.b_bytes ==
1932 			    req->bu_threshold.b_bytes) &&
1933 			    (x->bm_flags & BW_METER_USER_FLAGS) == flags)
1934 				break;
1935 		}
1936 		if (x != NULL) { /* Delete entry from the list for this MFC */
1937 			if (prev != NULL) {
1938 				/* remove from middle */
1939 				prev->bm_mfc_next = x->bm_mfc_next;
1940 			} else {
1941 				/* new head of list */
1942 				x->bm_mfc->mfc_bw_meter = x->bm_mfc_next;
1943 			}
1944 
1945 			unschedule_bw_meter(x);
1946 			splx(s);
1947 			/* Free the bw_meter entry */
1948 			free(x, M_BWMETER, 0);
1949 			return (0);
1950 		} else {
1951 			splx(s);
1952 			return (EINVAL);
1953 		}
1954 	}
1955 	/* NOTREACHED */
1956 }
1957 
1958 /*
1959  * Perform bandwidth measurement processing that may result in an upcall
1960  */
1961 static void
1962 bw_meter_receive_packet(struct bw_meter *x, int plen, struct timeval *nowp)
1963 {
1964 	struct timeval delta;
1965 
1966 	delta = *nowp;
1967 	BW_TIMEVALDECR(&delta, &x->bm_start_time);
1968 
1969 	if (x->bm_flags & BW_METER_GEQ) {
1970 		/* Processing for ">=" type of bw_meter entry */
1971 		if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) {
1972 			/* Reset the bw_meter entry */
1973 			x->bm_start_time = *nowp;
1974 			x->bm_measured.b_packets = 0;
1975 			x->bm_measured.b_bytes = 0;
1976 			x->bm_flags &= ~BW_METER_UPCALL_DELIVERED;
1977 		}
1978 
1979 		/* Record that a packet is received */
1980 		x->bm_measured.b_packets++;
1981 		x->bm_measured.b_bytes += plen;
1982 
1983 		/* Test if we should deliver an upcall */
1984 		if (!(x->bm_flags & BW_METER_UPCALL_DELIVERED)) {
1985 			if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
1986 			    (x->bm_measured.b_packets >=
1987 			    x->bm_threshold.b_packets)) ||
1988 			    ((x->bm_flags & BW_METER_UNIT_BYTES) &&
1989 			    (x->bm_measured.b_bytes >=
1990 			    x->bm_threshold.b_bytes))) {
1991 				/* Prepare an upcall for delivery */
1992 				bw_meter_prepare_upcall(x, nowp);
1993 				x->bm_flags |= BW_METER_UPCALL_DELIVERED;
1994 			}
1995 		}
1996 	} else if (x->bm_flags & BW_METER_LEQ) {
1997 		/* Processing for "<=" type of bw_meter entry */
1998 		if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) {
1999 			/*
2000 			 * We are behind time with the multicast forwarding
2001 			 * table scanning for "<=" type of bw_meter entries,
2002 			 * so test now if we should deliver an upcall.
2003 			 */
2004 			if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
2005 			    (x->bm_measured.b_packets <=
2006 			    x->bm_threshold.b_packets)) ||
2007 			    ((x->bm_flags & BW_METER_UNIT_BYTES) &&
2008 			    (x->bm_measured.b_bytes <=
2009 			    x->bm_threshold.b_bytes))) {
2010 				/* Prepare an upcall for delivery */
2011 				bw_meter_prepare_upcall(x, nowp);
2012 			}
2013 			/* Reschedule the bw_meter entry */
2014 			unschedule_bw_meter(x);
2015 			schedule_bw_meter(x, nowp);
2016 		}
2017 
2018 		/* Record that a packet is received */
2019 		x->bm_measured.b_packets++;
2020 		x->bm_measured.b_bytes += plen;
2021 
2022 		/* Test if we should restart the measuring interval */
2023 		if ((x->bm_flags & BW_METER_UNIT_PACKETS &&
2024 		    x->bm_measured.b_packets <= x->bm_threshold.b_packets) ||
2025 		    (x->bm_flags & BW_METER_UNIT_BYTES &&
2026 		    x->bm_measured.b_bytes <= x->bm_threshold.b_bytes)) {
2027 			/* Don't restart the measuring interval */
2028 		} else {
2029 			/* Do restart the measuring interval */
2030 			/*
2031 			 * XXX: note that we don't unschedule and schedule,
2032 			 * because this might be too much overhead per packet.
2033 			 * Instead, when we process all entries for a given
2034 			 * timer hash bin, we check whether it is really a
2035 			 * timeout. If not, we reschedule at that time.
2036 			 */
2037 			x->bm_start_time = *nowp;
2038 			x->bm_measured.b_packets = 0;
2039 			x->bm_measured.b_bytes = 0;
2040 			x->bm_flags &= ~BW_METER_UPCALL_DELIVERED;
2041 		}
2042 	}
2043 }
2044 
2045 /*
2046  * Prepare a bandwidth-related upcall
2047  */
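/*
 * Upcalls are staged in the bw_upcalls[] array and flushed when the array
 * would overflow (BW_UPCALLS_MAX), from the periodic
 * expire_bw_upcalls_send() timeout, and at the end of each
 * bw_meter_process() scan; each staged entry snapshots the meter's
 * threshold together with what has been measured so far.
 */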
2048 static void
2049 bw_meter_prepare_upcall(struct bw_meter *x, struct timeval *nowp)
2050 {
2051 	struct timeval delta;
2052 	struct bw_upcall *u;
2053 
2054 	/* Compute the measured time interval */
2055 	delta = *nowp;
2056 	BW_TIMEVALDECR(&delta, &x->bm_start_time);
2057 
2058 	/* If there are too many pending upcalls, deliver them now */
2059 	if (bw_upcalls_n >= BW_UPCALLS_MAX)
2060 		bw_upcalls_send();
2061 
2062 	/* Set the bw_upcall entry */
2063 	u = &bw_upcalls[bw_upcalls_n++];
2064 	u->bu_src = x->bm_mfc->mfc_origin;
2065 	u->bu_dst = x->bm_mfc->mfc_mcastgrp;
2066 	u->bu_threshold.b_time = x->bm_threshold.b_time;
2067 	u->bu_threshold.b_packets = x->bm_threshold.b_packets;
2068 	u->bu_threshold.b_bytes = x->bm_threshold.b_bytes;
2069 	u->bu_measured.b_time = delta;
2070 	u->bu_measured.b_packets = x->bm_measured.b_packets;
2071 	u->bu_measured.b_bytes = x->bm_measured.b_bytes;
2072 	u->bu_flags = 0;
2073 	if (x->bm_flags & BW_METER_UNIT_PACKETS)
2074 		u->bu_flags |= BW_UPCALL_UNIT_PACKETS;
2075 	if (x->bm_flags & BW_METER_UNIT_BYTES)
2076 		u->bu_flags |= BW_UPCALL_UNIT_BYTES;
2077 	if (x->bm_flags & BW_METER_GEQ)
2078 		u->bu_flags |= BW_UPCALL_GEQ;
2079 	if (x->bm_flags & BW_METER_LEQ)
2080 		u->bu_flags |= BW_UPCALL_LEQ;
2081 }
2082 
2083 /*
2084  * Send the pending bandwidth-related upcalls
2085  */
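/*
 * The message handed to the routing daemon is a struct igmpmsg header with
 * im_msgtype set to IGMPMSG_BW_UPCALL, followed back to back by the pending
 * struct bw_upcall entries, and is queued on the ip_mrouter raw socket like
 * the other kernel-to-daemon upcalls.
 */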
2086 static void
2087 bw_upcalls_send(void)
2088 {
2089 	struct mbuf *m;
2090 	int len = bw_upcalls_n * sizeof(bw_upcalls[0]);
2091 	struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET };
2092 	static struct igmpmsg igmpmsg = {
2093 	    0,			/* unused1 */
2094 	    0,			/* unused2 */
2095 	    IGMPMSG_BW_UPCALL,	/* im_msgtype */
2096 	    0,			/* im_mbz  */
2097 	    0,			/* im_vif  */
2098 	    0,			/* unused3 */
2099 	    { 0 },		/* im_src  */
2100 	    { 0 } };		/* im_dst  */
2101 
2102 	if (bw_upcalls_n == 0)
2103 		return;		/* No pending upcalls */
2104 
2105 	bw_upcalls_n = 0;
2106 
2107 	/*
2108 	 * Allocate a new mbuf and initialize it with the upcall header
2109 	 * and the payload of the pending upcalls.
2110 	 */
2111 	MGETHDR(m, M_DONTWAIT, MT_HEADER);
2112 	if (m == NULL) {
2113 		log(LOG_WARNING, "bw_upcalls_send: cannot allocate mbuf\n");
2114 		return;
2115 	}
2116 
2117 	m->m_len = m->m_pkthdr.len = 0;
2118 	m_copyback(m, 0, sizeof(struct igmpmsg), (caddr_t)&igmpmsg, M_NOWAIT);
2119 	m_copyback(m, sizeof(struct igmpmsg), len, (caddr_t)&bw_upcalls[0],
2120 	    M_NOWAIT);
2121 
2122 	/*
2123 	 * Send the upcalls
2124 	 * XXX do we need to set the address in k_igmpsrc ?
2125 	 */
2126 	mrtstat.mrts_upcalls++;
2127 	if (socket_send(ip_mrouter, m, &k_igmpsrc) < 0) {
2128 		log(LOG_WARNING,
2129 		    "bw_upcalls_send: ip_mrouter socket queue full\n");
2130 		++mrtstat.mrts_upq_sockfull;
2131 	}
2132 }
2133 
2134 /*
2135  * Compute the timeout hash value for the bw_meter entries
2136  */
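/*
 * Worked example (values chosen purely for illustration): with
 * bm_start_time = { 10, 500000 } and bm_threshold.b_time = { 2, 0 },
 * next_timeval becomes { 12, 500000 }; the non-zero tv_usec rounds the
 * hash up to 13, which is then reduced modulo BW_METER_BUCKETS.
 */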
2137 #define	BW_METER_TIMEHASH(bw_meter, hash) do {				\
2138 	struct timeval next_timeval = (bw_meter)->bm_start_time;	\
2139 									\
2140 	BW_TIMEVALADD(&next_timeval, &(bw_meter)->bm_threshold.b_time); \
2141 	(hash) = next_timeval.tv_sec;					\
2142 	if (next_timeval.tv_usec)					\
2143 		(hash)++; /* XXX: make sure we don't timeout early */	\
2144 	(hash) %= BW_METER_BUCKETS;					\
2145 } while (/*CONSTCOND*/ 0)
2146 
2147 /*
2148  * Schedule a timer to periodically process a bw_meter entry of type "<="
2149  * by linking the entry into the proper hash bucket.
2150  */
2151 static void
2152 schedule_bw_meter(struct bw_meter *x, struct timeval *nowp)
2153 {
2154 	int time_hash;
2155 
2156 	if (!(x->bm_flags & BW_METER_LEQ))
2157 		return;	/* XXX: we schedule timers only for "<=" entries */
2158 
2159 	/* Reset the bw_meter entry */
2160 	x->bm_start_time = *nowp;
2161 	x->bm_measured.b_packets = 0;
2162 	x->bm_measured.b_bytes = 0;
2163 	x->bm_flags &= ~BW_METER_UPCALL_DELIVERED;
2164 
2165 	/* Compute the timeout hash value and insert the entry */
2166 	BW_METER_TIMEHASH(x, time_hash);
2167 	x->bm_time_next = bw_meter_timers[time_hash];
2168 	bw_meter_timers[time_hash] = x;
2169 	x->bm_time_hash = time_hash;
2170 }
2171 
2172 /*
2173  * Unschedule the periodic timer that processes a bw_meter entry of type "<="
2174  * by removing the entry from the proper hash bucket.
2175  */
2176 static void
2177 unschedule_bw_meter(struct bw_meter *x)
2178 {
2179 	int time_hash;
2180 	struct bw_meter *prev, *tmp;
2181 
2182 	if (!(x->bm_flags & BW_METER_LEQ))
2183 		return;	/* XXX: we schedule timers only for "<=" entries */
2184 
2185 	/* Compute the timeout hash value and delete the entry */
2186 	time_hash = x->bm_time_hash;
2187 	if (time_hash >= BW_METER_BUCKETS)
2188 		return;		/* Entry was not scheduled */
2189 
2190 	for (prev = NULL, tmp = bw_meter_timers[time_hash];
2191 	    tmp != NULL; prev = tmp, tmp = tmp->bm_time_next)
2192 		if (tmp == x)
2193 			break;
2194 
2195 	if (tmp == NULL)
2196 		panic("unschedule_bw_meter: bw_meter entry not found");
2197 
2198 	if (prev != NULL)
2199 		prev->bm_time_next = x->bm_time_next;
2200 	else
2201 		bw_meter_timers[time_hash] = x->bm_time_next;
2202 
2203 	x->bm_time_next = NULL;
2204 	x->bm_time_hash = BW_METER_BUCKETS;
2205 }
2206 
2207 /*
2208  * Process all "<=" type bw_meter entries that are due now, and for each
2209  * entry prepare an upcall if necessary. Each processed entry is then
2210  * rescheduled for the next round of (periodic) processing.
2211  *
2212  * This is run periodically (once per second normally). On each round,
2213  * all the potentially matching entries are in the hash slot that we are
2214  * looking at.
2215  */
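/*
 * The buckets behave like a timer wheel with one-second granularity: each
 * entry is linked into the bucket of the second in which its measuring
 * interval ends (rounded up), modulo BW_METER_BUCKETS, so a tick only has
 * to walk the buckets that have become due since the previous tick (at
 * most BW_METER_BUCKETS of them after a long stall).
 */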
2216 static void
2217 bw_meter_process()
2218 bw_meter_process(void)
2219 	int s;
2220 	static uint32_t last_tv_sec;	/* last time we processed this */
2221 
2222 	uint32_t loops;
2223 	int i;
2224 	struct timeval now, process_endtime;
2225 
2226 	microtime(&now);
2227 	if (last_tv_sec == now.tv_sec)
2228 		return;		/* nothing to do */
2229 
2230 	loops = now.tv_sec - last_tv_sec;
2231 	last_tv_sec = now.tv_sec;
2232 	if (loops > BW_METER_BUCKETS)
2233 		loops = BW_METER_BUCKETS;
2234 
2235 	s = splsoftnet();
2236 	/*
2237 	 * Process all bins of bw_meter entries from the one after the last
2238 	 * processed to the current one. On entry, i points to the last bucket
2239 	 * visited, so we need to increment i at the beginning of the loop.
2240 	 */
2241 	for (i = (now.tv_sec - loops) % BW_METER_BUCKETS; loops > 0; loops--) {
2242 		struct bw_meter *x, *tmp_list;
2243 
2244 		if (++i >= BW_METER_BUCKETS)
2245 			i = 0;
2246 
2247 		/* Disconnect the list of bw_meter entries from the bin */
2248 		tmp_list = bw_meter_timers[i];
2249 		bw_meter_timers[i] = NULL;
2250 
2251 		/* Process the list of bw_meter entries */
2252 		while (tmp_list != NULL) {
2253 			x = tmp_list;
2254 			tmp_list = tmp_list->bm_time_next;
2255 
2256 			/* Test if the time interval is over */
2257 			process_endtime = x->bm_start_time;
2258 			BW_TIMEVALADD(&process_endtime,
2259 			    &x->bm_threshold.b_time);
2260 			if (BW_TIMEVALCMP(&process_endtime, &now, >)) {
2261 				/* Not yet: reschedule, but don't reset */
2262 				int time_hash;
2263 
2264 				BW_METER_TIMEHASH(x, time_hash);
2265 				if (time_hash == i &&
2266 				    process_endtime.tv_sec == now.tv_sec) {
2267 					/*
2268 					 * XXX: somehow the bin processing is
2269 					 * a bit ahead of time. Put the entry
2270 					 * in the next bin.
2271 					 */
2272 					if (++time_hash >= BW_METER_BUCKETS)
2273 						time_hash = 0;
2274 				}
2275 				x->bm_time_next = bw_meter_timers[time_hash];
2276 				bw_meter_timers[time_hash] = x;
2277 				x->bm_time_hash = time_hash;
2278 
2279 				continue;
2280 			}
2281 
2282 			/* Test if we should deliver an upcall */
2283 			if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
2284 			    (x->bm_measured.b_packets <=
2285 			    x->bm_threshold.b_packets)) ||
2286 			    ((x->bm_flags & BW_METER_UNIT_BYTES) &&
2287 			    (x->bm_measured.b_bytes <=
2288 			    x->bm_threshold.b_bytes))) {
2289 				/* Prepare an upcall for delivery */
2290 				bw_meter_prepare_upcall(x, &now);
2291 			}
2292 
2293 			/* Reschedule for next processing */
2294 			schedule_bw_meter(x, &now);
2295 		}
2296 	}
2297 
2298 	/* Send all upcalls that are pending delivery */
2299 	bw_upcalls_send();
2300 
2301 	splx(s);
2302 }
2303 
2304 /*
2305  * A periodic function for sending all upcalls that are pending delivery
2306  */
2307 static void
2308 expire_bw_upcalls_send(void *unused)
2309 {
2310 	int s;
2311 
2312 	s = splsoftnet();
2313 	bw_upcalls_send();
2314 	splx(s);
2315 
2316 	timeout_add_msec(&bw_upcalls_ch, BW_UPCALLS_PERIOD);
2317 }
2318 
2319 /*
2320  * A periodic function that scans the multicast forwarding table and
2321  * processes all "<=" bw_meter entries.
2322  */
2323 static void
2324 expire_bw_meter_process(void *unused)
2325 {
2326 	if (mrt_api_config & MRT_MFC_BW_UPCALL)
2327 		bw_meter_process();
2328 
2329 	timeout_add_msec(&bw_meter_ch, BW_METER_PERIOD);
2330 }
2331 
2332 /*
2333  * End of bandwidth monitoring code
2334  */
2335 
2336 #ifdef PIM
2337 /*
2338  * Send the packet up to the user daemon, or, if so configured, do kernel encapsulation
2339  */
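/*
 * pim_register_prepare() may hand back a chain of fragments linked through
 * m_nextpkt; each fragment is either encapsulated towards the RP (when the
 * advanced API supplied an RP address in the MFC entry) or passed to the
 * daemon as an IGMPMSG_WHOLEPKT upcall, presumably so the daemon can build
 * the Register message itself.
 */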
2340 static int
2341 pim_register_send(struct ip *ip, struct vif *vifp,
2342 	struct mbuf *m, struct mfc *rt)
2343 {
2344 	struct mbuf *mb_copy, *mm;
2345 
2346 	if (mrtdebug & DEBUG_PIM)
2347 		log(LOG_DEBUG, "pim_register_send: ");
2348 
2349 	mb_copy = pim_register_prepare(ip, m);
2350 	if (mb_copy == NULL)
2351 		return (ENOBUFS);
2352 
2353 	/*
2354 	 * Send all the fragments. Note that the mbuf for each fragment
2355 	 * is freed by the sending machinery.
2356 	 */
2357 	for (mm = mb_copy; mm; mm = mb_copy) {
2358 		mb_copy = mm->m_nextpkt;
2359 		mm->m_nextpkt = NULL;
2360 		mm = m_pullup(mm, sizeof(struct ip));
2361 		if (mm != NULL) {
2362 			ip = mtod(mm, struct ip *);
2363 			if ((mrt_api_config & MRT_MFC_RP) &&
2364 			    !in_nullhost(rt->mfc_rp)) {
2365 				pim_register_send_rp(ip, vifp, mm, rt);
2366 			} else {
2367 				pim_register_send_upcall(ip, vifp, mm, rt);
2368 			}
2369 		}
2370 	}
2371 
2372 	return (0);
2373 }
2374 
2375 /*
2376  * Return a copy of the data packet that is ready for PIM Register
2377  * encapsulation.
2378  * XXX: Note that in the returned copy the IP header is a valid one.
2379  */
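/*
 * The MTU computed below is the largest inner packet that still fits in a
 * 16-bit ip_len once the outer IP and PIM Register headers are added;
 * anything larger is pre-fragmented so that every resulting Register
 * message stays within that limit.
 */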
2380 static struct mbuf *
2381 pim_register_prepare(struct ip *ip, struct mbuf *m)
2382 {
2383 	struct mbuf *mb_copy = NULL;
2384 	int mtu;
2385 
2386 	in_proto_cksum_out(m, NULL);
2387 
2388 	/*
2389 	 * Copy the old packet & pullup its IP header into the
2390 	 * new mbuf so we can modify it.
2391 	 */
2392 	mb_copy = m_copy(m, 0, M_COPYALL);
2393 	if (mb_copy == NULL)
2394 		return (NULL);
2395 	mb_copy = m_pullup(mb_copy, ip->ip_hl << 2);
2396 	if (mb_copy == NULL)
2397 		return (NULL);
2398 
2399 	/* take care of the TTL */
2400 	ip = mtod(mb_copy, struct ip *);
2401 	--ip->ip_ttl;
2402 
2403 	/* Compute the MTU after the PIM Register encapsulation */
2404 	mtu = 0xffff - sizeof(pim_encap_iphdr) - sizeof(pim_encap_pimhdr);
2405 
2406 	if (ntohs(ip->ip_len) <= mtu) {
2407 		/* Turn the IP header into a valid one */
2408 		ip->ip_sum = 0;
2409 		ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2);
2410 	} else {
2411 		/* Fragment the packet */
2412 		if (ip_fragment(mb_copy, NULL, mtu) != 0) {
2413 			/* XXX: mb_copy was freed by ip_fragment() */
2414 			return (NULL);
2415 		}
2416 	}
2417 	return (mb_copy);
2418 }
2419 
2420 /*
2421  * Send an upcall with the data packet to the user-level process.
2422  */
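/*
 * The upcall is a struct igmpmsg with im_msgtype IGMPMSG_WHOLEPKT placed in
 * its own mbuf, with the prepared data packet chained behind it, presumably
 * so the routing daemon can perform the Register encapsulation in user
 * space.
 */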
2423 static int
2424 pim_register_send_upcall(struct ip *ip, struct vif *vifp,
2425 	struct mbuf *mb_copy, struct mfc *rt)
2426 {
2427 	struct mbuf *mb_first;
2428 	int len = ntohs(ip->ip_len);
2429 	struct igmpmsg *im;
2430 	struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET };
2431 
2432 	/* Add a new mbuf with an upcall header */
2433 	MGETHDR(mb_first, M_DONTWAIT, MT_HEADER);
2434 	if (mb_first == NULL) {
2435 		m_freem(mb_copy);
2436 		return (ENOBUFS);
2437 	}
2438 	mb_first->m_data += max_linkhdr;
2439 	mb_first->m_pkthdr.len = len + sizeof(struct igmpmsg);
2440 	mb_first->m_len = sizeof(struct igmpmsg);
2441 	mb_first->m_next = mb_copy;
2442 
2443 	/* Send message to routing daemon */
2444 	im = mtod(mb_first, struct igmpmsg *);
2445 	im->im_msgtype = IGMPMSG_WHOLEPKT;
2446 	im->im_mbz = 0;
2447 	im->im_vif = vifp - viftable;
2448 	im->im_src = ip->ip_src;
2449 	im->im_dst = ip->ip_dst;
2450 
2451 	k_igmpsrc.sin_addr = ip->ip_src;
2452 
2453 	mrtstat.mrts_upcalls++;
2454 
2455 	if (socket_send(ip_mrouter, mb_first, &k_igmpsrc) < 0) {
2456 		if (mrtdebug & DEBUG_PIM)
2457 			log(LOG_WARNING, "mcast: pim_register_send_upcall: "
2458 			    "ip_mrouter socket queue full");
2459 		++mrtstat.mrts_upq_sockfull;
2460 		return (ENOBUFS);
2461 	}
2462 
2463 	/* Keep statistics */
2464 	pimstat.pims_snd_registers_msgs++;
2465 	pimstat.pims_snd_registers_bytes += len;
2466 
2467 	return (0);
2468 }
2469 
2470 /*
2471  * Encapsulate the data packet in PIM Register message and send it to the RP.
2472  */
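/*
 * The outer header starts from the pim_encap_iphdr template, with the
 * source set to the local address of the incoming vif and the destination
 * set to the RP recorded in the MFC entry; TOS and the DF bit are copied
 * from the inner header, and the PIM checksum covers only the Register
 * header (pim_encap_pimhdr), not the encapsulated data packet.
 */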
2473 static int
2474 pim_register_send_rp(struct ip *ip, struct vif *vifp,
2475 	struct mbuf *mb_copy, struct mfc *rt)
2476 {
2477 	struct mbuf *mb_first;
2478 	struct ip *ip_outer;
2479 	struct pim_encap_pimhdr *pimhdr;
2480 	int len = ntohs(ip->ip_len);
2481 	vifi_t vifi = rt->mfc_parent;
2482 
2483 	if ((vifi >= numvifs) || in_nullhost(viftable[vifi].v_lcl_addr)) {
2484 		m_freem(mb_copy);
2485 		return (EADDRNOTAVAIL);		/* The iif vif is invalid */
2486 	}
2487 
2488 	/* Add a new mbuf with the encapsulating header */
2489 	MGETHDR(mb_first, M_DONTWAIT, MT_HEADER);
2490 	if (mb_first == NULL) {
2491 		m_freem(mb_copy);
2492 		return (ENOBUFS);
2493 	}
2494 	mb_first->m_data += max_linkhdr;
2495 	mb_first->m_len = sizeof(pim_encap_iphdr) + sizeof(pim_encap_pimhdr);
2496 	mb_first->m_next = mb_copy;
2497 
2498 	mb_first->m_pkthdr.len = len + mb_first->m_len;
2499 
2500 	/* Fill in the encapsulating IP and PIM header */
2501 	ip_outer = mtod(mb_first, struct ip *);
2502 	*ip_outer = pim_encap_iphdr;
2503 	ip_outer->ip_id = htons(ip_randomid());
2504 	ip_outer->ip_len = htons(len + sizeof(pim_encap_iphdr) +
2505 	    sizeof(pim_encap_pimhdr));
2506 	ip_outer->ip_src = viftable[vifi].v_lcl_addr;
2507 	ip_outer->ip_dst = rt->mfc_rp;
2508 	/*
2509 	 * Copy the inner header TOS to the outer header, and take care of the
2510 	 * IP_DF bit.
2511 	 */
2512 	ip_outer->ip_tos = ip->ip_tos;
2513 	if (ntohs(ip->ip_off) & IP_DF)
2514 		ip_outer->ip_off |= htons(IP_DF);
2515 	pimhdr = (struct pim_encap_pimhdr *)((caddr_t)ip_outer
2516 	    + sizeof(pim_encap_iphdr));
2517 	*pimhdr = pim_encap_pimhdr;
2518 	/* If the iif crosses a border, set the Border-bit */
2519 	if (rt->mfc_flags[vifi] & MRT_MFC_FLAGS_BORDER_VIF & mrt_api_config)
2520 		pimhdr->flags |= htonl(PIM_BORDER_REGISTER);
2521 
2522 	mb_first->m_data += sizeof(pim_encap_iphdr);
2523 	pimhdr->pim.pim_cksum = in_cksum(mb_first, sizeof(pim_encap_pimhdr));
2524 	mb_first->m_data -= sizeof(pim_encap_iphdr);
2525 
2526 	send_packet(vifp, mb_first);
2527 
2528 	/* Keep statistics */
2529 	pimstat.pims_snd_registers_msgs++;
2530 	pimstat.pims_snd_registers_bytes += len;
2531 
2532 	return (0);
2533 }
2534 
2535 /*
2536  * PIM-SMv2 and PIM-DM message processing.
2537  * Receives and verifies the PIM control messages, and passes them
2538  * up to the listening socket, using rip_input().
2539  * The only message with special processing is the PIM_REGISTER message
2540  * (used by PIM-SM): the PIM header is stripped off, and the inner packet
2541  * is looped back for forwarding via looutput().
2542  */
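/*
 * Outline of the Register handling below: validate the lengths and the
 * checksum, decapsulate the inner multicast packet and loop it back through
 * the register vif so it is forwarded as ordinary multicast, then hand a
 * copy of the heads (outer IP + PIM + Register + inner IP headers) to the
 * daemon via rip_input() so it can react, e.g. with a Register-Stop.
 */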
2543 void
2544 pim_input(struct mbuf *m, ...)
2545 {
2546 	struct ip *ip = mtod(m, struct ip *);
2547 	struct pim *pim;
2548 	int minlen;
2549 	int datalen;
2550 	int ip_tos;
2551 	int iphlen;
2552 	va_list ap;
2553 
2554 	va_start(ap, m);
2555 	iphlen = va_arg(ap, int);
2556 	va_end(ap);
2557 
2558 	datalen = ntohs(ip->ip_len) - iphlen;
2559 
2560 	/* Keep statistics */
2561 	pimstat.pims_rcv_total_msgs++;
2562 	pimstat.pims_rcv_total_bytes += datalen;
2563 
2564 	/* Validate lengths */
2565 	if (datalen < PIM_MINLEN) {
2566 		pimstat.pims_rcv_tooshort++;
2567 		log(LOG_ERR, "pim_input: packet size too small %d from %lx\n",
2568 		    datalen, (u_long)ip->ip_src.s_addr);
2569 		m_freem(m);
2570 		return;
2571 	}
2572 
2573 	/*
2574 	 * If the packet is at least as big as a REGISTER, go ahead
2575 	 * and grab the PIM REGISTER header size, to avoid another
2576 	 * possible m_pullup() later.
2577 	 *
2578 	 * PIM_MINLEN       == pimhdr + u_int32_t == 4 + 4 = 8
2579 	 * PIM_REG_MINLEN   == pimhdr + reghdr + encap_iphdr == 4 + 4 + 20 = 28
2580 	 */
2581 	minlen = iphlen + (datalen >= PIM_REG_MINLEN ?
2582 	    PIM_REG_MINLEN : PIM_MINLEN);
2583 	/*
2584 	 * Get the IP and PIM headers in contiguous memory, and
2585 	 * possibly the PIM REGISTER header.
2586 	 */
2587 	if ((m->m_flags & M_EXT || m->m_len < minlen) &&
2588 	    (m = m_pullup(m, minlen)) == NULL) {
2589 		log(LOG_ERR, "pim_input: m_pullup failure\n");
2590 		return;
2591 	}
2592 	/* m_pullup() may have given us a new mbuf so reset ip. */
2593 	ip = mtod(m, struct ip *);
2594 	ip_tos = ip->ip_tos;
2595 
2596 	/* adjust mbuf to point to the PIM header */
2597 	m->m_data += iphlen;
2598 	m->m_len  -= iphlen;
2599 	pim = mtod(m, struct pim *);
2600 
2601 	/*
2602 	 * Validate checksum. If PIM REGISTER, exclude the data packet.
2603 	 *
2604 	 * XXX: some older PIMv2 implementations don't make this distinction,
2605 	 * so for compatibility reasons perform the checksum over part of the
2606 	 * message first, and if that fails, then over the whole message.
2607 	 */
2608 	if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER &&
2609 	    in_cksum(m, PIM_MINLEN) == 0) {
2610 		/* do nothing, checksum okay */
2611 	} else if (in_cksum(m, datalen)) {
2612 		pimstat.pims_rcv_badsum++;
2613 		if (mrtdebug & DEBUG_PIM)
2614 			log(LOG_DEBUG, "pim_input: invalid checksum");
2615 		m_freem(m);
2616 		return;
2617 	}
2618 
2619 	/* PIM version check */
2620 	if (PIM_VT_V(pim->pim_vt) < PIM_VERSION) {
2621 		pimstat.pims_rcv_badversion++;
2622 		log(LOG_ERR, "pim_input: incorrect version %d, expecting %d\n",
2623 		    PIM_VT_V(pim->pim_vt), PIM_VERSION);
2624 		m_freem(m);
2625 		return;
2626 	}
2627 
2628 	/* restore mbuf back to the outer IP */
2629 	m->m_data -= iphlen;
2630 	m->m_len  += iphlen;
2631 
2632 	if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER) {
2633 		/*
2634 		 * Since this is a REGISTER, we'll make a copy of the register
2635 		 * headers ip + pim + u_int32 + encap_ip, to be passed up to the
2636 		 * routing daemon.
2637 		 */
2638 		int s;
2639 		struct sockaddr_in dst = { sizeof(dst), AF_INET };
2640 		struct mbuf *mcp;
2641 		struct ip *encap_ip;
2642 		u_int32_t *reghdr;
2643 		struct ifnet *vifp;
2644 
2645 		s = splsoftnet();
2646 		if ((reg_vif_num >= numvifs) || (reg_vif_num == VIFI_INVALID)) {
2647 			splx(s);
2648 			if (mrtdebug & DEBUG_PIM)
2649 				log(LOG_DEBUG, "pim_input: register vif "
2650 				    "not set: %d\n", reg_vif_num);
2651 			m_freem(m);
2652 			return;
2653 		}
2654 		/* XXX need refcnt? */
2655 		vifp = viftable[reg_vif_num].v_ifp;
2656 		splx(s);
2657 
2658 		/* Validate length */
2659 		if (datalen < PIM_REG_MINLEN) {
2660 			pimstat.pims_rcv_tooshort++;
2661 			pimstat.pims_rcv_badregisters++;
2662 			log(LOG_ERR, "pim_input: register packet size "
2663 			    "too small %d from %lx\n",
2664 			    datalen, (u_long)ip->ip_src.s_addr);
2665 			m_freem(m);
2666 			return;
2667 		}
2668 
2669 		reghdr = (u_int32_t *)(pim + 1);
2670 		encap_ip = (struct ip *)(reghdr + 1);
2671 
2672 		if (mrtdebug & DEBUG_PIM) {
2673 			log(LOG_DEBUG, "pim_input[register], encap_ip: "
2674 			    "%lx -> %lx, encap_ip len %d\n",
2675 			    (u_long)ntohl(encap_ip->ip_src.s_addr),
2676 			    (u_long)ntohl(encap_ip->ip_dst.s_addr),
2677 			    ntohs(encap_ip->ip_len));
2678 		}
2679 
2680 		/* verify the version number of the inner packet */
2681 		if (encap_ip->ip_v != IPVERSION) {
2682 			pimstat.pims_rcv_badregisters++;
2683 			if (mrtdebug & DEBUG_PIM) {
2684 				log(LOG_DEBUG, "pim_input: invalid IP version"
2685 				    " (%d) of the inner packet\n",
2686 				    encap_ip->ip_v);
2687 			}
2688 			m_freem(m);
2689 			return;
2690 		}
2691 
2692 		/* verify the inner packet is destined to a mcast group */
2693 		if (!IN_MULTICAST(encap_ip->ip_dst.s_addr)) {
2694 			pimstat.pims_rcv_badregisters++;
2695 			if (mrtdebug & DEBUG_PIM)
2696 				log(LOG_DEBUG,
2697 				    "pim_input: inner packet of register is"
2698 				    " not multicast %lx\n",
2699 				    (u_long)ntohl(encap_ip->ip_dst.s_addr));
2700 			m_freem(m);
2701 			return;
2702 		}
2703 
2704 		/* If a NULL_REGISTER, pass it to the daemon */
2705 		if ((ntohl(*reghdr) & PIM_NULL_REGISTER))
2706 			goto pim_input_to_daemon;
2707 
2708 		/*
2709 		 * Copy the TOS from the outer IP header to the inner
2710 		 * IP header.
2711 		 */
2712 		if (encap_ip->ip_tos != ip_tos) {
2713 			/* Outer TOS -> inner TOS */
2714 			encap_ip->ip_tos = ip_tos;
2715 			/* Recompute the inner header checksum. Sigh... */
2716 
2717 			/* adjust mbuf to point to the inner IP header */
2718 			m->m_data += (iphlen + PIM_MINLEN);
2719 			m->m_len  -= (iphlen + PIM_MINLEN);
2720 
2721 			encap_ip->ip_sum = 0;
2722 			encap_ip->ip_sum = in_cksum(m, encap_ip->ip_hl << 2);
2723 
2724 			/* restore mbuf to point back to the outer IP header */
2725 			m->m_data -= (iphlen + PIM_MINLEN);
2726 			m->m_len  += (iphlen + PIM_MINLEN);
2727 		}
2728 
2729 		/*
2730 		 * Decapsulate the inner IP packet and loopback to forward it
2731 		 * as a normal multicast packet. Also, make a copy of the
2732 		 *     outer_iphdr + pimhdr + reghdr + encap_iphdr
2733 		 * to pass to the daemon later, so it can take the appropriate
2734 		 * actions (e.g., send back PIM_REGISTER_STOP).
2735 		 * XXX: here m->m_data points to the outer IP header.
2736 		 */
2737 		mcp = m_copy(m, 0, iphlen + PIM_REG_MINLEN);
2738 		if (mcp == NULL) {
2739 			log(LOG_ERR, "pim_input: pim register: could not "
2740 			    "copy register head\n");
2741 			m_freem(m);
2742 			return;
2743 		}
2744 
2745 		/* Keep statistics */
2746 		/* XXX: registers_bytes include only the encap. mcast pkt */
2747 		pimstat.pims_rcv_registers_msgs++;
2748 		pimstat.pims_rcv_registers_bytes += ntohs(encap_ip->ip_len);
2749 
2750 		/* forward the inner ip packet; point m_data at the inner ip. */
2751 		m_adj(m, iphlen + PIM_MINLEN);
2752 
2753 		if (mrtdebug & DEBUG_PIM) {
2754 			log(LOG_DEBUG,
2755 			    "pim_input: forwarding decapsulated register: "
2756 			    "src %lx, dst %lx, vif %d\n",
2757 			    (u_long)ntohl(encap_ip->ip_src.s_addr),
2758 			    (u_long)ntohl(encap_ip->ip_dst.s_addr),
2759 			    reg_vif_num);
2760 		}
2761 		/* NB: vifp was collected above; can it change on us? */
2762 		looutput(vifp, m, (struct sockaddr *)&dst, NULL);
2763 
2764 		/* prepare the register head to send to the mrouting daemon */
2765 		m = mcp;
2766 	}
2767 
2768 pim_input_to_daemon:
2769 	/*
2770 	 * Pass the PIM message up to the daemon; if it is a Register message,
2771 	 * pass the 'head' only up to the daemon. This includes the
2772 	 * outer IP header, PIM header, PIM-Register header and the
2773 	 * inner IP header.
2774 	 * XXX: the outer IP header pkt size of a Register is not adjusted to
2775 	 * reflect the fact that the inner multicast data is truncated.
2776 	 */
2777 	rip_input(m);
2778 
2779 	return;
2780 }
2781 
2782 /*
2783  * Sysctl for pim variables.
2784  */
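/*
 * A hedged usage sketch: assuming this handler is reached through the inet
 * protocol sysctl tree for IPPROTO_PIM (the hookup is not shown in this
 * excerpt), a userland reader might fetch the statistics roughly like this:
 *
 *	int mib[4] = { CTL_NET, PF_INET, IPPROTO_PIM, PIMCTL_STATS };
 *	struct pimstat ps;
 *	size_t len = sizeof(ps);
 *
 *	if (sysctl(mib, 4, &ps, &len, NULL, 0) == -1)
 *		err(1, "sysctl");
 */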
2785 int
2786 pim_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp,
2787     void *newp, size_t newlen)
2788 {
2789 	/* All sysctl names at this level are terminal. */
2790 	if (namelen != 1)
2791 		return (ENOTDIR);
2792 
2793 	switch (name[0]) {
2794 	case PIMCTL_STATS:
2795 		if (newp != NULL)
2796 			return (EPERM);
2797 		return (sysctl_struct(oldp, oldlenp, newp, newlen,
2798 		    &pimstat, sizeof(pimstat)));
2799 
2800 	default:
2801 		return (ENOPROTOOPT);
2802 	}
2803 	/* NOTREACHED */
2804 }
2805 
2806 
2807 #endif /* PIM */
2808