xref: /netbsd-src/sys/netinet/ip_mroute.c (revision 220b5c059a84c51ea44107ea8951a57ffaecdc8c)
1 /*	$NetBSD: ip_mroute.c,v 1.58 2001/11/13 00:32:38 lukem Exp $	*/
2 
3 /*
4  * IP multicast forwarding procedures
5  *
6  * Written by David Waitzman, BBN Labs, August 1988.
7  * Modified by Steve Deering, Stanford, February 1989.
8  * Modified by Mark J. Steiglitz, Stanford, May, 1991
9  * Modified by Van Jacobson, LBL, January 1993
10  * Modified by Ajit Thyagarajan, PARC, August 1993
11  * Modified by Bill Fenner, PARC, April 1994
12  * Modified by Charles M. Hannum, NetBSD, May 1995.
13  *
14  * MROUTING Revision: 1.2
15  */
16 
17 #include <sys/cdefs.h>
18 __KERNEL_RCSID(0, "$NetBSD: ip_mroute.c,v 1.58 2001/11/13 00:32:38 lukem Exp $");
19 
20 #include "opt_ipsec.h"
21 
22 #include <sys/param.h>
23 #include <sys/systm.h>
24 #include <sys/callout.h>
25 #include <sys/mbuf.h>
26 #include <sys/socket.h>
27 #include <sys/socketvar.h>
28 #include <sys/protosw.h>
29 #include <sys/errno.h>
30 #include <sys/time.h>
31 #include <sys/kernel.h>
32 #include <sys/ioctl.h>
33 #include <sys/syslog.h>
34 #include <net/if.h>
35 #include <net/route.h>
36 #include <net/raw_cb.h>
37 #include <netinet/in.h>
38 #include <netinet/in_var.h>
39 #include <netinet/in_systm.h>
40 #include <netinet/ip.h>
41 #include <netinet/ip_var.h>
42 #include <netinet/in_pcb.h>
43 #include <netinet/udp.h>
44 #include <netinet/igmp.h>
45 #include <netinet/igmp_var.h>
46 #include <netinet/ip_mroute.h>
47 #include <netinet/ip_encap.h>
48 
49 #include <machine/stdarg.h>
50 
51 #define IP_MULTICASTOPTS 0
/*
 * Run m_pullup() on `m' for `len' bytes when the leading mbuf either
 * carries the M_EXT flag or holds fewer than `len' bytes.  `m' is
 * replaced by whatever m_pullup() returns; a null `m' is left alone.
 */
#define	M_PULLUP(m, len) \
	do { \
		if ((m) != 0 && \
		    ((((m)->m_flags & M_EXT) != 0) || (m)->m_len < (len))) \
			(m) = m_pullup((m), (len)); \
	} while (0)
57 
58 /*
59  * Globals.  All but ip_mrouter and ip_mrtproto could be static,
60  * except for netstat or debugging purposes.
61  */
62 struct socket  *ip_mrouter  = 0;
63 int		ip_mrtproto = IGMP_DVMRP;    /* for netstat only */
64 
65 #define NO_RTE_FOUND 	0x1
66 #define RTE_FOUND	0x2
67 
68 #define	MFCHASH(a, g) \
69 	((((a).s_addr >> 20) ^ ((a).s_addr >> 10) ^ (a).s_addr ^ \
70 	  ((g).s_addr >> 20) ^ ((g).s_addr >> 10) ^ (g).s_addr) & mfchash)
71 LIST_HEAD(mfchashhdr, mfc) *mfchashtbl;
72 u_long	mfchash;
73 
74 u_char		nexpire[MFCTBLSIZ];
75 struct vif	viftable[MAXVIFS];
76 struct mrtstat	mrtstat;
77 u_int		mrtdebug = 0;	  /* debug level 	*/
78 #define		DEBUG_MFC	0x02
79 #define		DEBUG_FORWARD	0x04
80 #define		DEBUG_EXPIRE	0x08
81 #define		DEBUG_XMIT	0x10
82 u_int       	tbfdebug = 0;     /* tbf debug level 	*/
83 #ifdef RSVP_ISI
84 u_int		rsvpdebug = 0;	  /* rsvp debug level   */
85 extern struct socket *ip_rsvpd;
86 extern int rsvp_on;
87 #endif /* RSVP_ISI */
88 
89 /* vif attachment using sys/netinet/ip_encap.c */
90 extern struct domain inetdomain;
91 static void vif_input __P((struct mbuf *, ...));
92 static int vif_encapcheck __P((const struct mbuf *, int, int, void *));
93 static struct protosw vif_protosw =
94 { SOCK_RAW,	&inetdomain,	IPPROTO_IPV4,	PR_ATOMIC|PR_ADDR,
95   vif_input,	rip_output,	0,		rip_ctloutput,
96   rip_usrreq,
97   0,            0,              0,              0,
98 };
99 
/*
 * expire_upcalls() reschedules itself every EXPIRE_TIMEOUT ticks; an
 * entry created for an upcall starts with mfc_expire = UPCALL_EXPIRE.
 */
#define		EXPIRE_TIMEOUT	(hz / 4)	/* 4x / second */
#define		UPCALL_EXPIRE	6		/* number of timeouts */

/*
 * Token bucket filter parameters.
 */

#define		TBF_REPROCESS	(hz / 100)	/* 100x / second */
108 
109 static int get_sg_cnt __P((struct sioc_sg_req *));
110 static int get_vif_cnt __P((struct sioc_vif_req *));
111 static int ip_mrouter_init __P((struct socket *, struct mbuf *));
112 static int get_version __P((struct mbuf *));
113 static int set_assert __P((struct mbuf *));
114 static int get_assert __P((struct mbuf *));
115 static int add_vif __P((struct mbuf *));
116 static int del_vif __P((struct mbuf *));
117 static void update_mfc __P((struct mfcctl *, struct mfc *));
118 static void expire_mfc __P((struct mfc *));
119 static int add_mfc __P((struct mbuf *));
120 #ifdef UPCALL_TIMING
121 static void collate __P((struct timeval *));
122 #endif
123 static int del_mfc __P((struct mbuf *));
124 static int socket_send __P((struct socket *, struct mbuf *,
125 			    struct sockaddr_in *));
126 static void expire_upcalls __P((void *));
127 #ifdef RSVP_ISI
128 static int ip_mdq __P((struct mbuf *, struct ifnet *, struct mfc *, vifi_t));
129 #else
130 static int ip_mdq __P((struct mbuf *, struct ifnet *, struct mfc *));
131 #endif
132 static void phyint_send __P((struct ip *, struct vif *, struct mbuf *));
133 static void encap_send __P((struct ip *, struct vif *, struct mbuf *));
134 static void tbf_control __P((struct vif *, struct mbuf *, struct ip *,
135 			     u_int32_t));
136 static void tbf_queue __P((struct vif *, struct mbuf *));
137 static void tbf_process_q __P((struct vif *));
138 static void tbf_reprocess_q __P((void *));
139 static int tbf_dq_sel __P((struct vif *, struct ip *));
140 static void tbf_send_packet __P((struct vif *, struct mbuf *));
141 static void tbf_update_tokens __P((struct vif *));
142 static int priority __P((struct vif *, struct ip *));
143 
144 /*
145  * 'Interfaces' associated with decapsulator (so we can tell
146  * packets that went through it from ones that get reflected
147  * by a broken gateway).  These interfaces are never linked into
148  * the system ifnet list & no routes point to them.  I.e., packets
149  * can't be sent this way.  They only exist as a placeholder for
150  * multicast source verification.
151  */
152 #if 0
153 struct ifnet multicast_decap_if[MAXVIFS];
154 #endif
155 
156 #define	ENCAP_TTL	64
157 #define	ENCAP_PROTO	IPPROTO_IPIP	/* 4 */
158 
159 /* prototype IP hdr for encapsulated packets */
160 struct ip multicast_encap_iphdr = {
161 #if BYTE_ORDER == LITTLE_ENDIAN
162 	sizeof(struct ip) >> 2, IPVERSION,
163 #else
164 	IPVERSION, sizeof(struct ip) >> 2,
165 #endif
166 	0,				/* tos */
167 	sizeof(struct ip),		/* total length */
168 	0,				/* id */
169 	0,				/* frag offset */
170 	ENCAP_TTL, ENCAP_PROTO,
171 	0,				/* checksum */
172 };
173 
174 /*
175  * Private variables.
176  */
177 static vifi_t	   numvifs = 0;
178 static int have_encap_tunnel = 0;
179 
180 static struct callout expire_upcalls_ch;
181 
182 /*
183  * one-back cache used by mrt_ipip_input to locate a tunnel's vif
184  * given a datagram's src ip address.
185  */
186 static struct in_addr last_encap_src;
187 static struct vif *last_encap_vif;
188 
189 /*
190  * whether or not special PIM assert processing is enabled.
191  */
192 static int pim_assert;
193 /*
194  * Rate limit for assert notification messages, in usec
195  */
196 #define ASSERT_MSG_TIME		3000000
197 
/*
 * Find a route for a given origin IP address and Multicast group address
 * Type of service parameter to be added in the future!!!
 *
 * Walks the hash bucket selected by MFCHASH(o, g) and stores in `rt' the
 * first entry whose origin and group match and whose stall queue is
 * empty, or 0 when there is none.  Every call bumps mrts_mfc_lookups;
 * a failed lookup also bumps mrts_mfc_misses.
 *
 * The body is wrapped in do { } while (0) so that "MFCFIND(...);" is a
 * single statement and can be used safely in an unbraced if/else.
 */
#define MFCFIND(o, g, rt) \
	do { \
		struct mfc *_rt; \
		(rt) = 0; \
		++mrtstat.mrts_mfc_lookups; \
		LIST_FOREACH(_rt, &mfchashtbl[MFCHASH(o, g)], mfc_hash) { \
			if (in_hosteq(_rt->mfc_origin, (o)) && \
			    in_hosteq(_rt->mfc_mcastgrp, (g)) && \
			    _rt->mfc_stall == 0) { \
				(rt) = _rt; \
				break; \
			} \
		} \
		if ((rt) == 0) \
			++mrtstat.mrts_mfc_misses; \
	} while (0)
218 
/*
 * Macros to compute elapsed time efficiently
 * Borrowed from Van Jacobson's scheduling code
 *
 * TV_DELTA(a, b, delta) stores in `delta' the number of microseconds
 * from timeval `b' to timeval `a'.  Differences of 0, 1 or 2 whole
 * seconds are handled with additions only; any other difference falls
 * back to a multiplication.
 *
 * The body is wrapped in do { } while (0) so the macro behaves as one
 * statement, and `delta' is parenthesized so any lvalue expression may
 * be passed.
 */
#define TV_DELTA(a, b, delta) \
	do { \
		int xxs; \
		(delta) = (a).tv_usec - (b).tv_usec; \
		xxs = (a).tv_sec - (b).tv_sec; \
		switch (xxs) { \
		case 2: \
			(delta) += 1000000; \
			/* FALLTHROUGH */ \
		case 1: \
			(delta) += 1000000; \
			/* FALLTHROUGH */ \
		case 0: \
			break; \
		default: \
			(delta) += (1000000 * xxs); \
			break; \
		} \
	} while (0)
241 
#ifdef UPCALL_TIMING
/* Histogram filled in by collate(): slot = (delay in usec) >> 10, capped at 50. */
u_int32_t upcall_data[51];
#endif /* UPCALL_TIMING */
245 
246 /*
247  * Handle MRT setsockopt commands to modify the multicast routing tables.
248  */
249 int
250 ip_mrouter_set(so, optname, m)
251 	struct socket *so;
252 	int optname;
253 	struct mbuf **m;
254 {
255 	int error;
256 
257 	if (optname != MRT_INIT && so != ip_mrouter)
258 		error = ENOPROTOOPT;
259 	else
260 		switch (optname) {
261 		case MRT_INIT:
262 			error = ip_mrouter_init(so, *m);
263 			break;
264 		case MRT_DONE:
265 			error = ip_mrouter_done();
266 			break;
267 		case MRT_ADD_VIF:
268 			error = add_vif(*m);
269 			break;
270 		case MRT_DEL_VIF:
271 			error = del_vif(*m);
272 			break;
273 		case MRT_ADD_MFC:
274 			error = add_mfc(*m);
275 			break;
276 		case MRT_DEL_MFC:
277 			error = del_mfc(*m);
278 			break;
279 		case MRT_ASSERT:
280 			error = set_assert(*m);
281 			break;
282 		default:
283 			error = ENOPROTOOPT;
284 			break;
285 		}
286 
287 	if (*m)
288 		m_free(*m);
289 	return (error);
290 }
291 
292 /*
293  * Handle MRT getsockopt commands
294  */
295 int
296 ip_mrouter_get(so, optname, m)
297 	struct socket *so;
298 	int optname;
299 	struct mbuf **m;
300 {
301 	int error;
302 
303 	if (so != ip_mrouter)
304 		error = ENOPROTOOPT;
305 	else {
306 		*m = m_get(M_WAIT, MT_SOOPTS);
307 
308 		switch (optname) {
309 		case MRT_VERSION:
310 			error = get_version(*m);
311 			break;
312 		case MRT_ASSERT:
313 			error = get_assert(*m);
314 			break;
315 		default:
316 			error = ENOPROTOOPT;
317 			break;
318 		}
319 
320 		if (error)
321 			m_free(*m);
322 	}
323 
324 	return (error);
325 }
326 
327 /*
328  * Handle ioctl commands to obtain information from the cache
329  */
330 int
331 mrt_ioctl(so, cmd, data)
332 	struct socket *so;
333 	u_long cmd;
334 	caddr_t data;
335 {
336 	int error;
337 
338 	if (so != ip_mrouter)
339 		error = EINVAL;
340 	else
341 		switch (cmd) {
342 		case SIOCGETVIFCNT:
343 			error = get_vif_cnt((struct sioc_vif_req *)data);
344 			break;
345 		case SIOCGETSGCNT:
346 			error = get_sg_cnt((struct sioc_sg_req *)data);
347 			break;
348 		default:
349 			error = EINVAL;
350 			break;
351 		}
352 
353 	return (error);
354 }
355 
356 /*
357  * returns the packet, byte, rpf-failure count for the source group provided
358  */
359 static int
360 get_sg_cnt(req)
361 	struct sioc_sg_req *req;
362 {
363 	struct mfc *rt;
364 	int s;
365 
366 	s = splsoftnet();
367 	MFCFIND(req->src, req->grp, rt);
368 	splx(s);
369 	if (rt != 0) {
370 		req->pktcnt = rt->mfc_pkt_cnt;
371 		req->bytecnt = rt->mfc_byte_cnt;
372 		req->wrong_if = rt->mfc_wrong_if;
373 	} else
374 		req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffff;
375 
376 	return (0);
377 }
378 
379 /*
380  * returns the input and output packet and byte counts on the vif provided
381  */
382 static int
383 get_vif_cnt(req)
384 	struct sioc_vif_req *req;
385 {
386 	vifi_t vifi = req->vifi;
387 
388 	if (vifi >= numvifs)
389 		return (EINVAL);
390 
391 	req->icount = viftable[vifi].v_pkt_in;
392 	req->ocount = viftable[vifi].v_pkt_out;
393 	req->ibytes = viftable[vifi].v_bytes_in;
394 	req->obytes = viftable[vifi].v_bytes_out;
395 
396 	return (0);
397 }
398 
399 /*
400  * Enable multicast routing
401  */
402 static int
403 ip_mrouter_init(so, m)
404 	struct socket *so;
405 	struct mbuf *m;
406 {
407 	int *v;
408 
409 	if (mrtdebug)
410 		log(LOG_DEBUG,
411 		    "ip_mrouter_init: so_type = %d, pr_protocol = %d\n",
412 		    so->so_type, so->so_proto->pr_protocol);
413 
414 	if (so->so_type != SOCK_RAW ||
415 	    so->so_proto->pr_protocol != IPPROTO_IGMP)
416 		return (EOPNOTSUPP);
417 
418 	if (m == 0 || m->m_len < sizeof(int))
419 		return (EINVAL);
420 
421 	v = mtod(m, int *);
422 	if (*v != 1)
423 		return (EINVAL);
424 
425 	if (ip_mrouter != 0)
426 		return (EADDRINUSE);
427 
428 	ip_mrouter = so;
429 
430 	mfchashtbl =
431 	    hashinit(MFCTBLSIZ, HASH_LIST, M_MRTABLE, M_WAITOK, &mfchash);
432 	bzero((caddr_t)nexpire, sizeof(nexpire));
433 
434 	pim_assert = 0;
435 
436 	callout_init(&expire_upcalls_ch);
437 	callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT,
438 	    expire_upcalls, NULL);
439 
440 	if (mrtdebug)
441 		log(LOG_DEBUG, "ip_mrouter_init\n");
442 
443 	return (0);
444 }
445 
446 /*
447  * Disable multicast routing
448  */
449 int
450 ip_mrouter_done()
451 {
452 	vifi_t vifi;
453 	struct vif *vifp;
454 	int i;
455 	int s;
456 
457 	s = splsoftnet();
458 
459 	/* Clear out all the vifs currently in use. */
460 	for (vifi = 0; vifi < numvifs; vifi++) {
461 		vifp = &viftable[vifi];
462 		if (!in_nullhost(vifp->v_lcl_addr))
463 			reset_vif(vifp);
464 	}
465 
466 	numvifs = 0;
467 	pim_assert = 0;
468 
469 	callout_stop(&expire_upcalls_ch);
470 
471 	/*
472 	 * Free all multicast forwarding cache entries.
473 	 */
474 	for (i = 0; i < MFCTBLSIZ; i++) {
475 		struct mfc *rt, *nrt;
476 
477 		for (rt = LIST_FIRST(&mfchashtbl[i]); rt; rt = nrt) {
478 			nrt = LIST_NEXT(rt, mfc_hash);
479 
480 			expire_mfc(rt);
481 		}
482 	}
483 
484 	free(mfchashtbl, M_MRTABLE);
485 	mfchashtbl = 0;
486 
487 	/* Reset de-encapsulation cache. */
488 	have_encap_tunnel = 0;
489 
490 	ip_mrouter = 0;
491 
492 	splx(s);
493 
494 	if (mrtdebug)
495 		log(LOG_DEBUG, "ip_mrouter_done\n");
496 
497 	return (0);
498 }
499 
500 static int
501 get_version(m)
502 	struct mbuf *m;
503 {
504 	int *v = mtod(m, int *);
505 
506 	*v = 0x0305;	/* XXX !!!! */
507 	m->m_len = sizeof(int);
508 	return (0);
509 }
510 
511 /*
512  * Set PIM assert processing global
513  */
514 static int
515 set_assert(m)
516 	struct mbuf *m;
517 {
518 	int *i;
519 
520 	if (m == 0 || m->m_len < sizeof(int))
521 		return (EINVAL);
522 
523 	i = mtod(m, int *);
524 	pim_assert = !!*i;
525 	return (0);
526 }
527 
528 /*
529  * Get PIM assert processing global
530  */
531 static int
532 get_assert(m)
533 	struct mbuf *m;
534 {
535 	int *i = mtod(m, int *);
536 
537 	*i = pim_assert;
538 	m->m_len = sizeof(int);
539 	return (0);
540 }
541 
542 static struct sockaddr_in sin = { sizeof(sin), AF_INET };
543 
544 /*
545  * Add a vif to the vif table
546  */
547 static int
548 add_vif(m)
549 	struct mbuf *m;
550 {
551 	struct vifctl *vifcp;
552 	struct vif *vifp;
553 	struct ifaddr *ifa;
554 	struct ifnet *ifp;
555 	struct ifreq ifr;
556 	int error, s;
557 
558 	if (m == 0 || m->m_len < sizeof(struct vifctl))
559 		return (EINVAL);
560 
561 	vifcp = mtod(m, struct vifctl *);
562 	if (vifcp->vifc_vifi >= MAXVIFS)
563 		return (EINVAL);
564 
565 	vifp = &viftable[vifcp->vifc_vifi];
566 	if (!in_nullhost(vifp->v_lcl_addr))
567 		return (EADDRINUSE);
568 
569 	/* Find the interface with an address in AF_INET family. */
570 	sin.sin_addr = vifcp->vifc_lcl_addr;
571 	ifa = ifa_ifwithaddr(sintosa(&sin));
572 	if (ifa == 0)
573 		return (EADDRNOTAVAIL);
574 
575 	if (vifcp->vifc_flags & VIFF_TUNNEL) {
576 		if (vifcp->vifc_flags & VIFF_SRCRT) {
577 			log(LOG_ERR, "Source routed tunnels not supported\n");
578 			return (EOPNOTSUPP);
579 		}
580 
581 		/* attach this vif to decapsulator dispatch table */
582 		vifp->v_encap_cookie = encap_attach_func(AF_INET, IPPROTO_IPV4,
583 		    vif_encapcheck, &vif_protosw, vifp);
584 		if (!vifp->v_encap_cookie)
585 			return (EINVAL);
586 
587 		/* Create a fake encapsulation interface. */
588 		ifp = (struct ifnet *)malloc(sizeof(*ifp), M_MRTABLE, M_WAITOK);
589 		bzero(ifp, sizeof(*ifp));
590 		sprintf(ifp->if_xname, "mdecap%d", vifcp->vifc_vifi);
591 
592 		/* Prepare cached route entry. */
593 		bzero(&vifp->v_route, sizeof(vifp->v_route));
594 
595 		/*
596 		 * Tell mrt_ipip_input() to start looking at encapsulated
597 		 * packets.
598 		 */
599 		have_encap_tunnel = 1;
600 	} else {
601 		/* Use the physical interface associated with the address. */
602 		ifp = ifa->ifa_ifp;
603 
604 		/* Make sure the interface supports multicast. */
605 		if ((ifp->if_flags & IFF_MULTICAST) == 0)
606 			return (EOPNOTSUPP);
607 
608 		/* Enable promiscuous reception of all IP multicasts. */
609 		satosin(&ifr.ifr_addr)->sin_len = sizeof(struct sockaddr_in);
610 		satosin(&ifr.ifr_addr)->sin_family = AF_INET;
611 		satosin(&ifr.ifr_addr)->sin_addr = zeroin_addr;
612 		error = (*ifp->if_ioctl)(ifp, SIOCADDMULTI, (caddr_t)&ifr);
613 		if (error)
614 			return (error);
615 	}
616 
617 	s = splsoftnet();
618 
619 	/* Define parameters for the tbf structure. */
620 	vifp->tbf_q = 0;
621 	vifp->tbf_t = &vifp->tbf_q;
622 	microtime(&vifp->tbf_last_pkt_t);
623 	vifp->tbf_n_tok = 0;
624 	vifp->tbf_q_len = 0;
625 	vifp->tbf_max_q_len = MAXQSIZE;
626 
627 	vifp->v_flags = vifcp->vifc_flags;
628 	vifp->v_threshold = vifcp->vifc_threshold;
629 	/* scaling up here allows division by 1024 in critical code */
630 	vifp->v_rate_limit = vifcp->vifc_rate_limit * 1024 / 1000;
631 	vifp->v_lcl_addr = vifcp->vifc_lcl_addr;
632 	vifp->v_rmt_addr = vifcp->vifc_rmt_addr;
633 	vifp->v_ifp = ifp;
634 	/* Initialize per vif pkt counters. */
635 	vifp->v_pkt_in = 0;
636 	vifp->v_pkt_out = 0;
637 	vifp->v_bytes_in = 0;
638 	vifp->v_bytes_out = 0;
639 
640 	callout_init(&vifp->v_repq_ch);
641 
642 #ifdef RSVP_ISI
643 	vifp->v_rsvp_on = 0;
644 	vifp->v_rsvpd = 0;
645 #endif /* RSVP_ISI */
646 
647 	splx(s);
648 
649 	/* Adjust numvifs up if the vifi is higher than numvifs. */
650 	if (numvifs <= vifcp->vifc_vifi)
651 		numvifs = vifcp->vifc_vifi + 1;
652 
653 	if (mrtdebug)
654 		log(LOG_DEBUG, "add_vif #%d, lcladdr %x, %s %x, thresh %x, rate %d\n",
655 		    vifcp->vifc_vifi,
656 		    ntohl(vifcp->vifc_lcl_addr.s_addr),
657 		    (vifcp->vifc_flags & VIFF_TUNNEL) ? "rmtaddr" : "mask",
658 		    ntohl(vifcp->vifc_rmt_addr.s_addr),
659 		    vifcp->vifc_threshold,
660 		    vifcp->vifc_rate_limit);
661 
662 	return (0);
663 }
664 
665 void
666 reset_vif(vifp)
667 	struct vif *vifp;
668 {
669 	struct mbuf *m, *n;
670 	struct ifnet *ifp;
671 	struct ifreq ifr;
672 
673 	callout_stop(&vifp->v_repq_ch);
674 
675 	/* detach this vif from decapsulator dispatch table */
676 	encap_detach(vifp->v_encap_cookie);
677 	vifp->v_encap_cookie = NULL;
678 
679 	for (m = vifp->tbf_q; m != 0; m = n) {
680 		n = m->m_nextpkt;
681 		m_freem(m);
682 	}
683 
684 	if (vifp->v_flags & VIFF_TUNNEL) {
685 		free(vifp->v_ifp, M_MRTABLE);
686 		if (vifp == last_encap_vif) {
687 			last_encap_vif = 0;
688 			last_encap_src = zeroin_addr;
689 		}
690 	} else {
691 		satosin(&ifr.ifr_addr)->sin_len = sizeof(struct sockaddr_in);
692 		satosin(&ifr.ifr_addr)->sin_family = AF_INET;
693 		satosin(&ifr.ifr_addr)->sin_addr = zeroin_addr;
694 		ifp = vifp->v_ifp;
695 		(*ifp->if_ioctl)(ifp, SIOCDELMULTI, (caddr_t)&ifr);
696 	}
697 	bzero((caddr_t)vifp, sizeof(*vifp));
698 }
699 
700 /*
701  * Delete a vif from the vif table
702  */
703 static int
704 del_vif(m)
705 	struct mbuf *m;
706 {
707 	vifi_t *vifip;
708 	struct vif *vifp;
709 	vifi_t vifi;
710 	int s;
711 
712 	if (m == 0 || m->m_len < sizeof(vifi_t))
713 		return (EINVAL);
714 
715 	vifip = mtod(m, vifi_t *);
716 	if (*vifip >= numvifs)
717 		return (EINVAL);
718 
719 	vifp = &viftable[*vifip];
720 	if (in_nullhost(vifp->v_lcl_addr))
721 		return (EADDRNOTAVAIL);
722 
723 	s = splsoftnet();
724 
725 	reset_vif(vifp);
726 
727 	/* Adjust numvifs down */
728 	for (vifi = numvifs; vifi > 0; vifi--)
729 		if (!in_nullhost(viftable[vifi-1].v_lcl_addr))
730 			break;
731 	numvifs = vifi;
732 
733 	splx(s);
734 
735 	if (mrtdebug)
736 		log(LOG_DEBUG, "del_vif %d, numvifs %d\n", *vifip, numvifs);
737 
738 	return (0);
739 }
740 
741 static void
742 update_mfc(mfccp, rt)
743 	struct mfcctl *mfccp;
744 	struct mfc *rt;
745 {
746 	vifi_t vifi;
747 
748 	rt->mfc_parent = mfccp->mfcc_parent;
749 	for (vifi = 0; vifi < numvifs; vifi++)
750 		rt->mfc_ttls[vifi] = mfccp->mfcc_ttls[vifi];
751 	rt->mfc_expire = 0;
752 	rt->mfc_stall = 0;
753 }
754 
755 static void
756 expire_mfc(rt)
757 	struct mfc *rt;
758 {
759 	struct rtdetq *rte, *nrte;
760 
761 	for (rte = rt->mfc_stall; rte != 0; rte = nrte) {
762 		nrte = rte->next;
763 		m_freem(rte->m);
764 		free(rte, M_MRTABLE);
765 	}
766 
767 	LIST_REMOVE(rt, mfc_hash);
768 	free(rt, M_MRTABLE);
769 }
770 
771 /*
772  * Add an mfc entry
773  */
774 static int
775 add_mfc(m)
776 	struct mbuf *m;
777 {
778 	struct mfcctl *mfccp;
779 	struct mfc *rt;
780 	u_int32_t hash = 0;
781 	struct rtdetq *rte, *nrte;
782 	u_short nstl;
783 	int s;
784 
785 	if (m == 0 || m->m_len < sizeof(struct mfcctl))
786 		return (EINVAL);
787 
788 	mfccp = mtod(m, struct mfcctl *);
789 
790 	s = splsoftnet();
791 	MFCFIND(mfccp->mfcc_origin, mfccp->mfcc_mcastgrp, rt);
792 
793 	/* If an entry already exists, just update the fields */
794 	if (rt) {
795 		if (mrtdebug & DEBUG_MFC)
796 			log(LOG_DEBUG,"add_mfc update o %x g %x p %x\n",
797 			    ntohl(mfccp->mfcc_origin.s_addr),
798 			    ntohl(mfccp->mfcc_mcastgrp.s_addr),
799 			    mfccp->mfcc_parent);
800 
801 		if (rt->mfc_expire)
802 			nexpire[hash]--;
803 
804 		update_mfc(mfccp, rt);
805 
806 		splx(s);
807 		return (0);
808 	}
809 
810 	/*
811 	 * Find the entry for which the upcall was made and update
812 	 */
813 	nstl = 0;
814 	hash = MFCHASH(mfccp->mfcc_origin, mfccp->mfcc_mcastgrp);
815 	LIST_FOREACH(rt, &mfchashtbl[hash], mfc_hash) {
816 		if (in_hosteq(rt->mfc_origin, mfccp->mfcc_origin) &&
817 		    in_hosteq(rt->mfc_mcastgrp, mfccp->mfcc_mcastgrp) &&
818 		    rt->mfc_stall != 0) {
819 			if (nstl++)
820 				log(LOG_ERR, "add_mfc %s o %x g %x p %x dbx %p\n",
821 				    "multiple kernel entries",
822 				    ntohl(mfccp->mfcc_origin.s_addr),
823 				    ntohl(mfccp->mfcc_mcastgrp.s_addr),
824 				    mfccp->mfcc_parent, rt->mfc_stall);
825 
826 			if (mrtdebug & DEBUG_MFC)
827 				log(LOG_DEBUG,"add_mfc o %x g %x p %x dbg %p\n",
828 				    ntohl(mfccp->mfcc_origin.s_addr),
829 				    ntohl(mfccp->mfcc_mcastgrp.s_addr),
830 				    mfccp->mfcc_parent, rt->mfc_stall);
831 
832 			if (rt->mfc_expire)
833 				nexpire[hash]--;
834 
835 			rte = rt->mfc_stall;
836 			update_mfc(mfccp, rt);
837 
838 			/* free packets Qed at the end of this entry */
839 			for (; rte != 0; rte = nrte) {
840 				nrte = rte->next;
841 #ifdef RSVP_ISI
842 				ip_mdq(rte->m, rte->ifp, rt, -1);
843 #else
844 				ip_mdq(rte->m, rte->ifp, rt);
845 #endif /* RSVP_ISI */
846 				m_freem(rte->m);
847 #ifdef UPCALL_TIMING
848 				collate(&rte->t);
849 #endif /* UPCALL_TIMING */
850 				free(rte, M_MRTABLE);
851 			}
852 		}
853 	}
854 
855 	if (nstl == 0) {
856 		/*
857 		 * No mfc; make a new one
858 		 */
859 		if (mrtdebug & DEBUG_MFC)
860 			log(LOG_DEBUG,"add_mfc no upcall o %x g %x p %x\n",
861 			    ntohl(mfccp->mfcc_origin.s_addr),
862 			    ntohl(mfccp->mfcc_mcastgrp.s_addr),
863 			    mfccp->mfcc_parent);
864 
865 		rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT);
866 		if (rt == 0) {
867 			splx(s);
868 			return (ENOBUFS);
869 		}
870 
871 		rt->mfc_origin = mfccp->mfcc_origin;
872 		rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp;
873 		/* initialize pkt counters per src-grp */
874 		rt->mfc_pkt_cnt = 0;
875 		rt->mfc_byte_cnt = 0;
876 		rt->mfc_wrong_if = 0;
877 		timerclear(&rt->mfc_last_assert);
878 		update_mfc(mfccp, rt);
879 
880 		/* insert new entry at head of hash chain */
881 		LIST_INSERT_HEAD(&mfchashtbl[hash], rt, mfc_hash);
882 	}
883 
884 	splx(s);
885 	return (0);
886 }
887 
888 #ifdef UPCALL_TIMING
889 /*
890  * collect delay statistics on the upcalls
891  */
892 static void collate(t)
893 struct timeval *t;
894 {
895     u_int32_t d;
896     struct timeval tp;
897     u_int32_t delta;
898 
899     microtime(&tp);
900 
901     if (timercmp(t, &tp, <)) {
902 	TV_DELTA(tp, *t, delta);
903 
904 	d = delta >> 10;
905 	if (d > 50)
906 	    d = 50;
907 
908 	++upcall_data[d];
909     }
910 }
911 #endif /* UPCALL_TIMING */
912 
913 /*
914  * Delete an mfc entry
915  */
916 static int
917 del_mfc(m)
918 	struct mbuf *m;
919 {
920 	struct mfcctl *mfccp;
921 	struct mfc *rt;
922 	int s;
923 
924 	if (m == 0 || m->m_len < sizeof(struct mfcctl))
925 		return (EINVAL);
926 
927 	mfccp = mtod(m, struct mfcctl *);
928 
929 	if (mrtdebug & DEBUG_MFC)
930 		log(LOG_DEBUG, "del_mfc origin %x mcastgrp %x\n",
931 		    ntohl(mfccp->mfcc_origin.s_addr),
932 		    ntohl(mfccp->mfcc_mcastgrp.s_addr));
933 
934 	s = splsoftnet();
935 
936 	MFCFIND(mfccp->mfcc_origin, mfccp->mfcc_mcastgrp, rt);
937 	if (rt == 0) {
938 		splx(s);
939 		return (EADDRNOTAVAIL);
940 	}
941 
942 	LIST_REMOVE(rt, mfc_hash);
943 	free(rt, M_MRTABLE);
944 
945 	splx(s);
946 	return (0);
947 }
948 
949 static int
950 socket_send(s, mm, src)
951     struct socket *s;
952     struct mbuf *mm;
953     struct sockaddr_in *src;
954 {
955     if (s) {
956 	if (sbappendaddr(&s->so_rcv, sintosa(src), mm, (struct mbuf *)0) != 0) {
957 	    sorwakeup(s);
958 	    return (0);
959 	}
960     }
961     m_freem(mm);
962     return (-1);
963 }
964 
965 /*
966  * IP multicast forwarding function. This function assumes that the packet
967  * pointed to by "ip" has arrived on (or is about to be sent to) the interface
968  * pointed to by "ifp", and the packet is to be relayed to other networks
969  * that have members of the packet's destination IP multicast group.
970  *
971  * The packet is returned unscathed to the caller, unless it is
972  * erroneous, in which case a non-zero return value tells the caller to
973  * discard it.
974  */
975 
976 #define IP_HDR_LEN  20	/* # bytes of fixed IP header (excluding options) */
977 #define TUNNEL_LEN  12  /* # bytes of IP option for tunnel encapsulation  */
978 
979 int
980 #ifdef RSVP_ISI
981 ip_mforward(m, ifp, imo)
982 #else
983 ip_mforward(m, ifp)
984 #endif /* RSVP_ISI */
985     struct mbuf *m;
986     struct ifnet *ifp;
987 #ifdef RSVP_ISI
988     struct ip_moptions *imo;
989 #endif /* RSVP_ISI */
990 {
991     struct ip *ip = mtod(m, struct ip *);
992     struct mfc *rt;
993     u_char *ipoptions;
994     static int srctun = 0;
995     struct mbuf *mm;
996     int s;
997 #ifdef RSVP_ISI
998     struct vif *vifp;
999     vifi_t vifi;
1000 #endif /* RSVP_ISI */
1001 
1002     /*
1003      * Clear any in-bound checksum flags for this packet.
1004      */
1005     m->m_pkthdr.csum_flags = 0;
1006 
1007     if (mrtdebug & DEBUG_FORWARD)
1008 	log(LOG_DEBUG, "ip_mforward: src %x, dst %x, ifp %p\n",
1009 	    ntohl(ip->ip_src.s_addr), ntohl(ip->ip_dst.s_addr), ifp);
1010 
1011     if (ip->ip_hl < (IP_HDR_LEN + TUNNEL_LEN) >> 2 ||
1012 	(ipoptions = (u_char *)(ip + 1))[1] != IPOPT_LSRR) {
1013 	/*
1014 	 * Packet arrived via a physical interface or
1015 	 * an encapuslated tunnel.
1016 	 */
1017     } else {
1018 	/*
1019 	 * Packet arrived through a source-route tunnel.
1020 	 * Source-route tunnels are no longer supported.
1021 	 */
1022 	if ((srctun++ % 1000) == 0)
1023 	    log(LOG_ERR, "ip_mforward: received source-routed packet from %x\n",
1024 		ntohl(ip->ip_src.s_addr));
1025 
1026 	return (1);
1027     }
1028 
1029 #ifdef RSVP_ISI
1030     if (imo && ((vifi = imo->imo_multicast_vif) < numvifs)) {
1031 	if (ip->ip_ttl < 255)
1032 	    ip->ip_ttl++;	/* compensate for -1 in *_send routines */
1033 	if (rsvpdebug && ip->ip_p == IPPROTO_RSVP) {
1034 	    vifp = viftable + vifi;
1035 	    printf("Sending IPPROTO_RSVP from %x to %x on vif %d (%s%s)\n",
1036 		ntohl(ip->ip_src), ntohl(ip->ip_dst), vifi,
1037 		(vifp->v_flags & VIFF_TUNNEL) ? "tunnel on " : "",
1038 		vifp->v_ifp->if_xname);
1039 	}
1040 	return (ip_mdq(m, ifp, (struct mfc *)0, vifi));
1041     }
1042     if (rsvpdebug && ip->ip_p == IPPROTO_RSVP) {
1043 	printf("Warning: IPPROTO_RSVP from %x to %x without vif option\n",
1044 	    ntohl(ip->ip_src), ntohl(ip->ip_dst));
1045     }
1046 #endif /* RSVP_ISI */
1047 
1048     /*
1049      * Don't forward a packet with time-to-live of zero or one,
1050      * or a packet destined to a local-only group.
1051      */
1052     if (ip->ip_ttl <= 1 ||
1053 	IN_LOCAL_GROUP(ip->ip_dst.s_addr))
1054 	return (0);
1055 
1056     /*
1057      * Determine forwarding vifs from the forwarding cache table
1058      */
1059     s = splsoftnet();
1060     MFCFIND(ip->ip_src, ip->ip_dst, rt);
1061 
1062     /* Entry exists, so forward if necessary */
1063     if (rt != 0) {
1064 	splx(s);
1065 #ifdef RSVP_ISI
1066 	return (ip_mdq(m, ifp, rt, -1));
1067 #else
1068 	return (ip_mdq(m, ifp, rt));
1069 #endif /* RSVP_ISI */
1070     } else {
1071 	/*
1072 	 * If we don't have a route for packet's origin,
1073 	 * Make a copy of the packet &
1074 	 * send message to routing daemon
1075 	 */
1076 
1077 	struct mbuf *mb0;
1078 	struct rtdetq *rte;
1079 	u_int32_t hash;
1080 	int hlen = ip->ip_hl << 2;
1081 #ifdef UPCALL_TIMING
1082 	struct timeval tp;
1083 
1084 	microtime(&tp);
1085 #endif /* UPCALL_TIMING */
1086 
1087 	mrtstat.mrts_no_route++;
1088 	if (mrtdebug & (DEBUG_FORWARD | DEBUG_MFC))
1089 	    log(LOG_DEBUG, "ip_mforward: no rte s %x g %x\n",
1090 		ntohl(ip->ip_src.s_addr),
1091 		ntohl(ip->ip_dst.s_addr));
1092 
1093 	/*
1094 	 * Allocate mbufs early so that we don't do extra work if we are
1095 	 * just going to fail anyway.  Make sure to pullup the header so
1096 	 * that other people can't step on it.
1097 	 */
1098 	rte = (struct rtdetq *)malloc(sizeof(*rte), M_MRTABLE, M_NOWAIT);
1099 	if (rte == 0) {
1100 	    splx(s);
1101 	    return (ENOBUFS);
1102 	}
1103 	mb0 = m_copy(m, 0, M_COPYALL);
1104 	M_PULLUP(mb0, hlen);
1105 	if (mb0 == 0) {
1106 	    free(rte, M_MRTABLE);
1107 	    splx(s);
1108 	    return (ENOBUFS);
1109 	}
1110 
1111 	/* is there an upcall waiting for this packet? */
1112 	hash = MFCHASH(ip->ip_src, ip->ip_dst);
1113 	LIST_FOREACH(rt, &mfchashtbl[hash], mfc_hash) {
1114 	    if (in_hosteq(ip->ip_src, rt->mfc_origin) &&
1115 		in_hosteq(ip->ip_dst, rt->mfc_mcastgrp) &&
1116 		rt->mfc_stall != 0)
1117 		break;
1118 	}
1119 
1120 	if (rt == 0) {
1121 	    int i;
1122 	    struct igmpmsg *im;
1123 
1124 	    /* no upcall, so make a new entry */
1125 	    rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT);
1126 	    if (rt == 0) {
1127 		free(rte, M_MRTABLE);
1128 		m_freem(mb0);
1129 		splx(s);
1130 		return (ENOBUFS);
1131 	    }
1132 	    /* Make a copy of the header to send to the user level process */
1133 	    mm = m_copy(m, 0, hlen);
1134 	    M_PULLUP(mm, hlen);
1135 	    if (mm == 0) {
1136 		free(rte, M_MRTABLE);
1137 		m_freem(mb0);
1138 		free(rt, M_MRTABLE);
1139 		splx(s);
1140 		return (ENOBUFS);
1141 	    }
1142 
1143 	    /*
1144 	     * Send message to routing daemon to install
1145 	     * a route into the kernel table
1146 	     */
1147 	    sin.sin_addr = ip->ip_src;
1148 
1149 	    im = mtod(mm, struct igmpmsg *);
1150 	    im->im_msgtype	= IGMPMSG_NOCACHE;
1151 	    im->im_mbz		= 0;
1152 
1153 	    mrtstat.mrts_upcalls++;
1154 
1155 	    if (socket_send(ip_mrouter, mm, &sin) < 0) {
1156 		log(LOG_WARNING, "ip_mforward: ip_mrouter socket queue full\n");
1157 		++mrtstat.mrts_upq_sockfull;
1158 		free(rte, M_MRTABLE);
1159 		m_freem(mb0);
1160 		free(rt, M_MRTABLE);
1161 		splx(s);
1162 		return (ENOBUFS);
1163 	    }
1164 
1165 	    /* insert new entry at head of hash chain */
1166 	    rt->mfc_origin = ip->ip_src;
1167 	    rt->mfc_mcastgrp = ip->ip_dst;
1168 	    rt->mfc_pkt_cnt = 0;
1169 	    rt->mfc_byte_cnt = 0;
1170 	    rt->mfc_wrong_if = 0;
1171 	    rt->mfc_expire = UPCALL_EXPIRE;
1172 	    nexpire[hash]++;
1173 	    for (i = 0; i < numvifs; i++)
1174 		rt->mfc_ttls[i] = 0;
1175 	    rt->mfc_parent = -1;
1176 
1177 	    /* link into table */
1178 	    LIST_INSERT_HEAD(&mfchashtbl[hash], rt, mfc_hash);
1179 	    /* Add this entry to the end of the queue */
1180 	    rt->mfc_stall = rte;
1181 	} else {
1182 	    /* determine if q has overflowed */
1183 	    struct rtdetq **p;
1184 	    int npkts = 0;
1185 
1186 	    for (p = &rt->mfc_stall; *p != 0; p = &(*p)->next)
1187 		if (++npkts > MAX_UPQ) {
1188 		    mrtstat.mrts_upq_ovflw++;
1189 		    free(rte, M_MRTABLE);
1190 		    m_freem(mb0);
1191 		    splx(s);
1192 		    return (0);
1193 	        }
1194 
1195 	    /* Add this entry to the end of the queue */
1196 	    *p = rte;
1197 	}
1198 
1199 	rte->next		= 0;
1200 	rte->m 			= mb0;
1201 	rte->ifp 		= ifp;
1202 #ifdef UPCALL_TIMING
1203 	rte->t			= tp;
1204 #endif /* UPCALL_TIMING */
1205 
1206 
1207 	splx(s);
1208 
1209 	return (0);
1210     }
1211 }
1212 
1213 
1214 /*ARGSUSED*/
1215 static void
1216 expire_upcalls(v)
1217 	void *v;
1218 {
1219 	int i;
1220 	int s;
1221 
1222 	s = splsoftnet();
1223 
1224 	for (i = 0; i < MFCTBLSIZ; i++) {
1225 		struct mfc *rt, *nrt;
1226 
1227 		if (nexpire[i] == 0)
1228 			continue;
1229 
1230 		for (rt = LIST_FIRST(&mfchashtbl[i]); rt; rt = nrt) {
1231 			nrt = LIST_NEXT(rt, mfc_hash);
1232 
1233 			if (rt->mfc_expire == 0 ||
1234 			    --rt->mfc_expire > 0)
1235 				continue;
1236 			nexpire[i]--;
1237 
1238 			++mrtstat.mrts_cache_cleanups;
1239 			if (mrtdebug & DEBUG_EXPIRE)
1240 				log(LOG_DEBUG,
1241 				    "expire_upcalls: expiring (%x %x)\n",
1242 				    ntohl(rt->mfc_origin.s_addr),
1243 				    ntohl(rt->mfc_mcastgrp.s_addr));
1244 
1245 			expire_mfc(rt);
1246 		}
1247 	}
1248 
1249 	splx(s);
1250 	callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT,
1251 	    expire_upcalls, NULL);
1252 }
1253 
1254 /*
1255  * Packet forwarding routine once entry in the cache is made
1256  */
1257 static int
1258 #ifdef RSVP_ISI
1259 ip_mdq(m, ifp, rt, xmt_vif)
1260 #else
1261 ip_mdq(m, ifp, rt)
1262 #endif /* RSVP_ISI */
1263     struct mbuf *m;
1264     struct ifnet *ifp;
1265     struct mfc *rt;
1266 #ifdef RSVP_ISI
1267     vifi_t xmt_vif;
1268 #endif /* RSVP_ISI */
1269 {
1270     struct ip  *ip = mtod(m, struct ip *);
1271     vifi_t vifi;
1272     struct vif *vifp;
1273     int plen = ntohs(ip->ip_len);
1274 
1275 /*
1276  * Macro to send packet on vif.  Since RSVP packets don't get counted on
1277  * input, they shouldn't get counted on output, so statistics keeping is
1278  * separate.
1279  */
1280 #define MC_SEND(ip,vifp,m) {                             \
1281                 if ((vifp)->v_flags & VIFF_TUNNEL)	 \
1282                     encap_send((ip), (vifp), (m));       \
1283                 else                                     \
1284                     phyint_send((ip), (vifp), (m));      \
1285 }
1286 
1287 #ifdef RSVP_ISI
1288     /*
1289      * If xmt_vif is not -1, send on only the requested vif.
1290      *
1291      * (since vifi_t is u_short, -1 becomes MAXUSHORT, which > numvifs.
1292      */
1293     if (xmt_vif < numvifs) {
1294         MC_SEND(ip, viftable + xmt_vif, m);
1295 	return (1);
1296     }
1297 #endif /* RSVP_ISI */
1298 
1299     /*
1300      * Don't forward if it didn't arrive from the parent vif for its origin.
1301      */
1302     vifi = rt->mfc_parent;
1303     if ((vifi >= numvifs) || (viftable[vifi].v_ifp != ifp)) {
1304 	/* came in the wrong interface */
1305 	if (mrtdebug & DEBUG_FORWARD)
1306 	    log(LOG_DEBUG, "wrong if: ifp %p vifi %d vififp %p\n",
1307 		ifp, vifi, viftable[vifi].v_ifp);
1308 	++mrtstat.mrts_wrong_if;
1309 	++rt->mfc_wrong_if;
1310 	/*
1311 	 * If we are doing PIM assert processing, and we are forwarding
1312 	 * packets on this interface, and it is a broadcast medium
1313 	 * interface (and not a tunnel), send a message to the routing daemon.
1314 	 */
1315 	if (pim_assert && rt->mfc_ttls[vifi] &&
1316 		(ifp->if_flags & IFF_BROADCAST) &&
1317 		!(viftable[vifi].v_flags & VIFF_TUNNEL)) {
1318 	    struct mbuf *mm;
1319 	    struct igmpmsg *im;
1320 	    int hlen = ip->ip_hl << 2;
1321 	    struct timeval now;
1322 	    u_int32_t delta;
1323 
1324 	    microtime(&now);
1325 
1326 	    TV_DELTA(rt->mfc_last_assert, now, delta);
1327 
1328 	    if (delta > ASSERT_MSG_TIME) {
1329 		mm = m_copy(m, 0, hlen);
1330 		M_PULLUP(mm, hlen);
1331 		if (mm == 0) {
1332 		    return (ENOBUFS);
1333 		}
1334 
1335 		rt->mfc_last_assert = now;
1336 
1337 		im = mtod(mm, struct igmpmsg *);
1338 		im->im_msgtype	= IGMPMSG_WRONGVIF;
1339 		im->im_mbz	= 0;
1340 		im->im_vif	= vifi;
1341 
1342 		sin.sin_addr = im->im_src;
1343 
1344 		socket_send(ip_mrouter, mm, &sin);
1345 	    }
1346 	}
1347 	return (0);
1348     }
1349 
1350     /* If I sourced this packet, it counts as output, else it was input. */
1351     if (in_hosteq(ip->ip_src, viftable[vifi].v_lcl_addr)) {
1352 	viftable[vifi].v_pkt_out++;
1353 	viftable[vifi].v_bytes_out += plen;
1354     } else {
1355 	viftable[vifi].v_pkt_in++;
1356 	viftable[vifi].v_bytes_in += plen;
1357     }
1358     rt->mfc_pkt_cnt++;
1359     rt->mfc_byte_cnt += plen;
1360 
1361     /*
1362      * For each vif, decide if a copy of the packet should be forwarded.
1363      * Forward if:
1364      *		- the ttl exceeds the vif's threshold
1365      *		- there are group members downstream on interface
1366      */
1367     for (vifp = viftable, vifi = 0; vifi < numvifs; vifp++, vifi++)
1368 	if ((rt->mfc_ttls[vifi] > 0) &&
1369 	    (ip->ip_ttl > rt->mfc_ttls[vifi])) {
1370 	    vifp->v_pkt_out++;
1371 	    vifp->v_bytes_out += plen;
1372 	    MC_SEND(ip, vifp, m);
1373 	}
1374 
1375     return (0);
1376 }
1377 
#ifdef RSVP_ISI
/*
 * Tell whether `vif' is a usable vif index, i.e. 0 <= vif < numvifs.
 * Returns 1 when it is and 0 when it is not.  Provided so that ip_output
 * can validate a vif number without having numvifs exported to it.
 */
int
legal_vif_num(vif)
    int vif;
{
    return (vif >= 0 && vif < numvifs);
}
#endif /* RSVP_ISI */
1393 
1394 static void
1395 phyint_send(ip, vifp, m)
1396 	struct ip *ip;
1397 	struct vif *vifp;
1398 	struct mbuf *m;
1399 {
1400 	struct mbuf *mb_copy;
1401 	int hlen = ip->ip_hl << 2;
1402 
1403 	/*
1404 	 * Make a new reference to the packet; make sure that
1405 	 * the IP header is actually copied, not just referenced,
1406 	 * so that ip_output() only scribbles on the copy.
1407 	 */
1408 	mb_copy = m_copy(m, 0, M_COPYALL);
1409 	M_PULLUP(mb_copy, hlen);
1410 	if (mb_copy == 0)
1411 		return;
1412 
1413 	if (vifp->v_rate_limit <= 0)
1414 		tbf_send_packet(vifp, mb_copy);
1415 	else
1416 		tbf_control(vifp, mb_copy, mtod(mb_copy, struct ip *), ip->ip_len);
1417 }
1418 
1419 static void
1420 encap_send(ip, vifp, m)
1421 	struct ip *ip;
1422 	struct vif *vifp;
1423 	struct mbuf *m;
1424 {
1425 	struct mbuf *mb_copy;
1426 	struct ip *ip_copy;
1427 	int i, len = ip->ip_len + sizeof(multicast_encap_iphdr);
1428 
1429 	/*
1430 	 * copy the old packet & pullup it's IP header into the
1431 	 * new mbuf so we can modify it.  Try to fill the new
1432 	 * mbuf since if we don't the ethernet driver will.
1433 	 */
1434 	MGETHDR(mb_copy, M_DONTWAIT, MT_DATA);
1435 	if (mb_copy == 0)
1436 		return;
1437 	mb_copy->m_data += max_linkhdr;
1438 	mb_copy->m_pkthdr.len = len;
1439 	mb_copy->m_len = sizeof(multicast_encap_iphdr);
1440 
1441 	if ((mb_copy->m_next = m_copy(m, 0, M_COPYALL)) == 0) {
1442 		m_freem(mb_copy);
1443 		return;
1444 	}
1445 	i = MHLEN - max_linkhdr;
1446 	if (i > len)
1447 		i = len;
1448 	mb_copy = m_pullup(mb_copy, i);
1449 	if (mb_copy == 0)
1450 		return;
1451 
1452 	/*
1453 	 * fill in the encapsulating IP header.
1454 	 */
1455 	ip_copy = mtod(mb_copy, struct ip *);
1456 	*ip_copy = multicast_encap_iphdr;
1457 	ip_copy->ip_id = htons(ip_id++);
1458 	ip_copy->ip_len = len;
1459 	ip_copy->ip_src = vifp->v_lcl_addr;
1460 	ip_copy->ip_dst = vifp->v_rmt_addr;
1461 
1462 	/*
1463 	 * turn the encapsulated IP header back into a valid one.
1464 	 */
1465 	ip = (struct ip *)((caddr_t)ip_copy + sizeof(multicast_encap_iphdr));
1466 	--ip->ip_ttl;
1467 	HTONS(ip->ip_len);
1468 	HTONS(ip->ip_off);
1469 	ip->ip_sum = 0;
1470 	mb_copy->m_data += sizeof(multicast_encap_iphdr);
1471 	ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2);
1472 	mb_copy->m_data -= sizeof(multicast_encap_iphdr);
1473 
1474 	if (vifp->v_rate_limit <= 0)
1475 		tbf_send_packet(vifp, mb_copy);
1476 	else
1477 		tbf_control(vifp, mb_copy, ip, ip_copy->ip_len);
1478 }
1479 
1480 /*
1481  * De-encapsulate a packet and feed it back through ip input.
1482  */
1483 static void
1484 #if __STDC__
1485 vif_input(struct mbuf *m, ...)
1486 #else
1487 vif_input(m, va_alist)
1488 	struct mbuf *m;
1489 	va_dcl
1490 #endif
1491 {
1492 	int off, proto;
1493 	va_list ap;
1494 	struct ip *ip;
1495 	struct vif *vifp;
1496 	int s;
1497 	struct ifqueue *ifq;
1498 
1499 	va_start(ap, m);
1500 	off = va_arg(ap, int);
1501 	proto = va_arg(ap, int);
1502 	va_end(ap);
1503 
1504 	vifp = (struct vif *)encap_getarg(m);
1505 	if (!vifp || proto != AF_INET) {
1506 		m_freem(m);
1507 		mrtstat.mrts_bad_tunnel++;
1508 		return;
1509 	}
1510 
1511 	ip = mtod(m, struct ip *);
1512 
1513 	m_adj(m, off);
1514 	m->m_pkthdr.rcvif = vifp->v_ifp;
1515 	ifq = &ipintrq;
1516 	s = splnet();
1517 	if (IF_QFULL(ifq)) {
1518 		IF_DROP(ifq);
1519 		m_freem(m);
1520 	} else {
1521 		IF_ENQUEUE(ifq, m);
1522 		/*
1523 		 * normally we would need a "schednetisr(NETISR_IP)"
1524 		 * here but we were called by ip_input and it is going
1525 		 * to loop back & try to dequeue the packet we just
1526 		 * queued as soon as we return so we avoid the
1527 		 * unnecessary software interrrupt.
1528 		 */
1529 	}
1530 	splx(s);
1531 }
1532 
1533 /*
1534  * Check if the packet should be grabbed by us.
1535  */
1536 static int
1537 vif_encapcheck(m, off, proto, arg)
1538 	const struct mbuf *m;
1539 	int off;
1540 	int proto;
1541 	void *arg;
1542 {
1543 	struct vif *vifp;
1544 	struct ip ip;
1545 
1546 #ifdef DIAGNOSTIC
1547 	if (!arg || proto != IPPROTO_IPV4)
1548 		panic("unexpected arg in vif_encapcheck");
1549 #endif
1550 
1551 	/*
1552 	 * do not grab the packet if it's not to a multicast destination or if
1553 	 * we don't have an encapsulating tunnel with the source.
1554 	 * Note:  This code assumes that the remote site IP address
1555 	 * uniquely identifies the tunnel (i.e., that this site has
1556 	 * at most one tunnel with the remote site).
1557 	 */
1558 
1559 	/* LINTED const cast */
1560 	m_copydata((struct mbuf *)m, off, sizeof(ip), (caddr_t)&ip);
1561 	if (!IN_MULTICAST(ip.ip_dst.s_addr))
1562 		return 0;
1563 
1564 	/* LINTED const cast */
1565 	m_copydata((struct mbuf *)m, 0, sizeof(ip), (caddr_t)&ip);
1566 	if (!in_hosteq(ip.ip_src, last_encap_src)) {
1567 		vifp = (struct vif *)arg;
1568 		if (vifp->v_flags & VIFF_TUNNEL &&
1569 		    in_hosteq(vifp->v_rmt_addr, ip.ip_src))
1570 			;
1571 		else
1572 			return 0;
1573 		last_encap_vif = vifp;
1574 		last_encap_src = ip.ip_src;
1575 	} else
1576 		vifp = last_encap_vif;
1577 
1578 	/* 32bit match, since we have checked ip_src only */
1579 	return 32;
1580 }
1581 
1582 /*
1583  * Token bucket filter module
1584  */
1585 static void
1586 tbf_control(vifp, m, ip, len)
1587 	struct vif *vifp;
1588 	struct mbuf *m;
1589 	struct ip *ip;
1590 	u_int32_t len;
1591 {
1592 
1593 	if (len > MAX_BKT_SIZE) {
1594 		/* drop if packet is too large */
1595 		mrtstat.mrts_pkt2large++;
1596 		m_freem(m);
1597 		return;
1598 	}
1599 
1600 	tbf_update_tokens(vifp);
1601 
1602 	/*
1603 	 * If there are enough tokens, and the queue is empty, send this packet
1604 	 * out immediately.  Otherwise, try to insert it on this vif's queue.
1605 	 */
1606 	if (vifp->tbf_q_len == 0) {
1607 		if (len <= vifp->tbf_n_tok) {
1608 			vifp->tbf_n_tok -= len;
1609 			tbf_send_packet(vifp, m);
1610 		} else {
1611 			/* queue packet and timeout till later */
1612 			tbf_queue(vifp, m);
1613 			callout_reset(&vifp->v_repq_ch, TBF_REPROCESS,
1614 			    tbf_reprocess_q, vifp);
1615 		}
1616 	} else {
1617 		if (vifp->tbf_q_len >= vifp->tbf_max_q_len &&
1618 		    !tbf_dq_sel(vifp, ip)) {
1619 			/* queue length too much, and couldn't make room */
1620 			mrtstat.mrts_q_overflow++;
1621 			m_freem(m);
1622 		} else {
1623 			/* queue length low enough, or made room */
1624 			tbf_queue(vifp, m);
1625 			tbf_process_q(vifp);
1626 		}
1627 	}
1628 }
1629 
1630 /*
1631  * adds a packet to the queue at the interface
1632  */
1633 static void
1634 tbf_queue(vifp, m)
1635 	struct vif *vifp;
1636 	struct mbuf *m;
1637 {
1638 	int s = splsoftnet();
1639 
1640 	/* insert at tail */
1641 	*vifp->tbf_t = m;
1642 	vifp->tbf_t = &m->m_nextpkt;
1643 	vifp->tbf_q_len++;
1644 
1645 	splx(s);
1646 }
1647 
1648 
1649 /*
1650  * processes the queue at the interface
1651  */
1652 static void
1653 tbf_process_q(vifp)
1654 	struct vif *vifp;
1655 {
1656 	struct mbuf *m;
1657 	int len;
1658 	int s = splsoftnet();
1659 
1660 	/*
1661 	 * Loop through the queue at the interface and send as many packets
1662 	 * as possible.
1663 	 */
1664 	for (m = vifp->tbf_q;
1665 	    m != 0;
1666 	    m = vifp->tbf_q) {
1667 		len = mtod(m, struct ip *)->ip_len;
1668 
1669 		/* determine if the packet can be sent */
1670 		if (len <= vifp->tbf_n_tok) {
1671 			/* if so,
1672 			 * reduce no of tokens, dequeue the packet,
1673 			 * send the packet.
1674 			 */
1675 			if ((vifp->tbf_q = m->m_nextpkt) == 0)
1676 				vifp->tbf_t = &vifp->tbf_q;
1677 			--vifp->tbf_q_len;
1678 
1679 			m->m_nextpkt = 0;
1680 			vifp->tbf_n_tok -= len;
1681 			tbf_send_packet(vifp, m);
1682 		} else
1683 			break;
1684 	}
1685 	splx(s);
1686 }
1687 
1688 static void
1689 tbf_reprocess_q(arg)
1690 	void *arg;
1691 {
1692 	struct vif *vifp = arg;
1693 
1694 	if (ip_mrouter == 0)
1695 		return;
1696 
1697 	tbf_update_tokens(vifp);
1698 	tbf_process_q(vifp);
1699 
1700 	if (vifp->tbf_q_len != 0)
1701 		callout_reset(&vifp->v_repq_ch, TBF_REPROCESS,
1702 		    tbf_reprocess_q, vifp);
1703 }
1704 
1705 /* function that will selectively discard a member of the queue
1706  * based on the precedence value and the priority
1707  */
1708 static int
1709 tbf_dq_sel(vifp, ip)
1710 	struct vif *vifp;
1711 	struct ip *ip;
1712 {
1713 	u_int p;
1714 	struct mbuf **mp, *m;
1715 	int s = splsoftnet();
1716 
1717 	p = priority(vifp, ip);
1718 
1719 	for (mp = &vifp->tbf_q, m = *mp;
1720 	    m != 0;
1721 	    mp = &m->m_nextpkt, m = *mp) {
1722 		if (p > priority(vifp, mtod(m, struct ip *))) {
1723 			if ((*mp = m->m_nextpkt) == 0)
1724 				vifp->tbf_t = mp;
1725 			--vifp->tbf_q_len;
1726 
1727 			m_freem(m);
1728 			mrtstat.mrts_drop_sel++;
1729 			splx(s);
1730 			return (1);
1731 		}
1732 	}
1733 	splx(s);
1734 	return (0);
1735 }
1736 
1737 static void
1738 tbf_send_packet(vifp, m)
1739 	struct vif *vifp;
1740 	struct mbuf *m;
1741 {
1742 	int error;
1743 	int s = splsoftnet();
1744 
1745 	if (vifp->v_flags & VIFF_TUNNEL) {
1746 		/* If tunnel options */
1747 #ifdef IPSEC
1748 		/* Don't lookup socket in forwading case */
1749 		(void)ipsec_setsocket(m, NULL);
1750 #endif
1751 		ip_output(m, (struct mbuf *)0, &vifp->v_route,
1752 			  IP_FORWARDING, (struct ip_moptions *)0);
1753 	} else {
1754 		/* if physical interface option, extract the options and then send */
1755 		struct ip_moptions imo;
1756 
1757 		imo.imo_multicast_ifp = vifp->v_ifp;
1758 		imo.imo_multicast_ttl = mtod(m, struct ip *)->ip_ttl - 1;
1759 		imo.imo_multicast_loop = 1;
1760 #ifdef RSVP_ISI
1761 		imo.imo_multicast_vif = -1;
1762 #endif
1763 
1764 #ifdef IPSEC
1765 		/* Don't lookup socket in forwading case */
1766 		(void)ipsec_setsocket(m, NULL);
1767 #endif
1768 		error = ip_output(m, (struct mbuf *)0, (struct route *)0,
1769 				  IP_FORWARDING|IP_MULTICASTOPTS, &imo);
1770 
1771 		if (mrtdebug & DEBUG_XMIT)
1772 			log(LOG_DEBUG, "phyint_send on vif %ld err %d\n",
1773 			    (long)(vifp-viftable), error);
1774 	}
1775 	splx(s);
1776 }
1777 
1778 /* determine the current time and then
1779  * the elapsed time (between the last time and time now)
1780  * in milliseconds & update the no. of tokens in the bucket
1781  */
1782 static void
1783 tbf_update_tokens(vifp)
1784 	struct vif *vifp;
1785 {
1786 	struct timeval tp;
1787 	u_int32_t tm;
1788 	int s = splsoftnet();
1789 
1790 	microtime(&tp);
1791 
1792 	TV_DELTA(tp, vifp->tbf_last_pkt_t, tm);
1793 
1794 	/*
1795 	 * This formula is actually
1796 	 * "time in seconds" * "bytes/second".
1797 	 *
1798 	 * (tm / 1000000) * (v_rate_limit * 1000 * (1000/1024) / 8)
1799 	 *
1800 	 * The (1000/1024) was introduced in add_vif to optimize
1801 	 * this divide into a shift.
1802 	 */
1803 	vifp->tbf_n_tok += tm * vifp->v_rate_limit / 8192;
1804 	vifp->tbf_last_pkt_t = tp;
1805 
1806 	if (vifp->tbf_n_tok > MAX_BKT_SIZE)
1807 		vifp->tbf_n_tok = MAX_BKT_SIZE;
1808 
1809 	splx(s);
1810 }
1811 
1812 static int
1813 priority(vifp, ip)
1814     struct vif *vifp;
1815     struct ip *ip;
1816 {
1817     int prio;
1818 
1819     /* temporary hack; may add general packet classifier some day */
1820 
1821     /*
1822      * The UDP port space is divided up into four priority ranges:
1823      * [0, 16384)     : unclassified - lowest priority
1824      * [16384, 32768) : audio - highest priority
1825      * [32768, 49152) : whiteboard - medium priority
1826      * [49152, 65536) : video - low priority
1827      */
1828     if (ip->ip_p == IPPROTO_UDP) {
1829 	struct udphdr *udp = (struct udphdr *)(((char *)ip) + (ip->ip_hl << 2));
1830 
1831 	switch (ntohs(udp->uh_dport) & 0xc000) {
1832 	    case 0x4000:
1833 		prio = 70;
1834 		break;
1835 	    case 0x8000:
1836 		prio = 60;
1837 		break;
1838 	    case 0xc000:
1839 		prio = 55;
1840 		break;
1841 	    default:
1842 		prio = 50;
1843 		break;
1844 	}
1845 
1846 	if (tbfdebug > 1)
1847 	    log(LOG_DEBUG, "port %x prio %d\n", ntohs(udp->uh_dport), prio);
1848     } else
1849 	prio = 50;
1850 
1851 
1852     return (prio);
1853 }
1854 
1855 /*
1856  * End of token bucket filter modifications
1857  */
1858 
1859 #ifdef RSVP_ISI
1860 
1861 int
1862 ip_rsvp_vif_init(so, m)
1863     struct socket *so;
1864     struct mbuf *m;
1865 {
1866     int i;
1867     int s;
1868 
1869     if (rsvpdebug)
1870 	printf("ip_rsvp_vif_init: so_type = %d, pr_protocol = %d\n",
1871 	    so->so_type, so->so_proto->pr_protocol);
1872 
1873     if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP)
1874 	return (EOPNOTSUPP);
1875 
1876     /* Check mbuf. */
1877     if (m == 0 || m->m_len != sizeof(int)) {
1878 	return (EINVAL);
1879     }
1880     i = *(mtod(m, int *));
1881 
1882     if (rsvpdebug)
1883 	printf("ip_rsvp_vif_init: vif = %d rsvp_on = %d\n",i,rsvp_on);
1884 
1885     s = splsoftnet();
1886 
1887     /* Check vif. */
1888     if (!legal_vif_num(i)) {
1889 	splx(s);
1890 	return (EADDRNOTAVAIL);
1891     }
1892 
1893     /* Check if socket is available. */
1894     if (viftable[i].v_rsvpd != 0) {
1895 	splx(s);
1896 	return (EADDRINUSE);
1897     }
1898 
1899     viftable[i].v_rsvpd = so;
1900     /* This may seem silly, but we need to be sure we don't over-increment
1901      * the RSVP counter, in case something slips up.
1902      */
1903     if (!viftable[i].v_rsvp_on) {
1904 	viftable[i].v_rsvp_on = 1;
1905 	rsvp_on++;
1906     }
1907 
1908     splx(s);
1909     return (0);
1910 }
1911 
1912 int
1913 ip_rsvp_vif_done(so, m)
1914     struct socket *so;
1915     struct mbuf *m;
1916 {
1917     int i;
1918     int s;
1919 
1920     if (rsvpdebug)
1921 	printf("ip_rsvp_vif_done: so_type = %d, pr_protocol = %d\n",
1922 	       so->so_type, so->so_proto->pr_protocol);
1923 
1924     if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP)
1925 	return (EOPNOTSUPP);
1926 
1927     /* Check mbuf. */
1928     if (m == 0 || m->m_len != sizeof(int)) {
1929 	return (EINVAL);
1930     }
1931     i = *(mtod(m, int *));
1932 
1933     s = splsoftnet();
1934 
1935     /* Check vif. */
1936     if (!legal_vif_num(i)) {
1937 	splx(s);
1938         return (EADDRNOTAVAIL);
1939     }
1940 
1941     if (rsvpdebug)
1942 	printf("ip_rsvp_vif_done: v_rsvpd = %x so = %x\n",
1943 	    viftable[i].v_rsvpd, so);
1944 
1945     viftable[i].v_rsvpd = 0;
1946     /* This may seem silly, but we need to be sure we don't over-decrement
1947      * the RSVP counter, in case something slips up.
1948      */
1949     if (viftable[i].v_rsvp_on) {
1950 	viftable[i].v_rsvp_on = 0;
1951 	rsvp_on--;
1952     }
1953 
1954     splx(s);
1955     return (0);
1956 }
1957 
1958 void
1959 ip_rsvp_force_done(so)
1960     struct socket *so;
1961 {
1962     int vifi;
1963     int s;
1964 
1965     /* Don't bother if it is not the right type of socket. */
1966     if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP)
1967 	return;
1968 
1969     s = splsoftnet();
1970 
1971     /* The socket may be attached to more than one vif...this
1972      * is perfectly legal.
1973      */
1974     for (vifi = 0; vifi < numvifs; vifi++) {
1975 	if (viftable[vifi].v_rsvpd == so) {
1976 	    viftable[vifi].v_rsvpd = 0;
1977 	    /* This may seem silly, but we need to be sure we don't
1978 	     * over-decrement the RSVP counter, in case something slips up.
1979 	     */
1980 	    if (viftable[vifi].v_rsvp_on) {
1981 		viftable[vifi].v_rsvp_on = 0;
1982 		rsvp_on--;
1983 	    }
1984 	}
1985     }
1986 
1987     splx(s);
1988     return;
1989 }
1990 
1991 void
1992 rsvp_input(m, ifp)
1993     struct mbuf *m;
1994     struct ifnet *ifp;
1995 {
1996     int vifi;
1997     struct ip *ip = mtod(m, struct ip *);
1998     static struct sockaddr_in rsvp_src = { sizeof(sin), AF_INET };
1999     int s;
2000 
2001     if (rsvpdebug)
2002 	printf("rsvp_input: rsvp_on %d\n",rsvp_on);
2003 
2004     /* Can still get packets with rsvp_on = 0 if there is a local member
2005      * of the group to which the RSVP packet is addressed.  But in this
2006      * case we want to throw the packet away.
2007      */
2008     if (!rsvp_on) {
2009 	m_freem(m);
2010 	return;
2011     }
2012 
2013     /* If the old-style non-vif-associated socket is set, then use
2014      * it and ignore the new ones.
2015      */
2016     if (ip_rsvpd != 0) {
2017 	if (rsvpdebug)
2018 	    printf("rsvp_input: Sending packet up old-style socket\n");
2019 	rip_input(m);	/*XXX*/
2020 	return;
2021     }
2022 
2023     s = splsoftnet();
2024 
2025     if (rsvpdebug)
2026 	printf("rsvp_input: check vifs\n");
2027 
2028     /* Find which vif the packet arrived on. */
2029     for (vifi = 0; vifi < numvifs; vifi++) {
2030 	if (viftable[vifi].v_ifp == ifp)
2031 	    break;
2032     }
2033 
2034     if (vifi == numvifs) {
2035 	/* Can't find vif packet arrived on. Drop packet. */
2036 	if (rsvpdebug)
2037 	    printf("rsvp_input: Can't find vif for packet...dropping it.\n");
2038 	m_freem(m);
2039 	splx(s);
2040 	return;
2041     }
2042 
2043     if (rsvpdebug)
2044 	printf("rsvp_input: check socket\n");
2045 
2046     if (viftable[vifi].v_rsvpd == 0) {
2047 	/* drop packet, since there is no specific socket for this
2048 	 * interface */
2049 	if (rsvpdebug)
2050 	    printf("rsvp_input: No socket defined for vif %d\n",vifi);
2051 	m_freem(m);
2052 	splx(s);
2053 	return;
2054     }
2055 
2056     rsvp_src.sin_addr = ip->ip_src;
2057 
2058     if (rsvpdebug && m)
2059 	printf("rsvp_input: m->m_len = %d, sbspace() = %d\n",
2060 	       m->m_len,sbspace(&viftable[vifi].v_rsvpd->so_rcv));
2061 
2062     if (socket_send(viftable[vifi].v_rsvpd, m, &rsvp_src) < 0)
2063 	if (rsvpdebug)
2064 	    printf("rsvp_input: Failed to append to socket\n");
2065     else
2066 	if (rsvpdebug)
2067 	    printf("rsvp_input: send packet up\n");
2068 
2069     splx(s);
2070 }
2071 #endif /* RSVP_ISI */
2072