1 /* $NetBSD: ip_mroute.c,v 1.96 2005/12/11 12:24:57 christos Exp $ */ 2 3 /* 4 * Copyright (c) 1992, 1993 5 * The Regents of the University of California. All rights reserved. 6 * 7 * This code is derived from software contributed to Berkeley by 8 * Stephen Deering of Stanford University. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * @(#)ip_mroute.c 8.2 (Berkeley) 11/15/93 35 */ 36 37 /* 38 * Copyright (c) 1989 Stephen Deering 39 * 40 * This code is derived from software contributed to Berkeley by 41 * Stephen Deering of Stanford University. 42 * 43 * Redistribution and use in source and binary forms, with or without 44 * modification, are permitted provided that the following conditions 45 * are met: 46 * 1. Redistributions of source code must retain the above copyright 47 * notice, this list of conditions and the following disclaimer. 48 * 2. Redistributions in binary form must reproduce the above copyright 49 * notice, this list of conditions and the following disclaimer in the 50 * documentation and/or other materials provided with the distribution. 51 * 3. All advertising materials mentioning features or use of this software 52 * must display the following acknowledgement: 53 * This product includes software developed by the University of 54 * California, Berkeley and its contributors. 55 * 4. Neither the name of the University nor the names of its contributors 56 * may be used to endorse or promote products derived from this software 57 * without specific prior written permission. 58 * 59 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 60 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 61 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 62 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 63 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 64 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 65 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 66 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 67 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 68 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 69 * SUCH DAMAGE. 70 * 71 * @(#)ip_mroute.c 8.2 (Berkeley) 11/15/93 72 */ 73 74 /* 75 * IP multicast forwarding procedures 76 * 77 * Written by David Waitzman, BBN Labs, August 1988. 78 * Modified by Steve Deering, Stanford, February 1989. 79 * Modified by Mark J. Steiglitz, Stanford, May, 1991 80 * Modified by Van Jacobson, LBL, January 1993 81 * Modified by Ajit Thyagarajan, PARC, August 1993 82 * Modified by Bill Fenner, PARC, April 1994 83 * Modified by Charles M. Hannum, NetBSD, May 1995. 84 * Modified by Ahmed Helmy, SGI, June 1996 85 * Modified by George Edmond Eddy (Rusty), ISI, February 1998 86 * Modified by Pavlin Radoslavov, USC/ISI, May 1998, August 1999, October 2000 87 * Modified by Hitoshi Asaeda, WIDE, August 2000 88 * Modified by Pavlin Radoslavov, ICSI, October 2002 89 * 90 * MROUTING Revision: 1.2 91 * and PIM-SMv2 and PIM-DM support, advanced API support, 92 * bandwidth metering and signaling 93 */ 94 95 #include <sys/cdefs.h> 96 __KERNEL_RCSID(0, "$NetBSD: ip_mroute.c,v 1.96 2005/12/11 12:24:57 christos Exp $"); 97 98 #include "opt_inet.h" 99 #include "opt_ipsec.h" 100 #include "opt_pim.h" 101 102 #ifdef PIM 103 #define _PIM_VT 1 104 #endif 105 106 #include <sys/param.h> 107 #include <sys/systm.h> 108 #include <sys/callout.h> 109 #include <sys/mbuf.h> 110 #include <sys/socket.h> 111 #include <sys/socketvar.h> 112 #include <sys/protosw.h> 113 #include <sys/errno.h> 114 #include <sys/time.h> 115 #include <sys/kernel.h> 116 #include <sys/ioctl.h> 117 #include <sys/syslog.h> 118 119 #include <net/if.h> 120 #include <net/route.h> 121 #include <net/raw_cb.h> 122 123 #include <netinet/in.h> 124 #include <netinet/in_var.h> 125 #include <netinet/in_systm.h> 126 #include <netinet/ip.h> 127 #include <netinet/ip_var.h> 128 #include <netinet/in_pcb.h> 129 #include <netinet/udp.h> 130 #include <netinet/igmp.h> 131 #include <netinet/igmp_var.h> 132 #include <netinet/ip_mroute.h> 133 #ifdef PIM 134 #include <netinet/pim.h> 135 #include <netinet/pim_var.h> 136 #endif 137 #include <netinet/ip_encap.h> 138 139 #ifdef IPSEC 140 #include <netinet6/ipsec.h> 141 #include <netkey/key.h> 142 #endif 143 144 #ifdef FAST_IPSEC 145 #include <netipsec/ipsec.h> 146 #include <netipsec/key.h> 147 #endif 148 149 #include <machine/stdarg.h> 150 151 #define IP_MULTICASTOPTS 0 152 #define M_PULLUP(m, len) \ 153 do { \ 154 if ((m) && ((m)->m_flags & M_EXT || (m)->m_len < (len))) \ 155 (m) = m_pullup((m), (len)); \ 156 } while (/*CONSTCOND*/ 0) 157 158 /* 159 * Globals. All but ip_mrouter and ip_mrtproto could be static, 160 * except for netstat or debugging purposes. 161 */ 162 struct socket *ip_mrouter = NULL; 163 int ip_mrtproto = IGMP_DVMRP; /* for netstat only */ 164 165 #define NO_RTE_FOUND 0x1 166 #define RTE_FOUND 0x2 167 168 #define MFCHASH(a, g) \ 169 ((((a).s_addr >> 20) ^ ((a).s_addr >> 10) ^ (a).s_addr ^ \ 170 ((g).s_addr >> 20) ^ ((g).s_addr >> 10) ^ (g).s_addr) & mfchash) 171 LIST_HEAD(mfchashhdr, mfc) *mfchashtbl; 172 u_long mfchash; 173 174 u_char nexpire[MFCTBLSIZ]; 175 struct vif viftable[MAXVIFS]; 176 struct mrtstat mrtstat; 177 u_int mrtdebug = 0; /* debug level */ 178 #define DEBUG_MFC 0x02 179 #define DEBUG_FORWARD 0x04 180 #define DEBUG_EXPIRE 0x08 181 #define DEBUG_XMIT 0x10 182 #define DEBUG_PIM 0x20 183 184 #define VIFI_INVALID ((vifi_t) -1) 185 186 u_int tbfdebug = 0; /* tbf debug level */ 187 #ifdef RSVP_ISI 188 u_int rsvpdebug = 0; /* rsvp debug level */ 189 extern struct socket *ip_rsvpd; 190 extern int rsvp_on; 191 #endif /* RSVP_ISI */ 192 193 /* vif attachment using sys/netinet/ip_encap.c */ 194 static void vif_input(struct mbuf *, ...); 195 static int vif_encapcheck(struct mbuf *, int, int, void *); 196 197 static const struct protosw vif_protosw = 198 { SOCK_RAW, &inetdomain, IPPROTO_IPV4, PR_ATOMIC|PR_ADDR, 199 vif_input, rip_output, 0, rip_ctloutput, 200 rip_usrreq, 201 0, 0, 0, 0, 202 }; 203 204 #define EXPIRE_TIMEOUT (hz / 4) /* 4x / second */ 205 #define UPCALL_EXPIRE 6 /* number of timeouts */ 206 207 /* 208 * Define the token bucket filter structures 209 */ 210 211 #define TBF_REPROCESS (hz / 100) /* 100x / second */ 212 213 static int get_sg_cnt(struct sioc_sg_req *); 214 static int get_vif_cnt(struct sioc_vif_req *); 215 static int ip_mrouter_init(struct socket *, struct mbuf *); 216 static int get_version(struct mbuf *); 217 static int set_assert(struct mbuf *); 218 static int get_assert(struct mbuf *); 219 static int add_vif(struct mbuf *); 220 static int del_vif(struct mbuf *); 221 static void update_mfc_params(struct mfc *, struct mfcctl2 *); 222 static void init_mfc_params(struct mfc *, struct mfcctl2 *); 223 static void expire_mfc(struct mfc *); 224 static int add_mfc(struct mbuf *); 225 #ifdef UPCALL_TIMING 226 static void collate(struct timeval *); 227 #endif 228 static int del_mfc(struct mbuf *); 229 static int set_api_config(struct mbuf *); /* chose API capabilities */ 230 static int get_api_support(struct mbuf *); 231 static int get_api_config(struct mbuf *); 232 static int socket_send(struct socket *, struct mbuf *, struct sockaddr_in *); 233 static void expire_upcalls(void *); 234 #ifdef RSVP_ISI 235 static int ip_mdq(struct mbuf *, struct ifnet *, struct mfc *, vifi_t); 236 #else 237 static int ip_mdq(struct mbuf *, struct ifnet *, struct mfc *); 238 #endif 239 static void phyint_send(struct ip *, struct vif *, struct mbuf *); 240 static void encap_send(struct ip *, struct vif *, struct mbuf *); 241 static void tbf_control(struct vif *, struct mbuf *, struct ip *, u_int32_t); 242 static void tbf_queue(struct vif *, struct mbuf *); 243 static void tbf_process_q(struct vif *); 244 static void tbf_reprocess_q(void *); 245 static int tbf_dq_sel(struct vif *, struct ip *); 246 static void tbf_send_packet(struct vif *, struct mbuf *); 247 static void tbf_update_tokens(struct vif *); 248 static int priority(struct vif *, struct ip *); 249 250 /* 251 * Bandwidth monitoring 252 */ 253 static void free_bw_list(struct bw_meter *); 254 static int add_bw_upcall(struct mbuf *); 255 static int del_bw_upcall(struct mbuf *); 256 static void bw_meter_receive_packet(struct bw_meter *, int , struct timeval *); 257 static void bw_meter_prepare_upcall(struct bw_meter *, struct timeval *); 258 static void bw_upcalls_send(void); 259 static void schedule_bw_meter(struct bw_meter *, struct timeval *); 260 static void unschedule_bw_meter(struct bw_meter *); 261 static void bw_meter_process(void); 262 static void expire_bw_upcalls_send(void *); 263 static void expire_bw_meter_process(void *); 264 265 #ifdef PIM 266 static int pim_register_send(struct ip *, struct vif *, 267 struct mbuf *, struct mfc *); 268 static int pim_register_send_rp(struct ip *, struct vif *, 269 struct mbuf *, struct mfc *); 270 static int pim_register_send_upcall(struct ip *, struct vif *, 271 struct mbuf *, struct mfc *); 272 static struct mbuf *pim_register_prepare(struct ip *, struct mbuf *); 273 #endif 274 275 /* 276 * 'Interfaces' associated with decapsulator (so we can tell 277 * packets that went through it from ones that get reflected 278 * by a broken gateway). These interfaces are never linked into 279 * the system ifnet list & no routes point to them. I.e., packets 280 * can't be sent this way. They only exist as a placeholder for 281 * multicast source verification. 282 */ 283 #if 0 284 struct ifnet multicast_decap_if[MAXVIFS]; 285 #endif 286 287 #define ENCAP_TTL 64 288 #define ENCAP_PROTO IPPROTO_IPIP /* 4 */ 289 290 /* prototype IP hdr for encapsulated packets */ 291 struct ip multicast_encap_iphdr = { 292 #if BYTE_ORDER == LITTLE_ENDIAN 293 sizeof(struct ip) >> 2, IPVERSION, 294 #else 295 IPVERSION, sizeof(struct ip) >> 2, 296 #endif 297 0, /* tos */ 298 sizeof(struct ip), /* total length */ 299 0, /* id */ 300 0, /* frag offset */ 301 ENCAP_TTL, ENCAP_PROTO, 302 0, /* checksum */ 303 }; 304 305 /* 306 * Bandwidth meter variables and constants 307 */ 308 309 /* 310 * Pending timeouts are stored in a hash table, the key being the 311 * expiration time. Periodically, the entries are analysed and processed. 312 */ 313 #define BW_METER_BUCKETS 1024 314 static struct bw_meter *bw_meter_timers[BW_METER_BUCKETS]; 315 struct callout bw_meter_ch; 316 #define BW_METER_PERIOD (hz) /* periodical handling of bw meters */ 317 318 /* 319 * Pending upcalls are stored in a vector which is flushed when 320 * full, or periodically 321 */ 322 static struct bw_upcall bw_upcalls[BW_UPCALLS_MAX]; 323 static u_int bw_upcalls_n; /* # of pending upcalls */ 324 struct callout bw_upcalls_ch; 325 #define BW_UPCALLS_PERIOD (hz) /* periodical flush of bw upcalls */ 326 327 #ifdef PIM 328 struct pimstat pimstat; 329 330 /* 331 * Note: the PIM Register encapsulation adds the following in front of a 332 * data packet: 333 * 334 * struct pim_encap_hdr { 335 * struct ip ip; 336 * struct pim_encap_pimhdr pim; 337 * } 338 * 339 */ 340 341 struct pim_encap_pimhdr { 342 struct pim pim; 343 uint32_t flags; 344 }; 345 346 static struct ip pim_encap_iphdr = { 347 #if BYTE_ORDER == LITTLE_ENDIAN 348 sizeof(struct ip) >> 2, 349 IPVERSION, 350 #else 351 IPVERSION, 352 sizeof(struct ip) >> 2, 353 #endif 354 0, /* tos */ 355 sizeof(struct ip), /* total length */ 356 0, /* id */ 357 0, /* frag offset */ 358 ENCAP_TTL, 359 IPPROTO_PIM, 360 0, /* checksum */ 361 }; 362 363 static struct pim_encap_pimhdr pim_encap_pimhdr = { 364 { 365 PIM_MAKE_VT(PIM_VERSION, PIM_REGISTER), /* PIM vers and message type */ 366 0, /* reserved */ 367 0, /* checksum */ 368 }, 369 0 /* flags */ 370 }; 371 372 static struct ifnet multicast_register_if; 373 static vifi_t reg_vif_num = VIFI_INVALID; 374 #endif /* PIM */ 375 376 377 /* 378 * Private variables. 379 */ 380 static vifi_t numvifs = 0; 381 382 static struct callout expire_upcalls_ch; 383 384 /* 385 * whether or not special PIM assert processing is enabled. 386 */ 387 static int pim_assert; 388 /* 389 * Rate limit for assert notification messages, in usec 390 */ 391 #define ASSERT_MSG_TIME 3000000 392 393 /* 394 * Kernel multicast routing API capabilities and setup. 395 * If more API capabilities are added to the kernel, they should be 396 * recorded in `mrt_api_support'. 397 */ 398 static const u_int32_t mrt_api_support = (MRT_MFC_FLAGS_DISABLE_WRONGVIF | 399 MRT_MFC_FLAGS_BORDER_VIF | 400 MRT_MFC_RP | 401 MRT_MFC_BW_UPCALL); 402 static u_int32_t mrt_api_config = 0; 403 404 /* 405 * Find a route for a given origin IP address and Multicast group address 406 * Type of service parameter to be added in the future!!! 407 * Statistics are updated by the caller if needed 408 * (mrtstat.mrts_mfc_lookups and mrtstat.mrts_mfc_misses) 409 */ 410 static struct mfc * 411 mfc_find(struct in_addr *o, struct in_addr *g) 412 { 413 struct mfc *rt; 414 415 LIST_FOREACH(rt, &mfchashtbl[MFCHASH(*o, *g)], mfc_hash) { 416 if (in_hosteq(rt->mfc_origin, *o) && 417 in_hosteq(rt->mfc_mcastgrp, *g) && 418 (rt->mfc_stall == NULL)) 419 break; 420 } 421 422 return (rt); 423 } 424 425 /* 426 * Macros to compute elapsed time efficiently 427 * Borrowed from Van Jacobson's scheduling code 428 */ 429 #define TV_DELTA(a, b, delta) do { \ 430 int xxs; \ 431 delta = (a).tv_usec - (b).tv_usec; \ 432 xxs = (a).tv_sec - (b).tv_sec; \ 433 switch (xxs) { \ 434 case 2: \ 435 delta += 1000000; \ 436 /* fall through */ \ 437 case 1: \ 438 delta += 1000000; \ 439 /* fall through */ \ 440 case 0: \ 441 break; \ 442 default: \ 443 delta += (1000000 * xxs); \ 444 break; \ 445 } \ 446 } while (/*CONSTCOND*/ 0) 447 448 #ifdef UPCALL_TIMING 449 u_int32_t upcall_data[51]; 450 #endif /* UPCALL_TIMING */ 451 452 /* 453 * Handle MRT setsockopt commands to modify the multicast routing tables. 454 */ 455 int 456 ip_mrouter_set(struct socket *so, int optname, struct mbuf **m) 457 { 458 int error; 459 460 if (optname != MRT_INIT && so != ip_mrouter) 461 error = ENOPROTOOPT; 462 else 463 switch (optname) { 464 case MRT_INIT: 465 error = ip_mrouter_init(so, *m); 466 break; 467 case MRT_DONE: 468 error = ip_mrouter_done(); 469 break; 470 case MRT_ADD_VIF: 471 error = add_vif(*m); 472 break; 473 case MRT_DEL_VIF: 474 error = del_vif(*m); 475 break; 476 case MRT_ADD_MFC: 477 error = add_mfc(*m); 478 break; 479 case MRT_DEL_MFC: 480 error = del_mfc(*m); 481 break; 482 case MRT_ASSERT: 483 error = set_assert(*m); 484 break; 485 case MRT_API_CONFIG: 486 error = set_api_config(*m); 487 break; 488 case MRT_ADD_BW_UPCALL: 489 error = add_bw_upcall(*m); 490 break; 491 case MRT_DEL_BW_UPCALL: 492 error = del_bw_upcall(*m); 493 break; 494 default: 495 error = ENOPROTOOPT; 496 break; 497 } 498 499 if (*m) 500 m_free(*m); 501 return (error); 502 } 503 504 /* 505 * Handle MRT getsockopt commands 506 */ 507 int 508 ip_mrouter_get(struct socket *so, int optname, struct mbuf **m) 509 { 510 int error; 511 512 if (so != ip_mrouter) 513 error = ENOPROTOOPT; 514 else { 515 *m = m_get(M_WAIT, MT_SOOPTS); 516 MCLAIM(*m, so->so_mowner); 517 518 switch (optname) { 519 case MRT_VERSION: 520 error = get_version(*m); 521 break; 522 case MRT_ASSERT: 523 error = get_assert(*m); 524 break; 525 case MRT_API_SUPPORT: 526 error = get_api_support(*m); 527 break; 528 case MRT_API_CONFIG: 529 error = get_api_config(*m); 530 break; 531 default: 532 error = ENOPROTOOPT; 533 break; 534 } 535 536 if (error) 537 m_free(*m); 538 } 539 540 return (error); 541 } 542 543 /* 544 * Handle ioctl commands to obtain information from the cache 545 */ 546 int 547 mrt_ioctl(struct socket *so, u_long cmd, caddr_t data) 548 { 549 int error; 550 551 if (so != ip_mrouter) 552 error = EINVAL; 553 else 554 switch (cmd) { 555 case SIOCGETVIFCNT: 556 error = get_vif_cnt((struct sioc_vif_req *)data); 557 break; 558 case SIOCGETSGCNT: 559 error = get_sg_cnt((struct sioc_sg_req *)data); 560 break; 561 default: 562 error = EINVAL; 563 break; 564 } 565 566 return (error); 567 } 568 569 /* 570 * returns the packet, byte, rpf-failure count for the source group provided 571 */ 572 static int 573 get_sg_cnt(struct sioc_sg_req *req) 574 { 575 int s; 576 struct mfc *rt; 577 578 s = splsoftnet(); 579 rt = mfc_find(&req->src, &req->grp); 580 if (rt == NULL) { 581 splx(s); 582 req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffff; 583 return (EADDRNOTAVAIL); 584 } 585 req->pktcnt = rt->mfc_pkt_cnt; 586 req->bytecnt = rt->mfc_byte_cnt; 587 req->wrong_if = rt->mfc_wrong_if; 588 splx(s); 589 590 return (0); 591 } 592 593 /* 594 * returns the input and output packet and byte counts on the vif provided 595 */ 596 static int 597 get_vif_cnt(struct sioc_vif_req *req) 598 { 599 vifi_t vifi = req->vifi; 600 601 if (vifi >= numvifs) 602 return (EINVAL); 603 604 req->icount = viftable[vifi].v_pkt_in; 605 req->ocount = viftable[vifi].v_pkt_out; 606 req->ibytes = viftable[vifi].v_bytes_in; 607 req->obytes = viftable[vifi].v_bytes_out; 608 609 return (0); 610 } 611 612 /* 613 * Enable multicast routing 614 */ 615 static int 616 ip_mrouter_init(struct socket *so, struct mbuf *m) 617 { 618 int *v; 619 620 if (mrtdebug) 621 log(LOG_DEBUG, 622 "ip_mrouter_init: so_type = %d, pr_protocol = %d\n", 623 so->so_type, so->so_proto->pr_protocol); 624 625 if (so->so_type != SOCK_RAW || 626 so->so_proto->pr_protocol != IPPROTO_IGMP) 627 return (EOPNOTSUPP); 628 629 if (m == NULL || m->m_len < sizeof(int)) 630 return (EINVAL); 631 632 v = mtod(m, int *); 633 if (*v != 1) 634 return (EINVAL); 635 636 if (ip_mrouter != NULL) 637 return (EADDRINUSE); 638 639 ip_mrouter = so; 640 641 mfchashtbl = 642 hashinit(MFCTBLSIZ, HASH_LIST, M_MRTABLE, M_WAITOK, &mfchash); 643 bzero((caddr_t)nexpire, sizeof(nexpire)); 644 645 pim_assert = 0; 646 647 callout_init(&expire_upcalls_ch); 648 callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT, 649 expire_upcalls, NULL); 650 651 callout_init(&bw_upcalls_ch); 652 callout_reset(&bw_upcalls_ch, BW_UPCALLS_PERIOD, 653 expire_bw_upcalls_send, NULL); 654 655 callout_init(&bw_meter_ch); 656 callout_reset(&bw_meter_ch, BW_METER_PERIOD, 657 expire_bw_meter_process, NULL); 658 659 if (mrtdebug) 660 log(LOG_DEBUG, "ip_mrouter_init\n"); 661 662 return (0); 663 } 664 665 /* 666 * Disable multicast routing 667 */ 668 int 669 ip_mrouter_done(void) 670 { 671 vifi_t vifi; 672 struct vif *vifp; 673 int i; 674 int s; 675 676 s = splsoftnet(); 677 678 /* Clear out all the vifs currently in use. */ 679 for (vifi = 0; vifi < numvifs; vifi++) { 680 vifp = &viftable[vifi]; 681 if (!in_nullhost(vifp->v_lcl_addr)) 682 reset_vif(vifp); 683 } 684 685 numvifs = 0; 686 pim_assert = 0; 687 mrt_api_config = 0; 688 689 callout_stop(&expire_upcalls_ch); 690 callout_stop(&bw_upcalls_ch); 691 callout_stop(&bw_meter_ch); 692 693 /* 694 * Free all multicast forwarding cache entries. 695 */ 696 for (i = 0; i < MFCTBLSIZ; i++) { 697 struct mfc *rt, *nrt; 698 699 for (rt = LIST_FIRST(&mfchashtbl[i]); rt; rt = nrt) { 700 nrt = LIST_NEXT(rt, mfc_hash); 701 702 expire_mfc(rt); 703 } 704 } 705 706 bzero((caddr_t)nexpire, sizeof(nexpire)); 707 free(mfchashtbl, M_MRTABLE); 708 mfchashtbl = NULL; 709 710 bw_upcalls_n = 0; 711 bzero(bw_meter_timers, sizeof(bw_meter_timers)); 712 713 /* Reset de-encapsulation cache. */ 714 715 ip_mrouter = NULL; 716 717 splx(s); 718 719 if (mrtdebug) 720 log(LOG_DEBUG, "ip_mrouter_done\n"); 721 722 return (0); 723 } 724 725 void 726 ip_mrouter_detach(struct ifnet *ifp) 727 { 728 int vifi, i; 729 struct vif *vifp; 730 struct mfc *rt; 731 struct rtdetq *rte; 732 733 /* XXX not sure about side effect to userland routing daemon */ 734 for (vifi = 0; vifi < numvifs; vifi++) { 735 vifp = &viftable[vifi]; 736 if (vifp->v_ifp == ifp) 737 reset_vif(vifp); 738 } 739 for (i = 0; i < MFCTBLSIZ; i++) { 740 if (nexpire[i] == 0) 741 continue; 742 LIST_FOREACH(rt, &mfchashtbl[i], mfc_hash) { 743 for (rte = rt->mfc_stall; rte; rte = rte->next) { 744 if (rte->ifp == ifp) 745 rte->ifp = NULL; 746 } 747 } 748 } 749 } 750 751 static int 752 get_version(struct mbuf *m) 753 { 754 int *v = mtod(m, int *); 755 756 *v = 0x0305; /* XXX !!!! */ 757 m->m_len = sizeof(int); 758 return (0); 759 } 760 761 /* 762 * Set PIM assert processing global 763 */ 764 static int 765 set_assert(struct mbuf *m) 766 { 767 int *i; 768 769 if (m == NULL || m->m_len < sizeof(int)) 770 return (EINVAL); 771 772 i = mtod(m, int *); 773 pim_assert = !!*i; 774 return (0); 775 } 776 777 /* 778 * Get PIM assert processing global 779 */ 780 static int 781 get_assert(struct mbuf *m) 782 { 783 int *i = mtod(m, int *); 784 785 *i = pim_assert; 786 m->m_len = sizeof(int); 787 return (0); 788 } 789 790 /* 791 * Configure API capabilities 792 */ 793 static int 794 set_api_config(struct mbuf *m) 795 { 796 int i; 797 u_int32_t *apival; 798 799 if (m == NULL || m->m_len < sizeof(u_int32_t)) 800 return (EINVAL); 801 802 apival = mtod(m, u_int32_t *); 803 804 /* 805 * We can set the API capabilities only if it is the first operation 806 * after MRT_INIT. I.e.: 807 * - there are no vifs installed 808 * - pim_assert is not enabled 809 * - the MFC table is empty 810 */ 811 if (numvifs > 0) { 812 *apival = 0; 813 return (EPERM); 814 } 815 if (pim_assert) { 816 *apival = 0; 817 return (EPERM); 818 } 819 for (i = 0; i < MFCTBLSIZ; i++) { 820 if (LIST_FIRST(&mfchashtbl[i]) != NULL) { 821 *apival = 0; 822 return (EPERM); 823 } 824 } 825 826 mrt_api_config = *apival & mrt_api_support; 827 *apival = mrt_api_config; 828 829 return (0); 830 } 831 832 /* 833 * Get API capabilities 834 */ 835 static int 836 get_api_support(struct mbuf *m) 837 { 838 u_int32_t *apival; 839 840 if (m == NULL || m->m_len < sizeof(u_int32_t)) 841 return (EINVAL); 842 843 apival = mtod(m, u_int32_t *); 844 845 *apival = mrt_api_support; 846 847 return (0); 848 } 849 850 /* 851 * Get API configured capabilities 852 */ 853 static int 854 get_api_config(struct mbuf *m) 855 { 856 u_int32_t *apival; 857 858 if (m == NULL || m->m_len < sizeof(u_int32_t)) 859 return (EINVAL); 860 861 apival = mtod(m, u_int32_t *); 862 863 *apival = mrt_api_config; 864 865 return (0); 866 } 867 868 static struct sockaddr_in sin = { sizeof(sin), AF_INET }; 869 870 /* 871 * Add a vif to the vif table 872 */ 873 static int 874 add_vif(struct mbuf *m) 875 { 876 struct vifctl *vifcp; 877 struct vif *vifp; 878 struct ifaddr *ifa; 879 struct ifnet *ifp; 880 struct ifreq ifr; 881 int error, s; 882 883 if (m == NULL || m->m_len < sizeof(struct vifctl)) 884 return (EINVAL); 885 886 vifcp = mtod(m, struct vifctl *); 887 if (vifcp->vifc_vifi >= MAXVIFS) 888 return (EINVAL); 889 if (in_nullhost(vifcp->vifc_lcl_addr)) 890 return (EADDRNOTAVAIL); 891 892 vifp = &viftable[vifcp->vifc_vifi]; 893 if (!in_nullhost(vifp->v_lcl_addr)) 894 return (EADDRINUSE); 895 896 /* Find the interface with an address in AF_INET family. */ 897 #ifdef PIM 898 if (vifcp->vifc_flags & VIFF_REGISTER) { 899 /* 900 * XXX: Because VIFF_REGISTER does not really need a valid 901 * local interface (e.g. it could be 127.0.0.2), we don't 902 * check its address. 903 */ 904 ifp = NULL; 905 } else 906 #endif 907 { 908 sin.sin_addr = vifcp->vifc_lcl_addr; 909 ifa = ifa_ifwithaddr(sintosa(&sin)); 910 if (ifa == NULL) 911 return (EADDRNOTAVAIL); 912 ifp = ifa->ifa_ifp; 913 } 914 915 if (vifcp->vifc_flags & VIFF_TUNNEL) { 916 if (vifcp->vifc_flags & VIFF_SRCRT) { 917 log(LOG_ERR, "source routed tunnels not supported\n"); 918 return (EOPNOTSUPP); 919 } 920 921 /* attach this vif to decapsulator dispatch table */ 922 /* 923 * XXX Use addresses in registration so that matching 924 * can be done with radix tree in decapsulator. But, 925 * we need to check inner header for multicast, so 926 * this requires both radix tree lookup and then a 927 * function to check, and this is not supported yet. 928 */ 929 vifp->v_encap_cookie = encap_attach_func(AF_INET, IPPROTO_IPV4, 930 vif_encapcheck, &vif_protosw, vifp); 931 if (!vifp->v_encap_cookie) 932 return (EINVAL); 933 934 /* Create a fake encapsulation interface. */ 935 ifp = (struct ifnet *)malloc(sizeof(*ifp), M_MRTABLE, M_WAITOK); 936 bzero(ifp, sizeof(*ifp)); 937 snprintf(ifp->if_xname, sizeof(ifp->if_xname), 938 "mdecap%d", vifcp->vifc_vifi); 939 940 /* Prepare cached route entry. */ 941 bzero(&vifp->v_route, sizeof(vifp->v_route)); 942 #ifdef PIM 943 } else if (vifcp->vifc_flags & VIFF_REGISTER) { 944 ifp = &multicast_register_if; 945 if (mrtdebug) 946 log(LOG_DEBUG, "Adding a register vif, ifp: %p\n", 947 (void *)ifp); 948 if (reg_vif_num == VIFI_INVALID) { 949 bzero(ifp, sizeof(*ifp)); 950 snprintf(ifp->if_xname, sizeof(ifp->if_xname), 951 "register_vif"); 952 ifp->if_flags = IFF_LOOPBACK; 953 bzero(&vifp->v_route, sizeof(vifp->v_route)); 954 reg_vif_num = vifcp->vifc_vifi; 955 } 956 #endif 957 } else { 958 /* Make sure the interface supports multicast. */ 959 if ((ifp->if_flags & IFF_MULTICAST) == 0) 960 return (EOPNOTSUPP); 961 962 /* Enable promiscuous reception of all IP multicasts. */ 963 satosin(&ifr.ifr_addr)->sin_len = sizeof(struct sockaddr_in); 964 satosin(&ifr.ifr_addr)->sin_family = AF_INET; 965 satosin(&ifr.ifr_addr)->sin_addr = zeroin_addr; 966 error = (*ifp->if_ioctl)(ifp, SIOCADDMULTI, (caddr_t)&ifr); 967 if (error) 968 return (error); 969 } 970 971 s = splsoftnet(); 972 973 /* Define parameters for the tbf structure. */ 974 vifp->tbf_q = NULL; 975 vifp->tbf_t = &vifp->tbf_q; 976 microtime(&vifp->tbf_last_pkt_t); 977 vifp->tbf_n_tok = 0; 978 vifp->tbf_q_len = 0; 979 vifp->tbf_max_q_len = MAXQSIZE; 980 981 vifp->v_flags = vifcp->vifc_flags; 982 vifp->v_threshold = vifcp->vifc_threshold; 983 /* scaling up here allows division by 1024 in critical code */ 984 vifp->v_rate_limit = vifcp->vifc_rate_limit * 1024 / 1000; 985 vifp->v_lcl_addr = vifcp->vifc_lcl_addr; 986 vifp->v_rmt_addr = vifcp->vifc_rmt_addr; 987 vifp->v_ifp = ifp; 988 /* Initialize per vif pkt counters. */ 989 vifp->v_pkt_in = 0; 990 vifp->v_pkt_out = 0; 991 vifp->v_bytes_in = 0; 992 vifp->v_bytes_out = 0; 993 994 callout_init(&vifp->v_repq_ch); 995 996 #ifdef RSVP_ISI 997 vifp->v_rsvp_on = 0; 998 vifp->v_rsvpd = NULL; 999 #endif /* RSVP_ISI */ 1000 1001 splx(s); 1002 1003 /* Adjust numvifs up if the vifi is higher than numvifs. */ 1004 if (numvifs <= vifcp->vifc_vifi) 1005 numvifs = vifcp->vifc_vifi + 1; 1006 1007 if (mrtdebug) 1008 log(LOG_DEBUG, "add_vif #%d, lcladdr %x, %s %x, thresh %x, rate %d\n", 1009 vifcp->vifc_vifi, 1010 ntohl(vifcp->vifc_lcl_addr.s_addr), 1011 (vifcp->vifc_flags & VIFF_TUNNEL) ? "rmtaddr" : "mask", 1012 ntohl(vifcp->vifc_rmt_addr.s_addr), 1013 vifcp->vifc_threshold, 1014 vifcp->vifc_rate_limit); 1015 1016 return (0); 1017 } 1018 1019 void 1020 reset_vif(struct vif *vifp) 1021 { 1022 struct mbuf *m, *n; 1023 struct ifnet *ifp; 1024 struct ifreq ifr; 1025 1026 callout_stop(&vifp->v_repq_ch); 1027 1028 /* detach this vif from decapsulator dispatch table */ 1029 encap_detach(vifp->v_encap_cookie); 1030 vifp->v_encap_cookie = NULL; 1031 1032 /* 1033 * Free packets queued at the interface 1034 */ 1035 for (m = vifp->tbf_q; m != NULL; m = n) { 1036 n = m->m_nextpkt; 1037 m_freem(m); 1038 } 1039 1040 if (vifp->v_flags & VIFF_TUNNEL) 1041 free(vifp->v_ifp, M_MRTABLE); 1042 else if (vifp->v_flags & VIFF_REGISTER) { 1043 #ifdef PIM 1044 reg_vif_num = VIFI_INVALID; 1045 #endif 1046 } else { 1047 satosin(&ifr.ifr_addr)->sin_len = sizeof(struct sockaddr_in); 1048 satosin(&ifr.ifr_addr)->sin_family = AF_INET; 1049 satosin(&ifr.ifr_addr)->sin_addr = zeroin_addr; 1050 ifp = vifp->v_ifp; 1051 (*ifp->if_ioctl)(ifp, SIOCDELMULTI, (caddr_t)&ifr); 1052 } 1053 bzero((caddr_t)vifp, sizeof(*vifp)); 1054 } 1055 1056 /* 1057 * Delete a vif from the vif table 1058 */ 1059 static int 1060 del_vif(struct mbuf *m) 1061 { 1062 vifi_t *vifip; 1063 struct vif *vifp; 1064 vifi_t vifi; 1065 int s; 1066 1067 if (m == NULL || m->m_len < sizeof(vifi_t)) 1068 return (EINVAL); 1069 1070 vifip = mtod(m, vifi_t *); 1071 if (*vifip >= numvifs) 1072 return (EINVAL); 1073 1074 vifp = &viftable[*vifip]; 1075 if (in_nullhost(vifp->v_lcl_addr)) 1076 return (EADDRNOTAVAIL); 1077 1078 s = splsoftnet(); 1079 1080 reset_vif(vifp); 1081 1082 /* Adjust numvifs down */ 1083 for (vifi = numvifs; vifi > 0; vifi--) 1084 if (!in_nullhost(viftable[vifi - 1].v_lcl_addr)) 1085 break; 1086 numvifs = vifi; 1087 1088 splx(s); 1089 1090 if (mrtdebug) 1091 log(LOG_DEBUG, "del_vif %d, numvifs %d\n", *vifip, numvifs); 1092 1093 return (0); 1094 } 1095 1096 /* 1097 * update an mfc entry without resetting counters and S,G addresses. 1098 */ 1099 static void 1100 update_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp) 1101 { 1102 int i; 1103 1104 rt->mfc_parent = mfccp->mfcc_parent; 1105 for (i = 0; i < numvifs; i++) { 1106 rt->mfc_ttls[i] = mfccp->mfcc_ttls[i]; 1107 rt->mfc_flags[i] = mfccp->mfcc_flags[i] & mrt_api_config & 1108 MRT_MFC_FLAGS_ALL; 1109 } 1110 /* set the RP address */ 1111 if (mrt_api_config & MRT_MFC_RP) 1112 rt->mfc_rp = mfccp->mfcc_rp; 1113 else 1114 rt->mfc_rp = zeroin_addr; 1115 } 1116 1117 /* 1118 * fully initialize an mfc entry from the parameter. 1119 */ 1120 static void 1121 init_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp) 1122 { 1123 rt->mfc_origin = mfccp->mfcc_origin; 1124 rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp; 1125 1126 update_mfc_params(rt, mfccp); 1127 1128 /* initialize pkt counters per src-grp */ 1129 rt->mfc_pkt_cnt = 0; 1130 rt->mfc_byte_cnt = 0; 1131 rt->mfc_wrong_if = 0; 1132 timerclear(&rt->mfc_last_assert); 1133 } 1134 1135 static void 1136 expire_mfc(struct mfc *rt) 1137 { 1138 struct rtdetq *rte, *nrte; 1139 1140 free_bw_list(rt->mfc_bw_meter); 1141 1142 for (rte = rt->mfc_stall; rte != NULL; rte = nrte) { 1143 nrte = rte->next; 1144 m_freem(rte->m); 1145 free(rte, M_MRTABLE); 1146 } 1147 1148 LIST_REMOVE(rt, mfc_hash); 1149 free(rt, M_MRTABLE); 1150 } 1151 1152 /* 1153 * Add an mfc entry 1154 */ 1155 static int 1156 add_mfc(struct mbuf *m) 1157 { 1158 struct mfcctl2 mfcctl2; 1159 struct mfcctl2 *mfccp; 1160 struct mfc *rt; 1161 u_int32_t hash = 0; 1162 struct rtdetq *rte, *nrte; 1163 u_short nstl; 1164 int s; 1165 int mfcctl_size = sizeof(struct mfcctl); 1166 1167 if (mrt_api_config & MRT_API_FLAGS_ALL) 1168 mfcctl_size = sizeof(struct mfcctl2); 1169 1170 if (m == NULL || m->m_len < mfcctl_size) 1171 return (EINVAL); 1172 1173 /* 1174 * select data size depending on API version. 1175 */ 1176 if (mrt_api_config & MRT_API_FLAGS_ALL) { 1177 struct mfcctl2 *mp2 = mtod(m, struct mfcctl2 *); 1178 bcopy(mp2, (caddr_t)&mfcctl2, sizeof(*mp2)); 1179 } else { 1180 struct mfcctl *mp = mtod(m, struct mfcctl *); 1181 bcopy(mp, (caddr_t)&mfcctl2, sizeof(*mp)); 1182 bzero((caddr_t)&mfcctl2 + sizeof(struct mfcctl), 1183 sizeof(mfcctl2) - sizeof(struct mfcctl)); 1184 } 1185 mfccp = &mfcctl2; 1186 1187 s = splsoftnet(); 1188 rt = mfc_find(&mfccp->mfcc_origin, &mfccp->mfcc_mcastgrp); 1189 1190 /* If an entry already exists, just update the fields */ 1191 if (rt) { 1192 if (mrtdebug & DEBUG_MFC) 1193 log(LOG_DEBUG, "add_mfc update o %x g %x p %x\n", 1194 ntohl(mfccp->mfcc_origin.s_addr), 1195 ntohl(mfccp->mfcc_mcastgrp.s_addr), 1196 mfccp->mfcc_parent); 1197 1198 update_mfc_params(rt, mfccp); 1199 1200 splx(s); 1201 return (0); 1202 } 1203 1204 /* 1205 * Find the entry for which the upcall was made and update 1206 */ 1207 nstl = 0; 1208 hash = MFCHASH(mfccp->mfcc_origin, mfccp->mfcc_mcastgrp); 1209 LIST_FOREACH(rt, &mfchashtbl[hash], mfc_hash) { 1210 if (in_hosteq(rt->mfc_origin, mfccp->mfcc_origin) && 1211 in_hosteq(rt->mfc_mcastgrp, mfccp->mfcc_mcastgrp) && 1212 rt->mfc_stall != NULL) { 1213 if (nstl++) 1214 log(LOG_ERR, "add_mfc %s o %x g %x p %x dbx %p\n", 1215 "multiple kernel entries", 1216 ntohl(mfccp->mfcc_origin.s_addr), 1217 ntohl(mfccp->mfcc_mcastgrp.s_addr), 1218 mfccp->mfcc_parent, rt->mfc_stall); 1219 1220 if (mrtdebug & DEBUG_MFC) 1221 log(LOG_DEBUG, "add_mfc o %x g %x p %x dbg %p\n", 1222 ntohl(mfccp->mfcc_origin.s_addr), 1223 ntohl(mfccp->mfcc_mcastgrp.s_addr), 1224 mfccp->mfcc_parent, rt->mfc_stall); 1225 1226 rte = rt->mfc_stall; 1227 init_mfc_params(rt, mfccp); 1228 rt->mfc_stall = NULL; 1229 1230 rt->mfc_expire = 0; /* Don't clean this guy up */ 1231 nexpire[hash]--; 1232 1233 /* free packets Qed at the end of this entry */ 1234 for (; rte != NULL; rte = nrte) { 1235 nrte = rte->next; 1236 if (rte->ifp) { 1237 #ifdef RSVP_ISI 1238 ip_mdq(rte->m, rte->ifp, rt, -1); 1239 #else 1240 ip_mdq(rte->m, rte->ifp, rt); 1241 #endif /* RSVP_ISI */ 1242 } 1243 m_freem(rte->m); 1244 #ifdef UPCALL_TIMING 1245 collate(&rte->t); 1246 #endif /* UPCALL_TIMING */ 1247 free(rte, M_MRTABLE); 1248 } 1249 } 1250 } 1251 1252 /* 1253 * It is possible that an entry is being inserted without an upcall 1254 */ 1255 if (nstl == 0) { 1256 /* 1257 * No mfc; make a new one 1258 */ 1259 if (mrtdebug & DEBUG_MFC) 1260 log(LOG_DEBUG, "add_mfc no upcall o %x g %x p %x\n", 1261 ntohl(mfccp->mfcc_origin.s_addr), 1262 ntohl(mfccp->mfcc_mcastgrp.s_addr), 1263 mfccp->mfcc_parent); 1264 1265 LIST_FOREACH(rt, &mfchashtbl[hash], mfc_hash) { 1266 if (in_hosteq(rt->mfc_origin, mfccp->mfcc_origin) && 1267 in_hosteq(rt->mfc_mcastgrp, mfccp->mfcc_mcastgrp)) { 1268 init_mfc_params(rt, mfccp); 1269 if (rt->mfc_expire) 1270 nexpire[hash]--; 1271 rt->mfc_expire = 0; 1272 break; /* XXX */ 1273 } 1274 } 1275 if (rt == NULL) { /* no upcall, so make a new entry */ 1276 rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, 1277 M_NOWAIT); 1278 if (rt == NULL) { 1279 splx(s); 1280 return (ENOBUFS); 1281 } 1282 1283 init_mfc_params(rt, mfccp); 1284 rt->mfc_expire = 0; 1285 rt->mfc_stall = NULL; 1286 rt->mfc_bw_meter = NULL; 1287 1288 /* insert new entry at head of hash chain */ 1289 LIST_INSERT_HEAD(&mfchashtbl[hash], rt, mfc_hash); 1290 } 1291 } 1292 1293 splx(s); 1294 return (0); 1295 } 1296 1297 #ifdef UPCALL_TIMING 1298 /* 1299 * collect delay statistics on the upcalls 1300 */ 1301 static void 1302 collate(struct timeval *t) 1303 { 1304 u_int32_t d; 1305 struct timeval tp; 1306 u_int32_t delta; 1307 1308 microtime(&tp); 1309 1310 if (timercmp(t, &tp, <)) { 1311 TV_DELTA(tp, *t, delta); 1312 1313 d = delta >> 10; 1314 if (d > 50) 1315 d = 50; 1316 1317 ++upcall_data[d]; 1318 } 1319 } 1320 #endif /* UPCALL_TIMING */ 1321 1322 /* 1323 * Delete an mfc entry 1324 */ 1325 static int 1326 del_mfc(struct mbuf *m) 1327 { 1328 struct mfcctl2 mfcctl2; 1329 struct mfcctl2 *mfccp; 1330 struct mfc *rt; 1331 int s; 1332 int mfcctl_size = sizeof(struct mfcctl); 1333 struct mfcctl *mp = mtod(m, struct mfcctl *); 1334 1335 /* 1336 * XXX: for deleting MFC entries the information in entries 1337 * of size "struct mfcctl" is sufficient. 1338 */ 1339 1340 if (m == NULL || m->m_len < mfcctl_size) 1341 return (EINVAL); 1342 1343 bcopy(mp, (caddr_t)&mfcctl2, sizeof(*mp)); 1344 bzero((caddr_t)&mfcctl2 + sizeof(struct mfcctl), 1345 sizeof(mfcctl2) - sizeof(struct mfcctl)); 1346 1347 mfccp = &mfcctl2; 1348 1349 if (mrtdebug & DEBUG_MFC) 1350 log(LOG_DEBUG, "del_mfc origin %x mcastgrp %x\n", 1351 ntohl(mfccp->mfcc_origin.s_addr), 1352 ntohl(mfccp->mfcc_mcastgrp.s_addr)); 1353 1354 s = splsoftnet(); 1355 1356 rt = mfc_find(&mfccp->mfcc_origin, &mfccp->mfcc_mcastgrp); 1357 if (rt == NULL) { 1358 splx(s); 1359 return (EADDRNOTAVAIL); 1360 } 1361 1362 /* 1363 * free the bw_meter entries 1364 */ 1365 free_bw_list(rt->mfc_bw_meter); 1366 rt->mfc_bw_meter = NULL; 1367 1368 LIST_REMOVE(rt, mfc_hash); 1369 free(rt, M_MRTABLE); 1370 1371 splx(s); 1372 return (0); 1373 } 1374 1375 static int 1376 socket_send(struct socket *s, struct mbuf *mm, struct sockaddr_in *src) 1377 { 1378 if (s) { 1379 if (sbappendaddr(&s->so_rcv, sintosa(src), mm, 1380 (struct mbuf *)NULL) != 0) { 1381 sorwakeup(s); 1382 return (0); 1383 } 1384 } 1385 m_freem(mm); 1386 return (-1); 1387 } 1388 1389 /* 1390 * IP multicast forwarding function. This function assumes that the packet 1391 * pointed to by "ip" has arrived on (or is about to be sent to) the interface 1392 * pointed to by "ifp", and the packet is to be relayed to other networks 1393 * that have members of the packet's destination IP multicast group. 1394 * 1395 * The packet is returned unscathed to the caller, unless it is 1396 * erroneous, in which case a non-zero return value tells the caller to 1397 * discard it. 1398 */ 1399 1400 #define IP_HDR_LEN 20 /* # bytes of fixed IP header (excluding options) */ 1401 #define TUNNEL_LEN 12 /* # bytes of IP option for tunnel encapsulation */ 1402 1403 int 1404 #ifdef RSVP_ISI 1405 ip_mforward(struct mbuf *m, struct ifnet *ifp, struct ip_moptions *imo) 1406 #else 1407 ip_mforward(struct mbuf *m, struct ifnet *ifp) 1408 #endif /* RSVP_ISI */ 1409 { 1410 struct ip *ip = mtod(m, struct ip *); 1411 struct mfc *rt; 1412 static int srctun = 0; 1413 struct mbuf *mm; 1414 int s; 1415 vifi_t vifi; 1416 1417 if (mrtdebug & DEBUG_FORWARD) 1418 log(LOG_DEBUG, "ip_mforward: src %x, dst %x, ifp %p\n", 1419 ntohl(ip->ip_src.s_addr), ntohl(ip->ip_dst.s_addr), ifp); 1420 1421 if (ip->ip_hl < (IP_HDR_LEN + TUNNEL_LEN) >> 2 || 1422 ((u_char *)(ip + 1))[1] != IPOPT_LSRR) { 1423 /* 1424 * Packet arrived via a physical interface or 1425 * an encapsulated tunnel or a register_vif. 1426 */ 1427 } else { 1428 /* 1429 * Packet arrived through a source-route tunnel. 1430 * Source-route tunnels are no longer supported. 1431 */ 1432 if ((srctun++ % 1000) == 0) 1433 log(LOG_ERR, 1434 "ip_mforward: received source-routed packet from %x\n", 1435 ntohl(ip->ip_src.s_addr)); 1436 1437 return (1); 1438 } 1439 1440 #ifdef RSVP_ISI 1441 if (imo && ((vifi = imo->imo_multicast_vif) < numvifs)) { 1442 if (ip->ip_ttl < 255) 1443 ip->ip_ttl++; /* compensate for -1 in *_send routines */ 1444 if (rsvpdebug && ip->ip_p == IPPROTO_RSVP) { 1445 struct vif *vifp = viftable + vifi; 1446 printf("Sending IPPROTO_RSVP from %x to %x on vif %d (%s%s)\n", 1447 ntohl(ip->ip_src), ntohl(ip->ip_dst), vifi, 1448 (vifp->v_flags & VIFF_TUNNEL) ? "tunnel on " : "", 1449 vifp->v_ifp->if_xname); 1450 } 1451 return (ip_mdq(m, ifp, (struct mfc *)NULL, vifi)); 1452 } 1453 if (rsvpdebug && ip->ip_p == IPPROTO_RSVP) { 1454 printf("Warning: IPPROTO_RSVP from %x to %x without vif option\n", 1455 ntohl(ip->ip_src), ntohl(ip->ip_dst)); 1456 } 1457 #endif /* RSVP_ISI */ 1458 1459 /* 1460 * Don't forward a packet with time-to-live of zero or one, 1461 * or a packet destined to a local-only group. 1462 */ 1463 if (ip->ip_ttl <= 1 || IN_LOCAL_GROUP(ip->ip_dst.s_addr)) 1464 return (0); 1465 1466 /* 1467 * Determine forwarding vifs from the forwarding cache table 1468 */ 1469 s = splsoftnet(); 1470 ++mrtstat.mrts_mfc_lookups; 1471 rt = mfc_find(&ip->ip_src, &ip->ip_dst); 1472 1473 /* Entry exists, so forward if necessary */ 1474 if (rt != NULL) { 1475 splx(s); 1476 #ifdef RSVP_ISI 1477 return (ip_mdq(m, ifp, rt, -1)); 1478 #else 1479 return (ip_mdq(m, ifp, rt)); 1480 #endif /* RSVP_ISI */ 1481 } else { 1482 /* 1483 * If we don't have a route for packet's origin, 1484 * Make a copy of the packet & send message to routing daemon 1485 */ 1486 1487 struct mbuf *mb0; 1488 struct rtdetq *rte; 1489 u_int32_t hash; 1490 int hlen = ip->ip_hl << 2; 1491 #ifdef UPCALL_TIMING 1492 struct timeval tp; 1493 1494 microtime(&tp); 1495 #endif /* UPCALL_TIMING */ 1496 1497 ++mrtstat.mrts_mfc_misses; 1498 1499 mrtstat.mrts_no_route++; 1500 if (mrtdebug & (DEBUG_FORWARD | DEBUG_MFC)) 1501 log(LOG_DEBUG, "ip_mforward: no rte s %x g %x\n", 1502 ntohl(ip->ip_src.s_addr), 1503 ntohl(ip->ip_dst.s_addr)); 1504 1505 /* 1506 * Allocate mbufs early so that we don't do extra work if we are 1507 * just going to fail anyway. Make sure to pullup the header so 1508 * that other people can't step on it. 1509 */ 1510 rte = (struct rtdetq *)malloc(sizeof(*rte), M_MRTABLE, 1511 M_NOWAIT); 1512 if (rte == NULL) { 1513 splx(s); 1514 return (ENOBUFS); 1515 } 1516 mb0 = m_copy(m, 0, M_COPYALL); 1517 M_PULLUP(mb0, hlen); 1518 if (mb0 == NULL) { 1519 free(rte, M_MRTABLE); 1520 splx(s); 1521 return (ENOBUFS); 1522 } 1523 1524 /* is there an upcall waiting for this flow? */ 1525 hash = MFCHASH(ip->ip_src, ip->ip_dst); 1526 LIST_FOREACH(rt, &mfchashtbl[hash], mfc_hash) { 1527 if (in_hosteq(ip->ip_src, rt->mfc_origin) && 1528 in_hosteq(ip->ip_dst, rt->mfc_mcastgrp) && 1529 rt->mfc_stall != NULL) 1530 break; 1531 } 1532 1533 if (rt == NULL) { 1534 int i; 1535 struct igmpmsg *im; 1536 1537 /* 1538 * Locate the vifi for the incoming interface for 1539 * this packet. 1540 * If none found, drop packet. 1541 */ 1542 for (vifi = 0; vifi < numvifs && 1543 viftable[vifi].v_ifp != ifp; vifi++) 1544 ; 1545 if (vifi >= numvifs) /* vif not found, drop packet */ 1546 goto non_fatal; 1547 1548 /* no upcall, so make a new entry */ 1549 rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, 1550 M_NOWAIT); 1551 if (rt == NULL) 1552 goto fail; 1553 1554 /* 1555 * Make a copy of the header to send to the user level 1556 * process 1557 */ 1558 mm = m_copy(m, 0, hlen); 1559 M_PULLUP(mm, hlen); 1560 if (mm == NULL) 1561 goto fail1; 1562 1563 /* 1564 * Send message to routing daemon to install 1565 * a route into the kernel table 1566 */ 1567 1568 im = mtod(mm, struct igmpmsg *); 1569 im->im_msgtype = IGMPMSG_NOCACHE; 1570 im->im_mbz = 0; 1571 im->im_vif = vifi; 1572 1573 mrtstat.mrts_upcalls++; 1574 1575 sin.sin_addr = ip->ip_src; 1576 if (socket_send(ip_mrouter, mm, &sin) < 0) { 1577 log(LOG_WARNING, 1578 "ip_mforward: ip_mrouter socket queue full\n"); 1579 ++mrtstat.mrts_upq_sockfull; 1580 fail1: 1581 free(rt, M_MRTABLE); 1582 fail: 1583 free(rte, M_MRTABLE); 1584 m_freem(mb0); 1585 splx(s); 1586 return (ENOBUFS); 1587 } 1588 1589 /* insert new entry at head of hash chain */ 1590 rt->mfc_origin = ip->ip_src; 1591 rt->mfc_mcastgrp = ip->ip_dst; 1592 rt->mfc_pkt_cnt = 0; 1593 rt->mfc_byte_cnt = 0; 1594 rt->mfc_wrong_if = 0; 1595 rt->mfc_expire = UPCALL_EXPIRE; 1596 nexpire[hash]++; 1597 for (i = 0; i < numvifs; i++) { 1598 rt->mfc_ttls[i] = 0; 1599 rt->mfc_flags[i] = 0; 1600 } 1601 rt->mfc_parent = -1; 1602 1603 /* clear the RP address */ 1604 rt->mfc_rp = zeroin_addr; 1605 1606 rt->mfc_bw_meter = NULL; 1607 1608 /* link into table */ 1609 LIST_INSERT_HEAD(&mfchashtbl[hash], rt, mfc_hash); 1610 /* Add this entry to the end of the queue */ 1611 rt->mfc_stall = rte; 1612 } else { 1613 /* determine if q has overflowed */ 1614 struct rtdetq **p; 1615 int npkts = 0; 1616 1617 /* 1618 * XXX ouch! we need to append to the list, but we 1619 * only have a pointer to the front, so we have to 1620 * scan the entire list every time. 1621 */ 1622 for (p = &rt->mfc_stall; *p != NULL; p = &(*p)->next) 1623 if (++npkts > MAX_UPQ) { 1624 mrtstat.mrts_upq_ovflw++; 1625 non_fatal: 1626 free(rte, M_MRTABLE); 1627 m_freem(mb0); 1628 splx(s); 1629 return (0); 1630 } 1631 1632 /* Add this entry to the end of the queue */ 1633 *p = rte; 1634 } 1635 1636 rte->next = NULL; 1637 rte->m = mb0; 1638 rte->ifp = ifp; 1639 #ifdef UPCALL_TIMING 1640 rte->t = tp; 1641 #endif /* UPCALL_TIMING */ 1642 1643 splx(s); 1644 1645 return (0); 1646 } 1647 } 1648 1649 1650 /*ARGSUSED*/ 1651 static void 1652 expire_upcalls(void *v) 1653 { 1654 int i; 1655 int s; 1656 1657 s = splsoftnet(); 1658 1659 for (i = 0; i < MFCTBLSIZ; i++) { 1660 struct mfc *rt, *nrt; 1661 1662 if (nexpire[i] == 0) 1663 continue; 1664 1665 for (rt = LIST_FIRST(&mfchashtbl[i]); rt; rt = nrt) { 1666 nrt = LIST_NEXT(rt, mfc_hash); 1667 1668 if (rt->mfc_expire == 0 || --rt->mfc_expire > 0) 1669 continue; 1670 nexpire[i]--; 1671 1672 /* 1673 * free the bw_meter entries 1674 */ 1675 while (rt->mfc_bw_meter != NULL) { 1676 struct bw_meter *x = rt->mfc_bw_meter; 1677 1678 rt->mfc_bw_meter = x->bm_mfc_next; 1679 free(x, M_BWMETER); 1680 } 1681 1682 ++mrtstat.mrts_cache_cleanups; 1683 if (mrtdebug & DEBUG_EXPIRE) 1684 log(LOG_DEBUG, 1685 "expire_upcalls: expiring (%x %x)\n", 1686 ntohl(rt->mfc_origin.s_addr), 1687 ntohl(rt->mfc_mcastgrp.s_addr)); 1688 1689 expire_mfc(rt); 1690 } 1691 } 1692 1693 splx(s); 1694 callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT, 1695 expire_upcalls, NULL); 1696 } 1697 1698 /* 1699 * Packet forwarding routine once entry in the cache is made 1700 */ 1701 static int 1702 #ifdef RSVP_ISI 1703 ip_mdq(struct mbuf *m, struct ifnet *ifp, struct mfc *rt, vifi_t xmt_vif) 1704 #else 1705 ip_mdq(struct mbuf *m, struct ifnet *ifp, struct mfc *rt) 1706 #endif /* RSVP_ISI */ 1707 { 1708 struct ip *ip = mtod(m, struct ip *); 1709 vifi_t vifi; 1710 struct vif *vifp; 1711 int plen = ntohs(ip->ip_len) - (ip->ip_hl << 2); 1712 1713 /* 1714 * Macro to send packet on vif. Since RSVP packets don't get counted on 1715 * input, they shouldn't get counted on output, so statistics keeping is 1716 * separate. 1717 */ 1718 #define MC_SEND(ip, vifp, m) do { \ 1719 if ((vifp)->v_flags & VIFF_TUNNEL) \ 1720 encap_send((ip), (vifp), (m)); \ 1721 else \ 1722 phyint_send((ip), (vifp), (m)); \ 1723 } while (/*CONSTCOND*/ 0) 1724 1725 #ifdef RSVP_ISI 1726 /* 1727 * If xmt_vif is not -1, send on only the requested vif. 1728 * 1729 * (since vifi_t is u_short, -1 becomes MAXUSHORT, which > numvifs. 1730 */ 1731 if (xmt_vif < numvifs) { 1732 #ifdef PIM 1733 if (viftable[xmt_vif].v_flags & VIFF_REGISTER) 1734 pim_register_send(ip, viftable + xmt_vif, m, rt); 1735 else 1736 #endif 1737 MC_SEND(ip, viftable + xmt_vif, m); 1738 return (1); 1739 } 1740 #endif /* RSVP_ISI */ 1741 1742 /* 1743 * Don't forward if it didn't arrive from the parent vif for its origin. 1744 */ 1745 vifi = rt->mfc_parent; 1746 if ((vifi >= numvifs) || (viftable[vifi].v_ifp != ifp)) { 1747 /* came in the wrong interface */ 1748 if (mrtdebug & DEBUG_FORWARD) 1749 log(LOG_DEBUG, "wrong if: ifp %p vifi %d vififp %p\n", 1750 ifp, vifi, 1751 vifi >= numvifs ? 0 : viftable[vifi].v_ifp); 1752 ++mrtstat.mrts_wrong_if; 1753 ++rt->mfc_wrong_if; 1754 /* 1755 * If we are doing PIM assert processing, send a message 1756 * to the routing daemon. 1757 * 1758 * XXX: A PIM-SM router needs the WRONGVIF detection so it 1759 * can complete the SPT switch, regardless of the type 1760 * of the iif (broadcast media, GRE tunnel, etc). 1761 */ 1762 if (pim_assert && (vifi < numvifs) && viftable[vifi].v_ifp) { 1763 struct timeval now; 1764 u_int32_t delta; 1765 1766 #ifdef PIM 1767 if (ifp == &multicast_register_if) 1768 pimstat.pims_rcv_registers_wrongiif++; 1769 #endif 1770 1771 /* Get vifi for the incoming packet */ 1772 for (vifi = 0; 1773 vifi < numvifs && viftable[vifi].v_ifp != ifp; 1774 vifi++) 1775 ; 1776 if (vifi >= numvifs) { 1777 /* The iif is not found: ignore the packet. */ 1778 return (0); 1779 } 1780 1781 if (rt->mfc_flags[vifi] & 1782 MRT_MFC_FLAGS_DISABLE_WRONGVIF) { 1783 /* WRONGVIF disabled: ignore the packet */ 1784 return (0); 1785 } 1786 1787 microtime(&now); 1788 1789 TV_DELTA(rt->mfc_last_assert, now, delta); 1790 1791 if (delta > ASSERT_MSG_TIME) { 1792 struct igmpmsg *im; 1793 int hlen = ip->ip_hl << 2; 1794 struct mbuf *mm = m_copy(m, 0, hlen); 1795 1796 M_PULLUP(mm, hlen); 1797 if (mm == NULL) 1798 return (ENOBUFS); 1799 1800 rt->mfc_last_assert = now; 1801 1802 im = mtod(mm, struct igmpmsg *); 1803 im->im_msgtype = IGMPMSG_WRONGVIF; 1804 im->im_mbz = 0; 1805 im->im_vif = vifi; 1806 1807 mrtstat.mrts_upcalls++; 1808 1809 sin.sin_addr = im->im_src; 1810 if (socket_send(ip_mrouter, mm, &sin) < 0) { 1811 log(LOG_WARNING, 1812 "ip_mforward: ip_mrouter socket queue full\n"); 1813 ++mrtstat.mrts_upq_sockfull; 1814 return (ENOBUFS); 1815 } 1816 } 1817 } 1818 return (0); 1819 } 1820 1821 /* If I sourced this packet, it counts as output, else it was input. */ 1822 if (in_hosteq(ip->ip_src, viftable[vifi].v_lcl_addr)) { 1823 viftable[vifi].v_pkt_out++; 1824 viftable[vifi].v_bytes_out += plen; 1825 } else { 1826 viftable[vifi].v_pkt_in++; 1827 viftable[vifi].v_bytes_in += plen; 1828 } 1829 rt->mfc_pkt_cnt++; 1830 rt->mfc_byte_cnt += plen; 1831 1832 /* 1833 * For each vif, decide if a copy of the packet should be forwarded. 1834 * Forward if: 1835 * - the ttl exceeds the vif's threshold 1836 * - there are group members downstream on interface 1837 */ 1838 for (vifp = viftable, vifi = 0; vifi < numvifs; vifp++, vifi++) 1839 if ((rt->mfc_ttls[vifi] > 0) && 1840 (ip->ip_ttl > rt->mfc_ttls[vifi])) { 1841 vifp->v_pkt_out++; 1842 vifp->v_bytes_out += plen; 1843 #ifdef PIM 1844 if (vifp->v_flags & VIFF_REGISTER) 1845 pim_register_send(ip, vifp, m, rt); 1846 else 1847 #endif 1848 MC_SEND(ip, vifp, m); 1849 } 1850 1851 /* 1852 * Perform upcall-related bw measuring. 1853 */ 1854 if (rt->mfc_bw_meter != NULL) { 1855 struct bw_meter *x; 1856 struct timeval now; 1857 1858 microtime(&now); 1859 for (x = rt->mfc_bw_meter; x != NULL; x = x->bm_mfc_next) 1860 bw_meter_receive_packet(x, plen, &now); 1861 } 1862 1863 return (0); 1864 } 1865 1866 #ifdef RSVP_ISI 1867 /* 1868 * check if a vif number is legal/ok. This is used by ip_output. 1869 */ 1870 int 1871 legal_vif_num(int vif) 1872 { 1873 if (vif >= 0 && vif < numvifs) 1874 return (1); 1875 else 1876 return (0); 1877 } 1878 #endif /* RSVP_ISI */ 1879 1880 static void 1881 phyint_send(struct ip *ip, struct vif *vifp, struct mbuf *m) 1882 { 1883 struct mbuf *mb_copy; 1884 int hlen = ip->ip_hl << 2; 1885 1886 /* 1887 * Make a new reference to the packet; make sure that 1888 * the IP header is actually copied, not just referenced, 1889 * so that ip_output() only scribbles on the copy. 1890 */ 1891 mb_copy = m_copy(m, 0, M_COPYALL); 1892 M_PULLUP(mb_copy, hlen); 1893 if (mb_copy == NULL) 1894 return; 1895 1896 if (vifp->v_rate_limit <= 0) 1897 tbf_send_packet(vifp, mb_copy); 1898 else 1899 tbf_control(vifp, mb_copy, mtod(mb_copy, struct ip *), 1900 ntohs(ip->ip_len)); 1901 } 1902 1903 static void 1904 encap_send(struct ip *ip, struct vif *vifp, struct mbuf *m) 1905 { 1906 struct mbuf *mb_copy; 1907 struct ip *ip_copy; 1908 int i, len = ntohs(ip->ip_len) + sizeof(multicast_encap_iphdr); 1909 1910 /* Take care of delayed checksums */ 1911 if (m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) { 1912 in_delayed_cksum(m); 1913 m->m_pkthdr.csum_flags &= ~(M_CSUM_TCPv4|M_CSUM_UDPv4); 1914 } 1915 1916 /* 1917 * copy the old packet & pullup it's IP header into the 1918 * new mbuf so we can modify it. Try to fill the new 1919 * mbuf since if we don't the ethernet driver will. 1920 */ 1921 MGETHDR(mb_copy, M_DONTWAIT, MT_DATA); 1922 if (mb_copy == NULL) 1923 return; 1924 mb_copy->m_data += max_linkhdr; 1925 mb_copy->m_pkthdr.len = len; 1926 mb_copy->m_len = sizeof(multicast_encap_iphdr); 1927 1928 if ((mb_copy->m_next = m_copy(m, 0, M_COPYALL)) == NULL) { 1929 m_freem(mb_copy); 1930 return; 1931 } 1932 i = MHLEN - max_linkhdr; 1933 if (i > len) 1934 i = len; 1935 mb_copy = m_pullup(mb_copy, i); 1936 if (mb_copy == NULL) 1937 return; 1938 1939 /* 1940 * fill in the encapsulating IP header. 1941 */ 1942 ip_copy = mtod(mb_copy, struct ip *); 1943 *ip_copy = multicast_encap_iphdr; 1944 ip_copy->ip_id = ip_newid(); 1945 ip_copy->ip_len = htons(len); 1946 ip_copy->ip_src = vifp->v_lcl_addr; 1947 ip_copy->ip_dst = vifp->v_rmt_addr; 1948 1949 /* 1950 * turn the encapsulated IP header back into a valid one. 1951 */ 1952 ip = (struct ip *)((caddr_t)ip_copy + sizeof(multicast_encap_iphdr)); 1953 --ip->ip_ttl; 1954 ip->ip_sum = 0; 1955 mb_copy->m_data += sizeof(multicast_encap_iphdr); 1956 ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2); 1957 mb_copy->m_data -= sizeof(multicast_encap_iphdr); 1958 1959 if (vifp->v_rate_limit <= 0) 1960 tbf_send_packet(vifp, mb_copy); 1961 else 1962 tbf_control(vifp, mb_copy, ip, ntohs(ip_copy->ip_len)); 1963 } 1964 1965 /* 1966 * De-encapsulate a packet and feed it back through ip input. 1967 */ 1968 static void 1969 vif_input(struct mbuf *m, ...) 1970 { 1971 int off, proto; 1972 va_list ap; 1973 struct vif *vifp; 1974 int s; 1975 struct ifqueue *ifq; 1976 1977 va_start(ap, m); 1978 off = va_arg(ap, int); 1979 proto = va_arg(ap, int); 1980 va_end(ap); 1981 1982 vifp = (struct vif *)encap_getarg(m); 1983 if (!vifp || proto != ENCAP_PROTO) { 1984 m_freem(m); 1985 mrtstat.mrts_bad_tunnel++; 1986 return; 1987 } 1988 1989 m_adj(m, off); 1990 m->m_pkthdr.rcvif = vifp->v_ifp; 1991 ifq = &ipintrq; 1992 s = splnet(); 1993 if (IF_QFULL(ifq)) { 1994 IF_DROP(ifq); 1995 m_freem(m); 1996 } else { 1997 IF_ENQUEUE(ifq, m); 1998 /* 1999 * normally we would need a "schednetisr(NETISR_IP)" 2000 * here but we were called by ip_input and it is going 2001 * to loop back & try to dequeue the packet we just 2002 * queued as soon as we return so we avoid the 2003 * unnecessary software interrrupt. 2004 */ 2005 } 2006 splx(s); 2007 } 2008 2009 /* 2010 * Check if the packet should be received on the vif denoted by arg. 2011 * (The encap selection code will call this once per vif since each is 2012 * registered separately.) 2013 */ 2014 static int 2015 vif_encapcheck(struct mbuf *m, int off, int proto, void *arg) 2016 { 2017 struct vif *vifp; 2018 struct ip ip; 2019 2020 #ifdef DIAGNOSTIC 2021 if (!arg || proto != IPPROTO_IPV4) 2022 panic("unexpected arg in vif_encapcheck"); 2023 #endif 2024 2025 /* 2026 * Accept the packet only if the inner heaader is multicast 2027 * and the outer header matches a tunnel-mode vif. Order 2028 * checks in the hope that common non-matching packets will be 2029 * rejected quickly. Assume that unicast IPv4 traffic in a 2030 * parallel tunnel (e.g. gif(4)) is unlikely. 2031 */ 2032 2033 /* Obtain the outer IP header and the vif pointer. */ 2034 m_copydata((struct mbuf *)m, 0, sizeof(ip), (caddr_t)&ip); 2035 vifp = (struct vif *)arg; 2036 2037 /* 2038 * The outer source must match the vif's remote peer address. 2039 * For a multicast router with several tunnels, this is the 2040 * only check that will fail on packets in other tunnels, 2041 * assuming the local address is the same. 2042 */ 2043 if (!in_hosteq(vifp->v_rmt_addr, ip.ip_src)) 2044 return 0; 2045 2046 /* The outer destination must match the vif's local address. */ 2047 if (!in_hosteq(vifp->v_lcl_addr, ip.ip_dst)) 2048 return 0; 2049 2050 /* The vif must be of tunnel type. */ 2051 if ((vifp->v_flags & VIFF_TUNNEL) == 0) 2052 return 0; 2053 2054 /* Check that the inner destination is multicast. */ 2055 m_copydata((struct mbuf *)m, off, sizeof(ip), (caddr_t)&ip); 2056 if (!IN_MULTICAST(ip.ip_dst.s_addr)) 2057 return 0; 2058 2059 /* 2060 * We have checked that both the outer src and dst addresses 2061 * match the vif, and that the inner destination is multicast 2062 * (224/5). By claiming more than 64, we intend to 2063 * preferentially take packets that also match a parallel 2064 * gif(4). 2065 */ 2066 return 32 + 32 + 5; 2067 } 2068 2069 /* 2070 * Token bucket filter module 2071 */ 2072 static void 2073 tbf_control(struct vif *vifp, struct mbuf *m, struct ip *ip, u_int32_t len) 2074 { 2075 2076 if (len > MAX_BKT_SIZE) { 2077 /* drop if packet is too large */ 2078 mrtstat.mrts_pkt2large++; 2079 m_freem(m); 2080 return; 2081 } 2082 2083 tbf_update_tokens(vifp); 2084 2085 /* 2086 * If there are enough tokens, and the queue is empty, send this packet 2087 * out immediately. Otherwise, try to insert it on this vif's queue. 2088 */ 2089 if (vifp->tbf_q_len == 0) { 2090 if (len <= vifp->tbf_n_tok) { 2091 vifp->tbf_n_tok -= len; 2092 tbf_send_packet(vifp, m); 2093 } else { 2094 /* queue packet and timeout till later */ 2095 tbf_queue(vifp, m); 2096 callout_reset(&vifp->v_repq_ch, TBF_REPROCESS, 2097 tbf_reprocess_q, vifp); 2098 } 2099 } else { 2100 if (vifp->tbf_q_len >= vifp->tbf_max_q_len && 2101 !tbf_dq_sel(vifp, ip)) { 2102 /* queue full, and couldn't make room */ 2103 mrtstat.mrts_q_overflow++; 2104 m_freem(m); 2105 } else { 2106 /* queue length low enough, or made room */ 2107 tbf_queue(vifp, m); 2108 tbf_process_q(vifp); 2109 } 2110 } 2111 } 2112 2113 /* 2114 * adds a packet to the queue at the interface 2115 */ 2116 static void 2117 tbf_queue(struct vif *vifp, struct mbuf *m) 2118 { 2119 int s = splsoftnet(); 2120 2121 /* insert at tail */ 2122 *vifp->tbf_t = m; 2123 vifp->tbf_t = &m->m_nextpkt; 2124 vifp->tbf_q_len++; 2125 2126 splx(s); 2127 } 2128 2129 2130 /* 2131 * processes the queue at the interface 2132 */ 2133 static void 2134 tbf_process_q(struct vif *vifp) 2135 { 2136 struct mbuf *m; 2137 int len; 2138 int s = splsoftnet(); 2139 2140 /* 2141 * Loop through the queue at the interface and send as many packets 2142 * as possible. 2143 */ 2144 for (m = vifp->tbf_q; m != NULL; m = vifp->tbf_q) { 2145 len = ntohs(mtod(m, struct ip *)->ip_len); 2146 2147 /* determine if the packet can be sent */ 2148 if (len <= vifp->tbf_n_tok) { 2149 /* if so, 2150 * reduce no of tokens, dequeue the packet, 2151 * send the packet. 2152 */ 2153 if ((vifp->tbf_q = m->m_nextpkt) == NULL) 2154 vifp->tbf_t = &vifp->tbf_q; 2155 --vifp->tbf_q_len; 2156 2157 m->m_nextpkt = NULL; 2158 vifp->tbf_n_tok -= len; 2159 tbf_send_packet(vifp, m); 2160 } else 2161 break; 2162 } 2163 splx(s); 2164 } 2165 2166 static void 2167 tbf_reprocess_q(void *arg) 2168 { 2169 struct vif *vifp = arg; 2170 2171 if (ip_mrouter == NULL) 2172 return; 2173 2174 tbf_update_tokens(vifp); 2175 tbf_process_q(vifp); 2176 2177 if (vifp->tbf_q_len != 0) 2178 callout_reset(&vifp->v_repq_ch, TBF_REPROCESS, 2179 tbf_reprocess_q, vifp); 2180 } 2181 2182 /* function that will selectively discard a member of the queue 2183 * based on the precedence value and the priority 2184 */ 2185 static int 2186 tbf_dq_sel(struct vif *vifp, struct ip *ip) 2187 { 2188 u_int p; 2189 struct mbuf **mp, *m; 2190 int s = splsoftnet(); 2191 2192 p = priority(vifp, ip); 2193 2194 for (mp = &vifp->tbf_q, m = *mp; 2195 m != NULL; 2196 mp = &m->m_nextpkt, m = *mp) { 2197 if (p > priority(vifp, mtod(m, struct ip *))) { 2198 if ((*mp = m->m_nextpkt) == NULL) 2199 vifp->tbf_t = mp; 2200 --vifp->tbf_q_len; 2201 2202 m_freem(m); 2203 mrtstat.mrts_drop_sel++; 2204 splx(s); 2205 return (1); 2206 } 2207 } 2208 splx(s); 2209 return (0); 2210 } 2211 2212 static void 2213 tbf_send_packet(struct vif *vifp, struct mbuf *m) 2214 { 2215 int error; 2216 int s = splsoftnet(); 2217 2218 if (vifp->v_flags & VIFF_TUNNEL) { 2219 /* If tunnel options */ 2220 ip_output(m, (struct mbuf *)NULL, &vifp->v_route, 2221 IP_FORWARDING, (struct ip_moptions *)NULL, 2222 (struct socket *)NULL); 2223 } else { 2224 /* if physical interface option, extract the options and then send */ 2225 struct ip_moptions imo; 2226 2227 imo.imo_multicast_ifp = vifp->v_ifp; 2228 imo.imo_multicast_ttl = mtod(m, struct ip *)->ip_ttl - 1; 2229 imo.imo_multicast_loop = 1; 2230 #ifdef RSVP_ISI 2231 imo.imo_multicast_vif = -1; 2232 #endif 2233 2234 error = ip_output(m, (struct mbuf *)NULL, (struct route *)NULL, 2235 IP_FORWARDING|IP_MULTICASTOPTS, &imo, 2236 (struct socket *)NULL); 2237 2238 if (mrtdebug & DEBUG_XMIT) 2239 log(LOG_DEBUG, "phyint_send on vif %ld err %d\n", 2240 (long)(vifp - viftable), error); 2241 } 2242 splx(s); 2243 } 2244 2245 /* determine the current time and then 2246 * the elapsed time (between the last time and time now) 2247 * in milliseconds & update the no. of tokens in the bucket 2248 */ 2249 static void 2250 tbf_update_tokens(struct vif *vifp) 2251 { 2252 struct timeval tp; 2253 u_int32_t tm; 2254 int s = splsoftnet(); 2255 2256 microtime(&tp); 2257 2258 TV_DELTA(tp, vifp->tbf_last_pkt_t, tm); 2259 2260 /* 2261 * This formula is actually 2262 * "time in seconds" * "bytes/second". 2263 * 2264 * (tm / 1000000) * (v_rate_limit * 1000 * (1000/1024) / 8) 2265 * 2266 * The (1000/1024) was introduced in add_vif to optimize 2267 * this divide into a shift. 2268 */ 2269 vifp->tbf_n_tok += tm * vifp->v_rate_limit / 8192; 2270 vifp->tbf_last_pkt_t = tp; 2271 2272 if (vifp->tbf_n_tok > MAX_BKT_SIZE) 2273 vifp->tbf_n_tok = MAX_BKT_SIZE; 2274 2275 splx(s); 2276 } 2277 2278 static int 2279 priority(struct vif *vifp, struct ip *ip) 2280 { 2281 int prio = 50; /* the lowest priority -- default case */ 2282 2283 /* temporary hack; may add general packet classifier some day */ 2284 2285 /* 2286 * The UDP port space is divided up into four priority ranges: 2287 * [0, 16384) : unclassified - lowest priority 2288 * [16384, 32768) : audio - highest priority 2289 * [32768, 49152) : whiteboard - medium priority 2290 * [49152, 65536) : video - low priority 2291 */ 2292 if (ip->ip_p == IPPROTO_UDP) { 2293 struct udphdr *udp = (struct udphdr *)(((char *)ip) + (ip->ip_hl << 2)); 2294 2295 switch (ntohs(udp->uh_dport) & 0xc000) { 2296 case 0x4000: 2297 prio = 70; 2298 break; 2299 case 0x8000: 2300 prio = 60; 2301 break; 2302 case 0xc000: 2303 prio = 55; 2304 break; 2305 } 2306 2307 if (tbfdebug > 1) 2308 log(LOG_DEBUG, "port %x prio %d\n", 2309 ntohs(udp->uh_dport), prio); 2310 } 2311 2312 return (prio); 2313 } 2314 2315 /* 2316 * End of token bucket filter modifications 2317 */ 2318 #ifdef RSVP_ISI 2319 int 2320 ip_rsvp_vif_init(struct socket *so, struct mbuf *m) 2321 { 2322 int vifi, s; 2323 2324 if (rsvpdebug) 2325 printf("ip_rsvp_vif_init: so_type = %d, pr_protocol = %d\n", 2326 so->so_type, so->so_proto->pr_protocol); 2327 2328 if (so->so_type != SOCK_RAW || 2329 so->so_proto->pr_protocol != IPPROTO_RSVP) 2330 return (EOPNOTSUPP); 2331 2332 /* Check mbuf. */ 2333 if (m == NULL || m->m_len != sizeof(int)) { 2334 return (EINVAL); 2335 } 2336 vifi = *(mtod(m, int *)); 2337 2338 if (rsvpdebug) 2339 printf("ip_rsvp_vif_init: vif = %d rsvp_on = %d\n", 2340 vifi, rsvp_on); 2341 2342 s = splsoftnet(); 2343 2344 /* Check vif. */ 2345 if (!legal_vif_num(vifi)) { 2346 splx(s); 2347 return (EADDRNOTAVAIL); 2348 } 2349 2350 /* Check if socket is available. */ 2351 if (viftable[vifi].v_rsvpd != NULL) { 2352 splx(s); 2353 return (EADDRINUSE); 2354 } 2355 2356 viftable[vifi].v_rsvpd = so; 2357 /* 2358 * This may seem silly, but we need to be sure we don't over-increment 2359 * the RSVP counter, in case something slips up. 2360 */ 2361 if (!viftable[vifi].v_rsvp_on) { 2362 viftable[vifi].v_rsvp_on = 1; 2363 rsvp_on++; 2364 } 2365 2366 splx(s); 2367 return (0); 2368 } 2369 2370 int 2371 ip_rsvp_vif_done(struct socket *so, struct mbuf *m) 2372 { 2373 int vifi, s; 2374 2375 if (rsvpdebug) 2376 printf("ip_rsvp_vif_done: so_type = %d, pr_protocol = %d\n", 2377 so->so_type, so->so_proto->pr_protocol); 2378 2379 if (so->so_type != SOCK_RAW || 2380 so->so_proto->pr_protocol != IPPROTO_RSVP) 2381 return (EOPNOTSUPP); 2382 2383 /* Check mbuf. */ 2384 if (m == NULL || m->m_len != sizeof(int)) { 2385 return (EINVAL); 2386 } 2387 vifi = *(mtod(m, int *)); 2388 2389 s = splsoftnet(); 2390 2391 /* Check vif. */ 2392 if (!legal_vif_num(vifi)) { 2393 splx(s); 2394 return (EADDRNOTAVAIL); 2395 } 2396 2397 if (rsvpdebug) 2398 printf("ip_rsvp_vif_done: v_rsvpd = %x so = %x\n", 2399 viftable[vifi].v_rsvpd, so); 2400 2401 viftable[vifi].v_rsvpd = NULL; 2402 /* 2403 * This may seem silly, but we need to be sure we don't over-decrement 2404 * the RSVP counter, in case something slips up. 2405 */ 2406 if (viftable[vifi].v_rsvp_on) { 2407 viftable[vifi].v_rsvp_on = 0; 2408 rsvp_on--; 2409 } 2410 2411 splx(s); 2412 return (0); 2413 } 2414 2415 void 2416 ip_rsvp_force_done(struct socket *so) 2417 { 2418 int vifi, s; 2419 2420 /* Don't bother if it is not the right type of socket. */ 2421 if (so->so_type != SOCK_RAW || 2422 so->so_proto->pr_protocol != IPPROTO_RSVP) 2423 return; 2424 2425 s = splsoftnet(); 2426 2427 /* 2428 * The socket may be attached to more than one vif...this 2429 * is perfectly legal. 2430 */ 2431 for (vifi = 0; vifi < numvifs; vifi++) { 2432 if (viftable[vifi].v_rsvpd == so) { 2433 viftable[vifi].v_rsvpd = NULL; 2434 /* 2435 * This may seem silly, but we need to be sure we don't 2436 * over-decrement the RSVP counter, in case something 2437 * slips up. 2438 */ 2439 if (viftable[vifi].v_rsvp_on) { 2440 viftable[vifi].v_rsvp_on = 0; 2441 rsvp_on--; 2442 } 2443 } 2444 } 2445 2446 splx(s); 2447 return; 2448 } 2449 2450 void 2451 rsvp_input(struct mbuf *m, struct ifnet *ifp) 2452 { 2453 int vifi, s; 2454 struct ip *ip = mtod(m, struct ip *); 2455 static struct sockaddr_in rsvp_src = { sizeof(sin), AF_INET }; 2456 2457 if (rsvpdebug) 2458 printf("rsvp_input: rsvp_on %d\n", rsvp_on); 2459 2460 /* 2461 * Can still get packets with rsvp_on = 0 if there is a local member 2462 * of the group to which the RSVP packet is addressed. But in this 2463 * case we want to throw the packet away. 2464 */ 2465 if (!rsvp_on) { 2466 m_freem(m); 2467 return; 2468 } 2469 2470 /* 2471 * If the old-style non-vif-associated socket is set, then use 2472 * it and ignore the new ones. 2473 */ 2474 if (ip_rsvpd != NULL) { 2475 if (rsvpdebug) 2476 printf("rsvp_input: " 2477 "Sending packet up old-style socket\n"); 2478 rip_input(m); /*XXX*/ 2479 return; 2480 } 2481 2482 s = splsoftnet(); 2483 2484 if (rsvpdebug) 2485 printf("rsvp_input: check vifs\n"); 2486 2487 /* Find which vif the packet arrived on. */ 2488 for (vifi = 0; vifi < numvifs; vifi++) { 2489 if (viftable[vifi].v_ifp == ifp) 2490 break; 2491 } 2492 2493 if (vifi == numvifs) { 2494 /* Can't find vif packet arrived on. Drop packet. */ 2495 if (rsvpdebug) 2496 printf("rsvp_input: " 2497 "Can't find vif for packet...dropping it.\n"); 2498 m_freem(m); 2499 splx(s); 2500 return; 2501 } 2502 2503 if (rsvpdebug) 2504 printf("rsvp_input: check socket\n"); 2505 2506 if (viftable[vifi].v_rsvpd == NULL) { 2507 /* 2508 * drop packet, since there is no specific socket for this 2509 * interface 2510 */ 2511 if (rsvpdebug) 2512 printf("rsvp_input: No socket defined for vif %d\n", 2513 vifi); 2514 m_freem(m); 2515 splx(s); 2516 return; 2517 } 2518 2519 rsvp_src.sin_addr = ip->ip_src; 2520 2521 if (rsvpdebug && m) 2522 printf("rsvp_input: m->m_len = %d, sbspace() = %d\n", 2523 m->m_len, sbspace(&viftable[vifi].v_rsvpd->so_rcv)); 2524 2525 if (socket_send(viftable[vifi].v_rsvpd, m, &rsvp_src) < 0) 2526 if (rsvpdebug) 2527 printf("rsvp_input: Failed to append to socket\n"); 2528 else 2529 if (rsvpdebug) 2530 printf("rsvp_input: send packet up\n"); 2531 2532 splx(s); 2533 } 2534 #endif /* RSVP_ISI */ 2535 2536 /* 2537 * Code for bandwidth monitors 2538 */ 2539 2540 /* 2541 * Define common interface for timeval-related methods 2542 */ 2543 #define BW_TIMEVALCMP(tvp, uvp, cmp) timercmp((tvp), (uvp), cmp) 2544 #define BW_TIMEVALDECR(vvp, uvp) timersub((vvp), (uvp), (vvp)) 2545 #define BW_TIMEVALADD(vvp, uvp) timeradd((vvp), (uvp), (vvp)) 2546 2547 static uint32_t 2548 compute_bw_meter_flags(struct bw_upcall *req) 2549 { 2550 uint32_t flags = 0; 2551 2552 if (req->bu_flags & BW_UPCALL_UNIT_PACKETS) 2553 flags |= BW_METER_UNIT_PACKETS; 2554 if (req->bu_flags & BW_UPCALL_UNIT_BYTES) 2555 flags |= BW_METER_UNIT_BYTES; 2556 if (req->bu_flags & BW_UPCALL_GEQ) 2557 flags |= BW_METER_GEQ; 2558 if (req->bu_flags & BW_UPCALL_LEQ) 2559 flags |= BW_METER_LEQ; 2560 2561 return flags; 2562 } 2563 2564 /* 2565 * Add a bw_meter entry 2566 */ 2567 static int 2568 add_bw_upcall(struct mbuf *m) 2569 { 2570 int s; 2571 struct mfc *mfc; 2572 struct timeval delta = { BW_UPCALL_THRESHOLD_INTERVAL_MIN_SEC, 2573 BW_UPCALL_THRESHOLD_INTERVAL_MIN_USEC }; 2574 struct timeval now; 2575 struct bw_meter *x; 2576 uint32_t flags; 2577 struct bw_upcall *req; 2578 2579 if (m == NULL || m->m_len < sizeof(struct bw_upcall)) 2580 return EINVAL; 2581 2582 req = mtod(m, struct bw_upcall *); 2583 2584 if (!(mrt_api_config & MRT_MFC_BW_UPCALL)) 2585 return EOPNOTSUPP; 2586 2587 /* Test if the flags are valid */ 2588 if (!(req->bu_flags & (BW_UPCALL_UNIT_PACKETS | BW_UPCALL_UNIT_BYTES))) 2589 return EINVAL; 2590 if (!(req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ))) 2591 return EINVAL; 2592 if ((req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ)) 2593 == (BW_UPCALL_GEQ | BW_UPCALL_LEQ)) 2594 return EINVAL; 2595 2596 /* Test if the threshold time interval is valid */ 2597 if (BW_TIMEVALCMP(&req->bu_threshold.b_time, &delta, <)) 2598 return EINVAL; 2599 2600 flags = compute_bw_meter_flags(req); 2601 2602 /* 2603 * Find if we have already same bw_meter entry 2604 */ 2605 s = splsoftnet(); 2606 mfc = mfc_find(&req->bu_src, &req->bu_dst); 2607 if (mfc == NULL) { 2608 splx(s); 2609 return EADDRNOTAVAIL; 2610 } 2611 for (x = mfc->mfc_bw_meter; x != NULL; x = x->bm_mfc_next) { 2612 if ((BW_TIMEVALCMP(&x->bm_threshold.b_time, 2613 &req->bu_threshold.b_time, ==)) && 2614 (x->bm_threshold.b_packets == req->bu_threshold.b_packets) && 2615 (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) && 2616 (x->bm_flags & BW_METER_USER_FLAGS) == flags) { 2617 splx(s); 2618 return 0; /* XXX Already installed */ 2619 } 2620 } 2621 2622 /* Allocate the new bw_meter entry */ 2623 x = (struct bw_meter *)malloc(sizeof(*x), M_BWMETER, M_NOWAIT); 2624 if (x == NULL) { 2625 splx(s); 2626 return ENOBUFS; 2627 } 2628 2629 /* Set the new bw_meter entry */ 2630 x->bm_threshold.b_time = req->bu_threshold.b_time; 2631 microtime(&now); 2632 x->bm_start_time = now; 2633 x->bm_threshold.b_packets = req->bu_threshold.b_packets; 2634 x->bm_threshold.b_bytes = req->bu_threshold.b_bytes; 2635 x->bm_measured.b_packets = 0; 2636 x->bm_measured.b_bytes = 0; 2637 x->bm_flags = flags; 2638 x->bm_time_next = NULL; 2639 x->bm_time_hash = BW_METER_BUCKETS; 2640 2641 /* Add the new bw_meter entry to the front of entries for this MFC */ 2642 x->bm_mfc = mfc; 2643 x->bm_mfc_next = mfc->mfc_bw_meter; 2644 mfc->mfc_bw_meter = x; 2645 schedule_bw_meter(x, &now); 2646 splx(s); 2647 2648 return 0; 2649 } 2650 2651 static void 2652 free_bw_list(struct bw_meter *list) 2653 { 2654 while (list != NULL) { 2655 struct bw_meter *x = list; 2656 2657 list = list->bm_mfc_next; 2658 unschedule_bw_meter(x); 2659 free(x, M_BWMETER); 2660 } 2661 } 2662 2663 /* 2664 * Delete one or multiple bw_meter entries 2665 */ 2666 static int 2667 del_bw_upcall(struct mbuf *m) 2668 { 2669 int s; 2670 struct mfc *mfc; 2671 struct bw_meter *x; 2672 struct bw_upcall *req; 2673 2674 if (m == NULL || m->m_len < sizeof(struct bw_upcall)) 2675 return EINVAL; 2676 2677 req = mtod(m, struct bw_upcall *); 2678 2679 if (!(mrt_api_config & MRT_MFC_BW_UPCALL)) 2680 return EOPNOTSUPP; 2681 2682 s = splsoftnet(); 2683 /* Find the corresponding MFC entry */ 2684 mfc = mfc_find(&req->bu_src, &req->bu_dst); 2685 if (mfc == NULL) { 2686 splx(s); 2687 return EADDRNOTAVAIL; 2688 } else if (req->bu_flags & BW_UPCALL_DELETE_ALL) { 2689 /* 2690 * Delete all bw_meter entries for this mfc 2691 */ 2692 struct bw_meter *list; 2693 2694 list = mfc->mfc_bw_meter; 2695 mfc->mfc_bw_meter = NULL; 2696 free_bw_list(list); 2697 splx(s); 2698 return 0; 2699 } else { /* Delete a single bw_meter entry */ 2700 struct bw_meter *prev; 2701 uint32_t flags = 0; 2702 2703 flags = compute_bw_meter_flags(req); 2704 2705 /* Find the bw_meter entry to delete */ 2706 for (prev = NULL, x = mfc->mfc_bw_meter; x != NULL; 2707 prev = x, x = x->bm_mfc_next) { 2708 if ((BW_TIMEVALCMP(&x->bm_threshold.b_time, 2709 &req->bu_threshold.b_time, ==)) && 2710 (x->bm_threshold.b_packets == req->bu_threshold.b_packets) && 2711 (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) && 2712 (x->bm_flags & BW_METER_USER_FLAGS) == flags) 2713 break; 2714 } 2715 if (x != NULL) { /* Delete entry from the list for this MFC */ 2716 if (prev != NULL) 2717 prev->bm_mfc_next = x->bm_mfc_next; /* remove from middle*/ 2718 else 2719 x->bm_mfc->mfc_bw_meter = x->bm_mfc_next;/* new head of list */ 2720 2721 unschedule_bw_meter(x); 2722 splx(s); 2723 /* Free the bw_meter entry */ 2724 free(x, M_BWMETER); 2725 return 0; 2726 } else { 2727 splx(s); 2728 return EINVAL; 2729 } 2730 } 2731 /* NOTREACHED */ 2732 } 2733 2734 /* 2735 * Perform bandwidth measurement processing that may result in an upcall 2736 */ 2737 static void 2738 bw_meter_receive_packet(struct bw_meter *x, int plen, struct timeval *nowp) 2739 { 2740 struct timeval delta; 2741 2742 delta = *nowp; 2743 BW_TIMEVALDECR(&delta, &x->bm_start_time); 2744 2745 if (x->bm_flags & BW_METER_GEQ) { 2746 /* 2747 * Processing for ">=" type of bw_meter entry 2748 */ 2749 if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) { 2750 /* Reset the bw_meter entry */ 2751 x->bm_start_time = *nowp; 2752 x->bm_measured.b_packets = 0; 2753 x->bm_measured.b_bytes = 0; 2754 x->bm_flags &= ~BW_METER_UPCALL_DELIVERED; 2755 } 2756 2757 /* Record that a packet is received */ 2758 x->bm_measured.b_packets++; 2759 x->bm_measured.b_bytes += plen; 2760 2761 /* 2762 * Test if we should deliver an upcall 2763 */ 2764 if (!(x->bm_flags & BW_METER_UPCALL_DELIVERED)) { 2765 if (((x->bm_flags & BW_METER_UNIT_PACKETS) && 2766 (x->bm_measured.b_packets >= x->bm_threshold.b_packets)) || 2767 ((x->bm_flags & BW_METER_UNIT_BYTES) && 2768 (x->bm_measured.b_bytes >= x->bm_threshold.b_bytes))) { 2769 /* Prepare an upcall for delivery */ 2770 bw_meter_prepare_upcall(x, nowp); 2771 x->bm_flags |= BW_METER_UPCALL_DELIVERED; 2772 } 2773 } 2774 } else if (x->bm_flags & BW_METER_LEQ) { 2775 /* 2776 * Processing for "<=" type of bw_meter entry 2777 */ 2778 if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) { 2779 /* 2780 * We are behind time with the multicast forwarding table 2781 * scanning for "<=" type of bw_meter entries, so test now 2782 * if we should deliver an upcall. 2783 */ 2784 if (((x->bm_flags & BW_METER_UNIT_PACKETS) && 2785 (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) || 2786 ((x->bm_flags & BW_METER_UNIT_BYTES) && 2787 (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) { 2788 /* Prepare an upcall for delivery */ 2789 bw_meter_prepare_upcall(x, nowp); 2790 } 2791 /* Reschedule the bw_meter entry */ 2792 unschedule_bw_meter(x); 2793 schedule_bw_meter(x, nowp); 2794 } 2795 2796 /* Record that a packet is received */ 2797 x->bm_measured.b_packets++; 2798 x->bm_measured.b_bytes += plen; 2799 2800 /* 2801 * Test if we should restart the measuring interval 2802 */ 2803 if ((x->bm_flags & BW_METER_UNIT_PACKETS && 2804 x->bm_measured.b_packets <= x->bm_threshold.b_packets) || 2805 (x->bm_flags & BW_METER_UNIT_BYTES && 2806 x->bm_measured.b_bytes <= x->bm_threshold.b_bytes)) { 2807 /* Don't restart the measuring interval */ 2808 } else { 2809 /* Do restart the measuring interval */ 2810 /* 2811 * XXX: note that we don't unschedule and schedule, because this 2812 * might be too much overhead per packet. Instead, when we process 2813 * all entries for a given timer hash bin, we check whether it is 2814 * really a timeout. If not, we reschedule at that time. 2815 */ 2816 x->bm_start_time = *nowp; 2817 x->bm_measured.b_packets = 0; 2818 x->bm_measured.b_bytes = 0; 2819 x->bm_flags &= ~BW_METER_UPCALL_DELIVERED; 2820 } 2821 } 2822 } 2823 2824 /* 2825 * Prepare a bandwidth-related upcall 2826 */ 2827 static void 2828 bw_meter_prepare_upcall(struct bw_meter *x, struct timeval *nowp) 2829 { 2830 struct timeval delta; 2831 struct bw_upcall *u; 2832 2833 /* 2834 * Compute the measured time interval 2835 */ 2836 delta = *nowp; 2837 BW_TIMEVALDECR(&delta, &x->bm_start_time); 2838 2839 /* 2840 * If there are too many pending upcalls, deliver them now 2841 */ 2842 if (bw_upcalls_n >= BW_UPCALLS_MAX) 2843 bw_upcalls_send(); 2844 2845 /* 2846 * Set the bw_upcall entry 2847 */ 2848 u = &bw_upcalls[bw_upcalls_n++]; 2849 u->bu_src = x->bm_mfc->mfc_origin; 2850 u->bu_dst = x->bm_mfc->mfc_mcastgrp; 2851 u->bu_threshold.b_time = x->bm_threshold.b_time; 2852 u->bu_threshold.b_packets = x->bm_threshold.b_packets; 2853 u->bu_threshold.b_bytes = x->bm_threshold.b_bytes; 2854 u->bu_measured.b_time = delta; 2855 u->bu_measured.b_packets = x->bm_measured.b_packets; 2856 u->bu_measured.b_bytes = x->bm_measured.b_bytes; 2857 u->bu_flags = 0; 2858 if (x->bm_flags & BW_METER_UNIT_PACKETS) 2859 u->bu_flags |= BW_UPCALL_UNIT_PACKETS; 2860 if (x->bm_flags & BW_METER_UNIT_BYTES) 2861 u->bu_flags |= BW_UPCALL_UNIT_BYTES; 2862 if (x->bm_flags & BW_METER_GEQ) 2863 u->bu_flags |= BW_UPCALL_GEQ; 2864 if (x->bm_flags & BW_METER_LEQ) 2865 u->bu_flags |= BW_UPCALL_LEQ; 2866 } 2867 2868 /* 2869 * Send the pending bandwidth-related upcalls 2870 */ 2871 static void 2872 bw_upcalls_send(void) 2873 { 2874 struct mbuf *m; 2875 int len = bw_upcalls_n * sizeof(bw_upcalls[0]); 2876 struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; 2877 static struct igmpmsg igmpmsg = { 0, /* unused1 */ 2878 0, /* unused2 */ 2879 IGMPMSG_BW_UPCALL,/* im_msgtype */ 2880 0, /* im_mbz */ 2881 0, /* im_vif */ 2882 0, /* unused3 */ 2883 { 0 }, /* im_src */ 2884 { 0 } }; /* im_dst */ 2885 2886 if (bw_upcalls_n == 0) 2887 return; /* No pending upcalls */ 2888 2889 bw_upcalls_n = 0; 2890 2891 /* 2892 * Allocate a new mbuf, initialize it with the header and 2893 * the payload for the pending calls. 2894 */ 2895 MGETHDR(m, M_DONTWAIT, MT_HEADER); 2896 if (m == NULL) { 2897 log(LOG_WARNING, "bw_upcalls_send: cannot allocate mbuf\n"); 2898 return; 2899 } 2900 2901 m->m_len = m->m_pkthdr.len = 0; 2902 m_copyback(m, 0, sizeof(struct igmpmsg), (caddr_t)&igmpmsg); 2903 m_copyback(m, sizeof(struct igmpmsg), len, (caddr_t)&bw_upcalls[0]); 2904 2905 /* 2906 * Send the upcalls 2907 * XXX do we need to set the address in k_igmpsrc ? 2908 */ 2909 mrtstat.mrts_upcalls++; 2910 if (socket_send(ip_mrouter, m, &k_igmpsrc) < 0) { 2911 log(LOG_WARNING, "bw_upcalls_send: ip_mrouter socket queue full\n"); 2912 ++mrtstat.mrts_upq_sockfull; 2913 } 2914 } 2915 2916 /* 2917 * Compute the timeout hash value for the bw_meter entries 2918 */ 2919 #define BW_METER_TIMEHASH(bw_meter, hash) \ 2920 do { \ 2921 struct timeval next_timeval = (bw_meter)->bm_start_time; \ 2922 \ 2923 BW_TIMEVALADD(&next_timeval, &(bw_meter)->bm_threshold.b_time); \ 2924 (hash) = next_timeval.tv_sec; \ 2925 if (next_timeval.tv_usec) \ 2926 (hash)++; /* XXX: make sure we don't timeout early */ \ 2927 (hash) %= BW_METER_BUCKETS; \ 2928 } while (/*CONSTCOND*/ 0) 2929 2930 /* 2931 * Schedule a timer to process periodically bw_meter entry of type "<=" 2932 * by linking the entry in the proper hash bucket. 2933 */ 2934 static void 2935 schedule_bw_meter(struct bw_meter *x, struct timeval *nowp) 2936 { 2937 int time_hash; 2938 2939 if (!(x->bm_flags & BW_METER_LEQ)) 2940 return; /* XXX: we schedule timers only for "<=" entries */ 2941 2942 /* 2943 * Reset the bw_meter entry 2944 */ 2945 x->bm_start_time = *nowp; 2946 x->bm_measured.b_packets = 0; 2947 x->bm_measured.b_bytes = 0; 2948 x->bm_flags &= ~BW_METER_UPCALL_DELIVERED; 2949 2950 /* 2951 * Compute the timeout hash value and insert the entry 2952 */ 2953 BW_METER_TIMEHASH(x, time_hash); 2954 x->bm_time_next = bw_meter_timers[time_hash]; 2955 bw_meter_timers[time_hash] = x; 2956 x->bm_time_hash = time_hash; 2957 } 2958 2959 /* 2960 * Unschedule the periodic timer that processes bw_meter entry of type "<=" 2961 * by removing the entry from the proper hash bucket. 2962 */ 2963 static void 2964 unschedule_bw_meter(struct bw_meter *x) 2965 { 2966 int time_hash; 2967 struct bw_meter *prev, *tmp; 2968 2969 if (!(x->bm_flags & BW_METER_LEQ)) 2970 return; /* XXX: we schedule timers only for "<=" entries */ 2971 2972 /* 2973 * Compute the timeout hash value and delete the entry 2974 */ 2975 time_hash = x->bm_time_hash; 2976 if (time_hash >= BW_METER_BUCKETS) 2977 return; /* Entry was not scheduled */ 2978 2979 for (prev = NULL, tmp = bw_meter_timers[time_hash]; 2980 tmp != NULL; prev = tmp, tmp = tmp->bm_time_next) 2981 if (tmp == x) 2982 break; 2983 2984 if (tmp == NULL) 2985 panic("unschedule_bw_meter: bw_meter entry not found"); 2986 2987 if (prev != NULL) 2988 prev->bm_time_next = x->bm_time_next; 2989 else 2990 bw_meter_timers[time_hash] = x->bm_time_next; 2991 2992 x->bm_time_next = NULL; 2993 x->bm_time_hash = BW_METER_BUCKETS; 2994 } 2995 2996 /* 2997 * Process all "<=" type of bw_meter that should be processed now, 2998 * and for each entry prepare an upcall if necessary. Each processed 2999 * entry is rescheduled again for the (periodic) processing. 3000 * 3001 * This is run periodically (once per second normally). On each round, 3002 * all the potentially matching entries are in the hash slot that we are 3003 * looking at. 3004 */ 3005 static void 3006 bw_meter_process(void) 3007 { 3008 int s; 3009 static uint32_t last_tv_sec; /* last time we processed this */ 3010 3011 uint32_t loops; 3012 int i; 3013 struct timeval now, process_endtime; 3014 3015 microtime(&now); 3016 if (last_tv_sec == now.tv_sec) 3017 return; /* nothing to do */ 3018 3019 loops = now.tv_sec - last_tv_sec; 3020 last_tv_sec = now.tv_sec; 3021 if (loops > BW_METER_BUCKETS) 3022 loops = BW_METER_BUCKETS; 3023 3024 s = splsoftnet(); 3025 /* 3026 * Process all bins of bw_meter entries from the one after the last 3027 * processed to the current one. On entry, i points to the last bucket 3028 * visited, so we need to increment i at the beginning of the loop. 3029 */ 3030 for (i = (now.tv_sec - loops) % BW_METER_BUCKETS; loops > 0; loops--) { 3031 struct bw_meter *x, *tmp_list; 3032 3033 if (++i >= BW_METER_BUCKETS) 3034 i = 0; 3035 3036 /* Disconnect the list of bw_meter entries from the bin */ 3037 tmp_list = bw_meter_timers[i]; 3038 bw_meter_timers[i] = NULL; 3039 3040 /* Process the list of bw_meter entries */ 3041 while (tmp_list != NULL) { 3042 x = tmp_list; 3043 tmp_list = tmp_list->bm_time_next; 3044 3045 /* Test if the time interval is over */ 3046 process_endtime = x->bm_start_time; 3047 BW_TIMEVALADD(&process_endtime, &x->bm_threshold.b_time); 3048 if (BW_TIMEVALCMP(&process_endtime, &now, >)) { 3049 /* Not yet: reschedule, but don't reset */ 3050 int time_hash; 3051 3052 BW_METER_TIMEHASH(x, time_hash); 3053 if (time_hash == i && process_endtime.tv_sec == now.tv_sec) { 3054 /* 3055 * XXX: somehow the bin processing is a bit ahead of time. 3056 * Put the entry in the next bin. 3057 */ 3058 if (++time_hash >= BW_METER_BUCKETS) 3059 time_hash = 0; 3060 } 3061 x->bm_time_next = bw_meter_timers[time_hash]; 3062 bw_meter_timers[time_hash] = x; 3063 x->bm_time_hash = time_hash; 3064 3065 continue; 3066 } 3067 3068 /* 3069 * Test if we should deliver an upcall 3070 */ 3071 if (((x->bm_flags & BW_METER_UNIT_PACKETS) && 3072 (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) || 3073 ((x->bm_flags & BW_METER_UNIT_BYTES) && 3074 (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) { 3075 /* Prepare an upcall for delivery */ 3076 bw_meter_prepare_upcall(x, &now); 3077 } 3078 3079 /* 3080 * Reschedule for next processing 3081 */ 3082 schedule_bw_meter(x, &now); 3083 } 3084 } 3085 3086 /* Send all upcalls that are pending delivery */ 3087 bw_upcalls_send(); 3088 3089 splx(s); 3090 } 3091 3092 /* 3093 * A periodic function for sending all upcalls that are pending delivery 3094 */ 3095 static void 3096 expire_bw_upcalls_send(void *unused) 3097 { 3098 int s; 3099 3100 s = splsoftnet(); 3101 bw_upcalls_send(); 3102 splx(s); 3103 3104 callout_reset(&bw_upcalls_ch, BW_UPCALLS_PERIOD, 3105 expire_bw_upcalls_send, NULL); 3106 } 3107 3108 /* 3109 * A periodic function for periodic scanning of the multicast forwarding 3110 * table for processing all "<=" bw_meter entries. 3111 */ 3112 static void 3113 expire_bw_meter_process(void *unused) 3114 { 3115 if (mrt_api_config & MRT_MFC_BW_UPCALL) 3116 bw_meter_process(); 3117 3118 callout_reset(&bw_meter_ch, BW_METER_PERIOD, 3119 expire_bw_meter_process, NULL); 3120 } 3121 3122 /* 3123 * End of bandwidth monitoring code 3124 */ 3125 3126 #ifdef PIM 3127 /* 3128 * Send the packet up to the user daemon, or eventually do kernel encapsulation 3129 */ 3130 static int 3131 pim_register_send(struct ip *ip, struct vif *vifp, 3132 struct mbuf *m, struct mfc *rt) 3133 { 3134 struct mbuf *mb_copy, *mm; 3135 3136 if (mrtdebug & DEBUG_PIM) 3137 log(LOG_DEBUG, "pim_register_send: "); 3138 3139 mb_copy = pim_register_prepare(ip, m); 3140 if (mb_copy == NULL) 3141 return ENOBUFS; 3142 3143 /* 3144 * Send all the fragments. Note that the mbuf for each fragment 3145 * is freed by the sending machinery. 3146 */ 3147 for (mm = mb_copy; mm; mm = mb_copy) { 3148 mb_copy = mm->m_nextpkt; 3149 mm->m_nextpkt = NULL; 3150 mm = m_pullup(mm, sizeof(struct ip)); 3151 if (mm != NULL) { 3152 ip = mtod(mm, struct ip *); 3153 if ((mrt_api_config & MRT_MFC_RP) && 3154 !in_nullhost(rt->mfc_rp)) { 3155 pim_register_send_rp(ip, vifp, mm, rt); 3156 } else { 3157 pim_register_send_upcall(ip, vifp, mm, rt); 3158 } 3159 } 3160 } 3161 3162 return 0; 3163 } 3164 3165 /* 3166 * Return a copy of the data packet that is ready for PIM Register 3167 * encapsulation. 3168 * XXX: Note that in the returned copy the IP header is a valid one. 3169 */ 3170 static struct mbuf * 3171 pim_register_prepare(struct ip *ip, struct mbuf *m) 3172 { 3173 struct mbuf *mb_copy = NULL; 3174 int mtu; 3175 3176 /* Take care of delayed checksums */ 3177 if (m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) { 3178 in_delayed_cksum(m); 3179 m->m_pkthdr.csum_flags &= ~(M_CSUM_TCPv4|M_CSUM_UDPv4); 3180 } 3181 3182 /* 3183 * Copy the old packet & pullup its IP header into the 3184 * new mbuf so we can modify it. 3185 */ 3186 mb_copy = m_copy(m, 0, M_COPYALL); 3187 if (mb_copy == NULL) 3188 return NULL; 3189 mb_copy = m_pullup(mb_copy, ip->ip_hl << 2); 3190 if (mb_copy == NULL) 3191 return NULL; 3192 3193 /* take care of the TTL */ 3194 ip = mtod(mb_copy, struct ip *); 3195 --ip->ip_ttl; 3196 3197 /* Compute the MTU after the PIM Register encapsulation */ 3198 mtu = 0xffff - sizeof(pim_encap_iphdr) - sizeof(pim_encap_pimhdr); 3199 3200 if (ntohs(ip->ip_len) <= mtu) { 3201 /* Turn the IP header into a valid one */ 3202 ip->ip_sum = 0; 3203 ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2); 3204 } else { 3205 /* Fragment the packet */ 3206 if (ip_fragment(mb_copy, NULL, mtu) != 0) { 3207 /* XXX: mb_copy was freed by ip_fragment() */ 3208 return NULL; 3209 } 3210 } 3211 return mb_copy; 3212 } 3213 3214 /* 3215 * Send an upcall with the data packet to the user-level process. 3216 */ 3217 static int 3218 pim_register_send_upcall(struct ip *ip, struct vif *vifp, 3219 struct mbuf *mb_copy, struct mfc *rt) 3220 { 3221 struct mbuf *mb_first; 3222 int len = ntohs(ip->ip_len); 3223 struct igmpmsg *im; 3224 struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; 3225 3226 /* 3227 * Add a new mbuf with an upcall header 3228 */ 3229 MGETHDR(mb_first, M_DONTWAIT, MT_HEADER); 3230 if (mb_first == NULL) { 3231 m_freem(mb_copy); 3232 return ENOBUFS; 3233 } 3234 mb_first->m_data += max_linkhdr; 3235 mb_first->m_pkthdr.len = len + sizeof(struct igmpmsg); 3236 mb_first->m_len = sizeof(struct igmpmsg); 3237 mb_first->m_next = mb_copy; 3238 3239 /* Send message to routing daemon */ 3240 im = mtod(mb_first, struct igmpmsg *); 3241 im->im_msgtype = IGMPMSG_WHOLEPKT; 3242 im->im_mbz = 0; 3243 im->im_vif = vifp - viftable; 3244 im->im_src = ip->ip_src; 3245 im->im_dst = ip->ip_dst; 3246 3247 k_igmpsrc.sin_addr = ip->ip_src; 3248 3249 mrtstat.mrts_upcalls++; 3250 3251 if (socket_send(ip_mrouter, mb_first, &k_igmpsrc) < 0) { 3252 if (mrtdebug & DEBUG_PIM) 3253 log(LOG_WARNING, 3254 "mcast: pim_register_send_upcall: ip_mrouter socket queue full"); 3255 ++mrtstat.mrts_upq_sockfull; 3256 return ENOBUFS; 3257 } 3258 3259 /* Keep statistics */ 3260 pimstat.pims_snd_registers_msgs++; 3261 pimstat.pims_snd_registers_bytes += len; 3262 3263 return 0; 3264 } 3265 3266 /* 3267 * Encapsulate the data packet in PIM Register message and send it to the RP. 3268 */ 3269 static int 3270 pim_register_send_rp(struct ip *ip, struct vif *vifp, 3271 struct mbuf *mb_copy, struct mfc *rt) 3272 { 3273 struct mbuf *mb_first; 3274 struct ip *ip_outer; 3275 struct pim_encap_pimhdr *pimhdr; 3276 int len = ntohs(ip->ip_len); 3277 vifi_t vifi = rt->mfc_parent; 3278 3279 if ((vifi >= numvifs) || in_nullhost(viftable[vifi].v_lcl_addr)) { 3280 m_freem(mb_copy); 3281 return EADDRNOTAVAIL; /* The iif vif is invalid */ 3282 } 3283 3284 /* 3285 * Add a new mbuf with the encapsulating header 3286 */ 3287 MGETHDR(mb_first, M_DONTWAIT, MT_HEADER); 3288 if (mb_first == NULL) { 3289 m_freem(mb_copy); 3290 return ENOBUFS; 3291 } 3292 mb_first->m_data += max_linkhdr; 3293 mb_first->m_len = sizeof(pim_encap_iphdr) + sizeof(pim_encap_pimhdr); 3294 mb_first->m_next = mb_copy; 3295 3296 mb_first->m_pkthdr.len = len + mb_first->m_len; 3297 3298 /* 3299 * Fill in the encapsulating IP and PIM header 3300 */ 3301 ip_outer = mtod(mb_first, struct ip *); 3302 *ip_outer = pim_encap_iphdr; 3303 ip_outer->ip_id = ip_newid(); 3304 ip_outer->ip_len = htons(len + sizeof(pim_encap_iphdr) + 3305 sizeof(pim_encap_pimhdr)); 3306 ip_outer->ip_src = viftable[vifi].v_lcl_addr; 3307 ip_outer->ip_dst = rt->mfc_rp; 3308 /* 3309 * Copy the inner header TOS to the outer header, and take care of the 3310 * IP_DF bit. 3311 */ 3312 ip_outer->ip_tos = ip->ip_tos; 3313 if (ntohs(ip->ip_off) & IP_DF) 3314 ip_outer->ip_off |= IP_DF; 3315 pimhdr = (struct pim_encap_pimhdr *)((caddr_t)ip_outer 3316 + sizeof(pim_encap_iphdr)); 3317 *pimhdr = pim_encap_pimhdr; 3318 /* If the iif crosses a border, set the Border-bit */ 3319 if (rt->mfc_flags[vifi] & MRT_MFC_FLAGS_BORDER_VIF & mrt_api_config) 3320 pimhdr->flags |= htonl(PIM_BORDER_REGISTER); 3321 3322 mb_first->m_data += sizeof(pim_encap_iphdr); 3323 pimhdr->pim.pim_cksum = in_cksum(mb_first, sizeof(pim_encap_pimhdr)); 3324 mb_first->m_data -= sizeof(pim_encap_iphdr); 3325 3326 if (vifp->v_rate_limit == 0) 3327 tbf_send_packet(vifp, mb_first); 3328 else 3329 tbf_control(vifp, mb_first, ip, ntohs(ip_outer->ip_len)); 3330 3331 /* Keep statistics */ 3332 pimstat.pims_snd_registers_msgs++; 3333 pimstat.pims_snd_registers_bytes += len; 3334 3335 return 0; 3336 } 3337 3338 /* 3339 * PIM-SMv2 and PIM-DM messages processing. 3340 * Receives and verifies the PIM control messages, and passes them 3341 * up to the listening socket, using rip_input(). 3342 * The only message with special processing is the PIM_REGISTER message 3343 * (used by PIM-SM): the PIM header is stripped off, and the inner packet 3344 * is passed to if_simloop(). 3345 */ 3346 void 3347 pim_input(struct mbuf *m, ...) 3348 { 3349 struct ip *ip = mtod(m, struct ip *); 3350 struct pim *pim; 3351 int minlen; 3352 int datalen; 3353 int ip_tos; 3354 int proto; 3355 int iphlen; 3356 va_list ap; 3357 3358 va_start(ap, m); 3359 iphlen = va_arg(ap, int); 3360 proto = va_arg(ap, int); 3361 va_end(ap); 3362 3363 datalen = ntohs(ip->ip_len) - iphlen; 3364 3365 /* Keep statistics */ 3366 pimstat.pims_rcv_total_msgs++; 3367 pimstat.pims_rcv_total_bytes += datalen; 3368 3369 /* 3370 * Validate lengths 3371 */ 3372 if (datalen < PIM_MINLEN) { 3373 pimstat.pims_rcv_tooshort++; 3374 log(LOG_ERR, "pim_input: packet size too small %d from %lx\n", 3375 datalen, (u_long)ip->ip_src.s_addr); 3376 m_freem(m); 3377 return; 3378 } 3379 3380 /* 3381 * If the packet is at least as big as a REGISTER, go agead 3382 * and grab the PIM REGISTER header size, to avoid another 3383 * possible m_pullup() later. 3384 * 3385 * PIM_MINLEN == pimhdr + u_int32_t == 4 + 4 = 8 3386 * PIM_REG_MINLEN == pimhdr + reghdr + encap_iphdr == 4 + 4 + 20 = 28 3387 */ 3388 minlen = iphlen + (datalen >= PIM_REG_MINLEN ? PIM_REG_MINLEN : PIM_MINLEN); 3389 /* 3390 * Get the IP and PIM headers in contiguous memory, and 3391 * possibly the PIM REGISTER header. 3392 */ 3393 if ((m->m_flags & M_EXT || m->m_len < minlen) && 3394 (m = m_pullup(m, minlen)) == NULL) { 3395 log(LOG_ERR, "pim_input: m_pullup failure\n"); 3396 return; 3397 } 3398 /* m_pullup() may have given us a new mbuf so reset ip. */ 3399 ip = mtod(m, struct ip *); 3400 ip_tos = ip->ip_tos; 3401 3402 /* adjust mbuf to point to the PIM header */ 3403 m->m_data += iphlen; 3404 m->m_len -= iphlen; 3405 pim = mtod(m, struct pim *); 3406 3407 /* 3408 * Validate checksum. If PIM REGISTER, exclude the data packet. 3409 * 3410 * XXX: some older PIMv2 implementations don't make this distinction, 3411 * so for compatibility reason perform the checksum over part of the 3412 * message, and if error, then over the whole message. 3413 */ 3414 if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER && in_cksum(m, PIM_MINLEN) == 0) { 3415 /* do nothing, checksum okay */ 3416 } else if (in_cksum(m, datalen)) { 3417 pimstat.pims_rcv_badsum++; 3418 if (mrtdebug & DEBUG_PIM) 3419 log(LOG_DEBUG, "pim_input: invalid checksum"); 3420 m_freem(m); 3421 return; 3422 } 3423 3424 /* PIM version check */ 3425 if (PIM_VT_V(pim->pim_vt) < PIM_VERSION) { 3426 pimstat.pims_rcv_badversion++; 3427 log(LOG_ERR, "pim_input: incorrect version %d, expecting %d\n", 3428 PIM_VT_V(pim->pim_vt), PIM_VERSION); 3429 m_freem(m); 3430 return; 3431 } 3432 3433 /* restore mbuf back to the outer IP */ 3434 m->m_data -= iphlen; 3435 m->m_len += iphlen; 3436 3437 if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER) { 3438 /* 3439 * Since this is a REGISTER, we'll make a copy of the register 3440 * headers ip + pim + u_int32 + encap_ip, to be passed up to the 3441 * routing daemon. 3442 */ 3443 int s; 3444 struct sockaddr_in dst = { sizeof(dst), AF_INET }; 3445 struct mbuf *mcp; 3446 struct ip *encap_ip; 3447 u_int32_t *reghdr; 3448 struct ifnet *vifp; 3449 3450 s = splsoftnet(); 3451 if ((reg_vif_num >= numvifs) || (reg_vif_num == VIFI_INVALID)) { 3452 splx(s); 3453 if (mrtdebug & DEBUG_PIM) 3454 log(LOG_DEBUG, 3455 "pim_input: register vif not set: %d\n", reg_vif_num); 3456 m_freem(m); 3457 return; 3458 } 3459 /* XXX need refcnt? */ 3460 vifp = viftable[reg_vif_num].v_ifp; 3461 splx(s); 3462 3463 /* 3464 * Validate length 3465 */ 3466 if (datalen < PIM_REG_MINLEN) { 3467 pimstat.pims_rcv_tooshort++; 3468 pimstat.pims_rcv_badregisters++; 3469 log(LOG_ERR, 3470 "pim_input: register packet size too small %d from %lx\n", 3471 datalen, (u_long)ip->ip_src.s_addr); 3472 m_freem(m); 3473 return; 3474 } 3475 3476 reghdr = (u_int32_t *)(pim + 1); 3477 encap_ip = (struct ip *)(reghdr + 1); 3478 3479 if (mrtdebug & DEBUG_PIM) { 3480 log(LOG_DEBUG, 3481 "pim_input[register], encap_ip: %lx -> %lx, encap_ip len %d\n", 3482 (u_long)ntohl(encap_ip->ip_src.s_addr), 3483 (u_long)ntohl(encap_ip->ip_dst.s_addr), 3484 ntohs(encap_ip->ip_len)); 3485 } 3486 3487 /* verify the version number of the inner packet */ 3488 if (encap_ip->ip_v != IPVERSION) { 3489 pimstat.pims_rcv_badregisters++; 3490 if (mrtdebug & DEBUG_PIM) { 3491 log(LOG_DEBUG, "pim_input: invalid IP version (%d) " 3492 "of the inner packet\n", encap_ip->ip_v); 3493 } 3494 m_freem(m); 3495 return; 3496 } 3497 3498 /* verify the inner packet is destined to a mcast group */ 3499 if (!IN_MULTICAST(encap_ip->ip_dst.s_addr)) { 3500 pimstat.pims_rcv_badregisters++; 3501 if (mrtdebug & DEBUG_PIM) 3502 log(LOG_DEBUG, 3503 "pim_input: inner packet of register is not " 3504 "multicast %lx\n", 3505 (u_long)ntohl(encap_ip->ip_dst.s_addr)); 3506 m_freem(m); 3507 return; 3508 } 3509 3510 /* If a NULL_REGISTER, pass it to the daemon */ 3511 if ((ntohl(*reghdr) & PIM_NULL_REGISTER)) 3512 goto pim_input_to_daemon; 3513 3514 /* 3515 * Copy the TOS from the outer IP header to the inner IP header. 3516 */ 3517 if (encap_ip->ip_tos != ip_tos) { 3518 /* Outer TOS -> inner TOS */ 3519 encap_ip->ip_tos = ip_tos; 3520 /* Recompute the inner header checksum. Sigh... */ 3521 3522 /* adjust mbuf to point to the inner IP header */ 3523 m->m_data += (iphlen + PIM_MINLEN); 3524 m->m_len -= (iphlen + PIM_MINLEN); 3525 3526 encap_ip->ip_sum = 0; 3527 encap_ip->ip_sum = in_cksum(m, encap_ip->ip_hl << 2); 3528 3529 /* restore mbuf to point back to the outer IP header */ 3530 m->m_data -= (iphlen + PIM_MINLEN); 3531 m->m_len += (iphlen + PIM_MINLEN); 3532 } 3533 3534 /* 3535 * Decapsulate the inner IP packet and loopback to forward it 3536 * as a normal multicast packet. Also, make a copy of the 3537 * outer_iphdr + pimhdr + reghdr + encap_iphdr 3538 * to pass to the daemon later, so it can take the appropriate 3539 * actions (e.g., send back PIM_REGISTER_STOP). 3540 * XXX: here m->m_data points to the outer IP header. 3541 */ 3542 mcp = m_copy(m, 0, iphlen + PIM_REG_MINLEN); 3543 if (mcp == NULL) { 3544 log(LOG_ERR, 3545 "pim_input: pim register: could not copy register head\n"); 3546 m_freem(m); 3547 return; 3548 } 3549 3550 /* Keep statistics */ 3551 /* XXX: registers_bytes include only the encap. mcast pkt */ 3552 pimstat.pims_rcv_registers_msgs++; 3553 pimstat.pims_rcv_registers_bytes += ntohs(encap_ip->ip_len); 3554 3555 /* 3556 * forward the inner ip packet; point m_data at the inner ip. 3557 */ 3558 m_adj(m, iphlen + PIM_MINLEN); 3559 3560 if (mrtdebug & DEBUG_PIM) { 3561 log(LOG_DEBUG, 3562 "pim_input: forwarding decapsulated register: " 3563 "src %lx, dst %lx, vif %d\n", 3564 (u_long)ntohl(encap_ip->ip_src.s_addr), 3565 (u_long)ntohl(encap_ip->ip_dst.s_addr), 3566 reg_vif_num); 3567 } 3568 /* NB: vifp was collected above; can it change on us? */ 3569 looutput(vifp, m, (struct sockaddr *)&dst, (struct rtentry *)NULL); 3570 3571 /* prepare the register head to send to the mrouting daemon */ 3572 m = mcp; 3573 } 3574 3575 pim_input_to_daemon: 3576 /* 3577 * Pass the PIM message up to the daemon; if it is a Register message, 3578 * pass the 'head' only up to the daemon. This includes the 3579 * outer IP header, PIM header, PIM-Register header and the 3580 * inner IP header. 3581 * XXX: the outer IP header pkt size of a Register is not adjust to 3582 * reflect the fact that the inner multicast data is truncated. 3583 */ 3584 rip_input(m, iphlen, proto); 3585 3586 return; 3587 } 3588 #endif /* PIM */ 3589