1 /* $OpenBSD: ip_mroute.c,v 1.52 2008/09/16 21:33:37 chl Exp $ */ 2 /* $NetBSD: ip_mroute.c,v 1.85 2004/04/26 01:31:57 matt Exp $ */ 3 4 /* 5 * Copyright (c) 1989 Stephen Deering 6 * Copyright (c) 1992, 1993 7 * The Regents of the University of California. All rights reserved. 8 * 9 * This code is derived from software contributed to Berkeley by 10 * Stephen Deering of Stanford University. 11 * 12 * Redistribution and use in source and binary forms, with or without 13 * modification, are permitted provided that the following conditions 14 * are met: 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 * 3. Neither the name of the University nor the names of its contributors 21 * may be used to endorse or promote products derived from this software 22 * without specific prior written permission. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 35 * 36 * @(#)ip_mroute.c 8.2 (Berkeley) 11/15/93 37 */ 38 39 /* 40 * IP multicast forwarding procedures 41 * 42 * Written by David Waitzman, BBN Labs, August 1988. 43 * Modified by Steve Deering, Stanford, February 1989. 44 * Modified by Mark J. Steiglitz, Stanford, May, 1991 45 * Modified by Van Jacobson, LBL, January 1993 46 * Modified by Ajit Thyagarajan, PARC, August 1993 47 * Modified by Bill Fenner, PARC, April 1994 48 * Modified by Charles M. Hannum, NetBSD, May 1995. 49 * Modified by Ahmed Helmy, SGI, June 1996 50 * Modified by George Edmond Eddy (Rusty), ISI, February 1998 51 * Modified by Pavlin Radoslavov, USC/ISI, May 1998, August 1999, October 2000 52 * Modified by Hitoshi Asaeda, WIDE, August 2000 53 * Modified by Pavlin Radoslavov, ICSI, October 2002 54 * 55 * MROUTING Revision: 1.2 56 * and PIM-SMv2 and PIM-DM support, advanced API support, 57 * bandwidth metering and signaling 58 */ 59 60 #ifdef PIM 61 #define _PIM_VT 1 62 #endif 63 64 #include <sys/param.h> 65 #include <sys/systm.h> 66 #include <sys/mbuf.h> 67 #include <sys/socket.h> 68 #include <sys/socketvar.h> 69 #include <sys/protosw.h> 70 #include <sys/errno.h> 71 #include <sys/time.h> 72 #include <sys/kernel.h> 73 #include <sys/ioctl.h> 74 #include <sys/syslog.h> 75 #include <sys/sysctl.h> 76 #include <sys/timeout.h> 77 78 #include <net/if.h> 79 #include <net/route.h> 80 #include <net/raw_cb.h> 81 82 #include <netinet/in.h> 83 #include <netinet/in_var.h> 84 #include <netinet/in_systm.h> 85 #include <netinet/ip.h> 86 #include <netinet/ip_var.h> 87 #include <netinet/in_pcb.h> 88 #include <netinet/udp.h> 89 #include <netinet/igmp.h> 90 #include <netinet/igmp_var.h> 91 #include <netinet/ip_mroute.h> 92 #ifdef PIM 93 #include <netinet/pim.h> 94 #include <netinet/pim_var.h> 95 #endif 96 97 #include <sys/stdarg.h> 98 99 #define IP_MULTICASTOPTS 0 100 #define M_PULLUP(m, len) \ 101 do { \ 102 if ((m) && ((m)->m_flags & M_EXT || (m)->m_len < (len))) \ 103 (m) = m_pullup((m), (len)); \ 104 } while (/*CONSTCOND*/ 0) 105 106 /* 107 * Globals. All but ip_mrouter and ip_mrtproto could be static, 108 * except for netstat or debugging purposes. 109 */ 110 struct socket *ip_mrouter = NULL; 111 int ip_mrtproto = IGMP_DVMRP; /* for netstat only */ 112 113 #define NO_RTE_FOUND 0x1 114 #define RTE_FOUND 0x2 115 116 #define MFCHASH(a, g) \ 117 ((((a).s_addr >> 20) ^ ((a).s_addr >> 10) ^ (a).s_addr ^ \ 118 ((g).s_addr >> 20) ^ ((g).s_addr >> 10) ^ (g).s_addr) & mfchash) 119 LIST_HEAD(mfchashhdr, mfc) *mfchashtbl; 120 u_long mfchash; 121 122 u_char nexpire[MFCTBLSIZ]; 123 struct vif viftable[MAXVIFS]; 124 struct mrtstat mrtstat; 125 u_int mrtdebug = 0; /* debug level */ 126 #define DEBUG_MFC 0x02 127 #define DEBUG_FORWARD 0x04 128 #define DEBUG_EXPIRE 0x08 129 #define DEBUG_XMIT 0x10 130 #define DEBUG_PIM 0x20 131 132 #define VIFI_INVALID ((vifi_t) -1) 133 134 u_int tbfdebug = 0; /* tbf debug level */ 135 #ifdef RSVP_ISI 136 u_int rsvpdebug = 0; /* rsvp debug level */ 137 extern struct socket *ip_rsvpd; 138 extern int rsvp_on; 139 #endif /* RSVP_ISI */ 140 141 #define EXPIRE_TIMEOUT (hz / 4) /* 4x / second */ 142 #define UPCALL_EXPIRE 6 /* number of timeouts */ 143 struct timeout expire_upcalls_ch; 144 145 /* 146 * Define the token bucket filter structures 147 */ 148 149 #define TBF_REPROCESS (hz / 100) /* 100x / second */ 150 151 static int get_sg_cnt(struct sioc_sg_req *); 152 static int get_vif_cnt(struct sioc_vif_req *); 153 static int ip_mrouter_init(struct socket *, struct mbuf *); 154 static int get_version(struct mbuf *); 155 static int set_assert(struct mbuf *); 156 static int get_assert(struct mbuf *); 157 static int add_vif(struct mbuf *); 158 static int del_vif(struct mbuf *); 159 static void update_mfc_params(struct mfc *, struct mfcctl2 *); 160 static void init_mfc_params(struct mfc *, struct mfcctl2 *); 161 static void expire_mfc(struct mfc *); 162 static int add_mfc(struct mbuf *); 163 #ifdef UPCALL_TIMING 164 static void collate(struct timeval *); 165 #endif 166 static int del_mfc(struct mbuf *); 167 static int set_api_config(struct mbuf *); /* chose API capabilities */ 168 static int get_api_support(struct mbuf *); 169 static int get_api_config(struct mbuf *); 170 static int socket_send(struct socket *, struct mbuf *, 171 struct sockaddr_in *); 172 static void expire_upcalls(void *); 173 #ifdef RSVP_ISI 174 static int ip_mdq(struct mbuf *, struct ifnet *, struct mfc *, vifi_t); 175 #else 176 static int ip_mdq(struct mbuf *, struct ifnet *, struct mfc *); 177 #endif 178 static void phyint_send(struct ip *, struct vif *, struct mbuf *); 179 static void encap_send(struct ip *, struct vif *, struct mbuf *); 180 static void tbf_control(struct vif *, struct mbuf *, struct ip *, 181 u_int32_t); 182 static void tbf_queue(struct vif *, struct mbuf *); 183 static void tbf_process_q(struct vif *); 184 static void tbf_reprocess_q(void *); 185 static int tbf_dq_sel(struct vif *, struct ip *); 186 static void tbf_send_packet(struct vif *, struct mbuf *); 187 static void tbf_update_tokens(struct vif *); 188 static int priority(struct vif *, struct ip *); 189 190 /* 191 * Bandwidth monitoring 192 */ 193 static void free_bw_list(struct bw_meter *); 194 static int add_bw_upcall(struct mbuf *); 195 static int del_bw_upcall(struct mbuf *); 196 static void bw_meter_receive_packet(struct bw_meter *, int , struct timeval *); 197 static void bw_meter_prepare_upcall(struct bw_meter *, struct timeval *); 198 static void bw_upcalls_send(void); 199 static void schedule_bw_meter(struct bw_meter *, struct timeval *); 200 static void unschedule_bw_meter(struct bw_meter *); 201 static void bw_meter_process(void); 202 static void expire_bw_upcalls_send(void *); 203 static void expire_bw_meter_process(void *); 204 205 #ifdef PIM 206 static int pim_register_send(struct ip *, struct vif *, 207 struct mbuf *, struct mfc *); 208 static int pim_register_send_rp(struct ip *, struct vif *, 209 struct mbuf *, struct mfc *); 210 static int pim_register_send_upcall(struct ip *, struct vif *, 211 struct mbuf *, struct mfc *); 212 static struct mbuf *pim_register_prepare(struct ip *, struct mbuf *); 213 #endif 214 215 /* 216 * 'Interfaces' associated with decapsulator (so we can tell 217 * packets that went through it from ones that get reflected 218 * by a broken gateway). These interfaces are never linked into 219 * the system ifnet list & no routes point to them. I.e., packets 220 * can't be sent this way. They only exist as a placeholder for 221 * multicast source verification. 222 */ 223 #if 0 224 struct ifnet multicast_decap_if[MAXVIFS]; 225 #endif 226 227 #define ENCAP_TTL 64 228 #define ENCAP_PROTO IPPROTO_IPIP /* 4 */ 229 230 /* prototype IP hdr for encapsulated packets */ 231 struct ip multicast_encap_iphdr = { 232 #if BYTE_ORDER == LITTLE_ENDIAN 233 sizeof(struct ip) >> 2, IPVERSION, 234 #else 235 IPVERSION, sizeof(struct ip) >> 2, 236 #endif 237 0, /* tos */ 238 sizeof(struct ip), /* total length */ 239 0, /* id */ 240 0, /* frag offset */ 241 ENCAP_TTL, ENCAP_PROTO, 242 0, /* checksum */ 243 }; 244 245 /* 246 * Bandwidth meter variables and constants 247 */ 248 249 /* 250 * Pending timeouts are stored in a hash table, the key being the 251 * expiration time. Periodically, the entries are analysed and processed. 252 */ 253 #define BW_METER_BUCKETS 1024 254 static struct bw_meter *bw_meter_timers[BW_METER_BUCKETS]; 255 struct timeout bw_meter_ch; 256 #define BW_METER_PERIOD (hz) /* periodical handling of bw meters */ 257 258 /* 259 * Pending upcalls are stored in a vector which is flushed when 260 * full, or periodically 261 */ 262 static struct bw_upcall bw_upcalls[BW_UPCALLS_MAX]; 263 static u_int bw_upcalls_n; /* # of pending upcalls */ 264 struct timeout bw_upcalls_ch; 265 #define BW_UPCALLS_PERIOD (hz) /* periodical flush of bw upcalls */ 266 267 #ifdef PIM 268 struct pimstat pimstat; 269 270 /* 271 * Note: the PIM Register encapsulation adds the following in front of a 272 * data packet: 273 * 274 * struct pim_encap_hdr { 275 * struct ip ip; 276 * struct pim_encap_pimhdr pim; 277 * } 278 * 279 */ 280 281 struct pim_encap_pimhdr { 282 struct pim pim; 283 uint32_t flags; 284 }; 285 286 static struct ip pim_encap_iphdr = { 287 #if BYTE_ORDER == LITTLE_ENDIAN 288 sizeof(struct ip) >> 2, 289 IPVERSION, 290 #else 291 IPVERSION, 292 sizeof(struct ip) >> 2, 293 #endif 294 0, /* tos */ 295 sizeof(struct ip), /* total length */ 296 0, /* id */ 297 0, /* frag offset */ 298 ENCAP_TTL, 299 IPPROTO_PIM, 300 0, /* checksum */ 301 }; 302 303 static struct pim_encap_pimhdr pim_encap_pimhdr = { 304 { 305 PIM_MAKE_VT(PIM_VERSION, PIM_REGISTER), /* PIM vers and message type */ 306 0, /* reserved */ 307 0, /* checksum */ 308 }, 309 0 /* flags */ 310 }; 311 312 static struct ifnet multicast_register_if; 313 static vifi_t reg_vif_num = VIFI_INVALID; 314 #endif /* PIM */ 315 316 317 /* 318 * Private variables. 319 */ 320 static vifi_t numvifs = 0; 321 static int have_encap_tunnel = 0; 322 323 /* 324 * whether or not special PIM assert processing is enabled. 325 */ 326 static int pim_assert; 327 /* 328 * Rate limit for assert notification messages, in usec 329 */ 330 #define ASSERT_MSG_TIME 3000000 331 332 /* 333 * Kernel multicast routing API capabilities and setup. 334 * If more API capabilities are added to the kernel, they should be 335 * recorded in `mrt_api_support'. 336 */ 337 static const u_int32_t mrt_api_support = (MRT_MFC_FLAGS_DISABLE_WRONGVIF | 338 MRT_MFC_FLAGS_BORDER_VIF | 339 MRT_MFC_RP | 340 MRT_MFC_BW_UPCALL); 341 static u_int32_t mrt_api_config = 0; 342 343 /* 344 * Find a route for a given origin IP address and Multicast group address 345 * Type of service parameter to be added in the future!!! 346 * Statistics are updated by the caller if needed 347 * (mrtstat.mrts_mfc_lookups and mrtstat.mrts_mfc_misses) 348 */ 349 static struct mfc * 350 mfc_find(struct in_addr *o, struct in_addr *g) 351 { 352 struct mfc *rt; 353 354 LIST_FOREACH(rt, &mfchashtbl[MFCHASH(*o, *g)], mfc_hash) { 355 if (in_hosteq(rt->mfc_origin, *o) && 356 in_hosteq(rt->mfc_mcastgrp, *g) && 357 (rt->mfc_stall == NULL)) 358 break; 359 } 360 361 return (rt); 362 } 363 364 /* 365 * Macros to compute elapsed time efficiently 366 * Borrowed from Van Jacobson's scheduling code 367 */ 368 #define TV_DELTA(a, b, delta) do { \ 369 int xxs; \ 370 delta = (a).tv_usec - (b).tv_usec; \ 371 xxs = (a).tv_sec - (b).tv_sec; \ 372 switch (xxs) { \ 373 case 2: \ 374 delta += 1000000; \ 375 /* FALLTHROUGH */ \ 376 case 1: \ 377 delta += 1000000; \ 378 /* FALLTHROUGH */ \ 379 case 0: \ 380 break; \ 381 default: \ 382 delta += (1000000 * xxs); \ 383 break; \ 384 } \ 385 } while (/*CONSTCOND*/ 0) 386 387 #ifdef UPCALL_TIMING 388 u_int32_t upcall_data[51]; 389 #endif /* UPCALL_TIMING */ 390 391 /* 392 * Handle MRT setsockopt commands to modify the multicast routing tables. 393 */ 394 int 395 ip_mrouter_set(struct socket *so, int optname, struct mbuf **m) 396 { 397 int error; 398 399 if (optname != MRT_INIT && so != ip_mrouter) 400 error = ENOPROTOOPT; 401 else 402 switch (optname) { 403 case MRT_INIT: 404 error = ip_mrouter_init(so, *m); 405 break; 406 case MRT_DONE: 407 error = ip_mrouter_done(); 408 break; 409 case MRT_ADD_VIF: 410 error = add_vif(*m); 411 break; 412 case MRT_DEL_VIF: 413 error = del_vif(*m); 414 break; 415 case MRT_ADD_MFC: 416 error = add_mfc(*m); 417 break; 418 case MRT_DEL_MFC: 419 error = del_mfc(*m); 420 break; 421 case MRT_ASSERT: 422 error = set_assert(*m); 423 break; 424 case MRT_API_CONFIG: 425 error = set_api_config(*m); 426 break; 427 case MRT_ADD_BW_UPCALL: 428 error = add_bw_upcall(*m); 429 break; 430 case MRT_DEL_BW_UPCALL: 431 error = del_bw_upcall(*m); 432 break; 433 default: 434 error = ENOPROTOOPT; 435 break; 436 } 437 438 if (*m) 439 m_free(*m); 440 return (error); 441 } 442 443 /* 444 * Handle MRT getsockopt commands 445 */ 446 int 447 ip_mrouter_get(struct socket *so, int optname, struct mbuf **m) 448 { 449 int error; 450 451 if (so != ip_mrouter) 452 error = ENOPROTOOPT; 453 else { 454 *m = m_get(M_WAIT, MT_SOOPTS); 455 456 switch (optname) { 457 case MRT_VERSION: 458 error = get_version(*m); 459 break; 460 case MRT_ASSERT: 461 error = get_assert(*m); 462 break; 463 case MRT_API_SUPPORT: 464 error = get_api_support(*m); 465 break; 466 case MRT_API_CONFIG: 467 error = get_api_config(*m); 468 break; 469 default: 470 error = ENOPROTOOPT; 471 break; 472 } 473 474 if (error) 475 m_free(*m); 476 } 477 478 return (error); 479 } 480 481 /* 482 * Handle ioctl commands to obtain information from the cache 483 */ 484 int 485 mrt_ioctl(struct socket *so, u_long cmd, caddr_t data) 486 { 487 int error; 488 489 if (so != ip_mrouter) 490 error = EINVAL; 491 else 492 switch (cmd) { 493 case SIOCGETVIFCNT: 494 error = get_vif_cnt((struct sioc_vif_req *)data); 495 break; 496 case SIOCGETSGCNT: 497 error = get_sg_cnt((struct sioc_sg_req *)data); 498 break; 499 default: 500 error = ENOTTY; 501 break; 502 } 503 504 return (error); 505 } 506 507 /* 508 * returns the packet, byte, rpf-failure count for the source group provided 509 */ 510 static int 511 get_sg_cnt(struct sioc_sg_req *req) 512 { 513 int s; 514 struct mfc *rt; 515 516 s = splsoftnet(); 517 rt = mfc_find(&req->src, &req->grp); 518 if (rt == NULL) { 519 splx(s); 520 req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffff; 521 return (EADDRNOTAVAIL); 522 } 523 req->pktcnt = rt->mfc_pkt_cnt; 524 req->bytecnt = rt->mfc_byte_cnt; 525 req->wrong_if = rt->mfc_wrong_if; 526 splx(s); 527 528 return (0); 529 } 530 531 /* 532 * returns the input and output packet and byte counts on the vif provided 533 */ 534 static int 535 get_vif_cnt(struct sioc_vif_req *req) 536 { 537 vifi_t vifi = req->vifi; 538 539 if (vifi >= numvifs) 540 return (EINVAL); 541 542 req->icount = viftable[vifi].v_pkt_in; 543 req->ocount = viftable[vifi].v_pkt_out; 544 req->ibytes = viftable[vifi].v_bytes_in; 545 req->obytes = viftable[vifi].v_bytes_out; 546 547 return (0); 548 } 549 550 /* 551 * Enable multicast routing 552 */ 553 static int 554 ip_mrouter_init(struct socket *so, struct mbuf *m) 555 { 556 int *v; 557 558 if (mrtdebug) 559 log(LOG_DEBUG, 560 "ip_mrouter_init: so_type = %d, pr_protocol = %d\n", 561 so->so_type, so->so_proto->pr_protocol); 562 563 if (so->so_type != SOCK_RAW || 564 so->so_proto->pr_protocol != IPPROTO_IGMP) 565 return (EOPNOTSUPP); 566 567 if (m == NULL || m->m_len < sizeof(int)) 568 return (EINVAL); 569 570 v = mtod(m, int *); 571 if (*v != 1) 572 return (EINVAL); 573 574 if (ip_mrouter != NULL) 575 return (EADDRINUSE); 576 577 ip_mrouter = so; 578 579 mfchashtbl = hashinit(MFCTBLSIZ, M_MRTABLE, M_WAITOK, &mfchash); 580 bzero((caddr_t)nexpire, sizeof(nexpire)); 581 582 pim_assert = 0; 583 584 timeout_set(&expire_upcalls_ch, expire_upcalls, NULL); 585 timeout_add(&expire_upcalls_ch, EXPIRE_TIMEOUT); 586 587 timeout_set(&bw_upcalls_ch, expire_bw_upcalls_send, NULL); 588 timeout_add(&bw_upcalls_ch, BW_UPCALLS_PERIOD); 589 590 timeout_set(&bw_meter_ch, expire_bw_meter_process, NULL); 591 timeout_add(&bw_meter_ch, BW_METER_PERIOD); 592 593 if (mrtdebug) 594 log(LOG_DEBUG, "ip_mrouter_init\n"); 595 596 return (0); 597 } 598 599 /* 600 * Disable multicast routing 601 */ 602 int 603 ip_mrouter_done() 604 { 605 vifi_t vifi; 606 struct vif *vifp; 607 int i; 608 int s; 609 610 s = splsoftnet(); 611 612 /* Clear out all the vifs currently in use. */ 613 for (vifi = 0; vifi < numvifs; vifi++) { 614 vifp = &viftable[vifi]; 615 if (!in_nullhost(vifp->v_lcl_addr)) 616 reset_vif(vifp); 617 } 618 619 numvifs = 0; 620 pim_assert = 0; 621 mrt_api_config = 0; 622 623 timeout_del(&expire_upcalls_ch); 624 timeout_del(&bw_upcalls_ch); 625 timeout_del(&bw_meter_ch); 626 627 /* 628 * Free all multicast forwarding cache entries. 629 */ 630 for (i = 0; i < MFCTBLSIZ; i++) { 631 struct mfc *rt, *nrt; 632 633 for (rt = LIST_FIRST(&mfchashtbl[i]); rt; rt = nrt) { 634 nrt = LIST_NEXT(rt, mfc_hash); 635 636 expire_mfc(rt); 637 } 638 } 639 640 bzero((caddr_t)nexpire, sizeof(nexpire)); 641 free(mfchashtbl, M_MRTABLE); 642 mfchashtbl = NULL; 643 644 bw_upcalls_n = 0; 645 bzero(bw_meter_timers, sizeof(bw_meter_timers)); 646 647 /* Reset de-encapsulation cache. */ 648 have_encap_tunnel = 0; 649 650 ip_mrouter = NULL; 651 652 splx(s); 653 654 if (mrtdebug) 655 log(LOG_DEBUG, "ip_mrouter_done\n"); 656 657 return (0); 658 } 659 660 void 661 ip_mrouter_detach(struct ifnet *ifp) 662 { 663 int vifi, i; 664 struct vif *vifp; 665 struct mfc *rt; 666 struct rtdetq *rte; 667 668 /* XXX not sure about side effect to userland routing daemon */ 669 for (vifi = 0; vifi < numvifs; vifi++) { 670 vifp = &viftable[vifi]; 671 if (vifp->v_ifp == ifp) 672 reset_vif(vifp); 673 } 674 for (i = 0; i < MFCTBLSIZ; i++) { 675 if (nexpire[i] == 0) 676 continue; 677 LIST_FOREACH(rt, &mfchashtbl[i], mfc_hash) { 678 for (rte = rt->mfc_stall; rte; rte = rte->next) { 679 if (rte->ifp == ifp) 680 rte->ifp = NULL; 681 } 682 } 683 } 684 } 685 686 static int 687 get_version(struct mbuf *m) 688 { 689 int *v = mtod(m, int *); 690 691 *v = 0x0305; /* XXX !!!! */ 692 m->m_len = sizeof(int); 693 return (0); 694 } 695 696 /* 697 * Set PIM assert processing global 698 */ 699 static int 700 set_assert(struct mbuf *m) 701 { 702 int *i; 703 704 if (m == NULL || m->m_len < sizeof(int)) 705 return (EINVAL); 706 707 i = mtod(m, int *); 708 pim_assert = !!*i; 709 return (0); 710 } 711 712 /* 713 * Get PIM assert processing global 714 */ 715 static int 716 get_assert(struct mbuf *m) 717 { 718 int *i = mtod(m, int *); 719 720 *i = pim_assert; 721 m->m_len = sizeof(int); 722 return (0); 723 } 724 725 /* 726 * Configure API capabilities 727 */ 728 static int 729 set_api_config(struct mbuf *m) 730 { 731 int i; 732 u_int32_t *apival; 733 734 if (m == NULL || m->m_len < sizeof(u_int32_t)) 735 return (EINVAL); 736 737 apival = mtod(m, u_int32_t *); 738 739 /* 740 * We can set the API capabilities only if it is the first operation 741 * after MRT_INIT. I.e.: 742 * - there are no vifs installed 743 * - pim_assert is not enabled 744 * - the MFC table is empty 745 */ 746 if (numvifs > 0) { 747 *apival = 0; 748 return (EPERM); 749 } 750 if (pim_assert) { 751 *apival = 0; 752 return (EPERM); 753 } 754 for (i = 0; i < MFCTBLSIZ; i++) { 755 if (LIST_FIRST(&mfchashtbl[i]) != NULL) { 756 *apival = 0; 757 return (EPERM); 758 } 759 } 760 761 mrt_api_config = *apival & mrt_api_support; 762 *apival = mrt_api_config; 763 764 return (0); 765 } 766 767 /* 768 * Get API capabilities 769 */ 770 static int 771 get_api_support(struct mbuf *m) 772 { 773 u_int32_t *apival; 774 775 if (m == NULL || m->m_len < sizeof(u_int32_t)) 776 return (EINVAL); 777 778 apival = mtod(m, u_int32_t *); 779 780 *apival = mrt_api_support; 781 782 return (0); 783 } 784 785 /* 786 * Get API configured capabilities 787 */ 788 static int 789 get_api_config(struct mbuf *m) 790 { 791 u_int32_t *apival; 792 793 if (m == NULL || m->m_len < sizeof(u_int32_t)) 794 return (EINVAL); 795 796 apival = mtod(m, u_int32_t *); 797 798 *apival = mrt_api_config; 799 800 return (0); 801 } 802 803 static struct sockaddr_in sin = { sizeof(sin), AF_INET }; 804 805 /* 806 * Add a vif to the vif table 807 */ 808 static int 809 add_vif(struct mbuf *m) 810 { 811 struct vifctl *vifcp; 812 struct vif *vifp; 813 struct ifaddr *ifa; 814 struct ifnet *ifp; 815 struct ifreq ifr; 816 int error, s; 817 818 if (m == NULL || m->m_len < sizeof(struct vifctl)) 819 return (EINVAL); 820 821 vifcp = mtod(m, struct vifctl *); 822 if (vifcp->vifc_vifi >= MAXVIFS) 823 return (EINVAL); 824 if (in_nullhost(vifcp->vifc_lcl_addr)) 825 return (EADDRNOTAVAIL); 826 827 vifp = &viftable[vifcp->vifc_vifi]; 828 if (!in_nullhost(vifp->v_lcl_addr)) 829 return (EADDRINUSE); 830 831 /* Find the interface with an address in AF_INET family. */ 832 #ifdef PIM 833 if (vifcp->vifc_flags & VIFF_REGISTER) { 834 /* 835 * XXX: Because VIFF_REGISTER does not really need a valid 836 * local interface (e.g. it could be 127.0.0.2), we don't 837 * check its address. 838 */ 839 } else 840 #endif 841 { 842 sin.sin_addr = vifcp->vifc_lcl_addr; 843 ifa = ifa_ifwithaddr(sintosa(&sin)); 844 if (ifa == NULL) 845 return (EADDRNOTAVAIL); 846 } 847 848 if (vifcp->vifc_flags & VIFF_TUNNEL) { 849 /* tunnels are no longer supported use gif(4) instead */ 850 return (EOPNOTSUPP); 851 #ifdef PIM 852 } else if (vifcp->vifc_flags & VIFF_REGISTER) { 853 ifp = &multicast_register_if; 854 if (mrtdebug) 855 log(LOG_DEBUG, "Adding a register vif, ifp: %p\n", 856 (void *)ifp); 857 if (reg_vif_num == VIFI_INVALID) { 858 bzero(ifp, sizeof(*ifp)); 859 snprintf(ifp->if_xname, sizeof ifp->if_xname, 860 "register_vif"); 861 ifp->if_flags = IFF_LOOPBACK; 862 bzero(&vifp->v_route, sizeof(vifp->v_route)); 863 reg_vif_num = vifcp->vifc_vifi; 864 } 865 #endif 866 } else { 867 /* Use the physical interface associated with the address. */ 868 ifp = ifa->ifa_ifp; 869 870 /* Make sure the interface supports multicast. */ 871 if ((ifp->if_flags & IFF_MULTICAST) == 0) 872 return (EOPNOTSUPP); 873 874 /* Enable promiscuous reception of all IP multicasts. */ 875 satosin(&ifr.ifr_addr)->sin_len = sizeof(struct sockaddr_in); 876 satosin(&ifr.ifr_addr)->sin_family = AF_INET; 877 satosin(&ifr.ifr_addr)->sin_addr = zeroin_addr; 878 error = (*ifp->if_ioctl)(ifp, SIOCADDMULTI, (caddr_t)&ifr); 879 if (error) 880 return (error); 881 } 882 883 s = splsoftnet(); 884 885 /* Define parameters for the tbf structure. */ 886 vifp->tbf_q = NULL; 887 vifp->tbf_t = &vifp->tbf_q; 888 microtime(&vifp->tbf_last_pkt_t); 889 vifp->tbf_n_tok = 0; 890 vifp->tbf_q_len = 0; 891 vifp->tbf_max_q_len = MAXQSIZE; 892 893 vifp->v_flags = vifcp->vifc_flags; 894 vifp->v_threshold = vifcp->vifc_threshold; 895 /* scaling up here allows division by 1024 in critical code */ 896 vifp->v_rate_limit = vifcp->vifc_rate_limit * 1024 / 1000; 897 vifp->v_lcl_addr = vifcp->vifc_lcl_addr; 898 vifp->v_rmt_addr = vifcp->vifc_rmt_addr; 899 vifp->v_ifp = ifp; 900 /* Initialize per vif pkt counters. */ 901 vifp->v_pkt_in = 0; 902 vifp->v_pkt_out = 0; 903 vifp->v_bytes_in = 0; 904 vifp->v_bytes_out = 0; 905 906 timeout_del(&vifp->v_repq_ch); 907 908 #ifdef RSVP_ISI 909 vifp->v_rsvp_on = 0; 910 vifp->v_rsvpd = NULL; 911 #endif /* RSVP_ISI */ 912 913 splx(s); 914 915 /* Adjust numvifs up if the vifi is higher than numvifs. */ 916 if (numvifs <= vifcp->vifc_vifi) 917 numvifs = vifcp->vifc_vifi + 1; 918 919 if (mrtdebug) 920 log(LOG_DEBUG, "add_vif #%d, lcladdr %x, %s %x, " 921 "thresh %x, rate %d\n", 922 vifcp->vifc_vifi, 923 ntohl(vifcp->vifc_lcl_addr.s_addr), 924 (vifcp->vifc_flags & VIFF_TUNNEL) ? "rmtaddr" : "mask", 925 ntohl(vifcp->vifc_rmt_addr.s_addr), 926 vifcp->vifc_threshold, 927 vifcp->vifc_rate_limit); 928 929 return (0); 930 } 931 932 void 933 reset_vif(struct vif *vifp) 934 { 935 struct mbuf *m, *n; 936 struct ifnet *ifp; 937 struct ifreq ifr; 938 939 timeout_set(&vifp->v_repq_ch, tbf_reprocess_q, vifp); 940 941 /* 942 * Free packets queued at the interface 943 */ 944 for (m = vifp->tbf_q; m != NULL; m = n) { 945 n = m->m_nextpkt; 946 m_freem(m); 947 } 948 949 if (vifp->v_flags & VIFF_TUNNEL) { 950 /* empty */ 951 } else if (vifp->v_flags & VIFF_REGISTER) { 952 #ifdef PIM 953 reg_vif_num = VIFI_INVALID; 954 #endif 955 } else { 956 satosin(&ifr.ifr_addr)->sin_len = sizeof(struct sockaddr_in); 957 satosin(&ifr.ifr_addr)->sin_family = AF_INET; 958 satosin(&ifr.ifr_addr)->sin_addr = zeroin_addr; 959 ifp = vifp->v_ifp; 960 (*ifp->if_ioctl)(ifp, SIOCDELMULTI, (caddr_t)&ifr); 961 } 962 bzero((caddr_t)vifp, sizeof(*vifp)); 963 } 964 965 /* 966 * Delete a vif from the vif table 967 */ 968 static int 969 del_vif(struct mbuf *m) 970 { 971 vifi_t *vifip; 972 struct vif *vifp; 973 vifi_t vifi; 974 int s; 975 976 if (m == NULL || m->m_len < sizeof(vifi_t)) 977 return (EINVAL); 978 979 vifip = mtod(m, vifi_t *); 980 if (*vifip >= numvifs) 981 return (EINVAL); 982 983 vifp = &viftable[*vifip]; 984 if (in_nullhost(vifp->v_lcl_addr)) 985 return (EADDRNOTAVAIL); 986 987 s = splsoftnet(); 988 989 reset_vif(vifp); 990 991 /* Adjust numvifs down */ 992 for (vifi = numvifs; vifi > 0; vifi--) 993 if (!in_nullhost(viftable[vifi - 1].v_lcl_addr)) 994 break; 995 numvifs = vifi; 996 997 splx(s); 998 999 if (mrtdebug) 1000 log(LOG_DEBUG, "del_vif %d, numvifs %d\n", *vifip, numvifs); 1001 1002 return (0); 1003 } 1004 1005 void 1006 vif_delete(struct ifnet *ifp) 1007 { 1008 int i; 1009 struct vif *vifp; 1010 struct mfc *rt; 1011 struct rtdetq *rte; 1012 1013 for (i = 0; i < numvifs; i++) { 1014 vifp = &viftable[i]; 1015 if (vifp->v_ifp == ifp) 1016 bzero((caddr_t)vifp, sizeof *vifp); 1017 } 1018 1019 for (i = numvifs; i > 0; i--) 1020 if (!in_nullhost(viftable[i - 1].v_lcl_addr)) 1021 break; 1022 numvifs = i; 1023 1024 for (i = 0; i < MFCTBLSIZ; i++) { 1025 if (nexpire[i] == 0) 1026 continue; 1027 LIST_FOREACH(rt, &mfchashtbl[i], mfc_hash) { 1028 for (rte = rt->mfc_stall; rte; rte = rte->next) { 1029 if (rte->ifp == ifp) 1030 rte->ifp = NULL; 1031 } 1032 } 1033 } 1034 } 1035 1036 /* 1037 * update an mfc entry without resetting counters and S,G addresses. 1038 */ 1039 static void 1040 update_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp) 1041 { 1042 int i; 1043 1044 rt->mfc_parent = mfccp->mfcc_parent; 1045 for (i = 0; i < numvifs; i++) { 1046 rt->mfc_ttls[i] = mfccp->mfcc_ttls[i]; 1047 rt->mfc_flags[i] = mfccp->mfcc_flags[i] & mrt_api_config & 1048 MRT_MFC_FLAGS_ALL; 1049 } 1050 /* set the RP address */ 1051 if (mrt_api_config & MRT_MFC_RP) 1052 rt->mfc_rp = mfccp->mfcc_rp; 1053 else 1054 rt->mfc_rp = zeroin_addr; 1055 } 1056 1057 /* 1058 * fully initialize an mfc entry from the parameter. 1059 */ 1060 static void 1061 init_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp) 1062 { 1063 rt->mfc_origin = mfccp->mfcc_origin; 1064 rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp; 1065 1066 update_mfc_params(rt, mfccp); 1067 1068 /* initialize pkt counters per src-grp */ 1069 rt->mfc_pkt_cnt = 0; 1070 rt->mfc_byte_cnt = 0; 1071 rt->mfc_wrong_if = 0; 1072 timerclear(&rt->mfc_last_assert); 1073 } 1074 1075 static void 1076 expire_mfc(struct mfc *rt) 1077 { 1078 struct rtdetq *rte, *nrte; 1079 1080 free_bw_list(rt->mfc_bw_meter); 1081 1082 for (rte = rt->mfc_stall; rte != NULL; rte = nrte) { 1083 nrte = rte->next; 1084 m_freem(rte->m); 1085 free(rte, M_MRTABLE); 1086 } 1087 1088 LIST_REMOVE(rt, mfc_hash); 1089 free(rt, M_MRTABLE); 1090 } 1091 1092 /* 1093 * Add an mfc entry 1094 */ 1095 static int 1096 add_mfc(struct mbuf *m) 1097 { 1098 struct mfcctl2 mfcctl2; 1099 struct mfcctl2 *mfccp; 1100 struct mfc *rt; 1101 u_int32_t hash = 0; 1102 struct rtdetq *rte, *nrte; 1103 u_short nstl; 1104 int s; 1105 int mfcctl_size = sizeof(struct mfcctl); 1106 1107 if (mrt_api_config & MRT_API_FLAGS_ALL) 1108 mfcctl_size = sizeof(struct mfcctl2); 1109 1110 if (m == NULL || m->m_len < mfcctl_size) 1111 return (EINVAL); 1112 1113 /* 1114 * select data size depending on API version. 1115 */ 1116 if (mrt_api_config & MRT_API_FLAGS_ALL) { 1117 struct mfcctl2 *mp2 = mtod(m, struct mfcctl2 *); 1118 bcopy(mp2, (caddr_t)&mfcctl2, sizeof(*mp2)); 1119 } else { 1120 struct mfcctl *mp = mtod(m, struct mfcctl *); 1121 bcopy(mp, (caddr_t)&mfcctl2, sizeof(*mp)); 1122 bzero((caddr_t)&mfcctl2 + sizeof(struct mfcctl), 1123 sizeof(mfcctl2) - sizeof(struct mfcctl)); 1124 } 1125 mfccp = &mfcctl2; 1126 1127 s = splsoftnet(); 1128 rt = mfc_find(&mfccp->mfcc_origin, &mfccp->mfcc_mcastgrp); 1129 1130 /* If an entry already exists, just update the fields */ 1131 if (rt) { 1132 if (mrtdebug & DEBUG_MFC) 1133 log(LOG_DEBUG, "add_mfc update o %x g %x p %x\n", 1134 ntohl(mfccp->mfcc_origin.s_addr), 1135 ntohl(mfccp->mfcc_mcastgrp.s_addr), 1136 mfccp->mfcc_parent); 1137 1138 update_mfc_params(rt, mfccp); 1139 1140 splx(s); 1141 return (0); 1142 } 1143 1144 /* 1145 * Find the entry for which the upcall was made and update 1146 */ 1147 nstl = 0; 1148 hash = MFCHASH(mfccp->mfcc_origin, mfccp->mfcc_mcastgrp); 1149 LIST_FOREACH(rt, &mfchashtbl[hash], mfc_hash) { 1150 if (in_hosteq(rt->mfc_origin, mfccp->mfcc_origin) && 1151 in_hosteq(rt->mfc_mcastgrp, mfccp->mfcc_mcastgrp) && 1152 rt->mfc_stall != NULL) { 1153 if (nstl++) 1154 log(LOG_ERR, "add_mfc %s o %x g %x " 1155 "p %x dbx %p\n", 1156 "multiple kernel entries", 1157 ntohl(mfccp->mfcc_origin.s_addr), 1158 ntohl(mfccp->mfcc_mcastgrp.s_addr), 1159 mfccp->mfcc_parent, rt->mfc_stall); 1160 1161 if (mrtdebug & DEBUG_MFC) 1162 log(LOG_DEBUG, "add_mfc o %x g %x " 1163 "p %x dbg %p\n", 1164 ntohl(mfccp->mfcc_origin.s_addr), 1165 ntohl(mfccp->mfcc_mcastgrp.s_addr), 1166 mfccp->mfcc_parent, rt->mfc_stall); 1167 1168 rte = rt->mfc_stall; 1169 init_mfc_params(rt, mfccp); 1170 rt->mfc_stall = NULL; 1171 1172 rt->mfc_expire = 0; /* Don't clean this guy up */ 1173 nexpire[hash]--; 1174 1175 /* free packets Qed at the end of this entry */ 1176 for (; rte != NULL; rte = nrte) { 1177 nrte = rte->next; 1178 if (rte->ifp) { 1179 #ifdef RSVP_ISI 1180 ip_mdq(rte->m, rte->ifp, rt, -1); 1181 #else 1182 ip_mdq(rte->m, rte->ifp, rt); 1183 #endif /* RSVP_ISI */ 1184 } 1185 m_freem(rte->m); 1186 #ifdef UPCALL_TIMING 1187 collate(&rte->t); 1188 #endif /* UPCALL_TIMING */ 1189 free(rte, M_MRTABLE); 1190 } 1191 } 1192 } 1193 1194 /* 1195 * It is possible that an entry is being inserted without an upcall 1196 */ 1197 if (nstl == 0) { 1198 /* 1199 * No mfc; make a new one 1200 */ 1201 if (mrtdebug & DEBUG_MFC) 1202 log(LOG_DEBUG, "add_mfc no upcall o %x g %x p %x\n", 1203 ntohl(mfccp->mfcc_origin.s_addr), 1204 ntohl(mfccp->mfcc_mcastgrp.s_addr), 1205 mfccp->mfcc_parent); 1206 1207 LIST_FOREACH(rt, &mfchashtbl[hash], mfc_hash) { 1208 if (in_hosteq(rt->mfc_origin, mfccp->mfcc_origin) && 1209 in_hosteq(rt->mfc_mcastgrp, mfccp->mfcc_mcastgrp)) { 1210 init_mfc_params(rt, mfccp); 1211 if (rt->mfc_expire) 1212 nexpire[hash]--; 1213 rt->mfc_expire = 0; 1214 break; /* XXX */ 1215 } 1216 } 1217 if (rt == NULL) { /* no upcall, so make a new entry */ 1218 rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, 1219 M_NOWAIT); 1220 if (rt == NULL) { 1221 splx(s); 1222 return (ENOBUFS); 1223 } 1224 1225 init_mfc_params(rt, mfccp); 1226 rt->mfc_expire = 0; 1227 rt->mfc_stall = NULL; 1228 rt->mfc_bw_meter = NULL; 1229 1230 /* insert new entry at head of hash chain */ 1231 LIST_INSERT_HEAD(&mfchashtbl[hash], rt, mfc_hash); 1232 } 1233 } 1234 1235 splx(s); 1236 return (0); 1237 } 1238 1239 #ifdef UPCALL_TIMING 1240 /* 1241 * collect delay statistics on the upcalls 1242 */ 1243 static void 1244 collate(struct timeval *t) 1245 { 1246 u_int32_t d; 1247 struct timeval tp; 1248 u_int32_t delta; 1249 1250 microtime(&tp); 1251 1252 if (timercmp(t, &tp, <)) { 1253 TV_DELTA(tp, *t, delta); 1254 1255 d = delta >> 10; 1256 if (d > 50) 1257 d = 50; 1258 1259 ++upcall_data[d]; 1260 } 1261 } 1262 #endif /* UPCALL_TIMING */ 1263 1264 /* 1265 * Delete an mfc entry 1266 */ 1267 static int 1268 del_mfc(struct mbuf *m) 1269 { 1270 struct mfcctl2 mfcctl2; 1271 struct mfcctl2 *mfccp; 1272 struct mfc *rt; 1273 int s; 1274 int mfcctl_size = sizeof(struct mfcctl); 1275 struct mfcctl *mp = mtod(m, struct mfcctl *); 1276 1277 /* 1278 * XXX: for deleting MFC entries the information in entries 1279 * of size "struct mfcctl" is sufficient. 1280 */ 1281 1282 if (m == NULL || m->m_len < mfcctl_size) 1283 return (EINVAL); 1284 1285 bcopy(mp, (caddr_t)&mfcctl2, sizeof(*mp)); 1286 bzero((caddr_t)&mfcctl2 + sizeof(struct mfcctl), 1287 sizeof(mfcctl2) - sizeof(struct mfcctl)); 1288 1289 mfccp = &mfcctl2; 1290 1291 if (mrtdebug & DEBUG_MFC) 1292 log(LOG_DEBUG, "del_mfc origin %x mcastgrp %x\n", 1293 ntohl(mfccp->mfcc_origin.s_addr), 1294 ntohl(mfccp->mfcc_mcastgrp.s_addr)); 1295 1296 s = splsoftnet(); 1297 1298 rt = mfc_find(&mfccp->mfcc_origin, &mfccp->mfcc_mcastgrp); 1299 if (rt == NULL) { 1300 splx(s); 1301 return (EADDRNOTAVAIL); 1302 } 1303 1304 /* 1305 * free the bw_meter entries 1306 */ 1307 free_bw_list(rt->mfc_bw_meter); 1308 rt->mfc_bw_meter = NULL; 1309 1310 LIST_REMOVE(rt, mfc_hash); 1311 free(rt, M_MRTABLE); 1312 1313 splx(s); 1314 return (0); 1315 } 1316 1317 static int 1318 socket_send(struct socket *s, struct mbuf *mm, struct sockaddr_in *src) 1319 { 1320 if (s != NULL) { 1321 if (sbappendaddr(&s->so_rcv, sintosa(src), mm, 1322 (struct mbuf *)NULL) != 0) { 1323 sorwakeup(s); 1324 return (0); 1325 } 1326 } 1327 m_freem(mm); 1328 return (-1); 1329 } 1330 1331 /* 1332 * IP multicast forwarding function. This function assumes that the packet 1333 * pointed to by "ip" has arrived on (or is about to be sent to) the interface 1334 * pointed to by "ifp", and the packet is to be relayed to other networks 1335 * that have members of the packet's destination IP multicast group. 1336 * 1337 * The packet is returned unscathed to the caller, unless it is 1338 * erroneous, in which case a non-zero return value tells the caller to 1339 * discard it. 1340 */ 1341 1342 #define IP_HDR_LEN 20 /* # bytes of fixed IP header (excluding options) */ 1343 #define TUNNEL_LEN 12 /* # bytes of IP option for tunnel encapsulation */ 1344 1345 int 1346 #ifdef RSVP_ISI 1347 ip_mforward(struct mbuf *m, struct ifnet *ifp, struct ip_moptions *imo) 1348 #else 1349 ip_mforward(struct mbuf *m, struct ifnet *ifp) 1350 #endif /* RSVP_ISI */ 1351 { 1352 struct ip *ip = mtod(m, struct ip *); 1353 struct mfc *rt; 1354 static int srctun = 0; 1355 struct mbuf *mm; 1356 int s; 1357 vifi_t vifi; 1358 1359 if (mrtdebug & DEBUG_FORWARD) 1360 log(LOG_DEBUG, "ip_mforward: src %x, dst %x, ifp %p\n", 1361 ntohl(ip->ip_src.s_addr), ntohl(ip->ip_dst.s_addr), ifp); 1362 1363 if (ip->ip_hl < (IP_HDR_LEN + TUNNEL_LEN) >> 2 || 1364 ((u_char *)(ip + 1))[1] != IPOPT_LSRR) { 1365 /* 1366 * Packet arrived via a physical interface or 1367 * an encapsulated tunnel or a register_vif. 1368 */ 1369 } else { 1370 /* 1371 * Packet arrived through a source-route tunnel. 1372 * Source-route tunnels are no longer supported. 1373 */ 1374 if ((srctun++ % 1000) == 0) 1375 log(LOG_ERR, "ip_mforward: received source-routed " 1376 "packet from %x\n", ntohl(ip->ip_src.s_addr)); 1377 1378 return (1); 1379 } 1380 1381 #ifdef RSVP_ISI 1382 if (imo && ((vifi = imo->imo_multicast_vif) < numvifs)) { 1383 if (ip->ip_ttl < 255) { 1384 /* compensate for -1 in *_send routines */ 1385 ip->ip_ttl++; 1386 } 1387 if (rsvpdebug && ip->ip_p == IPPROTO_RSVP) { 1388 struct vif *vifp = viftable + vifi; 1389 printf("Sending IPPROTO_RSVP from %x to %x on " 1390 "vif %d (%s%s)\n", 1391 ntohl(ip->ip_src), ntohl(ip->ip_dst), vifi, 1392 (vifp->v_flags & VIFF_TUNNEL) ? "tunnel on " : "", 1393 vifp->v_ifp->if_xname); 1394 } 1395 return (ip_mdq(m, ifp, (struct mfc *)NULL, vifi)); 1396 } 1397 if (rsvpdebug && ip->ip_p == IPPROTO_RSVP) { 1398 printf("Warning: IPPROTO_RSVP from %x to %x without " 1399 "vif option\n", ntohl(ip->ip_src), ntohl(ip->ip_dst)); 1400 } 1401 #endif /* RSVP_ISI */ 1402 1403 /* 1404 * Don't forward a packet with time-to-live of zero or one, 1405 * or a packet destined to a local-only group. 1406 */ 1407 if (ip->ip_ttl <= 1 || IN_LOCAL_GROUP(ip->ip_dst.s_addr)) 1408 return (0); 1409 1410 /* 1411 * Determine forwarding vifs from the forwarding cache table 1412 */ 1413 s = splsoftnet(); 1414 ++mrtstat.mrts_mfc_lookups; 1415 rt = mfc_find(&ip->ip_src, &ip->ip_dst); 1416 1417 /* Entry exists, so forward if necessary */ 1418 if (rt != NULL) { 1419 splx(s); 1420 #ifdef RSVP_ISI 1421 return (ip_mdq(m, ifp, rt, -1)); 1422 #else 1423 return (ip_mdq(m, ifp, rt)); 1424 #endif /* RSVP_ISI */ 1425 } else { 1426 /* 1427 * If we don't have a route for packet's origin, 1428 * Make a copy of the packet & send message to routing daemon 1429 */ 1430 1431 struct mbuf *mb0; 1432 struct rtdetq *rte; 1433 u_int32_t hash; 1434 int hlen = ip->ip_hl << 2; 1435 #ifdef UPCALL_TIMING 1436 struct timeval tp; 1437 1438 microtime(&tp); 1439 #endif /* UPCALL_TIMING */ 1440 1441 ++mrtstat.mrts_mfc_misses; 1442 1443 mrtstat.mrts_no_route++; 1444 if (mrtdebug & (DEBUG_FORWARD | DEBUG_MFC)) 1445 log(LOG_DEBUG, "ip_mforward: no rte s %x g %x\n", 1446 ntohl(ip->ip_src.s_addr), 1447 ntohl(ip->ip_dst.s_addr)); 1448 1449 /* 1450 * Allocate mbufs early so that we don't do extra work if we are 1451 * just going to fail anyway. Make sure to pullup the header so 1452 * that other people can't step on it. 1453 */ 1454 rte = (struct rtdetq *)malloc(sizeof(*rte), 1455 M_MRTABLE, M_NOWAIT); 1456 if (rte == NULL) { 1457 splx(s); 1458 return (ENOBUFS); 1459 } 1460 mb0 = m_copy(m, 0, M_COPYALL); 1461 M_PULLUP(mb0, hlen); 1462 if (mb0 == NULL) { 1463 free(rte, M_MRTABLE); 1464 splx(s); 1465 return (ENOBUFS); 1466 } 1467 1468 /* is there an upcall waiting for this flow? */ 1469 hash = MFCHASH(ip->ip_src, ip->ip_dst); 1470 LIST_FOREACH(rt, &mfchashtbl[hash], mfc_hash) { 1471 if (in_hosteq(ip->ip_src, rt->mfc_origin) && 1472 in_hosteq(ip->ip_dst, rt->mfc_mcastgrp) && 1473 rt->mfc_stall != NULL) 1474 break; 1475 } 1476 1477 if (rt == NULL) { 1478 int i; 1479 struct igmpmsg *im; 1480 1481 /* 1482 * Locate the vifi for the incoming interface for 1483 * this packet. 1484 * If none found, drop packet. 1485 */ 1486 for (vifi = 0; vifi < numvifs && 1487 viftable[vifi].v_ifp != ifp; vifi++) 1488 ; 1489 if (vifi >= numvifs) /* vif not found, drop packet */ 1490 goto non_fatal; 1491 1492 /* no upcall, so make a new entry */ 1493 rt = (struct mfc *)malloc(sizeof(*rt), 1494 M_MRTABLE, M_NOWAIT); 1495 if (rt == NULL) 1496 goto fail; 1497 /* 1498 * Make a copy of the header to send to the user level 1499 * process 1500 */ 1501 mm = m_copy(m, 0, hlen); 1502 M_PULLUP(mm, hlen); 1503 if (mm == NULL) 1504 goto fail1; 1505 1506 /* 1507 * Send message to routing daemon to install 1508 * a route into the kernel table 1509 */ 1510 1511 im = mtod(mm, struct igmpmsg *); 1512 im->im_msgtype = IGMPMSG_NOCACHE; 1513 im->im_mbz = 0; 1514 im->im_vif = vifi; 1515 1516 mrtstat.mrts_upcalls++; 1517 1518 sin.sin_addr = ip->ip_src; 1519 if (socket_send(ip_mrouter, mm, &sin) < 0) { 1520 log(LOG_WARNING, "ip_mforward: ip_mrouter " 1521 "socket queue full\n"); 1522 ++mrtstat.mrts_upq_sockfull; 1523 fail1: 1524 free(rt, M_MRTABLE); 1525 fail: 1526 free(rte, M_MRTABLE); 1527 m_freem(mb0); 1528 splx(s); 1529 return (ENOBUFS); 1530 } 1531 1532 /* insert new entry at head of hash chain */ 1533 rt->mfc_origin = ip->ip_src; 1534 rt->mfc_mcastgrp = ip->ip_dst; 1535 rt->mfc_pkt_cnt = 0; 1536 rt->mfc_byte_cnt = 0; 1537 rt->mfc_wrong_if = 0; 1538 rt->mfc_expire = UPCALL_EXPIRE; 1539 nexpire[hash]++; 1540 for (i = 0; i < numvifs; i++) { 1541 rt->mfc_ttls[i] = 0; 1542 rt->mfc_flags[i] = 0; 1543 } 1544 rt->mfc_parent = -1; 1545 1546 /* clear the RP address */ 1547 rt->mfc_rp = zeroin_addr; 1548 1549 rt->mfc_bw_meter = NULL; 1550 1551 /* link into table */ 1552 LIST_INSERT_HEAD(&mfchashtbl[hash], rt, mfc_hash); 1553 /* Add this entry to the end of the queue */ 1554 rt->mfc_stall = rte; 1555 } else { 1556 /* determine if q has overflowed */ 1557 struct rtdetq **p; 1558 int npkts = 0; 1559 1560 /* 1561 * XXX ouch! we need to append to the list, but we 1562 * only have a pointer to the front, so we have to 1563 * scan the entire list every time. 1564 */ 1565 for (p = &rt->mfc_stall; *p != NULL; p = &(*p)->next) 1566 if (++npkts > MAX_UPQ) { 1567 mrtstat.mrts_upq_ovflw++; 1568 non_fatal: 1569 free(rte, M_MRTABLE); 1570 m_freem(mb0); 1571 splx(s); 1572 return (0); 1573 } 1574 1575 /* Add this entry to the end of the queue */ 1576 *p = rte; 1577 } 1578 1579 rte->next = NULL; 1580 rte->m = mb0; 1581 rte->ifp = ifp; 1582 #ifdef UPCALL_TIMING 1583 rte->t = tp; 1584 #endif /* UPCALL_TIMING */ 1585 1586 splx(s); 1587 1588 return (0); 1589 } 1590 } 1591 1592 1593 /*ARGSUSED*/ 1594 static void 1595 expire_upcalls(void *v) 1596 { 1597 int i; 1598 int s; 1599 1600 s = splsoftnet(); 1601 1602 for (i = 0; i < MFCTBLSIZ; i++) { 1603 struct mfc *rt, *nrt; 1604 1605 if (nexpire[i] == 0) 1606 continue; 1607 1608 for (rt = LIST_FIRST(&mfchashtbl[i]); rt; rt = nrt) { 1609 nrt = LIST_NEXT(rt, mfc_hash); 1610 1611 if (rt->mfc_expire == 0 || --rt->mfc_expire > 0) 1612 continue; 1613 nexpire[i]--; 1614 1615 /* 1616 * free the bw_meter entries 1617 */ 1618 while (rt->mfc_bw_meter != NULL) { 1619 struct bw_meter *x = rt->mfc_bw_meter; 1620 1621 rt->mfc_bw_meter = x->bm_mfc_next; 1622 free(x, M_BWMETER); 1623 } 1624 1625 ++mrtstat.mrts_cache_cleanups; 1626 if (mrtdebug & DEBUG_EXPIRE) 1627 log(LOG_DEBUG, 1628 "expire_upcalls: expiring (%x %x)\n", 1629 ntohl(rt->mfc_origin.s_addr), 1630 ntohl(rt->mfc_mcastgrp.s_addr)); 1631 1632 expire_mfc(rt); 1633 } 1634 } 1635 1636 splx(s); 1637 timeout_add(&expire_upcalls_ch, EXPIRE_TIMEOUT); 1638 } 1639 1640 /* 1641 * Packet forwarding routine once entry in the cache is made 1642 */ 1643 static int 1644 #ifdef RSVP_ISI 1645 ip_mdq(struct mbuf *m, struct ifnet *ifp, struct mfc *rt, vifi_t xmt_vif) 1646 #else 1647 ip_mdq(struct mbuf *m, struct ifnet *ifp, struct mfc *rt) 1648 #endif /* RSVP_ISI */ 1649 { 1650 struct ip *ip = mtod(m, struct ip *); 1651 vifi_t vifi; 1652 struct vif *vifp; 1653 int plen = ntohs(ip->ip_len) - (ip->ip_hl << 2); 1654 1655 /* 1656 * Macro to send packet on vif. Since RSVP packets don't get counted on 1657 * input, they shouldn't get counted on output, so statistics keeping is 1658 * separate. 1659 */ 1660 #define MC_SEND(ip, vifp, m) do { \ 1661 if ((vifp)->v_flags & VIFF_TUNNEL) \ 1662 encap_send((ip), (vifp), (m)); \ 1663 else \ 1664 phyint_send((ip), (vifp), (m)); \ 1665 } while (/*CONSTCOND*/ 0) 1666 1667 #ifdef RSVP_ISI 1668 /* 1669 * If xmt_vif is not -1, send on only the requested vif. 1670 * 1671 * (since vifi_t is u_short, -1 becomes MAXUSHORT, which > numvifs. 1672 */ 1673 if (xmt_vif < numvifs) { 1674 #ifdef PIM 1675 if (viftable[xmt_vif].v_flags & VIFF_REGISTER) 1676 pim_register_send(ip, viftable + xmt_vif, m, rt); 1677 else 1678 #endif 1679 MC_SEND(ip, viftable + xmt_vif, m); 1680 return (1); 1681 } 1682 #endif /* RSVP_ISI */ 1683 1684 /* 1685 * Don't forward if it didn't arrive from the parent vif for its origin. 1686 */ 1687 vifi = rt->mfc_parent; 1688 if ((vifi >= numvifs) || (viftable[vifi].v_ifp != ifp)) { 1689 /* came in the wrong interface */ 1690 if (mrtdebug & DEBUG_FORWARD) 1691 log(LOG_DEBUG, "wrong if: ifp %p vifi %d vififp %p\n", 1692 ifp, vifi, 1693 vifi >= numvifs ? 0 : viftable[vifi].v_ifp); 1694 ++mrtstat.mrts_wrong_if; 1695 ++rt->mfc_wrong_if; 1696 /* 1697 * If we are doing PIM assert processing, send a message 1698 * to the routing daemon. 1699 * 1700 * XXX: A PIM-SM router needs the WRONGVIF detection so it 1701 * can complete the SPT switch, regardless of the type 1702 * of interface (broadcast media, GRE tunnel, etc). 1703 */ 1704 if (pim_assert && (vifi < numvifs) && viftable[vifi].v_ifp) { 1705 struct timeval now; 1706 u_int32_t delta; 1707 1708 #ifdef PIM 1709 if (ifp == &multicast_register_if) 1710 pimstat.pims_rcv_registers_wrongiif++; 1711 #endif 1712 1713 /* Get vifi for the incoming packet */ 1714 for (vifi = 0; 1715 vifi < numvifs && viftable[vifi].v_ifp != ifp; 1716 vifi++) 1717 ; 1718 if (vifi >= numvifs) { 1719 /* The iif is not found: ignore the packet. */ 1720 return (0); 1721 } 1722 1723 if (rt->mfc_flags[vifi] & 1724 MRT_MFC_FLAGS_DISABLE_WRONGVIF) { 1725 /* WRONGVIF disabled: ignore the packet */ 1726 return (0); 1727 } 1728 1729 microtime(&now); 1730 1731 TV_DELTA(rt->mfc_last_assert, now, delta); 1732 1733 if (delta > ASSERT_MSG_TIME) { 1734 struct igmpmsg *im; 1735 int hlen = ip->ip_hl << 2; 1736 struct mbuf *mm = m_copy(m, 0, hlen); 1737 1738 M_PULLUP(mm, hlen); 1739 if (mm == NULL) 1740 return (ENOBUFS); 1741 1742 rt->mfc_last_assert = now; 1743 1744 im = mtod(mm, struct igmpmsg *); 1745 im->im_msgtype = IGMPMSG_WRONGVIF; 1746 im->im_mbz = 0; 1747 im->im_vif = vifi; 1748 1749 mrtstat.mrts_upcalls++; 1750 1751 sin.sin_addr = im->im_src; 1752 if (socket_send(ip_mrouter, mm, &sin) < 0) { 1753 log(LOG_WARNING, "ip_mforward: " 1754 "ip_mrouter socket queue full\n"); 1755 ++mrtstat.mrts_upq_sockfull; 1756 return (ENOBUFS); 1757 } 1758 } 1759 } 1760 return (0); 1761 } 1762 1763 /* If I sourced this packet, it counts as output, else it was input. */ 1764 if (in_hosteq(ip->ip_src, viftable[vifi].v_lcl_addr)) { 1765 viftable[vifi].v_pkt_out++; 1766 viftable[vifi].v_bytes_out += plen; 1767 } else { 1768 viftable[vifi].v_pkt_in++; 1769 viftable[vifi].v_bytes_in += plen; 1770 } 1771 rt->mfc_pkt_cnt++; 1772 rt->mfc_byte_cnt += plen; 1773 1774 /* 1775 * For each vif, decide if a copy of the packet should be forwarded. 1776 * Forward if: 1777 * - the ttl exceeds the vif's threshold 1778 * - there are group members downstream on interface 1779 */ 1780 for (vifp = viftable, vifi = 0; vifi < numvifs; vifp++, vifi++) 1781 if ((rt->mfc_ttls[vifi] > 0) && 1782 (ip->ip_ttl > rt->mfc_ttls[vifi])) { 1783 vifp->v_pkt_out++; 1784 vifp->v_bytes_out += plen; 1785 #ifdef PIM 1786 if (vifp->v_flags & VIFF_REGISTER) 1787 pim_register_send(ip, vifp, m, rt); 1788 else 1789 #endif 1790 MC_SEND(ip, vifp, m); 1791 } 1792 1793 /* 1794 * Perform upcall-related bw measuring. 1795 */ 1796 if (rt->mfc_bw_meter != NULL) { 1797 struct bw_meter *x; 1798 struct timeval now; 1799 1800 microtime(&now); 1801 for (x = rt->mfc_bw_meter; x != NULL; x = x->bm_mfc_next) 1802 bw_meter_receive_packet(x, plen, &now); 1803 } 1804 1805 return (0); 1806 } 1807 1808 #ifdef RSVP_ISI 1809 /* 1810 * check if a vif number is legal/ok. This is used by ip_output. 1811 */ 1812 int 1813 legal_vif_num(int vif) 1814 { 1815 if (vif >= 0 && vif < numvifs) 1816 return (1); 1817 else 1818 return (0); 1819 } 1820 #endif /* RSVP_ISI */ 1821 1822 static void 1823 phyint_send(struct ip *ip, struct vif *vifp, struct mbuf *m) 1824 { 1825 struct mbuf *mb_copy; 1826 int hlen = ip->ip_hl << 2; 1827 1828 /* 1829 * Make a new reference to the packet; make sure that 1830 * the IP header is actually copied, not just referenced, 1831 * so that ip_output() only scribbles on the copy. 1832 */ 1833 mb_copy = m_copy(m, 0, M_COPYALL); 1834 M_PULLUP(mb_copy, hlen); 1835 if (mb_copy == NULL) 1836 return; 1837 1838 if (vifp->v_rate_limit <= 0) 1839 tbf_send_packet(vifp, mb_copy); 1840 else 1841 tbf_control(vifp, mb_copy, mtod(mb_copy, struct ip *), 1842 ntohs(ip->ip_len)); 1843 } 1844 1845 static void 1846 encap_send(struct ip *ip, struct vif *vifp, struct mbuf *m) 1847 { 1848 struct mbuf *mb_copy; 1849 struct ip *ip_copy; 1850 int i, len = ntohs(ip->ip_len) + sizeof(multicast_encap_iphdr); 1851 1852 /* Take care of delayed checksums */ 1853 if (m->m_pkthdr.csum_flags & (M_TCPV4_CSUM_OUT | M_UDPV4_CSUM_OUT)) { 1854 in_delayed_cksum(m); 1855 m->m_pkthdr.csum_flags &= 1856 ~(M_UDPV4_CSUM_OUT | M_TCPV4_CSUM_OUT); 1857 } 1858 1859 /* 1860 * copy the old packet & pullup its IP header into the 1861 * new mbuf so we can modify it. Try to fill the new 1862 * mbuf since if we don't the ethernet driver will. 1863 */ 1864 MGETHDR(mb_copy, M_DONTWAIT, MT_DATA); 1865 if (mb_copy == NULL) 1866 return; 1867 mb_copy->m_data += max_linkhdr; 1868 mb_copy->m_pkthdr.len = len; 1869 mb_copy->m_len = sizeof(multicast_encap_iphdr); 1870 1871 if ((mb_copy->m_next = m_copy(m, 0, M_COPYALL)) == NULL) { 1872 m_freem(mb_copy); 1873 return; 1874 } 1875 i = MHLEN - max_linkhdr; 1876 if (i > len) 1877 i = len; 1878 mb_copy = m_pullup(mb_copy, i); 1879 if (mb_copy == NULL) 1880 return; 1881 1882 /* 1883 * fill in the encapsulating IP header. 1884 */ 1885 ip_copy = mtod(mb_copy, struct ip *); 1886 *ip_copy = multicast_encap_iphdr; 1887 ip_copy->ip_id = htons(ip_randomid()); 1888 ip_copy->ip_len = htons(len); 1889 ip_copy->ip_src = vifp->v_lcl_addr; 1890 ip_copy->ip_dst = vifp->v_rmt_addr; 1891 1892 /* 1893 * turn the encapsulated IP header back into a valid one. 1894 */ 1895 ip = (struct ip *)((caddr_t)ip_copy + sizeof(multicast_encap_iphdr)); 1896 --ip->ip_ttl; 1897 ip->ip_sum = 0; 1898 mb_copy->m_data += sizeof(multicast_encap_iphdr); 1899 ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2); 1900 mb_copy->m_data -= sizeof(multicast_encap_iphdr); 1901 1902 if (vifp->v_rate_limit <= 0) 1903 tbf_send_packet(vifp, mb_copy); 1904 else 1905 tbf_control(vifp, mb_copy, ip, ntohs(ip_copy->ip_len)); 1906 } 1907 1908 /* 1909 * Token bucket filter module 1910 */ 1911 static void 1912 tbf_control(struct vif *vifp, struct mbuf *m, struct ip *ip, u_int32_t len) 1913 { 1914 1915 if (len > MAX_BKT_SIZE) { 1916 /* drop if packet is too large */ 1917 mrtstat.mrts_pkt2large++; 1918 m_freem(m); 1919 return; 1920 } 1921 1922 tbf_update_tokens(vifp); 1923 1924 /* 1925 * If there are enough tokens, and the queue is empty, send this packet 1926 * out immediately. Otherwise, try to insert it on this vif's queue. 1927 */ 1928 if (vifp->tbf_q_len == 0) { 1929 if (len <= vifp->tbf_n_tok) { 1930 vifp->tbf_n_tok -= len; 1931 tbf_send_packet(vifp, m); 1932 } else { 1933 /* queue packet and timeout till later */ 1934 tbf_queue(vifp, m); 1935 timeout_add(&vifp->v_repq_ch, TBF_REPROCESS); 1936 } 1937 } else { 1938 if (vifp->tbf_q_len >= vifp->tbf_max_q_len && 1939 !tbf_dq_sel(vifp, ip)) { 1940 /* queue full, and couldn't make room */ 1941 mrtstat.mrts_q_overflow++; 1942 m_freem(m); 1943 } else { 1944 /* queue length low enough, or made room */ 1945 tbf_queue(vifp, m); 1946 tbf_process_q(vifp); 1947 } 1948 } 1949 } 1950 1951 /* 1952 * adds a packet to the queue at the interface 1953 */ 1954 static void 1955 tbf_queue(struct vif *vifp, struct mbuf *m) 1956 { 1957 int s = splsoftnet(); 1958 1959 /* insert at tail */ 1960 *vifp->tbf_t = m; 1961 vifp->tbf_t = &m->m_nextpkt; 1962 vifp->tbf_q_len++; 1963 1964 splx(s); 1965 } 1966 1967 1968 /* 1969 * processes the queue at the interface 1970 */ 1971 static void 1972 tbf_process_q(struct vif *vifp) 1973 { 1974 struct mbuf *m; 1975 int len; 1976 int s = splsoftnet(); 1977 1978 /* 1979 * Loop through the queue at the interface and send as many packets 1980 * as possible. 1981 */ 1982 for (m = vifp->tbf_q; m != NULL; m = vifp->tbf_q) { 1983 len = ntohs(mtod(m, struct ip *)->ip_len); 1984 1985 /* determine if the packet can be sent */ 1986 if (len <= vifp->tbf_n_tok) { 1987 /* if so, 1988 * reduce no of tokens, dequeue the packet, 1989 * send the packet. 1990 */ 1991 if ((vifp->tbf_q = m->m_nextpkt) == NULL) 1992 vifp->tbf_t = &vifp->tbf_q; 1993 --vifp->tbf_q_len; 1994 1995 m->m_nextpkt = NULL; 1996 vifp->tbf_n_tok -= len; 1997 tbf_send_packet(vifp, m); 1998 } else 1999 break; 2000 } 2001 splx(s); 2002 } 2003 2004 static void 2005 tbf_reprocess_q(void *arg) 2006 { 2007 struct vif *vifp = arg; 2008 2009 if (ip_mrouter == NULL) 2010 return; 2011 2012 tbf_update_tokens(vifp); 2013 tbf_process_q(vifp); 2014 2015 if (vifp->tbf_q_len != 0) 2016 timeout_add(&vifp->v_repq_ch, TBF_REPROCESS); 2017 } 2018 2019 /* function that will selectively discard a member of the queue 2020 * based on the precedence value and the priority 2021 */ 2022 static int 2023 tbf_dq_sel(struct vif *vifp, struct ip *ip) 2024 { 2025 u_int p; 2026 struct mbuf **mp, *m; 2027 int s = splsoftnet(); 2028 2029 p = priority(vifp, ip); 2030 2031 for (mp = &vifp->tbf_q, m = *mp; 2032 m != NULL; 2033 mp = &m->m_nextpkt, m = *mp) { 2034 if (p > priority(vifp, mtod(m, struct ip *))) { 2035 if ((*mp = m->m_nextpkt) == NULL) 2036 vifp->tbf_t = mp; 2037 --vifp->tbf_q_len; 2038 2039 m_freem(m); 2040 mrtstat.mrts_drop_sel++; 2041 splx(s); 2042 return (1); 2043 } 2044 } 2045 splx(s); 2046 return (0); 2047 } 2048 2049 static void 2050 tbf_send_packet(struct vif *vifp, struct mbuf *m) 2051 { 2052 int error; 2053 int s = splsoftnet(); 2054 2055 if (vifp->v_flags & VIFF_TUNNEL) { 2056 /* If tunnel options */ 2057 ip_output(m, (struct mbuf *)NULL, &vifp->v_route, 2058 IP_FORWARDING, (struct ip_moptions *)NULL, 2059 (struct inpcb *)NULL); 2060 } else { 2061 /* 2062 * if physical interface option, extract the options 2063 * and then send 2064 */ 2065 struct ip_moptions imo; 2066 2067 imo.imo_multicast_ifp = vifp->v_ifp; 2068 imo.imo_multicast_ttl = mtod(m, struct ip *)->ip_ttl - 1; 2069 imo.imo_multicast_loop = 1; 2070 #ifdef RSVP_ISI 2071 imo.imo_multicast_vif = -1; 2072 #endif 2073 2074 error = ip_output(m, (struct mbuf *)NULL, (struct route *)NULL, 2075 IP_FORWARDING|IP_MULTICASTOPTS, &imo, 2076 (struct inpcb *)NULL); 2077 2078 if (mrtdebug & DEBUG_XMIT) 2079 log(LOG_DEBUG, "phyint_send on vif %ld err %d\n", 2080 (long)(vifp - viftable), error); 2081 } 2082 splx(s); 2083 } 2084 2085 /* determine the current time and then 2086 * the elapsed time (between the last time and time now) 2087 * in milliseconds & update the no. of tokens in the bucket 2088 */ 2089 static void 2090 tbf_update_tokens(struct vif *vifp) 2091 { 2092 struct timeval tp; 2093 u_int32_t tm; 2094 int s = splsoftnet(); 2095 2096 microtime(&tp); 2097 2098 TV_DELTA(tp, vifp->tbf_last_pkt_t, tm); 2099 2100 /* 2101 * This formula is actually 2102 * "time in seconds" * "bytes/second". 2103 * 2104 * (tm / 1000000) * (v_rate_limit * 1000 * (1000/1024) / 8) 2105 * 2106 * The (1000/1024) was introduced in add_vif to optimize 2107 * this divide into a shift. 2108 */ 2109 vifp->tbf_n_tok += tm * vifp->v_rate_limit / 8192; 2110 vifp->tbf_last_pkt_t = tp; 2111 2112 if (vifp->tbf_n_tok > MAX_BKT_SIZE) 2113 vifp->tbf_n_tok = MAX_BKT_SIZE; 2114 2115 splx(s); 2116 } 2117 2118 static int 2119 priority(struct vif *vifp, struct ip *ip) 2120 { 2121 int prio = 50; /* the lowest priority -- default case */ 2122 2123 /* temporary hack; may add general packet classifier some day */ 2124 2125 /* 2126 * The UDP port space is divided up into four priority ranges: 2127 * [0, 16384) : unclassified - lowest priority 2128 * [16384, 32768) : audio - highest priority 2129 * [32768, 49152) : whiteboard - medium priority 2130 * [49152, 65536) : video - low priority 2131 */ 2132 if (ip->ip_p == IPPROTO_UDP) { 2133 struct udphdr *udp = 2134 (struct udphdr *)(((char *)ip) + (ip->ip_hl << 2)); 2135 2136 switch (ntohs(udp->uh_dport) & 0xc000) { 2137 case 0x4000: 2138 prio = 70; 2139 break; 2140 case 0x8000: 2141 prio = 60; 2142 break; 2143 case 0xc000: 2144 prio = 55; 2145 break; 2146 } 2147 2148 if (tbfdebug > 1) 2149 log(LOG_DEBUG, "port %x prio %d\n", 2150 ntohs(udp->uh_dport), prio); 2151 } 2152 2153 return (prio); 2154 } 2155 2156 /* 2157 * End of token bucket filter modifications 2158 */ 2159 #ifdef RSVP_ISI 2160 int 2161 ip_rsvp_vif_init(struct socket *so, struct mbuf *m) 2162 { 2163 int vifi, s; 2164 2165 if (rsvpdebug) 2166 printf("ip_rsvp_vif_init: so_type = %d, pr_protocol = %d\n", 2167 so->so_type, so->so_proto->pr_protocol); 2168 2169 if (so->so_type != SOCK_RAW || 2170 so->so_proto->pr_protocol != IPPROTO_RSVP) 2171 return (EOPNOTSUPP); 2172 2173 /* Check mbuf. */ 2174 if (m == NULL || m->m_len != sizeof(int)) { 2175 return (EINVAL); 2176 } 2177 vifi = *(mtod(m, int *)); 2178 2179 if (rsvpdebug) 2180 printf("ip_rsvp_vif_init: vif = %d rsvp_on = %d\n", 2181 vifi, rsvp_on); 2182 2183 s = splsoftnet(); 2184 2185 /* Check vif. */ 2186 if (!legal_vif_num(vifi)) { 2187 splx(s); 2188 return (EADDRNOTAVAIL); 2189 } 2190 2191 /* Check if socket is available. */ 2192 if (viftable[vifi].v_rsvpd != NULL) { 2193 splx(s); 2194 return (EADDRINUSE); 2195 } 2196 2197 viftable[vifi].v_rsvpd = so; 2198 /* This may seem silly, but we need to be sure we don't over-increment 2199 * the RSVP counter, in case something slips up. 2200 */ 2201 if (!viftable[vifi].v_rsvp_on) { 2202 viftable[vifi].v_rsvp_on = 1; 2203 rsvp_on++; 2204 } 2205 2206 splx(s); 2207 return (0); 2208 } 2209 2210 int 2211 ip_rsvp_vif_done(struct socket *so, struct mbuf *m) 2212 { 2213 int vifi, s; 2214 2215 if (rsvpdebug) 2216 printf("ip_rsvp_vif_done: so_type = %d, pr_protocol = %d\n", 2217 so->so_type, so->so_proto->pr_protocol); 2218 2219 if (so->so_type != SOCK_RAW || 2220 so->so_proto->pr_protocol != IPPROTO_RSVP) 2221 return (EOPNOTSUPP); 2222 2223 /* Check mbuf. */ 2224 if (m == NULL || m->m_len != sizeof(int)) { 2225 return (EINVAL); 2226 } 2227 vifi = *(mtod(m, int *)); 2228 2229 s = splsoftnet(); 2230 2231 /* Check vif. */ 2232 if (!legal_vif_num(vifi)) { 2233 splx(s); 2234 return (EADDRNOTAVAIL); 2235 } 2236 2237 if (rsvpdebug) 2238 printf("ip_rsvp_vif_done: v_rsvpd = %x so = %x\n", 2239 viftable[vifi].v_rsvpd, so); 2240 2241 viftable[vifi].v_rsvpd = NULL; 2242 /* 2243 * This may seem silly, but we need to be sure we don't over-decrement 2244 * the RSVP counter, in case something slips up. 2245 */ 2246 if (viftable[vifi].v_rsvp_on) { 2247 viftable[vifi].v_rsvp_on = 0; 2248 rsvp_on--; 2249 } 2250 2251 splx(s); 2252 return (0); 2253 } 2254 2255 void 2256 ip_rsvp_force_done(struct socket *so) 2257 { 2258 int vifi, s; 2259 2260 /* Don't bother if it is not the right type of socket. */ 2261 if (so->so_type != SOCK_RAW || 2262 so->so_proto->pr_protocol != IPPROTO_RSVP) 2263 return; 2264 2265 s = splsoftnet(); 2266 2267 /* 2268 * The socket may be attached to more than one vif...this 2269 * is perfectly legal. 2270 */ 2271 for (vifi = 0; vifi < numvifs; vifi++) { 2272 if (viftable[vifi].v_rsvpd == so) { 2273 viftable[vifi].v_rsvpd = NULL; 2274 /* 2275 * This may seem silly, but we need to be sure we don't 2276 * over-decrement the RSVP counter, in case something 2277 * slips up. 2278 */ 2279 if (viftable[vifi].v_rsvp_on) { 2280 viftable[vifi].v_rsvp_on = 0; 2281 rsvp_on--; 2282 } 2283 } 2284 } 2285 2286 splx(s); 2287 return; 2288 } 2289 2290 void 2291 rsvp_input(struct mbuf *m, struct ifnet *ifp) 2292 { 2293 int vifi, s; 2294 struct ip *ip = mtod(m, struct ip *); 2295 static struct sockaddr_in rsvp_src = { sizeof(sin), AF_INET }; 2296 2297 if (rsvpdebug) 2298 printf("rsvp_input: rsvp_on %d\n", rsvp_on); 2299 2300 /* 2301 * Can still get packets with rsvp_on = 0 if there is a local member 2302 * of the group to which the RSVP packet is addressed. But in this 2303 * case we want to throw the packet away. 2304 */ 2305 if (!rsvp_on) { 2306 m_freem(m); 2307 return; 2308 } 2309 2310 /* 2311 * If the old-style non-vif-associated socket is set, then use 2312 * it and ignore the new ones. 2313 */ 2314 if (ip_rsvpd != NULL) { 2315 if (rsvpdebug) 2316 printf("rsvp_input: " 2317 "Sending packet up old-style socket\n"); 2318 rip_input(m, 0); /*XXX*/ 2319 return; 2320 } 2321 2322 s = splsoftnet(); 2323 2324 if (rsvpdebug) 2325 printf("rsvp_input: check vifs\n"); 2326 2327 /* Find which vif the packet arrived on. */ 2328 for (vifi = 0; vifi < numvifs; vifi++) { 2329 if (viftable[vifi].v_ifp == ifp) 2330 break; 2331 } 2332 2333 if (vifi == numvifs) { 2334 /* Can't find vif packet arrived on. Drop packet. */ 2335 if (rsvpdebug) 2336 printf("rsvp_input: " 2337 "Can't find vif for packet...dropping it.\n"); 2338 m_freem(m); 2339 splx(s); 2340 return; 2341 } 2342 2343 if (rsvpdebug) 2344 printf("rsvp_input: check socket\n"); 2345 2346 if (viftable[vifi].v_rsvpd == NULL) { 2347 /* 2348 * drop packet, since there is no specific socket for this 2349 * interface 2350 */ 2351 if (rsvpdebug) 2352 printf("rsvp_input: No socket defined for vif %d\n", 2353 vifi); 2354 m_freem(m); 2355 splx(s); 2356 return; 2357 } 2358 2359 rsvp_src.sin_addr = ip->ip_src; 2360 2361 if (rsvpdebug && m) 2362 printf("rsvp_input: m->m_len = %d, sbspace() = %d\n", 2363 m->m_len, sbspace(&viftable[vifi].v_rsvpd->so_rcv)); 2364 2365 if (socket_send(viftable[vifi].v_rsvpd, m, &rsvp_src) < 0) 2366 if (rsvpdebug) 2367 printf("rsvp_input: Failed to append to socket\n"); 2368 else 2369 if (rsvpdebug) 2370 printf("rsvp_input: send packet up\n"); 2371 2372 splx(s); 2373 } 2374 #endif /* RSVP_ISI */ 2375 2376 /* 2377 * Code for bandwidth monitors 2378 */ 2379 2380 /* 2381 * Define common interface for timeval-related methods 2382 */ 2383 #define BW_TIMEVALCMP(tvp, uvp, cmp) timercmp((tvp), (uvp), cmp) 2384 #define BW_TIMEVALDECR(vvp, uvp) timersub((vvp), (uvp), (vvp)) 2385 #define BW_TIMEVALADD(vvp, uvp) timeradd((vvp), (uvp), (vvp)) 2386 2387 static uint32_t 2388 compute_bw_meter_flags(struct bw_upcall *req) 2389 { 2390 uint32_t flags = 0; 2391 2392 if (req->bu_flags & BW_UPCALL_UNIT_PACKETS) 2393 flags |= BW_METER_UNIT_PACKETS; 2394 if (req->bu_flags & BW_UPCALL_UNIT_BYTES) 2395 flags |= BW_METER_UNIT_BYTES; 2396 if (req->bu_flags & BW_UPCALL_GEQ) 2397 flags |= BW_METER_GEQ; 2398 if (req->bu_flags & BW_UPCALL_LEQ) 2399 flags |= BW_METER_LEQ; 2400 2401 return (flags); 2402 } 2403 2404 /* 2405 * Add a bw_meter entry 2406 */ 2407 static int 2408 add_bw_upcall(struct mbuf *m) 2409 { 2410 int s; 2411 struct mfc *mfc; 2412 struct timeval delta = { BW_UPCALL_THRESHOLD_INTERVAL_MIN_SEC, 2413 BW_UPCALL_THRESHOLD_INTERVAL_MIN_USEC }; 2414 struct timeval now; 2415 struct bw_meter *x; 2416 uint32_t flags; 2417 struct bw_upcall *req; 2418 2419 if (m == NULL || m->m_len < sizeof(struct bw_upcall)) 2420 return (EINVAL); 2421 2422 req = mtod(m, struct bw_upcall *); 2423 2424 if (!(mrt_api_config & MRT_MFC_BW_UPCALL)) 2425 return (EOPNOTSUPP); 2426 2427 /* Test if the flags are valid */ 2428 if (!(req->bu_flags & (BW_UPCALL_UNIT_PACKETS | BW_UPCALL_UNIT_BYTES))) 2429 return (EINVAL); 2430 if (!(req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ))) 2431 return (EINVAL); 2432 if ((req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ)) 2433 == (BW_UPCALL_GEQ | BW_UPCALL_LEQ)) 2434 return (EINVAL); 2435 2436 /* Test if the threshold time interval is valid */ 2437 if (BW_TIMEVALCMP(&req->bu_threshold.b_time, &delta, <)) 2438 return (EINVAL); 2439 2440 flags = compute_bw_meter_flags(req); 2441 2442 /* Find if we have already same bw_meter entry */ 2443 s = splsoftnet(); 2444 mfc = mfc_find(&req->bu_src, &req->bu_dst); 2445 if (mfc == NULL) { 2446 splx(s); 2447 return (EADDRNOTAVAIL); 2448 } 2449 for (x = mfc->mfc_bw_meter; x != NULL; x = x->bm_mfc_next) { 2450 if ((BW_TIMEVALCMP(&x->bm_threshold.b_time, 2451 &req->bu_threshold.b_time, ==)) && 2452 (x->bm_threshold.b_packets == 2453 req->bu_threshold.b_packets) && 2454 (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) && 2455 (x->bm_flags & BW_METER_USER_FLAGS) == flags) { 2456 splx(s); 2457 return (0); /* XXX Already installed */ 2458 } 2459 } 2460 2461 /* Allocate the new bw_meter entry */ 2462 x = (struct bw_meter *)malloc(sizeof(*x), M_BWMETER, M_NOWAIT); 2463 if (x == NULL) { 2464 splx(s); 2465 return (ENOBUFS); 2466 } 2467 2468 /* Set the new bw_meter entry */ 2469 x->bm_threshold.b_time = req->bu_threshold.b_time; 2470 microtime(&now); 2471 x->bm_start_time = now; 2472 x->bm_threshold.b_packets = req->bu_threshold.b_packets; 2473 x->bm_threshold.b_bytes = req->bu_threshold.b_bytes; 2474 x->bm_measured.b_packets = 0; 2475 x->bm_measured.b_bytes = 0; 2476 x->bm_flags = flags; 2477 x->bm_time_next = NULL; 2478 x->bm_time_hash = BW_METER_BUCKETS; 2479 2480 /* Add the new bw_meter entry to the front of entries for this MFC */ 2481 x->bm_mfc = mfc; 2482 x->bm_mfc_next = mfc->mfc_bw_meter; 2483 mfc->mfc_bw_meter = x; 2484 schedule_bw_meter(x, &now); 2485 splx(s); 2486 2487 return (0); 2488 } 2489 2490 static void 2491 free_bw_list(struct bw_meter *list) 2492 { 2493 while (list != NULL) { 2494 struct bw_meter *x = list; 2495 2496 list = list->bm_mfc_next; 2497 unschedule_bw_meter(x); 2498 free(x, M_BWMETER); 2499 } 2500 } 2501 2502 /* 2503 * Delete one or multiple bw_meter entries 2504 */ 2505 static int 2506 del_bw_upcall(struct mbuf *m) 2507 { 2508 int s; 2509 struct mfc *mfc; 2510 struct bw_meter *x; 2511 struct bw_upcall *req; 2512 2513 if (m == NULL || m->m_len < sizeof(struct bw_upcall)) 2514 return (EINVAL); 2515 2516 req = mtod(m, struct bw_upcall *); 2517 2518 if (!(mrt_api_config & MRT_MFC_BW_UPCALL)) 2519 return (EOPNOTSUPP); 2520 2521 s = splsoftnet(); 2522 /* Find the corresponding MFC entry */ 2523 mfc = mfc_find(&req->bu_src, &req->bu_dst); 2524 if (mfc == NULL) { 2525 splx(s); 2526 return (EADDRNOTAVAIL); 2527 } else if (req->bu_flags & BW_UPCALL_DELETE_ALL) { 2528 /* Delete all bw_meter entries for this mfc */ 2529 struct bw_meter *list; 2530 2531 list = mfc->mfc_bw_meter; 2532 mfc->mfc_bw_meter = NULL; 2533 free_bw_list(list); 2534 splx(s); 2535 return (0); 2536 } else { /* Delete a single bw_meter entry */ 2537 struct bw_meter *prev; 2538 uint32_t flags = 0; 2539 2540 flags = compute_bw_meter_flags(req); 2541 2542 /* Find the bw_meter entry to delete */ 2543 for (prev = NULL, x = mfc->mfc_bw_meter; x != NULL; 2544 prev = x, x = x->bm_mfc_next) { 2545 if ((BW_TIMEVALCMP(&x->bm_threshold.b_time, 2546 &req->bu_threshold.b_time, ==)) && 2547 (x->bm_threshold.b_packets == 2548 req->bu_threshold.b_packets) && 2549 (x->bm_threshold.b_bytes == 2550 req->bu_threshold.b_bytes) && 2551 (x->bm_flags & BW_METER_USER_FLAGS) == flags) 2552 break; 2553 } 2554 if (x != NULL) { /* Delete entry from the list for this MFC */ 2555 if (prev != NULL) { 2556 /* remove from middle */ 2557 prev->bm_mfc_next = x->bm_mfc_next; 2558 } else { 2559 /* new head of list */ 2560 x->bm_mfc->mfc_bw_meter = x->bm_mfc_next; 2561 } 2562 2563 unschedule_bw_meter(x); 2564 splx(s); 2565 /* Free the bw_meter entry */ 2566 free(x, M_BWMETER); 2567 return (0); 2568 } else { 2569 splx(s); 2570 return (EINVAL); 2571 } 2572 } 2573 /* NOTREACHED */ 2574 } 2575 2576 /* 2577 * Perform bandwidth measurement processing that may result in an upcall 2578 */ 2579 static void 2580 bw_meter_receive_packet(struct bw_meter *x, int plen, struct timeval *nowp) 2581 { 2582 struct timeval delta; 2583 2584 delta = *nowp; 2585 BW_TIMEVALDECR(&delta, &x->bm_start_time); 2586 2587 if (x->bm_flags & BW_METER_GEQ) { 2588 /* Processing for ">=" type of bw_meter entry */ 2589 if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) { 2590 /* Reset the bw_meter entry */ 2591 x->bm_start_time = *nowp; 2592 x->bm_measured.b_packets = 0; 2593 x->bm_measured.b_bytes = 0; 2594 x->bm_flags &= ~BW_METER_UPCALL_DELIVERED; 2595 } 2596 2597 /* Record that a packet is received */ 2598 x->bm_measured.b_packets++; 2599 x->bm_measured.b_bytes += plen; 2600 2601 /* Test if we should deliver an upcall */ 2602 if (!(x->bm_flags & BW_METER_UPCALL_DELIVERED)) { 2603 if (((x->bm_flags & BW_METER_UNIT_PACKETS) && 2604 (x->bm_measured.b_packets >= 2605 x->bm_threshold.b_packets)) || 2606 ((x->bm_flags & BW_METER_UNIT_BYTES) && 2607 (x->bm_measured.b_bytes >= 2608 x->bm_threshold.b_bytes))) { 2609 /* Prepare an upcall for delivery */ 2610 bw_meter_prepare_upcall(x, nowp); 2611 x->bm_flags |= BW_METER_UPCALL_DELIVERED; 2612 } 2613 } 2614 } else if (x->bm_flags & BW_METER_LEQ) { 2615 /* Processing for "<=" type of bw_meter entry */ 2616 if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) { 2617 /* 2618 * We are behind time with the multicast forwarding 2619 * table scanning for "<=" type of bw_meter entries, 2620 * so test now if we should deliver an upcall. 2621 */ 2622 if (((x->bm_flags & BW_METER_UNIT_PACKETS) && 2623 (x->bm_measured.b_packets <= 2624 x->bm_threshold.b_packets)) || 2625 ((x->bm_flags & BW_METER_UNIT_BYTES) && 2626 (x->bm_measured.b_bytes <= 2627 x->bm_threshold.b_bytes))) { 2628 /* Prepare an upcall for delivery */ 2629 bw_meter_prepare_upcall(x, nowp); 2630 } 2631 /* Reschedule the bw_meter entry */ 2632 unschedule_bw_meter(x); 2633 schedule_bw_meter(x, nowp); 2634 } 2635 2636 /* Record that a packet is received */ 2637 x->bm_measured.b_packets++; 2638 x->bm_measured.b_bytes += plen; 2639 2640 /* Test if we should restart the measuring interval */ 2641 if ((x->bm_flags & BW_METER_UNIT_PACKETS && 2642 x->bm_measured.b_packets <= x->bm_threshold.b_packets) || 2643 (x->bm_flags & BW_METER_UNIT_BYTES && 2644 x->bm_measured.b_bytes <= x->bm_threshold.b_bytes)) { 2645 /* Don't restart the measuring interval */ 2646 } else { 2647 /* Do restart the measuring interval */ 2648 /* 2649 * XXX: note that we don't unschedule and schedule, 2650 * because this might be too much overhead per packet. 2651 * Instead, when we process all entries for a given 2652 * timer hash bin, we check whether it is really a 2653 * timeout. If not, we reschedule at that time. 2654 */ 2655 x->bm_start_time = *nowp; 2656 x->bm_measured.b_packets = 0; 2657 x->bm_measured.b_bytes = 0; 2658 x->bm_flags &= ~BW_METER_UPCALL_DELIVERED; 2659 } 2660 } 2661 } 2662 2663 /* 2664 * Prepare a bandwidth-related upcall 2665 */ 2666 static void 2667 bw_meter_prepare_upcall(struct bw_meter *x, struct timeval *nowp) 2668 { 2669 struct timeval delta; 2670 struct bw_upcall *u; 2671 2672 /* Compute the measured time interval */ 2673 delta = *nowp; 2674 BW_TIMEVALDECR(&delta, &x->bm_start_time); 2675 2676 /* If there are too many pending upcalls, deliver them now */ 2677 if (bw_upcalls_n >= BW_UPCALLS_MAX) 2678 bw_upcalls_send(); 2679 2680 /* Set the bw_upcall entry */ 2681 u = &bw_upcalls[bw_upcalls_n++]; 2682 u->bu_src = x->bm_mfc->mfc_origin; 2683 u->bu_dst = x->bm_mfc->mfc_mcastgrp; 2684 u->bu_threshold.b_time = x->bm_threshold.b_time; 2685 u->bu_threshold.b_packets = x->bm_threshold.b_packets; 2686 u->bu_threshold.b_bytes = x->bm_threshold.b_bytes; 2687 u->bu_measured.b_time = delta; 2688 u->bu_measured.b_packets = x->bm_measured.b_packets; 2689 u->bu_measured.b_bytes = x->bm_measured.b_bytes; 2690 u->bu_flags = 0; 2691 if (x->bm_flags & BW_METER_UNIT_PACKETS) 2692 u->bu_flags |= BW_UPCALL_UNIT_PACKETS; 2693 if (x->bm_flags & BW_METER_UNIT_BYTES) 2694 u->bu_flags |= BW_UPCALL_UNIT_BYTES; 2695 if (x->bm_flags & BW_METER_GEQ) 2696 u->bu_flags |= BW_UPCALL_GEQ; 2697 if (x->bm_flags & BW_METER_LEQ) 2698 u->bu_flags |= BW_UPCALL_LEQ; 2699 } 2700 2701 /* 2702 * Send the pending bandwidth-related upcalls 2703 */ 2704 static void 2705 bw_upcalls_send(void) 2706 { 2707 struct mbuf *m; 2708 int len = bw_upcalls_n * sizeof(bw_upcalls[0]); 2709 struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; 2710 static struct igmpmsg igmpmsg = { 2711 0, /* unused1 */ 2712 0, /* unused2 */ 2713 IGMPMSG_BW_UPCALL, /* im_msgtype */ 2714 0, /* im_mbz */ 2715 0, /* im_vif */ 2716 0, /* unused3 */ 2717 { 0 }, /* im_src */ 2718 { 0 } }; /* im_dst */ 2719 2720 if (bw_upcalls_n == 0) 2721 return; /* No pending upcalls */ 2722 2723 bw_upcalls_n = 0; 2724 2725 /* 2726 * Allocate a new mbuf, initialize it with the header and 2727 * the payload for the pending calls. 2728 */ 2729 MGETHDR(m, M_DONTWAIT, MT_HEADER); 2730 if (m == NULL) { 2731 log(LOG_WARNING, "bw_upcalls_send: cannot allocate mbuf\n"); 2732 return; 2733 } 2734 2735 m->m_len = m->m_pkthdr.len = 0; 2736 m_copyback(m, 0, sizeof(struct igmpmsg), (caddr_t)&igmpmsg); 2737 m_copyback(m, sizeof(struct igmpmsg), len, (caddr_t)&bw_upcalls[0]); 2738 2739 /* 2740 * Send the upcalls 2741 * XXX do we need to set the address in k_igmpsrc ? 2742 */ 2743 mrtstat.mrts_upcalls++; 2744 if (socket_send(ip_mrouter, m, &k_igmpsrc) < 0) { 2745 log(LOG_WARNING, 2746 "bw_upcalls_send: ip_mrouter socket queue full\n"); 2747 ++mrtstat.mrts_upq_sockfull; 2748 } 2749 } 2750 2751 /* 2752 * Compute the timeout hash value for the bw_meter entries 2753 */ 2754 #define BW_METER_TIMEHASH(bw_meter, hash) do { \ 2755 struct timeval next_timeval = (bw_meter)->bm_start_time; \ 2756 \ 2757 BW_TIMEVALADD(&next_timeval, &(bw_meter)->bm_threshold.b_time); \ 2758 (hash) = next_timeval.tv_sec; \ 2759 if (next_timeval.tv_usec) \ 2760 (hash)++; /* XXX: make sure we don't timeout early */ \ 2761 (hash) %= BW_METER_BUCKETS; \ 2762 } while (/*CONSTCOND*/ 0) 2763 2764 /* 2765 * Schedule a timer to process periodically bw_meter entry of type "<=" 2766 * by linking the entry in the proper hash bucket. 2767 */ 2768 static void 2769 schedule_bw_meter(struct bw_meter *x, struct timeval *nowp) 2770 { 2771 int time_hash; 2772 2773 if (!(x->bm_flags & BW_METER_LEQ)) 2774 return; /* XXX: we schedule timers only for "<=" entries */ 2775 2776 /* Reset the bw_meter entry */ 2777 x->bm_start_time = *nowp; 2778 x->bm_measured.b_packets = 0; 2779 x->bm_measured.b_bytes = 0; 2780 x->bm_flags &= ~BW_METER_UPCALL_DELIVERED; 2781 2782 /* Compute the timeout hash value and insert the entry */ 2783 BW_METER_TIMEHASH(x, time_hash); 2784 x->bm_time_next = bw_meter_timers[time_hash]; 2785 bw_meter_timers[time_hash] = x; 2786 x->bm_time_hash = time_hash; 2787 } 2788 2789 /* 2790 * Unschedule the periodic timer that processes bw_meter entry of type "<=" 2791 * by removing the entry from the proper hash bucket. 2792 */ 2793 static void 2794 unschedule_bw_meter(struct bw_meter *x) 2795 { 2796 int time_hash; 2797 struct bw_meter *prev, *tmp; 2798 2799 if (!(x->bm_flags & BW_METER_LEQ)) 2800 return; /* XXX: we schedule timers only for "<=" entries */ 2801 2802 /* Compute the timeout hash value and delete the entry */ 2803 time_hash = x->bm_time_hash; 2804 if (time_hash >= BW_METER_BUCKETS) 2805 return; /* Entry was not scheduled */ 2806 2807 for (prev = NULL, tmp = bw_meter_timers[time_hash]; 2808 tmp != NULL; prev = tmp, tmp = tmp->bm_time_next) 2809 if (tmp == x) 2810 break; 2811 2812 if (tmp == NULL) 2813 panic("unschedule_bw_meter: bw_meter entry not found"); 2814 2815 if (prev != NULL) 2816 prev->bm_time_next = x->bm_time_next; 2817 else 2818 bw_meter_timers[time_hash] = x->bm_time_next; 2819 2820 x->bm_time_next = NULL; 2821 x->bm_time_hash = BW_METER_BUCKETS; 2822 } 2823 2824 /* 2825 * Process all "<=" type of bw_meter that should be processed now, 2826 * and for each entry prepare an upcall if necessary. Each processed 2827 * entry is rescheduled again for the (periodic) processing. 2828 * 2829 * This is run periodically (once per second normally). On each round, 2830 * all the potentially matching entries are in the hash slot that we are 2831 * looking at. 2832 */ 2833 static void 2834 bw_meter_process() 2835 { 2836 int s; 2837 static uint32_t last_tv_sec; /* last time we processed this */ 2838 2839 uint32_t loops; 2840 int i; 2841 struct timeval now, process_endtime; 2842 2843 microtime(&now); 2844 if (last_tv_sec == now.tv_sec) 2845 return; /* nothing to do */ 2846 2847 loops = now.tv_sec - last_tv_sec; 2848 last_tv_sec = now.tv_sec; 2849 if (loops > BW_METER_BUCKETS) 2850 loops = BW_METER_BUCKETS; 2851 2852 s = splsoftnet(); 2853 /* 2854 * Process all bins of bw_meter entries from the one after the last 2855 * processed to the current one. On entry, i points to the last bucket 2856 * visited, so we need to increment i at the beginning of the loop. 2857 */ 2858 for (i = (now.tv_sec - loops) % BW_METER_BUCKETS; loops > 0; loops--) { 2859 struct bw_meter *x, *tmp_list; 2860 2861 if (++i >= BW_METER_BUCKETS) 2862 i = 0; 2863 2864 /* Disconnect the list of bw_meter entries from the bin */ 2865 tmp_list = bw_meter_timers[i]; 2866 bw_meter_timers[i] = NULL; 2867 2868 /* Process the list of bw_meter entries */ 2869 while (tmp_list != NULL) { 2870 x = tmp_list; 2871 tmp_list = tmp_list->bm_time_next; 2872 2873 /* Test if the time interval is over */ 2874 process_endtime = x->bm_start_time; 2875 BW_TIMEVALADD(&process_endtime, 2876 &x->bm_threshold.b_time); 2877 if (BW_TIMEVALCMP(&process_endtime, &now, >)) { 2878 /* Not yet: reschedule, but don't reset */ 2879 int time_hash; 2880 2881 BW_METER_TIMEHASH(x, time_hash); 2882 if (time_hash == i && 2883 process_endtime.tv_sec == now.tv_sec) { 2884 /* 2885 * XXX: somehow the bin processing is 2886 * a bit ahead of time. Put the entry 2887 * in the next bin. 2888 */ 2889 if (++time_hash >= BW_METER_BUCKETS) 2890 time_hash = 0; 2891 } 2892 x->bm_time_next = bw_meter_timers[time_hash]; 2893 bw_meter_timers[time_hash] = x; 2894 x->bm_time_hash = time_hash; 2895 2896 continue; 2897 } 2898 2899 /* Test if we should deliver an upcall */ 2900 if (((x->bm_flags & BW_METER_UNIT_PACKETS) && 2901 (x->bm_measured.b_packets <= 2902 x->bm_threshold.b_packets)) || 2903 ((x->bm_flags & BW_METER_UNIT_BYTES) && 2904 (x->bm_measured.b_bytes <= 2905 x->bm_threshold.b_bytes))) { 2906 /* Prepare an upcall for delivery */ 2907 bw_meter_prepare_upcall(x, &now); 2908 } 2909 2910 /* Reschedule for next processing */ 2911 schedule_bw_meter(x, &now); 2912 } 2913 } 2914 2915 /* Send all upcalls that are pending delivery */ 2916 bw_upcalls_send(); 2917 2918 splx(s); 2919 } 2920 2921 /* 2922 * A periodic function for sending all upcalls that are pending delivery 2923 */ 2924 static void 2925 expire_bw_upcalls_send(void *unused) 2926 { 2927 int s; 2928 2929 s = splsoftnet(); 2930 bw_upcalls_send(); 2931 splx(s); 2932 2933 timeout_add(&bw_upcalls_ch, BW_UPCALLS_PERIOD); 2934 } 2935 2936 /* 2937 * A periodic function for periodic scanning of the multicast forwarding 2938 * table for processing all "<=" bw_meter entries. 2939 */ 2940 static void 2941 expire_bw_meter_process(void *unused) 2942 { 2943 if (mrt_api_config & MRT_MFC_BW_UPCALL) 2944 bw_meter_process(); 2945 2946 timeout_add(&bw_meter_ch, BW_METER_PERIOD); 2947 } 2948 2949 /* 2950 * End of bandwidth monitoring code 2951 */ 2952 2953 #ifdef PIM 2954 /* 2955 * Send the packet up to the user daemon, or eventually do kernel encapsulation 2956 */ 2957 static int 2958 pim_register_send(struct ip *ip, struct vif *vifp, 2959 struct mbuf *m, struct mfc *rt) 2960 { 2961 struct mbuf *mb_copy, *mm; 2962 2963 if (mrtdebug & DEBUG_PIM) 2964 log(LOG_DEBUG, "pim_register_send: "); 2965 2966 mb_copy = pim_register_prepare(ip, m); 2967 if (mb_copy == NULL) 2968 return (ENOBUFS); 2969 2970 /* 2971 * Send all the fragments. Note that the mbuf for each fragment 2972 * is freed by the sending machinery. 2973 */ 2974 for (mm = mb_copy; mm; mm = mb_copy) { 2975 mb_copy = mm->m_nextpkt; 2976 mm->m_nextpkt = NULL; 2977 mm = m_pullup(mm, sizeof(struct ip)); 2978 if (mm != NULL) { 2979 ip = mtod(mm, struct ip *); 2980 if ((mrt_api_config & MRT_MFC_RP) && 2981 !in_nullhost(rt->mfc_rp)) { 2982 pim_register_send_rp(ip, vifp, mm, rt); 2983 } else { 2984 pim_register_send_upcall(ip, vifp, mm, rt); 2985 } 2986 } 2987 } 2988 2989 return (0); 2990 } 2991 2992 /* 2993 * Return a copy of the data packet that is ready for PIM Register 2994 * encapsulation. 2995 * XXX: Note that in the returned copy the IP header is a valid one. 2996 */ 2997 static struct mbuf * 2998 pim_register_prepare(struct ip *ip, struct mbuf *m) 2999 { 3000 struct mbuf *mb_copy = NULL; 3001 int mtu; 3002 3003 /* Take care of delayed checksums */ 3004 if (m->m_pkthdr.csum_flags & (M_TCPV4_CSUM_OUT | M_UDPV4_CSUM_OUT)) { 3005 in_delayed_cksum(m); 3006 m->m_pkthdr.csum_flags &= 3007 ~(M_UDPV4_CSUM_OUT | M_TCPV4_CSUM_OUT); 3008 } 3009 3010 /* 3011 * Copy the old packet & pullup its IP header into the 3012 * new mbuf so we can modify it. 3013 */ 3014 mb_copy = m_copy(m, 0, M_COPYALL); 3015 if (mb_copy == NULL) 3016 return (NULL); 3017 mb_copy = m_pullup(mb_copy, ip->ip_hl << 2); 3018 if (mb_copy == NULL) 3019 return (NULL); 3020 3021 /* take care of the TTL */ 3022 ip = mtod(mb_copy, struct ip *); 3023 --ip->ip_ttl; 3024 3025 /* Compute the MTU after the PIM Register encapsulation */ 3026 mtu = 0xffff - sizeof(pim_encap_iphdr) - sizeof(pim_encap_pimhdr); 3027 3028 if (ntohs(ip->ip_len) <= mtu) { 3029 /* Turn the IP header into a valid one */ 3030 ip->ip_sum = 0; 3031 ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2); 3032 } else { 3033 /* Fragment the packet */ 3034 if (ip_fragment(mb_copy, NULL, mtu) != 0) { 3035 /* XXX: mb_copy was freed by ip_fragment() */ 3036 return (NULL); 3037 } 3038 } 3039 return (mb_copy); 3040 } 3041 3042 /* 3043 * Send an upcall with the data packet to the user-level process. 3044 */ 3045 static int 3046 pim_register_send_upcall(struct ip *ip, struct vif *vifp, 3047 struct mbuf *mb_copy, struct mfc *rt) 3048 { 3049 struct mbuf *mb_first; 3050 int len = ntohs(ip->ip_len); 3051 struct igmpmsg *im; 3052 struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; 3053 3054 /* Add a new mbuf with an upcall header */ 3055 MGETHDR(mb_first, M_DONTWAIT, MT_HEADER); 3056 if (mb_first == NULL) { 3057 m_freem(mb_copy); 3058 return (ENOBUFS); 3059 } 3060 mb_first->m_data += max_linkhdr; 3061 mb_first->m_pkthdr.len = len + sizeof(struct igmpmsg); 3062 mb_first->m_len = sizeof(struct igmpmsg); 3063 mb_first->m_next = mb_copy; 3064 3065 /* Send message to routing daemon */ 3066 im = mtod(mb_first, struct igmpmsg *); 3067 im->im_msgtype = IGMPMSG_WHOLEPKT; 3068 im->im_mbz = 0; 3069 im->im_vif = vifp - viftable; 3070 im->im_src = ip->ip_src; 3071 im->im_dst = ip->ip_dst; 3072 3073 k_igmpsrc.sin_addr = ip->ip_src; 3074 3075 mrtstat.mrts_upcalls++; 3076 3077 if (socket_send(ip_mrouter, mb_first, &k_igmpsrc) < 0) { 3078 if (mrtdebug & DEBUG_PIM) 3079 log(LOG_WARNING, "mcast: pim_register_send_upcall: " 3080 "ip_mrouter socket queue full"); 3081 ++mrtstat.mrts_upq_sockfull; 3082 return (ENOBUFS); 3083 } 3084 3085 /* Keep statistics */ 3086 pimstat.pims_snd_registers_msgs++; 3087 pimstat.pims_snd_registers_bytes += len; 3088 3089 return (0); 3090 } 3091 3092 /* 3093 * Encapsulate the data packet in PIM Register message and send it to the RP. 3094 */ 3095 static int 3096 pim_register_send_rp(struct ip *ip, struct vif *vifp, 3097 struct mbuf *mb_copy, struct mfc *rt) 3098 { 3099 struct mbuf *mb_first; 3100 struct ip *ip_outer; 3101 struct pim_encap_pimhdr *pimhdr; 3102 int len = ntohs(ip->ip_len); 3103 vifi_t vifi = rt->mfc_parent; 3104 3105 if ((vifi >= numvifs) || in_nullhost(viftable[vifi].v_lcl_addr)) { 3106 m_freem(mb_copy); 3107 return (EADDRNOTAVAIL); /* The iif vif is invalid */ 3108 } 3109 3110 /* Add a new mbuf with the encapsulating header */ 3111 MGETHDR(mb_first, M_DONTWAIT, MT_HEADER); 3112 if (mb_first == NULL) { 3113 m_freem(mb_copy); 3114 return (ENOBUFS); 3115 } 3116 mb_first->m_data += max_linkhdr; 3117 mb_first->m_len = sizeof(pim_encap_iphdr) + sizeof(pim_encap_pimhdr); 3118 mb_first->m_next = mb_copy; 3119 3120 mb_first->m_pkthdr.len = len + mb_first->m_len; 3121 3122 /* Fill in the encapsulating IP and PIM header */ 3123 ip_outer = mtod(mb_first, struct ip *); 3124 *ip_outer = pim_encap_iphdr; 3125 ip_outer->ip_id = htons(ip_randomid()); 3126 ip_outer->ip_len = htons(len + sizeof(pim_encap_iphdr) + 3127 sizeof(pim_encap_pimhdr)); 3128 ip_outer->ip_src = viftable[vifi].v_lcl_addr; 3129 ip_outer->ip_dst = rt->mfc_rp; 3130 /* 3131 * Copy the inner header TOS to the outer header, and take care of the 3132 * IP_DF bit. 3133 */ 3134 ip_outer->ip_tos = ip->ip_tos; 3135 if (ntohs(ip->ip_off) & IP_DF) 3136 ip_outer->ip_off |= htons(IP_DF); 3137 pimhdr = (struct pim_encap_pimhdr *)((caddr_t)ip_outer 3138 + sizeof(pim_encap_iphdr)); 3139 *pimhdr = pim_encap_pimhdr; 3140 /* If the iif crosses a border, set the Border-bit */ 3141 if (rt->mfc_flags[vifi] & MRT_MFC_FLAGS_BORDER_VIF & mrt_api_config) 3142 pimhdr->flags |= htonl(PIM_BORDER_REGISTER); 3143 3144 mb_first->m_data += sizeof(pim_encap_iphdr); 3145 pimhdr->pim.pim_cksum = in_cksum(mb_first, sizeof(pim_encap_pimhdr)); 3146 mb_first->m_data -= sizeof(pim_encap_iphdr); 3147 3148 if (vifp->v_rate_limit == 0) 3149 tbf_send_packet(vifp, mb_first); 3150 else 3151 tbf_control(vifp, mb_first, ip, ntohs(ip_outer->ip_len)); 3152 3153 /* Keep statistics */ 3154 pimstat.pims_snd_registers_msgs++; 3155 pimstat.pims_snd_registers_bytes += len; 3156 3157 return (0); 3158 } 3159 3160 /* 3161 * PIM-SMv2 and PIM-DM messages processing. 3162 * Receives and verifies the PIM control messages, and passes them 3163 * up to the listening socket, using rip_input(). 3164 * The only message with special processing is the PIM_REGISTER message 3165 * (used by PIM-SM): the PIM header is stripped off, and the inner packet 3166 * is passed to if_simloop(). 3167 */ 3168 void 3169 pim_input(struct mbuf *m, ...) 3170 { 3171 struct ip *ip = mtod(m, struct ip *); 3172 struct pim *pim; 3173 int minlen; 3174 int datalen; 3175 int ip_tos; 3176 int iphlen; 3177 va_list ap; 3178 3179 va_start(ap, m); 3180 iphlen = va_arg(ap, int); 3181 va_end(ap); 3182 3183 datalen = ntohs(ip->ip_len) - iphlen; 3184 3185 /* Keep statistics */ 3186 pimstat.pims_rcv_total_msgs++; 3187 pimstat.pims_rcv_total_bytes += datalen; 3188 3189 /* Validate lengths */ 3190 if (datalen < PIM_MINLEN) { 3191 pimstat.pims_rcv_tooshort++; 3192 log(LOG_ERR, "pim_input: packet size too small %d from %lx\n", 3193 datalen, (u_long)ip->ip_src.s_addr); 3194 m_freem(m); 3195 return; 3196 } 3197 3198 /* 3199 * If the packet is at least as big as a REGISTER, go agead 3200 * and grab the PIM REGISTER header size, to avoid another 3201 * possible m_pullup() later. 3202 * 3203 * PIM_MINLEN == pimhdr + u_int32_t == 4 + 4 = 8 3204 * PIM_REG_MINLEN == pimhdr + reghdr + encap_iphdr == 4 + 4 + 20 = 28 3205 */ 3206 minlen = iphlen + (datalen >= PIM_REG_MINLEN ? 3207 PIM_REG_MINLEN : PIM_MINLEN); 3208 /* 3209 * Get the IP and PIM headers in contiguous memory, and 3210 * possibly the PIM REGISTER header. 3211 */ 3212 if ((m->m_flags & M_EXT || m->m_len < minlen) && 3213 (m = m_pullup(m, minlen)) == NULL) { 3214 log(LOG_ERR, "pim_input: m_pullup failure\n"); 3215 return; 3216 } 3217 /* m_pullup() may have given us a new mbuf so reset ip. */ 3218 ip = mtod(m, struct ip *); 3219 ip_tos = ip->ip_tos; 3220 3221 /* adjust mbuf to point to the PIM header */ 3222 m->m_data += iphlen; 3223 m->m_len -= iphlen; 3224 pim = mtod(m, struct pim *); 3225 3226 /* 3227 * Validate checksum. If PIM REGISTER, exclude the data packet. 3228 * 3229 * XXX: some older PIMv2 implementations don't make this distinction, 3230 * so for compatibility reason perform the checksum over part of the 3231 * message, and if error, then over the whole message. 3232 */ 3233 if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER && 3234 in_cksum(m, PIM_MINLEN) == 0) { 3235 /* do nothing, checksum okay */ 3236 } else if (in_cksum(m, datalen)) { 3237 pimstat.pims_rcv_badsum++; 3238 if (mrtdebug & DEBUG_PIM) 3239 log(LOG_DEBUG, "pim_input: invalid checksum"); 3240 m_freem(m); 3241 return; 3242 } 3243 3244 /* PIM version check */ 3245 if (PIM_VT_V(pim->pim_vt) < PIM_VERSION) { 3246 pimstat.pims_rcv_badversion++; 3247 log(LOG_ERR, "pim_input: incorrect version %d, expecting %d\n", 3248 PIM_VT_V(pim->pim_vt), PIM_VERSION); 3249 m_freem(m); 3250 return; 3251 } 3252 3253 /* restore mbuf back to the outer IP */ 3254 m->m_data -= iphlen; 3255 m->m_len += iphlen; 3256 3257 if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER) { 3258 /* 3259 * Since this is a REGISTER, we'll make a copy of the register 3260 * headers ip + pim + u_int32 + encap_ip, to be passed up to the 3261 * routing daemon. 3262 */ 3263 int s; 3264 struct sockaddr_in dst = { sizeof(dst), AF_INET }; 3265 struct mbuf *mcp; 3266 struct ip *encap_ip; 3267 u_int32_t *reghdr; 3268 struct ifnet *vifp; 3269 3270 s = splsoftnet(); 3271 if ((reg_vif_num >= numvifs) || (reg_vif_num == VIFI_INVALID)) { 3272 splx(s); 3273 if (mrtdebug & DEBUG_PIM) 3274 log(LOG_DEBUG, "pim_input: register vif " 3275 "not set: %d\n", reg_vif_num); 3276 m_freem(m); 3277 return; 3278 } 3279 /* XXX need refcnt? */ 3280 vifp = viftable[reg_vif_num].v_ifp; 3281 splx(s); 3282 3283 /* Validate length */ 3284 if (datalen < PIM_REG_MINLEN) { 3285 pimstat.pims_rcv_tooshort++; 3286 pimstat.pims_rcv_badregisters++; 3287 log(LOG_ERR, "pim_input: register packet size " 3288 "too small %d from %lx\n", 3289 datalen, (u_long)ip->ip_src.s_addr); 3290 m_freem(m); 3291 return; 3292 } 3293 3294 reghdr = (u_int32_t *)(pim + 1); 3295 encap_ip = (struct ip *)(reghdr + 1); 3296 3297 if (mrtdebug & DEBUG_PIM) { 3298 log(LOG_DEBUG, "pim_input[register], encap_ip: " 3299 "%lx -> %lx, encap_ip len %d\n", 3300 (u_long)ntohl(encap_ip->ip_src.s_addr), 3301 (u_long)ntohl(encap_ip->ip_dst.s_addr), 3302 ntohs(encap_ip->ip_len)); 3303 } 3304 3305 /* verify the version number of the inner packet */ 3306 if (encap_ip->ip_v != IPVERSION) { 3307 pimstat.pims_rcv_badregisters++; 3308 if (mrtdebug & DEBUG_PIM) { 3309 log(LOG_DEBUG, "pim_input: invalid IP version" 3310 " (%d) of the inner packet\n", 3311 encap_ip->ip_v); 3312 } 3313 m_freem(m); 3314 return; 3315 } 3316 3317 /* verify the inner packet is destined to a mcast group */ 3318 if (!IN_MULTICAST(encap_ip->ip_dst.s_addr)) { 3319 pimstat.pims_rcv_badregisters++; 3320 if (mrtdebug & DEBUG_PIM) 3321 log(LOG_DEBUG, 3322 "pim_input: inner packet of register is" 3323 " not multicast %lx\n", 3324 (u_long)ntohl(encap_ip->ip_dst.s_addr)); 3325 m_freem(m); 3326 return; 3327 } 3328 3329 /* If a NULL_REGISTER, pass it to the daemon */ 3330 if ((ntohl(*reghdr) & PIM_NULL_REGISTER)) 3331 goto pim_input_to_daemon; 3332 3333 /* 3334 * Copy the TOS from the outer IP header to the inner 3335 * IP header. 3336 */ 3337 if (encap_ip->ip_tos != ip_tos) { 3338 /* Outer TOS -> inner TOS */ 3339 encap_ip->ip_tos = ip_tos; 3340 /* Recompute the inner header checksum. Sigh... */ 3341 3342 /* adjust mbuf to point to the inner IP header */ 3343 m->m_data += (iphlen + PIM_MINLEN); 3344 m->m_len -= (iphlen + PIM_MINLEN); 3345 3346 encap_ip->ip_sum = 0; 3347 encap_ip->ip_sum = in_cksum(m, encap_ip->ip_hl << 2); 3348 3349 /* restore mbuf to point back to the outer IP header */ 3350 m->m_data -= (iphlen + PIM_MINLEN); 3351 m->m_len += (iphlen + PIM_MINLEN); 3352 } 3353 3354 /* 3355 * Decapsulate the inner IP packet and loopback to forward it 3356 * as a normal multicast packet. Also, make a copy of the 3357 * outer_iphdr + pimhdr + reghdr + encap_iphdr 3358 * to pass to the daemon later, so it can take the appropriate 3359 * actions (e.g., send back PIM_REGISTER_STOP). 3360 * XXX: here m->m_data points to the outer IP header. 3361 */ 3362 mcp = m_copy(m, 0, iphlen + PIM_REG_MINLEN); 3363 if (mcp == NULL) { 3364 log(LOG_ERR, "pim_input: pim register: could not " 3365 "copy register head\n"); 3366 m_freem(m); 3367 return; 3368 } 3369 3370 /* Keep statistics */ 3371 /* XXX: registers_bytes include only the encap. mcast pkt */ 3372 pimstat.pims_rcv_registers_msgs++; 3373 pimstat.pims_rcv_registers_bytes += ntohs(encap_ip->ip_len); 3374 3375 /* forward the inner ip packet; point m_data at the inner ip. */ 3376 m_adj(m, iphlen + PIM_MINLEN); 3377 3378 if (mrtdebug & DEBUG_PIM) { 3379 log(LOG_DEBUG, 3380 "pim_input: forwarding decapsulated register: " 3381 "src %lx, dst %lx, vif %d\n", 3382 (u_long)ntohl(encap_ip->ip_src.s_addr), 3383 (u_long)ntohl(encap_ip->ip_dst.s_addr), 3384 reg_vif_num); 3385 } 3386 /* NB: vifp was collected above; can it change on us? */ 3387 looutput(vifp, m, (struct sockaddr *)&dst, 3388 (struct rtentry *)NULL); 3389 3390 /* prepare the register head to send to the mrouting daemon */ 3391 m = mcp; 3392 } 3393 3394 pim_input_to_daemon: 3395 /* 3396 * Pass the PIM message up to the daemon; if it is a Register message, 3397 * pass the 'head' only up to the daemon. This includes the 3398 * outer IP header, PIM header, PIM-Register header and the 3399 * inner IP header. 3400 * XXX: the outer IP header pkt size of a Register is not adjust to 3401 * reflect the fact that the inner multicast data is truncated. 3402 */ 3403 rip_input(m); 3404 3405 return; 3406 } 3407 3408 /* 3409 * Sysctl for pim variables. 3410 */ 3411 int 3412 pim_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, 3413 void *newp, size_t newlen) 3414 { 3415 /* All sysctl names at this level are terminal. */ 3416 if (namelen != 1) 3417 return (ENOTDIR); 3418 3419 switch (name[0]) { 3420 case PIMCTL_STATS: 3421 if (newp != NULL) 3422 return (EPERM); 3423 return (sysctl_struct(oldp, oldlenp, newp, newlen, 3424 &pimstat, sizeof(pimstat))); 3425 3426 default: 3427 return (ENOPROTOOPT); 3428 } 3429 /* NOTREACHED */ 3430 } 3431 3432 3433 #endif /* PIM */ 3434