1 /* $NetBSD: ip_mroute.c,v 1.164 2020/11/12 13:13:45 kardel Exp $ */ 2 3 /* 4 * Copyright (c) 1992, 1993 5 * The Regents of the University of California. All rights reserved. 6 * 7 * This code is derived from software contributed to Berkeley by 8 * Stephen Deering of Stanford University. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * @(#)ip_mroute.c 8.2 (Berkeley) 11/15/93 35 */ 36 37 /* 38 * Copyright (c) 1989 Stephen Deering 39 * 40 * This code is derived from software contributed to Berkeley by 41 * Stephen Deering of Stanford University. 42 * 43 * Redistribution and use in source and binary forms, with or without 44 * modification, are permitted provided that the following conditions 45 * are met: 46 * 1. Redistributions of source code must retain the above copyright 47 * notice, this list of conditions and the following disclaimer. 48 * 2. Redistributions in binary form must reproduce the above copyright 49 * notice, this list of conditions and the following disclaimer in the 50 * documentation and/or other materials provided with the distribution. 51 * 3. All advertising materials mentioning features or use of this software 52 * must display the following acknowledgement: 53 * This product includes software developed by the University of 54 * California, Berkeley and its contributors. 55 * 4. Neither the name of the University nor the names of its contributors 56 * may be used to endorse or promote products derived from this software 57 * without specific prior written permission. 58 * 59 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 60 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 61 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 62 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 63 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 64 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 65 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 66 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 67 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 68 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 69 * SUCH DAMAGE. 70 * 71 * @(#)ip_mroute.c 8.2 (Berkeley) 11/15/93 72 */ 73 74 /* 75 * IP multicast forwarding procedures 76 * 77 * Written by David Waitzman, BBN Labs, August 1988. 78 * Modified by Steve Deering, Stanford, February 1989. 79 * Modified by Mark J. Steiglitz, Stanford, May, 1991 80 * Modified by Van Jacobson, LBL, January 1993 81 * Modified by Ajit Thyagarajan, PARC, August 1993 82 * Modified by Bill Fenner, PARC, April 1994 83 * Modified by Charles M. Hannum, NetBSD, May 1995. 84 * Modified by Ahmed Helmy, SGI, June 1996 85 * Modified by George Edmond Eddy (Rusty), ISI, February 1998 86 * Modified by Pavlin Radoslavov, USC/ISI, May 1998, August 1999, October 2000 87 * Modified by Hitoshi Asaeda, WIDE, August 2000 88 * Modified by Pavlin Radoslavov, ICSI, October 2002 89 * 90 * MROUTING Revision: 1.2 91 * and PIM-SMv2 and PIM-DM support, advanced API support, 92 * bandwidth metering and signaling 93 */ 94 95 #include <sys/cdefs.h> 96 __KERNEL_RCSID(0, "$NetBSD: ip_mroute.c,v 1.164 2020/11/12 13:13:45 kardel Exp $"); 97 98 #ifdef _KERNEL_OPT 99 #include "opt_inet.h" 100 #include "opt_ipsec.h" 101 #include "opt_pim.h" 102 #endif 103 104 #ifdef PIM 105 #define _PIM_VT 1 106 #endif 107 108 #include <sys/param.h> 109 #include <sys/systm.h> 110 #include <sys/callout.h> 111 #include <sys/mbuf.h> 112 #include <sys/socket.h> 113 #include <sys/socketvar.h> 114 #include <sys/errno.h> 115 #include <sys/time.h> 116 #include <sys/kernel.h> 117 #include <sys/kmem.h> 118 #include <sys/ioctl.h> 119 #include <sys/syslog.h> 120 121 #include <net/if.h> 122 #include <net/raw_cb.h> 123 124 #include <netinet/in.h> 125 #include <netinet/in_var.h> 126 #include <netinet/in_systm.h> 127 #include <netinet/in_offload.h> 128 #include <netinet/ip.h> 129 #include <netinet/ip_var.h> 130 #include <netinet/in_pcb.h> 131 #include <netinet/udp.h> 132 #include <netinet/igmp.h> 133 #include <netinet/igmp_var.h> 134 #include <netinet/ip_mroute.h> 135 #ifdef PIM 136 #include <netinet/pim.h> 137 #include <netinet/pim_var.h> 138 #endif 139 #include <netinet/ip_encap.h> 140 141 #ifdef IPSEC 142 #include <netipsec/ipsec.h> 143 #include <netipsec/key.h> 144 #endif 145 146 #define IP_MULTICASTOPTS 0 147 #define M_PULLUP(m, len) \ 148 do { \ 149 if ((m) && ((m)->m_flags & M_EXT || (m)->m_len < (len))) \ 150 (m) = m_pullup((m), (len)); \ 151 } while (/*CONSTCOND*/ 0) 152 153 /* 154 * Globals. All but ip_mrouter and ip_mrtproto could be static, 155 * except for netstat or debugging purposes. 156 */ 157 struct socket *ip_mrouter = NULL; 158 int ip_mrtproto = IGMP_DVMRP; /* for netstat only */ 159 160 #define MFCHASH(a, g) \ 161 ((((a).s_addr >> 20) ^ ((a).s_addr >> 10) ^ (a).s_addr ^ \ 162 ((g).s_addr >> 20) ^ ((g).s_addr >> 10) ^ (g).s_addr) & mfchash) 163 LIST_HEAD(mfchashhdr, mfc) *mfchashtbl; 164 u_long mfchash; 165 166 u_char nexpire[MFCTBLSIZ]; 167 struct vif viftable[MAXVIFS]; 168 struct mrtstat mrtstat; 169 u_int mrtdebug = 0; /* debug level */ 170 #define DEBUG_MFC 0x02 171 #define DEBUG_FORWARD 0x04 172 #define DEBUG_EXPIRE 0x08 173 #define DEBUG_XMIT 0x10 174 #define DEBUG_PIM 0x20 175 176 #define VIFI_INVALID ((vifi_t) -1) 177 178 u_int tbfdebug = 0; /* tbf debug level */ 179 180 /* vif attachment using sys/netinet/ip_encap.c */ 181 static void vif_input(struct mbuf *, int, int, void *); 182 static int vif_encapcheck(struct mbuf *, int, int, void *); 183 184 static const struct encapsw vif_encapsw = { 185 .encapsw4 = { 186 .pr_input = vif_input, 187 .pr_ctlinput = NULL, 188 } 189 }; 190 191 #define EXPIRE_TIMEOUT (hz / 4) /* 4x / second */ 192 #define UPCALL_EXPIRE 6 /* number of timeouts */ 193 194 /* 195 * Define the token bucket filter structures 196 */ 197 198 #define TBF_REPROCESS (hz / 100) /* 100x / second */ 199 200 static int get_sg_cnt(struct sioc_sg_req *); 201 static int get_vif_cnt(struct sioc_vif_req *); 202 static int ip_mrouter_init(struct socket *, int); 203 static int set_assert(int); 204 static int add_vif(struct vifctl *); 205 static int del_vif(vifi_t *); 206 static void update_mfc_params(struct mfc *, struct mfcctl2 *); 207 static void init_mfc_params(struct mfc *, struct mfcctl2 *); 208 static void expire_mfc(struct mfc *); 209 static int add_mfc(struct sockopt *); 210 #ifdef UPCALL_TIMING 211 static void collate(struct timeval *); 212 #endif 213 static int del_mfc(struct sockopt *); 214 static int set_api_config(struct sockopt *); /* chose API capabilities */ 215 static int socket_send(struct socket *, struct mbuf *, struct sockaddr_in *); 216 static void expire_upcalls(void *); 217 static int ip_mdq(struct mbuf *, struct ifnet *, struct mfc *); 218 static void phyint_send(struct ip *, struct vif *, struct mbuf *); 219 static void encap_send(struct ip *, struct vif *, struct mbuf *); 220 static void tbf_control(struct vif *, struct mbuf *, struct ip *, u_int32_t); 221 static void tbf_queue(struct vif *, struct mbuf *); 222 static void tbf_process_q(struct vif *); 223 static void tbf_reprocess_q(void *); 224 static int tbf_dq_sel(struct vif *, struct ip *); 225 static void tbf_send_packet(struct vif *, struct mbuf *); 226 static void tbf_update_tokens(struct vif *); 227 static int priority(struct vif *, struct ip *); 228 static int ip_mforward_real(struct mbuf *, struct ifnet *); 229 230 231 /* 232 * Bandwidth monitoring 233 */ 234 static void free_bw_list(struct bw_meter *); 235 static int add_bw_upcall(struct bw_upcall *); 236 static int del_bw_upcall(struct bw_upcall *); 237 static void bw_meter_receive_packet(struct bw_meter *, int , struct timeval *); 238 static void bw_meter_prepare_upcall(struct bw_meter *, struct timeval *); 239 static void bw_upcalls_send(void); 240 static void schedule_bw_meter(struct bw_meter *, struct timeval *); 241 static void unschedule_bw_meter(struct bw_meter *); 242 static void bw_meter_process(void); 243 static void expire_bw_upcalls_send(void *); 244 static void expire_bw_meter_process(void *); 245 246 #ifdef PIM 247 static int pim_register_send(struct ip *, struct vif *, 248 struct mbuf *, struct mfc *); 249 static int pim_register_send_rp(struct ip *, struct vif *, 250 struct mbuf *, struct mfc *); 251 static int pim_register_send_upcall(struct ip *, struct vif *, 252 struct mbuf *, struct mfc *); 253 static struct mbuf *pim_register_prepare(struct ip *, struct mbuf *); 254 #endif 255 256 #define ENCAP_TTL 64 257 #define ENCAP_PROTO IPPROTO_IPIP 258 259 /* prototype IP hdr for encapsulated packets */ 260 static const struct ip multicast_encap_iphdr = { 261 .ip_hl = sizeof(struct ip) >> 2, 262 .ip_v = IPVERSION, 263 .ip_len = sizeof(struct ip), 264 .ip_ttl = ENCAP_TTL, 265 .ip_p = ENCAP_PROTO, 266 }; 267 268 /* 269 * Bandwidth meter variables and constants 270 */ 271 272 /* 273 * Pending timeouts are stored in a hash table, the key being the 274 * expiration time. Periodically, the entries are analysed and processed. 275 */ 276 #define BW_METER_BUCKETS 1024 277 static struct bw_meter *bw_meter_timers[BW_METER_BUCKETS]; 278 struct callout bw_meter_ch; 279 #define BW_METER_PERIOD (hz) /* periodical handling of bw meters */ 280 281 /* 282 * Pending upcalls are stored in a vector which is flushed when 283 * full, or periodically 284 */ 285 static struct bw_upcall bw_upcalls[BW_UPCALLS_MAX]; 286 static u_int bw_upcalls_n; /* # of pending upcalls */ 287 struct callout bw_upcalls_ch; 288 #define BW_UPCALLS_PERIOD (hz) /* periodical flush of bw upcalls */ 289 290 #ifdef PIM 291 struct pimstat pimstat; 292 293 /* 294 * Note: the PIM Register encapsulation adds the following in front of a 295 * data packet: 296 * 297 * struct pim_encap_hdr { 298 * struct ip ip; 299 * struct pim_encap_pimhdr pim; 300 * } 301 */ 302 303 struct pim_encap_pimhdr { 304 struct pim pim; 305 uint32_t flags; 306 }; 307 308 static struct ip pim_encap_iphdr = { 309 .ip_v = IPVERSION, 310 .ip_hl = sizeof(struct ip) >> 2, 311 .ip_len = sizeof(struct ip), 312 .ip_ttl = ENCAP_TTL, 313 .ip_p = IPPROTO_PIM, 314 }; 315 316 static struct pim_encap_pimhdr pim_encap_pimhdr = { 317 { 318 PIM_MAKE_VT(PIM_VERSION, PIM_REGISTER), /* PIM vers and message type */ 319 0, /* reserved */ 320 0, /* checksum */ 321 }, 322 0 /* flags */ 323 }; 324 325 static struct ifnet multicast_register_if; 326 static vifi_t reg_vif_num = VIFI_INVALID; 327 #endif /* PIM */ 328 329 330 /* 331 * Private variables. 332 */ 333 static vifi_t numvifs = 0; 334 335 static struct callout expire_upcalls_ch; 336 337 /* 338 * whether or not special PIM assert processing is enabled. 339 */ 340 static int pim_assert; 341 /* 342 * Rate limit for assert notification messages, in usec 343 */ 344 #define ASSERT_MSG_TIME 3000000 345 346 /* 347 * Kernel multicast routing API capabilities and setup. 348 * If more API capabilities are added to the kernel, they should be 349 * recorded in `mrt_api_support'. 350 */ 351 static const u_int32_t mrt_api_support = (MRT_MFC_FLAGS_DISABLE_WRONGVIF | 352 MRT_MFC_FLAGS_BORDER_VIF | 353 MRT_MFC_RP | 354 MRT_MFC_BW_UPCALL); 355 static u_int32_t mrt_api_config = 0; 356 357 /* 358 * Find a route for a given origin IP address and Multicast group address 359 * Type of service parameter to be added in the future!!! 360 * Statistics are updated by the caller if needed 361 * (mrtstat.mrts_mfc_lookups and mrtstat.mrts_mfc_misses) 362 */ 363 static struct mfc * 364 mfc_find(struct in_addr *o, struct in_addr *g) 365 { 366 struct mfc *rt; 367 368 LIST_FOREACH(rt, &mfchashtbl[MFCHASH(*o, *g)], mfc_hash) { 369 if (in_hosteq(rt->mfc_origin, *o) && 370 in_hosteq(rt->mfc_mcastgrp, *g) && 371 (rt->mfc_stall == NULL)) 372 break; 373 } 374 375 return rt; 376 } 377 378 /* 379 * Macros to compute elapsed time efficiently 380 * Borrowed from Van Jacobson's scheduling code 381 */ 382 #define TV_DELTA(a, b, delta) do { \ 383 int xxs; \ 384 delta = (a).tv_usec - (b).tv_usec; \ 385 xxs = (a).tv_sec - (b).tv_sec; \ 386 switch (xxs) { \ 387 case 2: \ 388 delta += 1000000; \ 389 /* fall through */ \ 390 case 1: \ 391 delta += 1000000; \ 392 /* fall through */ \ 393 case 0: \ 394 break; \ 395 default: \ 396 delta += (1000000 * xxs); \ 397 break; \ 398 } \ 399 } while (/*CONSTCOND*/ 0) 400 401 #ifdef UPCALL_TIMING 402 u_int32_t upcall_data[51]; 403 #endif /* UPCALL_TIMING */ 404 405 /* 406 * Handle MRT setsockopt commands to modify the multicast routing tables. 407 */ 408 int 409 ip_mrouter_set(struct socket *so, struct sockopt *sopt) 410 { 411 int error; 412 int optval; 413 struct vifctl vifc; 414 vifi_t vifi; 415 struct bw_upcall bwuc; 416 417 if (sopt->sopt_name != MRT_INIT && so != ip_mrouter) 418 error = ENOPROTOOPT; 419 else { 420 switch (sopt->sopt_name) { 421 case MRT_INIT: 422 error = sockopt_getint(sopt, &optval); 423 if (error) 424 break; 425 426 error = ip_mrouter_init(so, optval); 427 break; 428 case MRT_DONE: 429 error = ip_mrouter_done(); 430 break; 431 case MRT_ADD_VIF: 432 error = sockopt_get(sopt, &vifc, sizeof(vifc)); 433 if (error) 434 break; 435 error = add_vif(&vifc); 436 break; 437 case MRT_DEL_VIF: 438 error = sockopt_get(sopt, &vifi, sizeof(vifi)); 439 if (error) 440 break; 441 error = del_vif(&vifi); 442 break; 443 case MRT_ADD_MFC: 444 error = add_mfc(sopt); 445 break; 446 case MRT_DEL_MFC: 447 error = del_mfc(sopt); 448 break; 449 case MRT_ASSERT: 450 error = sockopt_getint(sopt, &optval); 451 if (error) 452 break; 453 error = set_assert(optval); 454 break; 455 case MRT_API_CONFIG: 456 error = set_api_config(sopt); 457 break; 458 case MRT_ADD_BW_UPCALL: 459 error = sockopt_get(sopt, &bwuc, sizeof(bwuc)); 460 if (error) 461 break; 462 error = add_bw_upcall(&bwuc); 463 break; 464 case MRT_DEL_BW_UPCALL: 465 error = sockopt_get(sopt, &bwuc, sizeof(bwuc)); 466 if (error) 467 break; 468 error = del_bw_upcall(&bwuc); 469 break; 470 default: 471 error = ENOPROTOOPT; 472 break; 473 } 474 } 475 return error; 476 } 477 478 /* 479 * Handle MRT getsockopt commands 480 */ 481 int 482 ip_mrouter_get(struct socket *so, struct sockopt *sopt) 483 { 484 int error; 485 486 if (so != ip_mrouter) 487 error = ENOPROTOOPT; 488 else { 489 switch (sopt->sopt_name) { 490 case MRT_VERSION: 491 error = sockopt_setint(sopt, 0x0305); /* XXX !!!! */ 492 break; 493 case MRT_ASSERT: 494 error = sockopt_setint(sopt, pim_assert); 495 break; 496 case MRT_API_SUPPORT: 497 error = sockopt_set(sopt, &mrt_api_support, 498 sizeof(mrt_api_support)); 499 break; 500 case MRT_API_CONFIG: 501 error = sockopt_set(sopt, &mrt_api_config, 502 sizeof(mrt_api_config)); 503 break; 504 default: 505 error = ENOPROTOOPT; 506 break; 507 } 508 } 509 return error; 510 } 511 512 /* 513 * Handle ioctl commands to obtain information from the cache 514 */ 515 int 516 mrt_ioctl(struct socket *so, u_long cmd, void *data) 517 { 518 int error; 519 520 if (so != ip_mrouter) 521 error = EINVAL; 522 else 523 switch (cmd) { 524 case SIOCGETVIFCNT: 525 error = get_vif_cnt((struct sioc_vif_req *)data); 526 break; 527 case SIOCGETSGCNT: 528 error = get_sg_cnt((struct sioc_sg_req *)data); 529 break; 530 default: 531 error = EINVAL; 532 break; 533 } 534 535 return error; 536 } 537 538 /* 539 * returns the packet, byte, rpf-failure count for the source group provided 540 */ 541 static int 542 get_sg_cnt(struct sioc_sg_req *req) 543 { 544 int s; 545 struct mfc *rt; 546 547 s = splsoftnet(); 548 rt = mfc_find(&req->src, &req->grp); 549 if (rt == NULL) { 550 splx(s); 551 req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffff; 552 return EADDRNOTAVAIL; 553 } 554 req->pktcnt = rt->mfc_pkt_cnt; 555 req->bytecnt = rt->mfc_byte_cnt; 556 req->wrong_if = rt->mfc_wrong_if; 557 splx(s); 558 559 return 0; 560 } 561 562 /* 563 * returns the input and output packet and byte counts on the vif provided 564 */ 565 static int 566 get_vif_cnt(struct sioc_vif_req *req) 567 { 568 vifi_t vifi = req->vifi; 569 570 if (vifi >= numvifs) 571 return EINVAL; 572 573 req->icount = viftable[vifi].v_pkt_in; 574 req->ocount = viftable[vifi].v_pkt_out; 575 req->ibytes = viftable[vifi].v_bytes_in; 576 req->obytes = viftable[vifi].v_bytes_out; 577 578 return 0; 579 } 580 581 /* 582 * Enable multicast routing 583 */ 584 static int 585 ip_mrouter_init(struct socket *so, int v) 586 { 587 if (mrtdebug) 588 log(LOG_DEBUG, 589 "ip_mrouter_init: so_type = %d, pr_protocol = %d\n", 590 so->so_type, so->so_proto->pr_protocol); 591 592 if (so->so_type != SOCK_RAW || 593 so->so_proto->pr_protocol != IPPROTO_IGMP) 594 return EOPNOTSUPP; 595 596 if (v != 1) 597 return EINVAL; 598 599 if (ip_mrouter != NULL) 600 return EADDRINUSE; 601 602 ip_mrouter = so; 603 604 mfchashtbl = hashinit(MFCTBLSIZ, HASH_LIST, true, &mfchash); 605 memset((void *)nexpire, 0, sizeof(nexpire)); 606 607 pim_assert = 0; 608 609 callout_init(&expire_upcalls_ch, 0); 610 callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT, 611 expire_upcalls, NULL); 612 613 callout_init(&bw_upcalls_ch, 0); 614 callout_reset(&bw_upcalls_ch, BW_UPCALLS_PERIOD, 615 expire_bw_upcalls_send, NULL); 616 617 callout_init(&bw_meter_ch, 0); 618 callout_reset(&bw_meter_ch, BW_METER_PERIOD, 619 expire_bw_meter_process, NULL); 620 621 if (mrtdebug) 622 log(LOG_DEBUG, "ip_mrouter_init\n"); 623 624 return 0; 625 } 626 627 /* 628 * Disable multicast routing 629 */ 630 int 631 ip_mrouter_done(void) 632 { 633 vifi_t vifi; 634 struct vif *vifp; 635 int i; 636 int s; 637 638 s = splsoftnet(); 639 640 /* Clear out all the vifs currently in use. */ 641 for (vifi = 0; vifi < numvifs; vifi++) { 642 vifp = &viftable[vifi]; 643 if (!in_nullhost(vifp->v_lcl_addr)) 644 reset_vif(vifp); 645 } 646 647 numvifs = 0; 648 pim_assert = 0; 649 mrt_api_config = 0; 650 651 callout_stop(&expire_upcalls_ch); 652 callout_stop(&bw_upcalls_ch); 653 callout_stop(&bw_meter_ch); 654 655 /* 656 * Free all multicast forwarding cache entries. 657 */ 658 for (i = 0; i < MFCTBLSIZ; i++) { 659 struct mfc *rt, *nrt; 660 661 for (rt = LIST_FIRST(&mfchashtbl[i]); rt; rt = nrt) { 662 nrt = LIST_NEXT(rt, mfc_hash); 663 664 expire_mfc(rt); 665 } 666 } 667 668 memset((void *)nexpire, 0, sizeof(nexpire)); 669 hashdone(mfchashtbl, HASH_LIST, mfchash); 670 mfchashtbl = NULL; 671 672 bw_upcalls_n = 0; 673 memset(bw_meter_timers, 0, sizeof(bw_meter_timers)); 674 675 /* Reset de-encapsulation cache. */ 676 677 ip_mrouter = NULL; 678 679 splx(s); 680 681 if (mrtdebug) 682 log(LOG_DEBUG, "ip_mrouter_done\n"); 683 684 return 0; 685 } 686 687 void 688 ip_mrouter_detach(struct ifnet *ifp) 689 { 690 int vifi, i; 691 struct vif *vifp; 692 struct mfc *rt; 693 struct rtdetq *rte; 694 695 /* XXX not sure about side effect to userland routing daemon */ 696 for (vifi = 0; vifi < numvifs; vifi++) { 697 vifp = &viftable[vifi]; 698 if (vifp->v_ifp == ifp) 699 reset_vif(vifp); 700 } 701 for (i = 0; i < MFCTBLSIZ; i++) { 702 if (nexpire[i] == 0) 703 continue; 704 LIST_FOREACH(rt, &mfchashtbl[i], mfc_hash) { 705 for (rte = rt->mfc_stall; rte; rte = rte->next) { 706 if (rte->ifp == ifp) 707 rte->ifp = NULL; 708 } 709 } 710 } 711 } 712 713 /* 714 * Set PIM assert processing global 715 */ 716 static int 717 set_assert(int i) 718 { 719 pim_assert = !!i; 720 return 0; 721 } 722 723 /* 724 * Configure API capabilities 725 */ 726 static int 727 set_api_config(struct sockopt *sopt) 728 { 729 u_int32_t apival; 730 int i, error; 731 732 /* 733 * We can set the API capabilities only if it is the first operation 734 * after MRT_INIT. I.e.: 735 * - there are no vifs installed 736 * - pim_assert is not enabled 737 * - the MFC table is empty 738 */ 739 error = sockopt_get(sopt, &apival, sizeof(apival)); 740 if (error) 741 return error; 742 if (numvifs > 0) 743 return EPERM; 744 if (pim_assert) 745 return EPERM; 746 for (i = 0; i < MFCTBLSIZ; i++) { 747 if (LIST_FIRST(&mfchashtbl[i]) != NULL) 748 return EPERM; 749 } 750 751 mrt_api_config = apival & mrt_api_support; 752 return 0; 753 } 754 755 /* 756 * Add a vif to the vif table 757 */ 758 static int 759 add_vif(struct vifctl *vifcp) 760 { 761 struct vif *vifp; 762 struct ifnet *ifp; 763 int error, s; 764 struct sockaddr_in sin; 765 766 if (vifcp->vifc_vifi >= MAXVIFS) 767 return EINVAL; 768 if (in_nullhost(vifcp->vifc_lcl_addr)) 769 return EADDRNOTAVAIL; 770 771 vifp = &viftable[vifcp->vifc_vifi]; 772 if (!in_nullhost(vifp->v_lcl_addr)) 773 return EADDRINUSE; 774 775 /* Find the interface with an address in AF_INET family. */ 776 #ifdef PIM 777 if (vifcp->vifc_flags & VIFF_REGISTER) { 778 /* 779 * XXX: Because VIFF_REGISTER does not really need a valid 780 * local interface (e.g. it could be 127.0.0.2), we don't 781 * check its address. 782 */ 783 ifp = NULL; 784 } else 785 #endif 786 { 787 struct ifaddr *ifa; 788 789 sockaddr_in_init(&sin, &vifcp->vifc_lcl_addr, 0); 790 s = pserialize_read_enter(); 791 ifa = ifa_ifwithaddr(sintosa(&sin)); 792 if (ifa == NULL) { 793 pserialize_read_exit(s); 794 return EADDRNOTAVAIL; 795 } 796 ifp = ifa->ifa_ifp; 797 /* FIXME NOMPSAFE */ 798 pserialize_read_exit(s); 799 } 800 801 if (vifcp->vifc_flags & VIFF_TUNNEL) { 802 if (vifcp->vifc_flags & VIFF_SRCRT) { 803 log(LOG_ERR, "source routed tunnels not supported\n"); 804 return EOPNOTSUPP; 805 } 806 807 /* attach this vif to decapsulator dispatch table */ 808 /* 809 * XXX Use addresses in registration so that matching 810 * can be done with radix tree in decapsulator. But, 811 * we need to check inner header for multicast, so 812 * this requires both radix tree lookup and then a 813 * function to check, and this is not supported yet. 814 */ 815 error = encap_lock_enter(); 816 if (error) 817 return error; 818 vifp->v_encap_cookie = encap_attach_func(AF_INET, IPPROTO_IPV4, 819 vif_encapcheck, &vif_encapsw, vifp); 820 encap_lock_exit(); 821 if (!vifp->v_encap_cookie) 822 return EINVAL; 823 824 /* Create a fake encapsulation interface. */ 825 ifp = malloc(sizeof(*ifp), M_MRTABLE, M_WAITOK|M_ZERO); 826 snprintf(ifp->if_xname, sizeof(ifp->if_xname), 827 "mdecap%d", vifcp->vifc_vifi); 828 829 /* Prepare cached route entry. */ 830 memset(&vifp->v_route, 0, sizeof(vifp->v_route)); 831 #ifdef PIM 832 } else if (vifcp->vifc_flags & VIFF_REGISTER) { 833 ifp = &multicast_register_if; 834 if (mrtdebug) 835 log(LOG_DEBUG, "Adding a register vif, ifp: %p\n", 836 (void *)ifp); 837 if (reg_vif_num == VIFI_INVALID) { 838 memset(ifp, 0, sizeof(*ifp)); 839 snprintf(ifp->if_xname, sizeof(ifp->if_xname), 840 "register_vif"); 841 ifp->if_flags = IFF_LOOPBACK; 842 memset(&vifp->v_route, 0, sizeof(vifp->v_route)); 843 reg_vif_num = vifcp->vifc_vifi; 844 } 845 #endif 846 } else { 847 /* Make sure the interface supports multicast. */ 848 if ((ifp->if_flags & IFF_MULTICAST) == 0) 849 return EOPNOTSUPP; 850 851 /* Enable promiscuous reception of all IP multicasts. */ 852 sockaddr_in_init(&sin, &zeroin_addr, 0); 853 error = if_mcast_op(ifp, SIOCADDMULTI, sintosa(&sin)); 854 if (error) 855 return error; 856 } 857 858 s = splsoftnet(); 859 860 /* Define parameters for the tbf structure. */ 861 vifp->tbf_q = NULL; 862 vifp->tbf_t = &vifp->tbf_q; 863 microtime(&vifp->tbf_last_pkt_t); 864 vifp->tbf_n_tok = 0; 865 vifp->tbf_q_len = 0; 866 vifp->tbf_max_q_len = MAXQSIZE; 867 868 vifp->v_flags = vifcp->vifc_flags; 869 vifp->v_threshold = vifcp->vifc_threshold; 870 /* scaling up here allows division by 1024 in critical code */ 871 vifp->v_rate_limit = vifcp->vifc_rate_limit * 1024 / 1000; 872 vifp->v_lcl_addr = vifcp->vifc_lcl_addr; 873 vifp->v_rmt_addr = vifcp->vifc_rmt_addr; 874 vifp->v_ifp = ifp; 875 /* Initialize per vif pkt counters. */ 876 vifp->v_pkt_in = 0; 877 vifp->v_pkt_out = 0; 878 vifp->v_bytes_in = 0; 879 vifp->v_bytes_out = 0; 880 881 callout_init(&vifp->v_repq_ch, 0); 882 883 splx(s); 884 885 /* Adjust numvifs up if the vifi is higher than numvifs. */ 886 if (numvifs <= vifcp->vifc_vifi) 887 numvifs = vifcp->vifc_vifi + 1; 888 889 if (mrtdebug) 890 log(LOG_DEBUG, "add_vif #%d, lcladdr %x, %s %x, thresh %x, rate %d\n", 891 vifcp->vifc_vifi, 892 ntohl(vifcp->vifc_lcl_addr.s_addr), 893 (vifcp->vifc_flags & VIFF_TUNNEL) ? "rmtaddr" : "mask", 894 ntohl(vifcp->vifc_rmt_addr.s_addr), 895 vifcp->vifc_threshold, 896 vifcp->vifc_rate_limit); 897 898 return 0; 899 } 900 901 void 902 reset_vif(struct vif *vifp) 903 { 904 struct mbuf *m, *n; 905 struct ifnet *ifp; 906 struct sockaddr_in sin; 907 908 callout_stop(&vifp->v_repq_ch); 909 910 /* detach this vif from decapsulator dispatch table */ 911 encap_lock_enter(); 912 encap_detach(vifp->v_encap_cookie); 913 encap_lock_exit(); 914 vifp->v_encap_cookie = NULL; 915 916 /* 917 * Free packets queued at the interface 918 */ 919 for (m = vifp->tbf_q; m != NULL; m = n) { 920 n = m->m_nextpkt; 921 m_freem(m); 922 } 923 924 if (vifp->v_flags & VIFF_TUNNEL) 925 free(vifp->v_ifp, M_MRTABLE); 926 else if (vifp->v_flags & VIFF_REGISTER) { 927 #ifdef PIM 928 reg_vif_num = VIFI_INVALID; 929 #endif 930 } else { 931 sockaddr_in_init(&sin, &zeroin_addr, 0); 932 ifp = vifp->v_ifp; 933 if_mcast_op(ifp, SIOCDELMULTI, sintosa(&sin)); 934 } 935 memset((void *)vifp, 0, sizeof(*vifp)); 936 } 937 938 /* 939 * Delete a vif from the vif table 940 */ 941 static int 942 del_vif(vifi_t *vifip) 943 { 944 struct vif *vifp; 945 vifi_t vifi; 946 int s; 947 948 if (*vifip >= numvifs) 949 return EINVAL; 950 951 vifp = &viftable[*vifip]; 952 if (in_nullhost(vifp->v_lcl_addr)) 953 return EADDRNOTAVAIL; 954 955 s = splsoftnet(); 956 957 reset_vif(vifp); 958 959 /* Adjust numvifs down */ 960 for (vifi = numvifs; vifi > 0; vifi--) 961 if (!in_nullhost(viftable[vifi - 1].v_lcl_addr)) 962 break; 963 numvifs = vifi; 964 965 splx(s); 966 967 if (mrtdebug) 968 log(LOG_DEBUG, "del_vif %d, numvifs %d\n", *vifip, numvifs); 969 970 return 0; 971 } 972 973 /* 974 * update an mfc entry without resetting counters and S,G addresses. 975 */ 976 static void 977 update_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp) 978 { 979 int i; 980 981 rt->mfc_parent = mfccp->mfcc_parent; 982 for (i = 0; i < numvifs; i++) { 983 rt->mfc_ttls[i] = mfccp->mfcc_ttls[i]; 984 rt->mfc_flags[i] = mfccp->mfcc_flags[i] & mrt_api_config & 985 MRT_MFC_FLAGS_ALL; 986 } 987 /* set the RP address */ 988 if (mrt_api_config & MRT_MFC_RP) 989 rt->mfc_rp = mfccp->mfcc_rp; 990 else 991 rt->mfc_rp = zeroin_addr; 992 } 993 994 /* 995 * fully initialize an mfc entry from the parameter. 996 */ 997 static void 998 init_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp) 999 { 1000 rt->mfc_origin = mfccp->mfcc_origin; 1001 rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp; 1002 1003 update_mfc_params(rt, mfccp); 1004 1005 /* initialize pkt counters per src-grp */ 1006 rt->mfc_pkt_cnt = 0; 1007 rt->mfc_byte_cnt = 0; 1008 rt->mfc_wrong_if = 0; 1009 timerclear(&rt->mfc_last_assert); 1010 } 1011 1012 static void 1013 expire_mfc(struct mfc *rt) 1014 { 1015 struct rtdetq *rte, *nrte; 1016 1017 free_bw_list(rt->mfc_bw_meter); 1018 1019 for (rte = rt->mfc_stall; rte != NULL; rte = nrte) { 1020 nrte = rte->next; 1021 m_freem(rte->m); 1022 free(rte, M_MRTABLE); 1023 } 1024 1025 LIST_REMOVE(rt, mfc_hash); 1026 free(rt, M_MRTABLE); 1027 } 1028 1029 /* 1030 * Add an mfc entry 1031 */ 1032 static int 1033 add_mfc(struct sockopt *sopt) 1034 { 1035 struct mfcctl2 mfcctl2; 1036 struct mfcctl2 *mfccp; 1037 struct mfc *rt; 1038 u_int32_t hash = 0; 1039 struct rtdetq *rte, *nrte; 1040 u_short nstl; 1041 int s; 1042 int error; 1043 1044 /* 1045 * select data size depending on API version. 1046 */ 1047 mfccp = &mfcctl2; 1048 memset(&mfcctl2, 0, sizeof(mfcctl2)); 1049 1050 if (mrt_api_config & MRT_API_FLAGS_ALL) 1051 error = sockopt_get(sopt, mfccp, sizeof(struct mfcctl2)); 1052 else 1053 error = sockopt_get(sopt, mfccp, sizeof(struct mfcctl)); 1054 1055 if (error) 1056 return error; 1057 1058 s = splsoftnet(); 1059 rt = mfc_find(&mfccp->mfcc_origin, &mfccp->mfcc_mcastgrp); 1060 1061 /* If an entry already exists, just update the fields */ 1062 if (rt) { 1063 if (mrtdebug & DEBUG_MFC) 1064 log(LOG_DEBUG, "add_mfc update o %x g %x p %x\n", 1065 ntohl(mfccp->mfcc_origin.s_addr), 1066 ntohl(mfccp->mfcc_mcastgrp.s_addr), 1067 mfccp->mfcc_parent); 1068 1069 update_mfc_params(rt, mfccp); 1070 1071 splx(s); 1072 return 0; 1073 } 1074 1075 /* 1076 * Find the entry for which the upcall was made and update 1077 */ 1078 nstl = 0; 1079 hash = MFCHASH(mfccp->mfcc_origin, mfccp->mfcc_mcastgrp); 1080 LIST_FOREACH(rt, &mfchashtbl[hash], mfc_hash) { 1081 if (in_hosteq(rt->mfc_origin, mfccp->mfcc_origin) && 1082 in_hosteq(rt->mfc_mcastgrp, mfccp->mfcc_mcastgrp) && 1083 rt->mfc_stall != NULL) { 1084 if (nstl++) 1085 log(LOG_ERR, "add_mfc %s o %x g %x p %x dbx %p\n", 1086 "multiple kernel entries", 1087 ntohl(mfccp->mfcc_origin.s_addr), 1088 ntohl(mfccp->mfcc_mcastgrp.s_addr), 1089 mfccp->mfcc_parent, rt->mfc_stall); 1090 1091 if (mrtdebug & DEBUG_MFC) 1092 log(LOG_DEBUG, "add_mfc o %x g %x p %x dbg %p\n", 1093 ntohl(mfccp->mfcc_origin.s_addr), 1094 ntohl(mfccp->mfcc_mcastgrp.s_addr), 1095 mfccp->mfcc_parent, rt->mfc_stall); 1096 1097 rte = rt->mfc_stall; 1098 init_mfc_params(rt, mfccp); 1099 rt->mfc_stall = NULL; 1100 1101 rt->mfc_expire = 0; /* Don't clean this guy up */ 1102 nexpire[hash]--; 1103 1104 /* free packets Qed at the end of this entry */ 1105 for (; rte != NULL; rte = nrte) { 1106 nrte = rte->next; 1107 if (rte->ifp) { 1108 ip_mdq(rte->m, rte->ifp, rt); 1109 } 1110 m_freem(rte->m); 1111 #ifdef UPCALL_TIMING 1112 collate(&rte->t); 1113 #endif /* UPCALL_TIMING */ 1114 free(rte, M_MRTABLE); 1115 } 1116 } 1117 } 1118 1119 /* 1120 * It is possible that an entry is being inserted without an upcall 1121 */ 1122 if (nstl == 0) { 1123 /* 1124 * No mfc; make a new one 1125 */ 1126 if (mrtdebug & DEBUG_MFC) 1127 log(LOG_DEBUG, "add_mfc no upcall o %x g %x p %x\n", 1128 ntohl(mfccp->mfcc_origin.s_addr), 1129 ntohl(mfccp->mfcc_mcastgrp.s_addr), 1130 mfccp->mfcc_parent); 1131 1132 LIST_FOREACH(rt, &mfchashtbl[hash], mfc_hash) { 1133 if (in_hosteq(rt->mfc_origin, mfccp->mfcc_origin) && 1134 in_hosteq(rt->mfc_mcastgrp, mfccp->mfcc_mcastgrp)) { 1135 init_mfc_params(rt, mfccp); 1136 if (rt->mfc_expire) 1137 nexpire[hash]--; 1138 rt->mfc_expire = 0; 1139 break; /* XXX */ 1140 } 1141 } 1142 if (rt == NULL) { /* no upcall, so make a new entry */ 1143 rt = malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT); 1144 if (rt == NULL) { 1145 splx(s); 1146 return ENOBUFS; 1147 } 1148 1149 init_mfc_params(rt, mfccp); 1150 rt->mfc_expire = 0; 1151 rt->mfc_stall = NULL; 1152 rt->mfc_bw_meter = NULL; 1153 1154 /* insert new entry at head of hash chain */ 1155 LIST_INSERT_HEAD(&mfchashtbl[hash], rt, mfc_hash); 1156 } 1157 } 1158 1159 splx(s); 1160 return 0; 1161 } 1162 1163 #ifdef UPCALL_TIMING 1164 /* 1165 * collect delay statistics on the upcalls 1166 */ 1167 static void 1168 collate(struct timeval *t) 1169 { 1170 u_int32_t d; 1171 struct timeval tp; 1172 u_int32_t delta; 1173 1174 microtime(&tp); 1175 1176 if (timercmp(t, &tp, <)) { 1177 TV_DELTA(tp, *t, delta); 1178 1179 d = delta >> 10; 1180 if (d > 50) 1181 d = 50; 1182 1183 ++upcall_data[d]; 1184 } 1185 } 1186 #endif /* UPCALL_TIMING */ 1187 1188 /* 1189 * Delete an mfc entry 1190 */ 1191 static int 1192 del_mfc(struct sockopt *sopt) 1193 { 1194 struct mfcctl2 mfcctl2; 1195 struct mfcctl2 *mfccp; 1196 struct mfc *rt; 1197 int s; 1198 int error; 1199 1200 /* 1201 * XXX: for deleting MFC entries the information in entries 1202 * of size "struct mfcctl" is sufficient. 1203 */ 1204 1205 mfccp = &mfcctl2; 1206 memset(&mfcctl2, 0, sizeof(mfcctl2)); 1207 1208 error = sockopt_get(sopt, mfccp, sizeof(struct mfcctl)); 1209 if (error) { 1210 /* Try with the size of mfcctl2. */ 1211 error = sockopt_get(sopt, mfccp, sizeof(struct mfcctl2)); 1212 if (error) 1213 return error; 1214 } 1215 1216 if (mrtdebug & DEBUG_MFC) 1217 log(LOG_DEBUG, "del_mfc origin %x mcastgrp %x\n", 1218 ntohl(mfccp->mfcc_origin.s_addr), 1219 ntohl(mfccp->mfcc_mcastgrp.s_addr)); 1220 1221 s = splsoftnet(); 1222 1223 rt = mfc_find(&mfccp->mfcc_origin, &mfccp->mfcc_mcastgrp); 1224 if (rt == NULL) { 1225 splx(s); 1226 return EADDRNOTAVAIL; 1227 } 1228 1229 /* 1230 * free the bw_meter entries 1231 */ 1232 free_bw_list(rt->mfc_bw_meter); 1233 rt->mfc_bw_meter = NULL; 1234 1235 LIST_REMOVE(rt, mfc_hash); 1236 free(rt, M_MRTABLE); 1237 1238 splx(s); 1239 return 0; 1240 } 1241 1242 static int 1243 socket_send(struct socket *s, struct mbuf *mm, struct sockaddr_in *src) 1244 { 1245 if (s) { 1246 if (sbappendaddr(&s->so_rcv, sintosa(src), mm, NULL) != 0) { 1247 sorwakeup(s); 1248 return 0; 1249 } 1250 soroverflow(s); 1251 } 1252 m_freem(mm); 1253 return -1; 1254 } 1255 1256 /* 1257 * IP multicast forwarding function. This function assumes that the packet 1258 * pointed to by "ip" has arrived on (or is about to be sent to) the interface 1259 * pointed to by "ifp", and the packet is to be relayed to other networks 1260 * that have members of the packet's destination IP multicast group. 1261 * 1262 * The packet is returned unscathed to the caller, unless it is 1263 * erroneous, in which case a non-zero return value tells the caller to 1264 * discard it. 1265 */ 1266 1267 #define IP_HDR_LEN 20 /* # bytes of fixed IP header (excluding options) */ 1268 #define TUNNEL_LEN 12 /* # bytes of IP option for tunnel encapsulation */ 1269 1270 int 1271 ip_mforward(struct mbuf *m, struct ifnet *ifp) 1272 { 1273 int rc; 1274 /* 1275 * save csum_flags to uphold the 1276 * "unscathed" guarantee. 1277 * ip_output() relies on that and 1278 * without it we send out 1279 * multicast packets with an invalid 1280 * checksum 1281 * 1282 * see PR kern/55779 1283 */ 1284 int csum_flags = m->m_pkthdr.csum_flags; 1285 1286 /* 1287 * Temporarily clear any in-bound checksum flags for this packet. 1288 */ 1289 m->m_pkthdr.csum_flags = 0; 1290 1291 rc = ip_mforward_real(m, ifp); 1292 1293 m->m_pkthdr.csum_flags = csum_flags; 1294 1295 return rc; 1296 } 1297 1298 static int 1299 ip_mforward_real(struct mbuf *m, struct ifnet *ifp) 1300 { 1301 struct ip *ip = mtod(m, struct ip *); 1302 struct mfc *rt; 1303 static int srctun = 0; 1304 struct mbuf *mm; 1305 struct sockaddr_in sin; 1306 int s; 1307 vifi_t vifi; 1308 1309 if (mrtdebug & DEBUG_FORWARD) 1310 log(LOG_DEBUG, "ip_mforward: src %x, dst %x, ifp %p\n", 1311 ntohl(ip->ip_src.s_addr), ntohl(ip->ip_dst.s_addr), ifp); 1312 1313 /* 1314 * XXX XXX: Why do we check [1] against IPOPT_LSRR? Because we 1315 * expect [0] to be IPOPT_NOP, maybe? In all cases that doesn't 1316 * make a lot of sense, a forged packet can just put two IPOPT_NOPs 1317 * followed by one IPOPT_LSRR, and bypass the check. 1318 */ 1319 if (ip->ip_hl < (IP_HDR_LEN + TUNNEL_LEN) >> 2 || 1320 ((u_char *)(ip + 1))[1] != IPOPT_LSRR) { 1321 /* 1322 * Packet arrived via a physical interface or 1323 * an encapsulated tunnel or a register_vif. 1324 */ 1325 } else { 1326 /* 1327 * Packet arrived through a source-route tunnel. 1328 * Source-route tunnels are no longer supported. 1329 */ 1330 if ((srctun++ % 1000) == 0) 1331 log(LOG_ERR, 1332 "ip_mforward: received source-routed packet from %x\n", 1333 ntohl(ip->ip_src.s_addr)); 1334 return EOPNOTSUPP; 1335 } 1336 1337 /* 1338 * Don't forward a packet with time-to-live of zero or one, 1339 * or a packet destined to a local-only group. 1340 */ 1341 if (ip->ip_ttl <= 1 || IN_LOCAL_GROUP(ip->ip_dst.s_addr)) 1342 return 0; 1343 1344 /* 1345 * Determine forwarding vifs from the forwarding cache table 1346 */ 1347 s = splsoftnet(); 1348 ++mrtstat.mrts_mfc_lookups; 1349 rt = mfc_find(&ip->ip_src, &ip->ip_dst); 1350 1351 /* Entry exists, so forward if necessary */ 1352 if (rt != NULL) { 1353 splx(s); 1354 return ip_mdq(m, ifp, rt); 1355 } else { 1356 /* 1357 * If we don't have a route for packet's origin, make a copy 1358 * of the packet and send message to routing daemon. 1359 */ 1360 1361 struct mbuf *mb0; 1362 struct rtdetq *rte; 1363 u_int32_t hash; 1364 const int hlen = ip->ip_hl << 2; 1365 #ifdef UPCALL_TIMING 1366 struct timeval tp; 1367 microtime(&tp); 1368 #endif 1369 1370 ++mrtstat.mrts_mfc_misses; 1371 1372 mrtstat.mrts_no_route++; 1373 if (mrtdebug & (DEBUG_FORWARD | DEBUG_MFC)) 1374 log(LOG_DEBUG, "ip_mforward: no rte s %x g %x\n", 1375 ntohl(ip->ip_src.s_addr), 1376 ntohl(ip->ip_dst.s_addr)); 1377 1378 /* 1379 * Allocate mbufs early so that we don't do extra work if we are 1380 * just going to fail anyway. Make sure to pullup the header so 1381 * that other people can't step on it. 1382 */ 1383 rte = malloc(sizeof(*rte), M_MRTABLE, M_NOWAIT); 1384 if (rte == NULL) { 1385 splx(s); 1386 return ENOBUFS; 1387 } 1388 mb0 = m_copypacket(m, M_DONTWAIT); 1389 M_PULLUP(mb0, hlen); 1390 if (mb0 == NULL) { 1391 free(rte, M_MRTABLE); 1392 splx(s); 1393 return ENOBUFS; 1394 } 1395 1396 /* is there an upcall waiting for this flow? */ 1397 hash = MFCHASH(ip->ip_src, ip->ip_dst); 1398 LIST_FOREACH(rt, &mfchashtbl[hash], mfc_hash) { 1399 if (in_hosteq(ip->ip_src, rt->mfc_origin) && 1400 in_hosteq(ip->ip_dst, rt->mfc_mcastgrp) && 1401 rt->mfc_stall != NULL) 1402 break; 1403 } 1404 1405 if (rt == NULL) { 1406 int i; 1407 struct igmpmsg *im; 1408 1409 /* 1410 * Locate the vifi for the incoming interface for 1411 * this packet. 1412 * If none found, drop packet. 1413 */ 1414 for (vifi = 0; vifi < numvifs && 1415 viftable[vifi].v_ifp != ifp; vifi++) 1416 ; 1417 if (vifi >= numvifs) /* vif not found, drop packet */ 1418 goto non_fatal; 1419 1420 /* no upcall, so make a new entry */ 1421 rt = malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT); 1422 if (rt == NULL) 1423 goto fail; 1424 1425 /* 1426 * Make a copy of the header to send to the user level 1427 * process 1428 */ 1429 mm = m_copym(m, 0, hlen, M_DONTWAIT); 1430 M_PULLUP(mm, hlen); 1431 if (mm == NULL) 1432 goto fail1; 1433 1434 /* 1435 * Send message to routing daemon to install 1436 * a route into the kernel table 1437 */ 1438 1439 im = mtod(mm, struct igmpmsg *); 1440 im->im_msgtype = IGMPMSG_NOCACHE; 1441 im->im_mbz = 0; 1442 im->im_vif = vifi; 1443 1444 mrtstat.mrts_upcalls++; 1445 1446 sockaddr_in_init(&sin, &ip->ip_src, 0); 1447 if (socket_send(ip_mrouter, mm, &sin) < 0) { 1448 log(LOG_WARNING, 1449 "ip_mforward: ip_mrouter socket queue full\n"); 1450 ++mrtstat.mrts_upq_sockfull; 1451 fail1: 1452 free(rt, M_MRTABLE); 1453 fail: 1454 free(rte, M_MRTABLE); 1455 m_freem(mb0); 1456 splx(s); 1457 return ENOBUFS; 1458 } 1459 1460 /* insert new entry at head of hash chain */ 1461 rt->mfc_origin = ip->ip_src; 1462 rt->mfc_mcastgrp = ip->ip_dst; 1463 rt->mfc_pkt_cnt = 0; 1464 rt->mfc_byte_cnt = 0; 1465 rt->mfc_wrong_if = 0; 1466 rt->mfc_expire = UPCALL_EXPIRE; 1467 nexpire[hash]++; 1468 for (i = 0; i < numvifs; i++) { 1469 rt->mfc_ttls[i] = 0; 1470 rt->mfc_flags[i] = 0; 1471 } 1472 rt->mfc_parent = -1; 1473 1474 /* clear the RP address */ 1475 rt->mfc_rp = zeroin_addr; 1476 1477 rt->mfc_bw_meter = NULL; 1478 1479 /* link into table */ 1480 LIST_INSERT_HEAD(&mfchashtbl[hash], rt, mfc_hash); 1481 /* Add this entry to the end of the queue */ 1482 rt->mfc_stall = rte; 1483 } else { 1484 /* determine if q has overflowed */ 1485 struct rtdetq **p; 1486 int npkts = 0; 1487 1488 /* 1489 * XXX ouch! we need to append to the list, but we 1490 * only have a pointer to the front, so we have to 1491 * scan the entire list every time. 1492 */ 1493 for (p = &rt->mfc_stall; *p != NULL; p = &(*p)->next) 1494 if (++npkts > MAX_UPQ) { 1495 mrtstat.mrts_upq_ovflw++; 1496 non_fatal: 1497 free(rte, M_MRTABLE); 1498 m_freem(mb0); 1499 splx(s); 1500 return 0; 1501 } 1502 1503 /* Add this entry to the end of the queue */ 1504 *p = rte; 1505 } 1506 1507 rte->next = NULL; 1508 rte->m = mb0; 1509 rte->ifp = ifp; 1510 #ifdef UPCALL_TIMING 1511 rte->t = tp; 1512 #endif 1513 1514 splx(s); 1515 1516 return 0; 1517 } 1518 } 1519 1520 /*ARGSUSED*/ 1521 static void 1522 expire_upcalls(void *v) 1523 { 1524 int i; 1525 1526 /* XXX NOMPSAFE still need softnet_lock */ 1527 mutex_enter(softnet_lock); 1528 KERNEL_LOCK(1, NULL); 1529 1530 for (i = 0; i < MFCTBLSIZ; i++) { 1531 struct mfc *rt, *nrt; 1532 1533 if (nexpire[i] == 0) 1534 continue; 1535 1536 for (rt = LIST_FIRST(&mfchashtbl[i]); rt; rt = nrt) { 1537 nrt = LIST_NEXT(rt, mfc_hash); 1538 1539 if (rt->mfc_expire == 0 || --rt->mfc_expire > 0) 1540 continue; 1541 nexpire[i]--; 1542 1543 /* 1544 * free the bw_meter entries 1545 */ 1546 while (rt->mfc_bw_meter != NULL) { 1547 struct bw_meter *x = rt->mfc_bw_meter; 1548 1549 rt->mfc_bw_meter = x->bm_mfc_next; 1550 kmem_intr_free(x, sizeof(*x)); 1551 } 1552 1553 ++mrtstat.mrts_cache_cleanups; 1554 if (mrtdebug & DEBUG_EXPIRE) 1555 log(LOG_DEBUG, 1556 "expire_upcalls: expiring (%x %x)\n", 1557 ntohl(rt->mfc_origin.s_addr), 1558 ntohl(rt->mfc_mcastgrp.s_addr)); 1559 1560 expire_mfc(rt); 1561 } 1562 } 1563 1564 callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT, 1565 expire_upcalls, NULL); 1566 1567 KERNEL_UNLOCK_ONE(NULL); 1568 mutex_exit(softnet_lock); 1569 } 1570 1571 /* 1572 * Macro to send packet on vif. 1573 */ 1574 #define MC_SEND(ip, vifp, m) do { \ 1575 if ((vifp)->v_flags & VIFF_TUNNEL) \ 1576 encap_send((ip), (vifp), (m)); \ 1577 else \ 1578 phyint_send((ip), (vifp), (m)); \ 1579 } while (/*CONSTCOND*/ 0) 1580 1581 /* 1582 * Packet forwarding routine once entry in the cache is made 1583 */ 1584 static int 1585 ip_mdq(struct mbuf *m, struct ifnet *ifp, struct mfc *rt) 1586 { 1587 struct ip *ip = mtod(m, struct ip *); 1588 vifi_t vifi; 1589 struct vif *vifp; 1590 struct sockaddr_in sin; 1591 const int plen = ntohs(ip->ip_len) - (ip->ip_hl << 2); 1592 1593 /* 1594 * Don't forward if it didn't arrive from the parent vif for its origin. 1595 */ 1596 vifi = rt->mfc_parent; 1597 if ((vifi >= numvifs) || (viftable[vifi].v_ifp != ifp)) { 1598 /* came in the wrong interface */ 1599 if (mrtdebug & DEBUG_FORWARD) 1600 log(LOG_DEBUG, "wrong if: ifp %p vifi %d vififp %p\n", 1601 ifp, vifi, 1602 vifi >= numvifs ? 0 : viftable[vifi].v_ifp); 1603 ++mrtstat.mrts_wrong_if; 1604 ++rt->mfc_wrong_if; 1605 1606 /* 1607 * If we are doing PIM assert processing, send a message 1608 * to the routing daemon. 1609 * 1610 * XXX: A PIM-SM router needs the WRONGVIF detection so it 1611 * can complete the SPT switch, regardless of the type 1612 * of the iif (broadcast media, GRE tunnel, etc). 1613 */ 1614 if (pim_assert && (vifi < numvifs) && viftable[vifi].v_ifp) { 1615 struct timeval now; 1616 u_int32_t delta; 1617 1618 #ifdef PIM 1619 if (ifp == &multicast_register_if) 1620 pimstat.pims_rcv_registers_wrongiif++; 1621 #endif 1622 1623 /* Get vifi for the incoming packet */ 1624 for (vifi = 0; 1625 vifi < numvifs && viftable[vifi].v_ifp != ifp; 1626 vifi++) 1627 ; 1628 if (vifi >= numvifs) { 1629 /* The iif is not found: ignore the packet. */ 1630 return 0; 1631 } 1632 1633 if (rt->mfc_flags[vifi] & 1634 MRT_MFC_FLAGS_DISABLE_WRONGVIF) { 1635 /* WRONGVIF disabled: ignore the packet */ 1636 return 0; 1637 } 1638 1639 microtime(&now); 1640 1641 TV_DELTA(rt->mfc_last_assert, now, delta); 1642 1643 if (delta > ASSERT_MSG_TIME) { 1644 struct igmpmsg *im; 1645 const int hlen = ip->ip_hl << 2; 1646 struct mbuf *mm = 1647 m_copym(m, 0, hlen, M_DONTWAIT); 1648 1649 M_PULLUP(mm, hlen); 1650 if (mm == NULL) 1651 return ENOBUFS; 1652 1653 rt->mfc_last_assert = now; 1654 1655 im = mtod(mm, struct igmpmsg *); 1656 im->im_msgtype = IGMPMSG_WRONGVIF; 1657 im->im_mbz = 0; 1658 im->im_vif = vifi; 1659 1660 mrtstat.mrts_upcalls++; 1661 1662 sockaddr_in_init(&sin, &im->im_src, 0); 1663 if (socket_send(ip_mrouter, mm, &sin) < 0) { 1664 log(LOG_WARNING, 1665 "ip_mforward: ip_mrouter socket queue full\n"); 1666 ++mrtstat.mrts_upq_sockfull; 1667 return ENOBUFS; 1668 } 1669 } 1670 } 1671 return 0; 1672 } 1673 1674 /* If I sourced this packet, it counts as output, else it was input. */ 1675 if (in_hosteq(ip->ip_src, viftable[vifi].v_lcl_addr)) { 1676 viftable[vifi].v_pkt_out++; 1677 viftable[vifi].v_bytes_out += plen; 1678 } else { 1679 viftable[vifi].v_pkt_in++; 1680 viftable[vifi].v_bytes_in += plen; 1681 } 1682 rt->mfc_pkt_cnt++; 1683 rt->mfc_byte_cnt += plen; 1684 1685 /* 1686 * For each vif, decide if a copy of the packet should be forwarded. 1687 * Forward if: 1688 * - the ttl exceeds the vif's threshold 1689 * - there are group members downstream on interface 1690 */ 1691 for (vifp = viftable, vifi = 0; vifi < numvifs; vifp++, vifi++) { 1692 if ((rt->mfc_ttls[vifi] > 0) && 1693 (ip->ip_ttl > rt->mfc_ttls[vifi])) { 1694 vifp->v_pkt_out++; 1695 vifp->v_bytes_out += plen; 1696 #ifdef PIM 1697 if (vifp->v_flags & VIFF_REGISTER) 1698 pim_register_send(ip, vifp, m, rt); 1699 else 1700 #endif 1701 MC_SEND(ip, vifp, m); 1702 } 1703 } 1704 1705 /* 1706 * Perform upcall-related bw measuring. 1707 */ 1708 if (rt->mfc_bw_meter != NULL) { 1709 struct bw_meter *x; 1710 struct timeval now; 1711 1712 microtime(&now); 1713 for (x = rt->mfc_bw_meter; x != NULL; x = x->bm_mfc_next) 1714 bw_meter_receive_packet(x, plen, &now); 1715 } 1716 1717 return 0; 1718 } 1719 1720 static void 1721 phyint_send(struct ip *ip, struct vif *vifp, struct mbuf *m) 1722 { 1723 struct mbuf *mb_copy; 1724 const int hlen = ip->ip_hl << 2; 1725 1726 /* 1727 * Make a new reference to the packet; make sure that 1728 * the IP header is actually copied, not just referenced, 1729 * so that ip_output() only scribbles on the copy. 1730 */ 1731 mb_copy = m_copypacket(m, M_DONTWAIT); 1732 M_PULLUP(mb_copy, hlen); 1733 if (mb_copy == NULL) 1734 return; 1735 1736 if (vifp->v_rate_limit <= 0) 1737 tbf_send_packet(vifp, mb_copy); 1738 else 1739 tbf_control(vifp, mb_copy, mtod(mb_copy, struct ip *), 1740 ntohs(ip->ip_len)); 1741 } 1742 1743 static void 1744 encap_send(struct ip *ip, struct vif *vifp, struct mbuf *m) 1745 { 1746 struct mbuf *mb_copy; 1747 struct ip *ip_copy; 1748 int i, len = ntohs(ip->ip_len) + sizeof(multicast_encap_iphdr); 1749 1750 /* Take care of delayed checksums */ 1751 if (m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) { 1752 in_undefer_cksum_tcpudp(m); 1753 m->m_pkthdr.csum_flags &= ~(M_CSUM_TCPv4|M_CSUM_UDPv4); 1754 } 1755 1756 /* 1757 * copy the old packet & pullup its IP header into the 1758 * new mbuf so we can modify it. Try to fill the new 1759 * mbuf since if we don't the ethernet driver will. 1760 */ 1761 MGETHDR(mb_copy, M_DONTWAIT, MT_DATA); 1762 if (mb_copy == NULL) 1763 return; 1764 mb_copy->m_data += max_linkhdr; 1765 mb_copy->m_pkthdr.len = len; 1766 mb_copy->m_len = sizeof(multicast_encap_iphdr); 1767 1768 if ((mb_copy->m_next = m_copypacket(m, M_DONTWAIT)) == NULL) { 1769 m_freem(mb_copy); 1770 return; 1771 } 1772 i = MHLEN - max_linkhdr; 1773 if (i > len) 1774 i = len; 1775 mb_copy = m_pullup(mb_copy, i); 1776 if (mb_copy == NULL) 1777 return; 1778 1779 /* 1780 * fill in the encapsulating IP header. 1781 */ 1782 ip_copy = mtod(mb_copy, struct ip *); 1783 *ip_copy = multicast_encap_iphdr; 1784 if (len < IP_MINFRAGSIZE) 1785 ip_copy->ip_id = 0; 1786 else 1787 ip_copy->ip_id = ip_newid(NULL); 1788 ip_copy->ip_len = htons(len); 1789 ip_copy->ip_src = vifp->v_lcl_addr; 1790 ip_copy->ip_dst = vifp->v_rmt_addr; 1791 1792 /* 1793 * turn the encapsulated IP header back into a valid one. 1794 */ 1795 ip = (struct ip *)((char *)ip_copy + sizeof(multicast_encap_iphdr)); 1796 --ip->ip_ttl; 1797 ip->ip_sum = 0; 1798 mb_copy->m_data += sizeof(multicast_encap_iphdr); 1799 ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2); 1800 mb_copy->m_data -= sizeof(multicast_encap_iphdr); 1801 1802 if (vifp->v_rate_limit <= 0) 1803 tbf_send_packet(vifp, mb_copy); 1804 else 1805 tbf_control(vifp, mb_copy, ip, ntohs(ip_copy->ip_len)); 1806 } 1807 1808 /* 1809 * De-encapsulate a packet and feed it back through ip input. 1810 */ 1811 static void 1812 vif_input(struct mbuf *m, int off, int proto, void *eparg) 1813 { 1814 struct vif *vifp = eparg; 1815 1816 KASSERT(vifp != NULL); 1817 1818 if (proto != ENCAP_PROTO) { 1819 m_freem(m); 1820 mrtstat.mrts_bad_tunnel++; 1821 return; 1822 } 1823 1824 m_adj(m, off); 1825 m_set_rcvif(m, vifp->v_ifp); 1826 1827 if (__predict_false(!pktq_enqueue(ip_pktq, m, 0))) { 1828 m_freem(m); 1829 } 1830 } 1831 1832 /* 1833 * Check if the packet should be received on the vif denoted by arg. 1834 * (The encap selection code will call this once per vif since each is 1835 * registered separately.) 1836 */ 1837 static int 1838 vif_encapcheck(struct mbuf *m, int off, int proto, void *arg) 1839 { 1840 struct vif *vifp; 1841 struct ip ip; 1842 1843 #ifdef DIAGNOSTIC 1844 if (!arg || proto != IPPROTO_IPV4) 1845 panic("unexpected arg in vif_encapcheck"); 1846 #endif 1847 1848 /* 1849 * Accept the packet only if the inner heaader is multicast 1850 * and the outer header matches a tunnel-mode vif. Order 1851 * checks in the hope that common non-matching packets will be 1852 * rejected quickly. Assume that unicast IPv4 traffic in a 1853 * parallel tunnel (e.g. gif(4)) is unlikely. 1854 */ 1855 1856 /* Obtain the outer IP header and the vif pointer. */ 1857 m_copydata(m, 0, sizeof(ip), (void *)&ip); 1858 vifp = (struct vif *)arg; 1859 1860 /* 1861 * The outer source must match the vif's remote peer address. 1862 * For a multicast router with several tunnels, this is the 1863 * only check that will fail on packets in other tunnels, 1864 * assuming the local address is the same. 1865 */ 1866 if (!in_hosteq(vifp->v_rmt_addr, ip.ip_src)) 1867 return 0; 1868 1869 /* The outer destination must match the vif's local address. */ 1870 if (!in_hosteq(vifp->v_lcl_addr, ip.ip_dst)) 1871 return 0; 1872 1873 /* The vif must be of tunnel type. */ 1874 if ((vifp->v_flags & VIFF_TUNNEL) == 0) 1875 return 0; 1876 1877 /* Check that the inner destination is multicast. */ 1878 if (off + sizeof(ip) > m->m_pkthdr.len) 1879 return 0; 1880 m_copydata(m, off, sizeof(ip), (void *)&ip); 1881 if (!IN_MULTICAST(ip.ip_dst.s_addr)) 1882 return 0; 1883 1884 /* 1885 * We have checked that both the outer src and dst addresses 1886 * match the vif, and that the inner destination is multicast 1887 * (224/5). By claiming more than 64, we intend to 1888 * preferentially take packets that also match a parallel 1889 * gif(4). 1890 */ 1891 return 32 + 32 + 5; 1892 } 1893 1894 /* 1895 * Token bucket filter module 1896 */ 1897 static void 1898 tbf_control(struct vif *vifp, struct mbuf *m, struct ip *ip, u_int32_t len) 1899 { 1900 1901 if (len > MAX_BKT_SIZE) { 1902 /* drop if packet is too large */ 1903 mrtstat.mrts_pkt2large++; 1904 m_freem(m); 1905 return; 1906 } 1907 1908 tbf_update_tokens(vifp); 1909 1910 /* 1911 * If there are enough tokens, and the queue is empty, send this packet 1912 * out immediately. Otherwise, try to insert it on this vif's queue. 1913 */ 1914 if (vifp->tbf_q_len == 0) { 1915 if (len <= vifp->tbf_n_tok) { 1916 vifp->tbf_n_tok -= len; 1917 tbf_send_packet(vifp, m); 1918 } else { 1919 /* queue packet and timeout till later */ 1920 tbf_queue(vifp, m); 1921 callout_reset(&vifp->v_repq_ch, TBF_REPROCESS, 1922 tbf_reprocess_q, vifp); 1923 } 1924 } else { 1925 if (vifp->tbf_q_len >= vifp->tbf_max_q_len && 1926 !tbf_dq_sel(vifp, ip)) { 1927 /* queue full, and couldn't make room */ 1928 mrtstat.mrts_q_overflow++; 1929 m_freem(m); 1930 } else { 1931 /* queue length low enough, or made room */ 1932 tbf_queue(vifp, m); 1933 tbf_process_q(vifp); 1934 } 1935 } 1936 } 1937 1938 /* 1939 * adds a packet to the queue at the interface 1940 */ 1941 static void 1942 tbf_queue(struct vif *vifp, struct mbuf *m) 1943 { 1944 int s = splsoftnet(); 1945 1946 /* insert at tail */ 1947 *vifp->tbf_t = m; 1948 vifp->tbf_t = &m->m_nextpkt; 1949 vifp->tbf_q_len++; 1950 1951 splx(s); 1952 } 1953 1954 /* 1955 * processes the queue at the interface 1956 */ 1957 static void 1958 tbf_process_q(struct vif *vifp) 1959 { 1960 struct mbuf *m; 1961 int len; 1962 int s = splsoftnet(); 1963 1964 /* 1965 * Loop through the queue at the interface and send as many packets 1966 * as possible. 1967 */ 1968 for (m = vifp->tbf_q; m != NULL; m = vifp->tbf_q) { 1969 len = ntohs(mtod(m, struct ip *)->ip_len); 1970 1971 /* determine if the packet can be sent */ 1972 if (len <= vifp->tbf_n_tok) { 1973 /* if so, 1974 * reduce no of tokens, dequeue the packet, 1975 * send the packet. 1976 */ 1977 if ((vifp->tbf_q = m->m_nextpkt) == NULL) 1978 vifp->tbf_t = &vifp->tbf_q; 1979 --vifp->tbf_q_len; 1980 1981 m->m_nextpkt = NULL; 1982 vifp->tbf_n_tok -= len; 1983 tbf_send_packet(vifp, m); 1984 } else 1985 break; 1986 } 1987 splx(s); 1988 } 1989 1990 static void 1991 tbf_reprocess_q(void *arg) 1992 { 1993 struct vif *vifp = arg; 1994 1995 if (ip_mrouter == NULL) 1996 return; 1997 1998 tbf_update_tokens(vifp); 1999 tbf_process_q(vifp); 2000 2001 if (vifp->tbf_q_len != 0) 2002 callout_reset(&vifp->v_repq_ch, TBF_REPROCESS, 2003 tbf_reprocess_q, vifp); 2004 } 2005 2006 /* function that will selectively discard a member of the queue 2007 * based on the precedence value and the priority 2008 */ 2009 static int 2010 tbf_dq_sel(struct vif *vifp, struct ip *ip) 2011 { 2012 u_int p; 2013 struct mbuf **mp, *m; 2014 int s = splsoftnet(); 2015 2016 p = priority(vifp, ip); 2017 2018 for (mp = &vifp->tbf_q, m = *mp; 2019 m != NULL; 2020 mp = &m->m_nextpkt, m = *mp) { 2021 if (p > priority(vifp, mtod(m, struct ip *))) { 2022 if ((*mp = m->m_nextpkt) == NULL) 2023 vifp->tbf_t = mp; 2024 --vifp->tbf_q_len; 2025 2026 m_freem(m); 2027 mrtstat.mrts_drop_sel++; 2028 splx(s); 2029 return 1; 2030 } 2031 } 2032 splx(s); 2033 return 0; 2034 } 2035 2036 static void 2037 tbf_send_packet(struct vif *vifp, struct mbuf *m) 2038 { 2039 int error; 2040 int s = splsoftnet(); 2041 2042 if (vifp->v_flags & VIFF_TUNNEL) { 2043 /* If tunnel options */ 2044 ip_output(m, NULL, &vifp->v_route, IP_FORWARDING, NULL, NULL); 2045 } else { 2046 /* if physical interface option, extract the options and then send */ 2047 struct ip_moptions imo; 2048 2049 imo.imo_multicast_if_index = if_get_index(vifp->v_ifp); 2050 imo.imo_multicast_ttl = mtod(m, struct ip *)->ip_ttl - 1; 2051 imo.imo_multicast_loop = 1; 2052 2053 error = ip_output(m, NULL, NULL, IP_FORWARDING|IP_MULTICASTOPTS, 2054 &imo, NULL); 2055 2056 if (mrtdebug & DEBUG_XMIT) 2057 log(LOG_DEBUG, "phyint_send on vif %ld err %d\n", 2058 (long)(vifp - viftable), error); 2059 } 2060 splx(s); 2061 } 2062 2063 /* determine the current time and then 2064 * the elapsed time (between the last time and time now) 2065 * in milliseconds & update the no. of tokens in the bucket 2066 */ 2067 static void 2068 tbf_update_tokens(struct vif *vifp) 2069 { 2070 struct timeval tp; 2071 u_int32_t tm; 2072 int s = splsoftnet(); 2073 2074 microtime(&tp); 2075 2076 TV_DELTA(tp, vifp->tbf_last_pkt_t, tm); 2077 2078 /* 2079 * This formula is actually 2080 * "time in seconds" * "bytes/second". 2081 * 2082 * (tm / 1000000) * (v_rate_limit * 1000 * (1000/1024) / 8) 2083 * 2084 * The (1000/1024) was introduced in add_vif to optimize 2085 * this divide into a shift. 2086 */ 2087 vifp->tbf_n_tok += tm * vifp->v_rate_limit / 8192; 2088 vifp->tbf_last_pkt_t = tp; 2089 2090 if (vifp->tbf_n_tok > MAX_BKT_SIZE) 2091 vifp->tbf_n_tok = MAX_BKT_SIZE; 2092 2093 splx(s); 2094 } 2095 2096 static int 2097 priority(struct vif *vifp, struct ip *ip) 2098 { 2099 int prio = 50; /* the lowest priority -- default case */ 2100 2101 /* temporary hack; may add general packet classifier some day */ 2102 2103 /* 2104 * XXX XXX: We're reading the UDP header, but we didn't ensure 2105 * it was present in the packet. 2106 */ 2107 2108 /* 2109 * The UDP port space is divided up into four priority ranges: 2110 * [0, 16384) : unclassified - lowest priority 2111 * [16384, 32768) : audio - highest priority 2112 * [32768, 49152) : whiteboard - medium priority 2113 * [49152, 65536) : video - low priority 2114 */ 2115 if (ip->ip_p == IPPROTO_UDP) { 2116 struct udphdr *udp = (struct udphdr *)(((char *)ip) + (ip->ip_hl << 2)); 2117 2118 switch (ntohs(udp->uh_dport) & 0xc000) { 2119 case 0x4000: 2120 prio = 70; 2121 break; 2122 case 0x8000: 2123 prio = 60; 2124 break; 2125 case 0xc000: 2126 prio = 55; 2127 break; 2128 } 2129 2130 if (tbfdebug > 1) 2131 log(LOG_DEBUG, "port %x prio %d\n", 2132 ntohs(udp->uh_dport), prio); 2133 } 2134 2135 return prio; 2136 } 2137 2138 /* 2139 * Code for bandwidth monitors 2140 */ 2141 2142 /* 2143 * Define common interface for timeval-related methods 2144 */ 2145 #define BW_TIMEVALCMP(tvp, uvp, cmp) timercmp((tvp), (uvp), cmp) 2146 #define BW_TIMEVALDECR(vvp, uvp) timersub((vvp), (uvp), (vvp)) 2147 #define BW_TIMEVALADD(vvp, uvp) timeradd((vvp), (uvp), (vvp)) 2148 2149 static uint32_t 2150 compute_bw_meter_flags(struct bw_upcall *req) 2151 { 2152 uint32_t flags = 0; 2153 2154 if (req->bu_flags & BW_UPCALL_UNIT_PACKETS) 2155 flags |= BW_METER_UNIT_PACKETS; 2156 if (req->bu_flags & BW_UPCALL_UNIT_BYTES) 2157 flags |= BW_METER_UNIT_BYTES; 2158 if (req->bu_flags & BW_UPCALL_GEQ) 2159 flags |= BW_METER_GEQ; 2160 if (req->bu_flags & BW_UPCALL_LEQ) 2161 flags |= BW_METER_LEQ; 2162 2163 return flags; 2164 } 2165 2166 /* 2167 * Add a bw_meter entry 2168 */ 2169 static int 2170 add_bw_upcall(struct bw_upcall *req) 2171 { 2172 int s; 2173 struct mfc *mfc; 2174 struct timeval delta = { BW_UPCALL_THRESHOLD_INTERVAL_MIN_SEC, 2175 BW_UPCALL_THRESHOLD_INTERVAL_MIN_USEC }; 2176 struct timeval now; 2177 struct bw_meter *x; 2178 uint32_t flags; 2179 2180 if (!(mrt_api_config & MRT_MFC_BW_UPCALL)) 2181 return EOPNOTSUPP; 2182 2183 /* Test if the flags are valid */ 2184 if (!(req->bu_flags & (BW_UPCALL_UNIT_PACKETS | BW_UPCALL_UNIT_BYTES))) 2185 return EINVAL; 2186 if (!(req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ))) 2187 return EINVAL; 2188 if ((req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ)) 2189 == (BW_UPCALL_GEQ | BW_UPCALL_LEQ)) 2190 return EINVAL; 2191 2192 /* Test if the threshold time interval is valid */ 2193 if (BW_TIMEVALCMP(&req->bu_threshold.b_time, &delta, <)) 2194 return EINVAL; 2195 2196 flags = compute_bw_meter_flags(req); 2197 2198 /* 2199 * Find if we have already same bw_meter entry 2200 */ 2201 s = splsoftnet(); 2202 mfc = mfc_find(&req->bu_src, &req->bu_dst); 2203 if (mfc == NULL) { 2204 splx(s); 2205 return EADDRNOTAVAIL; 2206 } 2207 for (x = mfc->mfc_bw_meter; x != NULL; x = x->bm_mfc_next) { 2208 if ((BW_TIMEVALCMP(&x->bm_threshold.b_time, 2209 &req->bu_threshold.b_time, ==)) && 2210 (x->bm_threshold.b_packets == req->bu_threshold.b_packets) && 2211 (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) && 2212 (x->bm_flags & BW_METER_USER_FLAGS) == flags) { 2213 splx(s); 2214 return 0; /* XXX Already installed */ 2215 } 2216 } 2217 2218 /* Allocate the new bw_meter entry */ 2219 x = kmem_intr_alloc(sizeof(*x), KM_NOSLEEP); 2220 if (x == NULL) { 2221 splx(s); 2222 return ENOBUFS; 2223 } 2224 2225 /* Set the new bw_meter entry */ 2226 x->bm_threshold.b_time = req->bu_threshold.b_time; 2227 microtime(&now); 2228 x->bm_start_time = now; 2229 x->bm_threshold.b_packets = req->bu_threshold.b_packets; 2230 x->bm_threshold.b_bytes = req->bu_threshold.b_bytes; 2231 x->bm_measured.b_packets = 0; 2232 x->bm_measured.b_bytes = 0; 2233 x->bm_flags = flags; 2234 x->bm_time_next = NULL; 2235 x->bm_time_hash = BW_METER_BUCKETS; 2236 2237 /* Add the new bw_meter entry to the front of entries for this MFC */ 2238 x->bm_mfc = mfc; 2239 x->bm_mfc_next = mfc->mfc_bw_meter; 2240 mfc->mfc_bw_meter = x; 2241 schedule_bw_meter(x, &now); 2242 splx(s); 2243 2244 return 0; 2245 } 2246 2247 static void 2248 free_bw_list(struct bw_meter *list) 2249 { 2250 while (list != NULL) { 2251 struct bw_meter *x = list; 2252 2253 list = list->bm_mfc_next; 2254 unschedule_bw_meter(x); 2255 kmem_intr_free(x, sizeof(*x)); 2256 } 2257 } 2258 2259 /* 2260 * Delete one or multiple bw_meter entries 2261 */ 2262 static int 2263 del_bw_upcall(struct bw_upcall *req) 2264 { 2265 int s; 2266 struct mfc *mfc; 2267 struct bw_meter *x; 2268 2269 if (!(mrt_api_config & MRT_MFC_BW_UPCALL)) 2270 return EOPNOTSUPP; 2271 2272 s = splsoftnet(); 2273 /* Find the corresponding MFC entry */ 2274 mfc = mfc_find(&req->bu_src, &req->bu_dst); 2275 if (mfc == NULL) { 2276 splx(s); 2277 return EADDRNOTAVAIL; 2278 } else if (req->bu_flags & BW_UPCALL_DELETE_ALL) { 2279 /* 2280 * Delete all bw_meter entries for this mfc 2281 */ 2282 struct bw_meter *list; 2283 2284 list = mfc->mfc_bw_meter; 2285 mfc->mfc_bw_meter = NULL; 2286 free_bw_list(list); 2287 splx(s); 2288 return 0; 2289 } else { /* Delete a single bw_meter entry */ 2290 struct bw_meter *prev; 2291 uint32_t flags = 0; 2292 2293 flags = compute_bw_meter_flags(req); 2294 2295 /* Find the bw_meter entry to delete */ 2296 for (prev = NULL, x = mfc->mfc_bw_meter; x != NULL; 2297 prev = x, x = x->bm_mfc_next) { 2298 if ((BW_TIMEVALCMP(&x->bm_threshold.b_time, 2299 &req->bu_threshold.b_time, ==)) && 2300 (x->bm_threshold.b_packets == req->bu_threshold.b_packets) && 2301 (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) && 2302 (x->bm_flags & BW_METER_USER_FLAGS) == flags) 2303 break; 2304 } 2305 if (x != NULL) { /* Delete entry from the list for this MFC */ 2306 if (prev != NULL) 2307 prev->bm_mfc_next = x->bm_mfc_next; /* remove from middle*/ 2308 else 2309 x->bm_mfc->mfc_bw_meter = x->bm_mfc_next;/* new head of list */ 2310 2311 unschedule_bw_meter(x); 2312 splx(s); 2313 /* Free the bw_meter entry */ 2314 kmem_intr_free(x, sizeof(*x)); 2315 return 0; 2316 } else { 2317 splx(s); 2318 return EINVAL; 2319 } 2320 } 2321 /* NOTREACHED */ 2322 } 2323 2324 /* 2325 * Perform bandwidth measurement processing that may result in an upcall 2326 */ 2327 static void 2328 bw_meter_receive_packet(struct bw_meter *x, int plen, struct timeval *nowp) 2329 { 2330 struct timeval delta; 2331 2332 delta = *nowp; 2333 BW_TIMEVALDECR(&delta, &x->bm_start_time); 2334 2335 if (x->bm_flags & BW_METER_GEQ) { 2336 /* 2337 * Processing for ">=" type of bw_meter entry 2338 */ 2339 if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) { 2340 /* Reset the bw_meter entry */ 2341 x->bm_start_time = *nowp; 2342 x->bm_measured.b_packets = 0; 2343 x->bm_measured.b_bytes = 0; 2344 x->bm_flags &= ~BW_METER_UPCALL_DELIVERED; 2345 } 2346 2347 /* Record that a packet is received */ 2348 x->bm_measured.b_packets++; 2349 x->bm_measured.b_bytes += plen; 2350 2351 /* 2352 * Test if we should deliver an upcall 2353 */ 2354 if (!(x->bm_flags & BW_METER_UPCALL_DELIVERED)) { 2355 if (((x->bm_flags & BW_METER_UNIT_PACKETS) && 2356 (x->bm_measured.b_packets >= x->bm_threshold.b_packets)) || 2357 ((x->bm_flags & BW_METER_UNIT_BYTES) && 2358 (x->bm_measured.b_bytes >= x->bm_threshold.b_bytes))) { 2359 /* Prepare an upcall for delivery */ 2360 bw_meter_prepare_upcall(x, nowp); 2361 x->bm_flags |= BW_METER_UPCALL_DELIVERED; 2362 } 2363 } 2364 } else if (x->bm_flags & BW_METER_LEQ) { 2365 /* 2366 * Processing for "<=" type of bw_meter entry 2367 */ 2368 if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) { 2369 /* 2370 * We are behind time with the multicast forwarding table 2371 * scanning for "<=" type of bw_meter entries, so test now 2372 * if we should deliver an upcall. 2373 */ 2374 if (((x->bm_flags & BW_METER_UNIT_PACKETS) && 2375 (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) || 2376 ((x->bm_flags & BW_METER_UNIT_BYTES) && 2377 (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) { 2378 /* Prepare an upcall for delivery */ 2379 bw_meter_prepare_upcall(x, nowp); 2380 } 2381 /* Reschedule the bw_meter entry */ 2382 unschedule_bw_meter(x); 2383 schedule_bw_meter(x, nowp); 2384 } 2385 2386 /* Record that a packet is received */ 2387 x->bm_measured.b_packets++; 2388 x->bm_measured.b_bytes += plen; 2389 2390 /* 2391 * Test if we should restart the measuring interval 2392 */ 2393 if ((x->bm_flags & BW_METER_UNIT_PACKETS && 2394 x->bm_measured.b_packets <= x->bm_threshold.b_packets) || 2395 (x->bm_flags & BW_METER_UNIT_BYTES && 2396 x->bm_measured.b_bytes <= x->bm_threshold.b_bytes)) { 2397 /* Don't restart the measuring interval */ 2398 } else { 2399 /* Do restart the measuring interval */ 2400 /* 2401 * XXX: note that we don't unschedule and schedule, because this 2402 * might be too much overhead per packet. Instead, when we process 2403 * all entries for a given timer hash bin, we check whether it is 2404 * really a timeout. If not, we reschedule at that time. 2405 */ 2406 x->bm_start_time = *nowp; 2407 x->bm_measured.b_packets = 0; 2408 x->bm_measured.b_bytes = 0; 2409 x->bm_flags &= ~BW_METER_UPCALL_DELIVERED; 2410 } 2411 } 2412 } 2413 2414 /* 2415 * Prepare a bandwidth-related upcall 2416 */ 2417 static void 2418 bw_meter_prepare_upcall(struct bw_meter *x, struct timeval *nowp) 2419 { 2420 struct timeval delta; 2421 struct bw_upcall *u; 2422 2423 /* 2424 * Compute the measured time interval 2425 */ 2426 delta = *nowp; 2427 BW_TIMEVALDECR(&delta, &x->bm_start_time); 2428 2429 /* 2430 * If there are too many pending upcalls, deliver them now 2431 */ 2432 if (bw_upcalls_n >= BW_UPCALLS_MAX) 2433 bw_upcalls_send(); 2434 2435 /* 2436 * Set the bw_upcall entry 2437 */ 2438 u = &bw_upcalls[bw_upcalls_n++]; 2439 u->bu_src = x->bm_mfc->mfc_origin; 2440 u->bu_dst = x->bm_mfc->mfc_mcastgrp; 2441 u->bu_threshold.b_time = x->bm_threshold.b_time; 2442 u->bu_threshold.b_packets = x->bm_threshold.b_packets; 2443 u->bu_threshold.b_bytes = x->bm_threshold.b_bytes; 2444 u->bu_measured.b_time = delta; 2445 u->bu_measured.b_packets = x->bm_measured.b_packets; 2446 u->bu_measured.b_bytes = x->bm_measured.b_bytes; 2447 u->bu_flags = 0; 2448 if (x->bm_flags & BW_METER_UNIT_PACKETS) 2449 u->bu_flags |= BW_UPCALL_UNIT_PACKETS; 2450 if (x->bm_flags & BW_METER_UNIT_BYTES) 2451 u->bu_flags |= BW_UPCALL_UNIT_BYTES; 2452 if (x->bm_flags & BW_METER_GEQ) 2453 u->bu_flags |= BW_UPCALL_GEQ; 2454 if (x->bm_flags & BW_METER_LEQ) 2455 u->bu_flags |= BW_UPCALL_LEQ; 2456 } 2457 2458 /* 2459 * Send the pending bandwidth-related upcalls 2460 */ 2461 static void 2462 bw_upcalls_send(void) 2463 { 2464 struct mbuf *m; 2465 int len = bw_upcalls_n * sizeof(bw_upcalls[0]); 2466 struct sockaddr_in k_igmpsrc = { 2467 .sin_len = sizeof(k_igmpsrc), 2468 .sin_family = AF_INET, 2469 }; 2470 static struct igmpmsg igmpmsg = { 2471 0, /* unused1 */ 2472 0, /* unused2 */ 2473 IGMPMSG_BW_UPCALL,/* im_msgtype */ 2474 0, /* im_mbz */ 2475 0, /* im_vif */ 2476 0, /* unused3 */ 2477 { 0 }, /* im_src */ 2478 { 0 } /* im_dst */ 2479 }; 2480 2481 if (bw_upcalls_n == 0) 2482 return; /* No pending upcalls */ 2483 2484 bw_upcalls_n = 0; 2485 2486 /* 2487 * Allocate a new mbuf, initialize it with the header and 2488 * the payload for the pending calls. 2489 */ 2490 MGETHDR(m, M_DONTWAIT, MT_HEADER); 2491 if (m == NULL) { 2492 log(LOG_WARNING, "bw_upcalls_send: cannot allocate mbuf\n"); 2493 return; 2494 } 2495 2496 m->m_len = m->m_pkthdr.len = 0; 2497 m_copyback(m, 0, sizeof(struct igmpmsg), (void *)&igmpmsg); 2498 m_copyback(m, sizeof(struct igmpmsg), len, (void *)&bw_upcalls[0]); 2499 2500 /* 2501 * Send the upcalls 2502 * XXX do we need to set the address in k_igmpsrc ? 2503 */ 2504 mrtstat.mrts_upcalls++; 2505 if (socket_send(ip_mrouter, m, &k_igmpsrc) < 0) { 2506 log(LOG_WARNING, "bw_upcalls_send: ip_mrouter socket queue full\n"); 2507 ++mrtstat.mrts_upq_sockfull; 2508 } 2509 } 2510 2511 /* 2512 * Compute the timeout hash value for the bw_meter entries 2513 */ 2514 #define BW_METER_TIMEHASH(bw_meter, hash) \ 2515 do { \ 2516 struct timeval next_timeval = (bw_meter)->bm_start_time; \ 2517 BW_TIMEVALADD(&next_timeval, &(bw_meter)->bm_threshold.b_time); \ 2518 (hash) = next_timeval.tv_sec; \ 2519 if (next_timeval.tv_usec) \ 2520 (hash)++; /* XXX: make sure we don't timeout early */ \ 2521 (hash) %= BW_METER_BUCKETS; \ 2522 } while (/*CONSTCOND*/ 0) 2523 2524 /* 2525 * Schedule a timer to process periodically bw_meter entry of type "<=" 2526 * by linking the entry in the proper hash bucket. 2527 */ 2528 static void 2529 schedule_bw_meter(struct bw_meter *x, struct timeval *nowp) 2530 { 2531 int time_hash; 2532 2533 if (!(x->bm_flags & BW_METER_LEQ)) 2534 return; /* XXX: we schedule timers only for "<=" entries */ 2535 2536 /* 2537 * Reset the bw_meter entry 2538 */ 2539 x->bm_start_time = *nowp; 2540 x->bm_measured.b_packets = 0; 2541 x->bm_measured.b_bytes = 0; 2542 x->bm_flags &= ~BW_METER_UPCALL_DELIVERED; 2543 2544 /* 2545 * Compute the timeout hash value and insert the entry 2546 */ 2547 BW_METER_TIMEHASH(x, time_hash); 2548 x->bm_time_next = bw_meter_timers[time_hash]; 2549 bw_meter_timers[time_hash] = x; 2550 x->bm_time_hash = time_hash; 2551 } 2552 2553 /* 2554 * Unschedule the periodic timer that processes bw_meter entry of type "<=" 2555 * by removing the entry from the proper hash bucket. 2556 */ 2557 static void 2558 unschedule_bw_meter(struct bw_meter *x) 2559 { 2560 int time_hash; 2561 struct bw_meter *prev, *tmp; 2562 2563 if (!(x->bm_flags & BW_METER_LEQ)) 2564 return; /* XXX: we schedule timers only for "<=" entries */ 2565 2566 /* 2567 * Compute the timeout hash value and delete the entry 2568 */ 2569 time_hash = x->bm_time_hash; 2570 if (time_hash >= BW_METER_BUCKETS) 2571 return; /* Entry was not scheduled */ 2572 2573 for (prev = NULL, tmp = bw_meter_timers[time_hash]; 2574 tmp != NULL; prev = tmp, tmp = tmp->bm_time_next) 2575 if (tmp == x) 2576 break; 2577 2578 if (tmp == NULL) 2579 panic("unschedule_bw_meter: bw_meter entry not found"); 2580 2581 if (prev != NULL) 2582 prev->bm_time_next = x->bm_time_next; 2583 else 2584 bw_meter_timers[time_hash] = x->bm_time_next; 2585 2586 x->bm_time_next = NULL; 2587 x->bm_time_hash = BW_METER_BUCKETS; 2588 } 2589 2590 /* 2591 * Process all "<=" type of bw_meter that should be processed now, 2592 * and for each entry prepare an upcall if necessary. Each processed 2593 * entry is rescheduled again for the (periodic) processing. 2594 * 2595 * This is run periodically (once per second normally). On each round, 2596 * all the potentially matching entries are in the hash slot that we are 2597 * looking at. 2598 */ 2599 static void 2600 bw_meter_process(void) 2601 { 2602 int s; 2603 static uint32_t last_tv_sec; /* last time we processed this */ 2604 2605 uint32_t loops; 2606 int i; 2607 struct timeval now, process_endtime; 2608 2609 microtime(&now); 2610 if (last_tv_sec == now.tv_sec) 2611 return; /* nothing to do */ 2612 2613 loops = now.tv_sec - last_tv_sec; 2614 last_tv_sec = now.tv_sec; 2615 if (loops > BW_METER_BUCKETS) 2616 loops = BW_METER_BUCKETS; 2617 2618 s = splsoftnet(); 2619 /* 2620 * Process all bins of bw_meter entries from the one after the last 2621 * processed to the current one. On entry, i points to the last bucket 2622 * visited, so we need to increment i at the beginning of the loop. 2623 */ 2624 for (i = (now.tv_sec - loops) % BW_METER_BUCKETS; loops > 0; loops--) { 2625 struct bw_meter *x, *tmp_list; 2626 2627 if (++i >= BW_METER_BUCKETS) 2628 i = 0; 2629 2630 /* Disconnect the list of bw_meter entries from the bin */ 2631 tmp_list = bw_meter_timers[i]; 2632 bw_meter_timers[i] = NULL; 2633 2634 /* Process the list of bw_meter entries */ 2635 while (tmp_list != NULL) { 2636 x = tmp_list; 2637 tmp_list = tmp_list->bm_time_next; 2638 2639 /* Test if the time interval is over */ 2640 process_endtime = x->bm_start_time; 2641 BW_TIMEVALADD(&process_endtime, &x->bm_threshold.b_time); 2642 if (BW_TIMEVALCMP(&process_endtime, &now, >)) { 2643 /* Not yet: reschedule, but don't reset */ 2644 int time_hash; 2645 2646 BW_METER_TIMEHASH(x, time_hash); 2647 if (time_hash == i && process_endtime.tv_sec == now.tv_sec) { 2648 /* 2649 * XXX: somehow the bin processing is a bit ahead of time. 2650 * Put the entry in the next bin. 2651 */ 2652 if (++time_hash >= BW_METER_BUCKETS) 2653 time_hash = 0; 2654 } 2655 x->bm_time_next = bw_meter_timers[time_hash]; 2656 bw_meter_timers[time_hash] = x; 2657 x->bm_time_hash = time_hash; 2658 2659 continue; 2660 } 2661 2662 /* 2663 * Test if we should deliver an upcall 2664 */ 2665 if (((x->bm_flags & BW_METER_UNIT_PACKETS) && 2666 (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) || 2667 ((x->bm_flags & BW_METER_UNIT_BYTES) && 2668 (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) { 2669 /* Prepare an upcall for delivery */ 2670 bw_meter_prepare_upcall(x, &now); 2671 } 2672 2673 /* 2674 * Reschedule for next processing 2675 */ 2676 schedule_bw_meter(x, &now); 2677 } 2678 } 2679 2680 /* Send all upcalls that are pending delivery */ 2681 bw_upcalls_send(); 2682 2683 splx(s); 2684 } 2685 2686 /* 2687 * A periodic function for sending all upcalls that are pending delivery 2688 */ 2689 static void 2690 expire_bw_upcalls_send(void *unused) 2691 { 2692 int s; 2693 2694 s = splsoftnet(); 2695 bw_upcalls_send(); 2696 splx(s); 2697 2698 callout_reset(&bw_upcalls_ch, BW_UPCALLS_PERIOD, 2699 expire_bw_upcalls_send, NULL); 2700 } 2701 2702 /* 2703 * A periodic function for periodic scanning of the multicast forwarding 2704 * table for processing all "<=" bw_meter entries. 2705 */ 2706 static void 2707 expire_bw_meter_process(void *unused) 2708 { 2709 if (mrt_api_config & MRT_MFC_BW_UPCALL) 2710 bw_meter_process(); 2711 2712 callout_reset(&bw_meter_ch, BW_METER_PERIOD, 2713 expire_bw_meter_process, NULL); 2714 } 2715 2716 /* 2717 * End of bandwidth monitoring code 2718 */ 2719 2720 #ifdef PIM 2721 /* 2722 * Send the packet up to the user daemon, or eventually do kernel encapsulation 2723 */ 2724 static int 2725 pim_register_send(struct ip *ip, struct vif *vifp, struct mbuf *m, 2726 struct mfc *rt) 2727 { 2728 struct mbuf *mb_copy, *mm; 2729 2730 if (mrtdebug & DEBUG_PIM) 2731 log(LOG_DEBUG, "pim_register_send: \n"); 2732 2733 mb_copy = pim_register_prepare(ip, m); 2734 if (mb_copy == NULL) 2735 return ENOBUFS; 2736 2737 /* 2738 * Send all the fragments. Note that the mbuf for each fragment 2739 * is freed by the sending machinery. 2740 */ 2741 for (mm = mb_copy; mm; mm = mb_copy) { 2742 mb_copy = mm->m_nextpkt; 2743 mm->m_nextpkt = NULL; 2744 mm = m_pullup(mm, sizeof(struct ip)); 2745 if (mm != NULL) { 2746 ip = mtod(mm, struct ip *); 2747 if ((mrt_api_config & MRT_MFC_RP) && 2748 !in_nullhost(rt->mfc_rp)) { 2749 pim_register_send_rp(ip, vifp, mm, rt); 2750 } else { 2751 pim_register_send_upcall(ip, vifp, mm, rt); 2752 } 2753 } 2754 } 2755 2756 return 0; 2757 } 2758 2759 /* 2760 * Return a copy of the data packet that is ready for PIM Register 2761 * encapsulation. 2762 * XXX: Note that in the returned copy the IP header is a valid one. 2763 */ 2764 static struct mbuf * 2765 pim_register_prepare(struct ip *ip, struct mbuf *m) 2766 { 2767 struct mbuf *mb_copy = NULL; 2768 int mtu; 2769 2770 /* Take care of delayed checksums */ 2771 if (m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) { 2772 in_undefer_cksum_tcpudp(m); 2773 m->m_pkthdr.csum_flags &= ~(M_CSUM_TCPv4|M_CSUM_UDPv4); 2774 } 2775 2776 /* 2777 * Copy the old packet & pullup its IP header into the 2778 * new mbuf so we can modify it. 2779 */ 2780 mb_copy = m_copypacket(m, M_DONTWAIT); 2781 if (mb_copy == NULL) 2782 return NULL; 2783 mb_copy = m_pullup(mb_copy, ip->ip_hl << 2); 2784 if (mb_copy == NULL) 2785 return NULL; 2786 2787 /* take care of the TTL */ 2788 ip = mtod(mb_copy, struct ip *); 2789 --ip->ip_ttl; 2790 2791 /* Compute the MTU after the PIM Register encapsulation */ 2792 mtu = 0xffff - sizeof(pim_encap_iphdr) - sizeof(pim_encap_pimhdr); 2793 2794 if (ntohs(ip->ip_len) <= mtu) { 2795 /* Turn the IP header into a valid one */ 2796 ip->ip_sum = 0; 2797 ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2); 2798 } else { 2799 /* Fragment the packet */ 2800 if (ip_fragment(mb_copy, NULL, mtu) != 0) { 2801 /* XXX: mb_copy was freed by ip_fragment() */ 2802 return NULL; 2803 } 2804 } 2805 return mb_copy; 2806 } 2807 2808 /* 2809 * Send an upcall with the data packet to the user-level process. 2810 */ 2811 static int 2812 pim_register_send_upcall(struct ip *ip, struct vif *vifp, 2813 struct mbuf *mb_copy, struct mfc *rt) 2814 { 2815 struct mbuf *mb_first; 2816 int len = ntohs(ip->ip_len); 2817 struct igmpmsg *im; 2818 struct sockaddr_in k_igmpsrc = { 2819 .sin_len = sizeof(k_igmpsrc), 2820 .sin_family = AF_INET, 2821 }; 2822 2823 /* 2824 * Add a new mbuf with an upcall header 2825 */ 2826 MGETHDR(mb_first, M_DONTWAIT, MT_HEADER); 2827 if (mb_first == NULL) { 2828 m_freem(mb_copy); 2829 return ENOBUFS; 2830 } 2831 mb_first->m_data += max_linkhdr; 2832 mb_first->m_pkthdr.len = len + sizeof(struct igmpmsg); 2833 mb_first->m_len = sizeof(struct igmpmsg); 2834 mb_first->m_next = mb_copy; 2835 2836 /* Send message to routing daemon */ 2837 im = mtod(mb_first, struct igmpmsg *); 2838 im->im_msgtype = IGMPMSG_WHOLEPKT; 2839 im->im_mbz = 0; 2840 im->im_vif = vifp - viftable; 2841 im->im_src = ip->ip_src; 2842 im->im_dst = ip->ip_dst; 2843 2844 k_igmpsrc.sin_addr = ip->ip_src; 2845 2846 mrtstat.mrts_upcalls++; 2847 2848 if (socket_send(ip_mrouter, mb_first, &k_igmpsrc) < 0) { 2849 if (mrtdebug & DEBUG_PIM) 2850 log(LOG_WARNING, 2851 "mcast: pim_register_send_upcall: ip_mrouter socket queue full\n"); 2852 ++mrtstat.mrts_upq_sockfull; 2853 return ENOBUFS; 2854 } 2855 2856 /* Keep statistics */ 2857 pimstat.pims_snd_registers_msgs++; 2858 pimstat.pims_snd_registers_bytes += len; 2859 2860 return 0; 2861 } 2862 2863 /* 2864 * Encapsulate the data packet in PIM Register message and send it to the RP. 2865 */ 2866 static int 2867 pim_register_send_rp(struct ip *ip, struct vif *vifp, 2868 struct mbuf *mb_copy, struct mfc *rt) 2869 { 2870 struct mbuf *mb_first; 2871 struct ip *ip_outer; 2872 struct pim_encap_pimhdr *pimhdr; 2873 int len = ntohs(ip->ip_len); 2874 vifi_t vifi = rt->mfc_parent; 2875 2876 if ((vifi >= numvifs) || in_nullhost(viftable[vifi].v_lcl_addr)) { 2877 m_freem(mb_copy); 2878 return EADDRNOTAVAIL; /* The iif vif is invalid */ 2879 } 2880 2881 /* 2882 * Add a new mbuf with the encapsulating header 2883 */ 2884 MGETHDR(mb_first, M_DONTWAIT, MT_HEADER); 2885 if (mb_first == NULL) { 2886 m_freem(mb_copy); 2887 return ENOBUFS; 2888 } 2889 mb_first->m_data += max_linkhdr; 2890 mb_first->m_len = sizeof(pim_encap_iphdr) + sizeof(pim_encap_pimhdr); 2891 mb_first->m_next = mb_copy; 2892 2893 mb_first->m_pkthdr.len = len + mb_first->m_len; 2894 2895 /* 2896 * Fill in the encapsulating IP and PIM header 2897 */ 2898 ip_outer = mtod(mb_first, struct ip *); 2899 *ip_outer = pim_encap_iphdr; 2900 if (mb_first->m_pkthdr.len < IP_MINFRAGSIZE) 2901 ip_outer->ip_id = 0; 2902 else 2903 ip_outer->ip_id = ip_newid(NULL); 2904 ip_outer->ip_len = htons(len + sizeof(pim_encap_iphdr) + 2905 sizeof(pim_encap_pimhdr)); 2906 ip_outer->ip_src = viftable[vifi].v_lcl_addr; 2907 ip_outer->ip_dst = rt->mfc_rp; 2908 /* 2909 * Copy the inner header TOS to the outer header, and take care of the 2910 * IP_DF bit. 2911 */ 2912 ip_outer->ip_tos = ip->ip_tos; 2913 if (ntohs(ip->ip_off) & IP_DF) 2914 ip_outer->ip_off |= htons(IP_DF); 2915 pimhdr = (struct pim_encap_pimhdr *)((char *)ip_outer 2916 + sizeof(pim_encap_iphdr)); 2917 *pimhdr = pim_encap_pimhdr; 2918 /* If the iif crosses a border, set the Border-bit */ 2919 if (rt->mfc_flags[vifi] & MRT_MFC_FLAGS_BORDER_VIF & mrt_api_config) 2920 pimhdr->flags |= htonl(PIM_BORDER_REGISTER); 2921 2922 mb_first->m_data += sizeof(pim_encap_iphdr); 2923 pimhdr->pim.pim_cksum = in_cksum(mb_first, sizeof(pim_encap_pimhdr)); 2924 mb_first->m_data -= sizeof(pim_encap_iphdr); 2925 2926 if (vifp->v_rate_limit == 0) 2927 tbf_send_packet(vifp, mb_first); 2928 else 2929 tbf_control(vifp, mb_first, ip, ntohs(ip_outer->ip_len)); 2930 2931 /* Keep statistics */ 2932 pimstat.pims_snd_registers_msgs++; 2933 pimstat.pims_snd_registers_bytes += len; 2934 2935 return 0; 2936 } 2937 2938 /* 2939 * PIM-SMv2 and PIM-DM messages processing. 2940 * Receives and verifies the PIM control messages, and passes them 2941 * up to the listening socket, using rip_input(). 2942 * The only message with special processing is the PIM_REGISTER message 2943 * (used by PIM-SM): the PIM header is stripped off, and the inner packet 2944 * is passed to if_simloop(). 2945 */ 2946 void 2947 pim_input(struct mbuf *m, int off, int proto) 2948 { 2949 struct ip *ip = mtod(m, struct ip *); 2950 struct pim *pim; 2951 int minlen; 2952 int datalen; 2953 int ip_tos; 2954 int iphlen; 2955 2956 iphlen = off; 2957 datalen = ntohs(ip->ip_len) - iphlen; 2958 2959 /* Keep statistics */ 2960 pimstat.pims_rcv_total_msgs++; 2961 pimstat.pims_rcv_total_bytes += datalen; 2962 2963 /* 2964 * Validate lengths 2965 */ 2966 if (datalen < PIM_MINLEN) { 2967 pimstat.pims_rcv_tooshort++; 2968 log(LOG_ERR, "pim_input: packet size too small %d from %lx\n", 2969 datalen, (u_long)ip->ip_src.s_addr); 2970 m_freem(m); 2971 return; 2972 } 2973 2974 /* 2975 * If the packet is at least as big as a REGISTER, go ahead 2976 * and grab the PIM REGISTER header size, to avoid another 2977 * possible m_pullup() later. 2978 * 2979 * PIM_MINLEN == pimhdr + u_int32_t == 4 + 4 = 8 2980 * PIM_REG_MINLEN == pimhdr + reghdr + encap_iphdr == 4 + 4 + 20 = 28 2981 */ 2982 minlen = iphlen + (datalen >= PIM_REG_MINLEN ? PIM_REG_MINLEN : PIM_MINLEN); 2983 2984 /* 2985 * Get the IP and PIM headers in contiguous memory, and 2986 * possibly the PIM REGISTER header. 2987 */ 2988 if ((m->m_flags & M_EXT || m->m_len < minlen) && 2989 (m = m_pullup(m, minlen)) == NULL) { 2990 log(LOG_ERR, "pim_input: m_pullup failure\n"); 2991 return; 2992 } 2993 ip = mtod(m, struct ip *); 2994 ip_tos = ip->ip_tos; 2995 2996 /* adjust mbuf to point to the PIM header */ 2997 m->m_data += iphlen; 2998 m->m_len -= iphlen; 2999 pim = mtod(m, struct pim *); 3000 3001 /* 3002 * Validate checksum. If PIM REGISTER, exclude the data packet. 3003 * 3004 * XXX: some older PIMv2 implementations don't make this distinction, 3005 * so for compatibility reason perform the checksum over part of the 3006 * message, and if error, then over the whole message. 3007 */ 3008 if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER && in_cksum(m, PIM_MINLEN) == 0) { 3009 /* do nothing, checksum okay */ 3010 } else if (in_cksum(m, datalen)) { 3011 pimstat.pims_rcv_badsum++; 3012 if (mrtdebug & DEBUG_PIM) 3013 log(LOG_DEBUG, "pim_input: invalid checksum\n"); 3014 m_freem(m); 3015 return; 3016 } 3017 3018 /* PIM version check */ 3019 if (PIM_VT_V(pim->pim_vt) < PIM_VERSION) { 3020 pimstat.pims_rcv_badversion++; 3021 log(LOG_ERR, "pim_input: incorrect version %d, expecting %d\n", 3022 PIM_VT_V(pim->pim_vt), PIM_VERSION); 3023 m_freem(m); 3024 return; 3025 } 3026 3027 /* restore mbuf back to the outer IP */ 3028 m->m_data -= iphlen; 3029 m->m_len += iphlen; 3030 3031 if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER) { 3032 /* 3033 * Since this is a REGISTER, we'll make a copy of the register 3034 * headers ip + pim + u_int32 + encap_ip, to be passed up to the 3035 * routing daemon. 3036 */ 3037 int s; 3038 struct sockaddr_in dst = { 3039 .sin_len = sizeof(dst), 3040 .sin_family = AF_INET, 3041 }; 3042 struct mbuf *mcp; 3043 struct ip *encap_ip; 3044 u_int32_t *reghdr; 3045 struct ifnet *vifp; 3046 3047 s = splsoftnet(); 3048 if ((reg_vif_num >= numvifs) || (reg_vif_num == VIFI_INVALID)) { 3049 splx(s); 3050 if (mrtdebug & DEBUG_PIM) 3051 log(LOG_DEBUG, 3052 "pim_input: register vif not set: %d\n", reg_vif_num); 3053 m_freem(m); 3054 return; 3055 } 3056 /* XXX need refcnt? */ 3057 vifp = viftable[reg_vif_num].v_ifp; 3058 splx(s); 3059 3060 /* 3061 * Validate length 3062 */ 3063 if (datalen < PIM_REG_MINLEN) { 3064 pimstat.pims_rcv_tooshort++; 3065 pimstat.pims_rcv_badregisters++; 3066 log(LOG_ERR, 3067 "pim_input: register packet size too small %d from %lx\n", 3068 datalen, (u_long)ip->ip_src.s_addr); 3069 m_freem(m); 3070 return; 3071 } 3072 3073 reghdr = (u_int32_t *)(pim + 1); 3074 encap_ip = (struct ip *)(reghdr + 1); 3075 3076 if (mrtdebug & DEBUG_PIM) { 3077 log(LOG_DEBUG, 3078 "pim_input[register], encap_ip: %lx -> %lx, encap_ip len %d\n", 3079 (u_long)ntohl(encap_ip->ip_src.s_addr), 3080 (u_long)ntohl(encap_ip->ip_dst.s_addr), 3081 ntohs(encap_ip->ip_len)); 3082 } 3083 3084 /* verify the version number of the inner packet */ 3085 if (encap_ip->ip_v != IPVERSION) { 3086 pimstat.pims_rcv_badregisters++; 3087 if (mrtdebug & DEBUG_PIM) { 3088 log(LOG_DEBUG, "pim_input: invalid IP version (%d) " 3089 "of the inner packet\n", encap_ip->ip_v); 3090 } 3091 m_freem(m); 3092 return; 3093 } 3094 3095 /* verify the inner packet doesn't have options */ 3096 if (encap_ip->ip_hl != (sizeof(struct ip) >> 2)) { 3097 pimstat.pims_rcv_badregisters++; 3098 m_freem(m); 3099 return; 3100 } 3101 3102 /* verify the inner packet is destined to a mcast group */ 3103 if (!IN_MULTICAST(encap_ip->ip_dst.s_addr)) { 3104 pimstat.pims_rcv_badregisters++; 3105 if (mrtdebug & DEBUG_PIM) 3106 log(LOG_DEBUG, 3107 "pim_input: inner packet of register is not " 3108 "multicast %lx\n", 3109 (u_long)ntohl(encap_ip->ip_dst.s_addr)); 3110 m_freem(m); 3111 return; 3112 } 3113 3114 /* If a NULL_REGISTER, pass it to the daemon */ 3115 if ((ntohl(*reghdr) & PIM_NULL_REGISTER)) 3116 goto pim_input_to_daemon; 3117 3118 /* 3119 * Copy the TOS from the outer IP header to the inner IP header. 3120 */ 3121 if (encap_ip->ip_tos != ip_tos) { 3122 /* Outer TOS -> inner TOS */ 3123 encap_ip->ip_tos = ip_tos; 3124 /* Recompute the inner header checksum. Sigh... */ 3125 3126 /* adjust mbuf to point to the inner IP header */ 3127 m->m_data += (iphlen + PIM_MINLEN); 3128 m->m_len -= (iphlen + PIM_MINLEN); 3129 3130 encap_ip->ip_sum = 0; 3131 encap_ip->ip_sum = in_cksum(m, encap_ip->ip_hl << 2); 3132 3133 /* restore mbuf to point back to the outer IP header */ 3134 m->m_data -= (iphlen + PIM_MINLEN); 3135 m->m_len += (iphlen + PIM_MINLEN); 3136 } 3137 3138 /* 3139 * Decapsulate the inner IP packet and loopback to forward it 3140 * as a normal multicast packet. Also, make a copy of the 3141 * outer_iphdr + pimhdr + reghdr + encap_iphdr 3142 * to pass to the daemon later, so it can take the appropriate 3143 * actions (e.g., send back PIM_REGISTER_STOP). 3144 * XXX: here m->m_data points to the outer IP header. 3145 */ 3146 mcp = m_copym(m, 0, iphlen + PIM_REG_MINLEN, M_DONTWAIT); 3147 if (mcp == NULL) { 3148 log(LOG_ERR, 3149 "pim_input: pim register: could not copy register head\n"); 3150 m_freem(m); 3151 return; 3152 } 3153 3154 /* Keep statistics */ 3155 /* XXX: registers_bytes include only the encap. mcast pkt */ 3156 pimstat.pims_rcv_registers_msgs++; 3157 pimstat.pims_rcv_registers_bytes += ntohs(encap_ip->ip_len); 3158 3159 /* 3160 * forward the inner ip packet; point m_data at the inner ip. 3161 */ 3162 m_adj(m, iphlen + PIM_MINLEN); 3163 3164 if (mrtdebug & DEBUG_PIM) { 3165 log(LOG_DEBUG, 3166 "pim_input: forwarding decapsulated register: " 3167 "src %lx, dst %lx, vif %d\n", 3168 (u_long)ntohl(encap_ip->ip_src.s_addr), 3169 (u_long)ntohl(encap_ip->ip_dst.s_addr), 3170 reg_vif_num); 3171 } 3172 /* NB: vifp was collected above; can it change on us? */ 3173 looutput(vifp, m, (struct sockaddr *)&dst, NULL); 3174 3175 /* prepare the register head to send to the mrouting daemon */ 3176 m = mcp; 3177 } 3178 3179 pim_input_to_daemon: 3180 /* 3181 * Pass the PIM message up to the daemon; if it is a Register message, 3182 * pass the 'head' only up to the daemon. This includes the 3183 * outer IP header, PIM header, PIM-Register header and the 3184 * inner IP header. 3185 * XXX: the outer IP header pkt size of a Register is not adjust to 3186 * reflect the fact that the inner multicast data is truncated. 3187 */ 3188 /* 3189 * Currently, pim_input() is always called holding softnet_lock 3190 * by ipintr()(!NET_MPSAFE) or PR_INPUT_WRAP()(NET_MPSAFE). 3191 */ 3192 KASSERT(mutex_owned(softnet_lock)); 3193 rip_input(m, iphlen, proto); 3194 3195 return; 3196 } 3197 #endif /* PIM */ 3198