1 /* $NetBSD: ip_mroute.c,v 1.163 2018/09/14 05:09:51 maxv Exp $ */ 2 3 /* 4 * Copyright (c) 1992, 1993 5 * The Regents of the University of California. All rights reserved. 6 * 7 * This code is derived from software contributed to Berkeley by 8 * Stephen Deering of Stanford University. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 
33 * 34 * @(#)ip_mroute.c 8.2 (Berkeley) 11/15/93 35 */ 36 37 /* 38 * Copyright (c) 1989 Stephen Deering 39 * 40 * This code is derived from software contributed to Berkeley by 41 * Stephen Deering of Stanford University. 42 * 43 * Redistribution and use in source and binary forms, with or without 44 * modification, are permitted provided that the following conditions 45 * are met: 46 * 1. Redistributions of source code must retain the above copyright 47 * notice, this list of conditions and the following disclaimer. 48 * 2. Redistributions in binary form must reproduce the above copyright 49 * notice, this list of conditions and the following disclaimer in the 50 * documentation and/or other materials provided with the distribution. 51 * 3. All advertising materials mentioning features or use of this software 52 * must display the following acknowledgement: 53 * This product includes software developed by the University of 54 * California, Berkeley and its contributors. 55 * 4. Neither the name of the University nor the names of its contributors 56 * may be used to endorse or promote products derived from this software 57 * without specific prior written permission. 58 * 59 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 60 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 61 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 62 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 63 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 64 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 65 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 66 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 67 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 68 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 69 * SUCH DAMAGE. 
70 * 71 * @(#)ip_mroute.c 8.2 (Berkeley) 11/15/93 72 */ 73 74 /* 75 * IP multicast forwarding procedures 76 * 77 * Written by David Waitzman, BBN Labs, August 1988. 78 * Modified by Steve Deering, Stanford, February 1989. 79 * Modified by Mark J. Steiglitz, Stanford, May, 1991 80 * Modified by Van Jacobson, LBL, January 1993 81 * Modified by Ajit Thyagarajan, PARC, August 1993 82 * Modified by Bill Fenner, PARC, April 1994 83 * Modified by Charles M. Hannum, NetBSD, May 1995. 84 * Modified by Ahmed Helmy, SGI, June 1996 85 * Modified by George Edmond Eddy (Rusty), ISI, February 1998 86 * Modified by Pavlin Radoslavov, USC/ISI, May 1998, August 1999, October 2000 87 * Modified by Hitoshi Asaeda, WIDE, August 2000 88 * Modified by Pavlin Radoslavov, ICSI, October 2002 89 * 90 * MROUTING Revision: 1.2 91 * and PIM-SMv2 and PIM-DM support, advanced API support, 92 * bandwidth metering and signaling 93 */ 94 95 #include <sys/cdefs.h> 96 __KERNEL_RCSID(0, "$NetBSD: ip_mroute.c,v 1.163 2018/09/14 05:09:51 maxv Exp $"); 97 98 #ifdef _KERNEL_OPT 99 #include "opt_inet.h" 100 #include "opt_ipsec.h" 101 #include "opt_pim.h" 102 #endif 103 104 #ifdef PIM 105 #define _PIM_VT 1 106 #endif 107 108 #include <sys/param.h> 109 #include <sys/systm.h> 110 #include <sys/callout.h> 111 #include <sys/mbuf.h> 112 #include <sys/socket.h> 113 #include <sys/socketvar.h> 114 #include <sys/errno.h> 115 #include <sys/time.h> 116 #include <sys/kernel.h> 117 #include <sys/kmem.h> 118 #include <sys/ioctl.h> 119 #include <sys/syslog.h> 120 121 #include <net/if.h> 122 #include <net/raw_cb.h> 123 124 #include <netinet/in.h> 125 #include <netinet/in_var.h> 126 #include <netinet/in_systm.h> 127 #include <netinet/in_offload.h> 128 #include <netinet/ip.h> 129 #include <netinet/ip_var.h> 130 #include <netinet/in_pcb.h> 131 #include <netinet/udp.h> 132 #include <netinet/igmp.h> 133 #include <netinet/igmp_var.h> 134 #include <netinet/ip_mroute.h> 135 #ifdef PIM 136 #include <netinet/pim.h> 137 #include 
<netinet/pim_var.h> 138 #endif 139 #include <netinet/ip_encap.h> 140 141 #ifdef IPSEC 142 #include <netipsec/ipsec.h> 143 #include <netipsec/key.h> 144 #endif 145 146 #define IP_MULTICASTOPTS 0 147 #define M_PULLUP(m, len) \ 148 do { \ 149 if ((m) && ((m)->m_flags & M_EXT || (m)->m_len < (len))) \ 150 (m) = m_pullup((m), (len)); \ 151 } while (/*CONSTCOND*/ 0) 152 153 /* 154 * Globals. All but ip_mrouter and ip_mrtproto could be static, 155 * except for netstat or debugging purposes. 156 */ 157 struct socket *ip_mrouter = NULL; 158 int ip_mrtproto = IGMP_DVMRP; /* for netstat only */ 159 160 #define MFCHASH(a, g) \ 161 ((((a).s_addr >> 20) ^ ((a).s_addr >> 10) ^ (a).s_addr ^ \ 162 ((g).s_addr >> 20) ^ ((g).s_addr >> 10) ^ (g).s_addr) & mfchash) 163 LIST_HEAD(mfchashhdr, mfc) *mfchashtbl; 164 u_long mfchash; 165 166 u_char nexpire[MFCTBLSIZ]; 167 struct vif viftable[MAXVIFS]; 168 struct mrtstat mrtstat; 169 u_int mrtdebug = 0; /* debug level */ 170 #define DEBUG_MFC 0x02 171 #define DEBUG_FORWARD 0x04 172 #define DEBUG_EXPIRE 0x08 173 #define DEBUG_XMIT 0x10 174 #define DEBUG_PIM 0x20 175 176 #define VIFI_INVALID ((vifi_t) -1) 177 178 u_int tbfdebug = 0; /* tbf debug level */ 179 180 /* vif attachment using sys/netinet/ip_encap.c */ 181 static void vif_input(struct mbuf *, int, int, void *); 182 static int vif_encapcheck(struct mbuf *, int, int, void *); 183 184 static const struct encapsw vif_encapsw = { 185 .encapsw4 = { 186 .pr_input = vif_input, 187 .pr_ctlinput = NULL, 188 } 189 }; 190 191 #define EXPIRE_TIMEOUT (hz / 4) /* 4x / second */ 192 #define UPCALL_EXPIRE 6 /* number of timeouts */ 193 194 /* 195 * Define the token bucket filter structures 196 */ 197 198 #define TBF_REPROCESS (hz / 100) /* 100x / second */ 199 200 static int get_sg_cnt(struct sioc_sg_req *); 201 static int get_vif_cnt(struct sioc_vif_req *); 202 static int ip_mrouter_init(struct socket *, int); 203 static int set_assert(int); 204 static int add_vif(struct vifctl *); 205 static int 
del_vif(vifi_t *); 206 static void update_mfc_params(struct mfc *, struct mfcctl2 *); 207 static void init_mfc_params(struct mfc *, struct mfcctl2 *); 208 static void expire_mfc(struct mfc *); 209 static int add_mfc(struct sockopt *); 210 #ifdef UPCALL_TIMING 211 static void collate(struct timeval *); 212 #endif 213 static int del_mfc(struct sockopt *); 214 static int set_api_config(struct sockopt *); /* chose API capabilities */ 215 static int socket_send(struct socket *, struct mbuf *, struct sockaddr_in *); 216 static void expire_upcalls(void *); 217 static int ip_mdq(struct mbuf *, struct ifnet *, struct mfc *); 218 static void phyint_send(struct ip *, struct vif *, struct mbuf *); 219 static void encap_send(struct ip *, struct vif *, struct mbuf *); 220 static void tbf_control(struct vif *, struct mbuf *, struct ip *, u_int32_t); 221 static void tbf_queue(struct vif *, struct mbuf *); 222 static void tbf_process_q(struct vif *); 223 static void tbf_reprocess_q(void *); 224 static int tbf_dq_sel(struct vif *, struct ip *); 225 static void tbf_send_packet(struct vif *, struct mbuf *); 226 static void tbf_update_tokens(struct vif *); 227 static int priority(struct vif *, struct ip *); 228 229 /* 230 * Bandwidth monitoring 231 */ 232 static void free_bw_list(struct bw_meter *); 233 static int add_bw_upcall(struct bw_upcall *); 234 static int del_bw_upcall(struct bw_upcall *); 235 static void bw_meter_receive_packet(struct bw_meter *, int , struct timeval *); 236 static void bw_meter_prepare_upcall(struct bw_meter *, struct timeval *); 237 static void bw_upcalls_send(void); 238 static void schedule_bw_meter(struct bw_meter *, struct timeval *); 239 static void unschedule_bw_meter(struct bw_meter *); 240 static void bw_meter_process(void); 241 static void expire_bw_upcalls_send(void *); 242 static void expire_bw_meter_process(void *); 243 244 #ifdef PIM 245 static int pim_register_send(struct ip *, struct vif *, 246 struct mbuf *, struct mfc *); 247 static int 
pim_register_send_rp(struct ip *, struct vif *, 248 struct mbuf *, struct mfc *); 249 static int pim_register_send_upcall(struct ip *, struct vif *, 250 struct mbuf *, struct mfc *); 251 static struct mbuf *pim_register_prepare(struct ip *, struct mbuf *); 252 #endif 253 254 #define ENCAP_TTL 64 255 #define ENCAP_PROTO IPPROTO_IPIP 256 257 /* prototype IP hdr for encapsulated packets */ 258 static const struct ip multicast_encap_iphdr = { 259 .ip_hl = sizeof(struct ip) >> 2, 260 .ip_v = IPVERSION, 261 .ip_len = sizeof(struct ip), 262 .ip_ttl = ENCAP_TTL, 263 .ip_p = ENCAP_PROTO, 264 }; 265 266 /* 267 * Bandwidth meter variables and constants 268 */ 269 270 /* 271 * Pending timeouts are stored in a hash table, the key being the 272 * expiration time. Periodically, the entries are analysed and processed. 273 */ 274 #define BW_METER_BUCKETS 1024 275 static struct bw_meter *bw_meter_timers[BW_METER_BUCKETS]; 276 struct callout bw_meter_ch; 277 #define BW_METER_PERIOD (hz) /* periodical handling of bw meters */ 278 279 /* 280 * Pending upcalls are stored in a vector which is flushed when 281 * full, or periodically 282 */ 283 static struct bw_upcall bw_upcalls[BW_UPCALLS_MAX]; 284 static u_int bw_upcalls_n; /* # of pending upcalls */ 285 struct callout bw_upcalls_ch; 286 #define BW_UPCALLS_PERIOD (hz) /* periodical flush of bw upcalls */ 287 288 #ifdef PIM 289 struct pimstat pimstat; 290 291 /* 292 * Note: the PIM Register encapsulation adds the following in front of a 293 * data packet: 294 * 295 * struct pim_encap_hdr { 296 * struct ip ip; 297 * struct pim_encap_pimhdr pim; 298 * } 299 */ 300 301 struct pim_encap_pimhdr { 302 struct pim pim; 303 uint32_t flags; 304 }; 305 306 static struct ip pim_encap_iphdr = { 307 .ip_v = IPVERSION, 308 .ip_hl = sizeof(struct ip) >> 2, 309 .ip_len = sizeof(struct ip), 310 .ip_ttl = ENCAP_TTL, 311 .ip_p = IPPROTO_PIM, 312 }; 313 314 static struct pim_encap_pimhdr pim_encap_pimhdr = { 315 { 316 PIM_MAKE_VT(PIM_VERSION, PIM_REGISTER), 
/* PIM vers and message type */ 317 0, /* reserved */ 318 0, /* checksum */ 319 }, 320 0 /* flags */ 321 }; 322 323 static struct ifnet multicast_register_if; 324 static vifi_t reg_vif_num = VIFI_INVALID; 325 #endif /* PIM */ 326 327 328 /* 329 * Private variables. 330 */ 331 static vifi_t numvifs = 0; 332 333 static struct callout expire_upcalls_ch; 334 335 /* 336 * whether or not special PIM assert processing is enabled. 337 */ 338 static int pim_assert; 339 /* 340 * Rate limit for assert notification messages, in usec 341 */ 342 #define ASSERT_MSG_TIME 3000000 343 344 /* 345 * Kernel multicast routing API capabilities and setup. 346 * If more API capabilities are added to the kernel, they should be 347 * recorded in `mrt_api_support'. 348 */ 349 static const u_int32_t mrt_api_support = (MRT_MFC_FLAGS_DISABLE_WRONGVIF | 350 MRT_MFC_FLAGS_BORDER_VIF | 351 MRT_MFC_RP | 352 MRT_MFC_BW_UPCALL); 353 static u_int32_t mrt_api_config = 0; 354 355 /* 356 * Find a route for a given origin IP address and Multicast group address 357 * Type of service parameter to be added in the future!!! 
 * Statistics are updated by the caller if needed
 * (mrtstat.mrts_mfc_lookups and mrtstat.mrts_mfc_misses).
 *
 * Only "resolved" entries are returned: an entry whose mfc_stall list is
 * non-NULL is still waiting for the routing daemon to answer the upcall
 * and is skipped here.  Returns NULL if no matching entry exists.
 * Caller must hold splsoftnet (or run in softnet context).
 */
static struct mfc *
mfc_find(struct in_addr *o, struct in_addr *g)
{
	struct mfc *rt;

	/* Walk the hash chain selected by the (origin, group) pair. */
	LIST_FOREACH(rt, &mfchashtbl[MFCHASH(*o, *g)], mfc_hash) {
		if (in_hosteq(rt->mfc_origin, *o) &&
		    in_hosteq(rt->mfc_mcastgrp, *g) &&
		    (rt->mfc_stall == NULL))
			break;
	}

	return rt;
}

/*
 * Macro to compute elapsed time efficiently.
 * Borrowed from Van Jacobson's scheduling code.
 *
 * Stores (a - b) in microseconds into `delta'.  The switch avoids a
 * multiply in the common cases where the seconds fields differ by
 * 0, 1 or 2.  `a' is expected to be the later timestamp.
 */
#define TV_DELTA(a, b, delta) do { \
	int xxs; \
	delta = (a).tv_usec - (b).tv_usec; \
	xxs = (a).tv_sec - (b).tv_sec; \
	switch (xxs) { \
	case 2: \
		delta += 1000000; \
		/* fall through */ \
	case 1: \
		delta += 1000000; \
		/* fall through */ \
	case 0: \
		break; \
	default: \
		delta += (1000000 * xxs); \
		break; \
	} \
} while (/*CONSTCOND*/ 0)

#ifdef UPCALL_TIMING
/*
 * Histogram of upcall service delays, in (delta >> 10) microsecond
 * buckets (~1ms); bucket 50 collects everything larger.  Filled in
 * by collate().
 */
u_int32_t upcall_data[51];
#endif /* UPCALL_TIMING */

/*
 * Handle MRT setsockopt commands to modify the multicast routing tables.
 * Dispatches on sopt->sopt_name; returns 0 or an errno value.
 */
int
ip_mrouter_set(struct socket *so, struct sockopt *sopt)
{
	int error;
	int optval;
	struct vifctl vifc;
	vifi_t vifi;
	struct bw_upcall bwuc;

	/*
	 * Only MRT_INIT is allowed on a socket that is not (yet) the
	 * registered multicast routing socket.
	 */
	if (sopt->sopt_name != MRT_INIT && so != ip_mrouter)
		error = ENOPROTOOPT;
	else {
		switch (sopt->sopt_name) {
		case MRT_INIT:
			/* Option value is the API version; must be 1. */
			error = sockopt_getint(sopt, &optval);
			if (error)
				break;

			error = ip_mrouter_init(so, optval);
			break;
		case MRT_DONE:
			error = ip_mrouter_done();
			break;
		case MRT_ADD_VIF:
			error = sockopt_get(sopt, &vifc, sizeof(vifc));
			if (error)
				break;
			error = add_vif(&vifc);
			break;
		case MRT_DEL_VIF:
			error = sockopt_get(sopt, &vifi, sizeof(vifi));
			if (error)
				break;
			error = del_vif(&vifi);
			break;
		case MRT_ADD_MFC:
			/* add_mfc() copies the mfcctl/mfcctl2 itself. */
			error = add_mfc(sopt);
			break;
		case MRT_DEL_MFC:
			error = del_mfc(sopt);
			break;
		case MRT_ASSERT:
			error = sockopt_getint(sopt, &optval);
			if (error)
				break;
			error = set_assert(optval);
			break;
		case MRT_API_CONFIG:
			error = set_api_config(sopt);
			break;
		case MRT_ADD_BW_UPCALL:
			error = sockopt_get(sopt, &bwuc, sizeof(bwuc));
			if (error)
				break;
			error = add_bw_upcall(&bwuc);
			break;
		case MRT_DEL_BW_UPCALL:
			error = sockopt_get(sopt, &bwuc, sizeof(bwuc));
			if (error)
				break;
			error = del_bw_upcall(&bwuc);
			break;
		default:
			error = ENOPROTOOPT;
			break;
		}
	}
	return error;
}

/*
 * Handle MRT getsockopt commands.  Only the registered multicast
 * routing socket may query these options.
 */
int
ip_mrouter_get(struct socket *so, struct sockopt *sopt)
{
	int error;

	if (so != ip_mrouter)
		error = ENOPROTOOPT;
	else {
		switch (sopt->sopt_name) {
		case MRT_VERSION:
			error = sockopt_setint(sopt, 0x0305); /* XXX !!!!
			 * hard-coded mrouted API version 3.5 */
			break;
		case MRT_ASSERT:
			error = sockopt_setint(sopt, pim_assert);
			break;
		case MRT_API_SUPPORT:
			error = sockopt_set(sopt, &mrt_api_support,
			    sizeof(mrt_api_support));
			break;
		case MRT_API_CONFIG:
			error = sockopt_set(sopt, &mrt_api_config,
			    sizeof(mrt_api_config));
			break;
		default:
			error = ENOPROTOOPT;
			break;
		}
	}
	return error;
}

/*
 * Handle ioctl commands to obtain information from the cache.
 * Only the registered multicast routing socket may issue these.
 */
int
mrt_ioctl(struct socket *so, u_long cmd, void *data)
{
	int error;

	if (so != ip_mrouter)
		error = EINVAL;
	else
		switch (cmd) {
		case SIOCGETVIFCNT:
			error = get_vif_cnt((struct sioc_vif_req *)data);
			break;
		case SIOCGETSGCNT:
			error = get_sg_cnt((struct sioc_sg_req *)data);
			break;
		default:
			error = EINVAL;
			break;
		}

	return error;
}

/*
 * Returns the packet, byte and rpf-failure (wrong incoming interface)
 * counts for the (source, group) pair provided in *req.  On failure the
 * counters are set to all-ones and EADDRNOTAVAIL is returned.
 */
static int
get_sg_cnt(struct sioc_sg_req *req)
{
	int s;
	struct mfc *rt;

	s = splsoftnet();
	rt = mfc_find(&req->src, &req->grp);
	if (rt == NULL) {
		splx(s);
		/* No such entry: flag all counters as invalid. */
		req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffff;
		return EADDRNOTAVAIL;
	}
	req->pktcnt = rt->mfc_pkt_cnt;
	req->bytecnt = rt->mfc_byte_cnt;
	req->wrong_if = rt->mfc_wrong_if;
	splx(s);

	return 0;
}

/*
 * Returns the input and output packet and byte counts for the vif
 * identified by req->vifi, or EINVAL if the index is out of range.
 */
static int
get_vif_cnt(struct sioc_vif_req *req)
{
	vifi_t vifi = req->vifi;

	if (vifi >= numvifs)
		return EINVAL;

	req->icount = viftable[vifi].v_pkt_in;
	req->ocount = viftable[vifi].v_pkt_out;
	req->ibytes = viftable[vifi].v_bytes_in;
	req->obytes = viftable[vifi].v_bytes_out;

	return 0;
}

/*
 * Enable multicast routing (MRT_INIT).  `so' must be a raw IGMP socket
 * and `v' is the requested API version, which must be 1.  Registers the
 * socket as the single routing socket, allocates the MFC hash table and
 * starts the periodic expiry / bandwidth-meter callouts.
 */
static int
ip_mrouter_init(struct socket *so, int v)
{
	if (mrtdebug)
		log(LOG_DEBUG,
		    "ip_mrouter_init: so_type = %d, pr_protocol = %d\n",
		    so->so_type, so->so_proto->pr_protocol);

	/* Only a raw IGMP socket may become the routing socket. */
	if (so->so_type != SOCK_RAW ||
	    so->so_proto->pr_protocol != IPPROTO_IGMP)
		return EOPNOTSUPP;

	/* Only API version 1 is supported. */
	if (v != 1)
		return EINVAL;

	/* There can be only one routing daemon at a time. */
	if (ip_mrouter != NULL)
		return EADDRINUSE;

	ip_mrouter = so;

	mfchashtbl = hashinit(MFCTBLSIZ, HASH_LIST, true, &mfchash);
	memset((void *)nexpire, 0, sizeof(nexpire));

	pim_assert = 0;

	/* Periodic cleanup of unanswered upcall entries. */
	callout_init(&expire_upcalls_ch, 0);
	callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT,
	    expire_upcalls, NULL);

	/* Periodic flush of pending bandwidth upcalls. */
	callout_init(&bw_upcalls_ch, 0);
	callout_reset(&bw_upcalls_ch, BW_UPCALLS_PERIOD,
	    expire_bw_upcalls_send, NULL);

	/* Periodic processing of scheduled bandwidth meters. */
	callout_init(&bw_meter_ch, 0);
	callout_reset(&bw_meter_ch, BW_METER_PERIOD,
	    expire_bw_meter_process, NULL);

	if (mrtdebug)
		log(LOG_DEBUG, "ip_mrouter_init\n");

	return 0;
}

/*
 * Disable multicast routing (MRT_DONE, or routing socket close).
 * Tears down all vifs, stops the callouts, and frees the entire
 * multicast forwarding cache.  Always returns 0.
 */
int
ip_mrouter_done(void)
{
	vifi_t vifi;
	struct vif *vifp;
	int i;
	int s;

	s = splsoftnet();

	/* Clear out all the vifs currently in use. */
	for (vifi = 0; vifi < numvifs; vifi++) {
		vifp = &viftable[vifi];
		if (!in_nullhost(vifp->v_lcl_addr))
			reset_vif(vifp);
	}

	numvifs = 0;
	pim_assert = 0;
	mrt_api_config = 0;

	callout_stop(&expire_upcalls_ch);
	callout_stop(&bw_upcalls_ch);
	callout_stop(&bw_meter_ch);

	/*
	 * Free all multicast forwarding cache entries.
	 */
	for (i = 0; i < MFCTBLSIZ; i++) {
		struct mfc *rt, *nrt;

		/* expire_mfc() unlinks rt, so fetch the successor first. */
		for (rt = LIST_FIRST(&mfchashtbl[i]); rt; rt = nrt) {
			nrt = LIST_NEXT(rt, mfc_hash);

			expire_mfc(rt);
		}
	}

	memset((void *)nexpire, 0, sizeof(nexpire));
	hashdone(mfchashtbl, HASH_LIST, mfchash);
	mfchashtbl = NULL;

	bw_upcalls_n = 0;
	memset(bw_meter_timers, 0, sizeof(bw_meter_timers));

	/* Reset de-encapsulation cache. */

	ip_mrouter = NULL;

	splx(s);

	if (mrtdebug)
		log(LOG_DEBUG, "ip_mrouter_done\n");

	return 0;
}

/*
 * Called when an interface is detached: tear down any vif bound to it
 * and clear dangling ifp pointers in queued (stalled) upcall packets.
 */
void
ip_mrouter_detach(struct ifnet *ifp)
{
	int vifi, i;
	struct vif *vifp;
	struct mfc *rt;
	struct rtdetq *rte;

	/* XXX not sure about side effect to userland routing daemon */
	for (vifi = 0; vifi < numvifs; vifi++) {
		vifp = &viftable[vifi];
		if (vifp->v_ifp == ifp)
			reset_vif(vifp);
	}
	/* nexpire[i] != 0 means bucket i may hold unresolved entries. */
	for (i = 0; i < MFCTBLSIZ; i++) {
		if (nexpire[i] == 0)
			continue;
		LIST_FOREACH(rt, &mfchashtbl[i], mfc_hash) {
			for (rte = rt->mfc_stall; rte; rte = rte->next) {
				if (rte->ifp == ifp)
					rte->ifp = NULL;
			}
		}
	}
}

/*
 * Set PIM assert processing global (MRT_ASSERT); any non-zero value
 * enables it.
 */
static int
set_assert(int i)
{
	pim_assert = !!i;
	return 0;
}

/*
 * Configure API capabilities (MRT_API_CONFIG).  The configured set is
 * the intersection of the requested bits and mrt_api_support.
 */
static int
set_api_config(struct sockopt *sopt)
{
	u_int32_t apival;
	int i, error;

	/*
	 * We can set the API capabilities only if it is the first operation
	 * after MRT_INIT.
	 * I.e.:
	 * - there are no vifs installed
	 * - pim_assert is not enabled
	 * - the MFC table is empty
	 */
	error = sockopt_get(sopt, &apival, sizeof(apival));
	if (error)
		return error;
	if (numvifs > 0)
		return EPERM;
	if (pim_assert)
		return EPERM;
	for (i = 0; i < MFCTBLSIZ; i++) {
		if (LIST_FIRST(&mfchashtbl[i]) != NULL)
			return EPERM;
	}

	mrt_api_config = apival & mrt_api_support;
	return 0;
}

/*
 * Add a vif to the vif table (MRT_ADD_VIF).  The slot is chosen by
 * vifcp->vifc_vifi; returns EADDRINUSE if that slot is already taken.
 */
static int
add_vif(struct vifctl *vifcp)
{
	struct vif *vifp;
	struct ifnet *ifp;
	int error, s;
	struct sockaddr_in sin;

	if (vifcp->vifc_vifi >= MAXVIFS)
		return EINVAL;
	if (in_nullhost(vifcp->vifc_lcl_addr))
		return EADDRNOTAVAIL;

	vifp = &viftable[vifcp->vifc_vifi];
	if (!in_nullhost(vifp->v_lcl_addr))
		return EADDRINUSE;

	/* Find the interface with an address in AF_INET family. */
#ifdef PIM
	if (vifcp->vifc_flags & VIFF_REGISTER) {
		/*
		 * XXX: Because VIFF_REGISTER does not really need a valid
		 * local interface (e.g. it could be 127.0.0.2), we don't
		 * check its address.
		 */
		ifp = NULL;
	} else
#endif
	{
		struct ifaddr *ifa;

		sockaddr_in_init(&sin, &vifcp->vifc_lcl_addr, 0);
		s = pserialize_read_enter();
		ifa = ifa_ifwithaddr(sintosa(&sin));
		if (ifa == NULL) {
			pserialize_read_exit(s);
			return EADDRNOTAVAIL;
		}
		/* ifp is used after the read section ends. */
		ifp = ifa->ifa_ifp;
		/* FIXME NOMPSAFE */
		pserialize_read_exit(s);
	}

	if (vifcp->vifc_flags & VIFF_TUNNEL) {
		if (vifcp->vifc_flags & VIFF_SRCRT) {
			log(LOG_ERR, "source routed tunnels not supported\n");
			return EOPNOTSUPP;
		}

		/* attach this vif to decapsulator dispatch table */
		/*
		 * XXX Use addresses in registration so that matching
		 * can be done with radix tree in decapsulator.  But,
		 * we need to check inner header for multicast, so
		 * this requires both radix tree lookup and then a
		 * function to check, and this is not supported yet.
		 */
		error = encap_lock_enter();
		if (error)
			return error;
		vifp->v_encap_cookie = encap_attach_func(AF_INET, IPPROTO_IPV4,
		    vif_encapcheck, &vif_encapsw, vifp);
		encap_lock_exit();
		if (!vifp->v_encap_cookie)
			return EINVAL;

		/* Create a fake encapsulation interface. */
		ifp = malloc(sizeof(*ifp), M_MRTABLE, M_WAITOK|M_ZERO);
		snprintf(ifp->if_xname, sizeof(ifp->if_xname),
		    "mdecap%d", vifcp->vifc_vifi);

		/* Prepare cached route entry. */
		memset(&vifp->v_route, 0, sizeof(vifp->v_route));
#ifdef PIM
	} else if (vifcp->vifc_flags & VIFF_REGISTER) {
		/* One shared fake register interface for PIM registers. */
		ifp = &multicast_register_if;
		if (mrtdebug)
			log(LOG_DEBUG, "Adding a register vif, ifp: %p\n",
			    (void *)ifp);
		if (reg_vif_num == VIFI_INVALID) {
			memset(ifp, 0, sizeof(*ifp));
			snprintf(ifp->if_xname, sizeof(ifp->if_xname),
			    "register_vif");
			ifp->if_flags = IFF_LOOPBACK;
			memset(&vifp->v_route, 0, sizeof(vifp->v_route));
			reg_vif_num = vifcp->vifc_vifi;
		}
#endif
	} else {
		/* Make sure the interface supports multicast. */
		if ((ifp->if_flags & IFF_MULTICAST) == 0)
			return EOPNOTSUPP;

		/* Enable promiscuous reception of all IP multicasts. */
		sockaddr_in_init(&sin, &zeroin_addr, 0);
		error = if_mcast_op(ifp, SIOCADDMULTI, sintosa(&sin));
		if (error)
			return error;
	}

	s = splsoftnet();

	/* Define parameters for the tbf (token bucket filter) structure.
	 */
	vifp->tbf_q = NULL;
	vifp->tbf_t = &vifp->tbf_q;
	microtime(&vifp->tbf_last_pkt_t);
	vifp->tbf_n_tok = 0;
	vifp->tbf_q_len = 0;
	vifp->tbf_max_q_len = MAXQSIZE;

	vifp->v_flags = vifcp->vifc_flags;
	vifp->v_threshold = vifcp->vifc_threshold;
	/* scaling up here allows division by 1024 in critical code */
	vifp->v_rate_limit = vifcp->vifc_rate_limit * 1024 / 1000;
	vifp->v_lcl_addr = vifcp->vifc_lcl_addr;
	vifp->v_rmt_addr = vifcp->vifc_rmt_addr;
	vifp->v_ifp = ifp;
	/* Initialize per vif pkt counters. */
	vifp->v_pkt_in = 0;
	vifp->v_pkt_out = 0;
	vifp->v_bytes_in = 0;
	vifp->v_bytes_out = 0;

	callout_init(&vifp->v_repq_ch, 0);

	splx(s);

	/* Adjust numvifs up if the vifi is higher than numvifs. */
	if (numvifs <= vifcp->vifc_vifi)
		numvifs = vifcp->vifc_vifi + 1;

	if (mrtdebug)
		log(LOG_DEBUG, "add_vif #%d, lcladdr %x, %s %x, thresh %x, rate %d\n",
		    vifcp->vifc_vifi,
		    ntohl(vifcp->vifc_lcl_addr.s_addr),
		    (vifcp->vifc_flags & VIFF_TUNNEL) ?
"rmtaddr" : "mask",
		    ntohl(vifcp->vifc_rmt_addr.s_addr),
		    vifcp->vifc_threshold,
		    vifcp->vifc_rate_limit);

	return 0;
}

/*
 * Tear down a single vif: stop its reprocess callout, detach it from
 * the decapsulator, drain its token-bucket queue, release its
 * interface resources and finally zero the slot (which also marks it
 * free, since a null v_lcl_addr means "unused").
 */
void
reset_vif(struct vif *vifp)
{
	struct mbuf *m, *n;
	struct ifnet *ifp;
	struct sockaddr_in sin;

	callout_stop(&vifp->v_repq_ch);

	/* detach this vif from decapsulator dispatch table */
	encap_lock_enter();
	encap_detach(vifp->v_encap_cookie);
	encap_lock_exit();
	vifp->v_encap_cookie = NULL;

	/*
	 * Free packets queued at the interface.
	 */
	for (m = vifp->tbf_q; m != NULL; m = n) {
		n = m->m_nextpkt;
		m_freem(m);
	}

	if (vifp->v_flags & VIFF_TUNNEL)
		/* The fake "mdecap%d" interface allocated in add_vif(). */
		free(vifp->v_ifp, M_MRTABLE);
	else if (vifp->v_flags & VIFF_REGISTER) {
#ifdef PIM
		reg_vif_num = VIFI_INVALID;
#endif
	} else {
		/* Undo the SIOCADDMULTI done in add_vif(). */
		sockaddr_in_init(&sin, &zeroin_addr, 0);
		ifp = vifp->v_ifp;
		if_mcast_op(ifp, SIOCDELMULTI, sintosa(&sin));
	}
	memset((void *)vifp, 0, sizeof(*vifp));
}

/*
 * Delete a vif from the vif table (MRT_DEL_VIF).
 */
static int
del_vif(vifi_t *vifip)
{
	struct vif *vifp;
	vifi_t vifi;
	int s;

	if (*vifip >= numvifs)
		return EINVAL;

	vifp = &viftable[*vifip];
	if (in_nullhost(vifp->v_lcl_addr))
		return EADDRNOTAVAIL;

	s = splsoftnet();

	reset_vif(vifp);

	/* Adjust numvifs down to the highest slot still in use. */
	for (vifi = numvifs; vifi > 0; vifi--)
		if (!in_nullhost(viftable[vifi - 1].v_lcl_addr))
			break;
	numvifs = vifi;

	splx(s);

	if (mrtdebug)
		log(LOG_DEBUG, "del_vif %d, numvifs %d\n", *vifip, numvifs);

	return 0;
}

/*
 * Update an mfc entry without resetting counters and S,G addresses.
 * Per-vif TTL thresholds and flags are copied; flags are masked by the
 * currently configured API capabilities.
 */
static void
update_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp)
{
	int i;

	rt->mfc_parent = mfccp->mfcc_parent;
	for (i = 0; i < numvifs; i++) {
		rt->mfc_ttls[i] = mfccp->mfcc_ttls[i];
		rt->mfc_flags[i] = mfccp->mfcc_flags[i] & mrt_api_config &
		    MRT_MFC_FLAGS_ALL;
	}
	/* set the RP address */
	if (mrt_api_config & MRT_MFC_RP)
		rt->mfc_rp = mfccp->mfcc_rp;
	else
		rt->mfc_rp = zeroin_addr;
}

/*
 * Fully initialize an mfc entry from the parameter: S,G addresses,
 * forwarding parameters, and zeroed per-(S,G) statistics.
 */
static void
init_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp)
{
	rt->mfc_origin = mfccp->mfcc_origin;
	rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp;

	update_mfc_params(rt, mfccp);

	/* initialize pkt counters per src-grp */
	rt->mfc_pkt_cnt = 0;
	rt->mfc_byte_cnt = 0;
	rt->mfc_wrong_if = 0;
	timerclear(&rt->mfc_last_assert);
}

/*
 * Free an mfc entry: its bandwidth meters, any packets stalled waiting
 * for the daemon's reply, and the entry itself (unlinked from the hash).
 */
static void
expire_mfc(struct mfc *rt)
{
	struct rtdetq *rte, *nrte;

	free_bw_list(rt->mfc_bw_meter);

	for (rte = rt->mfc_stall; rte != NULL; rte = nrte) {
		nrte = rte->next;
		m_freem(rte->m);
		free(rte, M_MRTABLE);
	}

	LIST_REMOVE(rt, mfc_hash);
	free(rt, M_MRTABLE);
}

/*
 * Add an mfc entry (MRT_ADD_MFC).  If a matching unresolved entry
 * exists (created by an upcall), resolve it and forward the packets
 * that were queued on it; otherwise update or create the entry.
 */
static int
add_mfc(struct sockopt *sopt)
{
	struct mfcctl2 mfcctl2;
	struct mfcctl2 *mfccp;
	struct mfc *rt;
	u_int32_t hash = 0;
	struct rtdetq *rte, *nrte;
	u_short nstl;
	int s;
	int error;

	/*
	 * Select data size depending on API version: the full mfcctl2
	 * only when extended API flags have been configured.
	 */
	mfccp = &mfcctl2;
	memset(&mfcctl2, 0, sizeof(mfcctl2));

	if (mrt_api_config & MRT_API_FLAGS_ALL)
		error = sockopt_get(sopt, mfccp, sizeof(struct mfcctl2));
	else
		error = sockopt_get(sopt, mfccp, sizeof(struct mfcctl));

	if (error)
		return error;

	s = splsoftnet();
	rt = mfc_find(&mfccp->mfcc_origin, &mfccp->mfcc_mcastgrp);

	/* If an entry already exists, just update the fields */
	if (rt) {
		if (mrtdebug & DEBUG_MFC)
			log(LOG_DEBUG, "add_mfc update o %x g %x p %x\n",
			    ntohl(mfccp->mfcc_origin.s_addr),
			    ntohl(mfccp->mfcc_mcastgrp.s_addr),
			    mfccp->mfcc_parent);

		update_mfc_params(rt, mfccp);

		splx(s);
		return 0;
	}

	/*
	 * Find the entry for which the upcall was made and update.
	 * (mfc_find() above skips unresolved entries, so scan the chain
	 * for entries with a non-NULL stall queue.)
	 */
	nstl = 0;
	hash = MFCHASH(mfccp->mfcc_origin, mfccp->mfcc_mcastgrp);
	LIST_FOREACH(rt, &mfchashtbl[hash], mfc_hash) {
		if (in_hosteq(rt->mfc_origin, mfccp->mfcc_origin) &&
		    in_hosteq(rt->mfc_mcastgrp, mfccp->mfcc_mcastgrp) &&
		    rt->mfc_stall != NULL) {
			if (nstl++)
				log(LOG_ERR, "add_mfc %s o %x g %x p %x dbx %p\n",
				    "multiple kernel entries",
				    ntohl(mfccp->mfcc_origin.s_addr),
				    ntohl(mfccp->mfcc_mcastgrp.s_addr),
				    mfccp->mfcc_parent, rt->mfc_stall);

			if (mrtdebug & DEBUG_MFC)
				log(LOG_DEBUG, "add_mfc o %x g %x p %x dbg %p\n",
				    ntohl(mfccp->mfcc_origin.s_addr),
				    ntohl(mfccp->mfcc_mcastgrp.s_addr),
				    mfccp->mfcc_parent, rt->mfc_stall);

			rte = rt->mfc_stall;
			init_mfc_params(rt, mfccp);
			rt->mfc_stall = NULL;

			rt->mfc_expire = 0; /* Don't clean this guy up */
			nexpire[hash]--;

			/* free packets Qed at the end of this entry */
			for (; rte != NULL; rte = nrte) {
				nrte = rte->next;
				if (rte->ifp) {
					/* Forward the stalled packet now. */
					ip_mdq(rte->m, rte->ifp, rt);
				}
				m_freem(rte->m);
#ifdef UPCALL_TIMING
				collate(&rte->t);
#endif /* UPCALL_TIMING */
				free(rte, M_MRTABLE);
			}
		}
	}

	/*
	 * It is possible that an entry is being inserted without an upcall
	 */
	if (nstl == 0) {
		/*
		 * No mfc; make a new one
		 */
		if (mrtdebug & DEBUG_MFC)
			log(LOG_DEBUG, "add_mfc no upcall o %x g %x p %x\n",
			    ntohl(mfccp->mfcc_origin.s_addr),
			    ntohl(mfccp->mfcc_mcastgrp.s_addr),
			    mfccp->mfcc_parent);

		/* Reuse an existing (resolved) entry if present. */
		LIST_FOREACH(rt, &mfchashtbl[hash], mfc_hash) {
			if (in_hosteq(rt->mfc_origin, mfccp->mfcc_origin) &&
			    in_hosteq(rt->mfc_mcastgrp, mfccp->mfcc_mcastgrp)) {
				init_mfc_params(rt, mfccp);
				if (rt->mfc_expire)
					nexpire[hash]--;
				rt->mfc_expire = 0;
				break; /* XXX */
			}
		}
		if (rt == NULL) { /* no upcall, so make a new entry */
			rt = malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT);
			if (rt == NULL) {
				splx(s);
				return ENOBUFS;
			}

			init_mfc_params(rt, mfccp);
			rt->mfc_expire = 0;
			rt->mfc_stall = NULL;
			rt->mfc_bw_meter = NULL;

			/* insert new entry at head of hash chain */
			LIST_INSERT_HEAD(&mfchashtbl[hash], rt, mfc_hash);
		}
	}

	splx(s);
	return 0;
}

#ifdef UPCALL_TIMING
/*
 * Collect delay statistics on the upcalls: bucket the elapsed time
 * since *t into upcall_data[] (delta >> 10 microseconds, capped at 50).
 */
static void
collate(struct timeval *t)
{
	u_int32_t d;
	struct timeval tp;
	u_int32_t delta;

	microtime(&tp);

	if (timercmp(t, &tp, <)) {
		TV_DELTA(tp, *t, delta);

		d = delta >> 10;
		if (d > 50)
			d = 50;

		++upcall_data[d];
	}
}
#endif /* UPCALL_TIMING */

/*
 * Delete an mfc entry (MRT_DEL_MFC).
 */
static int
del_mfc(struct sockopt *sopt)
{
	struct mfcctl2 mfcctl2;
	struct mfcctl2 *mfccp;
	struct mfc *rt;
	int s;
	int error;

	/*
	 * XXX: for deleting MFC entries the information in entries
	 * of size "struct mfcctl" is sufficient.
1201 */ 1202 1203 mfccp = &mfcctl2; 1204 memset(&mfcctl2, 0, sizeof(mfcctl2)); 1205 1206 error = sockopt_get(sopt, mfccp, sizeof(struct mfcctl)); 1207 if (error) { 1208 /* Try with the size of mfcctl2. */ 1209 error = sockopt_get(sopt, mfccp, sizeof(struct mfcctl2)); 1210 if (error) 1211 return error; 1212 } 1213 1214 if (mrtdebug & DEBUG_MFC) 1215 log(LOG_DEBUG, "del_mfc origin %x mcastgrp %x\n", 1216 ntohl(mfccp->mfcc_origin.s_addr), 1217 ntohl(mfccp->mfcc_mcastgrp.s_addr)); 1218 1219 s = splsoftnet(); 1220 1221 rt = mfc_find(&mfccp->mfcc_origin, &mfccp->mfcc_mcastgrp); 1222 if (rt == NULL) { 1223 splx(s); 1224 return EADDRNOTAVAIL; 1225 } 1226 1227 /* 1228 * free the bw_meter entries 1229 */ 1230 free_bw_list(rt->mfc_bw_meter); 1231 rt->mfc_bw_meter = NULL; 1232 1233 LIST_REMOVE(rt, mfc_hash); 1234 free(rt, M_MRTABLE); 1235 1236 splx(s); 1237 return 0; 1238 } 1239 1240 static int 1241 socket_send(struct socket *s, struct mbuf *mm, struct sockaddr_in *src) 1242 { 1243 if (s) { 1244 if (sbappendaddr(&s->so_rcv, sintosa(src), mm, NULL) != 0) { 1245 sorwakeup(s); 1246 return 0; 1247 } 1248 soroverflow(s); 1249 } 1250 m_freem(mm); 1251 return -1; 1252 } 1253 1254 /* 1255 * IP multicast forwarding function. This function assumes that the packet 1256 * pointed to by "ip" has arrived on (or is about to be sent to) the interface 1257 * pointed to by "ifp", and the packet is to be relayed to other networks 1258 * that have members of the packet's destination IP multicast group. 1259 * 1260 * The packet is returned unscathed to the caller, unless it is 1261 * erroneous, in which case a non-zero return value tells the caller to 1262 * discard it. 
 */

#define IP_HDR_LEN  20	/* # bytes of fixed IP header (excluding options) */
#define TUNNEL_LEN  12	/* # bytes of IP option for tunnel encapsulation  */

int
ip_mforward(struct mbuf *m, struct ifnet *ifp)
{
	struct ip *ip = mtod(m, struct ip *);
	struct mfc *rt;
	static int srctun = 0;	/* rate-limits the source-route log message */
	struct mbuf *mm;
	struct sockaddr_in sin;
	int s;
	vifi_t vifi;

	if (mrtdebug & DEBUG_FORWARD)
		log(LOG_DEBUG, "ip_mforward: src %x, dst %x, ifp %p\n",
		    ntohl(ip->ip_src.s_addr), ntohl(ip->ip_dst.s_addr), ifp);

	/*
	 * XXX XXX: Why do we check [1] against IPOPT_LSRR? Because we
	 * expect [0] to be IPOPT_NOP, maybe? In all cases that doesn't
	 * make a lot of sense, a forged packet can just put two IPOPT_NOPs
	 * followed by one IPOPT_LSRR, and bypass the check.
	 */
	if (ip->ip_hl < (IP_HDR_LEN + TUNNEL_LEN) >> 2 ||
	    ((u_char *)(ip + 1))[1] != IPOPT_LSRR) {
		/*
		 * Packet arrived via a physical interface or
		 * an encapsulated tunnel or a register_vif.
		 */
	} else {
		/*
		 * Packet arrived through a source-route tunnel.
		 * Source-route tunnels are no longer supported.
		 */
		if ((srctun++ % 1000) == 0)
			log(LOG_ERR,
			    "ip_mforward: received source-routed packet from %x\n",
			    ntohl(ip->ip_src.s_addr));
		return EOPNOTSUPP;
	}

	/*
	 * Clear any in-bound checksum flags for this packet.
	 */
	m->m_pkthdr.csum_flags = 0;

	/*
	 * Don't forward a packet with time-to-live of zero or one,
	 * or a packet destined to a local-only group.
	 */
	if (ip->ip_ttl <= 1 || IN_LOCAL_GROUP(ip->ip_dst.s_addr))
		return 0;

	/*
	 * Determine forwarding vifs from the forwarding cache table
	 */
	s = splsoftnet();
	++mrtstat.mrts_mfc_lookups;
	rt = mfc_find(&ip->ip_src, &ip->ip_dst);

	/* Entry exists, so forward if necessary */
	if (rt != NULL) {
		splx(s);
		return ip_mdq(m, ifp, rt);
	} else {
		/*
		 * If we don't have a route for packet's origin, make a copy
		 * of the packet and send message to routing daemon.
		 */

		struct mbuf *mb0;
		struct rtdetq *rte;
		u_int32_t hash;
		const int hlen = ip->ip_hl << 2;
#ifdef UPCALL_TIMING
		struct timeval tp;
		microtime(&tp);
#endif

		++mrtstat.mrts_mfc_misses;

		mrtstat.mrts_no_route++;
		if (mrtdebug & (DEBUG_FORWARD | DEBUG_MFC))
			log(LOG_DEBUG, "ip_mforward: no rte s %x g %x\n",
			    ntohl(ip->ip_src.s_addr),
			    ntohl(ip->ip_dst.s_addr));

		/*
		 * Allocate mbufs early so that we don't do extra work if we are
		 * just going to fail anyway. Make sure to pullup the header so
		 * that other people can't step on it.
		 */
		rte = malloc(sizeof(*rte), M_MRTABLE, M_NOWAIT);
		if (rte == NULL) {
			splx(s);
			return ENOBUFS;
		}
		mb0 = m_copypacket(m, M_DONTWAIT);
		M_PULLUP(mb0, hlen);
		if (mb0 == NULL) {
			free(rte, M_MRTABLE);
			splx(s);
			return ENOBUFS;
		}

		/* is there an upcall waiting for this flow? */
		hash = MFCHASH(ip->ip_src, ip->ip_dst);
		LIST_FOREACH(rt, &mfchashtbl[hash], mfc_hash) {
			if (in_hosteq(ip->ip_src, rt->mfc_origin) &&
			    in_hosteq(ip->ip_dst, rt->mfc_mcastgrp) &&
			    rt->mfc_stall != NULL)
				break;
		}

		if (rt == NULL) {
			int i;
			struct igmpmsg *im;

			/*
			 * Locate the vifi for the incoming interface for
			 * this packet.
			 * If none found, drop packet.
			 */
			for (vifi = 0; vifi < numvifs &&
			    viftable[vifi].v_ifp != ifp; vifi++)
				;
			if (vifi >= numvifs)	/* vif not found, drop packet */
				goto non_fatal;

			/* no upcall, so make a new entry */
			rt = malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT);
			if (rt == NULL)
				goto fail;

			/*
			 * Make a copy of the header to send to the user level
			 * process
			 */
			mm = m_copym(m, 0, hlen, M_DONTWAIT);
			M_PULLUP(mm, hlen);
			if (mm == NULL)
				goto fail1;

			/*
			 * Send message to routing daemon to install
			 * a route into the kernel table
			 */

			im = mtod(mm, struct igmpmsg *);
			im->im_msgtype = IGMPMSG_NOCACHE;
			im->im_mbz = 0;
			im->im_vif = vifi;

			mrtstat.mrts_upcalls++;

			sockaddr_in_init(&sin, &ip->ip_src, 0);
			if (socket_send(ip_mrouter, mm, &sin) < 0) {
				log(LOG_WARNING,
				    "ip_mforward: ip_mrouter socket queue full\n");
				++mrtstat.mrts_upq_sockfull;
			/*
			 * Shared error unwind: fail1 frees the new mfc,
			 * fail frees the queue node and the packet copy.
			 * (socket_send already consumed "mm" on failure.)
			 */
			fail1:
				free(rt, M_MRTABLE);
			fail:
				free(rte, M_MRTABLE);
				m_freem(mb0);
				splx(s);
				return ENOBUFS;
			}

			/* insert new entry at head of hash chain */
			rt->mfc_origin = ip->ip_src;
			rt->mfc_mcastgrp = ip->ip_dst;
			rt->mfc_pkt_cnt = 0;
			rt->mfc_byte_cnt = 0;
			rt->mfc_wrong_if = 0;
			rt->mfc_expire = UPCALL_EXPIRE;
			nexpire[hash]++;
			for (i = 0; i < numvifs; i++) {
				rt->mfc_ttls[i] = 0;
				rt->mfc_flags[i] = 0;
			}
			rt->mfc_parent = -1;

			/* clear the RP address */
			rt->mfc_rp = zeroin_addr;

			rt->mfc_bw_meter = NULL;

			/* link into table */
			LIST_INSERT_HEAD(&mfchashtbl[hash], rt, mfc_hash);
			/* Add this entry to the end of the queue */
			rt->mfc_stall = rte;
		} else {
			/* determine if q has overflowed */
			struct rtdetq **p;
			int npkts = 0;

			/*
			 * XXX ouch! we need to append to the list, but we
			 * only have a pointer to the front, so we have to
			 * scan the entire list every time.
			 */
			for (p = &rt->mfc_stall; *p != NULL; p = &(*p)->next)
				if (++npkts > MAX_UPQ) {
					mrtstat.mrts_upq_ovflw++;
				/* Non-fatal drop: free copies, report success. */
				non_fatal:
					free(rte, M_MRTABLE);
					m_freem(mb0);
					splx(s);
					return 0;
				}

			/* Add this entry to the end of the queue */
			*p = rte;
		}

		rte->next = NULL;
		rte->m = mb0;
		rte->ifp = ifp;
#ifdef UPCALL_TIMING
		rte->t = tp;
#endif

		splx(s);

		return 0;
	}
}

/*
 * Periodic callout: age out stalled upcall entries that the routing
 * daemon never resolved, freeing their bw_meter lists, then re-arm
 * itself.  Runs with softnet_lock and the big kernel lock held.
 */
/*ARGSUSED*/
static void
expire_upcalls(void *v)
{
	int i;

	/* XXX NOMPSAFE still need softnet_lock */
	mutex_enter(softnet_lock);
	KERNEL_LOCK(1, NULL);

	for (i = 0; i < MFCTBLSIZ; i++) {
		struct mfc *rt, *nrt;

		/* nexpire[] counts pending-expiry entries per bucket. */
		if (nexpire[i] == 0)
			continue;

		for (rt = LIST_FIRST(&mfchashtbl[i]); rt; rt = nrt) {
			nrt = LIST_NEXT(rt, mfc_hash);

			if (rt->mfc_expire == 0 || --rt->mfc_expire > 0)
				continue;
			nexpire[i]--;

			/*
			 * free the bw_meter entries
			 */
			while (rt->mfc_bw_meter != NULL) {
				struct bw_meter *x = rt->mfc_bw_meter;

				rt->mfc_bw_meter = x->bm_mfc_next;
				kmem_intr_free(x, sizeof(*x));
			}

			++mrtstat.mrts_cache_cleanups;
			if (mrtdebug & DEBUG_EXPIRE)
				log(LOG_DEBUG,
				    "expire_upcalls: expiring (%x %x)\n",
				    ntohl(rt->mfc_origin.s_addr),
				    ntohl(rt->mfc_mcastgrp.s_addr));

			expire_mfc(rt);
		}
	}

	callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT,
	    expire_upcalls, NULL);

	KERNEL_UNLOCK_ONE(NULL);
	mutex_exit(softnet_lock);
}

/*
 * Macro to send packet on vif.
 */
#define MC_SEND(ip, vifp, m) do {				\
	if ((vifp)->v_flags & VIFF_TUNNEL)			\
		encap_send((ip), (vifp), (m));			\
	else							\
		phyint_send((ip), (vifp), (m));			\
} while (/*CONSTCOND*/ 0)

/*
 * Packet forwarding routine once entry in the cache is made
 */
static int
ip_mdq(struct mbuf *m, struct ifnet *ifp, struct mfc *rt)
{
	struct ip *ip = mtod(m, struct ip *);
	vifi_t vifi;
	struct vif *vifp;
	struct sockaddr_in sin;
	/* payload length = total length minus the IP header */
	const int plen = ntohs(ip->ip_len) - (ip->ip_hl << 2);

	/*
	 * Don't forward if it didn't arrive from the parent vif for its origin.
	 */
	vifi = rt->mfc_parent;
	if ((vifi >= numvifs) || (viftable[vifi].v_ifp != ifp)) {
		/* came in the wrong interface */
		if (mrtdebug & DEBUG_FORWARD)
			log(LOG_DEBUG, "wrong if: ifp %p vifi %d vififp %p\n",
			    ifp, vifi,
			    vifi >= numvifs ? 0 : viftable[vifi].v_ifp);
		++mrtstat.mrts_wrong_if;
		++rt->mfc_wrong_if;

		/*
		 * If we are doing PIM assert processing, send a message
		 * to the routing daemon.
		 *
		 * XXX: A PIM-SM router needs the WRONGVIF detection so it
		 * can complete the SPT switch, regardless of the type
		 * of the iif (broadcast media, GRE tunnel, etc).
		 */
		if (pim_assert && (vifi < numvifs) && viftable[vifi].v_ifp) {
			struct timeval now;
			u_int32_t delta;

#ifdef PIM
			if (ifp == &multicast_register_if)
				pimstat.pims_rcv_registers_wrongiif++;
#endif

			/* Get vifi for the incoming packet */
			for (vifi = 0;
			    vifi < numvifs && viftable[vifi].v_ifp != ifp;
			    vifi++)
				;
			if (vifi >= numvifs) {
				/* The iif is not found: ignore the packet. */
				return 0;
			}

			if (rt->mfc_flags[vifi] &
			    MRT_MFC_FLAGS_DISABLE_WRONGVIF) {
				/* WRONGVIF disabled: ignore the packet */
				return 0;
			}

			microtime(&now);

			TV_DELTA(rt->mfc_last_assert, now, delta);

			/* Rate-limit WRONGVIF upcalls to one per interval. */
			if (delta > ASSERT_MSG_TIME) {
				struct igmpmsg *im;
				const int hlen = ip->ip_hl << 2;
				struct mbuf *mm =
				    m_copym(m, 0, hlen, M_DONTWAIT);

				M_PULLUP(mm, hlen);
				if (mm == NULL)
					return ENOBUFS;

				rt->mfc_last_assert = now;

				im = mtod(mm, struct igmpmsg *);
				im->im_msgtype = IGMPMSG_WRONGVIF;
				im->im_mbz = 0;
				im->im_vif = vifi;

				mrtstat.mrts_upcalls++;

				sockaddr_in_init(&sin, &im->im_src, 0);
				if (socket_send(ip_mrouter, mm, &sin) < 0) {
					log(LOG_WARNING,
					    "ip_mforward: ip_mrouter socket queue full\n");
					++mrtstat.mrts_upq_sockfull;
					return ENOBUFS;
				}
			}
		}
		return 0;
	}

	/* If I sourced this packet, it counts as output, else it was input. */
	if (in_hosteq(ip->ip_src, viftable[vifi].v_lcl_addr)) {
		viftable[vifi].v_pkt_out++;
		viftable[vifi].v_bytes_out += plen;
	} else {
		viftable[vifi].v_pkt_in++;
		viftable[vifi].v_bytes_in += plen;
	}
	rt->mfc_pkt_cnt++;
	rt->mfc_byte_cnt += plen;

	/*
	 * For each vif, decide if a copy of the packet should be forwarded.
	 * Forward if:
	 *	- the ttl exceeds the vif's threshold
	 *	- there are group members downstream on interface
	 */
	for (vifp = viftable, vifi = 0; vifi < numvifs; vifp++, vifi++) {
		if ((rt->mfc_ttls[vifi] > 0) &&
		    (ip->ip_ttl > rt->mfc_ttls[vifi])) {
			vifp->v_pkt_out++;
			vifp->v_bytes_out += plen;
#ifdef PIM
			if (vifp->v_flags & VIFF_REGISTER)
				pim_register_send(ip, vifp, m, rt);
			else
#endif
			MC_SEND(ip, vifp, m);
		}
	}

	/*
	 * Perform upcall-related bw measuring.
	 */
	if (rt->mfc_bw_meter != NULL) {
		struct bw_meter *x;
		struct timeval now;

		microtime(&now);
		for (x = rt->mfc_bw_meter; x != NULL; x = x->bm_mfc_next)
			bw_meter_receive_packet(x, plen, &now);
	}

	return 0;
}

/*
 * Transmit a copy of the packet on a physical-interface vif, subject
 * to the vif's token-bucket rate limit.  The original mbuf is left
 * untouched; on allocation failure the copy is silently dropped.
 */
static void
phyint_send(struct ip *ip, struct vif *vifp, struct mbuf *m)
{
	struct mbuf *mb_copy;
	const int hlen = ip->ip_hl << 2;

	/*
	 * Make a new reference to the packet; make sure that
	 * the IP header is actually copied, not just referenced,
	 * so that ip_output() only scribbles on the copy.
	 */
	mb_copy = m_copypacket(m, M_DONTWAIT);
	M_PULLUP(mb_copy, hlen);
	if (mb_copy == NULL)
		return;

	if (vifp->v_rate_limit <= 0)
		tbf_send_packet(vifp, mb_copy);
	else
		tbf_control(vifp, mb_copy, mtod(mb_copy, struct ip *),
		    ntohs(ip->ip_len));
}

/*
 * Transmit a copy of the packet on a tunnel vif: prepend an outer
 * IP-in-IP header addressed to the tunnel peer and hand the result
 * to the token-bucket filter.  The original mbuf is left untouched.
 */
static void
encap_send(struct ip *ip, struct vif *vifp, struct mbuf *m)
{
	struct mbuf *mb_copy;
	struct ip *ip_copy;
	int i, len = ntohs(ip->ip_len) + sizeof(multicast_encap_iphdr);

	/* Take care of delayed checksums */
	if (m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) {
		in_undefer_cksum_tcpudp(m);
		m->m_pkthdr.csum_flags &= ~(M_CSUM_TCPv4|M_CSUM_UDPv4);
	}

	/*
	 * copy the old packet & pullup its IP header into the
	 * new mbuf so we can modify it.  Try to fill the new
	 * mbuf since if we don't the ethernet driver will.
	 */
	MGETHDR(mb_copy, M_DONTWAIT, MT_DATA);
	if (mb_copy == NULL)
		return;
	mb_copy->m_data += max_linkhdr;
	mb_copy->m_pkthdr.len = len;
	mb_copy->m_len = sizeof(multicast_encap_iphdr);

	if ((mb_copy->m_next = m_copypacket(m, M_DONTWAIT)) == NULL) {
		m_freem(mb_copy);
		return;
	}
	i = MHLEN - max_linkhdr;
	if (i > len)
		i = len;
	mb_copy = m_pullup(mb_copy, i);
	if (mb_copy == NULL)
		return;

	/*
	 * fill in the encapsulating IP header.
	 */
	ip_copy = mtod(mb_copy, struct ip *);
	*ip_copy = multicast_encap_iphdr;
	if (len < IP_MINFRAGSIZE)
		ip_copy->ip_id = 0;
	else
		ip_copy->ip_id = ip_newid(NULL);
	ip_copy->ip_len = htons(len);
	ip_copy->ip_src = vifp->v_lcl_addr;
	ip_copy->ip_dst = vifp->v_rmt_addr;

	/*
	 * turn the encapsulated IP header back into a valid one.
	 * Temporarily advance m_data past the outer header so that
	 * in_cksum() sums only the inner header, then restore it.
	 */
	ip = (struct ip *)((char *)ip_copy + sizeof(multicast_encap_iphdr));
	--ip->ip_ttl;
	ip->ip_sum = 0;
	mb_copy->m_data += sizeof(multicast_encap_iphdr);
	ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2);
	mb_copy->m_data -= sizeof(multicast_encap_iphdr);

	if (vifp->v_rate_limit <= 0)
		tbf_send_packet(vifp, mb_copy);
	else
		tbf_control(vifp, mb_copy, ip, ntohs(ip_copy->ip_len));
}

/*
 * De-encapsulate a packet and feed it back through ip input.
 */
static void
vif_input(struct mbuf *m, int off, int proto, void *eparg)
{
	struct vif *vifp = eparg;

	KASSERT(vifp != NULL);

	if (proto != ENCAP_PROTO) {
		m_freem(m);
		mrtstat.mrts_bad_tunnel++;
		return;
	}

	/* Strip the outer header and re-queue on the IP input path. */
	m_adj(m, off);
	m_set_rcvif(m, vifp->v_ifp);

	if (__predict_false(!pktq_enqueue(ip_pktq, m, 0))) {
		m_freem(m);
	}
}

/*
 * Check if the packet should be received on the vif denoted by arg.
 * (The encap selection code will call this once per vif since each is
 * registered separately.)
 */
static int
vif_encapcheck(struct mbuf *m, int off, int proto, void *arg)
{
	struct vif *vifp;
	struct ip ip;

#ifdef DIAGNOSTIC
	if (!arg || proto != IPPROTO_IPV4)
		panic("unexpected arg in vif_encapcheck");
#endif

	/*
	 * Accept the packet only if the inner header is multicast
	 * and the outer header matches a tunnel-mode vif. Order
	 * checks in the hope that common non-matching packets will be
	 * rejected quickly. Assume that unicast IPv4 traffic in a
	 * parallel tunnel (e.g. gif(4)) is unlikely.
	 */

	/* Obtain the outer IP header and the vif pointer. */
	m_copydata(m, 0, sizeof(ip), (void *)&ip);
	vifp = (struct vif *)arg;

	/*
	 * The outer source must match the vif's remote peer address.
	 * For a multicast router with several tunnels, this is the
	 * only check that will fail on packets in other tunnels,
	 * assuming the local address is the same.
	 */
	if (!in_hosteq(vifp->v_rmt_addr, ip.ip_src))
		return 0;

	/* The outer destination must match the vif's local address. */
	if (!in_hosteq(vifp->v_lcl_addr, ip.ip_dst))
		return 0;

	/* The vif must be of tunnel type. */
	if ((vifp->v_flags & VIFF_TUNNEL) == 0)
		return 0;

	/* Check that the inner destination is multicast. */
	if (off + sizeof(ip) > m->m_pkthdr.len)
		return 0;
	m_copydata(m, off, sizeof(ip), (void *)&ip);
	if (!IN_MULTICAST(ip.ip_dst.s_addr))
		return 0;

	/*
	 * We have checked that both the outer src and dst addresses
	 * match the vif, and that the inner destination is multicast
	 * (224/5). By claiming more than 64, we intend to
	 * preferentially take packets that also match a parallel
	 * gif(4).
	 */
	return 32 + 32 + 5;
}

/*
 * Token bucket filter module
 *
 * Entry point: enforce the vif's rate limit on one outgoing packet.
 * "len" is the packet length in bytes; packets larger than the bucket
 * can ever hold are dropped outright.
 */
static void
tbf_control(struct vif *vifp, struct mbuf *m, struct ip *ip, u_int32_t len)
{

	if (len > MAX_BKT_SIZE) {
		/* drop if packet is too large */
		mrtstat.mrts_pkt2large++;
		m_freem(m);
		return;
	}

	tbf_update_tokens(vifp);

	/*
	 * If there are enough tokens, and the queue is empty, send this packet
	 * out immediately. Otherwise, try to insert it on this vif's queue.
	 */
	if (vifp->tbf_q_len == 0) {
		if (len <= vifp->tbf_n_tok) {
			vifp->tbf_n_tok -= len;
			tbf_send_packet(vifp, m);
		} else {
			/* queue packet and timeout till later */
			tbf_queue(vifp, m);
			callout_reset(&vifp->v_repq_ch, TBF_REPROCESS,
			    tbf_reprocess_q, vifp);
		}
	} else {
		if (vifp->tbf_q_len >= vifp->tbf_max_q_len &&
		    !tbf_dq_sel(vifp, ip)) {
			/* queue full, and couldn't make room */
			mrtstat.mrts_q_overflow++;
			m_freem(m);
		} else {
			/* queue length low enough, or made room */
			tbf_queue(vifp, m);
			tbf_process_q(vifp);
		}
	}
}

/*
 * adds a packet to the queue at the interface
 *
 * The queue is a singly-linked list of mbufs threaded through
 * m_nextpkt; tbf_t always points at the tail's next-pointer so the
 * append is O(1).
 */
static void
tbf_queue(struct vif *vifp, struct mbuf *m)
{
	int s = splsoftnet();

	/* insert at tail */
	*vifp->tbf_t = m;
	vifp->tbf_t = &m->m_nextpkt;
	vifp->tbf_q_len++;

	splx(s);
}

/*
 * processes the queue at the interface
 */
static void
tbf_process_q(struct vif *vifp)
{
	struct mbuf *m;
	int len;
	int s = splsoftnet();

	/*
	 * Loop through the queue at the interface and send as many packets
	 * as possible.
	 */
	for (m = vifp->tbf_q; m != NULL; m = vifp->tbf_q) {
		len = ntohs(mtod(m, struct ip *)->ip_len);

		/* determine if the packet can be sent */
		if (len <= vifp->tbf_n_tok) {
			/* if so,
			 * reduce no of tokens, dequeue the packet,
			 * send the packet.
			 */
			/* keep the tail pointer valid when the list empties */
			if ((vifp->tbf_q = m->m_nextpkt) == NULL)
				vifp->tbf_t = &vifp->tbf_q;
			--vifp->tbf_q_len;

			m->m_nextpkt = NULL;
			vifp->tbf_n_tok -= len;
			tbf_send_packet(vifp, m);
		} else
			break;
	}
	splx(s);
}

/*
 * Callout handler: refill tokens, drain what we can, and re-arm
 * ourselves while packets remain queued.  Stops once the multicast
 * router has been shut down.
 */
static void
tbf_reprocess_q(void *arg)
{
	struct vif *vifp = arg;

	if (ip_mrouter == NULL)
		return;

	tbf_update_tokens(vifp);
	tbf_process_q(vifp);

	if (vifp->tbf_q_len != 0)
		callout_reset(&vifp->v_repq_ch, TBF_REPROCESS,
		    tbf_reprocess_q, vifp);
}

/* function that will selectively discard a member of the queue
 * based on the precedence value and the priority
 *
 * Returns 1 if a lower-priority packet was dropped to make room for
 * the packet described by "ip", 0 if no such victim was found.
 */
static int
tbf_dq_sel(struct vif *vifp, struct ip *ip)
{
	u_int p;
	struct mbuf **mp, *m;
	int s = splsoftnet();

	p = priority(vifp, ip);

	for (mp = &vifp->tbf_q, m = *mp;
	    m != NULL;
	    mp = &m->m_nextpkt, m = *mp) {
		if (p > priority(vifp, mtod(m, struct ip *))) {
			/* unlink the victim; fix tail pointer if it was last */
			if ((*mp = m->m_nextpkt) == NULL)
				vifp->tbf_t = mp;
			--vifp->tbf_q_len;

			m_freem(m);
			mrtstat.mrts_drop_sel++;
			splx(s);
			return 1;
		}
	}
	splx(s);
	return 0;
}

/*
 * Actually transmit one packet on the vif: through ip_output() with
 * the vif's cached route for tunnels, or as a multicast with TTL
 * decremented for physical interfaces.  Consumes the mbuf.
 */
static void
tbf_send_packet(struct vif *vifp, struct mbuf *m)
{
	int error;
	int s = splsoftnet();

	if (vifp->v_flags & VIFF_TUNNEL) {
		/* If tunnel options */
		ip_output(m, NULL, &vifp->v_route, IP_FORWARDING, NULL, NULL);
	} else {
		/* if physical interface option, extract the options and then send */
		struct ip_moptions imo;

		imo.imo_multicast_if_index = if_get_index(vifp->v_ifp);
		imo.imo_multicast_ttl = mtod(m, struct ip *)->ip_ttl - 1;
		imo.imo_multicast_loop = 1;

		error = ip_output(m, NULL, NULL, IP_FORWARDING|IP_MULTICASTOPTS,
		    &imo, NULL);

		if (mrtdebug & DEBUG_XMIT)
			log(LOG_DEBUG, "phyint_send on vif %ld err %d\n",
			    (long)(vifp - viftable), error);
	}
	splx(s);
}

/* determine the current time and then
 * the elapsed time (between the last time and time now)
 * in milliseconds & update the no. of tokens in the bucket
 */
static void
tbf_update_tokens(struct vif *vifp)
{
	struct timeval tp;
	u_int32_t tm;
	int s = splsoftnet();

	microtime(&tp);

	TV_DELTA(tp, vifp->tbf_last_pkt_t, tm);

	/*
	 * This formula is actually
	 * "time in seconds" * "bytes/second".
	 *
	 * (tm / 1000000) * (v_rate_limit * 1000 * (1000/1024) / 8)
	 *
	 * The (1000/1024) was introduced in add_vif to optimize
	 * this divide into a shift.
	 */
	vifp->tbf_n_tok += tm * vifp->v_rate_limit / 8192;
	vifp->tbf_last_pkt_t = tp;

	/* cap the bucket at its maximum capacity */
	if (vifp->tbf_n_tok > MAX_BKT_SIZE)
		vifp->tbf_n_tok = MAX_BKT_SIZE;

	splx(s);
}

/*
 * Classify a packet for tbf_dq_sel(): higher return value means
 * higher drop-priority protection.
 */
static int
priority(struct vif *vifp, struct ip *ip)
{
	int prio = 50;	/* the lowest priority -- default case */

	/* temporary hack; may add general packet classifier some day */

	/*
	 * XXX XXX: We're reading the UDP header, but we didn't ensure
	 * it was present in the packet.
	 */

	/*
	 * The UDP port space is divided up into four priority ranges:
	 * [0, 16384)     : unclassified - lowest priority
	 * [16384, 32768) : audio - highest priority
	 * [32768, 49152) : whiteboard - medium priority
	 * [49152, 65536) : video - low priority
	 */
	if (ip->ip_p == IPPROTO_UDP) {
		struct udphdr *udp = (struct udphdr *)(((char *)ip) + (ip->ip_hl << 2));

		switch (ntohs(udp->uh_dport) & 0xc000) {
		case 0x4000:
			prio = 70;
			break;
		case 0x8000:
			prio = 60;
			break;
		case 0xc000:
			prio = 55;
			break;
		}

		if (tbfdebug > 1)
			log(LOG_DEBUG, "port %x prio %d\n",
			    ntohs(udp->uh_dport), prio);
	}

	return prio;
}

/*
 * Code for bandwidth monitors
 */

/*
 * Define common interface for timeval-related methods
 */
#define BW_TIMEVALCMP(tvp, uvp, cmp) timercmp((tvp), (uvp), cmp)
#define BW_TIMEVALDECR(vvp, uvp) timersub((vvp), (uvp), (vvp))
#define BW_TIMEVALADD(vvp, uvp) timeradd((vvp), (uvp), (vvp))

/*
 * Translate the user-visible BW_UPCALL_* request flags into the
 * kernel-internal BW_METER_* flag bits.
 */
static uint32_t
compute_bw_meter_flags(struct bw_upcall *req)
{
	uint32_t flags = 0;

	if (req->bu_flags & BW_UPCALL_UNIT_PACKETS)
		flags |= BW_METER_UNIT_PACKETS;
	if (req->bu_flags & BW_UPCALL_UNIT_BYTES)
		flags |= BW_METER_UNIT_BYTES;
	if (req->bu_flags & BW_UPCALL_GEQ)
		flags |= BW_METER_GEQ;
	if (req->bu_flags & BW_UPCALL_LEQ)
		flags |= BW_METER_LEQ;

	return flags;
}

/*
 * Add a bw_meter entry
 */
static int
add_bw_upcall(struct bw_upcall *req)
{
	int s;
	struct mfc *mfc;
	struct timeval delta = { BW_UPCALL_THRESHOLD_INTERVAL_MIN_SEC,
	    BW_UPCALL_THRESHOLD_INTERVAL_MIN_USEC };
	struct timeval now;
	struct bw_meter *x;
	uint32_t flags;

	if (!(mrt_api_config & MRT_MFC_BW_UPCALL))
		return EOPNOTSUPP;

	/*
	 * Test if the flags are valid: exactly one unit (packets/bytes
	 * may be combined) and exactly one of GEQ/LEQ must be given.
	 */
	if (!(req->bu_flags & (BW_UPCALL_UNIT_PACKETS | BW_UPCALL_UNIT_BYTES)))
		return EINVAL;
	if (!(req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ)))
		return EINVAL;
	if ((req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ))
	    == (BW_UPCALL_GEQ | BW_UPCALL_LEQ))
		return EINVAL;

	/* Test if the threshold time interval is valid */
	if (BW_TIMEVALCMP(&req->bu_threshold.b_time, &delta, <))
		return EINVAL;

	flags = compute_bw_meter_flags(req);

	/*
	 * Find if we have already same bw_meter entry
	 */
	s = splsoftnet();
	mfc = mfc_find(&req->bu_src, &req->bu_dst);
	if (mfc == NULL) {
		splx(s);
		return EADDRNOTAVAIL;
	}
	for (x = mfc->mfc_bw_meter; x != NULL; x = x->bm_mfc_next) {
		if ((BW_TIMEVALCMP(&x->bm_threshold.b_time,
		    &req->bu_threshold.b_time, ==)) &&
		    (x->bm_threshold.b_packets == req->bu_threshold.b_packets) &&
		    (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) &&
		    (x->bm_flags & BW_METER_USER_FLAGS) == flags) {
			splx(s);
			return 0;	/* XXX Already installed */
		}
	}

	/* Allocate the new bw_meter entry */
	x = kmem_intr_alloc(sizeof(*x), KM_NOSLEEP);
	if (x == NULL) {
		splx(s);
		return ENOBUFS;
	}

	/* Set the new bw_meter entry */
	x->bm_threshold.b_time = req->bu_threshold.b_time;
	microtime(&now);
	x->bm_start_time = now;
	x->bm_threshold.b_packets = req->bu_threshold.b_packets;
	x->bm_threshold.b_bytes = req->bu_threshold.b_bytes;
	x->bm_measured.b_packets = 0;
	x->bm_measured.b_bytes = 0;
	x->bm_flags = flags;
	x->bm_time_next = NULL;
	/* BW_METER_BUCKETS == "not in any timer hash bucket" sentinel */
	x->bm_time_hash = BW_METER_BUCKETS;

	/* Add the new bw_meter entry to the front of entries for this MFC */
	x->bm_mfc = mfc;
	x->bm_mfc_next = mfc->mfc_bw_meter;
	mfc->mfc_bw_meter = x;
	schedule_bw_meter(x, &now);
	splx(s);

	return 0;
}

/*
 * Unschedule and free every bw_meter entry on "list" (linked through
 * bm_mfc_next).
 */
static void
free_bw_list(struct bw_meter *list)
{
	while (list != NULL) {
		struct bw_meter *x = list;

		list = list->bm_mfc_next;
		unschedule_bw_meter(x);
		kmem_intr_free(x, sizeof(*x));
	}
}

/*
 * Delete one or multiple bw_meter entries
 */
static int
del_bw_upcall(struct bw_upcall *req)
{
	int s;
	struct mfc *mfc;
	struct bw_meter *x;

	if (!(mrt_api_config & MRT_MFC_BW_UPCALL))
		return EOPNOTSUPP;

	s = splsoftnet();
	/* Find the corresponding MFC entry */
	mfc = mfc_find(&req->bu_src, &req->bu_dst);
	if (mfc == NULL) {
		splx(s);
		return EADDRNOTAVAIL;
	} else if (req->bu_flags & BW_UPCALL_DELETE_ALL) {
		/*
		 * Delete all bw_meter entries for this mfc
		 */
		struct bw_meter *list;

		list = mfc->mfc_bw_meter;
		mfc->mfc_bw_meter = NULL;
		free_bw_list(list);
		splx(s);
		return 0;
	} else {	/* Delete a single bw_meter entry */
		struct bw_meter *prev;
		uint32_t flags = 0;

		flags = compute_bw_meter_flags(req);

		/* Find the bw_meter entry to delete */
		for (prev = NULL, x = mfc->mfc_bw_meter; x != NULL;
		    prev = x, x = x->bm_mfc_next) {
			if ((BW_TIMEVALCMP(&x->bm_threshold.b_time,
			    &req->bu_threshold.b_time, ==)) &&
			    (x->bm_threshold.b_packets == req->bu_threshold.b_packets) &&
			    (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) &&
			    (x->bm_flags & BW_METER_USER_FLAGS) == flags)
				break;
		}
		if (x != NULL) { /* Delete entry from the list for this MFC */
			if (prev != NULL)
				prev->bm_mfc_next = x->bm_mfc_next;	/* remove from middle*/
			else
				x->bm_mfc->mfc_bw_meter = x->bm_mfc_next;/* new head of list */

			unschedule_bw_meter(x);
			splx(s);
			/* Free the bw_meter entry */
			kmem_intr_free(x, sizeof(*x));
			return 0;
		} else {
			splx(s);
			return EINVAL;
		}
	}
	/* NOTREACHED */
}

/*
 * Perform
bandwidth measurement processing that may result in an upcall
 */
static void
bw_meter_receive_packet(struct bw_meter *x, int plen, struct timeval *nowp)
{
	struct timeval delta;

	/* delta = time elapsed since the start of the measuring interval */
	delta = *nowp;
	BW_TIMEVALDECR(&delta, &x->bm_start_time);

	if (x->bm_flags & BW_METER_GEQ) {
		/*
		 * Processing for ">=" type of bw_meter entry
		 */
		if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) {
			/* Reset the bw_meter entry */
			x->bm_start_time = *nowp;
			x->bm_measured.b_packets = 0;
			x->bm_measured.b_bytes = 0;
			x->bm_flags &= ~BW_METER_UPCALL_DELIVERED;
		}

		/* Record that a packet is received */
		x->bm_measured.b_packets++;
		x->bm_measured.b_bytes += plen;

		/*
		 * Test if we should deliver an upcall
		 * (at most one upcall per measuring interval)
		 */
		if (!(x->bm_flags & BW_METER_UPCALL_DELIVERED)) {
			if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
			    (x->bm_measured.b_packets >= x->bm_threshold.b_packets)) ||
			    ((x->bm_flags & BW_METER_UNIT_BYTES) &&
			    (x->bm_measured.b_bytes >= x->bm_threshold.b_bytes))) {
				/* Prepare an upcall for delivery */
				bw_meter_prepare_upcall(x, nowp);
				x->bm_flags |= BW_METER_UPCALL_DELIVERED;
			}
		}
	} else if (x->bm_flags & BW_METER_LEQ) {
		/*
		 * Processing for "<=" type of bw_meter entry
		 */
		if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) {
			/*
			 * We are behind time with the multicast forwarding table
			 * scanning for "<=" type of bw_meter entries, so test now
			 * if we should deliver an upcall.
			 */
			if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
			    (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) ||
			    ((x->bm_flags & BW_METER_UNIT_BYTES) &&
			    (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) {
				/* Prepare an upcall for delivery */
				bw_meter_prepare_upcall(x, nowp);
			}
			/* Reschedule the bw_meter entry */
			unschedule_bw_meter(x);
			schedule_bw_meter(x, nowp);
		}

		/* Record that a packet is received */
		x->bm_measured.b_packets++;
		x->bm_measured.b_bytes += plen;

		/*
		 * Test if we should restart the measuring interval
		 */
		if ((x->bm_flags & BW_METER_UNIT_PACKETS &&
		    x->bm_measured.b_packets <= x->bm_threshold.b_packets) ||
		    (x->bm_flags & BW_METER_UNIT_BYTES &&
		    x->bm_measured.b_bytes <= x->bm_threshold.b_bytes)) {
			/* Don't restart the measuring interval */
		} else {
			/* Do restart the measuring interval */
			/*
			 * XXX: note that we don't unschedule and schedule, because this
			 * might be too much overhead per packet. Instead, when we process
			 * all entries for a given timer hash bin, we check whether it is
			 * really a timeout. If not, we reschedule at that time.
			 */
			x->bm_start_time = *nowp;
			x->bm_measured.b_packets = 0;
			x->bm_measured.b_bytes = 0;
			x->bm_flags &= ~BW_METER_UPCALL_DELIVERED;
		}
	}
}

/*
 * Prepare a bandwidth-related upcall
 *
 * Fills the next slot of the global bw_upcalls[] array from meter "x",
 * flushing the pending upcalls to the daemon first if the array is full.
 */
static void
bw_meter_prepare_upcall(struct bw_meter *x, struct timeval *nowp)
{
	struct timeval delta;
	struct bw_upcall *u;

	/*
	 * Compute the measured time interval
	 */
	delta = *nowp;
	BW_TIMEVALDECR(&delta, &x->bm_start_time);

	/*
	 * If there are too many pending upcalls, deliver them now
	 */
	if (bw_upcalls_n >= BW_UPCALLS_MAX)
		bw_upcalls_send();

	/*
	 * Set the bw_upcall entry
	 */
	u = &bw_upcalls[bw_upcalls_n++];
	u->bu_src = x->bm_mfc->mfc_origin;
	u->bu_dst = x->bm_mfc->mfc_mcastgrp;
	u->bu_threshold.b_time = x->bm_threshold.b_time;
	u->bu_threshold.b_packets = x->bm_threshold.b_packets;
	u->bu_threshold.b_bytes = x->bm_threshold.b_bytes;
	u->bu_measured.b_time = delta;
	u->bu_measured.b_packets = x->bm_measured.b_packets;
	u->bu_measured.b_bytes = x->bm_measured.b_bytes;
	u->bu_flags = 0;
	if (x->bm_flags & BW_METER_UNIT_PACKETS)
		u->bu_flags |= BW_UPCALL_UNIT_PACKETS;
	if (x->bm_flags & BW_METER_UNIT_BYTES)
		u->bu_flags |= BW_UPCALL_UNIT_BYTES;
	if (x->bm_flags & BW_METER_GEQ)
		u->bu_flags |= BW_UPCALL_GEQ;
	if (x->bm_flags & BW_METER_LEQ)
		u->bu_flags |= BW_UPCALL_LEQ;
}

/*
 * Send the pending bandwidth-related upcalls
 */
static void
bw_upcalls_send(void)
{
	struct mbuf *m;
	int len = bw_upcalls_n * sizeof(bw_upcalls[0]);
	struct sockaddr_in k_igmpsrc = {
		.sin_len = sizeof(k_igmpsrc),
		.sin_family = AF_INET,
	};
	static struct igmpmsg igmpmsg = {
		0,		/* unused1 */
		0,		/* unused2 */
		IGMPMSG_BW_UPCALL,/* im_msgtype */
		0,		/* im_mbz */
		0,		/* im_vif */
		0,		/* unused3 */
		{ 0 },		/* im_src */
		{ 0 }		/* im_dst */
	};

	if (bw_upcalls_n == 0)
		return;			/* No pending upcalls */

	bw_upcalls_n = 0;

	/*
	 * Allocate a new mbuf, initialize it with the header and
	 * the payload for the pending calls.
	 */
	MGETHDR(m, M_DONTWAIT, MT_HEADER);
	if (m == NULL) {
		log(LOG_WARNING, "bw_upcalls_send: cannot allocate mbuf\n");
		return;
	}

	m->m_len = m->m_pkthdr.len = 0;
	m_copyback(m, 0, sizeof(struct igmpmsg), (void *)&igmpmsg);
	m_copyback(m, sizeof(struct igmpmsg), len, (void *)&bw_upcalls[0]);

	/*
	 * Send the upcalls
	 * XXX do we need to set the address in k_igmpsrc ?
	 */
	mrtstat.mrts_upcalls++;
	if (socket_send(ip_mrouter, m, &k_igmpsrc) < 0) {
		log(LOG_WARNING, "bw_upcalls_send: ip_mrouter socket queue full\n");
		++mrtstat.mrts_upq_sockfull;
	}
}

/*
 * Compute the timeout hash value for the bw_meter entries
 */
#define BW_METER_TIMEHASH(bw_meter, hash)				\
    do {								\
	struct timeval next_timeval = (bw_meter)->bm_start_time;	\
	BW_TIMEVALADD(&next_timeval, &(bw_meter)->bm_threshold.b_time);	\
	(hash) = next_timeval.tv_sec;					\
	if (next_timeval.tv_usec)					\
		(hash)++; /* XXX: make sure we don't timeout early */	\
	(hash) %= BW_METER_BUCKETS;					\
    } while (/*CONSTCOND*/ 0)

/*
 * Schedule a timer to process periodically bw_meter entry of type "<="
 * by linking the entry in the proper hash bucket.
 */
static void
schedule_bw_meter(struct bw_meter *x, struct timeval *nowp)
{
	int time_hash;

	if (!(x->bm_flags & BW_METER_LEQ))
		return;		/* XXX: we schedule timers only for "<=" entries */

	/*
	 * Reset the bw_meter entry: scheduling always starts a fresh
	 * measuring interval at time "nowp".
	 */
	x->bm_start_time = *nowp;
	x->bm_measured.b_packets = 0;
	x->bm_measured.b_bytes = 0;
	x->bm_flags &= ~BW_METER_UPCALL_DELIVERED;

	/*
	 * Compute the timeout hash value and insert the entry
	 * at the head of the bucket's singly-linked list.
	 */
	BW_METER_TIMEHASH(x, time_hash);
	x->bm_time_next = bw_meter_timers[time_hash];
	bw_meter_timers[time_hash] = x;
	x->bm_time_hash = time_hash;
}

/*
 * Unschedule the periodic timer that processes bw_meter entry of type "<="
 * by removing the entry from the proper hash bucket.
 */
static void
unschedule_bw_meter(struct bw_meter *x)
{
	int time_hash;
	struct bw_meter *prev, *tmp;

	if (!(x->bm_flags & BW_METER_LEQ))
		return;		/* XXX: we schedule timers only for "<=" entries */

	/*
	 * The entry remembers its own bucket in bm_time_hash;
	 * bm_time_hash == BW_METER_BUCKETS marks "not scheduled".
	 */
	time_hash = x->bm_time_hash;
	if (time_hash >= BW_METER_BUCKETS)
		return;		/* Entry was not scheduled */

	/* Walk the bucket list to find the entry and its predecessor. */
	for (prev = NULL, tmp = bw_meter_timers[time_hash];
	    tmp != NULL; prev = tmp, tmp = tmp->bm_time_next)
		if (tmp == x)
			break;

	/* A scheduled entry must be on its bucket's list. */
	if (tmp == NULL)
		panic("unschedule_bw_meter: bw_meter entry not found");

	if (prev != NULL)
		prev->bm_time_next = x->bm_time_next;
	else
		bw_meter_timers[time_hash] = x->bm_time_next;

	x->bm_time_next = NULL;
	x->bm_time_hash = BW_METER_BUCKETS;	/* mark unscheduled */
}

/*
 * Process all "<=" type of bw_meter that should be processed now,
 * and for each entry prepare an upcall if necessary. Each processed
 * entry is rescheduled again for the (periodic) processing.
 *
 * This is run periodically (once per second normally). On each round,
 * all the potentially matching entries are in the hash slot that we are
 * looking at.
 */
static void
bw_meter_process(void)
{
	int s;
	static uint32_t last_tv_sec;	/* last time we processed this */

	uint32_t loops;
	int i;
	struct timeval now, process_endtime;

	microtime(&now);
	if (last_tv_sec == now.tv_sec)
		return;		/* nothing to do */

	/*
	 * One bucket per elapsed second; if we have been delayed for more
	 * than a full wheel revolution, one pass over all buckets suffices.
	 */
	loops = now.tv_sec - last_tv_sec;
	last_tv_sec = now.tv_sec;
	if (loops > BW_METER_BUCKETS)
		loops = BW_METER_BUCKETS;

	s = splsoftnet();
	/*
	 * Process all bins of bw_meter entries from the one after the last
	 * processed to the current one. On entry, i points to the last bucket
	 * visited, so we need to increment i at the beginning of the loop.
	 */
	for (i = (now.tv_sec - loops) % BW_METER_BUCKETS; loops > 0; loops--) {
		struct bw_meter *x, *tmp_list;

		if (++i >= BW_METER_BUCKETS)
			i = 0;

		/*
		 * Disconnect the list of bw_meter entries from the bin;
		 * processed entries are re-linked (possibly into other bins)
		 * one at a time below.
		 */
		tmp_list = bw_meter_timers[i];
		bw_meter_timers[i] = NULL;

		/* Process the list of bw_meter entries */
		while (tmp_list != NULL) {
			x = tmp_list;
			tmp_list = tmp_list->bm_time_next;

			/* Test if the time interval is over */
			process_endtime = x->bm_start_time;
			BW_TIMEVALADD(&process_endtime, &x->bm_threshold.b_time);
			if (BW_TIMEVALCMP(&process_endtime, &now, >)) {
				/* Not yet: reschedule, but don't reset */
				int time_hash;

				BW_METER_TIMEHASH(x, time_hash);
				if (time_hash == i && process_endtime.tv_sec == now.tv_sec) {
					/*
					 * XXX: somehow the bin processing is a bit ahead of time.
					 * Put the entry in the next bin.
					 */
					if (++time_hash >= BW_METER_BUCKETS)
						time_hash = 0;
				}
				x->bm_time_next = bw_meter_timers[time_hash];
				bw_meter_timers[time_hash] = x;
				x->bm_time_hash = time_hash;

				continue;
			}

			/*
			 * Test if we should deliver an upcall:
			 * for "<=" entries, the upcall fires when the
			 * measured traffic stayed at or below the threshold
			 * for the whole interval.
			 */
			if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
			    (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) ||
			    ((x->bm_flags & BW_METER_UNIT_BYTES) &&
			    (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) {
				/* Prepare an upcall for delivery */
				bw_meter_prepare_upcall(x, &now);
			}

			/*
			 * Reschedule for next processing
			 * (also resets the entry's counters and start time).
			 */
			schedule_bw_meter(x, &now);
		}
	}

	/* Send all upcalls that are pending delivery */
	bw_upcalls_send();

	splx(s);
}

/*
 * A periodic function for sending all upcalls that are pending delivery;
 * self-rearming callout running every BW_UPCALLS_PERIOD ticks.
 */
static void
expire_bw_upcalls_send(void *unused)
{
	int s;

	s = splsoftnet();
	bw_upcalls_send();
	splx(s);

	callout_reset(&bw_upcalls_ch, BW_UPCALLS_PERIOD,
	    expire_bw_upcalls_send, NULL);
}

/*
 * A periodic function for periodic scanning of the multicast forwarding
 * table for processing all "<=" bw_meter entries.
 */
static void
expire_bw_meter_process(void *unused)
{
	/* Only scan when the bandwidth-upcall API has been enabled. */
	if (mrt_api_config & MRT_MFC_BW_UPCALL)
		bw_meter_process();

	/* Self-rearming callout: run again after BW_METER_PERIOD ticks. */
	callout_reset(&bw_meter_ch, BW_METER_PERIOD,
	    expire_bw_meter_process, NULL);
}

/*
 * End of bandwidth monitoring code
 */

#ifdef PIM
/*
 * Send the packet up to the user daemon, or eventually do kernel encapsulation
 *
 * Prepares a (possibly fragmented) copy of the data packet and, for each
 * fragment, either encapsulates it in a PIM Register addressed to the RP
 * (when the MRT_MFC_RP API is in use and an RP is configured for the flow)
 * or delivers it to the routing daemon as an IGMPMSG_WHOLEPKT upcall.
 *
 * Returns 0 on success, ENOBUFS if the packet copy failed.
 */
static int
pim_register_send(struct ip *ip, struct vif *vifp, struct mbuf *m,
    struct mfc *rt)
{
	struct mbuf *mb_copy, *mm;

	if (mrtdebug & DEBUG_PIM)
		log(LOG_DEBUG, "pim_register_send: \n");

	mb_copy = pim_register_prepare(ip, m);
	if (mb_copy == NULL)
		return ENOBUFS;

	/*
	 * Send all the fragments. Note that the mbuf for each fragment
	 * is freed by the sending machinery.
	 */
	for (mm = mb_copy; mm; mm = mb_copy) {
		/* Detach the fragment from the m_nextpkt chain. */
		mb_copy = mm->m_nextpkt;
		mm->m_nextpkt = NULL;
		mm = m_pullup(mm, sizeof(struct ip));
		if (mm != NULL) {
			ip = mtod(mm, struct ip *);
			if ((mrt_api_config & MRT_MFC_RP) &&
			    !in_nullhost(rt->mfc_rp)) {
				pim_register_send_rp(ip, vifp, mm, rt);
			} else {
				pim_register_send_upcall(ip, vifp, mm, rt);
			}
		}
	}

	return 0;
}

/*
 * Return a copy of the data packet that is ready for PIM Register
 * encapsulation.
 * XXX: Note that in the returned copy the IP header is a valid one.
 *
 * The copy has delayed checksums finalized, the TTL decremented, and is
 * fragmented (linked via m_nextpkt) if it would not fit after adding the
 * Register encapsulation headers.  Returns NULL on allocation failure.
 */
static struct mbuf *
pim_register_prepare(struct ip *ip, struct mbuf *m)
{
	struct mbuf *mb_copy = NULL;
	int mtu;

	/* Take care of delayed checksums */
	if (m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) {
		in_undefer_cksum_tcpudp(m);
		m->m_pkthdr.csum_flags &= ~(M_CSUM_TCPv4|M_CSUM_UDPv4);
	}

	/*
	 * Copy the old packet & pullup its IP header into the
	 * new mbuf so we can modify it.
	 */
	mb_copy = m_copypacket(m, M_DONTWAIT);
	if (mb_copy == NULL)
		return NULL;
	mb_copy = m_pullup(mb_copy, ip->ip_hl << 2);
	if (mb_copy == NULL)
		return NULL;

	/* take care of the TTL */
	ip = mtod(mb_copy, struct ip *);
	--ip->ip_ttl;

	/* Compute the MTU after the PIM Register encapsulation */
	mtu = 0xffff - sizeof(pim_encap_iphdr) - sizeof(pim_encap_pimhdr);

	if (ntohs(ip->ip_len) <= mtu) {
		/* Turn the IP header into a valid one */
		ip->ip_sum = 0;
		ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2);
	} else {
		/* Fragment the packet */
		if (ip_fragment(mb_copy, NULL, mtu) != 0) {
			/* XXX: mb_copy was freed by ip_fragment() */
			return NULL;
		}
	}
	return mb_copy;
}

/*
 * Send an upcall with the data packet to the user-level process.
 *
 * Prepends an igmpmsg header (IGMPMSG_WHOLEPKT) to "mb_copy" and queues
 * the result on the multicast routing daemon's socket.  Consumes mb_copy
 * in all cases.  Returns 0 on success, ENOBUFS on allocation failure or
 * when the daemon's socket queue is full.
 */
static int
pim_register_send_upcall(struct ip *ip, struct vif *vifp,
    struct mbuf *mb_copy, struct mfc *rt)
{
	struct mbuf *mb_first;
	int len = ntohs(ip->ip_len);
	struct igmpmsg *im;
	struct sockaddr_in k_igmpsrc = {
		.sin_len = sizeof(k_igmpsrc),
		.sin_family = AF_INET,
	};

	/*
	 * Add a new mbuf with an upcall header
	 */
	MGETHDR(mb_first, M_DONTWAIT, MT_HEADER);
	if (mb_first == NULL) {
		m_freem(mb_copy);
		return ENOBUFS;
	}
	mb_first->m_data += max_linkhdr;
	mb_first->m_pkthdr.len = len + sizeof(struct igmpmsg);
	mb_first->m_len = sizeof(struct igmpmsg);
	mb_first->m_next = mb_copy;

	/* Send message to routing daemon */
	im = mtod(mb_first, struct igmpmsg *);
	im->im_msgtype = IGMPMSG_WHOLEPKT;
	im->im_mbz = 0;
	im->im_vif = vifp - viftable;	/* vif index, from table position */
	im->im_src = ip->ip_src;
	im->im_dst = ip->ip_dst;

	k_igmpsrc.sin_addr = ip->ip_src;

	mrtstat.mrts_upcalls++;

	if (socket_send(ip_mrouter, mb_first, &k_igmpsrc) < 0) {
		if (mrtdebug & DEBUG_PIM)
			log(LOG_WARNING,
			    "mcast: pim_register_send_upcall: ip_mrouter socket queue full\n");
		++mrtstat.mrts_upq_sockfull;
		return ENOBUFS;
	}

	/* Keep statistics */
	pimstat.pims_snd_registers_msgs++;
	pimstat.pims_snd_registers_bytes += len;

	return 0;
}

/*
 * Encapsulate the data packet in PIM Register message and send it to the RP.
 *
 * Builds outer IP + PIM Register headers in a new mbuf prepended to
 * "mb_copy" and transmits via the incoming vif's token-bucket machinery.
 * Consumes mb_copy in all cases.  Returns 0 on success, EADDRNOTAVAIL if
 * the flow's parent vif is invalid, ENOBUFS on allocation failure.
 */
static int
pim_register_send_rp(struct ip *ip, struct vif *vifp,
    struct mbuf *mb_copy, struct mfc *rt)
{
	struct mbuf *mb_first;
	struct ip *ip_outer;
	struct pim_encap_pimhdr *pimhdr;
	int len = ntohs(ip->ip_len);
	vifi_t vifi = rt->mfc_parent;

	if ((vifi >= numvifs) || in_nullhost(viftable[vifi].v_lcl_addr)) {
		m_freem(mb_copy);
		return EADDRNOTAVAIL;		/* The iif vif is invalid */
	}

	/*
	 * Add a new mbuf with the encapsulating header
	 */
	MGETHDR(mb_first, M_DONTWAIT, MT_HEADER);
	if (mb_first == NULL) {
		m_freem(mb_copy);
		return ENOBUFS;
	}
	mb_first->m_data += max_linkhdr;
	mb_first->m_len = sizeof(pim_encap_iphdr) + sizeof(pim_encap_pimhdr);
	mb_first->m_next = mb_copy;

	mb_first->m_pkthdr.len = len + mb_first->m_len;

	/*
	 * Fill in the encapsulating IP and PIM header
	 */
	ip_outer = mtod(mb_first, struct ip *);
	*ip_outer = pim_encap_iphdr;
	if (mb_first->m_pkthdr.len < IP_MINFRAGSIZE)
		ip_outer->ip_id = 0;
	else
		ip_outer->ip_id = ip_newid(NULL);
	ip_outer->ip_len = htons(len + sizeof(pim_encap_iphdr) +
	    sizeof(pim_encap_pimhdr));
	ip_outer->ip_src = viftable[vifi].v_lcl_addr;
	ip_outer->ip_dst = rt->mfc_rp;
	/*
	 * Copy the inner header TOS to the outer header, and take care of the
	 * IP_DF bit.
	 */
	ip_outer->ip_tos = ip->ip_tos;
	if (ntohs(ip->ip_off) & IP_DF)
		ip_outer->ip_off |= htons(IP_DF);
	pimhdr = (struct pim_encap_pimhdr *)((char *)ip_outer
	    + sizeof(pim_encap_iphdr));
	*pimhdr = pim_encap_pimhdr;
	/* If the iif crosses a border, set the Border-bit */
	if (rt->mfc_flags[vifi] & MRT_MFC_FLAGS_BORDER_VIF & mrt_api_config)
		pimhdr->flags |= htonl(PIM_BORDER_REGISTER);

	/*
	 * Compute the PIM checksum over the PIM header only: temporarily
	 * advance m_data past the outer IP header so in_cksum() starts at
	 * the PIM header, then restore it.
	 */
	mb_first->m_data += sizeof(pim_encap_iphdr);
	pimhdr->pim.pim_cksum = in_cksum(mb_first, sizeof(pim_encap_pimhdr));
	mb_first->m_data -= sizeof(pim_encap_iphdr);

	if (vifp->v_rate_limit == 0)
		tbf_send_packet(vifp, mb_first);
	else
		tbf_control(vifp, mb_first, ip, ntohs(ip_outer->ip_len));

	/* Keep statistics */
	pimstat.pims_snd_registers_msgs++;
	pimstat.pims_snd_registers_bytes += len;

	return 0;
}

/*
 * PIM-SMv2 and PIM-DM messages processing.
 * Receives and verifies the PIM control messages, and passes them
 * up to the listening socket, using rip_input().
 * The only message with special processing is the PIM_REGISTER message
 * (used by PIM-SM): the PIM header is stripped off, and the inner packet
 * is passed to if_simloop().
 */
void
pim_input(struct mbuf *m, int off, int proto)
{
	struct ip *ip = mtod(m, struct ip *);
	struct pim *pim;
	int minlen;
	int datalen;	/* length of the PIM message (IP payload) */
	int ip_tos;
	int iphlen;	/* length of the outer IP header */

	iphlen = off;
	datalen = ntohs(ip->ip_len) - iphlen;

	/* Keep statistics */
	pimstat.pims_rcv_total_msgs++;
	pimstat.pims_rcv_total_bytes += datalen;

	/*
	 * Validate lengths
	 */
	if (datalen < PIM_MINLEN) {
		pimstat.pims_rcv_tooshort++;
		log(LOG_ERR, "pim_input: packet size too small %d from %lx\n",
		    datalen, (u_long)ip->ip_src.s_addr);
		m_freem(m);
		return;
	}

	/*
	 * If the packet is at least as big as a REGISTER, go ahead
	 * and grab the PIM REGISTER header size, to avoid another
	 * possible m_pullup() later.
	 *
	 * PIM_MINLEN       == pimhdr + u_int32_t == 4 + 4 = 8
	 * PIM_REG_MINLEN   == pimhdr + reghdr + encap_iphdr == 4 + 4 + 20 = 28
	 */
	minlen = iphlen + (datalen >= PIM_REG_MINLEN ? PIM_REG_MINLEN : PIM_MINLEN);

	/*
	 * Get the IP and PIM headers in contiguous memory, and
	 * possibly the PIM REGISTER header.
	 */
	if ((m->m_flags & M_EXT || m->m_len < minlen) &&
	    (m = m_pullup(m, minlen)) == NULL) {
		log(LOG_ERR, "pim_input: m_pullup failure\n");
		return;
	}
	/* m_pullup() may have changed m, so refresh the header pointer. */
	ip = mtod(m, struct ip *);
	ip_tos = ip->ip_tos;

	/* adjust mbuf to point to the PIM header */
	m->m_data += iphlen;
	m->m_len -= iphlen;
	pim = mtod(m, struct pim *);

	/*
	 * Validate checksum. If PIM REGISTER, exclude the data packet.
	 *
	 * XXX: some older PIMv2 implementations don't make this distinction,
	 * so for compatibility reason perform the checksum over part of the
	 * message, and if error, then over the whole message.
	 */
	if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER && in_cksum(m, PIM_MINLEN) == 0) {
		/* do nothing, checksum okay */
	} else if (in_cksum(m, datalen)) {
		pimstat.pims_rcv_badsum++;
		if (mrtdebug & DEBUG_PIM)
			log(LOG_DEBUG, "pim_input: invalid checksum\n");
		m_freem(m);
		return;
	}

	/* PIM version check */
	if (PIM_VT_V(pim->pim_vt) < PIM_VERSION) {
		pimstat.pims_rcv_badversion++;
		log(LOG_ERR, "pim_input: incorrect version %d, expecting %d\n",
		    PIM_VT_V(pim->pim_vt), PIM_VERSION);
		m_freem(m);
		return;
	}

	/* restore mbuf back to the outer IP */
	m->m_data -= iphlen;
	m->m_len += iphlen;

	if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER) {
		/*
		 * Since this is a REGISTER, we'll make a copy of the register
		 * headers ip + pim + u_int32 + encap_ip, to be passed up to the
		 * routing daemon.
		 */
		int s;
		struct sockaddr_in dst = {
			.sin_len = sizeof(dst),
			.sin_family = AF_INET,
		};
		struct mbuf *mcp;
		struct ip *encap_ip;
		u_int32_t *reghdr;
		struct ifnet *vifp;

		s = splsoftnet();
		if ((reg_vif_num >= numvifs) || (reg_vif_num == VIFI_INVALID)) {
			splx(s);
			if (mrtdebug & DEBUG_PIM)
				log(LOG_DEBUG,
				    "pim_input: register vif not set: %d\n", reg_vif_num);
			m_freem(m);
			return;
		}
		/* XXX need refcnt? */
		vifp = viftable[reg_vif_num].v_ifp;
		splx(s);

		/*
		 * Validate length
		 */
		if (datalen < PIM_REG_MINLEN) {
			pimstat.pims_rcv_tooshort++;
			pimstat.pims_rcv_badregisters++;
			log(LOG_ERR,
			    "pim_input: register packet size too small %d from %lx\n",
			    datalen, (u_long)ip->ip_src.s_addr);
			m_freem(m);
			return;
		}

		/*
		 * Register layout: PIM header, 32-bit Register header
		 * (flags), then the encapsulated inner IP header.  All
		 * three are contiguous thanks to the m_pullup() above.
		 */
		reghdr = (u_int32_t *)(pim + 1);
		encap_ip = (struct ip *)(reghdr + 1);

		if (mrtdebug & DEBUG_PIM) {
			log(LOG_DEBUG,
			    "pim_input[register], encap_ip: %lx -> %lx, encap_ip len %d\n",
			    (u_long)ntohl(encap_ip->ip_src.s_addr),
			    (u_long)ntohl(encap_ip->ip_dst.s_addr),
			    ntohs(encap_ip->ip_len));
		}

		/* verify the version number of the inner packet */
		if (encap_ip->ip_v != IPVERSION) {
			pimstat.pims_rcv_badregisters++;
			if (mrtdebug & DEBUG_PIM) {
				log(LOG_DEBUG, "pim_input: invalid IP version (%d) "
				    "of the inner packet\n", encap_ip->ip_v);
			}
			m_freem(m);
			return;
		}

		/* verify the inner packet doesn't have options */
		if (encap_ip->ip_hl != (sizeof(struct ip) >> 2)) {
			pimstat.pims_rcv_badregisters++;
			m_freem(m);
			return;
		}

		/* verify the inner packet is destined to a mcast group */
		if (!IN_MULTICAST(encap_ip->ip_dst.s_addr)) {
			pimstat.pims_rcv_badregisters++;
			if (mrtdebug & DEBUG_PIM)
				log(LOG_DEBUG,
				    "pim_input: inner packet of register is not "
				    "multicast %lx\n",
				    (u_long)ntohl(encap_ip->ip_dst.s_addr));
			m_freem(m);
			return;
		}

		/* If a NULL_REGISTER, pass it to the daemon */
		if ((ntohl(*reghdr) & PIM_NULL_REGISTER))
			goto pim_input_to_daemon;

		/*
		 * Copy the TOS from the outer IP header to the inner IP header.
		 */
		if (encap_ip->ip_tos != ip_tos) {
			/* Outer TOS -> inner TOS */
			encap_ip->ip_tos = ip_tos;
			/* Recompute the inner header checksum. Sigh... */

			/* adjust mbuf to point to the inner IP header */
			m->m_data += (iphlen + PIM_MINLEN);
			m->m_len -= (iphlen + PIM_MINLEN);

			encap_ip->ip_sum = 0;
			encap_ip->ip_sum = in_cksum(m, encap_ip->ip_hl << 2);

			/* restore mbuf to point back to the outer IP header */
			m->m_data -= (iphlen + PIM_MINLEN);
			m->m_len += (iphlen + PIM_MINLEN);
		}

		/*
		 * Decapsulate the inner IP packet and loopback to forward it
		 * as a normal multicast packet. Also, make a copy of the
		 *     outer_iphdr + pimhdr + reghdr + encap_iphdr
		 * to pass to the daemon later, so it can take the appropriate
		 * actions (e.g., send back PIM_REGISTER_STOP).
		 * XXX: here m->m_data points to the outer IP header.
		 */
		mcp = m_copym(m, 0, iphlen + PIM_REG_MINLEN, M_DONTWAIT);
		if (mcp == NULL) {
			log(LOG_ERR,
			    "pim_input: pim register: could not copy register head\n");
			m_freem(m);
			return;
		}

		/* Keep statistics */
		/* XXX: registers_bytes include only the encap. mcast pkt */
		pimstat.pims_rcv_registers_msgs++;
		pimstat.pims_rcv_registers_bytes += ntohs(encap_ip->ip_len);

		/*
		 * forward the inner ip packet; point m_data at the inner ip.
		 */
		m_adj(m, iphlen + PIM_MINLEN);

		if (mrtdebug & DEBUG_PIM) {
			log(LOG_DEBUG,
			    "pim_input: forwarding decapsulated register: "
			    "src %lx, dst %lx, vif %d\n",
			    (u_long)ntohl(encap_ip->ip_src.s_addr),
			    (u_long)ntohl(encap_ip->ip_dst.s_addr),
			    reg_vif_num);
		}
		/* NB: vifp was collected above; can it change on us? */
		looutput(vifp, m, (struct sockaddr *)&dst, NULL);

		/* prepare the register head to send to the mrouting daemon */
		m = mcp;
	}

pim_input_to_daemon:
	/*
	 * Pass the PIM message up to the daemon; if it is a Register message,
	 * pass the 'head' only up to the daemon. This includes the
	 * outer IP header, PIM header, PIM-Register header and the
	 * inner IP header.
	 * XXX: the outer IP header pkt size of a Register is not adjust to
	 * reflect the fact that the inner multicast data is truncated.
	 */
	/*
	 * Currently, pim_input() is always called holding softnet_lock
	 * by ipintr()(!NET_MPSAFE) or PR_INPUT_WRAP()(NET_MPSAFE).
	 */
	KASSERT(mutex_owned(softnet_lock));
	rip_input(m, iphlen, proto);

	return;
}
#endif /* PIM */