/*	$NetBSD: ip_encap.c,v 1.74 2020/08/22 01:43:07 riastradh Exp $	*/
/*	$KAME: ip_encap.c,v 1.73 2001/10/02 08:30:58 itojun Exp $	*/

/*
 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the project nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * My grandfather said that there's a devil inside tunnelling technology...
 *
 * We have surprisingly many protocols that want packets with IP protocol
 * #4 or #41.  Here's a list of protocols that want protocol #41:
 *	RFC1933 configured tunnel
 *	RFC1933 automatic tunnel
 *	RFC2401 IPsec tunnel
 *	RFC2473 IPv6 generic packet tunnelling
 *	RFC2529 6over4 tunnel
 *	RFC3056 6to4 tunnel
 *	isatap tunnel
 *	mobile-ip6 (uses RFC2473)
 * Here's a list of protocols that want protocol #4:
 *	RFC1853 IPv4-in-IPv4 tunnelling
 *	RFC2003 IPv4 encapsulation within IPv4
 *	RFC2344 reverse tunnelling for mobile-ip4
 *	RFC2401 IPsec tunnel
 * Well, what can I say.  They impose different en/decapsulation mechanisms
 * from each other, so they need separate protocol handlers.  The only one
 * we can easily determine by protocol # is IPsec, which always has an
 * AH/ESP/IPComp header right after the outer IP header.
 *
 * So, clearly good old protosw does not work for protocol #4 and #41.
 * The code will let you match protocols via src/dst address pairs.
 */
/* XXX is M_NETADDR correct? */

/*
 * With USE_RADIX the code will use a radix table for tunnel lookup, for
 * tunnels registered with encap_attach() with an addr/mask pair.
 * This is faster on machines with thousands of tunnel registrations
 * (= interfaces).
 *
 * The code assumes that the radix table code can handle non-contiguous
 * netmasks, as it will pass the radix table a memory region holding a
 * (src + dst) sockaddr pair.
 */
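/*
 * Illustration (added for exposition; the real struct ip_pack4 lives in
 * netinet/ip_encap.h): the radix key is simply the two sockaddrs packed
 * behind a length header, conceptually:
 *
 *	struct ip_pack4 {
 *		struct sockaddr_pack	p;	-- sp_len covers the whole key
 *		struct sockaddr_in	mine;	-- local (outer) address
 *		struct sockaddr_in	yours;	-- remote (outer) address
 *	};
 *
 * A mask of /24 on "mine" combined with /32 on "yours" therefore yields a
 * key-wide netmask whose set bits are not one contiguous prefix, which is
 * why the radix code must tolerate non-contiguous netmasks.
 */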
#define USE_RADIX

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ip_encap.c,v 1.74 2020/08/22 01:43:07 riastradh Exp $");

#ifdef _KERNEL_OPT
#include "opt_mrouting.h"
#include "opt_inet.h"
#include "opt_net_mpsafe.h"
#endif

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/socket.h>
#include <sys/socketvar.h>	/* for softnet_lock */
#include <sys/sockio.h>
#include <sys/mbuf.h>
#include <sys/errno.h>
#include <sys/queue.h>
#include <sys/kmem.h>
#include <sys/mutex.h>
#include <sys/condvar.h>
#include <sys/psref.h>
#include <sys/pslist.h>

#include <net/if.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/ip_encap.h>
#ifdef MROUTING
#include <netinet/ip_mroute.h>
#endif /* MROUTING */

#ifdef INET6
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/ip6protosw.h>	/* for struct ip6ctlparam */
#include <netinet6/in6_var.h>
#include <netinet6/in6_pcb.h>
#include <netinet/icmp6.h>
#endif

#ifdef NET_MPSAFE
#define ENCAP_MPSAFE 1
#endif

enum direction { INBOUND, OUTBOUND };

#ifdef INET
static struct encaptab *encap4_lookup(struct mbuf *, int, int, enum direction,
    struct psref *);
#endif
#ifdef INET6
static struct encaptab *encap6_lookup(struct mbuf *, int, int, enum direction,
    struct psref *);
#endif
static int encap_add(struct encaptab *);
static int encap_remove(struct encaptab *);
static void encap_afcheck(int, const struct sockaddr *, const struct sockaddr *);
#ifdef USE_RADIX
static struct radix_node_head *encap_rnh(int);
static int mask_matchlen(const struct sockaddr *);
#else
static int mask_match(const struct encaptab *, const struct sockaddr *,
    const struct sockaddr *);
#endif

/*
 * In encap[46]_lookup(), ep->func can sleep (e.g. rtalloc1) while walking
 * encap_table.  So, the walk cannot rely on pserialize_read_enter() alone;
 * each element is also protected by a psref.
 */
static struct {
	struct pslist_head list;
	pserialize_t psz;
	struct psref_class *elem_class;	/* for the elements of et_list */
} encaptab __cacheline_aligned = {
	.list = PSLIST_INITIALIZER,
};
#define encap_table encaptab.list

static struct {
	kmutex_t	lock;
	kcondvar_t	cv;
	struct lwp	*busy;
} encap_whole __cacheline_aligned;

#ifdef USE_RADIX
struct radix_node_head *encap_head[2];	/* 0 for AF_INET, 1 for AF_INET6 */
static bool encap_head_updating = false;
#endif

static bool encap_initialized = false;

/*
 * Must be done before other encap interfaces' initialization.
 */
void
encapinit(void)
{

	if (encap_initialized)
		return;

	encaptab.psz = pserialize_create();
	encaptab.elem_class = psref_class_create("encapelem", IPL_SOFTNET);

	mutex_init(&encap_whole.lock, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&encap_whole.cv, "ip_encap cv");
	encap_whole.busy = NULL;

	encap_initialized = true;
}

void
encap_init(void)
{
	static int initialized = 0;

	if (initialized)
		return;
	initialized++;
#if 0
	/*
	 * We cannot use LIST_INIT() here, since drivers may want to call
	 * encap_attach() on driver attach.
	 * encap_init() will be called on AF_INET{,6} initialization,
	 * which happens after driver initialization - using LIST_INIT()
	 * here can nuke encap_attach() from drivers.
	 */
	PSLIST_INIT(&encap_table);
#endif

#ifdef USE_RADIX
	/*
	 * Initialize the radix lookup table when the radix subsystem is
	 * initialized.
	 */
	rn_delayedinit((void *)&encap_head[0],
	    sizeof(struct sockaddr_pack) << 3);
#ifdef INET6
	rn_delayedinit((void *)&encap_head[1],
	    sizeof(struct sockaddr_pack) << 3);
#endif
#endif
}

#ifdef INET
static struct encaptab *
encap4_lookup(struct mbuf *m, int off, int proto, enum direction dir,
    struct psref *match_psref)
{
	struct ip *ip;
	struct ip_pack4 pack;
	struct encaptab *ep, *match;
	int prio, matchprio;
	int s;
#ifdef USE_RADIX
	struct radix_node_head *rnh = encap_rnh(AF_INET);
	struct radix_node *rn;
#endif

	KASSERT(m->m_len >= sizeof(*ip));

	ip = mtod(m, struct ip *);

	memset(&pack, 0, sizeof(pack));
	pack.p.sp_len = sizeof(pack);
	pack.mine.sin_family = pack.yours.sin_family = AF_INET;
	pack.mine.sin_len = pack.yours.sin_len = sizeof(struct sockaddr_in);
	if (dir == INBOUND) {
		pack.mine.sin_addr = ip->ip_dst;
		pack.yours.sin_addr = ip->ip_src;
	} else {
		pack.mine.sin_addr = ip->ip_src;
		pack.yours.sin_addr = ip->ip_dst;
	}

	match = NULL;
	matchprio = 0;

	s = pserialize_read_enter();
#ifdef USE_RADIX
	if (encap_head_updating) {
		/*
		 * Update in progress.  Do nothing.
		 */
		pserialize_read_exit(s);
		return NULL;
	}

	rn = rnh->rnh_matchaddr((void *)&pack, rnh);
	if (rn && (rn->rn_flags & RNF_ROOT) == 0) {
		struct encaptab *encapp = (struct encaptab *)rn;

		psref_acquire(match_psref, &encapp->psref,
		    encaptab.elem_class);
		match = encapp;
		matchprio = mask_matchlen(match->srcmask) +
		    mask_matchlen(match->dstmask);
	}
#endif
	PSLIST_READER_FOREACH(ep, &encap_table, struct encaptab, chain) {
		struct psref elem_psref;

		if (ep->af != AF_INET)
			continue;
		if (ep->proto >= 0 && ep->proto != proto)
			continue;

		psref_acquire(&elem_psref, &ep->psref,
		    encaptab.elem_class);
		if (ep->func) {
			pserialize_read_exit(s);
			/* ep->func is sleepable, e.g. rtalloc1 */
			prio = (*ep->func)(m, off, proto, ep->arg);
			s = pserialize_read_enter();
		} else {
#ifdef USE_RADIX
			psref_release(&elem_psref, &ep->psref,
			    encaptab.elem_class);
			continue;
#else
			prio = mask_match(ep, (struct sockaddr *)&pack.mine,
			    (struct sockaddr *)&pack.yours);
#endif
		}

		/*
		 * We prioritize the matches by using the bit length of the
		 * matches.  mask_match() and user-supplied matching functions
		 * should return the bit length of the match (for example,
		 * if both src/dst are matched for IPv4, 64 should be
		 * returned).  A return value of 0 or less means
		 * "it did not match".
		 *
		 * The question is, since we have two "mask" portions, we
		 * cannot really define a total order between entries.
		 * For example, which of these should be preferred?
		 * mask_match() returns 48 (32 + 16) for both of them.
		 *	src=3ffe::/16, dst=3ffe:501::/32
		 *	src=3ffe:501::/32, dst=3ffe::/16
		 *
		 * We need to loop through all the possible candidates
		 * to get the best match - the search takes O(n) for
		 * n attachments (i.e. interfaces).
		 *
		 * For radix-based lookup, I guess source takes precedence.
		 * See rn_{refines,lexobetter} for the correct answer.
		 */
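		/*
		 * Worked example (hypothetical entries, added for
		 * illustration of the mask_match() path used when
		 * USE_RADIX is not defined):
		 *	A: src=10.0.0.0/8,  dst=192.168.1.0/24 ->  8 + 24 = 32
		 *	B: src=10.1.0.0/16, dst=192.168.0.0/16 -> 16 + 16 = 32
		 * A packet between 10.1.2.3 and 192.168.1.5 matches both
		 * with the same priority; because the comparison below is
		 * strict (prio > matchprio), the entry encountered first
		 * in the list keeps the match.
		 */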
		if (prio <= 0) {
			psref_release(&elem_psref, &ep->psref,
			    encaptab.elem_class);
			continue;
		}
		if (prio > matchprio) {
			/* release the last matched ep */
			if (match != NULL)
				psref_release(match_psref, &match->psref,
				    encaptab.elem_class);

			psref_copy(match_psref, &elem_psref,
			    encaptab.elem_class);
			matchprio = prio;
			match = ep;
		}
		KASSERTMSG((match == NULL) || psref_held(&match->psref,
		    encaptab.elem_class),
		    "current match = %p, but not hold its psref", match);

		psref_release(&elem_psref, &ep->psref,
		    encaptab.elem_class);
	}
	pserialize_read_exit(s);

	return match;
}

void
encap4_input(struct mbuf *m, int off, int proto)
{
	const struct encapsw *esw;
	struct encaptab *match;
	struct psref match_psref;

	match = encap4_lookup(m, off, proto, INBOUND, &match_psref);
	if (match) {
		/* found a match, "match" has the best one */
		esw = match->esw;
		if (esw && esw->encapsw4.pr_input) {
			(*esw->encapsw4.pr_input)(m, off, proto, match->arg);
			psref_release(&match_psref, &match->psref,
			    encaptab.elem_class);
		} else {
			psref_release(&match_psref, &match->psref,
			    encaptab.elem_class);
			m_freem(m);
		}
		return;
	}

	/* last resort: inject to raw socket */
	SOFTNET_LOCK_IF_NET_MPSAFE();
	rip_input(m, off, proto);
	SOFTNET_UNLOCK_IF_NET_MPSAFE();
}
#endif

#ifdef INET6
static struct encaptab *
encap6_lookup(struct mbuf *m, int off, int proto, enum direction dir,
    struct psref *match_psref)
{
	struct ip6_hdr *ip6;
	struct ip_pack6 pack;
	int prio, matchprio;
	int s;
	struct encaptab *ep, *match;
#ifdef USE_RADIX
	struct radix_node_head *rnh = encap_rnh(AF_INET6);
	struct radix_node *rn;
#endif

	KASSERT(m->m_len >= sizeof(*ip6));

	ip6 = mtod(m, struct ip6_hdr *);

	memset(&pack, 0, sizeof(pack));
	pack.p.sp_len = sizeof(pack);
	pack.mine.sin6_family = pack.yours.sin6_family = AF_INET6;
	pack.mine.sin6_len = pack.yours.sin6_len = sizeof(struct sockaddr_in6);
	if (dir == INBOUND) {
		pack.mine.sin6_addr = ip6->ip6_dst;
		pack.yours.sin6_addr = ip6->ip6_src;
	} else {
		pack.mine.sin6_addr = ip6->ip6_src;
		pack.yours.sin6_addr = ip6->ip6_dst;
	}

	match = NULL;
	matchprio = 0;

	s = pserialize_read_enter();
#ifdef USE_RADIX
	if (encap_head_updating) {
		/*
		 * Update in progress.  Do nothing.
		 */
		pserialize_read_exit(s);
		return NULL;
	}

	rn = rnh->rnh_matchaddr((void *)&pack, rnh);
	if (rn && (rn->rn_flags & RNF_ROOT) == 0) {
		struct encaptab *encapp = (struct encaptab *)rn;

		psref_acquire(match_psref, &encapp->psref,
		    encaptab.elem_class);
		match = encapp;
		matchprio = mask_matchlen(match->srcmask) +
		    mask_matchlen(match->dstmask);
	}
#endif
	PSLIST_READER_FOREACH(ep, &encap_table, struct encaptab, chain) {
		struct psref elem_psref;

		if (ep->af != AF_INET6)
			continue;
		if (ep->proto >= 0 && ep->proto != proto)
			continue;

		psref_acquire(&elem_psref, &ep->psref,
		    encaptab.elem_class);

		if (ep->func) {
			pserialize_read_exit(s);
			/* ep->func is sleepable, e.g. rtalloc1 */
			prio = (*ep->func)(m, off, proto, ep->arg);
			s = pserialize_read_enter();
		} else {
#ifdef USE_RADIX
			psref_release(&elem_psref, &ep->psref,
			    encaptab.elem_class);
			continue;
#else
			prio = mask_match(ep, (struct sockaddr *)&pack.mine,
			    (struct sockaddr *)&pack.yours);
#endif
		}

		/* see encap4_lookup() for issues here */
		if (prio <= 0) {
			psref_release(&elem_psref, &ep->psref,
			    encaptab.elem_class);
			continue;
		}
		if (prio > matchprio) {
			/* release the last matched ep */
			if (match != NULL)
				psref_release(match_psref, &match->psref,
				    encaptab.elem_class);

			psref_copy(match_psref, &elem_psref,
			    encaptab.elem_class);
			matchprio = prio;
			match = ep;
		}
		KASSERTMSG((match == NULL) || psref_held(&match->psref,
		    encaptab.elem_class),
		    "current match = %p, but not hold its psref", match);

		psref_release(&elem_psref, &ep->psref,
		    encaptab.elem_class);
	}
	pserialize_read_exit(s);

	return match;
}

int
encap6_input(struct mbuf **mp, int *offp, int proto)
{
	struct mbuf *m = *mp;
	const struct encapsw *esw;
	struct encaptab *match;
	struct psref match_psref;
	int rv;

	match = encap6_lookup(m, *offp, proto, INBOUND, &match_psref);

	if (match) {
		/* found a match */
		esw = match->esw;
		if (esw && esw->encapsw6.pr_input) {
			int ret;
			ret = (*esw->encapsw6.pr_input)(mp, offp, proto,
			    match->arg);
			psref_release(&match_psref, &match->psref,
			    encaptab.elem_class);
			return ret;
		} else {
			psref_release(&match_psref, &match->psref,
			    encaptab.elem_class);
			m_freem(m);
			return IPPROTO_DONE;
		}
	}

	/* last resort: inject to raw socket */
	SOFTNET_LOCK_IF_NET_MPSAFE();
	rv = rip6_input(mp, offp, proto);
	SOFTNET_UNLOCK_IF_NET_MPSAFE();
	return rv;
}
#endif

/*
 * XXX
 * The encaptab list and the rnh radix tree must be manipulated atomically.
 */
static int
encap_add(struct encaptab *ep)
{
#ifdef USE_RADIX
	struct radix_node_head *rnh = encap_rnh(ep->af);
#endif

	KASSERT(encap_lock_held());

#ifdef USE_RADIX
	if (!ep->func && rnh) {
		/* Disable access to the radix tree for readers. */
		encap_head_updating = true;
		/* Wait for all readers to drain. */
		pserialize_perform(encaptab.psz);

		if (!rnh->rnh_addaddr((void *)ep->addrpack,
		    (void *)ep->maskpack, rnh, ep->nodes)) {
			encap_head_updating = false;
			return EEXIST;
		}

		/*
		 * The ep added to the radix tree must be skipped while
		 * encap[46]_lookup walks the encaptab list.  In other words,
		 * encap_add() does not need to care whether the ep has
		 * been added to the encaptab list yet.
		 * So, we can re-enable access to the radix tree for now.
		 */
		encap_head_updating = false;
	}
#endif
	PSLIST_WRITER_INSERT_HEAD(&encap_table, ep, chain);

	return 0;
}

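/*
 * Note (added for clarification): while encap_head_updating is set, both
 * encap4_lookup() and encap6_lookup() simply return NULL, so any packet
 * arriving in that window falls through to the raw-socket path in
 * encap[46]_input() instead of being delivered to a tunnel.
 */
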
/*
 * XXX
 * The encaptab list and the rnh radix tree must be manipulated atomically.
 */
static int
encap_remove(struct encaptab *ep)
{
#ifdef USE_RADIX
	struct radix_node_head *rnh = encap_rnh(ep->af);
#endif
	int error = 0;

	KASSERT(encap_lock_held());

#ifdef USE_RADIX
	if (!ep->func && rnh) {
		/* Disable access to the radix tree for readers. */
		encap_head_updating = true;
		/* Wait for all readers to drain. */
		pserialize_perform(encaptab.psz);

		if (!rnh->rnh_deladdr((void *)ep->addrpack,
		    (void *)ep->maskpack, rnh))
			error = ESRCH;

		/*
		 * The ep removed from the radix tree must be skipped while
		 * encap[46]_lookup walks the encaptab list.  In other words,
		 * encap_remove() does not need to care whether the ep has
		 * been removed from the encaptab list yet.
		 * So, we can re-enable access to the radix tree for now.
		 */
		encap_head_updating = false;
	}
#endif
	PSLIST_WRITER_REMOVE(ep, chain);

	return error;
}

static void
encap_afcheck(int af, const struct sockaddr *sp, const struct sockaddr *dp)
{

	KASSERT(sp != NULL && dp != NULL);
	KASSERT(sp->sa_len == dp->sa_len);
	KASSERT(af == sp->sa_family && af == dp->sa_family);

	socklen_t len __diagused = sockaddr_getsize_by_family(af);
	KASSERT(len != 0 && len == sp->sa_len && len == dp->sa_len);
}

/*
 * sp (src ptr) is always my side, and dp (dst ptr) is always the remote side.
 * The length of the masks (sm and dm) is assumed to be the same as sp/dp.
 * The return value is needed as the input (cookie) for encap_detach().
 */
const struct encaptab *
encap_attach(int af, int proto,
    const struct sockaddr *sp, const struct sockaddr *sm,
    const struct sockaddr *dp, const struct sockaddr *dm,
    const struct encapsw *esw, void *arg)
{
	struct encaptab *ep;
	int error;
	int pss;
	size_t l;
	struct ip_pack4 *pack4;
#ifdef INET6
	struct ip_pack6 *pack6;
#endif
#ifndef ENCAP_MPSAFE
	int s;

	s = splsoftnet();
#endif

	ASSERT_SLEEPABLE();

	/* sanity check on args */
	encap_afcheck(af, sp, dp);

	/* check if anyone has already attached with exactly the same config */
	pss = pserialize_read_enter();
	PSLIST_READER_FOREACH(ep, &encap_table, struct encaptab, chain) {
		if (ep->af != af)
			continue;
		if (ep->proto != proto)
			continue;
		if (ep->func)
			continue;

		KASSERT(ep->src != NULL);
		KASSERT(ep->dst != NULL);
		KASSERT(ep->srcmask != NULL);
		KASSERT(ep->dstmask != NULL);

		if (ep->src->sa_len != sp->sa_len ||
		    memcmp(ep->src, sp, sp->sa_len) != 0 ||
		    memcmp(ep->srcmask, sm, sp->sa_len) != 0)
			continue;
		if (ep->dst->sa_len != dp->sa_len ||
		    memcmp(ep->dst, dp, dp->sa_len) != 0 ||
		    memcmp(ep->dstmask, dm, dp->sa_len) != 0)
			continue;

		error = EEXIST;
		pserialize_read_exit(pss);
		goto fail;
	}
	pserialize_read_exit(pss);

	switch (af) {
	case AF_INET:
		l = sizeof(*pack4);
		break;
#ifdef INET6
	case AF_INET6:
		l = sizeof(*pack6);
		break;
#endif
	default:
		goto fail;
	}

	/* M_NETADDR ok? */
	ep = kmem_zalloc(sizeof(*ep), KM_SLEEP);
	ep->addrpack = kmem_zalloc(l, KM_SLEEP);
	ep->maskpack = kmem_zalloc(l, KM_SLEEP);

	ep->af = af;
	ep->proto = proto;
	ep->addrpack->sa_len = l & 0xff;
	ep->maskpack->sa_len = l & 0xff;
	switch (af) {
	case AF_INET:
		pack4 = (struct ip_pack4 *)ep->addrpack;
		ep->src = (struct sockaddr *)&pack4->mine;
		ep->dst = (struct sockaddr *)&pack4->yours;
		pack4 = (struct ip_pack4 *)ep->maskpack;
		ep->srcmask = (struct sockaddr *)&pack4->mine;
		ep->dstmask = (struct sockaddr *)&pack4->yours;
		break;
#ifdef INET6
	case AF_INET6:
		pack6 = (struct ip_pack6 *)ep->addrpack;
		ep->src = (struct sockaddr *)&pack6->mine;
		ep->dst = (struct sockaddr *)&pack6->yours;
		pack6 = (struct ip_pack6 *)ep->maskpack;
		ep->srcmask = (struct sockaddr *)&pack6->mine;
		ep->dstmask = (struct sockaddr *)&pack6->yours;
		break;
#endif
	}

	memcpy(ep->src, sp, sp->sa_len);
	memcpy(ep->srcmask, sm, sp->sa_len);
	memcpy(ep->dst, dp, dp->sa_len);
	memcpy(ep->dstmask, dm, dp->sa_len);
	ep->esw = esw;
	ep->arg = arg;
	psref_target_init(&ep->psref, encaptab.elem_class);

	error = encap_add(ep);
	if (error)
		goto gc;

	error = 0;
#ifndef ENCAP_MPSAFE
	splx(s);
#endif
	return ep;

gc:
	if (ep->addrpack)
		kmem_free(ep->addrpack, l);
	if (ep->maskpack)
		kmem_free(ep->maskpack, l);
	if (ep)
		kmem_free(ep, sizeof(*ep));
fail:
#ifndef ENCAP_MPSAFE
	splx(s);
#endif
	return NULL;
}

const struct encaptab *
encap_attach_func(int af, int proto,
    int (*func)(struct mbuf *, int, int, void *),
    const struct encapsw *esw, void *arg)
{
	struct encaptab *ep;
	int error;
#ifndef ENCAP_MPSAFE
	int s;

	s = splsoftnet();
#endif

	ASSERT_SLEEPABLE();

	/* sanity check on args */
	KASSERT(func != NULL);
	KASSERT(af == AF_INET
#ifdef INET6
	    || af == AF_INET6
#endif
	    );

	ep = kmem_alloc(sizeof(*ep), KM_SLEEP);
	memset(ep, 0, sizeof(*ep));

	ep->af = af;
	ep->proto = proto;
	ep->func = func;
	ep->esw = esw;
	ep->arg = arg;
	psref_target_init(&ep->psref, encaptab.elem_class);

	error = encap_add(ep);
	if (error)
		goto gc;

	error = 0;
#ifndef ENCAP_MPSAFE
	splx(s);
#endif
	return ep;

gc:
	kmem_free(ep, sizeof(*ep));
#ifndef ENCAP_MPSAFE
	splx(s);
#endif
	return NULL;
}

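/*
 * Usage sketch (added for illustration, not part of the original code):
 * how a hypothetical IPv4-in-IPv4 tunnel driver might register itself.
 * The "mytun_*" names and the addresses are made up; the encap lock must
 * be held around encap_attach()/encap_detach(), as the
 * KASSERT(encap_lock_held()) in encap_add()/encap_remove() requires.
 */
#if 0
static void mytun_input(struct mbuf *, int, int, void *);	/* hypothetical */

static const struct encapsw mytun_encapsw = {
	.encapsw4 = {
		.pr_input = mytun_input,
	},
};

static int
mytun_register(void *sc, const struct encaptab **cookiep)
{
	struct sockaddr_in src, dst, mask;
	int error;

	memset(&src, 0, sizeof(src));
	src.sin_family = AF_INET;
	src.sin_len = sizeof(src);
	src.sin_addr.s_addr = htonl(0xc0000201);	/* 192.0.2.1, example */
	dst = src;
	dst.sin_addr.s_addr = htonl(0xc0000202);	/* 192.0.2.2, example */
	mask = src;
	mask.sin_addr.s_addr = htonl(0xffffffff);	/* exact-match /32 */

	error = encap_lock_enter();
	if (error)
		return error;
	/* match IP protocol #4 (IPv4-in-IPv4) on this exact address pair */
	*cookiep = encap_attach(AF_INET, IPPROTO_IPV4,
	    (struct sockaddr *)&src, (struct sockaddr *)&mask,
	    (struct sockaddr *)&dst, (struct sockaddr *)&mask,
	    &mytun_encapsw, sc);
	encap_lock_exit();

	return *cookiep == NULL ? EEXIST : 0;
}
#endif
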
/* XXX encap4_ctlinput() is necessary if we set DF=1 on outer IPv4 header */

#ifdef INET6
void *
encap6_ctlinput(int cmd, const struct sockaddr *sa, void *d0)
{
	void *d = d0;
	struct ip6_hdr *ip6;
	struct mbuf *m;
	int off;
	struct ip6ctlparam *ip6cp = NULL;
	int nxt;
	int s;
	struct encaptab *ep;
	const struct encapsw *esw;

	if (sa->sa_family != AF_INET6 ||
	    sa->sa_len != sizeof(struct sockaddr_in6))
		return NULL;

	if ((unsigned)cmd >= PRC_NCMDS)
		return NULL;
	if (cmd == PRC_HOSTDEAD)
		d = NULL;
	else if (cmd == PRC_MSGSIZE)
		; /* special code is present, see below */
	else if (inet6ctlerrmap[cmd] == 0)
		return NULL;

	/* if the parameter is from icmp6, decode it. */
	if (d != NULL) {
		ip6cp = (struct ip6ctlparam *)d;
		m = ip6cp->ip6c_m;
		ip6 = ip6cp->ip6c_ip6;
		off = ip6cp->ip6c_off;
		nxt = ip6cp->ip6c_nxt;

		if (ip6 && cmd == PRC_MSGSIZE) {
			int valid = 0;
			struct encaptab *match;
			struct psref elem_psref;

			/*
			 * Check to see if we have a valid encap configuration.
			 */
			match = encap6_lookup(m, off, nxt, OUTBOUND,
			    &elem_psref);
			if (match) {
				valid++;
				psref_release(&elem_psref, &match->psref,
				    encaptab.elem_class);
			}

			/*
			 * Depending on the value of "valid" and the routing
			 * table size (mtudisc_{hi,lo}wat), we will:
			 * - recalculate the new MTU and create the
			 *   corresponding routing entry, or
			 * - ignore the MTU change notification.
			 */
			icmp6_mtudisc_update((struct ip6ctlparam *)d, valid);
		}
	} else {
		m = NULL;
		ip6 = NULL;
		nxt = -1;
	}

	/* inform all listeners */

	s = pserialize_read_enter();
	PSLIST_READER_FOREACH(ep, &encap_table, struct encaptab, chain) {
		struct psref elem_psref;

		if (ep->af != AF_INET6)
			continue;
		if (ep->proto >= 0 && ep->proto != nxt)
			continue;

		/* should optimize by looking at address pairs */

		/* XXX need to pass ep->arg or ep itself to listeners */
		psref_acquire(&elem_psref, &ep->psref,
		    encaptab.elem_class);
		esw = ep->esw;
		if (esw && esw->encapsw6.pr_ctlinput) {
			pserialize_read_exit(s);
			/* pr_ctlinput is sleepable, e.g. rtcache_free */
			(*esw->encapsw6.pr_ctlinput)(cmd, sa, d, ep->arg);
			s = pserialize_read_enter();
		}
		psref_release(&elem_psref, &ep->psref,
		    encaptab.elem_class);
	}
	pserialize_read_exit(s);

	rip6_ctlinput(cmd, sa, d0);
	return NULL;
}
#endif

int
encap_detach(const struct encaptab *cookie)
{
	const struct encaptab *ep = cookie;
	struct encaptab *p;
	int error;

	KASSERT(encap_lock_held());

	PSLIST_WRITER_FOREACH(p, &encap_table, struct encaptab, chain) {
		if (p == ep) {
			error = encap_remove(p);
			if (error)
				return error;
			else
				break;
		}
	}
	if (p == NULL)
		return ENOENT;

	pserialize_perform(encaptab.psz);
	psref_target_destroy(&p->psref,
	    encaptab.elem_class);
	if (!ep->func) {
		kmem_free(p->addrpack, ep->addrpack->sa_len);
		kmem_free(p->maskpack, ep->maskpack->sa_len);
	}
	kmem_free(p, sizeof(*p));

	return 0;
}

#ifdef USE_RADIX
static struct radix_node_head *
encap_rnh(int af)
{

	switch (af) {
	case AF_INET:
		return encap_head[0];
#ifdef INET6
	case AF_INET6:
		return encap_head[1];
#endif
	default:
		return NULL;
	}
}

static int
mask_matchlen(const struct sockaddr *sa)
{
	const char *p, *ep;
	int l;

	p = (const char *)sa;
	ep = p + sa->sa_len;
	p += 2;	/* skip sa_len + sa_family */

	l = 0;
	while (p < ep) {
		l += (*p ? 8 : 0);	/* estimate */
		p++;
	}
	return l;
}
#endif

#ifndef USE_RADIX
static int
mask_match(const struct encaptab *ep,
    const struct sockaddr *sp,
    const struct sockaddr *dp)
{
	struct sockaddr_storage s;
	struct sockaddr_storage d;
	int i;
	const u_int8_t *p, *q;
	u_int8_t *r;
	int matchlen;

	KASSERTMSG(ep->func == NULL, "wrong encaptab passed to mask_match");

	if (sp->sa_len > sizeof(s) || dp->sa_len > sizeof(d))
		return 0;
	if (sp->sa_family != ep->af || dp->sa_family != ep->af)
		return 0;
	if (sp->sa_len != ep->src->sa_len || dp->sa_len != ep->dst->sa_len)
		return 0;

	matchlen = 0;

	p = (const u_int8_t *)sp;
	q = (const u_int8_t *)ep->srcmask;
	r = (u_int8_t *)&s;
	for (i = 0; i < sp->sa_len; i++) {
		r[i] = p[i] & q[i];
		/* XXX estimate */
		matchlen += (q[i] ? 8 : 0);
	}

	p = (const u_int8_t *)dp;
	q = (const u_int8_t *)ep->dstmask;
	r = (u_int8_t *)&d;
	for (i = 0; i < dp->sa_len; i++) {
		r[i] = p[i] & q[i];
		/* XXX rough estimate */
		matchlen += (q[i] ? 8 : 0);
	}

	/* need to overwrite len/family portion as we don't compare them */
	s.ss_len = sp->sa_len;
	s.ss_family = sp->sa_family;
	d.ss_len = dp->sa_len;
	d.ss_family = dp->sa_family;

	if (memcmp(&s, ep->src, ep->src->sa_len) == 0 &&
	    memcmp(&d, ep->dst, ep->dst->sa_len) == 0) {
		return matchlen;
	} else
		return 0;
}
#endif

int
encap_lock_enter(void)
{
	int error;

	mutex_enter(&encap_whole.lock);
	while (encap_whole.busy != NULL) {
		error = cv_wait_sig(&encap_whole.cv, &encap_whole.lock);
		if (error) {
			mutex_exit(&encap_whole.lock);
			return error;
		}
	}
	KASSERT(encap_whole.busy == NULL);
	encap_whole.busy = curlwp;
	mutex_exit(&encap_whole.lock);

	return 0;
}

void
encap_lock_exit(void)
{

	mutex_enter(&encap_whole.lock);
	KASSERT(encap_whole.busy == curlwp);
	encap_whole.busy = NULL;
	cv_broadcast(&encap_whole.cv);
	mutex_exit(&encap_whole.lock);
}

bool
encap_lock_held(void)
{

	return (encap_whole.busy == curlwp);
}