/* $NetBSD: ip_encap.c,v 1.66 2017/11/15 10:42:41 knakahara Exp $ */
/* $KAME: ip_encap.c,v 1.73 2001/10/02 08:30:58 itojun Exp $ */

/*
 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the project nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * My grandfather said that there's a devil inside tunnelling technology...
 *
 * We have surprisingly many protocols that want packets with IP protocol
 * #4 or #41.  Here's a list of protocols that want protocol #41:
 *	RFC1933 configured tunnel
 *	RFC1933 automatic tunnel
 *	RFC2401 IPsec tunnel
 *	RFC2473 IPv6 generic packet tunnelling
 *	RFC2529 6over4 tunnel
 *	RFC3056 6to4 tunnel
 *	isatap tunnel
 *	mobile-ip6 (uses RFC2473)
 * Here's a list of protocols that want protocol #4:
 *	RFC1853 IPv4-in-IPv4 tunnelling
 *	RFC2003 IPv4 encapsulation within IPv4
 *	RFC2344 reverse tunnelling for mobile-ip4
 *	RFC2401 IPsec tunnel
 * Well, what can I say.  They impose different en/decapsulation mechanisms
 * from each other, so they need separate protocol handlers.  The only one
 * we can easily determine by protocol # is IPsec, which always has
 * AH/ESP/IPComp header right after outer IP header.
 *
 * So, clearly good old protosw does not work for protocol #4 and #41.
 * The code will let you match protocol via src/dst address pair.
 */
/* XXX is M_NETADDR correct? */

/*
 * With USE_RADIX the code will use radix table for tunnel lookup, for
 * tunnels registered with encap_attach() with an addr/mask pair.
 * Faster on machines with thousands of tunnel registrations (= interfaces).
 *
 * The code assumes that radix table code can handle non-continuous netmask,
 * as it will pass radix table memory region with (src + dst) sockaddr pair.
*/
#define USE_RADIX

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ip_encap.c,v 1.66 2017/11/15 10:42:41 knakahara Exp $");

#ifdef _KERNEL_OPT
#include "opt_mrouting.h"
#include "opt_inet.h"
#include "opt_net_mpsafe.h"
#endif

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/mbuf.h>
#include <sys/errno.h>
#include <sys/queue.h>
#include <sys/kmem.h>
#include <sys/mutex.h>
#include <sys/condvar.h>
#include <sys/psref.h>
#include <sys/pslist.h>

#include <net/if.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/ip_encap.h>
#ifdef MROUTING
#include <netinet/ip_mroute.h>
#endif /* MROUTING */

#ifdef INET6
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/ip6protosw.h> /* for struct ip6ctlparam */
#include <netinet6/in6_var.h>
#include <netinet6/in6_pcb.h>
#include <netinet/icmp6.h>
#endif

#include <net/net_osdep.h>

/*
 * When ENCAP_MPSAFE is not defined, encap_attach() and encap_attach_func()
 * bracket their work with splsoftnet()/splx().
 */
#ifdef NET_MPSAFE
#define ENCAP_MPSAFE	1
#endif

/*
 * Tells encap[46]_lookup() how to read the outer header: for INBOUND the
 * header's destination is "mine" and its source is "yours"; for OUTBOUND
 * the roles are swapped.
 */
enum direction { INBOUND, OUTBOUND };

/* Best-match lookups; a non-NULL result is returned with a psref held. */
#ifdef INET
static struct encaptab *encap4_lookup(struct mbuf *, int, int, enum direction,
    struct psref *);
#endif
#ifdef INET6
static struct encaptab *encap6_lookup(struct mbuf *, int, int, enum direction,
    struct psref *);
#endif
/* Writer-side helpers; encap_add()/encap_remove() assert encap_lock_held(). */
static int encap_add(struct encaptab *);
static int encap_remove(struct encaptab *);
static int encap_afcheck(int, const struct sockaddr *, const struct sockaddr *);
#ifdef USE_RADIX
static struct radix_node_head *encap_rnh(int);
static int mask_matchlen(const struct sockaddr *);
#else
static int mask_match(const struct encaptab *, const struct sockaddr *,
    const struct sockaddr *);
#endif

/*
 * In encap[46]_lookup(), ep->func can
sleep(e.g. rtalloc1) while walking 141 * encap_table. So, it cannot use pserialize_read_enter() 142 */ 143 static struct { 144 struct pslist_head list; 145 pserialize_t psz; 146 struct psref_class *elem_class; /* for the element of et_list */ 147 } encaptab __cacheline_aligned = { 148 .list = PSLIST_INITIALIZER, 149 }; 150 #define encap_table encaptab.list 151 152 static struct { 153 kmutex_t lock; 154 kcondvar_t cv; 155 struct lwp *busy; 156 } encap_whole __cacheline_aligned; 157 158 #ifdef USE_RADIX 159 struct radix_node_head *encap_head[2]; /* 0 for AF_INET, 1 for AF_INET6 */ 160 static bool encap_head_updating = false; 161 #endif 162 163 static bool encap_initialized = false; 164 /* 165 * must be done before other encap interfaces initialization. 166 */ 167 void 168 encapinit(void) 169 { 170 171 if (encap_initialized) 172 return; 173 174 encaptab.psz = pserialize_create(); 175 encaptab.elem_class = psref_class_create("encapelem", IPL_SOFTNET); 176 177 mutex_init(&encap_whole.lock, MUTEX_DEFAULT, IPL_NONE); 178 cv_init(&encap_whole.cv, "ip_encap cv"); 179 encap_whole.busy = NULL; 180 181 encap_initialized = true; 182 } 183 184 void 185 encap_init(void) 186 { 187 static int initialized = 0; 188 189 if (initialized) 190 return; 191 initialized++; 192 #if 0 193 /* 194 * we cannot use LIST_INIT() here, since drivers may want to call 195 * encap_attach(), on driver attach. encap_init() will be called 196 * on AF_INET{,6} initialization, which happens after driver 197 * initialization - using LIST_INIT() here can nuke encap_attach() 198 * from drivers. 199 */ 200 PSLIST_INIT(&encap_table); 201 #endif 202 203 #ifdef USE_RADIX 204 /* 205 * initialize radix lookup table when the radix subsystem is inited. 
206 */ 207 rn_delayedinit((void *)&encap_head[0], 208 sizeof(struct sockaddr_pack) << 3); 209 #ifdef INET6 210 rn_delayedinit((void *)&encap_head[1], 211 sizeof(struct sockaddr_pack) << 3); 212 #endif 213 #endif 214 } 215 216 #ifdef INET 217 static struct encaptab * 218 encap4_lookup(struct mbuf *m, int off, int proto, enum direction dir, 219 struct psref *match_psref) 220 { 221 struct ip *ip; 222 struct ip_pack4 pack; 223 struct encaptab *ep, *match; 224 int prio, matchprio; 225 int s; 226 #ifdef USE_RADIX 227 struct radix_node_head *rnh = encap_rnh(AF_INET); 228 struct radix_node *rn; 229 #endif 230 231 KASSERT(m->m_len >= sizeof(*ip)); 232 233 ip = mtod(m, struct ip *); 234 235 memset(&pack, 0, sizeof(pack)); 236 pack.p.sp_len = sizeof(pack); 237 pack.mine.sin_family = pack.yours.sin_family = AF_INET; 238 pack.mine.sin_len = pack.yours.sin_len = sizeof(struct sockaddr_in); 239 if (dir == INBOUND) { 240 pack.mine.sin_addr = ip->ip_dst; 241 pack.yours.sin_addr = ip->ip_src; 242 } else { 243 pack.mine.sin_addr = ip->ip_src; 244 pack.yours.sin_addr = ip->ip_dst; 245 } 246 247 match = NULL; 248 matchprio = 0; 249 250 s = pserialize_read_enter(); 251 #ifdef USE_RADIX 252 if (encap_head_updating) { 253 /* 254 * Update in progress. Do nothing. 
255 */ 256 pserialize_read_exit(s); 257 return NULL; 258 } 259 260 rn = rnh->rnh_matchaddr((void *)&pack, rnh); 261 if (rn && (rn->rn_flags & RNF_ROOT) == 0) { 262 struct encaptab *encapp = (struct encaptab *)rn; 263 264 psref_acquire(match_psref, &encapp->psref, 265 encaptab.elem_class); 266 match = encapp; 267 matchprio = mask_matchlen(match->srcmask) + 268 mask_matchlen(match->dstmask); 269 } 270 #endif 271 PSLIST_READER_FOREACH(ep, &encap_table, struct encaptab, chain) { 272 struct psref elem_psref; 273 274 if (ep->af != AF_INET) 275 continue; 276 if (ep->proto >= 0 && ep->proto != proto) 277 continue; 278 279 psref_acquire(&elem_psref, &ep->psref, 280 encaptab.elem_class); 281 if (ep->func) { 282 pserialize_read_exit(s); 283 /* ep->func is sleepable. e.g. rtalloc1 */ 284 prio = (*ep->func)(m, off, proto, ep->arg); 285 s = pserialize_read_enter(); 286 } else { 287 #ifdef USE_RADIX 288 psref_release(&elem_psref, &ep->psref, 289 encaptab.elem_class); 290 continue; 291 #else 292 prio = mask_match(ep, (struct sockaddr *)&pack.mine, 293 (struct sockaddr *)&pack.yours); 294 #endif 295 } 296 297 /* 298 * We prioritize the matches by using bit length of the 299 * matches. mask_match() and user-supplied matching function 300 * should return the bit length of the matches (for example, 301 * if both src/dst are matched for IPv4, 64 should be returned). 302 * 0 or negative return value means "it did not match". 303 * 304 * The question is, since we have two "mask" portion, we 305 * cannot really define total order between entries. 306 * For example, which of these should be preferred? 307 * mask_match() returns 48 (32 + 16) for both of them. 308 * src=3ffe::/16, dst=3ffe:501::/32 309 * src=3ffe:501::/32, dst=3ffe::/16 310 * 311 * We need to loop through all the possible candidates 312 * to get the best match - the search takes O(n) for 313 * n attachments (i.e. interfaces). 314 * 315 * For radix-based lookup, I guess source takes precedence. 
316 * See rn_{refines,lexobetter} for the correct answer. 317 */ 318 if (prio <= 0) { 319 psref_release(&elem_psref, &ep->psref, 320 encaptab.elem_class); 321 continue; 322 } 323 if (prio > matchprio) { 324 /* release last matched ep */ 325 if (match != NULL) 326 psref_release(match_psref, &match->psref, 327 encaptab.elem_class); 328 329 psref_copy(match_psref, &elem_psref, 330 encaptab.elem_class); 331 matchprio = prio; 332 match = ep; 333 } 334 KASSERTMSG((match == NULL) || psref_held(&match->psref, 335 encaptab.elem_class), 336 "current match = %p, but not hold its psref", match); 337 338 psref_release(&elem_psref, &ep->psref, 339 encaptab.elem_class); 340 } 341 pserialize_read_exit(s); 342 343 return match; 344 } 345 346 void 347 encap4_input(struct mbuf *m, ...) 348 { 349 int off, proto; 350 va_list ap; 351 const struct encapsw *esw; 352 struct encaptab *match; 353 struct psref match_psref; 354 355 va_start(ap, m); 356 off = va_arg(ap, int); 357 proto = va_arg(ap, int); 358 va_end(ap); 359 360 match = encap4_lookup(m, off, proto, INBOUND, &match_psref); 361 if (match) { 362 /* found a match, "match" has the best one */ 363 esw = match->esw; 364 if (esw && esw->encapsw4.pr_input) { 365 (*esw->encapsw4.pr_input)(m, off, proto, match->arg); 366 psref_release(&match_psref, &match->psref, 367 encaptab.elem_class); 368 } else { 369 psref_release(&match_psref, &match->psref, 370 encaptab.elem_class); 371 m_freem(m); 372 } 373 return; 374 } 375 376 /* last resort: inject to raw socket */ 377 rip_input(m, off, proto); 378 } 379 #endif 380 381 #ifdef INET6 382 static struct encaptab * 383 encap6_lookup(struct mbuf *m, int off, int proto, enum direction dir, 384 struct psref *match_psref) 385 { 386 struct ip6_hdr *ip6; 387 struct ip_pack6 pack; 388 int prio, matchprio; 389 int s; 390 struct encaptab *ep, *match; 391 #ifdef USE_RADIX 392 struct radix_node_head *rnh = encap_rnh(AF_INET6); 393 struct radix_node *rn; 394 #endif 395 396 KASSERT(m->m_len >= sizeof(*ip6)); 397 
398 ip6 = mtod(m, struct ip6_hdr *); 399 400 memset(&pack, 0, sizeof(pack)); 401 pack.p.sp_len = sizeof(pack); 402 pack.mine.sin6_family = pack.yours.sin6_family = AF_INET6; 403 pack.mine.sin6_len = pack.yours.sin6_len = sizeof(struct sockaddr_in6); 404 if (dir == INBOUND) { 405 pack.mine.sin6_addr = ip6->ip6_dst; 406 pack.yours.sin6_addr = ip6->ip6_src; 407 } else { 408 pack.mine.sin6_addr = ip6->ip6_src; 409 pack.yours.sin6_addr = ip6->ip6_dst; 410 } 411 412 match = NULL; 413 matchprio = 0; 414 415 s = pserialize_read_enter(); 416 #ifdef USE_RADIX 417 if (encap_head_updating) { 418 /* 419 * Update in progress. Do nothing. 420 */ 421 pserialize_read_exit(s); 422 return NULL; 423 } 424 425 rn = rnh->rnh_matchaddr((void *)&pack, rnh); 426 if (rn && (rn->rn_flags & RNF_ROOT) == 0) { 427 struct encaptab *encapp = (struct encaptab *)rn; 428 429 psref_acquire(match_psref, &encapp->psref, 430 encaptab.elem_class); 431 match = encapp; 432 matchprio = mask_matchlen(match->srcmask) + 433 mask_matchlen(match->dstmask); 434 } 435 #endif 436 PSLIST_READER_FOREACH(ep, &encap_table, struct encaptab, chain) { 437 struct psref elem_psref; 438 439 if (ep->af != AF_INET6) 440 continue; 441 if (ep->proto >= 0 && ep->proto != proto) 442 continue; 443 444 psref_acquire(&elem_psref, &ep->psref, 445 encaptab.elem_class); 446 447 if (ep->func) { 448 pserialize_read_exit(s); 449 /* ep->func is sleepable. e.g. 
rtalloc1 */ 450 prio = (*ep->func)(m, off, proto, ep->arg); 451 s = pserialize_read_enter(); 452 } else { 453 #ifdef USE_RADIX 454 psref_release(&elem_psref, &ep->psref, 455 encaptab.elem_class); 456 continue; 457 #else 458 prio = mask_match(ep, (struct sockaddr *)&pack.mine, 459 (struct sockaddr *)&pack.yours); 460 #endif 461 } 462 463 /* see encap4_lookup() for issues here */ 464 if (prio <= 0) { 465 psref_release(&elem_psref, &ep->psref, 466 encaptab.elem_class); 467 continue; 468 } 469 if (prio > matchprio) { 470 /* release last matched ep */ 471 if (match != NULL) 472 psref_release(match_psref, &match->psref, 473 encaptab.elem_class); 474 475 psref_copy(match_psref, &elem_psref, 476 encaptab.elem_class); 477 matchprio = prio; 478 match = ep; 479 } 480 KASSERTMSG((match == NULL) || psref_held(&match->psref, 481 encaptab.elem_class), 482 "current match = %p, but not hold its psref", match); 483 484 psref_release(&elem_psref, &ep->psref, 485 encaptab.elem_class); 486 } 487 pserialize_read_exit(s); 488 489 return match; 490 } 491 492 int 493 encap6_input(struct mbuf **mp, int *offp, int proto) 494 { 495 struct mbuf *m = *mp; 496 const struct encapsw *esw; 497 struct encaptab *match; 498 struct psref match_psref; 499 500 match = encap6_lookup(m, *offp, proto, INBOUND, &match_psref); 501 502 if (match) { 503 /* found a match */ 504 esw = match->esw; 505 if (esw && esw->encapsw6.pr_input) { 506 int ret; 507 ret = (*esw->encapsw6.pr_input)(mp, offp, proto, 508 match->arg); 509 psref_release(&match_psref, &match->psref, 510 encaptab.elem_class); 511 return ret; 512 } else { 513 psref_release(&match_psref, &match->psref, 514 encaptab.elem_class); 515 m_freem(m); 516 return IPPROTO_DONE; 517 } 518 } 519 520 /* last resort: inject to raw socket */ 521 return rip6_input(mp, offp, proto); 522 } 523 #endif 524 525 /* 526 * XXX 527 * The encaptab list and the rnh radix tree must be manipulated atomically. 
528 */ 529 static int 530 encap_add(struct encaptab *ep) 531 { 532 #ifdef USE_RADIX 533 struct radix_node_head *rnh = encap_rnh(ep->af); 534 #endif 535 536 KASSERT(encap_lock_held()); 537 538 #ifdef USE_RADIX 539 if (!ep->func && rnh) { 540 /* Disable access to the radix tree for reader. */ 541 encap_head_updating = true; 542 /* Wait for all readers to drain. */ 543 pserialize_perform(encaptab.psz); 544 545 if (!rnh->rnh_addaddr((void *)ep->addrpack, 546 (void *)ep->maskpack, rnh, ep->nodes)) { 547 encap_head_updating = false; 548 return EEXIST; 549 } 550 551 /* 552 * The ep added to the radix tree must be skipped while 553 * encap[46]_lookup walks encaptab list. In other words, 554 * encap_add() does not need to care whether the ep has 555 * been added encaptab list or not yet. 556 * So, we can re-enable access to the radix tree for now. 557 */ 558 encap_head_updating = false; 559 } 560 #endif 561 PSLIST_WRITER_INSERT_HEAD(&encap_table, ep, chain); 562 563 return 0; 564 } 565 566 /* 567 * XXX 568 * The encaptab list and the rnh radix tree must be manipulated atomically. 569 */ 570 static int 571 encap_remove(struct encaptab *ep) 572 { 573 #ifdef USE_RADIX 574 struct radix_node_head *rnh = encap_rnh(ep->af); 575 #endif 576 int error = 0; 577 578 KASSERT(encap_lock_held()); 579 580 #ifdef USE_RADIX 581 if (!ep->func && rnh) { 582 /* Disable access to the radix tree for reader. */ 583 encap_head_updating = true; 584 /* Wait for all readers to drain. */ 585 pserialize_perform(encaptab.psz); 586 587 if (!rnh->rnh_deladdr((void *)ep->addrpack, 588 (void *)ep->maskpack, rnh)) 589 error = ESRCH; 590 591 /* 592 * The ep added to the radix tree must be skipped while 593 * encap[46]_lookup walks encaptab list. In other words, 594 * encap_add() does not need to care whether the ep has 595 * been added encaptab list or not yet. 596 * So, we can re-enable access to the radix tree for now. 
597 */ 598 encap_head_updating = false; 599 } 600 #endif 601 PSLIST_WRITER_REMOVE(ep, chain); 602 603 return error; 604 } 605 606 static int 607 encap_afcheck(int af, const struct sockaddr *sp, const struct sockaddr *dp) 608 { 609 if (sp && dp) { 610 if (sp->sa_len != dp->sa_len) 611 return EINVAL; 612 if (af != sp->sa_family || af != dp->sa_family) 613 return EINVAL; 614 } else if (!sp && !dp) 615 ; 616 else 617 return EINVAL; 618 619 switch (af) { 620 case AF_INET: 621 if (sp && sp->sa_len != sizeof(struct sockaddr_in)) 622 return EINVAL; 623 if (dp && dp->sa_len != sizeof(struct sockaddr_in)) 624 return EINVAL; 625 break; 626 #ifdef INET6 627 case AF_INET6: 628 if (sp && sp->sa_len != sizeof(struct sockaddr_in6)) 629 return EINVAL; 630 if (dp && dp->sa_len != sizeof(struct sockaddr_in6)) 631 return EINVAL; 632 break; 633 #endif 634 default: 635 return EAFNOSUPPORT; 636 } 637 638 return 0; 639 } 640 641 /* 642 * sp (src ptr) is always my side, and dp (dst ptr) is always remote side. 643 * length of mask (sm and dm) is assumed to be same as sp/dp. 644 * Return value will be necessary as input (cookie) for encap_detach(). 
645 */ 646 const struct encaptab * 647 encap_attach(int af, int proto, 648 const struct sockaddr *sp, const struct sockaddr *sm, 649 const struct sockaddr *dp, const struct sockaddr *dm, 650 const struct encapsw *esw, void *arg) 651 { 652 struct encaptab *ep; 653 int error; 654 int pss; 655 size_t l; 656 struct ip_pack4 *pack4; 657 #ifdef INET6 658 struct ip_pack6 *pack6; 659 #endif 660 #ifndef ENCAP_MPSAFE 661 int s; 662 663 s = splsoftnet(); 664 #endif 665 /* sanity check on args */ 666 error = encap_afcheck(af, sp, dp); 667 if (error) 668 goto fail; 669 670 /* check if anyone have already attached with exactly same config */ 671 pss = pserialize_read_enter(); 672 PSLIST_READER_FOREACH(ep, &encap_table, struct encaptab, chain) { 673 if (ep->af != af) 674 continue; 675 if (ep->proto != proto) 676 continue; 677 if (ep->func) 678 continue; 679 680 KASSERT(ep->src != NULL); 681 KASSERT(ep->dst != NULL); 682 KASSERT(ep->srcmask != NULL); 683 KASSERT(ep->dstmask != NULL); 684 685 if (ep->src->sa_len != sp->sa_len || 686 memcmp(ep->src, sp, sp->sa_len) != 0 || 687 memcmp(ep->srcmask, sm, sp->sa_len) != 0) 688 continue; 689 if (ep->dst->sa_len != dp->sa_len || 690 memcmp(ep->dst, dp, dp->sa_len) != 0 || 691 memcmp(ep->dstmask, dm, dp->sa_len) != 0) 692 continue; 693 694 error = EEXIST; 695 pserialize_read_exit(pss); 696 goto fail; 697 } 698 pserialize_read_exit(pss); 699 700 switch (af) { 701 case AF_INET: 702 l = sizeof(*pack4); 703 break; 704 #ifdef INET6 705 case AF_INET6: 706 l = sizeof(*pack6); 707 break; 708 #endif 709 default: 710 goto fail; 711 } 712 713 /* M_NETADDR ok? 
*/ 714 ep = kmem_zalloc(sizeof(*ep), KM_NOSLEEP); 715 if (ep == NULL) { 716 error = ENOBUFS; 717 goto fail; 718 } 719 ep->addrpack = kmem_zalloc(l, KM_NOSLEEP); 720 if (ep->addrpack == NULL) { 721 error = ENOBUFS; 722 goto gc; 723 } 724 ep->maskpack = kmem_zalloc(l, KM_NOSLEEP); 725 if (ep->maskpack == NULL) { 726 error = ENOBUFS; 727 goto gc; 728 } 729 730 ep->af = af; 731 ep->proto = proto; 732 ep->addrpack->sa_len = l & 0xff; 733 ep->maskpack->sa_len = l & 0xff; 734 switch (af) { 735 case AF_INET: 736 pack4 = (struct ip_pack4 *)ep->addrpack; 737 ep->src = (struct sockaddr *)&pack4->mine; 738 ep->dst = (struct sockaddr *)&pack4->yours; 739 pack4 = (struct ip_pack4 *)ep->maskpack; 740 ep->srcmask = (struct sockaddr *)&pack4->mine; 741 ep->dstmask = (struct sockaddr *)&pack4->yours; 742 break; 743 #ifdef INET6 744 case AF_INET6: 745 pack6 = (struct ip_pack6 *)ep->addrpack; 746 ep->src = (struct sockaddr *)&pack6->mine; 747 ep->dst = (struct sockaddr *)&pack6->yours; 748 pack6 = (struct ip_pack6 *)ep->maskpack; 749 ep->srcmask = (struct sockaddr *)&pack6->mine; 750 ep->dstmask = (struct sockaddr *)&pack6->yours; 751 break; 752 #endif 753 } 754 755 memcpy(ep->src, sp, sp->sa_len); 756 memcpy(ep->srcmask, sm, sp->sa_len); 757 memcpy(ep->dst, dp, dp->sa_len); 758 memcpy(ep->dstmask, dm, dp->sa_len); 759 ep->esw = esw; 760 ep->arg = arg; 761 psref_target_init(&ep->psref, encaptab.elem_class); 762 763 error = encap_add(ep); 764 if (error) 765 goto gc; 766 767 error = 0; 768 #ifndef ENCAP_MPSAFE 769 splx(s); 770 #endif 771 return ep; 772 773 gc: 774 if (ep->addrpack) 775 kmem_free(ep->addrpack, l); 776 if (ep->maskpack) 777 kmem_free(ep->maskpack, l); 778 if (ep) 779 kmem_free(ep, sizeof(*ep)); 780 fail: 781 #ifndef ENCAP_MPSAFE 782 splx(s); 783 #endif 784 return NULL; 785 } 786 787 const struct encaptab * 788 encap_attach_func(int af, int proto, 789 int (*func)(struct mbuf *, int, int, void *), 790 const struct encapsw *esw, void *arg) 791 { 792 struct encaptab *ep; 793 
int error; 794 #ifndef ENCAP_MPSAFE 795 int s; 796 797 s = splsoftnet(); 798 #endif 799 /* sanity check on args */ 800 if (!func) { 801 error = EINVAL; 802 goto fail; 803 } 804 805 error = encap_afcheck(af, NULL, NULL); 806 if (error) 807 goto fail; 808 809 ep = kmem_alloc(sizeof(*ep), KM_NOSLEEP); /*XXX*/ 810 if (ep == NULL) { 811 error = ENOBUFS; 812 goto fail; 813 } 814 memset(ep, 0, sizeof(*ep)); 815 816 ep->af = af; 817 ep->proto = proto; 818 ep->func = func; 819 ep->esw = esw; 820 ep->arg = arg; 821 psref_target_init(&ep->psref, encaptab.elem_class); 822 823 error = encap_add(ep); 824 if (error) 825 goto fail; 826 827 error = 0; 828 #ifndef ENCAP_MPSAFE 829 splx(s); 830 #endif 831 return ep; 832 833 fail: 834 #ifndef ENCAP_MPSAFE 835 splx(s); 836 #endif 837 return NULL; 838 } 839 840 /* XXX encap4_ctlinput() is necessary if we set DF=1 on outer IPv4 header */ 841 842 #ifdef INET6 843 void * 844 encap6_ctlinput(int cmd, const struct sockaddr *sa, void *d0) 845 { 846 void *d = d0; 847 struct ip6_hdr *ip6; 848 struct mbuf *m; 849 int off; 850 struct ip6ctlparam *ip6cp = NULL; 851 int nxt; 852 int s; 853 struct encaptab *ep; 854 const struct encapsw *esw; 855 856 if (sa->sa_family != AF_INET6 || 857 sa->sa_len != sizeof(struct sockaddr_in6)) 858 return NULL; 859 860 if ((unsigned)cmd >= PRC_NCMDS) 861 return NULL; 862 if (cmd == PRC_HOSTDEAD) 863 d = NULL; 864 else if (cmd == PRC_MSGSIZE) 865 ; /* special code is present, see below */ 866 else if (inet6ctlerrmap[cmd] == 0) 867 return NULL; 868 869 /* if the parameter is from icmp6, decode it. */ 870 if (d != NULL) { 871 ip6cp = (struct ip6ctlparam *)d; 872 m = ip6cp->ip6c_m; 873 ip6 = ip6cp->ip6c_ip6; 874 off = ip6cp->ip6c_off; 875 nxt = ip6cp->ip6c_nxt; 876 877 if (ip6 && cmd == PRC_MSGSIZE) { 878 int valid = 0; 879 struct encaptab *match; 880 struct psref elem_psref; 881 882 /* 883 * Check to see if we have a valid encap configuration. 
884 */ 885 match = encap6_lookup(m, off, nxt, OUTBOUND, 886 &elem_psref); 887 if (match) 888 valid++; 889 psref_release(&elem_psref, &match->psref, 890 encaptab.elem_class); 891 892 /* 893 * Depending on the value of "valid" and routing table 894 * size (mtudisc_{hi,lo}wat), we will: 895 * - recalcurate the new MTU and create the 896 * corresponding routing entry, or 897 * - ignore the MTU change notification. 898 */ 899 icmp6_mtudisc_update((struct ip6ctlparam *)d, valid); 900 } 901 } else { 902 m = NULL; 903 ip6 = NULL; 904 nxt = -1; 905 } 906 907 /* inform all listeners */ 908 909 s = pserialize_read_enter(); 910 PSLIST_READER_FOREACH(ep, &encap_table, struct encaptab, chain) { 911 struct psref elem_psref; 912 913 if (ep->af != AF_INET6) 914 continue; 915 if (ep->proto >= 0 && ep->proto != nxt) 916 continue; 917 918 /* should optimize by looking at address pairs */ 919 920 /* XXX need to pass ep->arg or ep itself to listeners */ 921 psref_acquire(&elem_psref, &ep->psref, 922 encaptab.elem_class); 923 esw = ep->esw; 924 if (esw && esw->encapsw6.pr_ctlinput) { 925 pserialize_read_exit(s); 926 /* pr_ctlinput is sleepable. e.g. 
rtcache_free */ 927 (*esw->encapsw6.pr_ctlinput)(cmd, sa, d, ep->arg); 928 s = pserialize_read_enter(); 929 } 930 psref_release(&elem_psref, &ep->psref, 931 encaptab.elem_class); 932 } 933 pserialize_read_exit(s); 934 935 rip6_ctlinput(cmd, sa, d0); 936 return NULL; 937 } 938 #endif 939 940 int 941 encap_detach(const struct encaptab *cookie) 942 { 943 const struct encaptab *ep = cookie; 944 struct encaptab *p; 945 int error; 946 947 KASSERT(encap_lock_held()); 948 949 PSLIST_WRITER_FOREACH(p, &encap_table, struct encaptab, chain) { 950 if (p == ep) { 951 error = encap_remove(p); 952 if (error) 953 return error; 954 else 955 break; 956 } 957 } 958 if (p == NULL) 959 return ENOENT; 960 961 pserialize_perform(encaptab.psz); 962 psref_target_destroy(&p->psref, 963 encaptab.elem_class); 964 if (!ep->func) { 965 kmem_free(p->addrpack, ep->addrpack->sa_len); 966 kmem_free(p->maskpack, ep->maskpack->sa_len); 967 } 968 kmem_free(p, sizeof(*p)); 969 970 return 0; 971 } 972 973 #ifdef USE_RADIX 974 static struct radix_node_head * 975 encap_rnh(int af) 976 { 977 978 switch (af) { 979 case AF_INET: 980 return encap_head[0]; 981 #ifdef INET6 982 case AF_INET6: 983 return encap_head[1]; 984 #endif 985 default: 986 return NULL; 987 } 988 } 989 990 static int 991 mask_matchlen(const struct sockaddr *sa) 992 { 993 const char *p, *ep; 994 int l; 995 996 p = (const char *)sa; 997 ep = p + sa->sa_len; 998 p += 2; /* sa_len + sa_family */ 999 1000 l = 0; 1001 while (p < ep) { 1002 l += (*p ? 
8 : 0); /* estimate */ 1003 p++; 1004 } 1005 return l; 1006 } 1007 #endif 1008 1009 #ifndef USE_RADIX 1010 static int 1011 mask_match(const struct encaptab *ep, 1012 const struct sockaddr *sp, 1013 const struct sockaddr *dp) 1014 { 1015 struct sockaddr_storage s; 1016 struct sockaddr_storage d; 1017 int i; 1018 const u_int8_t *p, *q; 1019 u_int8_t *r; 1020 int matchlen; 1021 1022 KASSERTMSG(ep->func == NULL, "wrong encaptab passed to mask_match"); 1023 1024 if (sp->sa_len > sizeof(s) || dp->sa_len > sizeof(d)) 1025 return 0; 1026 if (sp->sa_family != ep->af || dp->sa_family != ep->af) 1027 return 0; 1028 if (sp->sa_len != ep->src->sa_len || dp->sa_len != ep->dst->sa_len) 1029 return 0; 1030 1031 matchlen = 0; 1032 1033 p = (const u_int8_t *)sp; 1034 q = (const u_int8_t *)ep->srcmask; 1035 r = (u_int8_t *)&s; 1036 for (i = 0 ; i < sp->sa_len; i++) { 1037 r[i] = p[i] & q[i]; 1038 /* XXX estimate */ 1039 matchlen += (q[i] ? 8 : 0); 1040 } 1041 1042 p = (const u_int8_t *)dp; 1043 q = (const u_int8_t *)ep->dstmask; 1044 r = (u_int8_t *)&d; 1045 for (i = 0 ; i < dp->sa_len; i++) { 1046 r[i] = p[i] & q[i]; 1047 /* XXX rough estimate */ 1048 matchlen += (q[i] ? 
8 : 0); 1049 } 1050 1051 /* need to overwrite len/family portion as we don't compare them */ 1052 s.ss_len = sp->sa_len; 1053 s.ss_family = sp->sa_family; 1054 d.ss_len = dp->sa_len; 1055 d.ss_family = dp->sa_family; 1056 1057 if (memcmp(&s, ep->src, ep->src->sa_len) == 0 && 1058 memcmp(&d, ep->dst, ep->dst->sa_len) == 0) { 1059 return matchlen; 1060 } else 1061 return 0; 1062 } 1063 #endif 1064 1065 int 1066 encap_lock_enter(void) 1067 { 1068 int error; 1069 1070 mutex_enter(&encap_whole.lock); 1071 while (encap_whole.busy != NULL) { 1072 error = cv_wait_sig(&encap_whole.cv, &encap_whole.lock); 1073 if (error) { 1074 mutex_exit(&encap_whole.lock); 1075 return error; 1076 } 1077 } 1078 KASSERT(encap_whole.busy == NULL); 1079 encap_whole.busy = curlwp; 1080 mutex_exit(&encap_whole.lock); 1081 1082 return 0; 1083 } 1084 1085 void 1086 encap_lock_exit(void) 1087 { 1088 1089 mutex_enter(&encap_whole.lock); 1090 KASSERT(encap_whole.busy == curlwp); 1091 encap_whole.busy = NULL; 1092 cv_broadcast(&encap_whole.cv); 1093 mutex_exit(&encap_whole.lock); 1094 } 1095 1096 bool 1097 encap_lock_held(void) 1098 { 1099 1100 return (encap_whole.busy == curlwp); 1101 } 1102