1 /* $NetBSD: ip_encap.c,v 1.61 2016/07/04 04:40:13 knakahara Exp $ */ 2 /* $KAME: ip_encap.c,v 1.73 2001/10/02 08:30:58 itojun Exp $ */ 3 4 /* 5 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. 6 * All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of the project nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 */ 32 /* 33 * My grandfather said that there's a devil inside tunnelling technology... 34 * 35 * We have surprisingly many protocols that want packets with IP protocol 36 * #4 or #41. Here's a list of protocols that want protocol #41: 37 * RFC1933 configured tunnel 38 * RFC1933 automatic tunnel 39 * RFC2401 IPsec tunnel 40 * RFC2473 IPv6 generic packet tunnelling 41 * RFC2529 6over4 tunnel 42 * RFC3056 6to4 tunnel 43 * isatap tunnel 44 * mobile-ip6 (uses RFC2473) 45 * Here's a list of protocol that want protocol #4: 46 * RFC1853 IPv4-in-IPv4 tunnelling 47 * RFC2003 IPv4 encapsulation within IPv4 48 * RFC2344 reverse tunnelling for mobile-ip4 49 * RFC2401 IPsec tunnel 50 * Well, what can I say. They impose different en/decapsulation mechanism 51 * from each other, so they need separate protocol handler. The only one 52 * we can easily determine by protocol # is IPsec, which always has 53 * AH/ESP/IPComp header right after outer IP header. 54 * 55 * So, clearly good old protosw does not work for protocol #4 and #41. 56 * The code will let you match protocol via src/dst address pair. 57 */ 58 /* XXX is M_NETADDR correct? */ 59 60 /* 61 * With USE_RADIX the code will use radix table for tunnel lookup, for 62 * tunnels registered with encap_attach() with a addr/mask pair. 63 * Faster on machines with thousands of tunnel registerations (= interfaces). 64 * 65 * The code assumes that radix table code can handle non-continuous netmask, 66 * as it will pass radix table memory region with (src + dst) sockaddr pair. 67 */ 68 #define USE_RADIX 69 70 #include <sys/cdefs.h> 71 __KERNEL_RCSID(0, "$NetBSD: ip_encap.c,v 1.61 2016/07/04 04:40:13 knakahara Exp $"); 72 73 #ifdef _KERNEL_OPT 74 #include "opt_mrouting.h" 75 #include "opt_inet.h" 76 #include "opt_net_mpsafe.h" 77 #endif 78 79 #include <sys/param.h> 80 #include <sys/systm.h> 81 #include <sys/socket.h> 82 #include <sys/sockio.h> 83 #include <sys/mbuf.h> 84 #include <sys/errno.h> 85 #include <sys/queue.h> 86 #include <sys/kmem.h> 87 #include <sys/mutex.h> 88 #include <sys/condvar.h> 89 #include <sys/psref.h> 90 #include <sys/pslist.h> 91 92 #include <net/if.h> 93 94 #include <netinet/in.h> 95 #include <netinet/in_systm.h> 96 #include <netinet/ip.h> 97 #include <netinet/ip_var.h> 98 #include <netinet/ip_encap.h> 99 #ifdef MROUTING 100 #include <netinet/ip_mroute.h> 101 #endif /* MROUTING */ 102 103 #ifdef INET6 104 #include <netinet/ip6.h> 105 #include <netinet6/ip6_var.h> 106 #include <netinet6/ip6protosw.h> /* for struct ip6ctlparam */ 107 #include <netinet6/in6_var.h> 108 #include <netinet6/in6_pcb.h> 109 #include <netinet/icmp6.h> 110 #endif 111 112 #include <net/net_osdep.h> 113 114 #ifdef NET_MPSAFE 115 #define ENCAP_MPSAFE 1 116 #endif 117 118 enum direction { INBOUND, OUTBOUND }; 119 120 #ifdef INET 121 static struct encaptab *encap4_lookup(struct mbuf *, int, int, enum direction, 122 struct psref *); 123 #endif 124 #ifdef INET6 125 static struct encaptab *encap6_lookup(struct mbuf *, int, int, enum direction, 126 struct psref *); 127 #endif 128 static int encap_add(struct encaptab *); 129 static int encap_remove(struct encaptab *); 130 static int encap_afcheck(int, const struct sockaddr *, const struct sockaddr *); 131 #ifdef USE_RADIX 132 static struct radix_node_head *encap_rnh(int); 133 static int mask_matchlen(const struct sockaddr *); 134 #else 135 static int mask_match(const struct encaptab *, const struct sockaddr *, 136 const struct sockaddr *); 137 #endif 138 static void encap_fillarg(struct mbuf *, const struct encaptab *); 139 140 /* 141 * In encap[46]_lookup(), ep->func can sleep(e.g. rtalloc1) while walking 142 * encap_table. So, it cannot use pserialize_read_enter() 143 */ 144 static struct { 145 struct pslist_head list; 146 pserialize_t psz; 147 struct psref_class *elem_class; /* for the element of et_list */ 148 } encaptab __cacheline_aligned = { 149 .list = PSLIST_INITIALIZER, 150 }; 151 #define encap_table encaptab.list 152 153 static struct { 154 kmutex_t lock; 155 kcondvar_t cv; 156 struct lwp *busy; 157 } encap_whole __cacheline_aligned; 158 159 #ifdef USE_RADIX 160 struct radix_node_head *encap_head[2]; /* 0 for AF_INET, 1 for AF_INET6 */ 161 static bool encap_head_updating = false; 162 #endif 163 164 /* 165 * must be done before other encap interfaces initialization. 166 */ 167 void 168 encapinit(void) 169 { 170 171 encaptab.psz = pserialize_create(); 172 encaptab.elem_class = psref_class_create("encapelem", IPL_SOFTNET); 173 if (encaptab.elem_class == NULL) 174 panic("encaptab.elem_class cannot be allocated.\n"); 175 176 mutex_init(&encap_whole.lock, MUTEX_DEFAULT, IPL_NONE); 177 cv_init(&encap_whole.cv, "ip_encap cv"); 178 encap_whole.busy = NULL; 179 } 180 181 void 182 encap_init(void) 183 { 184 static int initialized = 0; 185 186 if (initialized) 187 return; 188 initialized++; 189 #if 0 190 /* 191 * we cannot use LIST_INIT() here, since drivers may want to call 192 * encap_attach(), on driver attach. encap_init() will be called 193 * on AF_INET{,6} initialization, which happens after driver 194 * initialization - using LIST_INIT() here can nuke encap_attach() 195 * from drivers. 196 */ 197 PSLIST_INIT(&encap_table); 198 #endif 199 200 #ifdef USE_RADIX 201 /* 202 * initialize radix lookup table when the radix subsystem is inited. 203 */ 204 rn_delayedinit((void *)&encap_head[0], 205 sizeof(struct sockaddr_pack) << 3); 206 #ifdef INET6 207 rn_delayedinit((void *)&encap_head[1], 208 sizeof(struct sockaddr_pack) << 3); 209 #endif 210 #endif 211 } 212 213 #ifdef INET 214 static struct encaptab * 215 encap4_lookup(struct mbuf *m, int off, int proto, enum direction dir, 216 struct psref *match_psref) 217 { 218 struct ip *ip; 219 struct ip_pack4 pack; 220 struct encaptab *ep, *match; 221 int prio, matchprio; 222 int s; 223 #ifdef USE_RADIX 224 struct radix_node_head *rnh = encap_rnh(AF_INET); 225 struct radix_node *rn; 226 #endif 227 228 KASSERT(m->m_len >= sizeof(*ip)); 229 230 ip = mtod(m, struct ip *); 231 232 memset(&pack, 0, sizeof(pack)); 233 pack.p.sp_len = sizeof(pack); 234 pack.mine.sin_family = pack.yours.sin_family = AF_INET; 235 pack.mine.sin_len = pack.yours.sin_len = sizeof(struct sockaddr_in); 236 if (dir == INBOUND) { 237 pack.mine.sin_addr = ip->ip_dst; 238 pack.yours.sin_addr = ip->ip_src; 239 } else { 240 pack.mine.sin_addr = ip->ip_src; 241 pack.yours.sin_addr = ip->ip_dst; 242 } 243 244 match = NULL; 245 matchprio = 0; 246 247 s = pserialize_read_enter(); 248 #ifdef USE_RADIX 249 if (encap_head_updating) { 250 /* 251 * Update in progress. Do nothing. 252 */ 253 pserialize_read_exit(s); 254 return NULL; 255 } 256 257 rn = rnh->rnh_matchaddr((void *)&pack, rnh); 258 if (rn && (rn->rn_flags & RNF_ROOT) == 0) { 259 struct encaptab *encapp = (struct encaptab *)rn; 260 261 psref_acquire(match_psref, &encapp->psref, 262 encaptab.elem_class); 263 match = encapp; 264 matchprio = mask_matchlen(match->srcmask) + 265 mask_matchlen(match->dstmask); 266 } 267 #endif 268 PSLIST_READER_FOREACH(ep, &encap_table, struct encaptab, chain) { 269 struct psref elem_psref; 270 271 membar_datadep_consumer(); 272 273 if (ep->af != AF_INET) 274 continue; 275 if (ep->proto >= 0 && ep->proto != proto) 276 continue; 277 278 psref_acquire(&elem_psref, &ep->psref, 279 encaptab.elem_class); 280 if (ep->func) { 281 pserialize_read_exit(s); 282 /* ep->func is sleepable. e.g. rtalloc1 */ 283 prio = (*ep->func)(m, off, proto, ep->arg); 284 s = pserialize_read_enter(); 285 } else { 286 #ifdef USE_RADIX 287 psref_release(&elem_psref, &ep->psref, 288 encaptab.elem_class); 289 continue; 290 #else 291 prio = mask_match(ep, (struct sockaddr *)&pack.mine, 292 (struct sockaddr *)&pack.yours); 293 #endif 294 } 295 296 /* 297 * We prioritize the matches by using bit length of the 298 * matches. mask_match() and user-supplied matching function 299 * should return the bit length of the matches (for example, 300 * if both src/dst are matched for IPv4, 64 should be returned). 301 * 0 or negative return value means "it did not match". 302 * 303 * The question is, since we have two "mask" portion, we 304 * cannot really define total order between entries. 305 * For example, which of these should be preferred? 306 * mask_match() returns 48 (32 + 16) for both of them. 307 * src=3ffe::/16, dst=3ffe:501::/32 308 * src=3ffe:501::/32, dst=3ffe::/16 309 * 310 * We need to loop through all the possible candidates 311 * to get the best match - the search takes O(n) for 312 * n attachments (i.e. interfaces). 313 * 314 * For radix-based lookup, I guess source takes precedence. 315 * See rn_{refines,lexobetter} for the correct answer. 316 */ 317 if (prio <= 0) { 318 psref_release(&elem_psref, &ep->psref, 319 encaptab.elem_class); 320 continue; 321 } 322 if (prio > matchprio) { 323 /* release last matched ep */ 324 if (match != NULL) 325 psref_release(match_psref, &match->psref, 326 encaptab.elem_class); 327 328 psref_copy(match_psref, &elem_psref, 329 encaptab.elem_class); 330 matchprio = prio; 331 match = ep; 332 } 333 KASSERTMSG((match == NULL) || psref_held(&match->psref, 334 encaptab.elem_class), 335 "current match = %p, but not hold its psref", match); 336 337 psref_release(&elem_psref, &ep->psref, 338 encaptab.elem_class); 339 } 340 pserialize_read_exit(s); 341 342 return match; 343 } 344 345 void 346 encap4_input(struct mbuf *m, ...) 347 { 348 int off, proto; 349 va_list ap; 350 const struct encapsw *esw; 351 struct encaptab *match; 352 struct psref match_psref; 353 354 va_start(ap, m); 355 off = va_arg(ap, int); 356 proto = va_arg(ap, int); 357 va_end(ap); 358 359 match = encap4_lookup(m, off, proto, INBOUND, &match_psref); 360 if (match) { 361 /* found a match, "match" has the best one */ 362 esw = match->esw; 363 if (esw && esw->encapsw4.pr_input) { 364 encap_fillarg(m, match); 365 (*esw->encapsw4.pr_input)(m, off, proto); 366 psref_release(&match_psref, &match->psref, 367 encaptab.elem_class); 368 } else { 369 psref_release(&match_psref, &match->psref, 370 encaptab.elem_class); 371 m_freem(m); 372 } 373 return; 374 } 375 376 /* last resort: inject to raw socket */ 377 rip_input(m, off, proto); 378 } 379 #endif 380 381 #ifdef INET6 382 static struct encaptab * 383 encap6_lookup(struct mbuf *m, int off, int proto, enum direction dir, 384 struct psref *match_psref) 385 { 386 struct ip6_hdr *ip6; 387 struct ip_pack6 pack; 388 int prio, matchprio; 389 int s; 390 struct encaptab *ep, *match; 391 #ifdef USE_RADIX 392 struct radix_node_head *rnh = encap_rnh(AF_INET6); 393 struct radix_node *rn; 394 #endif 395 396 KASSERT(m->m_len >= sizeof(*ip6)); 397 398 ip6 = mtod(m, struct ip6_hdr *); 399 400 memset(&pack, 0, sizeof(pack)); 401 pack.p.sp_len = sizeof(pack); 402 pack.mine.sin6_family = pack.yours.sin6_family = AF_INET6; 403 pack.mine.sin6_len = pack.yours.sin6_len = sizeof(struct sockaddr_in6); 404 if (dir == INBOUND) { 405 pack.mine.sin6_addr = ip6->ip6_dst; 406 pack.yours.sin6_addr = ip6->ip6_src; 407 } else { 408 pack.mine.sin6_addr = ip6->ip6_src; 409 pack.yours.sin6_addr = ip6->ip6_dst; 410 } 411 412 match = NULL; 413 matchprio = 0; 414 415 s = pserialize_read_enter(); 416 #ifdef USE_RADIX 417 if (encap_head_updating) { 418 /* 419 * Update in progress. Do nothing. 420 */ 421 pserialize_read_exit(s); 422 return NULL; 423 } 424 425 rn = rnh->rnh_matchaddr((void *)&pack, rnh); 426 if (rn && (rn->rn_flags & RNF_ROOT) == 0) { 427 struct encaptab *encapp = (struct encaptab *)rn; 428 429 psref_acquire(match_psref, &encapp->psref, 430 encaptab.elem_class); 431 match = encapp; 432 matchprio = mask_matchlen(match->srcmask) + 433 mask_matchlen(match->dstmask); 434 } 435 #endif 436 PSLIST_READER_FOREACH(ep, &encap_table, struct encaptab, chain) { 437 struct psref elem_psref; 438 439 membar_datadep_consumer(); 440 441 if (ep->af != AF_INET6) 442 continue; 443 if (ep->proto >= 0 && ep->proto != proto) 444 continue; 445 446 psref_acquire(&elem_psref, &ep->psref, 447 encaptab.elem_class); 448 449 if (ep->func) { 450 pserialize_read_exit(s); 451 /* ep->func is sleepable. e.g. rtalloc1 */ 452 prio = (*ep->func)(m, off, proto, ep->arg); 453 s = pserialize_read_enter(); 454 } else { 455 #ifdef USE_RADIX 456 psref_release(&elem_psref, &ep->psref, 457 encaptab.elem_class); 458 continue; 459 #else 460 prio = mask_match(ep, (struct sockaddr *)&pack.mine, 461 (struct sockaddr *)&pack.yours); 462 #endif 463 } 464 465 /* see encap4_lookup() for issues here */ 466 if (prio <= 0) { 467 psref_release(&elem_psref, &ep->psref, 468 encaptab.elem_class); 469 continue; 470 } 471 if (prio > matchprio) { 472 /* release last matched ep */ 473 if (match != NULL) 474 psref_release(match_psref, &match->psref, 475 encaptab.elem_class); 476 477 psref_copy(match_psref, &elem_psref, 478 encaptab.elem_class); 479 matchprio = prio; 480 match = ep; 481 } 482 KASSERTMSG((match == NULL) || psref_held(&match->psref, 483 encaptab.elem_class), 484 "current match = %p, but not hold its psref", match); 485 486 psref_release(&elem_psref, &ep->psref, 487 encaptab.elem_class); 488 } 489 pserialize_read_exit(s); 490 491 return match; 492 } 493 494 int 495 encap6_input(struct mbuf **mp, int *offp, int proto) 496 { 497 struct mbuf *m = *mp; 498 const struct encapsw *esw; 499 struct encaptab *match; 500 struct psref match_psref; 501 502 match = encap6_lookup(m, *offp, proto, INBOUND, &match_psref); 503 504 if (match) { 505 /* found a match */ 506 esw = match->esw; 507 if (esw && esw->encapsw6.pr_input) { 508 int ret; 509 encap_fillarg(m, match); 510 ret = (*esw->encapsw6.pr_input)(mp, offp, proto); 511 psref_release(&match_psref, &match->psref, 512 encaptab.elem_class); 513 return ret; 514 } else { 515 psref_release(&match_psref, &match->psref, 516 encaptab.elem_class); 517 m_freem(m); 518 return IPPROTO_DONE; 519 } 520 } 521 522 /* last resort: inject to raw socket */ 523 return rip6_input(mp, offp, proto); 524 } 525 #endif 526 527 /* 528 * XXX 529 * The encaptab list and the rnh radix tree must be manipulated atomically. 530 */ 531 static int 532 encap_add(struct encaptab *ep) 533 { 534 #ifdef USE_RADIX 535 struct radix_node_head *rnh = encap_rnh(ep->af); 536 #endif 537 538 KASSERT(encap_lock_held()); 539 540 #ifdef USE_RADIX 541 if (!ep->func && rnh) { 542 /* Disable access to the radix tree for reader. */ 543 encap_head_updating = true; 544 /* Wait for all readers to drain. */ 545 pserialize_perform(encaptab.psz); 546 547 if (!rnh->rnh_addaddr((void *)ep->addrpack, 548 (void *)ep->maskpack, rnh, ep->nodes)) { 549 encap_head_updating = false; 550 return EEXIST; 551 } 552 553 /* 554 * The ep added to the radix tree must be skipped while 555 * encap[46]_lookup walks encaptab list. In other words, 556 * encap_add() does not need to care whether the ep has 557 * been added encaptab list or not yet. 558 * So, we can re-enable access to the radix tree for now. 559 */ 560 encap_head_updating = false; 561 } 562 #endif 563 PSLIST_WRITER_INSERT_HEAD(&encap_table, ep, chain); 564 565 return 0; 566 } 567 568 /* 569 * XXX 570 * The encaptab list and the rnh radix tree must be manipulated atomically. 571 */ 572 static int 573 encap_remove(struct encaptab *ep) 574 { 575 #ifdef USE_RADIX 576 struct radix_node_head *rnh = encap_rnh(ep->af); 577 #endif 578 int error = 0; 579 580 KASSERT(encap_lock_held()); 581 582 #ifdef USE_RADIX 583 if (!ep->func && rnh) { 584 /* Disable access to the radix tree for reader. */ 585 encap_head_updating = true; 586 /* Wait for all readers to drain. */ 587 pserialize_perform(encaptab.psz); 588 589 if (!rnh->rnh_deladdr((void *)ep->addrpack, 590 (void *)ep->maskpack, rnh)) 591 error = ESRCH; 592 593 /* 594 * The ep added to the radix tree must be skipped while 595 * encap[46]_lookup walks encaptab list. In other words, 596 * encap_add() does not need to care whether the ep has 597 * been added encaptab list or not yet. 598 * So, we can re-enable access to the radix tree for now. 599 */ 600 encap_head_updating = false; 601 } 602 #endif 603 PSLIST_WRITER_REMOVE(ep, chain); 604 605 return error; 606 } 607 608 static int 609 encap_afcheck(int af, const struct sockaddr *sp, const struct sockaddr *dp) 610 { 611 if (sp && dp) { 612 if (sp->sa_len != dp->sa_len) 613 return EINVAL; 614 if (af != sp->sa_family || af != dp->sa_family) 615 return EINVAL; 616 } else if (!sp && !dp) 617 ; 618 else 619 return EINVAL; 620 621 switch (af) { 622 case AF_INET: 623 if (sp && sp->sa_len != sizeof(struct sockaddr_in)) 624 return EINVAL; 625 if (dp && dp->sa_len != sizeof(struct sockaddr_in)) 626 return EINVAL; 627 break; 628 #ifdef INET6 629 case AF_INET6: 630 if (sp && sp->sa_len != sizeof(struct sockaddr_in6)) 631 return EINVAL; 632 if (dp && dp->sa_len != sizeof(struct sockaddr_in6)) 633 return EINVAL; 634 break; 635 #endif 636 default: 637 return EAFNOSUPPORT; 638 } 639 640 return 0; 641 } 642 643 /* 644 * sp (src ptr) is always my side, and dp (dst ptr) is always remote side. 645 * length of mask (sm and dm) is assumed to be same as sp/dp. 646 * Return value will be necessary as input (cookie) for encap_detach(). 647 */ 648 const struct encaptab * 649 encap_attach(int af, int proto, 650 const struct sockaddr *sp, const struct sockaddr *sm, 651 const struct sockaddr *dp, const struct sockaddr *dm, 652 const struct encapsw *esw, void *arg) 653 { 654 struct encaptab *ep; 655 int error; 656 int pss; 657 size_t l; 658 struct ip_pack4 *pack4; 659 #ifdef INET6 660 struct ip_pack6 *pack6; 661 #endif 662 #ifndef ENCAP_MPSAFE 663 int s; 664 665 s = splsoftnet(); 666 #endif 667 /* sanity check on args */ 668 error = encap_afcheck(af, sp, dp); 669 if (error) 670 goto fail; 671 672 /* check if anyone have already attached with exactly same config */ 673 pss = pserialize_read_enter(); 674 PSLIST_READER_FOREACH(ep, &encap_table, struct encaptab, chain) { 675 membar_datadep_consumer(); 676 677 if (ep->af != af) 678 continue; 679 if (ep->proto != proto) 680 continue; 681 if (ep->func) 682 continue; 683 684 KASSERT(ep->src != NULL); 685 KASSERT(ep->dst != NULL); 686 KASSERT(ep->srcmask != NULL); 687 KASSERT(ep->dstmask != NULL); 688 689 if (ep->src->sa_len != sp->sa_len || 690 memcmp(ep->src, sp, sp->sa_len) != 0 || 691 memcmp(ep->srcmask, sm, sp->sa_len) != 0) 692 continue; 693 if (ep->dst->sa_len != dp->sa_len || 694 memcmp(ep->dst, dp, dp->sa_len) != 0 || 695 memcmp(ep->dstmask, dm, dp->sa_len) != 0) 696 continue; 697 698 error = EEXIST; 699 pserialize_read_exit(pss); 700 goto fail; 701 } 702 pserialize_read_exit(pss); 703 704 switch (af) { 705 case AF_INET: 706 l = sizeof(*pack4); 707 break; 708 #ifdef INET6 709 case AF_INET6: 710 l = sizeof(*pack6); 711 break; 712 #endif 713 default: 714 goto fail; 715 } 716 717 /* M_NETADDR ok? */ 718 ep = kmem_zalloc(sizeof(*ep), KM_NOSLEEP); 719 if (ep == NULL) { 720 error = ENOBUFS; 721 goto fail; 722 } 723 ep->addrpack = kmem_zalloc(l, KM_NOSLEEP); 724 if (ep->addrpack == NULL) { 725 error = ENOBUFS; 726 goto gc; 727 } 728 ep->maskpack = kmem_zalloc(l, KM_NOSLEEP); 729 if (ep->maskpack == NULL) { 730 error = ENOBUFS; 731 goto gc; 732 } 733 734 ep->af = af; 735 ep->proto = proto; 736 ep->addrpack->sa_len = l & 0xff; 737 ep->maskpack->sa_len = l & 0xff; 738 switch (af) { 739 case AF_INET: 740 pack4 = (struct ip_pack4 *)ep->addrpack; 741 ep->src = (struct sockaddr *)&pack4->mine; 742 ep->dst = (struct sockaddr *)&pack4->yours; 743 pack4 = (struct ip_pack4 *)ep->maskpack; 744 ep->srcmask = (struct sockaddr *)&pack4->mine; 745 ep->dstmask = (struct sockaddr *)&pack4->yours; 746 break; 747 #ifdef INET6 748 case AF_INET6: 749 pack6 = (struct ip_pack6 *)ep->addrpack; 750 ep->src = (struct sockaddr *)&pack6->mine; 751 ep->dst = (struct sockaddr *)&pack6->yours; 752 pack6 = (struct ip_pack6 *)ep->maskpack; 753 ep->srcmask = (struct sockaddr *)&pack6->mine; 754 ep->dstmask = (struct sockaddr *)&pack6->yours; 755 break; 756 #endif 757 } 758 759 memcpy(ep->src, sp, sp->sa_len); 760 memcpy(ep->srcmask, sm, sp->sa_len); 761 memcpy(ep->dst, dp, dp->sa_len); 762 memcpy(ep->dstmask, dm, dp->sa_len); 763 ep->esw = esw; 764 ep->arg = arg; 765 psref_target_init(&ep->psref, encaptab.elem_class); 766 767 error = encap_add(ep); 768 if (error) 769 goto gc; 770 771 error = 0; 772 #ifndef ENCAP_MPSAFE 773 splx(s); 774 #endif 775 return ep; 776 777 gc: 778 if (ep->addrpack) 779 kmem_free(ep->addrpack, l); 780 if (ep->maskpack) 781 kmem_free(ep->maskpack, l); 782 if (ep) 783 kmem_free(ep, sizeof(*ep)); 784 fail: 785 #ifndef ENCAP_MPSAFE 786 splx(s); 787 #endif 788 return NULL; 789 } 790 791 const struct encaptab * 792 encap_attach_func(int af, int proto, 793 int (*func)(struct mbuf *, int, int, void *), 794 const struct encapsw *esw, void *arg) 795 { 796 struct encaptab *ep; 797 int error; 798 #ifndef ENCAP_MPSAFE 799 int s; 800 801 s = splsoftnet(); 802 #endif 803 /* sanity check on args */ 804 if (!func) { 805 error = EINVAL; 806 goto fail; 807 } 808 809 error = encap_afcheck(af, NULL, NULL); 810 if (error) 811 goto fail; 812 813 ep = kmem_alloc(sizeof(*ep), KM_NOSLEEP); /*XXX*/ 814 if (ep == NULL) { 815 error = ENOBUFS; 816 goto fail; 817 } 818 memset(ep, 0, sizeof(*ep)); 819 820 ep->af = af; 821 ep->proto = proto; 822 ep->func = func; 823 ep->esw = esw; 824 ep->arg = arg; 825 psref_target_init(&ep->psref, encaptab.elem_class); 826 827 error = encap_add(ep); 828 if (error) 829 goto fail; 830 831 error = 0; 832 #ifndef ENCAP_MPSAFE 833 splx(s); 834 #endif 835 return ep; 836 837 fail: 838 #ifndef ENCAP_MPSAFE 839 splx(s); 840 #endif 841 return NULL; 842 } 843 844 /* XXX encap4_ctlinput() is necessary if we set DF=1 on outer IPv4 header */ 845 846 #ifdef INET6 847 void * 848 encap6_ctlinput(int cmd, const struct sockaddr *sa, void *d0) 849 { 850 void *d = d0; 851 struct ip6_hdr *ip6; 852 struct mbuf *m; 853 int off; 854 struct ip6ctlparam *ip6cp = NULL; 855 int nxt; 856 int s; 857 struct encaptab *ep; 858 const struct encapsw *esw; 859 860 if (sa->sa_family != AF_INET6 || 861 sa->sa_len != sizeof(struct sockaddr_in6)) 862 return NULL; 863 864 if ((unsigned)cmd >= PRC_NCMDS) 865 return NULL; 866 if (cmd == PRC_HOSTDEAD) 867 d = NULL; 868 else if (cmd == PRC_MSGSIZE) 869 ; /* special code is present, see below */ 870 else if (inet6ctlerrmap[cmd] == 0) 871 return NULL; 872 873 /* if the parameter is from icmp6, decode it. */ 874 if (d != NULL) { 875 ip6cp = (struct ip6ctlparam *)d; 876 m = ip6cp->ip6c_m; 877 ip6 = ip6cp->ip6c_ip6; 878 off = ip6cp->ip6c_off; 879 nxt = ip6cp->ip6c_nxt; 880 881 if (ip6 && cmd == PRC_MSGSIZE) { 882 int valid = 0; 883 struct encaptab *match; 884 struct psref elem_psref; 885 886 /* 887 * Check to see if we have a valid encap configuration. 888 */ 889 match = encap6_lookup(m, off, nxt, OUTBOUND, 890 &elem_psref); 891 if (match) 892 valid++; 893 psref_release(&elem_psref, &match->psref, 894 encaptab.elem_class); 895 896 /* 897 * Depending on the value of "valid" and routing table 898 * size (mtudisc_{hi,lo}wat), we will: 899 * - recalcurate the new MTU and create the 900 * corresponding routing entry, or 901 * - ignore the MTU change notification. 902 */ 903 icmp6_mtudisc_update((struct ip6ctlparam *)d, valid); 904 } 905 } else { 906 m = NULL; 907 ip6 = NULL; 908 nxt = -1; 909 } 910 911 /* inform all listeners */ 912 913 s = pserialize_read_enter(); 914 PSLIST_READER_FOREACH(ep, &encap_table, struct encaptab, chain) { 915 struct psref elem_psref; 916 917 membar_datadep_consumer(); 918 919 if (ep->af != AF_INET6) 920 continue; 921 if (ep->proto >= 0 && ep->proto != nxt) 922 continue; 923 924 /* should optimize by looking at address pairs */ 925 926 /* XXX need to pass ep->arg or ep itself to listeners */ 927 psref_acquire(&elem_psref, &ep->psref, 928 encaptab.elem_class); 929 esw = ep->esw; 930 if (esw && esw->encapsw6.pr_ctlinput) { 931 pserialize_read_exit(s); 932 /* pr_ctlinput is sleepable. e.g. rtcache_free */ 933 (*esw->encapsw6.pr_ctlinput)(cmd, sa, d, ep->arg); 934 s = pserialize_read_enter(); 935 } 936 psref_release(&elem_psref, &ep->psref, 937 encaptab.elem_class); 938 } 939 pserialize_read_exit(s); 940 941 rip6_ctlinput(cmd, sa, d0); 942 return NULL; 943 } 944 #endif 945 946 int 947 encap_detach(const struct encaptab *cookie) 948 { 949 const struct encaptab *ep = cookie; 950 struct encaptab *p; 951 int error; 952 953 KASSERT(encap_lock_held()); 954 955 PSLIST_WRITER_FOREACH(p, &encap_table, struct encaptab, chain) { 956 membar_datadep_consumer(); 957 958 if (p == ep) { 959 error = encap_remove(p); 960 if (error) 961 return error; 962 else 963 break; 964 } 965 } 966 if (p == NULL) 967 return ENOENT; 968 969 #ifndef USE_RADIX 970 /* 971 * pserialize_perform(encaptab.psz) is already done in encap_remove(). 972 */ 973 pserialize_perform(encaptab.psz); 974 #endif 975 psref_target_destroy(&p->psref, 976 encaptab.elem_class); 977 if (!ep->func) { 978 kmem_free(p->addrpack, ep->addrpack->sa_len); 979 kmem_free(p->maskpack, ep->maskpack->sa_len); 980 } 981 kmem_free(p, sizeof(*p)); 982 983 return 0; 984 } 985 986 #ifdef USE_RADIX 987 static struct radix_node_head * 988 encap_rnh(int af) 989 { 990 991 switch (af) { 992 case AF_INET: 993 return encap_head[0]; 994 #ifdef INET6 995 case AF_INET6: 996 return encap_head[1]; 997 #endif 998 default: 999 return NULL; 1000 } 1001 } 1002 1003 static int 1004 mask_matchlen(const struct sockaddr *sa) 1005 { 1006 const char *p, *ep; 1007 int l; 1008 1009 p = (const char *)sa; 1010 ep = p + sa->sa_len; 1011 p += 2; /* sa_len + sa_family */ 1012 1013 l = 0; 1014 while (p < ep) { 1015 l += (*p ? 8 : 0); /* estimate */ 1016 p++; 1017 } 1018 return l; 1019 } 1020 #endif 1021 1022 #ifndef USE_RADIX 1023 static int 1024 mask_match(const struct encaptab *ep, 1025 const struct sockaddr *sp, 1026 const struct sockaddr *dp) 1027 { 1028 struct sockaddr_storage s; 1029 struct sockaddr_storage d; 1030 int i; 1031 const u_int8_t *p, *q; 1032 u_int8_t *r; 1033 int matchlen; 1034 1035 KASSERTMSG(ep->func == NULL, "wrong encaptab passed to mask_match"); 1036 1037 if (sp->sa_len > sizeof(s) || dp->sa_len > sizeof(d)) 1038 return 0; 1039 if (sp->sa_family != ep->af || dp->sa_family != ep->af) 1040 return 0; 1041 if (sp->sa_len != ep->src->sa_len || dp->sa_len != ep->dst->sa_len) 1042 return 0; 1043 1044 matchlen = 0; 1045 1046 p = (const u_int8_t *)sp; 1047 q = (const u_int8_t *)ep->srcmask; 1048 r = (u_int8_t *)&s; 1049 for (i = 0 ; i < sp->sa_len; i++) { 1050 r[i] = p[i] & q[i]; 1051 /* XXX estimate */ 1052 matchlen += (q[i] ? 8 : 0); 1053 } 1054 1055 p = (const u_int8_t *)dp; 1056 q = (const u_int8_t *)ep->dstmask; 1057 r = (u_int8_t *)&d; 1058 for (i = 0 ; i < dp->sa_len; i++) { 1059 r[i] = p[i] & q[i]; 1060 /* XXX rough estimate */ 1061 matchlen += (q[i] ? 8 : 0); 1062 } 1063 1064 /* need to overwrite len/family portion as we don't compare them */ 1065 s.ss_len = sp->sa_len; 1066 s.ss_family = sp->sa_family; 1067 d.ss_len = dp->sa_len; 1068 d.ss_family = dp->sa_family; 1069 1070 if (memcmp(&s, ep->src, ep->src->sa_len) == 0 && 1071 memcmp(&d, ep->dst, ep->dst->sa_len) == 0) { 1072 return matchlen; 1073 } else 1074 return 0; 1075 } 1076 #endif 1077 1078 static void 1079 encap_fillarg(struct mbuf *m, const struct encaptab *ep) 1080 { 1081 struct m_tag *mtag; 1082 1083 mtag = m_tag_get(PACKET_TAG_ENCAP, sizeof(void *), M_NOWAIT); 1084 if (mtag) { 1085 *(void **)(mtag + 1) = ep->arg; 1086 m_tag_prepend(m, mtag); 1087 } 1088 } 1089 1090 void * 1091 encap_getarg(struct mbuf *m) 1092 { 1093 void *p; 1094 struct m_tag *mtag; 1095 1096 p = NULL; 1097 mtag = m_tag_find(m, PACKET_TAG_ENCAP, NULL); 1098 if (mtag != NULL) { 1099 p = *(void **)(mtag + 1); 1100 m_tag_delete(m, mtag); 1101 } 1102 return p; 1103 } 1104 1105 int 1106 encap_lock_enter(void) 1107 { 1108 int error; 1109 1110 mutex_enter(&encap_whole.lock); 1111 while (encap_whole.busy != NULL) { 1112 error = cv_wait_sig(&encap_whole.cv, &encap_whole.lock); 1113 if (error) { 1114 mutex_exit(&encap_whole.lock); 1115 return error; 1116 } 1117 } 1118 KASSERT(encap_whole.busy == NULL); 1119 encap_whole.busy = curlwp; 1120 mutex_exit(&encap_whole.lock); 1121 1122 return 0; 1123 } 1124 1125 void 1126 encap_lock_exit(void) 1127 { 1128 1129 mutex_enter(&encap_whole.lock); 1130 KASSERT(encap_whole.busy == curlwp); 1131 encap_whole.busy = NULL; 1132 cv_broadcast(&encap_whole.cv); 1133 mutex_exit(&encap_whole.lock); 1134 } 1135 1136 bool 1137 encap_lock_held(void) 1138 { 1139 1140 return (encap_whole.busy == curlwp); 1141 } 1142