/*
 * Copyright (c) 2011 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Coyote Point Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
#include <sys/cdefs.h>

#include "opt_ddb.h"
#include "opt_inet.h"
#include "opt_ipsec.h"
#include "opt_inet_csum.h"
#include "opt_tcp_debug.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/kmem.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/errno.h>
#include <sys/syslog.h>
#include <sys/pool.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <net/if.h>
#include <net/route.h>
#include <net/if_types.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
#include <netinet/in_offload.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/in6_var.h>
#include <netinet/icmp6.h>
#include <netinet6/nd6.h>

#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_private.h>
#include <netinet/tcpip.h>

#include <netinet/tcp_vtw.h>

__KERNEL_RCSID(0, "$NetBSD: tcp_vtw.c,v 1.8 2011/07/17 20:54:53 joerg Exp $");

#define db_trace(__a, __b) do { } while (/*CONSTCOND*/0)

static void vtw_debug_init(void);

fatp_ctl_t fat_tcpv4;
fatp_ctl_t fat_tcpv6;
vtw_ctl_t  vtw_tcpv4[VTW_NCLASS];
vtw_ctl_t  vtw_tcpv6[VTW_NCLASS];
vtw_stats_t vtw_stats;

/* We provide state for the lookup_ports iterator.
 * As we are currently netlock-protected, there is only one.
 * If we were finer-grained, we would have one per CPU.
 * I do not want to be in the business of alloc/free.
 * The best alternative would be to allocate on the caller's
 * stack, but that would require them to know the struct,
 * or at least its size.
 * See how she goes.
 */
struct tcp_ports_iterator {
	union {
		struct in_addr	v4;
		struct in6_addr	v6;
	}		addr;
	u_int		port;

	uint32_t	wild	: 1;

	vtw_ctl_t	*ctl;
	fatp_t		*fp;

	uint16_t	slot_idx;
	uint16_t	ctl_idx;
};

static struct tcp_ports_iterator tcp_ports_iterator_v4;
static struct tcp_ports_iterator tcp_ports_iterator_v6;

static int vtw_age(vtw_ctl_t *, struct timeval *);

/*!\brief allocate a fat pointer from a collection.
 */
static fatp_t *
fatp_alloc(fatp_ctl_t *fat)
{
	fatp_t *fp = 0;

	if (fat->nfree) {
		fp = fat->free;
		if (fp) {
			fat->free = fatp_next(fat, fp);
			--fat->nfree;
			++fat->nalloc;
			fp->nxt = 0;

			KASSERT(!fp->inuse);
		}
	}

	return fp;
}

/*!\brief free a fat pointer.
 */
static void
fatp_free(fatp_ctl_t *fat, fatp_t *fp)
{
	if (fp) {
		KASSERT(!fp->inuse);
		KASSERT(!fp->nxt);

		fp->nxt = fatp_index(fat, fat->free);
		fat->free = fp;

		++fat->nfree;
		--fat->nalloc;
	}
}

/*!\brief initialise a collection of fat pointers.
 *
 *\param n	total # fat pointers to allocate
 *\param m	# hash buckets
 *
 * We allocate 2x as much, as we have two hashes: full and lport only.
 */
static void
fatp_init(fatp_ctl_t *fat, uint32_t n, uint32_t m,
    fatp_t *fat_base, fatp_t **fat_hash)
{
	fatp_t *fp;

	KASSERT(n <= FATP_MAX / 2);

	fat->hash = fat_hash;
	fat->base = fat_base;

	fat->port = &fat->hash[m];

	fat->mask   = m - 1;	// ASSERT is power of 2 (m)
	fat->lim    = fat->base + 2*n - 1;
	fat->nfree  = 0;
	fat->nalloc = 2*n;

	/* Initialise the free list.
	 */
	for (fp = fat->lim; fp >= fat->base; --fp) {
		fatp_free(fat, fp);
	}
}

/*
 * The `xtra' is XORed into the tag stored.
 */
static uint32_t fatp_xtra[] = {
	0x11111111,0x22222222,0x33333333,0x44444444,
	0x55555555,0x66666666,0x77777777,0x88888888,
	0x12121212,0x21212121,0x34343434,0x43434343,
	0x56565656,0x65656565,0x78787878,0x87878787,
	0x11221122,0x22112211,0x33443344,0x44334433,
	0x55665566,0x66556655,0x77887788,0x88778877,
	0x11112222,0x22221111,0x33334444,0x44443333,
	0x55556666,0x66665555,0x77778888,0x88887777,
};
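/* Hash layout sketch (illustrative, not compiled): fat->hash[] holds
 * 2*m chain anchors; the first m are the heads of the full-tuple hash,
 * and fat->port aliases the last m as heads of the local-port hash.
 * Given a computed 32-bit tag, a bucket head is found as:
 *
 *	fatp_t *head_full = fat->hash[tag & fat->mask];
 *	fatp_t *head_port = fat->port[tag & fat->mask];
 *
 * Both hashes draw their chain elements from the single free list
 * built by fatp_init() above.
 */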
/*!\brief turn a {fatp_t*,slot} into an integral key.
 *
 * The key can be used to obtain the fatp_t, and the slot,
 * as it directly encodes them.
 */
static inline uint32_t
fatp_key(fatp_ctl_t *fat, fatp_t *fp, uint32_t slot)
{
	CTASSERT(CACHE_LINE_SIZE == 32 ||
	    CACHE_LINE_SIZE == 64 ||
	    CACHE_LINE_SIZE == 128);

	switch (fatp_ntags()) {
	case 7:
		return (fatp_index(fat, fp) << 3) | slot;
	case 15:
		return (fatp_index(fat, fp) << 4) | slot;
	case 31:
		return (fatp_index(fat, fp) << 5) | slot;
	default:
		KASSERT(0 && "no support, for no good reason");
		return ~0;
	}
}

static inline uint32_t
fatp_slot_from_key(fatp_ctl_t *fat, uint32_t key)
{
	CTASSERT(CACHE_LINE_SIZE == 32 ||
	    CACHE_LINE_SIZE == 64 ||
	    CACHE_LINE_SIZE == 128);

	switch (fatp_ntags()) {
	case 7:
		return key & 7;
	case 15:
		return key & 15;
	case 31:
		return key & 31;
	default:
		KASSERT(0 && "no support, for no good reason");
		return ~0;
	}
}

static inline fatp_t *
fatp_from_key(fatp_ctl_t *fat, uint32_t key)
{
	CTASSERT(CACHE_LINE_SIZE == 32 ||
	    CACHE_LINE_SIZE == 64 ||
	    CACHE_LINE_SIZE == 128);

	switch (fatp_ntags()) {
	case 7:
		key >>= 3;
		break;
	case 15:
		key >>= 4;
		break;
	case 31:
		key >>= 5;
		break;
	default:
		KASSERT(0 && "no support, for no good reason");
		return 0;
	}

	return key ? fat->base + key - 1 : 0;
}

static inline uint32_t
idx_encode(vtw_ctl_t *ctl, uint32_t idx)
{
	return (idx << ctl->idx_bits) | idx;
}

static inline uint32_t
idx_decode(vtw_ctl_t *ctl, uint32_t bits)
{
	uint32_t idx = bits & ctl->idx_mask;

	if (idx_encode(ctl, idx) == bits)
		return idx;
	else
		return ~0;
}
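/* Worked example (illustrative only): with a 64-byte cache line,
 * fatp_ntags() is 15, so a key is (1-based fat index << 4) | slot.
 * For the third fatp_t (index 3) and slot 5, the key is 0x35;
 * fatp_slot_from_key() recovers 0x35 & 15 == 5, and fatp_from_key()
 * recovers fat->base + (0x35 >> 4) - 1 == fat->base + 2.
 *
 * idx_encode() duplicates the index into the otherwise-unused high
 * bits: with tcp_vtw_entries == 16 (idx_bits == 4), idx 5 encodes as
 * 0x55.  idx_decode(0x55) returns 5, while a corrupted 0x54 would
 * re-encode as 0x44 and be rejected as ~0.
 */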
/*!\brief insert index into fatp hash
 *
 *\param idx - index of element being placed in hash chain
 *\param tag - 32-bit tag identifier
 *
 *\returns
 *	value which can be used to locate entry.
 *
 *\note
 *	we rely on the fact that there are unused high bits in the index
 *	for verification purposes on lookup.
 */
static inline uint32_t
fatp_vtw_inshash(fatp_ctl_t *fat, uint32_t idx, uint32_t tag, int which,
    void *dbg)
{
	fatp_t *fp;
	fatp_t **hash = (which ? fat->port : fat->hash);
	int i;

	fp = hash[tag & fat->mask];

	while (!fp || fatp_full(fp)) {
		fatp_t *fq;

		/* All entries are inuse at the top level.
		 * We allocate a spare, and push the top level
		 * down one.  All entries in the fp we push down
		 * (think of a tapeworm here) will be expelled sooner than
		 * any entries added subsequently to this hash bucket.
		 * This is a property of the time waits we are exploiting.
		 */

		fq = fatp_alloc(fat);
		if (!fq) {
			vtw_age(fat->vtw, 0);
			fp = hash[tag & fat->mask];
			continue;
		}

		fq->inuse = 0;
		fq->nxt   = fatp_index(fat, fp);

		hash[tag & fat->mask] = fq;

		fp = fq;
	}

	KASSERT(!fatp_full(fp));

	/* Fill highest index first.  Lookup is lowest first.
	 */
	for (i = fatp_ntags(); --i >= 0; ) {
		if (!((1 << i) & fp->inuse)) {
			break;
		}
	}

	fp->inuse |= 1 << i;
	fp->tag[i] = tag ^ idx_encode(fat->vtw, idx) ^ fatp_xtra[i];

	db_trace(KTR_VTW
	    , (fp, "fat: inuse %5.5x tag[%x] %8.8x"
	    , fp->inuse
	    , i, fp->tag[i]));

	return fatp_key(fat, fp, i);
}
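/* Lookup sketch (illustrative only): the stored word is
 * tag ^ idx_encode(idx) ^ fatp_xtra[i], so a prober which recomputes
 * `tag' from the probed address/port can recover the candidate index
 * without dereferencing anything:
 *
 *	idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
 *	vtw = vtw_from_index(ctl, idx);
 *
 * A mismatched tag yields garbage bits, which idx_decode() (via
 * vtw_from_index()) almost always rejects, keeping false hits away
 * from the vtw_t cache lines.  The per-slot fatp_xtra[] values keep
 * equal tags in different slots from storing equal words.
 */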
static inline int
vtw_alive(const vtw_t *vtw)
{
	return vtw->hashed && vtw->expire.tv_sec;
}

static inline uint32_t
vtw_index_v4(vtw_ctl_t *ctl, vtw_v4_t *v4)
{
	if (ctl->base.v4 <= v4 && v4 <= ctl->lim.v4)
		return v4 - ctl->base.v4;

	KASSERT(0 && "vtw out of bounds");

	return ~0;
}

static inline uint32_t
vtw_index_v6(vtw_ctl_t *ctl, vtw_v6_t *v6)
{
	if (ctl->base.v6 <= v6 && v6 <= ctl->lim.v6)
		return v6 - ctl->base.v6;

	KASSERT(0 && "vtw out of bounds");

	return ~0;
}

static inline uint32_t
vtw_index(vtw_ctl_t *ctl, vtw_t *vtw)
{
	if (ctl->clidx)
		ctl = ctl->ctl;

	if (ctl->is_v4)
		return vtw_index_v4(ctl, (vtw_v4_t *)vtw);

	if (ctl->is_v6)
		return vtw_index_v6(ctl, (vtw_v6_t *)vtw);

	KASSERT(0 && "neither 4 nor 6. most curious.");

	return ~0;
}

static inline vtw_t *
vtw_from_index(vtw_ctl_t *ctl, uint32_t idx)
{
	if (ctl->clidx)
		ctl = ctl->ctl;

	/* See if the index looks like it might be an index.
	 * Bits outside of the valid index bits are a give-away.
	 */
	idx = idx_decode(ctl, idx);

	if (idx == ~0) {
		return 0;
	} else if (ctl->is_v4) {
		vtw_v4_t *vtw = ctl->base.v4 + idx;

		return (ctl->base.v4 <= vtw && vtw <= ctl->lim.v4)
			? &vtw->common : 0;
	} else if (ctl->is_v6) {
		vtw_v6_t *vtw = ctl->base.v6 + idx;

		return (ctl->base.v6 <= vtw && vtw <= ctl->lim.v6)
			? &vtw->common : 0;
	} else {
		KASSERT(0 && "badness");
		return 0;
	}
}

/*!\brief return the next vtw after this one.
 *
 * Due to the differing sizes of the entries in differing
 * arenas, we have to ensure we ++ the correct pointer type.
 *
 * Also handles wrap.
 */
static inline vtw_t *
vtw_next(vtw_ctl_t *ctl, vtw_t *vtw)
{
	if (ctl->is_v4) {
		vtw_v4_t *v4 = (void*)vtw;

		vtw = &(++v4)->common;
	} else {
		vtw_v6_t *v6 = (void*)vtw;

		vtw = &(++v6)->common;
	}

	if (vtw > ctl->lim.v)
		vtw = ctl->base.v;

	return vtw;
}
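/* The arena is a ring: repeatedly applying vtw_next() visits every
 * entry of the class and returns to the start.  A minimal full-circle
 * walk (the same pattern vtw_sanity_check() uses under VTW_DEBUG):
 *
 *	vtw_t *vtw = ctl->base.v;
 *	do {
 *		vtw = vtw_next(ctl, vtw);
 *	} while (vtw != ctl->base.v);
 */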
/*!\brief remove entry from FATP hash chains
 */
static inline void
vtw_unhash(vtw_ctl_t *ctl, vtw_t *vtw)
{
	fatp_ctl_t *fat = ctl->fat;
	fatp_t *fp;
	uint32_t key = vtw->key;
	uint32_t tag, slot, idx;
	vtw_v4_t *v4 = (void*)vtw;
	vtw_v6_t *v6 = (void*)vtw;

	if (!vtw->hashed) {
		KASSERT(0 && "unhashed");
		return;
	}

	if (fat->vtw->is_v4) {
		tag = v4_tag(v4->faddr, v4->fport, v4->laddr, v4->lport);
	} else if (fat->vtw->is_v6) {
		tag = v6_tag(&v6->faddr, v6->fport, &v6->laddr, v6->lport);
	} else {
		tag = 0;
		KASSERT(0 && "not reached");
	}

	/* Remove from fat->hash[]
	 */
	slot = fatp_slot_from_key(fat, key);
	fp   = fatp_from_key(fat, key);
	idx  = vtw_index(ctl, vtw);

	db_trace(KTR_VTW
	    , (fp, "fat: del inuse %5.5x slot %x idx %x key %x tag %x"
	    , fp->inuse, slot, idx, key, tag));

	KASSERT(fp->inuse & (1 << slot));
	KASSERT(fp->tag[slot] == (tag ^ idx_encode(ctl, idx)
	    ^ fatp_xtra[slot]));

	if ((fp->inuse & (1 << slot))
	    && fp->tag[slot] == (tag ^ idx_encode(ctl, idx)
	    ^ fatp_xtra[slot])) {
		fp->inuse ^= 1 << slot;
		fp->tag[slot] = 0;

		/* When we delete entries, we do not compact.  This is
		 * due to temporality.  We add entries, and they
		 * (eventually) expire.  Older entries will be further
		 * down the chain.
		 */
		if (!fp->inuse) {
			uint32_t hi = tag & fat->mask;
			fatp_t *fq = 0;
			fatp_t *fr = fat->hash[hi];

			while (fr && fr != fp) {
				fr = fatp_next(fat, fq = fr);
			}

			if (fr == fp) {
				if (fq) {
					fq->nxt = fp->nxt;
					fp->nxt = 0;
					fatp_free(fat, fp);
				} else {
					KASSERT(fat->hash[hi] == fp);

					if (fp->nxt) {
						fat->hash[hi]
							= fatp_next(fat, fp);
						fp->nxt = 0;
						fatp_free(fat, fp);
					} else {
						/* retain for next use.
						 */
						;
					}
				}
			} else {
				fr = fat->hash[hi];

				do {
					db_trace(KTR_VTW
					    , (fr
					    , "fat:*del inuse %5.5x"
					    " nxt %x"
					    , fr->inuse, fr->nxt));

					fr = fatp_next(fat, fq = fr);
				} while (fr && fr != fp);

				KASSERT(0 && "oops");
			}
		}
		vtw->key ^= ~0;
	}

	if (fat->vtw->is_v4) {
		tag = v4_port_tag(v4->lport);
	} else if (fat->vtw->is_v6) {
		tag = v6_port_tag(v6->lport);
	}

	/* Remove from fat->port[]
	 */
	key  = vtw->port_key;
	slot = fatp_slot_from_key(fat, key);
	fp   = fatp_from_key(fat, key);
	idx  = vtw_index(ctl, vtw);

	db_trace(KTR_VTW
	    , (fp, "fatport: del inuse %5.5x"
	    " slot %x idx %x key %x tag %x"
	    , fp->inuse, slot, idx, key, tag));

	KASSERT(fp->inuse & (1 << slot));
	KASSERT(fp->tag[slot] == (tag ^ idx_encode(ctl, idx)
	    ^ fatp_xtra[slot]));

	if ((fp->inuse & (1 << slot))
	    && fp->tag[slot] == (tag ^ idx_encode(ctl, idx)
	    ^ fatp_xtra[slot])) {
		fp->inuse ^= 1 << slot;
		fp->tag[slot] = 0;

		if (!fp->inuse) {
			uint32_t hi = tag & fat->mask;
			fatp_t *fq = 0;
			fatp_t *fr = fat->port[hi];

			while (fr && fr != fp) {
				fr = fatp_next(fat, fq = fr);
			}

			if (fr == fp) {
				if (fq) {
					fq->nxt = fp->nxt;
					fp->nxt = 0;
					fatp_free(fat, fp);
				} else {
					KASSERT(fat->port[hi] == fp);

					if (fp->nxt) {
						fat->port[hi]
							= fatp_next(fat, fp);
						fp->nxt = 0;
						fatp_free(fat, fp);
					} else {
						/* retain for next use.
						 */
						;
					}
				}
			}
		}
		vtw->port_key ^= ~0;
	}

	vtw->hashed = 0;
}
/*!\brief remove entry from hash, possibly free.
 */
void
vtw_del(vtw_ctl_t *ctl, vtw_t *vtw)
{
	KASSERT(mutex_owned(softnet_lock));

	if (vtw->hashed) {
		++vtw_stats.del;
		vtw_unhash(ctl, vtw);
	}

	/* We only delete the oldest entry.
	 */
	if (vtw != ctl->oldest.v)
		return;

	--ctl->nalloc;
	++ctl->nfree;

	vtw->expire.tv_sec  = 0;
	vtw->expire.tv_usec = ~0;

	if (!ctl->nalloc)
		ctl->oldest.v = 0;
	else
		ctl->oldest.v = vtw_next(ctl, vtw);
}

/*!\brief insert vestigial timewait in hash chain
 */
static void
vtw_inshash_v4(vtw_ctl_t *ctl, vtw_t *vtw)
{
	uint32_t idx = vtw_index(ctl, vtw);
	uint32_t tag;
	vtw_v4_t *v4 = (void*)vtw;

	KASSERT(mutex_owned(softnet_lock));
	KASSERT(!vtw->hashed);
	KASSERT(ctl->clidx == vtw->msl_class);

	++vtw_stats.ins;

	tag = v4_tag(v4->faddr, v4->fport,
	    v4->laddr, v4->lport);

	vtw->key = fatp_vtw_inshash(ctl->fat, idx, tag, 0, vtw);

	db_trace(KTR_VTW, (ctl
	    , "vtw: ins %8.8x:%4.4x %8.8x:%4.4x"
	    " tag %8.8x key %8.8x"
	    , v4->faddr, v4->fport
	    , v4->laddr, v4->lport
	    , tag
	    , vtw->key));

	tag = v4_port_tag(v4->lport);
	vtw->port_key = fatp_vtw_inshash(ctl->fat, idx, tag, 1, vtw);

	db_trace(KTR_VTW, (ctl, "vtw: ins %P - %4.4x tag %8.8x key %8.8x"
	    , v4->lport, v4->lport
	    , tag
	    , vtw->key));

	vtw->hashed = 1;
}

/*!\brief insert vestigial timewait in hash chain
 */
static void
vtw_inshash_v6(vtw_ctl_t *ctl, vtw_t *vtw)
{
	uint32_t idx = vtw_index(ctl, vtw);
	uint32_t tag;
	vtw_v6_t *v6 = (void*)vtw;

	KASSERT(mutex_owned(softnet_lock));
	KASSERT(!vtw->hashed);
	KASSERT(ctl->clidx == vtw->msl_class);

	++vtw_stats.ins;

	tag = v6_tag(&v6->faddr, v6->fport,
	    &v6->laddr, v6->lport);

	vtw->key = fatp_vtw_inshash(ctl->fat, idx, tag, 0, vtw);

	tag = v6_port_tag(v6->lport);
	vtw->port_key = fatp_vtw_inshash(ctl->fat, idx, tag, 1, vtw);

	db_trace(KTR_VTW, (ctl, "vtw: ins %P - %4.4x tag %8.8x key %8.8x"
	    , v6->lport, v6->lport
	    , tag
	    , vtw->key));

	vtw->hashed = 1;
}
static vtw_t *
vtw_lookup_hash_v4(vtw_ctl_t *ctl, uint32_t faddr, uint16_t fport
    , uint32_t laddr, uint16_t lport
    , int which)
{
	vtw_v4_t *v4;
	vtw_t *vtw;
	uint32_t tag;
	fatp_t *fp;
	int i;
	uint32_t fatps = 0, probes = 0, losings = 0;

	if (!ctl || !ctl->fat)
		return 0;

	++vtw_stats.look[which];

	if (which) {
		tag = v4_port_tag(lport);
		fp  = ctl->fat->port[tag & ctl->fat->mask];
	} else {
		tag = v4_tag(faddr, fport, laddr, lport);
		fp  = ctl->fat->hash[tag & ctl->fat->mask];
	}

	while (fp && fp->inuse) {
		uint32_t inuse = fp->inuse;

		++fatps;

		for (i = 0; inuse && i < fatp_ntags(); ++i) {
			uint32_t idx;

			if (!(inuse & (1 << i)))
				continue;

			inuse ^= 1 << i;

			++probes;
			++vtw_stats.probe[which];

			idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
			vtw = vtw_from_index(ctl, idx);

			if (!vtw) {
				/* Hopefully fast path.
				 */
				db_trace(KTR_VTW
				    , (fp, "vtw: fast %A:%P %A:%P"
				    " idx %x tag %x"
				    , faddr, fport
				    , laddr, lport
				    , idx, tag));
				continue;
			}

			v4 = (void*)vtw;

			/* The de-referencing of vtw is what we want to avoid.
			 * Losing.
			 */
			if (vtw_alive(vtw)
			    && ((which ? vtw->port_key : vtw->key)
				== fatp_key(ctl->fat, fp, i))
			    && (which
				|| (v4->faddr == faddr && v4->laddr == laddr
				    && v4->fport == fport))
			    && v4->lport == lport) {
				++vtw_stats.hit[which];

				db_trace(KTR_VTW
				    , (fp, "vtw: hit %8.8x:%4.4x"
				    " %8.8x:%4.4x idx %x key %x"
				    , faddr, fport
				    , laddr, lport
				    , idx_decode(ctl, idx), vtw->key));

				KASSERT(vtw->hashed);

				goto out;
			}
			++vtw_stats.losing[which];
			++losings;

			if (vtw_alive(vtw)) {
				db_trace(KTR_VTW
				    , (fp, "vtw:!mis %8.8x:%4.4x"
				    " %8.8x:%4.4x key %x tag %x"
				    , faddr, fport
				    , laddr, lport
				    , fatp_key(ctl->fat, fp, i)
				    , v4_tag(faddr, fport
				    , laddr, lport)));
				db_trace(KTR_VTW
				    , (vtw, "vtw:!mis %8.8x:%4.4x"
				    " %8.8x:%4.4x key %x tag %x"
				    , v4->faddr, v4->fport
				    , v4->laddr, v4->lport
				    , vtw->key
				    , v4_tag(v4->faddr, v4->fport
				    , v4->laddr, v4->lport)));

				if (vtw->key == fatp_key(ctl->fat, fp, i)) {
					db_trace(KTR_VTW
					    , (vtw, "vtw:!mis %8.8x:%4.4x"
					    " %8.8x:%4.4x key %x"
					    " which %x"
					    , v4->faddr, v4->fport
					    , v4->laddr, v4->lport
					    , vtw->key
					    , which));
				} else {
					db_trace(KTR_VTW
					    , (vtw
					    , "vtw:!mis"
					    " key %8.8x != %8.8x"
					    " idx %x i %x which %x"
					    , vtw->key
					    , fatp_key(ctl->fat, fp, i)
					    , idx_decode(ctl, idx)
					    , i
					    , which));
				}
			} else {
				db_trace(KTR_VTW
				    , (fp
				    , "vtw:!mis free entry"
				    " idx %x vtw %p which %x"
				    , idx_decode(ctl, idx)
				    , vtw, which));
			}
		}

		if (fp->nxt) {
			fp = fatp_next(ctl->fat, fp);
		} else {
			break;
		}
	}
	++vtw_stats.miss[which];
	vtw = 0;
out:
	if (fatps > vtw_stats.max_chain[which])
		vtw_stats.max_chain[which] = fatps;
	if (probes > vtw_stats.max_probe[which])
		vtw_stats.max_probe[which] = probes;
	if (losings > vtw_stats.max_loss[which])
		vtw_stats.max_loss[which] = losings;

	return vtw;
}
static vtw_t *
vtw_lookup_hash_v6(vtw_ctl_t *ctl, const struct in6_addr *faddr, uint16_t fport
    , const struct in6_addr *laddr, uint16_t lport
    , int which)
{
	vtw_v6_t *v6;
	vtw_t *vtw;
	uint32_t tag;
	fatp_t *fp;
	int i;
	uint32_t fatps = 0, probes = 0, losings = 0;

	++vtw_stats.look[which];

	if (!ctl || !ctl->fat)
		return 0;

	if (which) {
		tag = v6_port_tag(lport);
		fp  = ctl->fat->port[tag & ctl->fat->mask];
	} else {
		tag = v6_tag(faddr, fport, laddr, lport);
		fp  = ctl->fat->hash[tag & ctl->fat->mask];
	}

	while (fp && fp->inuse) {
		uint32_t inuse = fp->inuse;

		++fatps;

		for (i = 0; inuse && i < fatp_ntags(); ++i) {
			uint32_t idx;

			if (!(inuse & (1 << i)))
				continue;

			inuse ^= 1 << i;

			++probes;
			++vtw_stats.probe[which];

			idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
			vtw = vtw_from_index(ctl, idx);

			db_trace(KTR_VTW
			    , (fp, "probe: %2d %6A:%4.4x %6A:%4.4x idx %x"
			    , i
			    , db_store(faddr, sizeof (*faddr)), fport
			    , db_store(laddr, sizeof (*laddr)), lport
			    , idx_decode(ctl, idx)));

			if (!vtw) {
				/* Hopefully fast path.
				 */
				continue;
			}

			v6 = (void*)vtw;

			if (vtw_alive(vtw)
			    && ((which ? vtw->port_key : vtw->key)
				== fatp_key(ctl->fat, fp, i))
			    && v6->lport == lport
			    && (which
				|| (v6->fport == fport
				    && !bcmp(&v6->faddr, faddr, sizeof (*faddr))
				    && !bcmp(&v6->laddr, laddr
				    , sizeof (*laddr))))) {
				++vtw_stats.hit[which];

				KASSERT(vtw->hashed);
				goto out;
			} else {
				++vtw_stats.losing[which];
				++losings;
			}
		}

		if (fp->nxt) {
			fp = fatp_next(ctl->fat, fp);
		} else {
			break;
		}
	}
	++vtw_stats.miss[which];
	vtw = 0;
out:
	if (fatps > vtw_stats.max_chain[which])
		vtw_stats.max_chain[which] = fatps;
	if (probes > vtw_stats.max_probe[which])
		vtw_stats.max_probe[which] = probes;
	if (losings > vtw_stats.max_loss[which])
		vtw_stats.max_loss[which] = losings;

	return vtw;
}
/*!\brief port iterator
 */
static vtw_t *
vtw_next_port_v4(struct tcp_ports_iterator *it)
{
	vtw_ctl_t *ctl = it->ctl;
	vtw_v4_t *v4;
	vtw_t *vtw;
	uint32_t tag;
	uint16_t lport = it->port;
	fatp_t *fp;
	int i;
	uint32_t fatps = 0, probes = 0, losings = 0;

	tag = v4_port_tag(lport);
	if (!it->fp) {
		it->fp = ctl->fat->port[tag & ctl->fat->mask];
		it->slot_idx = 0;
	}
	fp = it->fp;

	while (fp) {
		uint32_t inuse = fp->inuse;

		++fatps;

		for (i = it->slot_idx; inuse && i < fatp_ntags(); ++i) {
			uint32_t idx;

			if (!(inuse & (1 << i)))
				continue;

			/* Clear the bits below i, so the loop condition
			 * only sees the slots still to be probed.
			 */
			inuse &= ~0 << i;

			if (i < it->slot_idx)
				continue;

			++vtw_stats.probe[1];
			++probes;

			idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
			vtw = vtw_from_index(ctl, idx);

			if (!vtw) {
				/* Hopefully fast path.
				 */
				continue;
			}

			v4 = (void*)vtw;

			if (vtw_alive(vtw)
			    && vtw->port_key == fatp_key(ctl->fat, fp, i)
			    && v4->lport == lport) {
				++vtw_stats.hit[1];

				it->slot_idx = i + 1;

				goto out;
			} else if (vtw_alive(vtw)) {
				++vtw_stats.losing[1];
				++losings;

				db_trace(KTR_VTW
				    , (vtw, "vtw:!mis"
				    " port %8.8x:%4.4x %8.8x:%4.4x"
				    " key %x port %x"
				    , v4->faddr, v4->fport
				    , v4->laddr, v4->lport
				    , vtw->key
				    , lport));
			} else {
				/* Really losing here.  We are coming
				 * up with references to free entries.
				 * Might find it better to use the
				 * traditional lookup, or need more
				 * ad-hockery.  One more piece of
				 * ad-hockery would be to pull more into
				 * the cache line to reject the false
				 * hits.
				 */
				++vtw_stats.losing[1];
				++losings;
				db_trace(KTR_VTW
				    , (fp, "vtw:!mis port %x"
				    " - free entry idx %x vtw %p"
				    , lport
				    , idx_decode(ctl, idx)
				    , vtw));
			}
		}

		if (fp->nxt) {
			it->fp = fp = fatp_next(ctl->fat, fp);
			it->slot_idx = 0;
		} else {
			it->fp = 0;
			break;
		}
	}
	++vtw_stats.miss[1];

	vtw = 0;
out:
	if (fatps > vtw_stats.max_chain[1])
		vtw_stats.max_chain[1] = fatps;
	if (probes > vtw_stats.max_probe[1])
		vtw_stats.max_probe[1] = probes;
	if (losings > vtw_stats.max_loss[1])
		vtw_stats.max_loss[1] = losings;

	return vtw;
}
/*!\brief port iterator
 */
static vtw_t *
vtw_next_port_v6(struct tcp_ports_iterator *it)
{
	vtw_ctl_t *ctl = it->ctl;
	vtw_v6_t *v6;
	vtw_t *vtw;
	uint32_t tag;
	uint16_t lport = it->port;
	fatp_t *fp;
	int i;
	uint32_t fatps = 0, probes = 0, losings = 0;

	tag = v6_port_tag(lport);
	if (!it->fp) {
		it->fp = ctl->fat->port[tag & ctl->fat->mask];
		it->slot_idx = 0;
	}
	fp = it->fp;

	while (fp) {
		uint32_t inuse = fp->inuse;

		++fatps;

		for (i = it->slot_idx; inuse && i < fatp_ntags(); ++i) {
			uint32_t idx;

			if (!(inuse & (1 << i)))
				continue;

			/* Clear the bits below i, so the loop condition
			 * only sees the slots still to be probed.
			 */
			inuse &= ~0 << i;

			if (i < it->slot_idx)
				continue;

			++vtw_stats.probe[1];
			++probes;

			idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
			vtw = vtw_from_index(ctl, idx);

			if (!vtw) {
				/* Hopefully fast path.
				 */
				continue;
			}

			v6 = (void*)vtw;

			db_trace(KTR_VTW
			    , (vtw, "vtw: i %x idx %x fp->tag %x"
			    " tag %x xtra %x"
			    , i, idx_decode(ctl, idx)
			    , fp->tag[i], tag, fatp_xtra[i]));

			if (vtw_alive(vtw)
			    && vtw->port_key == fatp_key(ctl->fat, fp, i)
			    && v6->lport == lport) {
				++vtw_stats.hit[1];

				db_trace(KTR_VTW
				    , (fp, "vtw: nxt port %P - %4.4x"
				    " idx %x key %x"
				    , lport, lport
				    , idx_decode(ctl, idx), vtw->key));

				it->slot_idx = i + 1;
				goto out;
			} else if (vtw_alive(vtw)) {
				++vtw_stats.losing[1];
				++losings;

				db_trace(KTR_VTW
				    , (vtw, "vtw:!mis port %6A:%4.4x"
				    " %6A:%4.4x key %x port %x"
				    , db_store(&v6->faddr
				    , sizeof (v6->faddr))
				    , v6->fport
				    , db_store(&v6->laddr
				    , sizeof (v6->laddr))
				    , v6->lport
				    , vtw->key
				    , lport));
			} else {
				/* Really losing here.  We are coming
				 * up with references to free entries.
				 * Might find it better to use the
				 * traditional lookup, or need more
				 * ad-hockery.  One more piece of
				 * ad-hockery would be to pull more into
				 * the cache line to reject the false
				 * hits.
				 */
				++vtw_stats.losing[1];
				++losings;

				db_trace(KTR_VTW
				    , (fp
				    , "vtw:!mis port %x"
				    " - free entry idx %x vtw %p"
				    , lport, idx_decode(ctl, idx)
				    , vtw));
			}
		}

		if (fp->nxt) {
			it->fp = fp = fatp_next(ctl->fat, fp);
			it->slot_idx = 0;
		} else {
			it->fp = 0;
			break;
		}
	}
	++vtw_stats.miss[1];

	vtw = 0;
out:
	if (fatps > vtw_stats.max_chain[1])
		vtw_stats.max_chain[1] = fatps;
	if (probes > vtw_stats.max_probe[1])
		vtw_stats.max_probe[1] = probes;
	if (losings > vtw_stats.max_loss[1])
		vtw_stats.max_loss[1] = losings;

	return vtw;
}

/*!\brief initialise the VTW allocation arena
 *
 * There are 1+3 allocation classes:
 *	0	classless
 *	{1,2,3}	MSL-class based allocation
 *
 * The allocation arenas are all initialised.  Classless gets all the
 * space.  MSL-class based divides the arena, so that allocation
 * within a class can proceed without having to consider entries
 * (aka: cache lines) from different classes.
 *
 * Usually, we are completely classless or class-based, but there can be
 * transition periods, corresponding to dynamic adjustments in the config
 * by the operator.
 */
static void
vtw_init(fatp_ctl_t *fat, vtw_ctl_t *ctl, const uint32_t n, vtw_t *ctl_base_v)
{
	int class_n, i;
	vtw_t *base;

	ctl->base.v = ctl_base_v;

	if (ctl->is_v4) {
		ctl->lim.v4   = ctl->base.v4 + n - 1;
		ctl->alloc.v4 = ctl->base.v4;
	} else {
		ctl->lim.v6   = ctl->base.v6 + n - 1;
		ctl->alloc.v6 = ctl->base.v6;
	}

	ctl->nfree = n;
	ctl->ctl   = ctl;

	ctl->idx_bits = 32;
	for (ctl->idx_mask = ~0; (ctl->idx_mask & (n-1)) == n-1; ) {
		ctl->idx_mask >>= 1;
		ctl->idx_bits -= 1;
	}

	ctl->idx_mask <<= 1;
	ctl->idx_mask |= 1;
	ctl->idx_bits += 1;

	ctl->fat = fat;
	fat->vtw = ctl;

	/* Divide the resources equally amongst the classes.
	 * This is not optimal, as the different classes
	 * arrive and leave at different rates, but it is
	 * the best I can do for now.
	 */
	class_n = n / (VTW_NCLASS-1);
	base    = ctl->base.v;

	for (i = 1; i < VTW_NCLASS; ++i) {
		int j;

		ctl[i] = ctl[0];
		ctl[i].clidx = i;

		ctl[i].base.v = base;
		ctl[i].alloc  = ctl[i].base;

		for (j = 0; j < class_n - 1; ++j) {
			if (tcp_msl_enable)
				base->msl_class = i;
			base = vtw_next(ctl, base);
		}

		ctl[i].lim.v = base;
		base = vtw_next(ctl, base);
		ctl[i].nfree = class_n;
	}

	vtw_debug_init();
}

/*!\brief map class to TCP MSL
 */
static inline uint32_t
class_to_msl(int class)
{
	switch (class) {
	case 0:
	case 1:
		return tcp_msl_remote ? tcp_msl_remote : (TCPTV_MSL >> 0);
	case 2:
		return tcp_msl_local ? tcp_msl_local : (TCPTV_MSL >> 1);
	default:
		return tcp_msl_loop ? tcp_msl_loop : (TCPTV_MSL >> 2);
	}
}

/*!\brief map TCP MSL to class
 */
static inline uint32_t
msl_to_class(int msl)
{
	if (tcp_msl_enable) {
		if (msl <= (tcp_msl_loop ? tcp_msl_loop : (TCPTV_MSL >> 2)))
			return 1+2;
		if (msl <= (tcp_msl_local ? tcp_msl_local : (TCPTV_MSL >> 1)))
			return 1+1;
		return 1;
	}
	return 0;
}
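/* Worked example (illustrative; assumes the sysctl overrides
 * tcp_msl_remote/local/loop are zero, and NetBSD's default TCPTV_MSL
 * of 30 seconds in PR_SLOWHZ ticks): class 1 (remote) maps to
 * TCPTV_MSL, class 2 (local) to TCPTV_MSL/2, and class 3 (loopback)
 * to TCPTV_MSL/4.  msl_to_class() inverts this: an MSL at or below
 * the loopback value yields class 3, at or below the local value
 * class 2, anything larger class 1, and class 0 (classless) when
 * tcp_msl_enable is off.
 */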
/*!\brief allocate a vtw entry
 */
static inline vtw_t *
vtw_alloc(vtw_ctl_t *ctl)
{
	vtw_t *vtw = 0;
	int stuck = 0;
	int avail = ctl ? (ctl->nalloc + ctl->nfree) : 0;
	int msl;

	KASSERT(mutex_owned(softnet_lock));

	/* If no resources, we will not get far.
	 */
	if (!ctl || !ctl->base.v4 || avail <= 0)
		return 0;

	/* Obtain a free one.
	 */
	while (!ctl->nfree) {
		vtw_age(ctl, 0);

		if (++stuck > avail) {
			/* When in transition between
			 * schemes (classless, classed) we
			 * can be stuck having to await the
			 * expiration of cross-allocated entries.
			 *
			 * Returning zero means we will fall back to the
			 * traditional TIME_WAIT handling, except in the
			 * case of a re-shed, in which case we cannot
			 * perform the re-shed, but will retain the extant
			 * entry.
			 */
			db_trace(KTR_VTW
			    , (ctl, "vtw:!none free in class %x %x/%x"
			    , ctl->clidx
			    , ctl->nalloc, ctl->nfree));

			return 0;
		}
	}

	vtw = ctl->alloc.v;

	if (vtw->msl_class != ctl->clidx) {
		/* Usurping rules:
		 *	0 -> {1,2,3} or {1,2,3} -> 0
		 */
		KASSERT(!vtw->msl_class || !ctl->clidx);

		if (vtw->hashed || vtw->expire.tv_sec) {
			/* As this is owned by some other class,
			 * we must wait for it to expire.
			 * This will only happen on class/classless
			 * transitions, which are guaranteed to progress
			 * to completion in small finite time, barring bugs.
			 */
			db_trace(KTR_VTW
			    , (ctl, "vtw:!%p class %x!=%x %x:%x%s"
			    , vtw, vtw->msl_class, ctl->clidx
			    , vtw->expire.tv_sec
			    , vtw->expire.tv_usec
			    , vtw->hashed ? " hashed" : ""));

			return 0;
		}

		db_trace(KTR_VTW
		    , (ctl, "vtw:!%p usurped from %x to %x"
		    , vtw, vtw->msl_class, ctl->clidx));

		vtw->msl_class = ctl->clidx;
	}

	if (vtw_alive(vtw)) {
		KASSERT(0 && "next free not free");
		return 0;
	}

	/* Advance allocation pointer.
	 */
	ctl->alloc.v = vtw_next(ctl, vtw);

	--ctl->nfree;
	++ctl->nalloc;

	msl = (2 * class_to_msl(ctl->clidx) * 1000) / PR_SLOWHZ;	// msec

	/* mark expiration
	 */
	getmicrouptime(&vtw->expire);

	/* Move expiration into the future.
	 */
	vtw->expire.tv_sec  += msl / 1000;
	vtw->expire.tv_usec += 1000 * (msl % 1000);

	while (vtw->expire.tv_usec >= 1000*1000) {
		vtw->expire.tv_usec -= 1000*1000;
		vtw->expire.tv_sec  += 1;
	}

	if (!ctl->oldest.v)
		ctl->oldest.v = vtw;

	return vtw;
}
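/* Expiry arithmetic sketch (illustrative): class_to_msl() returns the
 * MSL in PR_SLOWHZ ticks, so with the default TCPTV_MSL (30 s at the
 * usual PR_SLOWHZ of 2, i.e. 60 ticks) for class 1:
 *
 *	msl = (2 * 60 * 1000) / 2;	// 60000 ms == 2 * MSL
 *
 * which vtw_alloc() then splits into tv_sec += 60 and tv_usec += 0,
 * normalising any microsecond overflow into whole seconds.
 */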
/*!\brief expiration
 */
static int
vtw_age(vtw_ctl_t *ctl, struct timeval *_when)
{
	vtw_t *vtw;
	struct timeval then, *when = _when;
	int maxtries = 0;

	if (!ctl->oldest.v) {
		KASSERT(!ctl->nalloc);
		return 0;
	}

	for (vtw = ctl->oldest.v; vtw && ctl->nalloc; ) {
		if (++maxtries > ctl->nalloc)
			break;

		if (vtw->msl_class != ctl->clidx) {
			db_trace(KTR_VTW
			    , (vtw, "vtw:!age class mismatch %x != %x"
			    , vtw->msl_class, ctl->clidx));
			/* XXXX
			 * See if the appropriate action is to skip to the next.
			 * XXXX
			 */
			ctl->oldest.v = vtw = vtw_next(ctl, vtw);
			continue;
		}
		if (!when) {
			/* Latch oldest timeval if none specified.
			 */
			then = vtw->expire;
			when = &then;
		}

		if (!timercmp(&vtw->expire, when, <=))
			break;

		db_trace(KTR_VTW
		    , (vtw, "vtw: expire %x %8.8x:%8.8x %x/%x"
		    , ctl->clidx
		    , vtw->expire.tv_sec
		    , vtw->expire.tv_usec
		    , ctl->nalloc
		    , ctl->nfree));

		if (!_when)
			++vtw_stats.kill;

		vtw_del(ctl, vtw);
		vtw = ctl->oldest.v;
	}

	return ctl->nalloc;	// # remaining allocated
}

static callout_t vtw_cs;

/*!\brief notice the passage of time.
 * It seems to be getting faster.  What happened to the year?
 */
static void
vtw_tick(void *arg)
{
	struct timeval now;
	int i, cnt = 0;

	getmicrouptime(&now);

	db_trace(KTR_VTW, (arg, "vtk: tick - now %8.8x:%8.8x"
	    , now.tv_sec, now.tv_usec));

	mutex_enter(softnet_lock);

	for (i = 0; i < VTW_NCLASS; ++i) {
		cnt += vtw_age(&vtw_tcpv4[i], &now);
		cnt += vtw_age(&vtw_tcpv6[i], &now);
	}

	/* Keep ticks coming while we need them.
	 */
	if (cnt)
		callout_schedule(&vtw_cs, hz / 5);
	else {
		tcp_vtw_was_enabled = 0;
		tcbtable.vestige    = 0;
	}
	mutex_exit(softnet_lock);
}

/* in_pcblookup_ports assist for handling vestigial entries.
 */
static void *
tcp_init_ports_v4(struct in_addr addr, u_int port, int wild)
{
	struct tcp_ports_iterator *it = &tcp_ports_iterator_v4;

	bzero(it, sizeof (*it));

	/* Note: the reference to vtw_tcpv4[0] is fine.
	 * We do not need per-class iteration.  We just
	 * need to get to the fat, and there is one
	 * shared fat.
	 */
	if (vtw_tcpv4[0].fat) {
		it->addr.v4 = addr;
		it->port = port;
		it->wild = !!wild;
		it->ctl  = &vtw_tcpv4[0];

		++vtw_stats.look[1];
	}

	return it;
}

/*!\brief export an IPv4 vtw.
 */
static int
vtw_export_v4(vtw_ctl_t *ctl, vtw_t *vtw, vestigial_inpcb_t *res)
{
	vtw_v4_t *v4 = (void*)vtw;

	bzero(res, sizeof (*res));

	if (ctl && vtw) {
		if (!ctl->clidx && vtw->msl_class)
			ctl += vtw->msl_class;
		else
			KASSERT(ctl->clidx == vtw->msl_class);

		res->valid = 1;
		res->v4    = 1;

		res->faddr.v4.s_addr = v4->faddr;
		res->laddr.v4.s_addr = v4->laddr;
		res->fport	= v4->fport;
		res->lport	= v4->lport;
		res->vtw	= vtw;		// netlock held over call(s)
		res->ctl	= ctl;
		res->reuse_addr = vtw->reuse_addr;
		res->reuse_port = vtw->reuse_port;
		res->snd_nxt	= vtw->snd_nxt;
		res->rcv_nxt	= vtw->rcv_nxt;
		res->rcv_wnd	= vtw->rcv_wnd;
		res->uid	= vtw->uid;
	}

	return res->valid;
}
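/* Usage sketch for the iterator protocol (this mirrors the VTW_DEBUG
 * self-checks in vtw_add() below; `laddr' and `lport' here are
 * hypothetical values):
 *
 *	struct tcp_ports_iterator *it;
 *	struct vestigial_inpcb res;
 *
 *	it = tcp_init_ports_v4(laddr, lport, 0);
 *	while (tcp_next_port_v4(it, &res)) {
 *		// res now describes one vestigial pcb bound to lport
 *	}
 *
 * The iterator state is static, so callers must hold softnet_lock
 * across the whole walk.
 */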
/*!\brief return next port in the port iterator.  yowza.
 */
static int
tcp_next_port_v4(void *arg, struct vestigial_inpcb *res)
{
	struct tcp_ports_iterator *it = arg;
	vtw_t *vtw = 0;

	if (it->ctl)
		vtw = vtw_next_port_v4(it);

	if (!vtw)
		it->ctl = 0;

	return vtw_export_v4(it->ctl, vtw, res);
}

static int
tcp_lookup_v4(struct in_addr faddr, uint16_t fport,
    struct in_addr laddr, uint16_t lport,
    struct vestigial_inpcb *res)
{
	vtw_t *vtw;
	vtw_ctl_t *ctl;

	db_trace(KTR_VTW
	    , (res, "vtw: lookup %A:%P %A:%P"
	    , faddr, fport
	    , laddr, lport));

	vtw = vtw_lookup_hash_v4((ctl = &vtw_tcpv4[0])
	    , faddr.s_addr, fport
	    , laddr.s_addr, lport, 0);

	return vtw_export_v4(ctl, vtw, res);
}

/* in_pcblookup_ports assist for handling vestigial entries.
 */
static void *
tcp_init_ports_v6(const struct in6_addr *addr, u_int port, int wild)
{
	struct tcp_ports_iterator *it = &tcp_ports_iterator_v6;

	bzero(it, sizeof (*it));

	/* Note: the reference to vtw_tcpv6[0] is fine.
	 * We do not need per-class iteration.  We just
	 * need to get to the fat, and there is one
	 * shared fat.
	 */
	if (vtw_tcpv6[0].fat) {
		it->addr.v6 = *addr;
		it->port = port;
		it->wild = !!wild;
		it->ctl  = &vtw_tcpv6[0];

		++vtw_stats.look[1];
	}

	return it;
}

/*!\brief export an IPv6 vtw.
 */
static int
vtw_export_v6(vtw_ctl_t *ctl, vtw_t *vtw, vestigial_inpcb_t *res)
{
	vtw_v6_t *v6 = (void*)vtw;

	bzero(res, sizeof (*res));

	if (ctl && vtw) {
		if (!ctl->clidx && vtw->msl_class)
			ctl += vtw->msl_class;
		else
			KASSERT(ctl->clidx == vtw->msl_class);

		res->valid = 1;
		res->v4    = 0;

		res->faddr.v6	= v6->faddr;
		res->laddr.v6	= v6->laddr;
		res->fport	= v6->fport;
		res->lport	= v6->lport;
		res->vtw	= vtw;		// netlock held over call(s)
		res->ctl	= ctl;

		res->v6only	= vtw->v6only;
		res->reuse_addr	= vtw->reuse_addr;
		res->reuse_port	= vtw->reuse_port;

		res->snd_nxt	= vtw->snd_nxt;
		res->rcv_nxt	= vtw->rcv_nxt;
		res->rcv_wnd	= vtw->rcv_wnd;
		res->uid	= vtw->uid;
	}

	return res->valid;
}

static int
tcp_next_port_v6(void *arg, struct vestigial_inpcb *res)
{
	struct tcp_ports_iterator *it = arg;
	vtw_t *vtw = 0;

	if (it->ctl)
		vtw = vtw_next_port_v6(it);

	if (!vtw)
		it->ctl = 0;

	return vtw_export_v6(it->ctl, vtw, res);
}

static int
tcp_lookup_v6(const struct in6_addr *faddr, uint16_t fport,
    const struct in6_addr *laddr, uint16_t lport,
    struct vestigial_inpcb *res)
{
	vtw_ctl_t *ctl;
	vtw_t *vtw;

	db_trace(KTR_VTW
	    , (res, "vtw: lookup %6A:%P %6A:%P"
	    , db_store(faddr, sizeof (*faddr)), fport
	    , db_store(laddr, sizeof (*laddr)), lport));

	vtw = vtw_lookup_hash_v6((ctl = &vtw_tcpv6[0])
	    , faddr, fport
	    , laddr, lport, 0);

	return vtw_export_v6(ctl, vtw, res);
}

static vestigial_hooks_t tcp_hooks = {
	.init_ports4	= tcp_init_ports_v4,
	.next_port4	= tcp_next_port_v4,
	.lookup4	= tcp_lookup_v4,
	.init_ports6	= tcp_init_ports_v6,
	.next_port6	= tcp_next_port_v6,
	.lookup6	= tcp_lookup_v6,
};
static bool
vtw_select(int af, fatp_ctl_t **fatp, vtw_ctl_t **ctlp)
{
	fatp_ctl_t *fat;
	vtw_ctl_t *ctl;

	switch (af) {
	case AF_INET:
		fat = &fat_tcpv4;
		ctl = &vtw_tcpv4[0];
		break;
	case AF_INET6:
		fat = &fat_tcpv6;
		ctl = &vtw_tcpv6[0];
		break;
	default:
		return false;
	}
	if (fatp != NULL)
		*fatp = fat;
	if (ctlp != NULL)
		*ctlp = ctl;
	return true;
}

/*!\brief initialize controlling instance
 */
static int
vtw_control_init(int af)
{
	fatp_ctl_t *fat;
	vtw_ctl_t *ctl;
	fatp_t *fat_base;
	fatp_t **fat_hash;
	vtw_t *ctl_base_v;
	uint32_t n, m;
	size_t sz;

	KASSERT(powerof2(tcp_vtw_entries));

	if (!vtw_select(af, &fat, &ctl))
		return EAFNOSUPPORT;

	if (fat->hash != NULL) {
		KASSERT(fat->base != NULL && ctl->base.v != NULL);
		return 0;
	}

	/* Allocate 10% more capacity in the fat pointers.
	 * We should only need ~#hash additional based on
	 * how they age, but TIME_WAIT assassination could cause
	 * sparse fat pointer utilisation.
	 */
	m = 512;
	n = 2*m + (11 * (tcp_vtw_entries / fatp_ntags())) / 10;
	sz = (ctl->is_v4 ? sizeof(vtw_v4_t) : sizeof(vtw_v6_t));

	fat_hash = kmem_zalloc(2*m * sizeof(fatp_t *), KM_NOSLEEP);

	if (fat_hash == NULL) {
		printf("%s: could not allocate %zu bytes for "
		    "hash anchors\n", __func__, 2*m * sizeof(fatp_t *));
		return ENOMEM;
	}

	fat_base = kmem_zalloc(2*n * sizeof(fatp_t), KM_NOSLEEP);

	if (fat_base == NULL) {
		kmem_free(fat_hash, 2*m * sizeof (fatp_t *));
		printf("%s: could not allocate %zu bytes for "
		    "fatp_t array\n", __func__, 2*n * sizeof(fatp_t));
		return ENOMEM;
	}

	ctl_base_v = kmem_zalloc(tcp_vtw_entries * sz, KM_NOSLEEP);

	if (ctl_base_v == NULL) {
		kmem_free(fat_hash, 2*m * sizeof (fatp_t *));
		kmem_free(fat_base, 2*n * sizeof(fatp_t));
		printf("%s: could not allocate %zu bytes for "
		    "vtw_t array\n", __func__, tcp_vtw_entries * sz);
		return ENOMEM;
	}

	fatp_init(fat, n, m, fat_base, fat_hash);

	vtw_init(fat, ctl, tcp_vtw_entries, ctl_base_v);

	return 0;
}

/*!\brief select controlling instance
 */
static vtw_ctl_t *
vtw_control(int af, uint32_t msl)
{
	fatp_ctl_t *fat;
	vtw_ctl_t *ctl;
	int class = msl_to_class(msl);

	if (!vtw_select(af, &fat, &ctl))
		return NULL;

	if (!fat->base || !ctl->base.v)
		return NULL;

	if (!tcp_vtw_was_enabled) {
		/* This guarantees timer ticks until we no longer need them.
		 */
		tcp_vtw_was_enabled = 1;

		callout_schedule(&vtw_cs, hz / 5);

		tcbtable.vestige = &tcp_hooks;
	}

	return ctl + class;
}
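/* Sizing sketch (illustrative): with m == 512 hash buckets and, say,
 * tcp_vtw_entries == 4096 and fatp_ntags() == 15:
 *
 *	n = 2*512 + (11 * (4096 / 15)) / 10;	// 1024 + 300 == 1324
 *
 * so vtw_control_init() zallocs 2*512 hash anchors, 2*1324 fatp_t
 * chain entries, and 4096 vtw entries of the per-AF size.
 */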
/*!\brief add TCP pcb to vestigial timewait
 */
int
vtw_add(int af, struct tcpcb *tp)
{
	int enable;
	vtw_ctl_t *ctl;
	vtw_t *vtw;

	KASSERT(mutex_owned(softnet_lock));

	ctl = vtw_control(af, tp->t_msl);
	if (!ctl)
		return 0;

	enable = (af == AF_INET) ? tcp4_vtw_enable : tcp6_vtw_enable;

	vtw = vtw_alloc(ctl);

	if (vtw) {
		vtw->snd_nxt = tp->snd_nxt;
		vtw->rcv_nxt = tp->rcv_nxt;

		switch (af) {
		case AF_INET: {
			struct inpcb *inp = tp->t_inpcb;
			vtw_v4_t *v4 = (void*)vtw;

			v4->faddr = inp->inp_faddr.s_addr;
			v4->laddr = inp->inp_laddr.s_addr;
			v4->fport = inp->inp_fport;
			v4->lport = inp->inp_lport;

			vtw->reuse_port = !!(inp->inp_socket->so_options
			    & SO_REUSEPORT);
			vtw->reuse_addr = !!(inp->inp_socket->so_options
			    & SO_REUSEADDR);
			vtw->v6only = 0;
			vtw->uid    = inp->inp_socket->so_uidinfo->ui_uid;

			vtw_inshash_v4(ctl, vtw);

#ifdef VTW_DEBUG
			/* Immediate lookup (connected and port) to
			 * ensure at least that works!
			 */
			if (enable & 4) {
				KASSERT(vtw_lookup_hash_v4
				    (ctl
				    , inp->inp_faddr.s_addr, inp->inp_fport
				    , inp->inp_laddr.s_addr, inp->inp_lport
				    , 0)
				    == vtw);
				KASSERT(vtw_lookup_hash_v4
				    (ctl
				    , inp->inp_faddr.s_addr, inp->inp_fport
				    , inp->inp_laddr.s_addr, inp->inp_lport
				    , 1));
			}
			/* Immediate port iterator functionality check: not wild
			 */
			if (enable & 8) {
				struct tcp_ports_iterator *it;
				struct vestigial_inpcb res;
				int cnt = 0;

				it = tcp_init_ports_v4(inp->inp_laddr
				    , inp->inp_lport, 0);

				while (tcp_next_port_v4(it, &res)) {
					++cnt;
				}
				KASSERT(cnt);
			}
			/* Immediate port iterator functionality check: wild
			 */
			if (enable & 16) {
				struct tcp_ports_iterator *it;
				struct vestigial_inpcb res;
				struct in_addr any;
				int cnt = 0;

				any.s_addr = htonl(INADDR_ANY);

				it = tcp_init_ports_v4(any, inp->inp_lport, 1);

				while (tcp_next_port_v4(it, &res)) {
					++cnt;
				}
				KASSERT(cnt);
			}
#endif /* VTW_DEBUG */
			break;
		}
		case AF_INET6: {
			struct in6pcb *inp = tp->t_in6pcb;
			vtw_v6_t *v6 = (void*)vtw;

			v6->faddr = inp->in6p_faddr;
			v6->laddr = inp->in6p_laddr;
			v6->fport = inp->in6p_fport;
			v6->lport = inp->in6p_lport;

			vtw->reuse_port = !!(inp->in6p_socket->so_options
			    & SO_REUSEPORT);
			vtw->reuse_addr = !!(inp->in6p_socket->so_options
			    & SO_REUSEADDR);
			vtw->v6only = !!(inp->in6p_flags
			    & IN6P_IPV6_V6ONLY);
			vtw->uid    = inp->in6p_socket->so_uidinfo->ui_uid;

			vtw_inshash_v6(ctl, vtw);
#ifdef VTW_DEBUG
			/* Immediate lookup (connected and port) to
			 * ensure at least that works!
			 */
			if (enable & 4) {
				KASSERT(vtw_lookup_hash_v6(ctl
				    , &inp->in6p_faddr, inp->in6p_fport
				    , &inp->in6p_laddr, inp->in6p_lport
				    , 0)
				    == vtw);
				KASSERT(vtw_lookup_hash_v6
				    (ctl
				    , &inp->in6p_faddr, inp->in6p_fport
				    , &inp->in6p_laddr, inp->in6p_lport
				    , 1));
			}
			/* Immediate port iterator functionality check: not wild
			 */
			if (enable & 8) {
				struct tcp_ports_iterator *it;
				struct vestigial_inpcb res;
				int cnt = 0;

				it = tcp_init_ports_v6(&inp->in6p_laddr
				    , inp->in6p_lport, 0);

				while (tcp_next_port_v6(it, &res)) {
					++cnt;
				}
				KASSERT(cnt);
			}
			/* Immediate port iterator functionality check: wild
			 */
			if (enable & 16) {
				struct tcp_ports_iterator *it;
				struct vestigial_inpcb res;
				static struct in6_addr any = IN6ADDR_ANY_INIT;
				int cnt = 0;

				it = tcp_init_ports_v6(&any
				    , inp->in6p_lport, 1);

				while (tcp_next_port_v6(it, &res)) {
					++cnt;
				}
				KASSERT(cnt);
			}
#endif /* VTW_DEBUG */
			break;
		}
		}

		tcp_canceltimers(tp);
		tp = tcp_close(tp);
		KASSERT(!tp);

		return 1;
	}

	return 0;
}

/*!\brief restart timer for vestigial time-wait entry
 */
static void
vtw_restart_v4(vestigial_inpcb_t *vp)
{
	vtw_v4_t copy = *(vtw_v4_t*)vp->vtw;
	vtw_t *vtw;
	vtw_t *cp = &copy.common;
	vtw_ctl_t *ctl;

	KASSERT(mutex_owned(softnet_lock));

	db_trace(KTR_VTW
	    , (vp->vtw, "vtw: restart %A:%P %A:%P"
	    , vp->faddr.v4.s_addr, vp->fport
	    , vp->laddr.v4.s_addr, vp->lport));

	/* Class might have changed, so have a squiz.
	 */
	ctl = vtw_control(AF_INET, class_to_msl(cp->msl_class));
	vtw = vtw_alloc(ctl);

	if (vtw) {
		vtw_v4_t *v4 = (void*)vtw;

		/* Safe now to unhash the old entry
		 */
		vtw_del(vp->ctl, vp->vtw);

		vtw->snd_nxt = cp->snd_nxt;
		vtw->rcv_nxt = cp->rcv_nxt;

		v4->faddr = copy.faddr;
		v4->laddr = copy.laddr;
		v4->fport = copy.fport;
		v4->lport = copy.lport;

		vtw->reuse_port = cp->reuse_port;
		vtw->reuse_addr = cp->reuse_addr;
		vtw->v6only     = 0;
		vtw->uid        = cp->uid;

		vtw_inshash_v4(ctl, vtw);
	}

	vp->valid = 0;
}
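/* The restart above is a copy-delete-reallocate dance rather than an
 * in-place timer reset: vtw entries expire strictly FIFO within a
 * class, so the only way to grant one a fresh 2*MSL (possibly in a
 * different class) is to copy it to the stack, vtw_del() the
 * original, and re-insert via vtw_alloc()/vtw_inshash_*().
 */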
/*!\brief restart timer for vestigial time-wait entry
 */
static void
vtw_restart_v6(vestigial_inpcb_t *vp)
{
	vtw_v6_t copy = *(vtw_v6_t*)vp->vtw;
	vtw_t *vtw;
	vtw_t *cp = &copy.common;
	vtw_ctl_t *ctl;

	KASSERT(mutex_owned(softnet_lock));

	db_trace(KTR_VTW
	    , (vp->vtw, "vtw: restart %6A:%P %6A:%P"
	    , db_store(&vp->faddr.v6, sizeof (vp->faddr.v6))
	    , vp->fport
	    , db_store(&vp->laddr.v6, sizeof (vp->laddr.v6))
	    , vp->lport));

	/* Class might have changed, so have a squiz.
	 */
	ctl = vtw_control(AF_INET6, class_to_msl(cp->msl_class));
	vtw = vtw_alloc(ctl);

	if (vtw) {
		vtw_v6_t *v6 = (void*)vtw;

		/* Safe now to unhash the old entry
		 */
		vtw_del(vp->ctl, vp->vtw);

		vtw->snd_nxt = cp->snd_nxt;
		vtw->rcv_nxt = cp->rcv_nxt;

		v6->faddr = copy.faddr;
		v6->laddr = copy.laddr;
		v6->fport = copy.fport;
		v6->lport = copy.lport;

		vtw->reuse_port = cp->reuse_port;
		vtw->reuse_addr = cp->reuse_addr;
		vtw->v6only     = cp->v6only;
		vtw->uid        = cp->uid;

		vtw_inshash_v6(ctl, vtw);
	}

	vp->valid = 0;
}

/*!\brief restart timer for vestigial time-wait entry
 */
void
vtw_restart(vestigial_inpcb_t *vp)
{
	if (!vp || !vp->valid)
		return;

	if (vp->v4)
		vtw_restart_v4(vp);
	else
		vtw_restart_v6(vp);
}

int
sysctl_tcp_vtw_enable(SYSCTLFN_ARGS)
{
	int en, rc;
	struct sysctlnode node;

	node = *rnode;
	en = *(int *)rnode->sysctl_data;
	node.sysctl_data = &en;

	rc = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (rc != 0 || newp == NULL)
		return rc;

	if (rnode->sysctl_data != &tcp4_vtw_enable &&
	    rnode->sysctl_data != &tcp6_vtw_enable)
		rc = ENOENT;
	else if ((en & 1) == 0)
		rc = 0;
	else if (rnode->sysctl_data == &tcp4_vtw_enable)
		rc = vtw_control_init(AF_INET);
	else /* rnode->sysctl_data == &tcp6_vtw_enable */
		rc = vtw_control_init(AF_INET6);

	if (rc == 0)
		*(int *)rnode->sysctl_data = en;

	return rc;
}

int
vtw_earlyinit(void)
{
	int i, rc;

	callout_init(&vtw_cs, 0);
	callout_setfunc(&vtw_cs, vtw_tick, 0);

	for (i = 0; i < VTW_NCLASS; ++i) {
		vtw_tcpv4[i].is_v4 = 1;
		vtw_tcpv6[i].is_v6 = 1;
	}

	if ((tcp4_vtw_enable & 1) != 0 &&
	    (rc = vtw_control_init(AF_INET)) != 0)
		return rc;

	if ((tcp6_vtw_enable & 1) != 0 &&
	    (rc = vtw_control_init(AF_INET6)) != 0)
		return rc;

	return 0;
}
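/* Enabling at runtime (illustrative; these are the sysctl names
 * assumed to be wired to tcp{4,6}_vtw_enable elsewhere, not shown in
 * this file):
 *
 *	sysctl -w net.inet.tcp.vtw_enable=1
 *	sysctl -w net.inet6.tcp6.vtw_enable=1
 *
 * The write path runs sysctl_tcp_vtw_enable() above, which performs
 * the lazy vtw_control_init() allocation on first enable.
 */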
#ifdef VTW_DEBUG
#include <sys/syscallargs.h>
#include <sys/sysctl.h>

/*!\brief add lalp, fafp entries for debug
 */
int
vtw_debug_add(int af, sin_either_t *la, sin_either_t *fa, int msl, int class)
{
	vtw_ctl_t *ctl;
	vtw_t *vtw;

	ctl = vtw_control(af, msl ? msl : class_to_msl(class));
	if (!ctl)
		return 0;

	vtw = vtw_alloc(ctl);

	if (vtw) {
		vtw->snd_nxt = 0;
		vtw->rcv_nxt = 0;

		switch (af) {
		case AF_INET: {
			vtw_v4_t *v4 = (void*)vtw;

			v4->faddr = fa->sin_addr.v4.s_addr;
			v4->laddr = la->sin_addr.v4.s_addr;
			v4->fport = fa->sin_port;
			v4->lport = la->sin_port;

			vtw->reuse_port = 1;
			vtw->reuse_addr = 1;
			vtw->v6only     = 0;
			vtw->uid        = 0;

			vtw_inshash_v4(ctl, vtw);
			break;
		}

		case AF_INET6: {
			vtw_v6_t *v6 = (void*)vtw;

			v6->faddr = fa->sin_addr.v6;
			v6->laddr = la->sin_addr.v6;

			v6->fport = fa->sin_port;
			v6->lport = la->sin_port;

			vtw->reuse_port = 1;
			vtw->reuse_addr = 1;
			vtw->v6only     = 0;
			vtw->uid        = 0;

			vtw_inshash_v6(ctl, vtw);
			break;
		}

		default:
			break;
		}

		return 1;
	}

	return 0;
}

static int vtw_syscall = 0;

static int
vtw_debug_process(vtw_sysargs_t *ap)
{
	struct vestigial_inpcb vestige;
	int rc = 0;

	mutex_enter(softnet_lock);

	switch (ap->op) {
	case 0:		// insert
		vtw_debug_add(ap->la.sin_family
		    , &ap->la
		    , &ap->fa
		    , TCPTV_MSL
		    , 0);
		break;

	case 1:		// lookup
	case 2:		// restart
		switch (ap->la.sin_family) {
		case AF_INET:
			if (tcp_lookup_v4(ap->fa.sin_addr.v4, ap->fa.sin_port,
			    ap->la.sin_addr.v4, ap->la.sin_port,
			    &vestige)) {
				if (ap->op == 2) {
					vtw_restart(&vestige);
				}
				rc = 0;
			} else
				rc = ESRCH;
			break;

		case AF_INET6:
			if (tcp_lookup_v6(&ap->fa.sin_addr.v6, ap->fa.sin_port,
			    &ap->la.sin_addr.v6, ap->la.sin_port,
			    &vestige)) {
				if (ap->op == 2) {
					vtw_restart(&vestige);
				}
				rc = 0;
			} else
				rc = ESRCH;
			break;
		default:
			rc = EINVAL;
		}
		break;

	default:
		rc = EINVAL;
	}

	mutex_exit(softnet_lock);
	return rc;
}

struct sys_vtw_args {
	syscallarg(const vtw_sysargs_t *) req;
	syscallarg(size_t) len;
};

static int
vtw_sys(struct lwp *l, const void *_, register_t *retval)
{
	const struct sys_vtw_args *uap = _;
	void *buf;
	int rc;
	size_t len = SCARG(uap, len);

	if (len != sizeof (vtw_sysargs_t))
		return EINVAL;

	buf = kmem_alloc(len, KM_SLEEP);
	if (!buf)
		return ENOMEM;

	rc = copyin(SCARG(uap, req), buf, len);
	if (!rc) {
		rc = vtw_debug_process(buf);
	}
	kmem_free(buf, len);

	return rc;
}
static void
vtw_sanity_check(void)
{
	vtw_ctl_t *ctl;
	vtw_t *vtw;
	int i;
	int n;

	for (i = 0; i < VTW_NCLASS; ++i) {
		ctl = &vtw_tcpv4[i];

		if (!ctl->base.v || ctl->nalloc)
			continue;

		for (n = 0, vtw = ctl->base.v; ; ) {
			++n;
			vtw = vtw_next(ctl, vtw);
			if (vtw == ctl->base.v)
				break;
		}
		db_trace(KTR_VTW
		    , (ctl, "sanity: class %x n %x nfree %x"
		    , i, n, ctl->nfree));

		KASSERT(n == ctl->nfree);
	}

	for (i = 0; i < VTW_NCLASS; ++i) {
		ctl = &vtw_tcpv6[i];

		if (!ctl->base.v || ctl->nalloc)
			continue;

		for (n = 0, vtw = ctl->base.v; ; ) {
			++n;
			vtw = vtw_next(ctl, vtw);
			if (vtw == ctl->base.v)
				break;
		}
		db_trace(KTR_VTW
		    , (ctl, "sanity: class %x n %x nfree %x"
		    , i, n, ctl->nfree));
		KASSERT(n == ctl->nfree);
	}
}

/*!\brief Initialise debug support.
 */
static void
vtw_debug_init(void)
{
	int i;

	vtw_sanity_check();

	if (vtw_syscall)
		return;

	for (i = 511; i; --i) {
		if (sysent[i].sy_call == sys_nosys) {
			sysent[i].sy_call    = vtw_sys;
			sysent[i].sy_narg    = 2;
			sysent[i].sy_argsize = sizeof (struct sys_vtw_args);
			sysent[i].sy_flags   = 0;

			vtw_syscall = i;
			break;
		}
	}
	if (i) {
		const struct sysctlnode *node;
		uint32_t flags;

		flags = sysctl_root.sysctl_flags;

		sysctl_root.sysctl_flags |= CTLFLAG_READWRITE;
		sysctl_root.sysctl_flags &= ~CTLFLAG_PERMANENT;

		sysctl_createv(0, 0, 0, &node,
		    CTLFLAG_PERMANENT, CTLTYPE_NODE,
		    "koff",
		    SYSCTL_DESCR("Kernel Obscure Feature Finder"),
		    0, 0, 0, 0, CTL_CREATE, CTL_EOL);

		if (!node) {
			sysctl_createv(0, 0, 0, &node,
			    CTLFLAG_PERMANENT, CTLTYPE_NODE,
			    "koffka",
			    SYSCTL_DESCR("The Real(tm) Kernel"
				" Obscure Feature Finder"),
			    0, 0, 0, 0, CTL_CREATE, CTL_EOL);
		}
		if (node) {
			sysctl_createv(0, 0, 0, 0,
			    CTLFLAG_PERMANENT|CTLFLAG_READONLY,
			    CTLTYPE_INT, "vtw_debug_syscall",
			    SYSCTL_DESCR("vtw debug"
				" system call number"),
			    0, 0, &vtw_syscall, 0, node->sysctl_num,
			    CTL_CREATE, CTL_EOL);
		}
		sysctl_root.sysctl_flags = flags;
	}
}
#else /* !VTW_DEBUG */
static void
vtw_debug_init(void)
{
	return;
}
#endif /* !VTW_DEBUG */