/*
 * Copyright (c) 2011 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Coyote Point Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Reduces the resources demanded by TCP sessions in the TIME_WAIT state
 * using methods called Vestigial Time-Wait (VTW) and Maximum Segment
 * Lifetime Truncation (MSLT).
 *
 * MSLT and VTW were contributed by Coyote Point Systems, Inc.
 *
 * Even after a TCP session enters the TIME_WAIT state, its corresponding
 * socket and protocol control blocks (PCBs) stick around until the TCP
 * Maximum Segment Lifetime (MSL) expires.  On a host whose workload
 * necessarily creates and closes down many TCP sockets, the sockets and
 * PCBs for TCP sessions in TIME_WAIT state amount to many megabytes of
 * dead weight in RAM.
 *
 * Maximum Segment Lifetime Truncation (MSLT) assigns each TCP session to
 * a class based on the nearness of the peer.  Corresponding to each class
 * is an MSL, and a session uses the MSL of its class.  The classes are
 * loopback (local host equals remote host), local (local host and remote
 * host are on the same link/subnet), and remote (local host and remote
 * host communicate via one or more gateways).  Classes corresponding to
 * nearer peers have lower MSLs by default: 2 seconds for loopback, 10
 * seconds for local, 60 seconds for remote.  Loopback and local sessions
 * expire more quickly when MSLT is used.
 *
 * Vestigial Time-Wait (VTW) replaces a TIME_WAIT session's PCB/socket
 * dead weight with a compact representation of the session, called a
 * "vestigial PCB".  VTW data structures are designed to be very fast and
 * memory-efficient: for fast insertion and lookup of vestigial PCBs,
 * the PCBs are stored in a hash table that is designed to minimize the
 * number of cacheline visits per lookup/insertion.  The memory both
 * for vestigial PCBs and for elements of the PCB hashtable comes from
 * fixed-size pools, and linked data structures exploit this to conserve
 * memory by representing references with a narrow index/offset from the
 * start of a pool instead of a pointer.  When space for new vestigial PCBs
 * runs out, VTW makes room by discarding old vestigial PCBs, oldest first.
 * VTW cooperates with MSLT.
 *
 * It may help to think of VTW as a "FIN cache" by analogy to the SYN
 * cache.
 *
 * A 2.8-GHz Pentium 4 running a test workload that creates TIME_WAIT
 * sessions as fast as it can is approximately 17% idle when VTW is active
 * versus 0% idle when VTW is inactive.  It has 103 megabytes more free RAM
 * when VTW is active (approximately 64k vestigial PCBs are created) than
 * when it is inactive.
 */
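/*
 * For illustration only: MSLT and VTW are typically switched on at run
 * time through sysctl(8).  The knob names below correspond to the
 * tcp_msl_* and tcp{4,6}_vtw_enable variables referenced in this file,
 * but verify against tcp(4) on the target release before relying on
 * them:
 *
 *	sysctl -w net.inet.tcp.msl_enable=1	# enable MSLT classes
 *	sysctl -w net.inet.tcp.vtw_enable=1	# VTW for IPv4 TCP
 *	sysctl -w net.inet6.tcp6.vtw_enable=1	# VTW for IPv6 TCP
 */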
#include <sys/cdefs.h>

#ifdef _KERNEL_OPT
#include "opt_ddb.h"
#include "opt_inet.h"
#include "opt_inet_csum.h"
#include "opt_tcp_debug.h"
#endif

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kmem.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/errno.h>
#include <sys/syslog.h>
#include <sys/pool.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <net/if.h>
#include <net/if_types.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
#include <netinet/in_offload.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/in6_var.h>
#include <netinet/icmp6.h>

#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_private.h>

#include <netinet/tcp_vtw.h>

__KERNEL_RCSID(0, "$NetBSD: tcp_vtw.c,v 1.25 2024/10/07 23:17:00 jakllsch Exp $");

#define db_trace(__a, __b)	do { } while (/*CONSTCOND*/0)

static void vtw_debug_init(void);

fatp_ctl_t fat_tcpv4;
fatp_ctl_t fat_tcpv6;
vtw_ctl_t  vtw_tcpv4[VTW_NCLASS];
vtw_ctl_t  vtw_tcpv6[VTW_NCLASS];
vtw_stats_t vtw_stats;

/* We provide state for the lookup_ports iterator.
 * As we are currently netlock-protected, there is only one.
 * If we were finer-grained, we would have one per CPU.
 * I do not want to be in the business of alloc/free.
 * The best alternative would be to allocate on the caller's
 * stack, but that would require them to know the struct,
 * or at least its size.
 * See how she goes.
 */
struct tcp_ports_iterator {
	union {
		struct in_addr	v4;
		struct in6_addr	v6;
	}		addr;
	u_int		port;

	uint32_t	wild	: 1;

	vtw_ctl_t	*ctl;
	fatp_t		*fp;

	uint16_t	slot_idx;
	uint16_t	ctl_idx;
};

static struct tcp_ports_iterator tcp_ports_iterator_v4;
static struct tcp_ports_iterator tcp_ports_iterator_v6;

static int vtw_age(vtw_ctl_t *, struct timeval *);

/*!\brief allocate a fat pointer from a collection.
 */
static fatp_t *
fatp_alloc(fatp_ctl_t *fat)
{
	fatp_t	*fp = 0;

	if (fat->nfree) {
		fp = fat->free;
		if (fp) {
			fat->free = fatp_next(fat, fp);
			--fat->nfree;
			++fat->nalloc;
			fp->nxt = 0;

			KASSERT(!fp->inuse);
		}
	}

	return fp;
}

/*!\brief free a fat pointer.
 */
static void
fatp_free(fatp_ctl_t *fat, fatp_t *fp)
{
	if (fp) {
		KASSERT(!fp->inuse);
		KASSERT(!fp->nxt);

		fp->nxt = fatp_index(fat, fat->free);
		fat->free = fp;

		++fat->nfree;
		--fat->nalloc;
	}
}
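/*
 * A sketch of the pool-index convention the free list above relies on
 * (fatp_index() and fatp_next() themselves live in tcp_vtw.h): indices
 * are offsets from fat->base biased by one, so that 0 can stand in for
 * a NULL pointer:
 *
 *	idx = fp ? (fp - fat->base) + 1 : 0;	// cf. fatp_index()
 *	fp  = idx ? fat->base + idx - 1 : 0;	// the inverse mapping
 *
 * This is why fatp_from_key() below subtracts one, and why a key of
 * zero can never denote a valid fat pointer.
 */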
/*!\brief initialise a collection of fat pointers.
 *
 *\param n	total # of fat pointers to allocate (per hash)
 *\param m	# of hash buckets
 *
 * We allocate 2x as much, as we have two hashes: full and lport only.
 */
static void
fatp_init(fatp_ctl_t *fat, uint32_t n, uint32_t m,
    fatp_t *fat_base, fatp_t **fat_hash)
{
	fatp_t	*fp;

	KASSERT(n <= FATP_MAX / 2);

	fat->hash = fat_hash;
	fat->base = fat_base;

	fat->port = &fat->hash[m];

	fat->mask   = m - 1;	// assumes m is a power of 2
	fat->lim    = fat->base + 2*n - 1;
	fat->nfree  = 0;
	fat->nalloc = 2*n;

	/* Initialise the free list.
	 */
	for (fp = fat->lim; fp >= fat->base; --fp) {
		fatp_free(fat, fp);
	}
}

/*
 * The `xtra' is XORed into the tag stored.
 */
static uint32_t fatp_xtra[] = {
	0x11111111,0x22222222,0x33333333,0x44444444,
	0x55555555,0x66666666,0x77777777,0x88888888,
	0x12121212,0x21212121,0x34343434,0x43434343,
	0x56565656,0x65656565,0x78787878,0x87878787,
	0x11221122,0x22112211,0x33443344,0x44334433,
	0x55665566,0x66556655,0x77887788,0x88778877,
	0x11112222,0x22221111,0x33334444,0x44443333,
	0x55556666,0x66665555,0x77778888,0x88887777,
};

/*!\brief turn a {fatp_t*,slot} into an integral key.
 *
 * The key can be used to obtain the fatp_t, and the slot,
 * as it directly encodes them.
 */
static inline uint32_t
fatp_key(fatp_ctl_t *fat, fatp_t *fp, uint32_t slot)
{
	CTASSERT(CACHE_LINE_SIZE == 32 ||
		 CACHE_LINE_SIZE == 64 ||
		 CACHE_LINE_SIZE == 128 ||
		 CACHE_LINE_SIZE == 256);

	switch (fatp_ntags()) {
	case 7:
		return (fatp_index(fat, fp) << 3) | slot;
	case 15:
		return (fatp_index(fat, fp) << 4) | slot;
	case 31:
		return (fatp_index(fat, fp) << 5) | slot;
	default:
		KASSERT(0 && "no support, for no good reason");
		return ~0;
	}
}

static inline uint32_t
fatp_slot_from_key(fatp_ctl_t *fat, uint32_t key)
{
	CTASSERT(CACHE_LINE_SIZE == 32 ||
		 CACHE_LINE_SIZE == 64 ||
		 CACHE_LINE_SIZE == 128 ||
		 CACHE_LINE_SIZE == 256);

	switch (fatp_ntags()) {
	case 7:
		return key & 7;
	case 15:
		return key & 15;
	case 31:
		return key & 31;
	default:
		KASSERT(0 && "no support, for no good reason");
		return ~0;
	}
}

static inline fatp_t *
fatp_from_key(fatp_ctl_t *fat, uint32_t key)
{
	CTASSERT(CACHE_LINE_SIZE == 32 ||
		 CACHE_LINE_SIZE == 64 ||
		 CACHE_LINE_SIZE == 128 ||
		 CACHE_LINE_SIZE == 256);

	switch (fatp_ntags()) {
	case 7:
		key >>= 3;
		break;
	case 15:
		key >>= 4;
		break;
	case 31:
		key >>= 5;
		break;
	default:
		KASSERT(0 && "no support, for no good reason");
		return 0;
	}

	return key ? fat->base + key - 1 : 0;
}
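/*
 * A minimal sketch of the round trip the three helpers above provide,
 * assuming fatp_ntags() == 15 (64-byte cache lines), i.e. a key of
 * (index << 4) | slot:
 *
 *	uint32_t key = fatp_key(fat, fp, slot);
 *
 *	KASSERT(fatp_from_key(fat, key) == fp);
 *	KASSERT(fatp_slot_from_key(fat, key) == slot);
 *
 * Because fatp_index() is one-biased, no valid {fp,slot} pair can
 * produce a key of zero.
 */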
static inline uint32_t
idx_encode(vtw_ctl_t *ctl, uint32_t idx)
{
	return (idx << ctl->idx_bits) | idx;
}

static inline uint32_t
idx_decode(vtw_ctl_t *ctl, uint32_t bits)
{
	uint32_t	idx = bits & ctl->idx_mask;

	if (idx_encode(ctl, idx) == bits)
		return idx;
	else
		return ~0;
}
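/*
 * The encoding above replicates the index into the otherwise-unused
 * high bits, so a decode can verify that it is looking at a real
 * index.  A sketch, assuming idx_bits == 16 and idx_mask == 0xffff:
 *
 *	idx_encode(ctl, 0x0123) == 0x01230123
 *	idx_decode(ctl, 0x01230123) == 0x0123	// high bits match: valid
 *	idx_decode(ctl, 0xdead0123) == ~0	// mismatch: rejected
 *
 * This is what lets the hash probes below reject garbage produced by
 * XORing a stored tag with the wrong lookup tag.
 */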
/*!\brief insert index into fatp hash
 *
 *\param idx	- index of element being placed in hash chain
 *\param tag	- 32-bit tag identifier
 *
 *\returns
 *	value which can be used to locate entry.
 *
 *\note
 *	we rely on the fact that there are unused high bits in the index
 *	for verification purposes on lookup.
 */
static inline uint32_t
fatp_vtw_inshash(fatp_ctl_t *fat, uint32_t idx, uint32_t tag, int which,
    void *dbg)
{
	fatp_t	*fp;
	fatp_t	**hash = (which ? fat->port : fat->hash);
	int	i;

	fp = hash[tag & fat->mask];

	while (!fp || fatp_full(fp)) {
		fatp_t	*fq;

		/* All entries are in use at the top level.
		 * We allocate a spare, and push the top level
		 * down one.  All entries in the fp we push down
		 * (think of a tapeworm here) will be expelled sooner than
		 * any entries added subsequently to this hash bucket.
		 * This is a property of the time waits we are exploiting.
		 */

		fq = fatp_alloc(fat);
		if (!fq) {
			vtw_age(fat->vtw, 0);
			fp = hash[tag & fat->mask];
			continue;
		}

		fq->inuse = 0;
		fq->nxt   = fatp_index(fat, fp);

		hash[tag & fat->mask] = fq;

		fp = fq;
	}

	KASSERT(!fatp_full(fp));

	/* Fill highest index first.  Lookup is lowest first.
	 */
	for (i = fatp_ntags(); --i >= 0; ) {
		if (!((1 << i) & fp->inuse)) {
			break;
		}
	}

	fp->inuse |= 1 << i;
	fp->tag[i] = tag ^ idx_encode(fat->vtw, idx) ^ fatp_xtra[i];

	db_trace(KTR_VTW
		 , (fp, "fat: inuse %5.5x tag[%x] %8.8x"
		    , fp->inuse
		    , i, fp->tag[i]));

	return fatp_key(fat, fp, i);
}
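/*
 * What the XOR in fatp_vtw_inshash() buys us, as a sketch: the slot
 * stores
 *
 *	tag[i] = tag ^ idx_encode(ctl, idx) ^ fatp_xtra[i]
 *
 * so a lookup that has recomputed `tag' from the packet headers can
 * recover the candidate index with two XORs and no memory reference:
 *
 *	idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
 *
 * If the probing tag is not the stored tag, idx_decode() almost
 * certainly rejects the result, and the candidate is discarded
 * without touching the vtw_t itself.
 */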
static inline int
vtw_alive(const vtw_t *vtw)
{
	return vtw->hashed && vtw->expire.tv_sec;
}

static inline uint32_t
vtw_index_v4(vtw_ctl_t *ctl, vtw_v4_t *v4)
{
	if (ctl->base.v4 <= v4 && v4 <= ctl->lim.v4)
		return v4 - ctl->base.v4;

	KASSERT(0 && "vtw out of bounds");

	return ~0;
}

static inline uint32_t
vtw_index_v6(vtw_ctl_t *ctl, vtw_v6_t *v6)
{
	if (ctl->base.v6 <= v6 && v6 <= ctl->lim.v6)
		return v6 - ctl->base.v6;

	KASSERT(0 && "vtw out of bounds");

	return ~0;
}

static inline uint32_t
vtw_index(vtw_ctl_t *ctl, vtw_t *vtw)
{
	if (ctl->clidx)
		ctl = ctl->ctl;

	if (ctl->is_v4)
		return vtw_index_v4(ctl, (vtw_v4_t *)vtw);

	if (ctl->is_v6)
		return vtw_index_v6(ctl, (vtw_v6_t *)vtw);

	KASSERT(0 && "neither 4 nor 6.  most curious.");

	return ~0;
}

static inline vtw_t *
vtw_from_index(vtw_ctl_t *ctl, uint32_t idx)
{
	if (ctl->clidx)
		ctl = ctl->ctl;

	/* See if the index looks like it might be an index.
	 * Bits outside of the valid index bits are a giveaway.
	 */
	idx = idx_decode(ctl, idx);

	if (idx == ~0) {
		return 0;
	} else if (ctl->is_v4) {
		vtw_v4_t	*vtw = ctl->base.v4 + idx;

		return (ctl->base.v4 <= vtw && vtw <= ctl->lim.v4)
			? &vtw->common : 0;
	} else if (ctl->is_v6) {
		vtw_v6_t	*vtw = ctl->base.v6 + idx;

		return (ctl->base.v6 <= vtw && vtw <= ctl->lim.v6)
			? &vtw->common : 0;
	} else {
		KASSERT(0 && "badness");
		return 0;
	}
}

/*!\brief return the next vtw after this one.
 *
 * Due to the differing sizes of the entries in differing
 * arenas, we have to ensure we ++ the correct pointer type.
 *
 * Also handles wrap.
 */
static inline vtw_t *
vtw_next(vtw_ctl_t *ctl, vtw_t *vtw)
{
	if (ctl->is_v4) {
		vtw_v4_t	*v4 = (void*)vtw;

		vtw = &(++v4)->common;
	} else {
		vtw_v6_t	*v6 = (void*)vtw;

		vtw = &(++v6)->common;
	}

	if (vtw > ctl->lim.v)
		vtw = ctl->base.v;

	return vtw;
}

/*!\brief remove entry from FATP hash chains
 */
static inline void
vtw_unhash(vtw_ctl_t *ctl, vtw_t *vtw)
{
	fatp_ctl_t	*fat = ctl->fat;
	fatp_t		*fp;
	uint32_t	key = vtw->key;
	uint32_t	tag, slot, idx;
	vtw_v4_t	*v4 = (void*)vtw;
	vtw_v6_t	*v6 = (void*)vtw;

	if (!vtw->hashed) {
		KASSERT(0 && "unhashed");
		return;
	}

	if (fat->vtw->is_v4) {
		tag = v4_tag(v4->faddr, v4->fport, v4->laddr, v4->lport);
	} else if (fat->vtw->is_v6) {
		tag = v6_tag(&v6->faddr, v6->fport, &v6->laddr, v6->lport);
	} else {
		tag = 0;
		KASSERT(0 && "not reached");
	}

	/* Remove from fat->hash[]
	 */
	slot = fatp_slot_from_key(fat, key);
	fp   = fatp_from_key(fat, key);
	idx  = vtw_index(ctl, vtw);

	db_trace(KTR_VTW
		 , (fp, "fat: del inuse %5.5x slot %x idx %x key %x tag %x"
		    , fp->inuse, slot, idx, key, tag));

	KASSERT(fp->inuse & (1 << slot));
	KASSERT(fp->tag[slot] == (tag ^ idx_encode(ctl, idx)
				  ^ fatp_xtra[slot]));

	if ((fp->inuse & (1 << slot))
	    && fp->tag[slot] == (tag ^ idx_encode(ctl, idx)
				 ^ fatp_xtra[slot])) {
		fp->inuse ^= 1 << slot;
		fp->tag[slot] = 0;

		/* When we delete entries, we do not compact.  This is
		 * due to temporality.  We add entries, and they
		 * (eventually) expire.  Older entries will be further
		 * down the chain.
		 */
		if (!fp->inuse) {
			uint32_t hi = tag & fat->mask;
			fatp_t	*fq = 0;
			fatp_t	*fr = fat->hash[hi];

			while (fr && fr != fp) {
				fr = fatp_next(fat, fq = fr);
			}

			if (fr == fp) {
				if (fq) {
					fq->nxt = fp->nxt;
					fp->nxt = 0;
					fatp_free(fat, fp);
				} else {
					KASSERT(fat->hash[hi] == fp);

					if (fp->nxt) {
						fat->hash[hi]
							= fatp_next(fat, fp);
						fp->nxt = 0;
						fatp_free(fat, fp);
					} else {
						/* retain for next use.
						 */
						;
					}
				}
			} else {
				fr = fat->hash[hi];

				do {
					db_trace(KTR_VTW
						 , (fr
						    , "fat:*del inuse %5.5x"
						    " nxt %x"
						    , fr->inuse, fr->nxt));

					fr = fatp_next(fat, fq = fr);
				} while (fr && fr != fp);

				KASSERT(0 && "oops");
			}
		}
		vtw->key ^= ~0;
	}

	if (fat->vtw->is_v4) {
		tag = v4_port_tag(v4->lport);
	} else if (fat->vtw->is_v6) {
		tag = v6_port_tag(v6->lport);
	}

	/* Remove from fat->port[]
	 */
	key  = vtw->port_key;
	slot = fatp_slot_from_key(fat, key);
	fp   = fatp_from_key(fat, key);
	idx  = vtw_index(ctl, vtw);

	db_trace(KTR_VTW
		 , (fp, "fatport: del inuse %5.5x"
		    " slot %x idx %x key %x tag %x"
		    , fp->inuse, slot, idx, key, tag));

	KASSERT(fp->inuse & (1 << slot));
	KASSERT(fp->tag[slot] == (tag ^ idx_encode(ctl, idx)
				  ^ fatp_xtra[slot]));

	if ((fp->inuse & (1 << slot))
	    && fp->tag[slot] == (tag ^ idx_encode(ctl, idx)
				 ^ fatp_xtra[slot])) {
		fp->inuse ^= 1 << slot;
		fp->tag[slot] = 0;

		if (!fp->inuse) {
			uint32_t hi = tag & fat->mask;
			fatp_t	*fq = 0;
			fatp_t	*fr = fat->port[hi];

			while (fr && fr != fp) {
				fr = fatp_next(fat, fq = fr);
			}

			if (fr == fp) {
				if (fq) {
					fq->nxt = fp->nxt;
					fp->nxt = 0;
					fatp_free(fat, fp);
				} else {
					KASSERT(fat->port[hi] == fp);

					if (fp->nxt) {
						fat->port[hi]
							= fatp_next(fat, fp);
						fp->nxt = 0;
						fatp_free(fat, fp);
					} else {
						/* retain for next use.
						 */
						;
					}
				}
			}
		}
		vtw->port_key ^= ~0;
	}

	vtw->hashed = 0;
}

/*!\brief remove entry from hash, possibly free.
 */
void
vtw_del(vtw_ctl_t *ctl, vtw_t *vtw)
{
	KASSERT(mutex_owned(softnet_lock));

	if (vtw->hashed) {
		++vtw_stats.del;
		vtw_unhash(ctl, vtw);
	}

	/* We only delete the oldest entry.
	 */
	if (vtw != ctl->oldest.v)
		return;

	--ctl->nalloc;
	++ctl->nfree;

	vtw->expire.tv_sec  = 0;
	vtw->expire.tv_usec = ~0;

	if (!ctl->nalloc)
		ctl->oldest.v = 0;
	else
		ctl->oldest.v = vtw_next(ctl, vtw);
}
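/*
 * Note on the arena discipline vtw_del() maintains: entries are
 * allocated at ctl->alloc and reclaimed at ctl->oldest, so the arena
 * behaves as a ring buffer ordered by expiration time:
 *
 *	base.v ... [oldest.v ... alloc.v) ... lim.v	(modulo wrap)
 *
 * Everything in [oldest, alloc) is live, the oldest entry expires
 * first, and vtw_age() below only ever needs to examine the head of
 * the queue.
 */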
/*!\brief insert vestigial timewait in hash chain
 */
static void
vtw_inshash_v4(vtw_ctl_t *ctl, vtw_t *vtw)
{
	uint32_t idx = vtw_index(ctl, vtw);
	uint32_t tag;
	vtw_v4_t *v4 = (void*)vtw;

	KASSERT(mutex_owned(softnet_lock));
	KASSERT(!vtw->hashed);
	KASSERT(ctl->clidx == vtw->msl_class);

	++vtw_stats.ins;

	tag = v4_tag(v4->faddr, v4->fport,
		     v4->laddr, v4->lport);

	vtw->key = fatp_vtw_inshash(ctl->fat, idx, tag, 0, vtw);

	db_trace(KTR_VTW, (ctl
			   , "vtw: ins %8.8x:%4.4x %8.8x:%4.4x"
			   " tag %8.8x key %8.8x"
			   , v4->faddr, v4->fport
			   , v4->laddr, v4->lport
			   , tag
			   , vtw->key));

	tag = v4_port_tag(v4->lport);
	vtw->port_key = fatp_vtw_inshash(ctl->fat, idx, tag, 1, vtw);

	db_trace(KTR_VTW, (ctl, "vtw: ins %P - %4.4x tag %8.8x key %8.8x"
			   , v4->lport, v4->lport
			   , tag
			   , vtw->key));

	vtw->hashed = 1;
}

/*!\brief insert vestigial timewait in hash chain
 */
static void
vtw_inshash_v6(vtw_ctl_t *ctl, vtw_t *vtw)
{
	uint32_t idx = vtw_index(ctl, vtw);
	uint32_t tag;
	vtw_v6_t *v6 = (void*)vtw;

	KASSERT(mutex_owned(softnet_lock));
	KASSERT(!vtw->hashed);
	KASSERT(ctl->clidx == vtw->msl_class);

	++vtw_stats.ins;

	tag = v6_tag(&v6->faddr, v6->fport,
		     &v6->laddr, v6->lport);

	vtw->key = fatp_vtw_inshash(ctl->fat, idx, tag, 0, vtw);

	tag = v6_port_tag(v6->lport);
	vtw->port_key = fatp_vtw_inshash(ctl->fat, idx, tag, 1, vtw);

	db_trace(KTR_VTW, (ctl, "vtw: ins %P - %4.4x tag %8.8x key %8.8x"
			   , v6->lport, v6->lport
			   , tag
			   , vtw->key));

	vtw->hashed = 1;
}

static vtw_t *
vtw_lookup_hash_v4(vtw_ctl_t *ctl, uint32_t faddr, uint16_t fport
		   , uint32_t laddr, uint16_t lport
		   , int which)
{
	vtw_v4_t	*v4;
	vtw_t		*vtw;
	uint32_t	tag;
	fatp_t		*fp;
	int		i;
	uint32_t	fatps = 0, probes = 0, losings = 0;

	if (!ctl || !ctl->fat)
		return 0;

	++vtw_stats.look[which];

	if (which) {
		tag = v4_port_tag(lport);
		fp  = ctl->fat->port[tag & ctl->fat->mask];
	} else {
		tag = v4_tag(faddr, fport, laddr, lport);
		fp  = ctl->fat->hash[tag & ctl->fat->mask];
	}

	while (fp && fp->inuse) {
		uint32_t	inuse = fp->inuse;

		++fatps;

		for (i = 0; inuse && i < fatp_ntags(); ++i) {
			uint32_t	idx;

			if (!(inuse & (1 << i)))
				continue;

			inuse ^= 1 << i;

			++probes;
			++vtw_stats.probe[which];

			idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
			vtw = vtw_from_index(ctl, idx);

			if (!vtw) {
				/* Hopefully fast path.
				 */
				db_trace(KTR_VTW
					 , (fp, "vtw: fast %A:%P %A:%P"
					    " idx %x tag %x"
					    , faddr, fport
					    , laddr, lport
					    , idx, tag));
				continue;
			}

			v4 = (void*)vtw;

			/* The de-referencing of vtw is what we want to avoid.
			 * Losing.
			 */
			if (vtw_alive(vtw)
			    && ((which ? vtw->port_key : vtw->key)
				== fatp_key(ctl->fat, fp, i))
			    && (which
				|| (v4->faddr == faddr && v4->laddr == laddr
				    && v4->fport == fport))
			    && v4->lport == lport) {
				++vtw_stats.hit[which];

				db_trace(KTR_VTW
					 , (fp, "vtw: hit %8.8x:%4.4x"
					    " %8.8x:%4.4x idx %x key %x"
					    , faddr, fport
					    , laddr, lport
					    , idx_decode(ctl, idx), vtw->key));

				KASSERT(vtw->hashed);

				goto out;
			}
			++vtw_stats.losing[which];
			++losings;

			if (vtw_alive(vtw)) {
				db_trace(KTR_VTW
					 , (fp, "vtw:!mis %8.8x:%4.4x"
					    " %8.8x:%4.4x key %x tag %x"
					    , faddr, fport
					    , laddr, lport
					    , fatp_key(ctl->fat, fp, i)
					    , v4_tag(faddr, fport
						     , laddr, lport)));
				db_trace(KTR_VTW
					 , (vtw, "vtw:!mis %8.8x:%4.4x"
					    " %8.8x:%4.4x key %x tag %x"
					    , v4->faddr, v4->fport
					    , v4->laddr, v4->lport
					    , vtw->key
					    , v4_tag(v4->faddr, v4->fport
						     , v4->laddr, v4->lport)));

				if (vtw->key == fatp_key(ctl->fat, fp, i)) {
					db_trace(KTR_VTW
						 , (vtw, "vtw:!mis %8.8x:%4.4x"
						    " %8.8x:%4.4x key %x"
						    " which %x"
						    , v4->faddr, v4->fport
						    , v4->laddr, v4->lport
						    , vtw->key
						    , which));

				} else {
					db_trace(KTR_VTW
						 , (vtw
						    , "vtw:!mis"
						    " key %8.8x != %8.8x"
						    " idx %x i %x which %x"
						    , vtw->key
						    , fatp_key(ctl->fat, fp, i)
						    , idx_decode(ctl, idx)
						    , i
						    , which));
				}
			} else {
				db_trace(KTR_VTW
					 , (fp
					    , "vtw:!mis free entry"
					    " idx %x vtw %p which %x"
					    , idx_decode(ctl, idx)
					    , vtw, which));
			}
		}

		if (fp->nxt) {
			fp = fatp_next(ctl->fat, fp);
		} else {
			break;
		}
	}
	++vtw_stats.miss[which];
	vtw = 0;
out:
	if (fatps > vtw_stats.max_chain[which])
		vtw_stats.max_chain[which] = fatps;
	if (probes > vtw_stats.max_probe[which])
		vtw_stats.max_probe[which] = probes;
	if (losings > vtw_stats.max_loss[which])
		vtw_stats.max_loss[which] = losings;

	return vtw;
}
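/*
 * The probe loop above, reduced to its essentials: each candidate is
 * screened three times, and only the final screen touches the vtw_t
 * cache line.  A sketch:
 *
 *	idx = fp->tag[i] ^ tag ^ fatp_xtra[i];	// 1: XOR recovers index
 *	vtw = vtw_from_index(ctl, idx);		// 2: idx_decode() screens it
 *	if (vtw && vtw->key == fatp_key(...))	// 3: confirm on the entry
 *		...check the actual addresses and ports...
 *
 * A stale or colliding tag usually dies at step 2, which is the
 * "hopefully fast path" in the code.
 */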
static vtw_t *
vtw_lookup_hash_v6(vtw_ctl_t *ctl, const struct in6_addr *faddr, uint16_t fport
		   , const struct in6_addr *laddr, uint16_t lport
		   , int which)
{
	vtw_v6_t	*v6;
	vtw_t		*vtw;
	uint32_t	tag;
	fatp_t		*fp;
	int		i;
	uint32_t	fatps = 0, probes = 0, losings = 0;

	++vtw_stats.look[which];

	if (!ctl || !ctl->fat)
		return 0;

	if (which) {
		tag = v6_port_tag(lport);
		fp  = ctl->fat->port[tag & ctl->fat->mask];
	} else {
		tag = v6_tag(faddr, fport, laddr, lport);
		fp  = ctl->fat->hash[tag & ctl->fat->mask];
	}

	while (fp && fp->inuse) {
		uint32_t	inuse = fp->inuse;

		++fatps;

		for (i = 0; inuse && i < fatp_ntags(); ++i) {
			uint32_t	idx;

			if (!(inuse & (1 << i)))
				continue;

			inuse ^= 1 << i;

			++probes;
			++vtw_stats.probe[which];

			idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
			vtw = vtw_from_index(ctl, idx);

			db_trace(KTR_VTW
				 , (fp, "probe: %2d %6A:%4.4x %6A:%4.4x idx %x"
				    , i
				    , db_store(faddr, sizeof (*faddr)), fport
				    , db_store(laddr, sizeof (*laddr)), lport
				    , idx_decode(ctl, idx)));

			if (!vtw) {
				/* Hopefully fast path.
				 */
				continue;
			}

			v6 = (void*)vtw;

			if (vtw_alive(vtw)
			    && ((which ? vtw->port_key : vtw->key)
				== fatp_key(ctl->fat, fp, i))
			    && v6->lport == lport
			    && (which
				|| (v6->fport == fport
				    && !bcmp(&v6->faddr, faddr, sizeof (*faddr))
				    && !bcmp(&v6->laddr, laddr
					     , sizeof (*laddr))))) {
				++vtw_stats.hit[which];

				KASSERT(vtw->hashed);
				goto out;
			} else {
				++vtw_stats.losing[which];
				++losings;
			}
		}

		if (fp->nxt) {
			fp = fatp_next(ctl->fat, fp);
		} else {
			break;
		}
	}
	++vtw_stats.miss[which];
	vtw = 0;
out:
	if (fatps > vtw_stats.max_chain[which])
		vtw_stats.max_chain[which] = fatps;
	if (probes > vtw_stats.max_probe[which])
		vtw_stats.max_probe[which] = probes;
	if (losings > vtw_stats.max_loss[which])
		vtw_stats.max_loss[which] = losings;

	return vtw;
}
/*!\brief port iterator
 */
static vtw_t *
vtw_next_port_v4(struct tcp_ports_iterator *it)
{
	vtw_ctl_t	*ctl = it->ctl;
	vtw_v4_t	*v4;
	vtw_t		*vtw;
	uint32_t	tag;
	uint16_t	lport = it->port;
	fatp_t		*fp;
	int		i;
	uint32_t	fatps = 0, probes = 0, losings = 0;

	tag = v4_port_tag(lport);
	if (!it->fp) {
		it->fp = ctl->fat->port[tag & ctl->fat->mask];
		it->slot_idx = 0;
	}
	fp = it->fp;

	while (fp) {
		uint32_t	inuse = fp->inuse;

		++fatps;

		for (i = it->slot_idx; inuse && i < fatp_ntags(); ++i) {
			uint32_t	idx;

			if (!(inuse & (1 << i)))
				continue;

			inuse &= ~0U << i;

			if (i < it->slot_idx)
				continue;

			++vtw_stats.probe[1];
			++probes;

			idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
			vtw = vtw_from_index(ctl, idx);

			if (!vtw) {
				/* Hopefully fast path.
				 */
				continue;
			}

			v4 = (void*)vtw;

			if (vtw_alive(vtw)
			    && vtw->port_key == fatp_key(ctl->fat, fp, i)
			    && v4->lport == lport) {
				++vtw_stats.hit[1];

				it->slot_idx = i + 1;

				goto out;
			} else if (vtw_alive(vtw)) {
				++vtw_stats.losing[1];
				++losings;

				db_trace(KTR_VTW
					 , (vtw, "vtw:!mis"
					    " port %8.8x:%4.4x %8.8x:%4.4x"
					    " key %x port %x"
					    , v4->faddr, v4->fport
					    , v4->laddr, v4->lport
					    , vtw->key
					    , lport));
			} else {
				/* Really losing here.  We are coming
				 * up with references to free entries.
				 * Might find it better to use the
				 * traditional approach, or need another
				 * ad-hockery.  The other ad-hockery
				 * would be to pull more into the
				 * cache line to reject the false
				 * hits.
				 */
				++vtw_stats.losing[1];
				++losings;
				db_trace(KTR_VTW
					 , (fp, "vtw:!mis port %x"
					    " - free entry idx %x vtw %p"
					    , lport
					    , idx_decode(ctl, idx)
					    , vtw));
			}
		}

		if (fp->nxt) {
			it->fp = fp = fatp_next(ctl->fat, fp);
			it->slot_idx = 0;
		} else {
			it->fp = 0;
			break;
		}
	}
	++vtw_stats.miss[1];

	vtw = 0;
out:
	if (fatps > vtw_stats.max_chain[1])
		vtw_stats.max_chain[1] = fatps;
	if (probes > vtw_stats.max_probe[1])
		vtw_stats.max_probe[1] = probes;
	if (losings > vtw_stats.max_loss[1])
		vtw_stats.max_loss[1] = losings;

	return vtw;
}
/*!\brief port iterator
 */
static vtw_t *
vtw_next_port_v6(struct tcp_ports_iterator *it)
{
	vtw_ctl_t	*ctl = it->ctl;
	vtw_v6_t	*v6;
	vtw_t		*vtw;
	uint32_t	tag;
	uint16_t	lport = it->port;
	fatp_t		*fp;
	int		i;
	uint32_t	fatps = 0, probes = 0, losings = 0;

	tag = v6_port_tag(lport);
	if (!it->fp) {
		it->fp = ctl->fat->port[tag & ctl->fat->mask];
		it->slot_idx = 0;
	}
	fp = it->fp;

	while (fp) {
		uint32_t	inuse = fp->inuse;

		++fatps;

		for (i = it->slot_idx; inuse && i < fatp_ntags(); ++i) {
			uint32_t	idx;

			if (!(inuse & (1 << i)))
				continue;

			inuse &= ~0U << i;

			if (i < it->slot_idx)
				continue;

			++vtw_stats.probe[1];
			++probes;

			idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
			vtw = vtw_from_index(ctl, idx);

			if (!vtw) {
				/* Hopefully fast path.
				 */
				continue;
			}

			v6 = (void*)vtw;

			db_trace(KTR_VTW
				 , (vtw, "vtw: i %x idx %x fp->tag %x"
				    " tag %x xtra %x"
				    , i, idx_decode(ctl, idx)
				    , fp->tag[i], tag, fatp_xtra[i]));

			if (vtw_alive(vtw)
			    && vtw->port_key == fatp_key(ctl->fat, fp, i)
			    && v6->lport == lport) {
				++vtw_stats.hit[1];

				db_trace(KTR_VTW
					 , (fp, "vtw: nxt port %P - %4.4x"
					    " idx %x key %x"
					    , lport, lport
					    , idx_decode(ctl, idx), vtw->key));

				it->slot_idx = i + 1;
				goto out;
			} else if (vtw_alive(vtw)) {
				++vtw_stats.losing[1];

				db_trace(KTR_VTW
					 , (vtw, "vtw:!mis port %6A:%4.4x"
					    " %6A:%4.4x key %x port %x"
					    , db_store(&v6->faddr
						       , sizeof (v6->faddr))
					    , v6->fport
					    , db_store(&v6->laddr
						       , sizeof (v6->faddr))
					    , v6->lport
					    , vtw->key
					    , lport));
			} else {
				/* Really losing here.  We are coming
				 * up with references to free entries.
				 * Might find it better to use the
				 * traditional approach, or need another
				 * ad-hockery.  The other ad-hockery
				 * would be to pull more into the
				 * cache line to reject the false
				 * hits.
				 */
				++vtw_stats.losing[1];
				++losings;

				db_trace(KTR_VTW
					 , (fp
					    , "vtw:!mis port %x"
					    " - free entry idx %x vtw %p"
					    , lport, idx_decode(ctl, idx)
					    , vtw));
			}
		}

		if (fp->nxt) {
			it->fp = fp = fatp_next(ctl->fat, fp);
			it->slot_idx = 0;
		} else {
			it->fp = 0;
			break;
		}
	}
	++vtw_stats.miss[1];

	vtw = 0;
out:
	if (fatps > vtw_stats.max_chain[1])
		vtw_stats.max_chain[1] = fatps;
	if (probes > vtw_stats.max_probe[1])
		vtw_stats.max_probe[1] = probes;
	if (losings > vtw_stats.max_loss[1])
		vtw_stats.max_loss[1] = losings;

	return vtw;
}
/*!\brief initialise the VTW allocation arena
 *
 * There are 1+3 allocation classes:
 *
 *	0	classless
 *	{1,2,3}	MSL-class based allocation
 *
 * The allocation arenas are all initialised.  Classless gets all the
 * space.  MSL-class based divides the arena, so that allocation
 * within a class can proceed without having to consider entries
 * (aka: cache lines) from different classes.
 *
 * Usually, we are completely classless or class-based, but there can be
 * transition periods, corresponding to dynamic adjustments in the config
 * by the operator.
 */
static void
vtw_init(fatp_ctl_t *fat, vtw_ctl_t *ctl, const uint32_t n, vtw_t *ctl_base_v)
{
	int class_n, i;
	vtw_t	*base;

	ctl->base.v = ctl_base_v;

	if (ctl->is_v4) {
		ctl->lim.v4   = ctl->base.v4 + n - 1;
		ctl->alloc.v4 = ctl->base.v4;
	} else {
		ctl->lim.v6   = ctl->base.v6 + n - 1;
		ctl->alloc.v6 = ctl->base.v6;
	}

	ctl->nfree = n;
	ctl->ctl   = ctl;

	ctl->idx_bits = 32;
	for (ctl->idx_mask = ~0; (ctl->idx_mask & (n-1)) == n-1; ) {
		ctl->idx_mask >>= 1;
		ctl->idx_bits -= 1;
	}

	ctl->idx_mask <<= 1;
	ctl->idx_mask |= 1;
	ctl->idx_bits += 1;

	ctl->fat = fat;
	fat->vtw = ctl;

	/* Divide the resources equally amongst the classes.
	 * This is not optimal, as the different classes
	 * arrive and leave at different rates, but it is
	 * the best I can do for now.
	 */
	class_n = n / (VTW_NCLASS-1);
	base    = ctl->base.v;

	for (i = 1; i < VTW_NCLASS; ++i) {
		int j;

		ctl[i] = ctl[0];
		ctl[i].clidx = i;

		ctl[i].base.v = base;
		ctl[i].alloc  = ctl[i].base;

		for (j = 0; j < class_n - 1; ++j) {
			if (tcp_msl_enable)
				base->msl_class = i;
			base = vtw_next(ctl, base);
		}

		ctl[i].lim.v = base;
		base = vtw_next(ctl, base);
		ctl[i].nfree = class_n;
	}

	vtw_debug_init();
}

/*!\brief map class to TCP MSL
 */
static inline uint32_t
class_to_msl(int msl_class)
{
	switch (msl_class) {
	case 0:
	case 1:
		return tcp_msl_remote ? tcp_msl_remote : (TCPTV_MSL >> 0);
	case 2:
		return tcp_msl_local ? tcp_msl_local : (TCPTV_MSL >> 1);
	default:
		return tcp_msl_loop ? tcp_msl_loop : (TCPTV_MSL >> 2);
	}
}

/*!\brief map TCP MSL to class
 */
static inline uint32_t
msl_to_class(int msl)
{
	if (tcp_msl_enable) {
		if (msl <= (tcp_msl_loop ? tcp_msl_loop : (TCPTV_MSL >> 2)))
			return 1+2;
		if (msl <= (tcp_msl_local ? tcp_msl_local : (TCPTV_MSL >> 1)))
			return 1+1;
		return 1;
	}
	return 0;
}
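/*
 * A worked example of the mappings above.  class_to_msl() returns the
 * class MSL in PR_SLOWHZ ticks, and vtw_alloc() below doubles it to
 * compute the expiration time, so under the assumption that the
 * tcp_msl_* knobs carry MSLs of 30 s (remote), 5 s (local) and 1 s
 * (loopback), the TIME_WAIT residency matches the 60 s/10 s/2 s
 * figures quoted at the top of this file:
 *
 *	class 1 (remote):   MSL 30s -> entry expires after 60s
 *	class 2 (local):    MSL  5s -> entry expires after 10s
 *	class 3 (loopback): MSL  1s -> entry expires after  2s
 *
 * With tcp_msl_enable off, everything collapses to class 0 and the
 * single traditional MSL.
 */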
/*!\brief allocate a vtw entry
 */
static inline vtw_t *
vtw_alloc(vtw_ctl_t *ctl)
{
	vtw_t	*vtw	= 0;
	int	stuck	= 0;
	int	avail	= ctl ? (ctl->nalloc + ctl->nfree) : 0;
	int	msl;

	KASSERT(mutex_owned(softnet_lock));

	/* If no resources, we will not get far.
	 */
	if (!ctl || !ctl->base.v4 || avail <= 0)
		return 0;

	/* Obtain a free one.
	 */
	while (!ctl->nfree) {
		vtw_age(ctl, 0);

		if (++stuck > avail) {
			/* When in transition between
			 * schemes (classless, classed) we
			 * can be stuck having to await the
			 * expiration of cross-allocated entries.
			 *
			 * Returning zero means we will fall back to the
			 * traditional TIME_WAIT handling, except in the
			 * case of a re-sched, in which case we cannot
			 * perform the re-sched, but will retain the
			 * extant entry.
			 */
			db_trace(KTR_VTW
				 , (ctl, "vtw:!none free in class %x %x/%x"
				    , ctl->clidx
				    , ctl->nalloc, ctl->nfree));

			return 0;
		}
	}

	vtw = ctl->alloc.v;

	if (vtw->msl_class != ctl->clidx) {
		/* Usurping rules:
		 *	0 -> {1,2,3} or {1,2,3} -> 0
		 */
		KASSERT(!vtw->msl_class || !ctl->clidx);

		if (vtw->hashed || vtw->expire.tv_sec) {
			/* As this is owned by some other class,
			 * we must wait for it to expire.
			 * This will only happen on class/classless
			 * transitions, which are guaranteed to progress
			 * to completion in small finite time, barring bugs.
			 */
			db_trace(KTR_VTW
				 , (ctl, "vtw:!%p class %x!=%x %x:%x%s"
				    , vtw, vtw->msl_class, ctl->clidx
				    , vtw->expire.tv_sec
				    , vtw->expire.tv_usec
				    , vtw->hashed ? " hashed" : ""));

			return 0;
		}

		db_trace(KTR_VTW
			 , (ctl, "vtw:!%p usurped from %x to %x"
			    , vtw, vtw->msl_class, ctl->clidx));

		vtw->msl_class = ctl->clidx;
	}

	if (vtw_alive(vtw)) {
		KASSERT(0 && "next free not free");
		return 0;
	}

	/* Advance allocation pointer.
	 */
	ctl->alloc.v = vtw_next(ctl, vtw);

	--ctl->nfree;
	++ctl->nalloc;

	msl = (2 * class_to_msl(ctl->clidx) * 1000) / PR_SLOWHZ;	// msec

	/* mark expiration
	 */
	getmicrouptime(&vtw->expire);

	/* Move expiration into the future.
	 */
	vtw->expire.tv_sec  += msl / 1000;
	vtw->expire.tv_usec += 1000 * (msl % 1000);

	while (vtw->expire.tv_usec >= 1000*1000) {
		vtw->expire.tv_usec -= 1000*1000;
		vtw->expire.tv_sec  += 1;
	}

	if (!ctl->oldest.v)
		ctl->oldest.v = vtw;

	return vtw;
}

/*!\brief expiration
 */
static int
vtw_age(vtw_ctl_t *ctl, struct timeval *_when)
{
	vtw_t	*vtw;
	struct timeval then, *when = _when;
	int	maxtries = 0;

	if (!ctl->oldest.v) {
		KASSERT(!ctl->nalloc);
		return 0;
	}

	for (vtw = ctl->oldest.v; vtw && ctl->nalloc; ) {
		if (++maxtries > ctl->nalloc)
			break;

		if (vtw->msl_class != ctl->clidx) {
			db_trace(KTR_VTW
				 , (vtw, "vtw:!age class mismatch %x != %x"
				    , vtw->msl_class, ctl->clidx));
			/* XXXX
			 * See if the appropriate action is to skip to the next.
			 * XXXX
			 */
			ctl->oldest.v = vtw = vtw_next(ctl, vtw);
			continue;
		}
		if (!when) {
			/* Latch oldest timeval if none specified.
			 */
			then = vtw->expire;
			when = &then;
		}

		if (!timercmp(&vtw->expire, when, <=))
			break;

		db_trace(KTR_VTW
			 , (vtw, "vtw: expire %x %8.8x:%8.8x %x/%x"
			    , ctl->clidx
			    , vtw->expire.tv_sec
			    , vtw->expire.tv_usec
			    , ctl->nalloc
			    , ctl->nfree));

		if (!_when)
			++vtw_stats.kill;

		vtw_del(ctl, vtw);
		vtw = ctl->oldest.v;
	}

	return ctl->nalloc;	// # remaining allocated
}
static callout_t vtw_cs;

/*!\brief notice the passage of time.
 * It seems to be getting faster.  What happened to the year?
 */
static void
vtw_tick(void *arg)
{
	struct timeval now;
	int i, cnt = 0;

	getmicrouptime(&now);

	db_trace(KTR_VTW, (arg, "vtk: tick - now %8.8x:%8.8x"
			   , now.tv_sec, now.tv_usec));

	mutex_enter(softnet_lock);

	for (i = 0; i < VTW_NCLASS; ++i) {
		cnt += vtw_age(&vtw_tcpv4[i], &now);
		cnt += vtw_age(&vtw_tcpv6[i], &now);
	}

	/* Keep ticks coming while we need them.
	 */
	if (cnt)
		callout_schedule(&vtw_cs, hz / 5);
	else {
		tcp_vtw_was_enabled = 0;
		tcbtable.vestige    = 0;
	}
	mutex_exit(softnet_lock);
}

/* inpcb_lookup_locals assist for handling vestigial entries.
 */
static void *
tcp_init_ports_v4(struct in_addr addr, u_int port, int wild)
{
	struct tcp_ports_iterator *it = &tcp_ports_iterator_v4;

	bzero(it, sizeof (*it));

	/* Note: the reference to vtw_tcpv4[0] is fine.
	 * We do not need per-class iteration.  We just
	 * need to get to the fat, and there is one
	 * shared fat.
	 */
	if (vtw_tcpv4[0].fat) {
		it->addr.v4 = addr;
		it->port = port;
		it->wild = !!wild;
		it->ctl  = &vtw_tcpv4[0];

		++vtw_stats.look[1];
	}

	return it;
}
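/*
 * Sketch of the intended calling sequence for the iterator pair
 * (tcp_init_ports_v4() above and tcp_next_port_v4() below); the
 * VTW_DEBUG checks in vtw_add() exercise exactly this pattern:
 *
 *	struct vestigial_inpcb res;
 *	void *it = tcp_init_ports_v4(laddr, lport, 0);
 *
 *	while (tcp_next_port_v4(it, &res)) {
 *		// res describes one vestigial PCB bound to lport
 *	}
 *
 * The iterator state is the static tcp_ports_iterator_v4, so callers
 * must hold softnet_lock across the whole walk.
 */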
/*!\brief export an IPv4 vtw.
 */
static int
vtw_export_v4(vtw_ctl_t *ctl, vtw_t *vtw, vestigial_inpcb_t *res)
{
	vtw_v4_t	*v4 = (void*)vtw;

	bzero(res, sizeof (*res));

	if (ctl && vtw) {
		if (!ctl->clidx && vtw->msl_class)
			ctl += vtw->msl_class;
		else
			KASSERT(ctl->clidx == vtw->msl_class);

		res->valid = 1;
		res->v4    = 1;

		res->faddr.v4.s_addr = v4->faddr;
		res->laddr.v4.s_addr = v4->laddr;
		res->fport	= v4->fport;
		res->lport	= v4->lport;
		res->vtw	= vtw;		// netlock held over call(s)
		res->ctl	= ctl;
		res->reuse_addr = vtw->reuse_addr;
		res->reuse_port = vtw->reuse_port;
		res->snd_nxt	= vtw->snd_nxt;
		res->rcv_nxt	= vtw->rcv_nxt;
		res->rcv_wnd	= vtw->rcv_wnd;
		res->uid	= vtw->uid;
	}

	return res->valid;
}

/*!\brief return next port in the port iterator.  yowza.
 */
static int
tcp_next_port_v4(void *arg, struct vestigial_inpcb *res)
{
	struct tcp_ports_iterator *it = arg;
	vtw_t		*vtw = 0;

	if (it->ctl)
		vtw = vtw_next_port_v4(it);

	if (!vtw)
		it->ctl = 0;

	return vtw_export_v4(it->ctl, vtw, res);
}

static int
tcp_lookup_v4(struct in_addr faddr, uint16_t fport,
	      struct in_addr laddr, uint16_t lport,
	      struct vestigial_inpcb *res)
{
	vtw_t		*vtw;
	vtw_ctl_t	*ctl;

	db_trace(KTR_VTW
		 , (res, "vtw: lookup %A:%P %A:%P"
		    , faddr, fport
		    , laddr, lport));

	vtw = vtw_lookup_hash_v4((ctl = &vtw_tcpv4[0])
				 , faddr.s_addr, fport
				 , laddr.s_addr, lport, 0);

	return vtw_export_v4(ctl, vtw, res);
}

/* inpcb_lookup_locals assist for handling vestigial entries.
 */
static void *
tcp_init_ports_v6(const struct in6_addr *addr, u_int port, int wild)
{
	struct tcp_ports_iterator *it = &tcp_ports_iterator_v6;

	bzero(it, sizeof (*it));

	/* Note: the reference to vtw_tcpv6[0] is fine.
	 * We do not need per-class iteration.  We just
	 * need to get to the fat, and there is one
	 * shared fat.
	 */
	if (vtw_tcpv6[0].fat) {
		it->addr.v6 = *addr;
		it->port = port;
		it->wild = !!wild;
		it->ctl  = &vtw_tcpv6[0];

		++vtw_stats.look[1];
	}

	return it;
}

/*!\brief export an IPv6 vtw.
 */
static int
vtw_export_v6(vtw_ctl_t *ctl, vtw_t *vtw, vestigial_inpcb_t *res)
{
	vtw_v6_t	*v6 = (void*)vtw;

	bzero(res, sizeof (*res));

	if (ctl && vtw) {
		if (!ctl->clidx && vtw->msl_class)
			ctl += vtw->msl_class;
		else
			KASSERT(ctl->clidx == vtw->msl_class);

		res->valid = 1;
		res->v4    = 0;

		res->faddr.v6	= v6->faddr;
		res->laddr.v6	= v6->laddr;
		res->fport	= v6->fport;
		res->lport	= v6->lport;
		res->vtw	= vtw;		// netlock held over call(s)
		res->ctl	= ctl;

		res->v6only	= vtw->v6only;
		res->reuse_addr	= vtw->reuse_addr;
		res->reuse_port	= vtw->reuse_port;

		res->snd_nxt	= vtw->snd_nxt;
		res->rcv_nxt	= vtw->rcv_nxt;
		res->rcv_wnd	= vtw->rcv_wnd;
		res->uid	= vtw->uid;
	}

	return res->valid;
}

static int
tcp_next_port_v6(void *arg, struct vestigial_inpcb *res)
{
	struct tcp_ports_iterator *it = arg;
	vtw_t		*vtw = 0;

	if (it->ctl)
		vtw = vtw_next_port_v6(it);

	if (!vtw)
		it->ctl = 0;

	return vtw_export_v6(it->ctl, vtw, res);
}

static int
tcp_lookup_v6(const struct in6_addr *faddr, uint16_t fport,
	      const struct in6_addr *laddr, uint16_t lport,
	      struct vestigial_inpcb *res)
{
	vtw_ctl_t	*ctl;
	vtw_t		*vtw;

	db_trace(KTR_VTW
		 , (res, "vtw: lookup %6A:%P %6A:%P"
		    , db_store(faddr, sizeof (*faddr)), fport
		    , db_store(laddr, sizeof (*laddr)), lport));

	vtw = vtw_lookup_hash_v6((ctl = &vtw_tcpv6[0])
				 , faddr, fport
				 , laddr, lport, 0);

	return vtw_export_v6(ctl, vtw, res);
}

static vestigial_hooks_t tcp_hooks = {
	.init_ports4	= tcp_init_ports_v4,
	.next_port4	= tcp_next_port_v4,
	.lookup4	= tcp_lookup_v4,
	.init_ports6	= tcp_init_ports_v6,
	.next_port6	= tcp_next_port_v6,
	.lookup6	= tcp_lookup_v6,
};
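/*
 * How the hook table is consumed, as a sketch (the exact call sites
 * live in the inpcb lookup paths, not in this file): once
 * vtw_control() below sets tcbtable.vestige = &tcp_hooks, a
 * connection lookup that misses the real PCB tables can fall back on
 * the vestigial table, roughly:
 *
 *	struct vestigial_inpcb vp;
 *
 *	if (table->vestige
 *	    && (*table->vestige->lookup4)(faddr, fport,
 *					  laddr, lport, &vp)
 *	    && vp.valid) {
 *		// treat vp as a minimal stand-in for a TIME_WAIT pcb
 *	}
 */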
static bool
vtw_select(int af, fatp_ctl_t **fatp, vtw_ctl_t **ctlp)
{
	fatp_ctl_t	*fat;
	vtw_ctl_t	*ctl;

	switch (af) {
	case AF_INET:
		fat = &fat_tcpv4;
		ctl = &vtw_tcpv4[0];
		break;
	case AF_INET6:
		fat = &fat_tcpv6;
		ctl = &vtw_tcpv6[0];
		break;
	default:
		return false;
	}
	if (fatp != NULL)
		*fatp = fat;
	if (ctlp != NULL)
		*ctlp = ctl;
	return true;
}

/*!\brief initialize controlling instance
 */
static int
vtw_control_init(int af)
{
	fatp_ctl_t	*fat;
	vtw_ctl_t	*ctl;
	fatp_t		*fat_base;
	fatp_t		**fat_hash;
	vtw_t		*ctl_base_v;
	uint32_t	n, m;
	size_t		sz;

	KASSERT(powerof2(tcp_vtw_entries));

	if (!vtw_select(af, &fat, &ctl))
		return EAFNOSUPPORT;

	if (fat->hash != NULL) {
		KASSERT(fat->base != NULL && ctl->base.v != NULL);
		return 0;
	}

	/* Allocate 10% more capacity in the fat pointers.
	 * We should only need ~#hash additional based on
	 * how they age, but TIME_WAIT assassination could cause
	 * sparse fat pointer utilisation.
	 */
	m = 512;
	n = 2*m + (11 * (tcp_vtw_entries / fatp_ntags())) / 10;
	sz = (ctl->is_v4 ? sizeof(vtw_v4_t) : sizeof(vtw_v6_t));

	fat_hash = kmem_zalloc(2*m * sizeof(fatp_t *), KM_SLEEP);
	fat_base = kmem_zalloc(2*n * sizeof(fatp_t), KM_SLEEP);
	ctl_base_v = kmem_zalloc(tcp_vtw_entries * sz, KM_SLEEP);

	fatp_init(fat, n, m, fat_base, fat_hash);

	vtw_init(fat, ctl, tcp_vtw_entries, ctl_base_v);

	return 0;
}
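/*
 * The sizing arithmetic above, worked through for an illustrative
 * (not necessarily default) configuration of tcp_vtw_entries = 32768
 * with fatp_ntags() == 15, i.e. 64-byte cache lines:
 *
 *	m = 512				// buckets per hash
 *	n = 2*512 + (11 * (32768 / 15)) / 10
 *	  = 1024 + 2402 = 3426		// fat pointers per hash
 *
 * giving 2*m == 1024 bucket heads and 2*n == 6852 fatp_t cache lines,
 * shared between the full hash and the lport-only hash.
 */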
/*!\brief select controlling instance
 */
static vtw_ctl_t *
vtw_control(int af, uint32_t msl)
{
	fatp_ctl_t	*fat;
	vtw_ctl_t	*ctl;
	int		msl_class = msl_to_class(msl);

	if (!vtw_select(af, &fat, &ctl))
		return NULL;

	if (!fat->base || !ctl->base.v)
		return NULL;

	if (!tcp_vtw_was_enabled) {
		/* This guarantees timer ticks until we no longer need them.
		 */
		tcp_vtw_was_enabled = 1;

		callout_schedule(&vtw_cs, hz / 5);

		tcbtable.vestige = &tcp_hooks;
	}

	return ctl + msl_class;
}

/*!\brief add TCP pcb to vestigial timewait
 */
int
vtw_add(int af, struct tcpcb *tp)
{
#ifdef VTW_DEBUG
	int		enable;
#endif
	vtw_ctl_t	*ctl;
	vtw_t		*vtw;

	KASSERT(mutex_owned(softnet_lock));

	ctl = vtw_control(af, tp->t_msl);
	if (!ctl)
		return 0;

#ifdef VTW_DEBUG
	enable = (af == AF_INET) ? tcp4_vtw_enable : tcp6_vtw_enable;
#endif

	vtw = vtw_alloc(ctl);

	if (vtw) {
		vtw->snd_nxt = tp->snd_nxt;
		vtw->rcv_nxt = tp->rcv_nxt;

		switch (af) {
		case AF_INET: {
			struct inpcb	*inp = tp->t_inpcb;
			vtw_v4_t	*v4  = (void*)vtw;

			v4->faddr = in4p_faddr(inp).s_addr;
			v4->laddr = in4p_laddr(inp).s_addr;
			v4->fport = inp->inp_fport;
			v4->lport = inp->inp_lport;

			vtw->reuse_port = !!(inp->inp_socket->so_options
					     & SO_REUSEPORT);
			vtw->reuse_addr = !!(inp->inp_socket->so_options
					     & SO_REUSEADDR);
			vtw->v6only	= 0;
			vtw->uid	= inp->inp_socket->so_uidinfo->ui_uid;

			vtw_inshash_v4(ctl, vtw);

#ifdef VTW_DEBUG
			/* Immediate lookup (connected and port) to
			 * ensure at least that works!
			 */
			if (enable & 4) {
				KASSERT(vtw_lookup_hash_v4
					(ctl
					 , in4p_faddr(inp).s_addr, inp->inp_fport
					 , in4p_laddr(inp).s_addr, inp->inp_lport
					 , 0)
					== vtw);
				KASSERT(vtw_lookup_hash_v4
					(ctl
					 , in4p_faddr(inp).s_addr, inp->inp_fport
					 , in4p_laddr(inp).s_addr, inp->inp_lport
					 , 1));
			}
			/* Immediate port iterator functionality check: not wild
			 */
			if (enable & 8) {
				struct tcp_ports_iterator *it;
				struct vestigial_inpcb res;
				int cnt = 0;

				it = tcp_init_ports_v4(in4p_laddr(inp)
						       , inp->inp_lport, 0);

				while (tcp_next_port_v4(it, &res)) {
					++cnt;
				}
				KASSERT(cnt);
			}
			/* Immediate port iterator functionality check: wild
			 */
			if (enable & 16) {
				struct tcp_ports_iterator *it;
				struct vestigial_inpcb res;
				struct in_addr any;
				int cnt = 0;

				any.s_addr = htonl(INADDR_ANY);

				it = tcp_init_ports_v4(any, inp->inp_lport, 1);

				while (tcp_next_port_v4(it, &res)) {
					++cnt;
				}
				KASSERT(cnt);
			}
#endif /* VTW_DEBUG */
			break;
		}

		case AF_INET6: {
			struct inpcb	*inp = tp->t_inpcb;
			vtw_v6_t	*v6  = (void*)vtw;

			v6->faddr = in6p_faddr(inp);
			v6->laddr = in6p_laddr(inp);
			v6->fport = inp->inp_fport;
			v6->lport = inp->inp_lport;

			vtw->reuse_port = !!(inp->inp_socket->so_options
					     & SO_REUSEPORT);
			vtw->reuse_addr = !!(inp->inp_socket->so_options
					     & SO_REUSEADDR);
			vtw->v6only	= !!(inp->inp_flags
					     & IN6P_IPV6_V6ONLY);
			vtw->uid	= inp->inp_socket->so_uidinfo->ui_uid;

			vtw_inshash_v6(ctl, vtw);
#ifdef VTW_DEBUG
			/* Immediate lookup (connected and port) to
			 * ensure at least that works!
			 */
			if (enable & 4) {
				KASSERT(vtw_lookup_hash_v6(ctl
					, &in6p_faddr(inp), inp->inp_fport
					, &in6p_laddr(inp), inp->inp_lport
					, 0)
					== vtw);
				KASSERT(vtw_lookup_hash_v6
					(ctl
					 , &in6p_faddr(inp), inp->inp_fport
					 , &in6p_laddr(inp), inp->inp_lport
					 , 1));
			}
			/* Immediate port iterator functionality check: not wild
			 */
			if (enable & 8) {
				struct tcp_ports_iterator *it;
				struct vestigial_inpcb res;
				int cnt = 0;

				it = tcp_init_ports_v6(&in6p_laddr(inp)
						       , inp->inp_lport, 0);

				while (tcp_next_port_v6(it, &res)) {
					++cnt;
				}
				KASSERT(cnt);
			}
			/* Immediate port iterator functionality check: wild
			 */
			if (enable & 16) {
				struct tcp_ports_iterator *it;
				struct vestigial_inpcb res;
				static struct in6_addr any = IN6ADDR_ANY_INIT;
				int cnt = 0;

				it = tcp_init_ports_v6(&any
						       , inp->inp_lport, 1);

				while (tcp_next_port_v6(it, &res)) {
					++cnt;
				}
				KASSERT(cnt);
			}
#endif /* VTW_DEBUG */
			break;
		}
		}

		tcp_canceltimers(tp);
		tp = tcp_close(tp);
		KASSERT(!tp);

		return 1;
	}

	return 0;
}

/*!\brief restart timer for vestigial time-wait entry
 */
static void
vtw_restart_v4(vestigial_inpcb_t *vp)
{
	vtw_v4_t	copy = *(vtw_v4_t*)vp->vtw;
	vtw_t		*vtw;
	vtw_t		*cp  = &copy.common;
	vtw_ctl_t	*ctl;

	KASSERT(mutex_owned(softnet_lock));

	db_trace(KTR_VTW
		 , (vp->vtw, "vtw: restart %A:%P %A:%P"
		    , vp->faddr.v4.s_addr, vp->fport
		    , vp->laddr.v4.s_addr, vp->lport));

	/* Class might have changed, so have a squiz.
	 */
	ctl = vtw_control(AF_INET, class_to_msl(cp->msl_class));
	vtw = vtw_alloc(ctl);

	if (vtw) {
		vtw_v4_t	*v4 = (void*)vtw;

		/* Safe now to unhash the old entry
		 */
		vtw_del(vp->ctl, vp->vtw);

		vtw->snd_nxt = cp->snd_nxt;
		vtw->rcv_nxt = cp->rcv_nxt;

		v4->faddr = copy.faddr;
		v4->laddr = copy.laddr;
		v4->fport = copy.fport;
		v4->lport = copy.lport;

		vtw->reuse_port = cp->reuse_port;
		vtw->reuse_addr = cp->reuse_addr;
		vtw->v6only	= 0;
		vtw->uid	= cp->uid;

		vtw_inshash_v4(ctl, vtw);
	}

	vp->valid = 0;
}
/*!\brief restart timer for vestigial time-wait entry
 */
static void
vtw_restart_v6(vestigial_inpcb_t *vp)
{
	vtw_v6_t	copy = *(vtw_v6_t*)vp->vtw;
	vtw_t		*vtw;
	vtw_t		*cp  = &copy.common;
	vtw_ctl_t	*ctl;

	KASSERT(mutex_owned(softnet_lock));

	db_trace(KTR_VTW
		 , (vp->vtw, "vtw: restart %6A:%P %6A:%P"
		    , db_store(&vp->faddr.v6, sizeof (vp->faddr.v6))
		    , vp->fport
		    , db_store(&vp->laddr.v6, sizeof (vp->laddr.v6))
		    , vp->lport));

	/* Class might have changed, so have a squiz.
	 */
	ctl = vtw_control(AF_INET6, class_to_msl(cp->msl_class));
	vtw = vtw_alloc(ctl);

	if (vtw) {
		vtw_v6_t	*v6 = (void*)vtw;

		/* Safe now to unhash the old entry
		 */
		vtw_del(vp->ctl, vp->vtw);

		vtw->snd_nxt = cp->snd_nxt;
		vtw->rcv_nxt = cp->rcv_nxt;

		v6->faddr = copy.faddr;
		v6->laddr = copy.laddr;
		v6->fport = copy.fport;
		v6->lport = copy.lport;

		vtw->reuse_port = cp->reuse_port;
		vtw->reuse_addr = cp->reuse_addr;
		vtw->v6only	= cp->v6only;
		vtw->uid	= cp->uid;

		vtw_inshash_v6(ctl, vtw);
	}

	vp->valid = 0;
}

/*!\brief restart timer for vestigial time-wait entry
 */
void
vtw_restart(vestigial_inpcb_t *vp)
{
	if (!vp || !vp->valid)
		return;

	if (vp->v4)
		vtw_restart_v4(vp);
	else
		vtw_restart_v6(vp);
}

int
sysctl_tcp_vtw_enable(SYSCTLFN_ARGS)
{
	int en, rc;
	struct sysctlnode node;

	node = *rnode;
	en = *(int *)rnode->sysctl_data;
	node.sysctl_data = &en;

	rc = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (rc != 0 || newp == NULL)
		return rc;

	if (rnode->sysctl_data != &tcp4_vtw_enable &&
	    rnode->sysctl_data != &tcp6_vtw_enable)
		rc = ENOENT;
	else if ((en & 1) == 0)
		rc = 0;
	else if (rnode->sysctl_data == &tcp4_vtw_enable)
		rc = vtw_control_init(AF_INET);
	else /* rnode->sysctl_data == &tcp6_vtw_enable */
		rc = vtw_control_init(AF_INET6);

	if (rc == 0)
		*(int *)rnode->sysctl_data = en;

	return rc;
}

int
vtw_earlyinit(void)
{
	int i, rc;

	callout_init(&vtw_cs, 0);
	callout_setfunc(&vtw_cs, vtw_tick, 0);

	for (i = 0; i < VTW_NCLASS; ++i) {
		vtw_tcpv4[i].is_v4 = 1;
		vtw_tcpv6[i].is_v6 = 1;
	}

	if ((tcp4_vtw_enable & 1) != 0 &&
	    (rc = vtw_control_init(AF_INET)) != 0)
		return rc;

	if ((tcp6_vtw_enable & 1) != 0 &&
	    (rc = vtw_control_init(AF_INET6)) != 0)
		return rc;

	return 0;
}
#ifdef VTW_DEBUG
#include <sys/syscallargs.h>
#include <sys/sysctl.h>

/*!\brief add lalp, fafp entries for debug
 */
int
vtw_debug_add(int af, sin_either_t *la, sin_either_t *fa, int msl,
    int msl_class)
{
	vtw_ctl_t	*ctl;
	vtw_t		*vtw;

	ctl = vtw_control(af, msl ? msl : class_to_msl(msl_class));
	if (!ctl)
		return 0;

	vtw = vtw_alloc(ctl);

	if (vtw) {
		vtw->snd_nxt = 0;
		vtw->rcv_nxt = 0;

		switch (af) {
		case AF_INET: {
			vtw_v4_t	*v4 = (void*)vtw;

			v4->faddr = fa->sin_addr.v4.s_addr;
			v4->laddr = la->sin_addr.v4.s_addr;
			v4->fport = fa->sin_port;
			v4->lport = la->sin_port;

			vtw->reuse_port = 1;
			vtw->reuse_addr = 1;
			vtw->v6only	= 0;
			vtw->uid	= 0;

			vtw_inshash_v4(ctl, vtw);
			break;
		}

		case AF_INET6: {
			vtw_v6_t	*v6 = (void*)vtw;

			v6->faddr = fa->sin_addr.v6;
			v6->laddr = la->sin_addr.v6;

			v6->fport = fa->sin_port;
			v6->lport = la->sin_port;

			vtw->reuse_port = 1;
			vtw->reuse_addr = 1;
			vtw->v6only	= 0;
			vtw->uid	= 0;

			vtw_inshash_v6(ctl, vtw);
			break;
		}

		default:
			break;
		}

		return 1;
	}

	return 0;
}

static int vtw_syscall = 0;

static int
vtw_debug_process(vtw_sysargs_t *ap)
{
	struct vestigial_inpcb vestige;
	int	rc = 0;

	mutex_enter(softnet_lock);

	switch (ap->op) {
	case 0:		// insert
		vtw_debug_add(ap->la.sin_family
			      , &ap->la
			      , &ap->fa
			      , TCPTV_MSL
			      , 0);
		break;

	case 1:		// lookup
	case 2:		// restart
		switch (ap->la.sin_family) {
		case AF_INET:
			if (tcp_lookup_v4(ap->fa.sin_addr.v4, ap->fa.sin_port,
					  ap->la.sin_addr.v4, ap->la.sin_port,
					  &vestige)) {
				if (ap->op == 2) {
					vtw_restart(&vestige);
				}
				rc = 0;
			} else
				rc = ESRCH;
			break;

		case AF_INET6:
			if (tcp_lookup_v6(&ap->fa.sin_addr.v6, ap->fa.sin_port,
					  &ap->la.sin_addr.v6, ap->la.sin_port,
					  &vestige)) {
				if (ap->op == 2) {
					vtw_restart(&vestige);
				}
				rc = 0;
			} else
				rc = ESRCH;
			break;
		default:
			rc = EINVAL;
		}
		break;

	default:
		rc = EINVAL;
	}

	mutex_exit(softnet_lock);
	return rc;
}

struct sys_vtw_args {
	syscallarg(const vtw_sysargs_t *) req;
	syscallarg(size_t) len;
};

static int
vtw_sys(struct lwp *l, const void *_, register_t *retval)
{
	const struct sys_vtw_args *uap = _;
	void	*buf;
	int	rc;
	size_t	len = SCARG(uap, len);

	if (len != sizeof (vtw_sysargs_t))
		return EINVAL;

	buf = kmem_alloc(len, KM_SLEEP);
	rc = copyin(SCARG(uap, req), buf, len);
	if (!rc) {
		rc = vtw_debug_process(buf);
	}
	kmem_free(buf, len);

	return rc;
}

static void
vtw_sanity_check(void)
{
	vtw_ctl_t	*ctl;
	vtw_t		*vtw;
	int		i;
	int		n;

	for (i = 0; i < VTW_NCLASS; ++i) {
		ctl = &vtw_tcpv4[i];

		if (!ctl->base.v || ctl->nalloc)
			continue;

		for (n = 0, vtw = ctl->base.v; ; ) {
			++n;
			vtw = vtw_next(ctl, vtw);
			if (vtw == ctl->base.v)
				break;
		}
		db_trace(KTR_VTW
			 , (ctl, "sanity: class %x n %x nfree %x"
			    , i, n, ctl->nfree));

		KASSERT(n == ctl->nfree);
	}

	for (i = 0; i < VTW_NCLASS; ++i) {
		ctl = &vtw_tcpv6[i];

		if (!ctl->base.v || ctl->nalloc)
			continue;

		for (n = 0, vtw = ctl->base.v; ; ) {
			++n;
			vtw = vtw_next(ctl, vtw);
			if (vtw == ctl->base.v)
				break;
		}
		db_trace(KTR_VTW
			 , (ctl, "sanity: class %x n %x nfree %x"
			    , i, n, ctl->nfree));
		KASSERT(n == ctl->nfree);
	}
}
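/*
 * Sketch of how the debug syscall installed by vtw_debug_init() below
 * might be driven from userland (illustrative only; the syscall
 * number is discovered at run time through the koff.vtw_debug_syscall
 * sysctl node created below):
 *
 *	vtw_sysargs_t args = { .op = 1, .la = ..., .fa = ... }; // lookup
 *
 *	syscall(vtw_debug_syscall_number, &args, sizeof (args));
 */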
/*!\brief Initialise debug support.
 */
static void
vtw_debug_init(void)
{
	int	i;

	vtw_sanity_check();

	if (vtw_syscall)
		return;

	for (i = 511; i; --i) {
		if (sysent[i].sy_call == sys_nosys) {
			sysent[i].sy_call    = vtw_sys;
			sysent[i].sy_narg    = 2;
			sysent[i].sy_argsize = sizeof (struct sys_vtw_args);
			sysent[i].sy_flags   = 0;

			vtw_syscall = i;
			break;
		}
	}
	if (i) {
		const struct sysctlnode *node;
		uint32_t	flags;

		flags = sysctl_root.sysctl_flags;

		sysctl_root.sysctl_flags |= CTLFLAG_READWRITE;
		sysctl_root.sysctl_flags &= ~CTLFLAG_PERMANENT;

		sysctl_createv(0, 0, 0, &node,
			       CTLFLAG_PERMANENT, CTLTYPE_NODE,
			       "koff",
			       SYSCTL_DESCR("Kernel Obscure Feature Finder"),
			       0, 0, 0, 0, CTL_CREATE, CTL_EOL);

		if (!node) {
			sysctl_createv(0, 0, 0, &node,
				       CTLFLAG_PERMANENT, CTLTYPE_NODE,
				       "koffka",
				       SYSCTL_DESCR("The Real(tm) Kernel"
						    " Obscure Feature Finder"),
				       0, 0, 0, 0, CTL_CREATE, CTL_EOL);
		}
		if (node) {
			sysctl_createv(0, 0, 0, 0,
				       CTLFLAG_PERMANENT|CTLFLAG_READONLY,
				       CTLTYPE_INT, "vtw_debug_syscall",
				       SYSCTL_DESCR("vtw debug"
						    " system call number"),
				       0, 0, &vtw_syscall, 0, node->sysctl_num,
				       CTL_CREATE, CTL_EOL);
		}
		sysctl_root.sysctl_flags = flags;
	}
}
#else /* !VTW_DEBUG */
static void
vtw_debug_init(void)
{
	return;
}
#endif /* !VTW_DEBUG */