1 /*
2  * Copyright (c) 2011 The NetBSD Foundation, Inc.
3  * All rights reserved.
4  *
5  * This code is derived from software contributed to The NetBSD Foundation
6  * by Coyote Point Systems, Inc.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
18  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
19  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
20  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
21  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27  * POSSIBILITY OF SUCH DAMAGE.
28  */
29
30 /*
31  * Reduces the resources demanded by TCP sessions in TIME_WAIT-state using
32  * methods called Vestigial Time-Wait (VTW) and Maximum Segment Lifetime
33  * Truncation (MSLT).
34  *
35  * MSLT and VTW were contributed by Coyote Point Systems, Inc.
36  *
37  * Even after a TCP session enters the TIME_WAIT state, its corresponding
38  * socket and protocol control blocks (PCBs) stick around until the TCP
39  * Maximum Segment Lifetime (MSL) expires.  On a host whose workload
40  * necessarily creates and closes down many TCP sockets, the sockets & PCBs
41  * for TCP sessions in TIME_WAIT state amount to many megabytes of dead
42  * weight in RAM.
43  *
44  * Maximum Segment Lifetime Truncation (MSLT) assigns each TCP session to
45  * a class based on the nearness of the peer.  Corresponding to each class
46  * is an MSL, and a session uses the MSL of its class.  The classes are
47  * loopback (local host equals remote host), local (local host and remote
48  * host are on the same link/subnet), and remote (local host and remote
49  * host communicate via one or more gateways).  Classes corresponding to
50  * nearer peers have lower MSLs by default: 2 seconds for loopback, 10
51  * seconds for local, 60 seconds for remote.  Loopback and local sessions
52  * expire more quickly when MSLT is used.
53  *
54  * Vestigial Time-Wait (VTW) replaces a TIME_WAIT session's PCB/socket
55  * dead weight with a compact representation of the session, called a
56  * "vestigial PCB".  VTW data structures are designed to be very fast and
57  * memory-efficient: for fast insertion and lookup of vestigial PCBs,
58  * the PCBs are stored in a hash table that is designed to minimize the
59  * number of cacheline visits per lookup/insertion.  The memory both
60  * for vestigial PCBs and for elements of the PCB hashtable comes from
61  * fixed-size pools, and linked data structures exploit this to conserve
62  * memory by representing references with a narrow index/offset from the
63  * start of a pool instead of a pointer.
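 *
 * As an illustrative sketch (not the exact encoding used below), such a
 * pool-relative reference can store a 1-based 32-bit index, with 0
 * reserved to mean "no entry", in place of a full pointer:
 *
 *	typedef struct node { uint32_t nxt; } node_t;
 *	node_t pool[POOL_SIZE];
 *
 *	static inline uint32_t ref(node_t *n)    { return n ? (uint32_t)(n - pool) + 1 : 0; }
 *	static inline node_t * deref(uint32_t r) { return r ? &pool[r - 1] : NULL; }
 *
 * fatp_index(), fatp_next() and fatp_from_key() below follow this
 * convention.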
 * When space for new vestigial PCBs
64  * runs out, VTW makes room by discarding old vestigial PCBs, oldest first.
65  * VTW cooperates with MSLT.
66  *
67  * It may help to think of VTW as a "FIN cache" by analogy to the SYN
68  * cache.
69  *
70  * A 2.8-GHz Pentium 4 running a test workload that creates TIME_WAIT
71  * sessions as fast as it can is approximately 17% idle when VTW is active
72  * versus 0% idle when VTW is inactive.  It has 103 megabytes more free RAM
73  * when VTW is active (approximately 64k vestigial PCBs are created) than
74  * when it is inactive.
75  */
76
77 #include <sys/cdefs.h>
78
79 #ifdef _KERNEL_OPT
80 #include "opt_ddb.h"
81 #include "opt_inet.h"
82 #include "opt_inet_csum.h"
83 #include "opt_tcp_debug.h"
84 #endif
85
86 #include <sys/param.h>
87 #include <sys/systm.h>
88 #include <sys/kmem.h>
89 #include <sys/mbuf.h>
90 #include <sys/protosw.h>
91 #include <sys/socket.h>
92 #include <sys/socketvar.h>
93 #include <sys/errno.h>
94 #include <sys/syslog.h>
95 #include <sys/pool.h>
96 #include <sys/domain.h>
97 #include <sys/kernel.h>
98 #include <net/if.h>
99 #include <net/if_types.h>
100
101 #include <netinet/in.h>
102 #include <netinet/in_systm.h>
103 #include <netinet/ip.h>
104 #include <netinet/in_pcb.h>
105 #include <netinet/in_var.h>
106 #include <netinet/ip_var.h>
107 #include <netinet/in_offload.h>
108 #include <netinet/ip6.h>
109 #include <netinet6/ip6_var.h>
110 #include <netinet6/in6_pcb.h>
112 #include <netinet6/in6_var.h>
113 #include <netinet/icmp6.h>
114 #include <netinet6/nd6.h>
115
116 #include <netinet/tcp.h>
117 #include <netinet/tcp_fsm.h>
118 #include <netinet/tcp_seq.h>
119 #include <netinet/tcp_timer.h>
120 #include <netinet/tcp_var.h>
121 #include <netinet/tcp_private.h>
122 #include <netinet/tcpip.h>
123
124 #include <netinet/tcp_vtw.h>
125
126 __KERNEL_RCSID(0, "$NetBSD: tcp_vtw.c,v 1.16 2016/07/28 07:54:31 martin Exp $");
127
128 #define db_trace(__a, __b) do { } while (/*CONSTCOND*/0)
129
130 static void vtw_debug_init(void);
131
132 fatp_ctl_t fat_tcpv4;
133 fatp_ctl_t fat_tcpv6;
134 vtw_ctl_t vtw_tcpv4[VTW_NCLASS];
135 vtw_ctl_t vtw_tcpv6[VTW_NCLASS];
136 vtw_stats_t vtw_stats;
137
138 /* We provide state for the lookup_ports iterator.
139  * As we are currently netlock-protected, one instance suffices.
140  * If we were finer-grained, we would have one per CPU.
141  * I do not want to be in the business of alloc/free.
142  * The best alternative would be to allocate on the caller's
143  * stack, but that would require them to know the struct,
144  * or at least the size.
145  * See how she goes.
146  */
147 struct tcp_ports_iterator {
148	union {
149		struct in_addr v4;
150		struct in6_addr v6;
151	} addr;
152	u_int port;
153
154	uint32_t wild : 1;
155
156	vtw_ctl_t *ctl;
157	fatp_t *fp;
158
159	uint16_t slot_idx;
160	uint16_t ctl_idx;
161 };
162
163 static struct tcp_ports_iterator tcp_ports_iterator_v4;
164 static struct tcp_ports_iterator tcp_ports_iterator_v6;
165
166 static int vtw_age(vtw_ctl_t *, struct timeval *);
167
168 /*!\brief allocate a fat pointer from a collection.
169  */
170 static fatp_t *
171 fatp_alloc(fatp_ctl_t *fat)
172 {
173	fatp_t *fp = 0;
174
175	if (fat->nfree) {
176		fp = fat->free;
177		if (fp) {
178			fat->free = fatp_next(fat, fp);
179			--fat->nfree;
180			++fat->nalloc;
181			fp->nxt = 0;
182
183			KASSERT(!fp->inuse);
184		}
185	}
186
187	return fp;
188 }
189
190 /*!\brief free a fat pointer.
191  */
192 static void
193 fatp_free(fatp_ctl_t *fat, fatp_t *fp)
194 {
195	if (fp) {
196		KASSERT(!fp->inuse);
197		KASSERT(!fp->nxt);
198
199		fp->nxt = fatp_index(fat, fat->free);
200		fat->free = fp;
201
202		++fat->nfree;
203		--fat->nalloc;
204	}
205 }
206
207 /*!\brief initialise a collection of fat pointers.
208  *
209  *\param n # fat pointers to allocate
210  *\param m # hash buckets in each of the two hashes
211  *
212  * We allocate 2x as much, as we have two hashes: full and lport only.
213  */
214 static void
215 fatp_init(fatp_ctl_t *fat, uint32_t n, uint32_t m,
216	fatp_t *fat_base, fatp_t **fat_hash)
217 {
218	fatp_t *fp;
219
220	KASSERT(n <= FATP_MAX / 2);
221
222	fat->hash = fat_hash;
223	fat->base = fat_base;
224
225	fat->port = &fat->hash[m];
226
227	fat->mask = m - 1;	// m must be a power of 2
228	fat->lim = fat->base + 2*n - 1;
229	fat->nfree = 0;
230	fat->nalloc = 2*n;
231
232	/* Initialise the free list.
233	 */
234	for (fp = fat->lim; fp >= fat->base; --fp) {
235		fatp_free(fat, fp);
236	}
237 }
238
239 /*
240  * The `xtra' is XORed into the tag stored.
241  */
242 static uint32_t fatp_xtra[] = {
243	0x11111111,0x22222222,0x33333333,0x44444444,
244	0x55555555,0x66666666,0x77777777,0x88888888,
245	0x12121212,0x21212121,0x34343434,0x43434343,
246	0x56565656,0x65656565,0x78787878,0x87878787,
247	0x11221122,0x22112211,0x33443344,0x44334433,
248	0x55665566,0x66556655,0x77887788,0x88778877,
249	0x11112222,0x22221111,0x33334444,0x44443333,
250	0x55556666,0x66665555,0x77778888,0x88887777,
251 };
252
253 /*!\brief turn a {fatp_t*,slot} into an integral key.
254  *
255  * The key can be used to obtain the fatp_t, and the slot,
256  * as it directly encodes them.
257  */
258 static inline uint32_t
259 fatp_key(fatp_ctl_t *fat, fatp_t *fp, uint32_t slot)
260 {
261	CTASSERT(CACHE_LINE_SIZE == 32 ||
262		 CACHE_LINE_SIZE == 64 ||
263		 CACHE_LINE_SIZE == 128);
264
265	switch (fatp_ntags()) {
266	case 7:
267		return (fatp_index(fat, fp) << 3) | slot;
268	case 15:
269		return (fatp_index(fat, fp) << 4) | slot;
270	case 31:
271		return (fatp_index(fat, fp) << 5) | slot;
272	default:
273		KASSERT(0 && "no support, for no good reason");
274		return ~0;
275	}
276 }
277
278 static inline uint32_t
279 fatp_slot_from_key(fatp_ctl_t *fat, uint32_t key)
280 {
281	CTASSERT(CACHE_LINE_SIZE == 32 ||
282		 CACHE_LINE_SIZE == 64 ||
283		 CACHE_LINE_SIZE == 128);
284
285	switch (fatp_ntags()) {
286	case 7:
287		return key & 7;
288	case 15:
289		return key & 15;
290	case 31:
291		return key & 31;
292	default:
293		KASSERT(0 && "no support, for no good reason");
294		return ~0;
295	}
296 }
297
298 static inline fatp_t *
299 fatp_from_key(fatp_ctl_t *fat, uint32_t key)
300 {
301	CTASSERT(CACHE_LINE_SIZE == 32 ||
302		 CACHE_LINE_SIZE == 64 ||
303		 CACHE_LINE_SIZE == 128);
304
305	switch (fatp_ntags()) {
306	case 7:
307		key >>= 3;
308		break;
309	case 15:
310		key >>= 4;
311		break;
312	case 31:
313		key >>= 5;
314		break;
315	default:
316		KASSERT(0 && "no support, for no good reason");
317		return 0;
318	}
319
320	return key ?
fat->base + key - 1 : 0; 321 } 322 323 static inline uint32_t 324 idx_encode(vtw_ctl_t *ctl, uint32_t idx) 325 { 326 return (idx << ctl->idx_bits) | idx; 327 } 328 329 static inline uint32_t 330 idx_decode(vtw_ctl_t *ctl, uint32_t bits) 331 { 332 uint32_t idx = bits & ctl->idx_mask; 333 334 if (idx_encode(ctl, idx) == bits) 335 return idx; 336 else 337 return ~0; 338 } 339 340 /*!\brief insert index into fatp hash 341 * 342 *\param idx - index of element being placed in hash chain 343 *\param tag - 32-bit tag identifier 344 * 345 *\returns 346 * value which can be used to locate entry. 347 * 348 *\note 349 * we rely on the fact that there are unused high bits in the index 350 * for verification purposes on lookup. 351 */ 352 353 static inline uint32_t 354 fatp_vtw_inshash(fatp_ctl_t *fat, uint32_t idx, uint32_t tag, int which, 355 void *dbg) 356 { 357 fatp_t *fp; 358 fatp_t **hash = (which ? fat->port : fat->hash); 359 int i; 360 361 fp = hash[tag & fat->mask]; 362 363 while (!fp || fatp_full(fp)) { 364 fatp_t *fq; 365 366 /* All entries are inuse at the top level. 367 * We allocate a spare, and push the top level 368 * down one. All entries in the fp we push down 369 * (think of a tape worm here) will be expelled sooner than 370 * any entries added subsequently to this hash bucket. 371 * This is a property of the time waits we are exploiting. 372 */ 373 374 fq = fatp_alloc(fat); 375 if (!fq) { 376 vtw_age(fat->vtw, 0); 377 fp = hash[tag & fat->mask]; 378 continue; 379 } 380 381 fq->inuse = 0; 382 fq->nxt = fatp_index(fat, fp); 383 384 hash[tag & fat->mask] = fq; 385 386 fp = fq; 387 } 388 389 KASSERT(!fatp_full(fp)); 390 391 /* Fill highest index first. Lookup is lowest first. 392 */ 393 for (i = fatp_ntags(); --i >= 0; ) { 394 if (!((1 << i) & fp->inuse)) { 395 break; 396 } 397 } 398 399 fp->inuse |= 1 << i; 400 fp->tag[i] = tag ^ idx_encode(fat->vtw, idx) ^ fatp_xtra[i]; 401 402 db_trace(KTR_VTW 403 , (fp, "fat: inuse %5.5x tag[%x] %8.8x" 404 , fp->inuse 405 , i, fp->tag[i])); 406 407 return fatp_key(fat, fp, i); 408 } 409 410 static inline int 411 vtw_alive(const vtw_t *vtw) 412 { 413 return vtw->hashed && vtw->expire.tv_sec; 414 } 415 416 static inline uint32_t 417 vtw_index_v4(vtw_ctl_t *ctl, vtw_v4_t *v4) 418 { 419 if (ctl->base.v4 <= v4 && v4 <= ctl->lim.v4) 420 return v4 - ctl->base.v4; 421 422 KASSERT(0 && "vtw out of bounds"); 423 424 return ~0; 425 } 426 427 static inline uint32_t 428 vtw_index_v6(vtw_ctl_t *ctl, vtw_v6_t *v6) 429 { 430 if (ctl->base.v6 <= v6 && v6 <= ctl->lim.v6) 431 return v6 - ctl->base.v6; 432 433 KASSERT(0 && "vtw out of bounds"); 434 435 return ~0; 436 } 437 438 static inline uint32_t 439 vtw_index(vtw_ctl_t *ctl, vtw_t *vtw) 440 { 441 if (ctl->clidx) 442 ctl = ctl->ctl; 443 444 if (ctl->is_v4) 445 return vtw_index_v4(ctl, (vtw_v4_t *)vtw); 446 447 if (ctl->is_v6) 448 return vtw_index_v6(ctl, (vtw_v6_t *)vtw); 449 450 KASSERT(0 && "neither 4 nor 6. most curious."); 451 452 return ~0; 453 } 454 455 static inline vtw_t * 456 vtw_from_index(vtw_ctl_t *ctl, uint32_t idx) 457 { 458 if (ctl->clidx) 459 ctl = ctl->ctl; 460 461 /* See if the index looks like it might be an index. 462 * Bits on outside of the valid index bits is a give away. 463 */ 464 idx = idx_decode(ctl, idx); 465 466 if (idx == ~0) { 467 return 0; 468 } else if (ctl->is_v4) { 469 vtw_v4_t *vtw = ctl->base.v4 + idx; 470 471 return (ctl->base.v4 <= vtw && vtw <= ctl->lim.v4) 472 ? 
&vtw->common : 0; 473 } else if (ctl->is_v6) { 474 vtw_v6_t *vtw = ctl->base.v6 + idx; 475 476 return (ctl->base.v6 <= vtw && vtw <= ctl->lim.v6) 477 ? &vtw->common : 0; 478 } else { 479 KASSERT(0 && "badness"); 480 return 0; 481 } 482 } 483 484 /*!\brief return the next vtw after this one. 485 * 486 * Due to the differing sizes of the entries in differing 487 * arenas, we have to ensure we ++ the correct pointer type. 488 * 489 * Also handles wrap. 490 */ 491 static inline vtw_t * 492 vtw_next(vtw_ctl_t *ctl, vtw_t *vtw) 493 { 494 if (ctl->is_v4) { 495 vtw_v4_t *v4 = (void*)vtw; 496 497 vtw = &(++v4)->common; 498 } else { 499 vtw_v6_t *v6 = (void*)vtw; 500 501 vtw = &(++v6)->common; 502 } 503 504 if (vtw > ctl->lim.v) 505 vtw = ctl->base.v; 506 507 return vtw; 508 } 509 510 /*!\brief remove entry from FATP hash chains 511 */ 512 static inline void 513 vtw_unhash(vtw_ctl_t *ctl, vtw_t *vtw) 514 { 515 fatp_ctl_t *fat = ctl->fat; 516 fatp_t *fp; 517 uint32_t key = vtw->key; 518 uint32_t tag, slot, idx; 519 vtw_v4_t *v4 = (void*)vtw; 520 vtw_v6_t *v6 = (void*)vtw; 521 522 if (!vtw->hashed) { 523 KASSERT(0 && "unhashed"); 524 return; 525 } 526 527 if (fat->vtw->is_v4) { 528 tag = v4_tag(v4->faddr, v4->fport, v4->laddr, v4->lport); 529 } else if (fat->vtw->is_v6) { 530 tag = v6_tag(&v6->faddr, v6->fport, &v6->laddr, v6->lport); 531 } else { 532 tag = 0; 533 KASSERT(0 && "not reached"); 534 } 535 536 /* Remove from fat->hash[] 537 */ 538 slot = fatp_slot_from_key(fat, key); 539 fp = fatp_from_key(fat, key); 540 idx = vtw_index(ctl, vtw); 541 542 db_trace(KTR_VTW 543 , (fp, "fat: del inuse %5.5x slot %x idx %x key %x tag %x" 544 , fp->inuse, slot, idx, key, tag)); 545 546 KASSERT(fp->inuse & (1 << slot)); 547 KASSERT(fp->tag[slot] == (tag ^ idx_encode(ctl, idx) 548 ^ fatp_xtra[slot])); 549 550 if ((fp->inuse & (1 << slot)) 551 && fp->tag[slot] == (tag ^ idx_encode(ctl, idx) 552 ^ fatp_xtra[slot])) { 553 fp->inuse ^= 1 << slot; 554 fp->tag[slot] = 0; 555 556 /* When we delete entries, we do not compact. This is 557 * due to temporality. We add entries, and they 558 * (eventually) expire. Older entries will be further 559 * down the chain. 560 */ 561 if (!fp->inuse) { 562 uint32_t hi = tag & fat->mask; 563 fatp_t *fq = 0; 564 fatp_t *fr = fat->hash[hi]; 565 566 while (fr && fr != fp) { 567 fr = fatp_next(fat, fq = fr); 568 } 569 570 if (fr == fp) { 571 if (fq) { 572 fq->nxt = fp->nxt; 573 fp->nxt = 0; 574 fatp_free(fat, fp); 575 } else { 576 KASSERT(fat->hash[hi] == fp); 577 578 if (fp->nxt) { 579 fat->hash[hi] 580 = fatp_next(fat, fp); 581 fp->nxt = 0; 582 fatp_free(fat, fp); 583 } else { 584 /* retain for next use. 
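					 * Keeping the bucket's head
					 * fatp_t, even when it is now
					 * empty, presumably saves an
					 * immediate re-allocation when
					 * this bucket next takes an
					 * insert.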
585 */ 586 ; 587 } 588 } 589 } else { 590 fr = fat->hash[hi]; 591 592 do { 593 db_trace(KTR_VTW 594 , (fr 595 , "fat:*del inuse %5.5x" 596 " nxt %x" 597 , fr->inuse, fr->nxt)); 598 599 fr = fatp_next(fat, fq = fr); 600 } while (fr && fr != fp); 601 602 KASSERT(0 && "oops"); 603 } 604 } 605 vtw->key ^= ~0; 606 } 607 608 if (fat->vtw->is_v4) { 609 tag = v4_port_tag(v4->lport); 610 } else if (fat->vtw->is_v6) { 611 tag = v6_port_tag(v6->lport); 612 } 613 614 /* Remove from fat->port[] 615 */ 616 key = vtw->port_key; 617 slot = fatp_slot_from_key(fat, key); 618 fp = fatp_from_key(fat, key); 619 idx = vtw_index(ctl, vtw); 620 621 db_trace(KTR_VTW 622 , (fp, "fatport: del inuse %5.5x" 623 " slot %x idx %x key %x tag %x" 624 , fp->inuse, slot, idx, key, tag)); 625 626 KASSERT(fp->inuse & (1 << slot)); 627 KASSERT(fp->tag[slot] == (tag ^ idx_encode(ctl, idx) 628 ^ fatp_xtra[slot])); 629 630 if ((fp->inuse & (1 << slot)) 631 && fp->tag[slot] == (tag ^ idx_encode(ctl, idx) 632 ^ fatp_xtra[slot])) { 633 fp->inuse ^= 1 << slot; 634 fp->tag[slot] = 0; 635 636 if (!fp->inuse) { 637 uint32_t hi = tag & fat->mask; 638 fatp_t *fq = 0; 639 fatp_t *fr = fat->port[hi]; 640 641 while (fr && fr != fp) { 642 fr = fatp_next(fat, fq = fr); 643 } 644 645 if (fr == fp) { 646 if (fq) { 647 fq->nxt = fp->nxt; 648 fp->nxt = 0; 649 fatp_free(fat, fp); 650 } else { 651 KASSERT(fat->port[hi] == fp); 652 653 if (fp->nxt) { 654 fat->port[hi] 655 = fatp_next(fat, fp); 656 fp->nxt = 0; 657 fatp_free(fat, fp); 658 } else { 659 /* retain for next use. 660 */ 661 ; 662 } 663 } 664 } 665 } 666 vtw->port_key ^= ~0; 667 } 668 669 vtw->hashed = 0; 670 } 671 672 /*!\brief remove entry from hash, possibly free. 673 */ 674 void 675 vtw_del(vtw_ctl_t *ctl, vtw_t *vtw) 676 { 677 KASSERT(mutex_owned(softnet_lock)); 678 679 if (vtw->hashed) { 680 ++vtw_stats.del; 681 vtw_unhash(ctl, vtw); 682 } 683 684 /* We only delete the oldest entry. 
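	 * Entries are allocated round-robin from the circular arena and
	 * all members of a class share the same MSL, so allocation order
	 * is also expiry order.  Deleting anything but ctl->oldest.v
	 * therefore only unhashes it; its storage is reclaimed when the
	 * oldest pointer sweeps past it.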
685 */ 686 if (vtw != ctl->oldest.v) 687 return; 688 689 --ctl->nalloc; 690 ++ctl->nfree; 691 692 vtw->expire.tv_sec = 0; 693 vtw->expire.tv_usec = ~0; 694 695 if (!ctl->nalloc) 696 ctl->oldest.v = 0; 697 698 ctl->oldest.v = vtw_next(ctl, vtw); 699 } 700 701 /*!\brief insert vestigial timewait in hash chain 702 */ 703 static void 704 vtw_inshash_v4(vtw_ctl_t *ctl, vtw_t *vtw) 705 { 706 uint32_t idx = vtw_index(ctl, vtw); 707 uint32_t tag; 708 vtw_v4_t *v4 = (void*)vtw; 709 710 KASSERT(mutex_owned(softnet_lock)); 711 KASSERT(!vtw->hashed); 712 KASSERT(ctl->clidx == vtw->msl_class); 713 714 ++vtw_stats.ins; 715 716 tag = v4_tag(v4->faddr, v4->fport, 717 v4->laddr, v4->lport); 718 719 vtw->key = fatp_vtw_inshash(ctl->fat, idx, tag, 0, vtw); 720 721 db_trace(KTR_VTW, (ctl 722 , "vtw: ins %8.8x:%4.4x %8.8x:%4.4x" 723 " tag %8.8x key %8.8x" 724 , v4->faddr, v4->fport 725 , v4->laddr, v4->lport 726 , tag 727 , vtw->key)); 728 729 tag = v4_port_tag(v4->lport); 730 vtw->port_key = fatp_vtw_inshash(ctl->fat, idx, tag, 1, vtw); 731 732 db_trace(KTR_VTW, (ctl, "vtw: ins %P - %4.4x tag %8.8x key %8.8x" 733 , v4->lport, v4->lport 734 , tag 735 , vtw->key)); 736 737 vtw->hashed = 1; 738 } 739 740 /*!\brief insert vestigial timewait in hash chain 741 */ 742 static void 743 vtw_inshash_v6(vtw_ctl_t *ctl, vtw_t *vtw) 744 { 745 uint32_t idx = vtw_index(ctl, vtw); 746 uint32_t tag; 747 vtw_v6_t *v6 = (void*)vtw; 748 749 KASSERT(mutex_owned(softnet_lock)); 750 KASSERT(!vtw->hashed); 751 KASSERT(ctl->clidx == vtw->msl_class); 752 753 ++vtw_stats.ins; 754 755 tag = v6_tag(&v6->faddr, v6->fport, 756 &v6->laddr, v6->lport); 757 758 vtw->key = fatp_vtw_inshash(ctl->fat, idx, tag, 0, vtw); 759 760 tag = v6_port_tag(v6->lport); 761 vtw->port_key = fatp_vtw_inshash(ctl->fat, idx, tag, 1, vtw); 762 763 db_trace(KTR_VTW, (ctl, "vtw: ins %P - %4.4x tag %8.8x key %8.8x" 764 , v6->lport, v6->lport 765 , tag 766 , vtw->key)); 767 768 vtw->hashed = 1; 769 } 770 771 static vtw_t * 772 vtw_lookup_hash_v4(vtw_ctl_t *ctl, uint32_t faddr, uint16_t fport 773 , uint32_t laddr, uint16_t lport 774 , int which) 775 { 776 vtw_v4_t *v4; 777 vtw_t *vtw; 778 uint32_t tag; 779 fatp_t *fp; 780 int i; 781 uint32_t fatps = 0, probes = 0, losings = 0; 782 783 if (!ctl || !ctl->fat) 784 return 0; 785 786 ++vtw_stats.look[which]; 787 788 if (which) { 789 tag = v4_port_tag(lport); 790 fp = ctl->fat->port[tag & ctl->fat->mask]; 791 } else { 792 tag = v4_tag(faddr, fport, laddr, lport); 793 fp = ctl->fat->hash[tag & ctl->fat->mask]; 794 } 795 796 while (fp && fp->inuse) { 797 uint32_t inuse = fp->inuse; 798 799 ++fatps; 800 801 for (i = 0; inuse && i < fatp_ntags(); ++i) { 802 uint32_t idx; 803 804 if (!(inuse & (1 << i))) 805 continue; 806 807 inuse ^= 1 << i; 808 809 ++probes; 810 ++vtw_stats.probe[which]; 811 812 idx = fp->tag[i] ^ tag ^ fatp_xtra[i]; 813 vtw = vtw_from_index(ctl, idx); 814 815 if (!vtw) { 816 /* Hopefully fast path. 817 */ 818 db_trace(KTR_VTW 819 , (fp, "vtw: fast %A:%P %A:%P" 820 " idx %x tag %x" 821 , faddr, fport 822 , laddr, lport 823 , idx, tag)); 824 continue; 825 } 826 827 v4 = (void*)vtw; 828 829 /* The de-referencing of vtw is what we want to avoid. 830 * Losing. 831 */ 832 if (vtw_alive(vtw) 833 && ((which ? 
vtw->port_key : vtw->key) 834 == fatp_key(ctl->fat, fp, i)) 835 && (which 836 || (v4->faddr == faddr && v4->laddr == laddr 837 && v4->fport == fport)) 838 && v4->lport == lport) { 839 ++vtw_stats.hit[which]; 840 841 db_trace(KTR_VTW 842 , (fp, "vtw: hit %8.8x:%4.4x" 843 " %8.8x:%4.4x idx %x key %x" 844 , faddr, fport 845 , laddr, lport 846 , idx_decode(ctl, idx), vtw->key)); 847 848 KASSERT(vtw->hashed); 849 850 goto out; 851 } 852 ++vtw_stats.losing[which]; 853 ++losings; 854 855 if (vtw_alive(vtw)) { 856 db_trace(KTR_VTW 857 , (fp, "vtw:!mis %8.8x:%4.4x" 858 " %8.8x:%4.4x key %x tag %x" 859 , faddr, fport 860 , laddr, lport 861 , fatp_key(ctl->fat, fp, i) 862 , v4_tag(faddr, fport 863 , laddr, lport))); 864 db_trace(KTR_VTW 865 , (vtw, "vtw:!mis %8.8x:%4.4x" 866 " %8.8x:%4.4x key %x tag %x" 867 , v4->faddr, v4->fport 868 , v4->laddr, v4->lport 869 , vtw->key 870 , v4_tag(v4->faddr, v4->fport 871 , v4->laddr, v4->lport))); 872 873 if (vtw->key == fatp_key(ctl->fat, fp, i)) { 874 db_trace(KTR_VTW 875 , (vtw, "vtw:!mis %8.8x:%4.4x" 876 " %8.8x:%4.4x key %x" 877 " which %x" 878 , v4->faddr, v4->fport 879 , v4->laddr, v4->lport 880 , vtw->key 881 , which)); 882 883 } else { 884 db_trace(KTR_VTW 885 , (vtw 886 , "vtw:!mis" 887 " key %8.8x != %8.8x" 888 " idx %x i %x which %x" 889 , vtw->key 890 , fatp_key(ctl->fat, fp, i) 891 , idx_decode(ctl, idx) 892 , i 893 , which)); 894 } 895 } else { 896 db_trace(KTR_VTW 897 , (fp 898 , "vtw:!mis free entry" 899 " idx %x vtw %p which %x" 900 , idx_decode(ctl, idx) 901 , vtw, which)); 902 } 903 } 904 905 if (fp->nxt) { 906 fp = fatp_next(ctl->fat, fp); 907 } else { 908 break; 909 } 910 } 911 ++vtw_stats.miss[which]; 912 vtw = 0; 913 out: 914 if (fatps > vtw_stats.max_chain[which]) 915 vtw_stats.max_chain[which] = fatps; 916 if (probes > vtw_stats.max_probe[which]) 917 vtw_stats.max_probe[which] = probes; 918 if (losings > vtw_stats.max_loss[which]) 919 vtw_stats.max_loss[which] = losings; 920 921 return vtw; 922 } 923 924 static vtw_t * 925 vtw_lookup_hash_v6(vtw_ctl_t *ctl, const struct in6_addr *faddr, uint16_t fport 926 , const struct in6_addr *laddr, uint16_t lport 927 , int which) 928 { 929 vtw_v6_t *v6; 930 vtw_t *vtw; 931 uint32_t tag; 932 fatp_t *fp; 933 int i; 934 uint32_t fatps = 0, probes = 0, losings = 0; 935 936 ++vtw_stats.look[which]; 937 938 if (!ctl || !ctl->fat) 939 return 0; 940 941 if (which) { 942 tag = v6_port_tag(lport); 943 fp = ctl->fat->port[tag & ctl->fat->mask]; 944 } else { 945 tag = v6_tag(faddr, fport, laddr, lport); 946 fp = ctl->fat->hash[tag & ctl->fat->mask]; 947 } 948 949 while (fp && fp->inuse) { 950 uint32_t inuse = fp->inuse; 951 952 ++fatps; 953 954 for (i = 0; inuse && i < fatp_ntags(); ++i) { 955 uint32_t idx; 956 957 if (!(inuse & (1 << i))) 958 continue; 959 960 inuse ^= 1 << i; 961 962 ++probes; 963 ++vtw_stats.probe[which]; 964 965 idx = fp->tag[i] ^ tag ^ fatp_xtra[i]; 966 vtw = vtw_from_index(ctl, idx); 967 968 db_trace(KTR_VTW 969 , (fp, "probe: %2d %6A:%4.4x %6A:%4.4x idx %x" 970 , i 971 , db_store(faddr, sizeof (*faddr)), fport 972 , db_store(laddr, sizeof (*laddr)), lport 973 , idx_decode(ctl, idx))); 974 975 if (!vtw) { 976 /* Hopefully fast path. 977 */ 978 continue; 979 } 980 981 v6 = (void*)vtw; 982 983 if (vtw_alive(vtw) 984 && ((which ? 
vtw->port_key : vtw->key)
985			    == fatp_key(ctl->fat, fp, i))
986			    && v6->lport == lport
987			    && (which
988				|| (v6->fport == fport
989				    && !bcmp(&v6->faddr, faddr, sizeof (*faddr))
990				    && !bcmp(&v6->laddr, laddr
991					     , sizeof (*laddr))))) {
992				++vtw_stats.hit[which];
993
994				KASSERT(vtw->hashed);
995				goto out;
996			} else {
997				++vtw_stats.losing[which];
998				++losings;
999			}
1000		}
1001
1002		if (fp->nxt) {
1003			fp = fatp_next(ctl->fat, fp);
1004		} else {
1005			break;
1006		}
1007	}
1008	++vtw_stats.miss[which];
1009	vtw = 0;
1010 out:
1011	if (fatps > vtw_stats.max_chain[which])
1012		vtw_stats.max_chain[which] = fatps;
1013	if (probes > vtw_stats.max_probe[which])
1014		vtw_stats.max_probe[which] = probes;
1015	if (losings > vtw_stats.max_loss[which])
1016		vtw_stats.max_loss[which] = losings;
1017
1018	return vtw;
1019 }
1020
1021 /*!\brief port iterator
1022  */
1023 static vtw_t *
1024 vtw_next_port_v4(struct tcp_ports_iterator *it)
1025 {
1026	vtw_ctl_t *ctl = it->ctl;
1027	vtw_v4_t *v4;
1028	vtw_t *vtw;
1029	uint32_t tag;
1030	uint16_t lport = it->port;
1031	fatp_t *fp;
1032	int i;
1033	uint32_t fatps = 0, probes = 0, losings = 0;
1034
1035	tag = v4_port_tag(lport);
1036	if (!it->fp) {
1037		it->fp = ctl->fat->port[tag & ctl->fat->mask];
1038		it->slot_idx = 0;
1039	}
1040	fp = it->fp;
1041
1042	while (fp) {
1043		uint32_t inuse = fp->inuse;
1044
1045		++fatps;
1046
1047		for (i = it->slot_idx; inuse && i < fatp_ntags(); ++i) {
1048			uint32_t idx;
1049
1050			if (!(inuse & (1 << i)))
1051				continue;
1052
1053			inuse &= ~0U << i;
1054
1055			if (i < it->slot_idx)
1056				continue;
1057
1058			++vtw_stats.probe[1];
1059			++probes;
1060
1061			idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
1062			vtw = vtw_from_index(ctl, idx);
1063
1064			if (!vtw) {
1065				/* Hopefully fast path.
1066				 */
1067				continue;
1068			}
1069
1070			v4 = (void*)vtw;
1071
1072			if (vtw_alive(vtw)
1073			    && vtw->port_key == fatp_key(ctl->fat, fp, i)
1074			    && v4->lport == lport) {
1075				++vtw_stats.hit[1];
1076
1077				it->slot_idx = i + 1;
1078
1079				goto out;
1080			} else if (vtw_alive(vtw)) {
1081				++vtw_stats.losing[1];
1082				++losings;
1083
1084				db_trace(KTR_VTW
1085					 , (vtw, "vtw:!mis"
1086					    " port %8.8x:%4.4x %8.8x:%4.4x"
1087					    " key %x port %x"
1088					    , v4->faddr, v4->fport
1089					    , v4->laddr, v4->lport
1090					    , vtw->key
1091					    , lport));
1092			} else {
1093				/* Really losing here.  We are coming
1094				 * up with references to free entries.
1095				 * Might find it better to use the
1096				 * traditional lookup, or need another
1097				 * bit of ad-hockery.  The other ad-hockery
1098				 * would be to pull more into the
1099				 * cache line to reject the false
1100				 * hits.
1101				 */
1102				++vtw_stats.losing[1];
1103				++losings;
1104				db_trace(KTR_VTW
1105					 , (fp, "vtw:!mis port %x"
1106					    " - free entry idx %x vtw %p"
1107					    , lport
1108					    , idx_decode(ctl, idx)
1109					    , vtw));
1110			}
1111		}
1112
1113		if (fp->nxt) {
1114			it->fp = fp = fatp_next(ctl->fat, fp);
1115			it->slot_idx = 0;
1116		} else {
1117			it->fp = 0;
1118			break;
1119		}
1120	}
1121	++vtw_stats.miss[1];
1122
1123	vtw = 0;
1124 out:
1125	if (fatps > vtw_stats.max_chain[1])
1126		vtw_stats.max_chain[1] = fatps;
1127	if (probes > vtw_stats.max_probe[1])
1128		vtw_stats.max_probe[1] = probes;
1129	if (losings > vtw_stats.max_loss[1])
1130		vtw_stats.max_loss[1] = losings;
1131
1132	return vtw;
1133 }
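/*
 * Usage sketch (illustrative only): in_pcblookup_ports() drives these
 * scans through the exported hooks below, roughly as follows (the
 * VTW_DEBUG checks in vtw_add() run a live instance of this loop):
 *
 *	struct vestigial_inpcb res;
 *	void *it = tcp_init_ports_v4(laddr, lport, wild);
 *
 *	while (tcp_next_port_v4(it, &res)) {
 *		// each res describes one vestigial PCB bound to lport;
 *		// res.lport, res.uid, res.reuse_* inform bind decisions
 *	}
 */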
1134
1135 /*!\brief port iterator
1136  */
1137 static vtw_t *
1138 vtw_next_port_v6(struct tcp_ports_iterator *it)
1139 {
1140	vtw_ctl_t *ctl = it->ctl;
1141	vtw_v6_t *v6;
1142	vtw_t *vtw;
1143	uint32_t tag;
1144	uint16_t lport = it->port;
1145	fatp_t *fp;
1146	int i;
1147	uint32_t fatps = 0, probes = 0, losings = 0;
1148
1149	tag = v6_port_tag(lport);
1150	if (!it->fp) {
1151		it->fp = ctl->fat->port[tag & ctl->fat->mask];
1152		it->slot_idx = 0;
1153	}
1154	fp = it->fp;
1155
1156	while (fp) {
1157		uint32_t inuse = fp->inuse;
1158
1159		++fatps;
1160
1161		for (i = it->slot_idx; inuse && i < fatp_ntags(); ++i) {
1162			uint32_t idx;
1163
1164			if (!(inuse & (1 << i)))
1165				continue;
1166
1167			inuse &= ~0U << i;
1168
1169			if (i < it->slot_idx)
1170				continue;
1171
1172			++vtw_stats.probe[1];
1173			++probes;
1174
1175			idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
1176			vtw = vtw_from_index(ctl, idx);
1177
1178			if (!vtw) {
1179				/* Hopefully fast path.
1180				 */
1181				continue;
1182			}
1183
1184			v6 = (void*)vtw;
1185
1186			db_trace(KTR_VTW
1187				 , (vtw, "vtw: i %x idx %x fp->tag %x"
1188				    " tag %x xtra %x"
1189				    , i, idx_decode(ctl, idx)
1190				    , fp->tag[i], tag, fatp_xtra[i]));
1191
1192			if (vtw_alive(vtw)
1193			    && vtw->port_key == fatp_key(ctl->fat, fp, i)
1194			    && v6->lport == lport) {
1195				++vtw_stats.hit[1];
1196
1197				db_trace(KTR_VTW
1198					 , (fp, "vtw: nxt port %P - %4.4x"
1199					    " idx %x key %x"
1200					    , lport, lport
1201					    , idx_decode(ctl, idx), vtw->key));
1202
1203				it->slot_idx = i + 1;
1204				goto out;
1205			} else if (vtw_alive(vtw)) {
1206				++vtw_stats.losing[1];
1207
1208				db_trace(KTR_VTW
1209					 , (vtw, "vtw:!mis port %6A:%4.4x"
1210					    " %6A:%4.4x key %x port %x"
1211					    , db_store(&v6->faddr
1212					    , sizeof (v6->faddr))
1213					    , v6->fport
1214					    , db_store(&v6->laddr
1215					    , sizeof (v6->laddr))
1216					    , v6->lport
1217					    , vtw->key
1218					    , lport));
1219			} else {
1220				/* Really losing here.  We are coming
1221				 * up with references to free entries.
1222				 * Might find it better to use the
1223				 * traditional lookup, or need another
1224				 * bit of ad-hockery.  The other ad-hockery
1225				 * would be to pull more into the
1226				 * cache line to reject the false
1227				 * hits.
1228 */ 1229 ++vtw_stats.losing[1]; 1230 ++losings; 1231 1232 db_trace(KTR_VTW 1233 , (fp 1234 , "vtw:!mis port %x" 1235 " - free entry idx %x vtw %p" 1236 , lport, idx_decode(ctl, idx) 1237 , vtw)); 1238 } 1239 } 1240 1241 if (fp->nxt) { 1242 it->fp = fp = fatp_next(ctl->fat, fp); 1243 it->slot_idx = 0; 1244 } else { 1245 it->fp = 0; 1246 break; 1247 } 1248 } 1249 ++vtw_stats.miss[1]; 1250 1251 vtw = 0; 1252 out: 1253 if (fatps > vtw_stats.max_chain[1]) 1254 vtw_stats.max_chain[1] = fatps; 1255 if (probes > vtw_stats.max_probe[1]) 1256 vtw_stats.max_probe[1] = probes; 1257 if (losings > vtw_stats.max_loss[1]) 1258 vtw_stats.max_loss[1] = losings; 1259 1260 return vtw; 1261 } 1262 1263 /*!\brief initialise the VTW allocation arena 1264 * 1265 * There are 1+3 allocation classes: 1266 * 0 classless 1267 * {1,2,3} MSL-class based allocation 1268 * 1269 * The allocation arenas are all initialised. Classless gets all the 1270 * space. MSL-class based divides the arena, so that allocation 1271 * within a class can proceed without having to consider entries 1272 * (aka: cache lines) from different classes. 1273 * 1274 * Usually, we are completely classless or class-based, but there can be 1275 * transition periods, corresponding to dynamic adjustments in the config 1276 * by the operator. 1277 */ 1278 static void 1279 vtw_init(fatp_ctl_t *fat, vtw_ctl_t *ctl, const uint32_t n, vtw_t *ctl_base_v) 1280 { 1281 int class_n, i; 1282 vtw_t *base; 1283 1284 ctl->base.v = ctl_base_v; 1285 1286 if (ctl->is_v4) { 1287 ctl->lim.v4 = ctl->base.v4 + n - 1; 1288 ctl->alloc.v4 = ctl->base.v4; 1289 } else { 1290 ctl->lim.v6 = ctl->base.v6 + n - 1; 1291 ctl->alloc.v6 = ctl->base.v6; 1292 } 1293 1294 ctl->nfree = n; 1295 ctl->ctl = ctl; 1296 1297 ctl->idx_bits = 32; 1298 for (ctl->idx_mask = ~0; (ctl->idx_mask & (n-1)) == n-1; ) { 1299 ctl->idx_mask >>= 1; 1300 ctl->idx_bits -= 1; 1301 } 1302 1303 ctl->idx_mask <<= 1; 1304 ctl->idx_mask |= 1; 1305 ctl->idx_bits += 1; 1306 1307 ctl->fat = fat; 1308 fat->vtw = ctl; 1309 1310 /* Divide the resources equally amongst the classes. 1311 * This is not optimal, as the different classes 1312 * arrive and leave at different rates, but it is 1313 * the best I can do for now. 1314 */ 1315 class_n = n / (VTW_NCLASS-1); 1316 base = ctl->base.v; 1317 1318 for (i = 1; i < VTW_NCLASS; ++i) { 1319 int j; 1320 1321 ctl[i] = ctl[0]; 1322 ctl[i].clidx = i; 1323 1324 ctl[i].base.v = base; 1325 ctl[i].alloc = ctl[i].base; 1326 1327 for (j = 0; j < class_n - 1; ++j) { 1328 if (tcp_msl_enable) 1329 base->msl_class = i; 1330 base = vtw_next(ctl, base); 1331 } 1332 1333 ctl[i].lim.v = base; 1334 base = vtw_next(ctl, base); 1335 ctl[i].nfree = class_n; 1336 } 1337 1338 vtw_debug_init(); 1339 } 1340 1341 /*!\brief map class to TCP MSL 1342 */ 1343 static inline uint32_t 1344 class_to_msl(int msl_class) 1345 { 1346 switch (msl_class) { 1347 case 0: 1348 case 1: 1349 return tcp_msl_remote ? tcp_msl_remote : (TCPTV_MSL >> 0); 1350 case 2: 1351 return tcp_msl_local ? tcp_msl_local : (TCPTV_MSL >> 1); 1352 default: 1353 return tcp_msl_loop ? tcp_msl_loop : (TCPTV_MSL >> 2); 1354 } 1355 } 1356 1357 /*!\brief map TCP MSL to class 1358 */ 1359 static inline uint32_t 1360 msl_to_class(int msl) 1361 { 1362 if (tcp_msl_enable) { 1363 if (msl <= (tcp_msl_loop ? tcp_msl_loop : (TCPTV_MSL >> 2))) 1364 return 1+2; 1365 if (msl <= (tcp_msl_local ? 
tcp_msl_local : (TCPTV_MSL >> 1)))
1366			return 1+1;
1367		return 1;
1368	}
1369	return 0;
1370 }
1371
1372 /*!\brief allocate a vtw entry
1373  */
1374 static inline vtw_t *
1375 vtw_alloc(vtw_ctl_t *ctl)
1376 {
1377	vtw_t *vtw = 0;
1378	int stuck = 0;
1379	int avail = ctl ? (ctl->nalloc + ctl->nfree) : 0;
1380	int msl;
1381
1382	KASSERT(mutex_owned(softnet_lock));
1383
1384	/* If no resources, we will not get far.
1385	 */
1386	if (!ctl || !ctl->base.v4 || avail <= 0)
1387		return 0;
1388
1389	/* Obtain a free one.
1390	 */
1391	while (!ctl->nfree) {
1392		vtw_age(ctl, 0);
1393
1394		if (++stuck > avail) {
1395			/* When in transition between
1396			 * schemes (classless, classed) we
1397			 * can be stuck having to await the
1398			 * expiration of cross-allocated entries.
1399			 *
1400			 * Returning zero means we will fall back to the
1401			 * traditional TIME_WAIT handling, except in the
1402			 * case of a resched, in which case we cannot
1403			 * perform the resched, but will retain the extant
1404			 * entry.
1405			 */
1406			db_trace(KTR_VTW
1407				 , (ctl, "vtw:!none free in class %x %x/%x"
1408				    , ctl->clidx
1409				    , ctl->nalloc, ctl->nfree));
1410
1411			return 0;
1412		}
1413	}
1414
1415	vtw = ctl->alloc.v;
1416
1417	if (vtw->msl_class != ctl->clidx) {
1418		/* Usurping rules:
1419		 *    0 -> {1,2,3} or {1,2,3} -> 0
1420		 */
1421		KASSERT(!vtw->msl_class || !ctl->clidx);
1422
1423		if (vtw->hashed || vtw->expire.tv_sec) {
1424			/* As this is owned by some other class,
1425			 * we must wait for it to expire.
1426			 * This will only happen on class/classless
1427			 * transitions, which are guaranteed to progress
1428			 * to completion in small finite time, barring bugs.
1429			 */
1430			db_trace(KTR_VTW
1431				 , (ctl, "vtw:!%p class %x!=%x %x:%x%s"
1432				    , vtw, vtw->msl_class, ctl->clidx
1433				    , vtw->expire.tv_sec
1434				    , vtw->expire.tv_usec
1435				    , vtw->hashed ? " hashed" : ""));
1436
1437			return 0;
1438		}
1439
1440		db_trace(KTR_VTW
1441			 , (ctl, "vtw:!%p usurped from %x to %x"
1442			    , vtw, vtw->msl_class, ctl->clidx));
1443
1444		vtw->msl_class = ctl->clidx;
1445	}
1446
1447	if (vtw_alive(vtw)) {
1448		KASSERT(0 && "next free not free");
1449		return 0;
1450	}
1451
1452	/* Advance the allocation pointer.
1453	 */
1454	ctl->alloc.v = vtw_next(ctl, vtw);
1455
1456	--ctl->nfree;
1457	++ctl->nalloc;
1458
1459	msl = (2 * class_to_msl(ctl->clidx) * 1000) / PR_SLOWHZ;	// 2*MSL in msec
1460
1461	/* mark expiration
1462	 */
1463	getmicrouptime(&vtw->expire);
1464
1465	/* Move expiration into the future.
1466	 */
1467	vtw->expire.tv_sec += msl / 1000;
1468	vtw->expire.tv_usec += 1000 * (msl % 1000);
1469
1470	while (vtw->expire.tv_usec >= 1000*1000) {
1471		vtw->expire.tv_usec -= 1000*1000;
1472		vtw->expire.tv_sec += 1;
1473	}
1474
1475	if (!ctl->oldest.v)
1476		ctl->oldest.v = vtw;
1477
1478	return vtw;
1479 }
1480
1481 /*!\brief expiration
1482  */
1483 static int
1484 vtw_age(vtw_ctl_t *ctl, struct timeval *_when)
1485 {
1486	vtw_t *vtw;
1487	struct timeval then, *when = _when;
1488	int maxtries = 0;
1489
1490	if (!ctl->oldest.v) {
1491		KASSERT(!ctl->nalloc);
1492		return 0;
1493	}
1494
1495	for (vtw = ctl->oldest.v; vtw && ctl->nalloc; ) {
1496		if (++maxtries > ctl->nalloc)
1497			break;
1498
1499		if (vtw->msl_class != ctl->clidx) {
1500			db_trace(KTR_VTW
1501				 , (vtw, "vtw:!age class mismatch %x != %x"
1502				    , vtw->msl_class, ctl->clidx));
1503			/* XXXX
1504			 * See if the appropriate action is to skip to the next.
1505			 * XXXX
1506			 */
1507			ctl->oldest.v = vtw = vtw_next(ctl, vtw);
1508			continue;
1509		}
1510		if (!when) {
1511			/* Latch oldest timeval if none specified.
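			 * With no cutoff supplied, the oldest entry's
			 * own expiry becomes the cutoff: it trivially
			 * satisfies the timercmp() below, so a NULL
			 * argument reclaims at least one entry (plus
			 * any others sharing that expiry).  vtw_alloc()
			 * relies on this to make room when the arena
			 * is full.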
1512 */ 1513 then = vtw->expire; 1514 when = &then; 1515 } 1516 1517 if (!timercmp(&vtw->expire, when, <=)) 1518 break; 1519 1520 db_trace(KTR_VTW 1521 , (vtw, "vtw: expire %x %8.8x:%8.8x %x/%x" 1522 , ctl->clidx 1523 , vtw->expire.tv_sec 1524 , vtw->expire.tv_usec 1525 , ctl->nalloc 1526 , ctl->nfree)); 1527 1528 if (!_when) 1529 ++vtw_stats.kill; 1530 1531 vtw_del(ctl, vtw); 1532 vtw = ctl->oldest.v; 1533 } 1534 1535 return ctl->nalloc; // # remaining allocated 1536 } 1537 1538 static callout_t vtw_cs; 1539 1540 /*!\brief notice the passage of time. 1541 * It seems to be getting faster. What happened to the year? 1542 */ 1543 static void 1544 vtw_tick(void *arg) 1545 { 1546 struct timeval now; 1547 int i, cnt = 0; 1548 1549 getmicrouptime(&now); 1550 1551 db_trace(KTR_VTW, (arg, "vtk: tick - now %8.8x:%8.8x" 1552 , now.tv_sec, now.tv_usec)); 1553 1554 mutex_enter(softnet_lock); 1555 1556 for (i = 0; i < VTW_NCLASS; ++i) { 1557 cnt += vtw_age(&vtw_tcpv4[i], &now); 1558 cnt += vtw_age(&vtw_tcpv6[i], &now); 1559 } 1560 1561 /* Keep ticks coming while we need them. 1562 */ 1563 if (cnt) 1564 callout_schedule(&vtw_cs, hz / 5); 1565 else { 1566 tcp_vtw_was_enabled = 0; 1567 tcbtable.vestige = 0; 1568 } 1569 mutex_exit(softnet_lock); 1570 } 1571 1572 /* in_pcblookup_ports assist for handling vestigial entries. 1573 */ 1574 static void * 1575 tcp_init_ports_v4(struct in_addr addr, u_int port, int wild) 1576 { 1577 struct tcp_ports_iterator *it = &tcp_ports_iterator_v4; 1578 1579 bzero(it, sizeof (*it)); 1580 1581 /* Note: the reference to vtw_tcpv4[0] is fine. 1582 * We do not need per-class iteration. We just 1583 * need to get to the fat, and there is one 1584 * shared fat. 1585 */ 1586 if (vtw_tcpv4[0].fat) { 1587 it->addr.v4 = addr; 1588 it->port = port; 1589 it->wild = !!wild; 1590 it->ctl = &vtw_tcpv4[0]; 1591 1592 ++vtw_stats.look[1]; 1593 } 1594 1595 return it; 1596 } 1597 1598 /*!\brief export an IPv4 vtw. 1599 */ 1600 static int 1601 vtw_export_v4(vtw_ctl_t *ctl, vtw_t *vtw, vestigial_inpcb_t *res) 1602 { 1603 vtw_v4_t *v4 = (void*)vtw; 1604 1605 bzero(res, sizeof (*res)); 1606 1607 if (ctl && vtw) { 1608 if (!ctl->clidx && vtw->msl_class) 1609 ctl += vtw->msl_class; 1610 else 1611 KASSERT(ctl->clidx == vtw->msl_class); 1612 1613 res->valid = 1; 1614 res->v4 = 1; 1615 1616 res->faddr.v4.s_addr = v4->faddr; 1617 res->laddr.v4.s_addr = v4->laddr; 1618 res->fport = v4->fport; 1619 res->lport = v4->lport; 1620 res->vtw = vtw; // netlock held over call(s) 1621 res->ctl = ctl; 1622 res->reuse_addr = vtw->reuse_addr; 1623 res->reuse_port = vtw->reuse_port; 1624 res->snd_nxt = vtw->snd_nxt; 1625 res->rcv_nxt = vtw->rcv_nxt; 1626 res->rcv_wnd = vtw->rcv_wnd; 1627 res->uid = vtw->uid; 1628 } 1629 1630 return res->valid; 1631 } 1632 1633 /*!\brief return next port in the port iterator. yowza. 
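 *
 * Returns non-zero while vestigial PCBs bound to the port remain,
 * exporting each in turn into *res; a zero return invalidates the
 * iterator.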
1634 */ 1635 static int 1636 tcp_next_port_v4(void *arg, struct vestigial_inpcb *res) 1637 { 1638 struct tcp_ports_iterator *it = arg; 1639 vtw_t *vtw = 0; 1640 1641 if (it->ctl) 1642 vtw = vtw_next_port_v4(it); 1643 1644 if (!vtw) 1645 it->ctl = 0; 1646 1647 return vtw_export_v4(it->ctl, vtw, res); 1648 } 1649 1650 static int 1651 tcp_lookup_v4(struct in_addr faddr, uint16_t fport, 1652 struct in_addr laddr, uint16_t lport, 1653 struct vestigial_inpcb *res) 1654 { 1655 vtw_t *vtw; 1656 vtw_ctl_t *ctl; 1657 1658 1659 db_trace(KTR_VTW 1660 , (res, "vtw: lookup %A:%P %A:%P" 1661 , faddr, fport 1662 , laddr, lport)); 1663 1664 vtw = vtw_lookup_hash_v4((ctl = &vtw_tcpv4[0]) 1665 , faddr.s_addr, fport 1666 , laddr.s_addr, lport, 0); 1667 1668 return vtw_export_v4(ctl, vtw, res); 1669 } 1670 1671 /* in_pcblookup_ports assist for handling vestigial entries. 1672 */ 1673 static void * 1674 tcp_init_ports_v6(const struct in6_addr *addr, u_int port, int wild) 1675 { 1676 struct tcp_ports_iterator *it = &tcp_ports_iterator_v6; 1677 1678 bzero(it, sizeof (*it)); 1679 1680 /* Note: the reference to vtw_tcpv6[0] is fine. 1681 * We do not need per-class iteration. We just 1682 * need to get to the fat, and there is one 1683 * shared fat. 1684 */ 1685 if (vtw_tcpv6[0].fat) { 1686 it->addr.v6 = *addr; 1687 it->port = port; 1688 it->wild = !!wild; 1689 it->ctl = &vtw_tcpv6[0]; 1690 1691 ++vtw_stats.look[1]; 1692 } 1693 1694 return it; 1695 } 1696 1697 /*!\brief export an IPv6 vtw. 1698 */ 1699 static int 1700 vtw_export_v6(vtw_ctl_t *ctl, vtw_t *vtw, vestigial_inpcb_t *res) 1701 { 1702 vtw_v6_t *v6 = (void*)vtw; 1703 1704 bzero(res, sizeof (*res)); 1705 1706 if (ctl && vtw) { 1707 if (!ctl->clidx && vtw->msl_class) 1708 ctl += vtw->msl_class; 1709 else 1710 KASSERT(ctl->clidx == vtw->msl_class); 1711 1712 res->valid = 1; 1713 res->v4 = 0; 1714 1715 res->faddr.v6 = v6->faddr; 1716 res->laddr.v6 = v6->laddr; 1717 res->fport = v6->fport; 1718 res->lport = v6->lport; 1719 res->vtw = vtw; // netlock held over call(s) 1720 res->ctl = ctl; 1721 1722 res->v6only = vtw->v6only; 1723 res->reuse_addr = vtw->reuse_addr; 1724 res->reuse_port = vtw->reuse_port; 1725 1726 res->snd_nxt = vtw->snd_nxt; 1727 res->rcv_nxt = vtw->rcv_nxt; 1728 res->rcv_wnd = vtw->rcv_wnd; 1729 res->uid = vtw->uid; 1730 } 1731 1732 return res->valid; 1733 } 1734 1735 static int 1736 tcp_next_port_v6(void *arg, struct vestigial_inpcb *res) 1737 { 1738 struct tcp_ports_iterator *it = arg; 1739 vtw_t *vtw = 0; 1740 1741 if (it->ctl) 1742 vtw = vtw_next_port_v6(it); 1743 1744 if (!vtw) 1745 it->ctl = 0; 1746 1747 return vtw_export_v6(it->ctl, vtw, res); 1748 } 1749 1750 static int 1751 tcp_lookup_v6(const struct in6_addr *faddr, uint16_t fport, 1752 const struct in6_addr *laddr, uint16_t lport, 1753 struct vestigial_inpcb *res) 1754 { 1755 vtw_ctl_t *ctl; 1756 vtw_t *vtw; 1757 1758 db_trace(KTR_VTW 1759 , (res, "vtw: lookup %6A:%P %6A:%P" 1760 , db_store(faddr, sizeof (*faddr)), fport 1761 , db_store(laddr, sizeof (*laddr)), lport)); 1762 1763 vtw = vtw_lookup_hash_v6((ctl = &vtw_tcpv6[0]) 1764 , faddr, fport 1765 , laddr, lport, 0); 1766 1767 return vtw_export_v6(ctl, vtw, res); 1768 } 1769 1770 static vestigial_hooks_t tcp_hooks = { 1771 .init_ports4 = tcp_init_ports_v4, 1772 .next_port4 = tcp_next_port_v4, 1773 .lookup4 = tcp_lookup_v4, 1774 .init_ports6 = tcp_init_ports_v6, 1775 .next_port6 = tcp_next_port_v6, 1776 .lookup6 = tcp_lookup_v6, 1777 }; 1778 1779 static bool 1780 vtw_select(int af, fatp_ctl_t **fatp, vtw_ctl_t **ctlp) 1781 { 1782 
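	/* Map the address family to the shared fat/ctl instances;
	 * either out-parameter may be NULL if the caller does not
	 * need it.
	 */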
	fatp_ctl_t *fat;
1783	vtw_ctl_t *ctl;
1784
1785	switch (af) {
1786	case AF_INET:
1787		fat = &fat_tcpv4;
1788		ctl = &vtw_tcpv4[0];
1789		break;
1790	case AF_INET6:
1791		fat = &fat_tcpv6;
1792		ctl = &vtw_tcpv6[0];
1793		break;
1794	default:
1795		return false;
1796	}
1797	if (fatp != NULL)
1798		*fatp = fat;
1799	if (ctlp != NULL)
1800		*ctlp = ctl;
1801	return true;
1802 }
1803
1804 /*!\brief initialise controlling instance
1805  */
1806 static int
1807 vtw_control_init(int af)
1808 {
1809	fatp_ctl_t *fat;
1810	vtw_ctl_t *ctl;
1811	fatp_t *fat_base;
1812	fatp_t **fat_hash;
1813	vtw_t *ctl_base_v;
1814	uint32_t n, m;
1815	size_t sz;
1816
1817	KASSERT(powerof2(tcp_vtw_entries));
1818
1819	if (!vtw_select(af, &fat, &ctl))
1820		return EAFNOSUPPORT;
1821
1822	if (fat->hash != NULL) {
1823		KASSERT(fat->base != NULL && ctl->base.v != NULL);
1824		return 0;
1825	}
1826
1827	/* Allocate 10% more capacity in the fat pointers.
1828	 * We should only need ~#hash additional based on
1829	 * how they age, but TIME_WAIT assassination could cause
1830	 * sparse fat pointer utilisation.
1831	 */
1832	m = 512;
1833	n = 2*m + (11 * (tcp_vtw_entries / fatp_ntags())) / 10;
1834	sz = (ctl->is_v4 ? sizeof(vtw_v4_t) : sizeof(vtw_v6_t));
1835
1836	fat_hash = kmem_zalloc(2*m * sizeof(fatp_t *), KM_NOSLEEP);
1837
1838	if (fat_hash == NULL) {
1839		printf("%s: could not allocate %zu bytes for "
1840		    "hash anchors\n", __func__, 2*m * sizeof(fatp_t *));
1841		return ENOMEM;
1842	}
1843
1844	fat_base = kmem_zalloc(2*n * sizeof(fatp_t), KM_NOSLEEP);
1845
1846	if (fat_base == NULL) {
1847		kmem_free(fat_hash, 2*m * sizeof (fatp_t *));
1848		printf("%s: could not allocate %zu bytes for "
1849		    "fatp_t array\n", __func__, 2*n * sizeof(fatp_t));
1850		return ENOMEM;
1851	}
1852
1853	ctl_base_v = kmem_zalloc(tcp_vtw_entries * sz, KM_NOSLEEP);
1854
1855	if (ctl_base_v == NULL) {
1856		kmem_free(fat_hash, 2*m * sizeof (fatp_t *));
1857		kmem_free(fat_base, 2*n * sizeof(fatp_t));
1858		printf("%s: could not allocate %zu bytes for "
1859		    "vtw_t array\n", __func__, tcp_vtw_entries * sz);
1860		return ENOMEM;
1861	}
1862
1863	fatp_init(fat, n, m, fat_base, fat_hash);
1864
1865	vtw_init(fat, ctl, tcp_vtw_entries, ctl_base_v);
1866
1867	return 0;
1868 }
1869
1870 /*!\brief select controlling instance
1871  */
1872 static vtw_ctl_t *
1873 vtw_control(int af, uint32_t msl)
1874 {
1875	fatp_ctl_t *fat;
1876	vtw_ctl_t *ctl;
1877	int msl_class = msl_to_class(msl);
1878
1879	if (!vtw_select(af, &fat, &ctl))
1880		return NULL;
1881
1882	if (!fat->base || !ctl->base.v)
1883		return NULL;
1884
1885	if (!tcp_vtw_was_enabled) {
1886		/* This guarantees timer ticks until we no longer need them.
1887		 */
1888		tcp_vtw_was_enabled = 1;
1889
1890		callout_schedule(&vtw_cs, hz / 5);
1891
1892		tcbtable.vestige = &tcp_hooks;
1893	}
1894
1895	return ctl + msl_class;
1896 }
1897
1898 /*!\brief add TCP pcb to vestigial timewait
1899  */
1900 int
1901 vtw_add(int af, struct tcpcb *tp)
1902 {
1903 #ifdef VTW_DEBUG
1904	int enable;
1905 #endif
1906	vtw_ctl_t *ctl;
1907	vtw_t *vtw;
1908
1909	KASSERT(mutex_owned(softnet_lock));
1910
1911	ctl = vtw_control(af, tp->t_msl);
1912	if (!ctl)
1913		return 0;
1914
1915 #ifdef VTW_DEBUG
1916	enable = (af == AF_INET) ?
tcp4_vtw_enable : tcp6_vtw_enable; 1917 #endif 1918 1919 vtw = vtw_alloc(ctl); 1920 1921 if (vtw) { 1922 vtw->snd_nxt = tp->snd_nxt; 1923 vtw->rcv_nxt = tp->rcv_nxt; 1924 1925 switch (af) { 1926 case AF_INET: { 1927 struct inpcb *inp = tp->t_inpcb; 1928 vtw_v4_t *v4 = (void*)vtw; 1929 1930 v4->faddr = inp->inp_faddr.s_addr; 1931 v4->laddr = inp->inp_laddr.s_addr; 1932 v4->fport = inp->inp_fport; 1933 v4->lport = inp->inp_lport; 1934 1935 vtw->reuse_port = !!(inp->inp_socket->so_options 1936 & SO_REUSEPORT); 1937 vtw->reuse_addr = !!(inp->inp_socket->so_options 1938 & SO_REUSEADDR); 1939 vtw->v6only = 0; 1940 vtw->uid = inp->inp_socket->so_uidinfo->ui_uid; 1941 1942 vtw_inshash_v4(ctl, vtw); 1943 1944 1945 #ifdef VTW_DEBUG 1946 /* Immediate lookup (connected and port) to 1947 * ensure at least that works! 1948 */ 1949 if (enable & 4) { 1950 KASSERT(vtw_lookup_hash_v4 1951 (ctl 1952 , inp->inp_faddr.s_addr, inp->inp_fport 1953 , inp->inp_laddr.s_addr, inp->inp_lport 1954 , 0) 1955 == vtw); 1956 KASSERT(vtw_lookup_hash_v4 1957 (ctl 1958 , inp->inp_faddr.s_addr, inp->inp_fport 1959 , inp->inp_laddr.s_addr, inp->inp_lport 1960 , 1)); 1961 } 1962 /* Immediate port iterator functionality check: not wild 1963 */ 1964 if (enable & 8) { 1965 struct tcp_ports_iterator *it; 1966 struct vestigial_inpcb res; 1967 int cnt = 0; 1968 1969 it = tcp_init_ports_v4(inp->inp_laddr 1970 , inp->inp_lport, 0); 1971 1972 while (tcp_next_port_v4(it, &res)) { 1973 ++cnt; 1974 } 1975 KASSERT(cnt); 1976 } 1977 /* Immediate port iterator functionality check: wild 1978 */ 1979 if (enable & 16) { 1980 struct tcp_ports_iterator *it; 1981 struct vestigial_inpcb res; 1982 struct in_addr any; 1983 int cnt = 0; 1984 1985 any.s_addr = htonl(INADDR_ANY); 1986 1987 it = tcp_init_ports_v4(any, inp->inp_lport, 1); 1988 1989 while (tcp_next_port_v4(it, &res)) { 1990 ++cnt; 1991 } 1992 KASSERT(cnt); 1993 } 1994 #endif /* VTW_DEBUG */ 1995 break; 1996 } 1997 1998 case AF_INET6: { 1999 struct in6pcb *inp = tp->t_in6pcb; 2000 vtw_v6_t *v6 = (void*)vtw; 2001 2002 v6->faddr = inp->in6p_faddr; 2003 v6->laddr = inp->in6p_laddr; 2004 v6->fport = inp->in6p_fport; 2005 v6->lport = inp->in6p_lport; 2006 2007 vtw->reuse_port = !!(inp->in6p_socket->so_options 2008 & SO_REUSEPORT); 2009 vtw->reuse_addr = !!(inp->in6p_socket->so_options 2010 & SO_REUSEADDR); 2011 vtw->v6only = !!(inp->in6p_flags 2012 & IN6P_IPV6_V6ONLY); 2013 vtw->uid = inp->in6p_socket->so_uidinfo->ui_uid; 2014 2015 vtw_inshash_v6(ctl, vtw); 2016 #ifdef VTW_DEBUG 2017 /* Immediate lookup (connected and port) to 2018 * ensure at least that works! 
2019			 */
2020			if (enable & 4) {
2021				KASSERT(vtw_lookup_hash_v6(ctl
2022					, &inp->in6p_faddr, inp->in6p_fport
2023					, &inp->in6p_laddr, inp->in6p_lport
2024					, 0)
2025					== vtw);
2026				KASSERT(vtw_lookup_hash_v6
2027					(ctl
2028					, &inp->in6p_faddr, inp->in6p_fport
2029					, &inp->in6p_laddr, inp->in6p_lport
2030					, 1));
2031			}
2032			/* Immediate port iterator functionality check: not wild
2033			 */
2034			if (enable & 8) {
2035				struct tcp_ports_iterator *it;
2036				struct vestigial_inpcb res;
2037				int cnt = 0;
2038
2039				it = tcp_init_ports_v6(&inp->in6p_laddr
2040					, inp->in6p_lport, 0);
2041
2042				while (tcp_next_port_v6(it, &res)) {
2043					++cnt;
2044				}
2045				KASSERT(cnt);
2046			}
2047			/* Immediate port iterator functionality check: wild
2048			 */
2049			if (enable & 16) {
2050				struct tcp_ports_iterator *it;
2051				struct vestigial_inpcb res;
2052				static struct in6_addr any = IN6ADDR_ANY_INIT;
2053				int cnt = 0;
2054
2055				it = tcp_init_ports_v6(&any
2056					, inp->in6p_lport, 1);
2057
2058				while (tcp_next_port_v6(it, &res)) {
2059					++cnt;
2060				}
2061				KASSERT(cnt);
2062			}
2063 #endif /* VTW_DEBUG */
2064			break;
2065		}
2066		}
2067
2068		tcp_canceltimers(tp);
2069		tp = tcp_close(tp);
2070		KASSERT(!tp);
2071
2072		return 1;
2073	}
2074
2075	return 0;
2076 }
2077
2078 /*!\brief restart timer for vestigial time-wait entry
2079  */
2080 static void
2081 vtw_restart_v4(vestigial_inpcb_t *vp)
2082 {
2083	vtw_v4_t copy = *(vtw_v4_t*)vp->vtw;
2084	vtw_t *vtw;
2085	vtw_t *cp = &copy.common;
2086	vtw_ctl_t *ctl;
2087
2088	KASSERT(mutex_owned(softnet_lock));
2089
2090	db_trace(KTR_VTW
2091		 , (vp->vtw, "vtw: restart %A:%P %A:%P"
2092		    , vp->faddr.v4.s_addr, vp->fport
2093		    , vp->laddr.v4.s_addr, vp->lport));
2094
2095	/* Class might have changed, so have a squiz.
2096	 */
2097	ctl = vtw_control(AF_INET, class_to_msl(cp->msl_class));
2098	vtw = vtw_alloc(ctl);
2099
2100	if (vtw) {
2101		vtw_v4_t *v4 = (void*)vtw;
2102
2103		/* Safe now to unhash the old entry
2104		 */
2105		vtw_del(vp->ctl, vp->vtw);
2106
2107		vtw->snd_nxt = cp->snd_nxt;
2108		vtw->rcv_nxt = cp->rcv_nxt;
2109
2110		v4->faddr = copy.faddr;
2111		v4->laddr = copy.laddr;
2112		v4->fport = copy.fport;
2113		v4->lport = copy.lport;
2114
2115		vtw->reuse_port = cp->reuse_port;
2116		vtw->reuse_addr = cp->reuse_addr;
2117		vtw->v6only = 0;
2118		vtw->uid = cp->uid;
2119
2120		vtw_inshash_v4(ctl, vtw);
2121	}
2122
2123	vp->valid = 0;
2124 }
2125
2126 /*!\brief restart timer for vestigial time-wait entry
2127  */
2128 static void
2129 vtw_restart_v6(vestigial_inpcb_t *vp)
2130 {
2131	vtw_v6_t copy = *(vtw_v6_t*)vp->vtw;
2132	vtw_t *vtw;
2133	vtw_t *cp = &copy.common;
2134	vtw_ctl_t *ctl;
2135
2136	KASSERT(mutex_owned(softnet_lock));
2137
2138	db_trace(KTR_VTW
2139		 , (vp->vtw, "vtw: restart %6A:%P %6A:%P"
2140		    , db_store(&vp->faddr.v6, sizeof (vp->faddr.v6))
2141		    , vp->fport
2142		    , db_store(&vp->laddr.v6, sizeof (vp->laddr.v6))
2143		    , vp->lport));
2144
2145	/* Class might have changed, so have a squiz.
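	 * Note that we operate on a stack copy of the old entry:
	 * vtw_alloc() below may have to age out old entries (quite
	 * possibly including this one) to find room.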
2146 */ 2147 ctl = vtw_control(AF_INET6, class_to_msl(cp->msl_class)); 2148 vtw = vtw_alloc(ctl); 2149 2150 if (vtw) { 2151 vtw_v6_t *v6 = (void*)vtw; 2152 2153 /* Safe now to unhash the old entry 2154 */ 2155 vtw_del(vp->ctl, vp->vtw); 2156 2157 vtw->snd_nxt = cp->snd_nxt; 2158 vtw->rcv_nxt = cp->rcv_nxt; 2159 2160 v6->faddr = copy.faddr; 2161 v6->laddr = copy.laddr; 2162 v6->fport = copy.fport; 2163 v6->lport = copy.lport; 2164 2165 vtw->reuse_port = cp->reuse_port; 2166 vtw->reuse_addr = cp->reuse_addr; 2167 vtw->v6only = cp->v6only; 2168 vtw->uid = cp->uid; 2169 2170 vtw_inshash_v6(ctl, vtw); 2171 } 2172 2173 vp->valid = 0; 2174 } 2175 2176 /*!\brief restart timer for vestigial time-wait entry 2177 */ 2178 void 2179 vtw_restart(vestigial_inpcb_t *vp) 2180 { 2181 if (!vp || !vp->valid) 2182 return; 2183 2184 if (vp->v4) 2185 vtw_restart_v4(vp); 2186 else 2187 vtw_restart_v6(vp); 2188 } 2189 2190 int 2191 sysctl_tcp_vtw_enable(SYSCTLFN_ARGS) 2192 { 2193 int en, rc; 2194 struct sysctlnode node; 2195 2196 node = *rnode; 2197 en = *(int *)rnode->sysctl_data; 2198 node.sysctl_data = &en; 2199 2200 rc = sysctl_lookup(SYSCTLFN_CALL(&node)); 2201 if (rc != 0 || newp == NULL) 2202 return rc; 2203 2204 if (rnode->sysctl_data != &tcp4_vtw_enable && 2205 rnode->sysctl_data != &tcp6_vtw_enable) 2206 rc = ENOENT; 2207 else if ((en & 1) == 0) 2208 rc = 0; 2209 else if (rnode->sysctl_data == &tcp4_vtw_enable) 2210 rc = vtw_control_init(AF_INET); 2211 else /* rnode->sysctl_data == &tcp6_vtw_enable */ 2212 rc = vtw_control_init(AF_INET6); 2213 2214 if (rc == 0) 2215 *(int *)rnode->sysctl_data = en; 2216 2217 return rc; 2218 } 2219 2220 int 2221 vtw_earlyinit(void) 2222 { 2223 int i, rc; 2224 2225 callout_init(&vtw_cs, 0); 2226 callout_setfunc(&vtw_cs, vtw_tick, 0); 2227 2228 for (i = 0; i < VTW_NCLASS; ++i) { 2229 vtw_tcpv4[i].is_v4 = 1; 2230 vtw_tcpv6[i].is_v6 = 1; 2231 } 2232 2233 if ((tcp4_vtw_enable & 1) != 0 && 2234 (rc = vtw_control_init(AF_INET)) != 0) 2235 return rc; 2236 2237 if ((tcp6_vtw_enable & 1) != 0 && 2238 (rc = vtw_control_init(AF_INET6)) != 0) 2239 return rc; 2240 2241 return 0; 2242 } 2243 2244 #ifdef VTW_DEBUG 2245 #include <sys/syscallargs.h> 2246 #include <sys/sysctl.h> 2247 2248 /*!\brief add lalp, fafp entries for debug 2249 */ 2250 int 2251 vtw_debug_add(int af, sin_either_t *la, sin_either_t *fa, int msl, int msl_class) 2252 { 2253 vtw_ctl_t *ctl; 2254 vtw_t *vtw; 2255 2256 ctl = vtw_control(af, msl ? 
msl : class_to_msl(msl_class)); 2257 if (!ctl) 2258 return 0; 2259 2260 vtw = vtw_alloc(ctl); 2261 2262 if (vtw) { 2263 vtw->snd_nxt = 0; 2264 vtw->rcv_nxt = 0; 2265 2266 switch (af) { 2267 case AF_INET: { 2268 vtw_v4_t *v4 = (void*)vtw; 2269 2270 v4->faddr = fa->sin_addr.v4.s_addr; 2271 v4->laddr = la->sin_addr.v4.s_addr; 2272 v4->fport = fa->sin_port; 2273 v4->lport = la->sin_port; 2274 2275 vtw->reuse_port = 1; 2276 vtw->reuse_addr = 1; 2277 vtw->v6only = 0; 2278 vtw->uid = 0; 2279 2280 vtw_inshash_v4(ctl, vtw); 2281 break; 2282 } 2283 2284 case AF_INET6: { 2285 vtw_v6_t *v6 = (void*)vtw; 2286 2287 v6->faddr = fa->sin_addr.v6; 2288 v6->laddr = la->sin_addr.v6; 2289 2290 v6->fport = fa->sin_port; 2291 v6->lport = la->sin_port; 2292 2293 vtw->reuse_port = 1; 2294 vtw->reuse_addr = 1; 2295 vtw->v6only = 0; 2296 vtw->uid = 0; 2297 2298 vtw_inshash_v6(ctl, vtw); 2299 break; 2300 } 2301 2302 default: 2303 break; 2304 } 2305 2306 return 1; 2307 } 2308 2309 return 0; 2310 } 2311 2312 static int vtw_syscall = 0; 2313 2314 static int 2315 vtw_debug_process(vtw_sysargs_t *ap) 2316 { 2317 struct vestigial_inpcb vestige; 2318 int rc = 0; 2319 2320 mutex_enter(softnet_lock); 2321 2322 switch (ap->op) { 2323 case 0: // insert 2324 vtw_debug_add(ap->la.sin_family 2325 , &ap->la 2326 , &ap->fa 2327 , TCPTV_MSL 2328 , 0); 2329 break; 2330 2331 case 1: // lookup 2332 case 2: // restart 2333 switch (ap->la.sin_family) { 2334 case AF_INET: 2335 if (tcp_lookup_v4(ap->fa.sin_addr.v4, ap->fa.sin_port, 2336 ap->la.sin_addr.v4, ap->la.sin_port, 2337 &vestige)) { 2338 if (ap->op == 2) { 2339 vtw_restart(&vestige); 2340 } 2341 rc = 0; 2342 } else 2343 rc = ESRCH; 2344 break; 2345 2346 case AF_INET6: 2347 if (tcp_lookup_v6(&ap->fa.sin_addr.v6, ap->fa.sin_port, 2348 &ap->la.sin_addr.v6, ap->la.sin_port, 2349 &vestige)) { 2350 if (ap->op == 2) { 2351 vtw_restart(&vestige); 2352 } 2353 rc = 0; 2354 } else 2355 rc = ESRCH; 2356 break; 2357 default: 2358 rc = EINVAL; 2359 } 2360 break; 2361 2362 default: 2363 rc = EINVAL; 2364 } 2365 2366 mutex_exit(softnet_lock); 2367 return rc; 2368 } 2369 2370 struct sys_vtw_args { 2371 syscallarg(const vtw_sysargs_t *) req; 2372 syscallarg(size_t) len; 2373 }; 2374 2375 static int 2376 vtw_sys(struct lwp *l, const void *_, register_t *retval) 2377 { 2378 const struct sys_vtw_args *uap = _; 2379 void *buf; 2380 int rc; 2381 size_t len = SCARG(uap, len); 2382 2383 if (len != sizeof (vtw_sysargs_t)) 2384 return EINVAL; 2385 2386 buf = kmem_alloc(len, KM_SLEEP); 2387 if (!buf) 2388 return ENOMEM; 2389 2390 rc = copyin(SCARG(uap, req), buf, len); 2391 if (!rc) { 2392 rc = vtw_debug_process(buf); 2393 } 2394 kmem_free(buf, len); 2395 2396 return rc; 2397 } 2398 2399 static void 2400 vtw_sanity_check(void) 2401 { 2402 vtw_ctl_t *ctl; 2403 vtw_t *vtw; 2404 int i; 2405 int n; 2406 2407 for (i = 0; i < VTW_NCLASS; ++i) { 2408 ctl = &vtw_tcpv4[i]; 2409 2410 if (!ctl->base.v || ctl->nalloc) 2411 continue; 2412 2413 for (n = 0, vtw = ctl->base.v; ; ) { 2414 ++n; 2415 vtw = vtw_next(ctl, vtw); 2416 if (vtw == ctl->base.v) 2417 break; 2418 } 2419 db_trace(KTR_VTW 2420 , (ctl, "sanity: class %x n %x nfree %x" 2421 , i, n, ctl->nfree)); 2422 2423 KASSERT(n == ctl->nfree); 2424 } 2425 2426 for (i = 0; i < VTW_NCLASS; ++i) { 2427 ctl = &vtw_tcpv6[i]; 2428 2429 if (!ctl->base.v || ctl->nalloc) 2430 continue; 2431 2432 for (n = 0, vtw = ctl->base.v; ; ) { 2433 ++n; 2434 vtw = vtw_next(ctl, vtw); 2435 if (vtw == ctl->base.v) 2436 break; 2437 } 2438 db_trace(KTR_VTW 2439 , (ctl, "sanity: class %x n %x 
nfree %x" 2440 , i, n, ctl->nfree)); 2441 KASSERT(n == ctl->nfree); 2442 } 2443 } 2444 2445 /*!\brief Initialise debug support. 2446 */ 2447 static void 2448 vtw_debug_init(void) 2449 { 2450 int i; 2451 2452 vtw_sanity_check(); 2453 2454 if (vtw_syscall) 2455 return; 2456 2457 for (i = 511; i; --i) { 2458 if (sysent[i].sy_call == sys_nosys) { 2459 sysent[i].sy_call = vtw_sys; 2460 sysent[i].sy_narg = 2; 2461 sysent[i].sy_argsize = sizeof (struct sys_vtw_args); 2462 sysent[i].sy_flags = 0; 2463 2464 vtw_syscall = i; 2465 break; 2466 } 2467 } 2468 if (i) { 2469 const struct sysctlnode *node; 2470 uint32_t flags; 2471 2472 flags = sysctl_root.sysctl_flags; 2473 2474 sysctl_root.sysctl_flags |= CTLFLAG_READWRITE; 2475 sysctl_root.sysctl_flags &= ~CTLFLAG_PERMANENT; 2476 2477 sysctl_createv(0, 0, 0, &node, 2478 CTLFLAG_PERMANENT, CTLTYPE_NODE, 2479 "koff", 2480 SYSCTL_DESCR("Kernel Obscure Feature Finder"), 2481 0, 0, 0, 0, CTL_CREATE, CTL_EOL); 2482 2483 if (!node) { 2484 sysctl_createv(0, 0, 0, &node, 2485 CTLFLAG_PERMANENT, CTLTYPE_NODE, 2486 "koffka", 2487 SYSCTL_DESCR("The Real(tm) Kernel" 2488 " Obscure Feature Finder"), 2489 0, 0, 0, 0, CTL_CREATE, CTL_EOL); 2490 } 2491 if (node) { 2492 sysctl_createv(0, 0, 0, 0, 2493 CTLFLAG_PERMANENT|CTLFLAG_READONLY, 2494 CTLTYPE_INT, "vtw_debug_syscall", 2495 SYSCTL_DESCR("vtw debug" 2496 " system call number"), 2497 0, 0, &vtw_syscall, 0, node->sysctl_num, 2498 CTL_CREATE, CTL_EOL); 2499 } 2500 sysctl_root.sysctl_flags = flags; 2501 } 2502 } 2503 #else /* !VTW_DEBUG */ 2504 static void 2505 vtw_debug_init(void) 2506 { 2507 return; 2508 } 2509 #endif /* !VTW_DEBUG */ 2510
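
/*
 * Appendix (illustrative sketch, not part of the build): the
 * self-verifying index encoding used by idx_encode()/idx_decode()
 * above.  A valid index occupies the low idx_bits; encode stores a
 * second copy in the high bits, so decode can reject any 32-bit value
 * that is not a faithful encoding.  This is what lets vtw_from_index()
 * treat a corrupt or mismatched hash tag as "no entry" rather than
 * dereference a bogus index:
 *
 *	uint32_t encode(uint32_t idx)  { return (idx << idx_bits) | idx; }
 *	uint32_t decode(uint32_t bits)
 *	{
 *		uint32_t idx = bits & idx_mask;
 *
 *		return (encode(idx) == bits) ? idx : ~0U;	// ~0U: reject
 *	}
 */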