1 /* 2 * Copyright (c) 2011 The NetBSD Foundation, Inc. 3 * All rights reserved. 4 * 5 * This code is derived from software contributed to The NetBSD Foundation 6 * by Coyote Point Systems, Inc. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 18 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 19 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 20 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 21 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 27 * POSSIBILITY OF SUCH DAMAGE. 28 */ 29 30 /* 31 * Reduces the resources demanded by TCP sessions in TIME_WAIT-state using 32 * methods called Vestigial Time-Wait (VTW) and Maximum Segment Lifetime 33 * Truncation (MSLT). 34 * 35 * MSLT and VTW were contributed by Coyote Point Systems, Inc. 36 * 37 * Even after a TCP session enters the TIME_WAIT state, its corresponding 38 * socket and protocol control blocks (PCBs) stick around until the TCP 39 * Maximum Segment Lifetime (MSL) expires. 
On a host whose workload 40 * necessarily creates and closes down many TCP sockets, the sockets & PCBs 41 * for TCP sessions in TIME_WAIT state amount to many megabytes of dead 42 * weight in RAM. 43 * 44 * Maximum Segment Lifetimes Truncation (MSLT) assigns each TCP session to 45 * a class based on the nearness of the peer. Corresponding to each class 46 * is an MSL, and a session uses the MSL of its class. The classes are 47 * loopback (local host equals remote host), local (local host and remote 48 * host are on the same link/subnet), and remote (local host and remote 49 * host communicate via one or more gateways). Classes corresponding to 50 * nearer peers have lower MSLs by default: 2 seconds for loopback, 10 51 * seconds for local, 60 seconds for remote. Loopback and local sessions 52 * expire more quickly when MSLT is used. 53 * 54 * Vestigial Time-Wait (VTW) replaces a TIME_WAIT session's PCB/socket 55 * dead weight with a compact representation of the session, called a 56 * "vestigial PCB". VTW data structures are designed to be very fast and 57 * memory-efficient: for fast insertion and lookup of vestigial PCBs, 58 * the PCBs are stored in a hash table that is designed to minimize the 59 * number of cacheline visits per lookup/insertion. The memory both 60 * for vestigial PCBs and for elements of the PCB hashtable come from 61 * fixed-size pools, and linked data structures exploit this to conserve 62 * memory by representing references with a narrow index/offset from the 63 * start of a pool instead of a pointer. When space for new vestigial PCBs 64 * runs out, VTW makes room by discarding old vestigial PCBs, oldest first. 65 * VTW cooperates with MSLT. 66 * 67 * It may help to think of VTW as a "FIN cache" by analogy to the SYN 68 * cache. 69 * 70 * A 2.8-GHz Pentium 4 running a test workload that creates TIME_WAIT 71 * sessions as fast as it can is approximately 17% idle when VTW is active 72 * versus 0% idle when VTW is inactive. 
It has 103 megabytes more free RAM 73 * when VTW is active (approximately 64k vestigial PCBs are created) than 74 * when it is inactive. 75 */ 76 77 #include <sys/cdefs.h> 78 79 #ifdef _KERNEL_OPT 80 #include "opt_ddb.h" 81 #include "opt_inet.h" 82 #include "opt_inet_csum.h" 83 #include "opt_tcp_debug.h" 84 #endif 85 86 #include <sys/param.h> 87 #include <sys/systm.h> 88 #include <sys/kmem.h> 89 #include <sys/mbuf.h> 90 #include <sys/protosw.h> 91 #include <sys/socket.h> 92 #include <sys/socketvar.h> 93 #include <sys/errno.h> 94 #include <sys/syslog.h> 95 #include <sys/pool.h> 96 #include <sys/domain.h> 97 #include <sys/kernel.h> 98 #include <net/if.h> 99 #include <net/if_types.h> 100 101 #include <netinet/in.h> 102 #include <netinet/in_systm.h> 103 #include <netinet/ip.h> 104 #include <netinet/in_pcb.h> 105 #include <netinet/in_var.h> 106 #include <netinet/ip_var.h> 107 #include <netinet/in_offload.h> 108 #include <netinet/ip6.h> 109 #include <netinet6/ip6_var.h> 110 #include <netinet6/in6_pcb.h> 111 #include <netinet6/ip6_var.h> 112 #include <netinet6/in6_var.h> 113 #include <netinet/icmp6.h> 114 115 #include <netinet/tcp.h> 116 #include <netinet/tcp_fsm.h> 117 #include <netinet/tcp_seq.h> 118 #include <netinet/tcp_timer.h> 119 #include <netinet/tcp_var.h> 120 #include <netinet/tcp_private.h> 121 122 #include <netinet/tcp_vtw.h> 123 124 __KERNEL_RCSID(0, "$NetBSD: tcp_vtw.c,v 1.20 2019/10/01 18:00:09 chs Exp $"); 125 126 #define db_trace(__a, __b) do { } while (/*CONSTCOND*/0) 127 128 static void vtw_debug_init(void); 129 130 fatp_ctl_t fat_tcpv4; 131 fatp_ctl_t fat_tcpv6; 132 vtw_ctl_t vtw_tcpv4[VTW_NCLASS]; 133 vtw_ctl_t vtw_tcpv6[VTW_NCLASS]; 134 vtw_stats_t vtw_stats; 135 136 /* We provide state for the lookup_ports iterator. 137 * As currently we are netlock-protected, there is one. 138 * If we were finer-grain, we would have one per CPU. 139 * I do not want to be in the business of alloc/free. 
* The best alternate would be allocate on the caller's
 * stack, but that would require them to know the struct,
 * or at least the size.
 * See how she goes.
 */
struct tcp_ports_iterator {
	/* Not read by the iterator code visible in this part of the file. */
	union {
		struct in_addr	v4;
		struct in6_addr	v6;
	}		addr;
	u_int		port;		/* local port being iterated */

	uint32_t	wild	: 1;

	vtw_ctl_t	*ctl;		/* arena the iterator walks */
	fatp_t		*fp;		/* current fatp in the port chain, 0 = restart */

	uint16_t	slot_idx;	/* next slot of *fp to examine */
	uint16_t	ctl_idx;
};

static struct tcp_ports_iterator tcp_ports_iterator_v4;
static struct tcp_ports_iterator tcp_ports_iterator_v6;

static int vtw_age(vtw_ctl_t *, struct timeval *);

/*!\brief allocate a fat pointer from a collection.
 *
 *\returns the head of the free list, unlinked (nxt cleared), or 0 when
 *	   the free list is empty.
 */
static fatp_t *
fatp_alloc(fatp_ctl_t *fat)
{
	fatp_t	*fp	= 0;

	if (fat->nfree) {
		fp = fat->free;
		if (fp) {
			fat->free = fatp_next(fat, fp);
			--fat->nfree;
			++fat->nalloc;
			fp->nxt = 0;

			KASSERT(!fp->inuse);
		}
	}

	return fp;
}

/*!\brief free a fat pointer.
 *
 * Pushes fp on the head of the free list.  The caller must already have
 * cleared fp->inuse and fp->nxt (both are asserted).  A null fp is ignored.
 */
static void
fatp_free(fatp_ctl_t *fat, fatp_t *fp)
{
	if (fp) {
		KASSERT(!fp->inuse);
		KASSERT(!fp->nxt);

		fp->nxt = fatp_index(fat, fat->free);
		fat->free = fp;

		++fat->nfree;
		--fat->nalloc;
	}
}

/*!\brief initialise a collection of fat pointers.
 *
 *\param n	# fat pointers per hash; 2*n entries starting at fat_base
 *		are placed on the free list
 *\param m	# hash buckets per hash; must be a power of 2, as m - 1
 *		is used as the bucket mask
 *\param fat_base	storage for the fat pointers
 *\param fat_hash	bucket array; the lport-only hash starts at
 *			fat_hash[m]
 *
 * We allocate 2x as much, as we have two hashes: full and lport only.
 */
static void
fatp_init(fatp_ctl_t *fat, uint32_t n, uint32_t m,
    fatp_t *fat_base, fatp_t **fat_hash)
{
	fatp_t	*fp;

	KASSERT(n <= FATP_MAX / 2);

	fat->hash = fat_hash;
	fat->base = fat_base;

	fat->port = &fat->hash[m];

	fat->mask   = m - 1;	// ASSERT is power of 2 (m)
	fat->lim    = fat->base + 2*n - 1;
	fat->nfree  = 0;
	/* Every fatp_free() below decrements nalloc, so start it at 2*n. */
	fat->nalloc = 2*n;

	/* Initialise the free list.
	 */
	for (fp = fat->lim; fp >= fat->base; --fp) {
		fatp_free(fat, fp);
	}
}

/*
 * The `xtra' is XORed into the tag stored.
 */
static uint32_t	fatp_xtra[] = {
	0x11111111,0x22222222,0x33333333,0x44444444,
	0x55555555,0x66666666,0x77777777,0x88888888,
	0x12121212,0x21212121,0x34343434,0x43434343,
	0x56565656,0x65656565,0x78787878,0x87878787,
	0x11221122,0x22112211,0x33443344,0x44334433,
	0x55665566,0x66556655,0x77887788,0x88778877,
	0x11112222,0x22221111,0x33334444,0x44443333,
	0x55556666,0x66665555,0x77778888,0x88887777,
};

/*!\brief turn a {fatp_t*,slot} into an integral key.
 *
 * The key can be used to obtain the fatp_t, and the slot,
 * as it directly encodes them.
 */
static inline uint32_t
fatp_key(fatp_ctl_t *fat, fatp_t *fp, uint32_t slot)
{
	CTASSERT(CACHE_LINE_SIZE == 32 ||
	         CACHE_LINE_SIZE == 64 ||
		 CACHE_LINE_SIZE == 128);

	switch (fatp_ntags()) {
	case 7:
		return (fatp_index(fat, fp) << 3) | slot;
	case 15:
		return (fatp_index(fat, fp) << 4) | slot;
	case 31:
		return (fatp_index(fat, fp) << 5) | slot;
	default:
		KASSERT(0 && "no support, for no good reason");
		return ~0;
	}
}

/*!\brief extract the slot number from a key built by fatp_key().
 */
static inline uint32_t
fatp_slot_from_key(fatp_ctl_t *fat, uint32_t key)
{
	CTASSERT(CACHE_LINE_SIZE == 32 ||
	         CACHE_LINE_SIZE == 64 ||
		 CACHE_LINE_SIZE == 128);

	switch (fatp_ntags()) {
	case 7:
		return key & 7;
	case 15:
		return key & 15;
	case 31:
		return key & 31;
	default:
		KASSERT(0 && "no support, for no good reason");
		return ~0;
	}
}

/*!\brief extract the fatp_t from a key built by fatp_key().
 *
 *\returns the fat pointer, or 0 when the index part of the key is 0.
 */
static inline fatp_t *
fatp_from_key(fatp_ctl_t *fat, uint32_t key)
{
	CTASSERT(CACHE_LINE_SIZE == 32 ||
	         CACHE_LINE_SIZE == 64 ||
		 CACHE_LINE_SIZE == 128);

	switch (fatp_ntags()) {
	case 7:
		key >>= 3;
		break;
	case 15:
		key >>= 4;
		break;
	case 31:
		key >>= 5;
		break;
	default:
		KASSERT(0 && "no support, for no good reason");
		return 0;
	}

	/* Stored indices are 1-based; 0 means "no fat pointer". */
	return key ? fat->base + key - 1 : 0;
}

/*!\brief replicate idx into the bits above ctl->idx_bits.
 *
 * idx_decode() uses the duplicate copy to reject values that are not
 * encoded indices.
 */
static inline uint32_t
idx_encode(vtw_ctl_t *ctl, uint32_t idx)
{
	return (idx << ctl->idx_bits) | idx;
}

/*!\brief recover an index from idx_encode() output.
 *
 *\returns the index, or ~0 when bits is not a valid encoding.
 */
static inline uint32_t
idx_decode(vtw_ctl_t *ctl, uint32_t bits)
{
	uint32_t	idx	= bits & ctl->idx_mask;

	if (idx_encode(ctl, idx) == bits)
		return idx;
	else
		return ~0;
}

/*!\brief	insert index into fatp hash
 *
 *\param	idx	-	index of element being placed in hash chain
 *\param	tag	-	32-bit tag identifier
 *\param	which	-	0 selects fat->hash, non-zero selects fat->port
 *\param	dbg	-	not used by the visible code
 *
 *\returns
 *	value which can be used to locate entry.
 *
 *\note
 *	we rely on the fact that there are unused high bits in the index
 *	for verification purposes on lookup.
 */

static inline uint32_t
fatp_vtw_inshash(fatp_ctl_t *fat, uint32_t idx, uint32_t tag, int which,
    void *dbg)
{
	fatp_t	*fp;
	fatp_t	**hash = (which ? fat->port : fat->hash);
	int	i;

	fp = hash[tag & fat->mask];

	while (!fp || fatp_full(fp)) {
		fatp_t	*fq;

		/* All entries are inuse at the top level.
		 * We allocate a spare, and push the top level
		 * down one.  All entries in the fp we push down
		 * (think of a tape worm here) will be expelled sooner than
		 * any entries added subsequently to this hash bucket.
		 * This is a property of the time waits we are exploiting.
		 */

		fq = fatp_alloc(fat);
		if (!fq) {
			/* NOTE(review): this retries until aging releases a
			 * fat pointer; nothing here bounds the number of
			 * retries -- confirm vtw_age() always makes progress.
			 */
			vtw_age(fat->vtw, 0);
			fp = hash[tag & fat->mask];
			continue;
		}

		fq->inuse = 0;
		fq->nxt   = fatp_index(fat, fp);

		hash[tag & fat->mask] = fq;

		fp = fq;
	}

	KASSERT(!fatp_full(fp));

	/* Fill highest index first.  Lookup is lowest first.
	 */
	for (i = fatp_ntags(); --i >= 0; ) {
		if (!((1 << i) & fp->inuse)) {
			break;
		}
	}

	fp->inuse |= 1 << i;
	fp->tag[i] = tag ^ idx_encode(fat->vtw, idx) ^ fatp_xtra[i];

	db_trace(KTR_VTW
		 , (fp, "fat: inuse %5.5x tag[%x] %8.8x"
		    , fp->inuse
		    , i, fp->tag[i]));

	return fatp_key(fat, fp, i);
}

/*!\brief an entry is alive when it is hashed and has a non-zero expiry.
 */
static inline int
vtw_alive(const vtw_t *vtw)
{
	return vtw->hashed && vtw->expire.tv_sec;
}

/*!\brief index of a v4 entry within its arena, or ~0 if out of bounds.
 */
static inline uint32_t
vtw_index_v4(vtw_ctl_t *ctl, vtw_v4_t *v4)
{
	if (ctl->base.v4 <= v4 && v4 <= ctl->lim.v4)
		return v4 - ctl->base.v4;

	KASSERT(0 && "vtw out of bounds");

	return ~0;
}

/*!\brief index of a v6 entry within its arena, or ~0 if out of bounds.
 */
static inline uint32_t
vtw_index_v6(vtw_ctl_t *ctl, vtw_v6_t *v6)
{
	if (ctl->base.v6 <= v6 && v6 <= ctl->lim.v6)
		return v6 - ctl->base.v6;

	KASSERT(0 && "vtw out of bounds");

	return ~0;
}

/*!\brief index of an entry, always relative to the parent arena.
 */
static inline uint32_t
vtw_index(vtw_ctl_t *ctl, vtw_t *vtw)
{
	/* A per-class ctl (clidx != 0) defers to the ctl it points at. */
	if (ctl->clidx)
		ctl = ctl->ctl;

	if (ctl->is_v4)
		return vtw_index_v4(ctl, (vtw_v4_t *)vtw);

	if (ctl->is_v6)
		return vtw_index_v6(ctl, (vtw_v6_t *)vtw);

	KASSERT(0 && "neither 4 nor 6.  most curious.");

	return ~0;
}

/*!\brief map an encoded index back to its entry.
 *
 *\returns the entry, or 0 when idx is not a valid encoding or falls
 *	   outside the arena.
 */
static inline vtw_t *
vtw_from_index(vtw_ctl_t *ctl, uint32_t idx)
{
	if (ctl->clidx)
		ctl = ctl->ctl;

	/* See if the index looks like it might be an index.
	 * Bits on outside of the valid index bits is a give away.
	 */
	idx = idx_decode(ctl, idx);

	if (idx == ~0) {
		return 0;
	} else if (ctl->is_v4) {
		vtw_v4_t	*vtw = ctl->base.v4 + idx;

		return (ctl->base.v4 <= vtw && vtw <= ctl->lim.v4)
			? &vtw->common : 0;
	} else if (ctl->is_v6) {
		vtw_v6_t	*vtw = ctl->base.v6 + idx;

		return (ctl->base.v6 <= vtw && vtw <= ctl->lim.v6)
			? &vtw->common : 0;
	} else {
		KASSERT(0 && "badness");
		return 0;
	}
}

/*!\brief return the next vtw after this one.
 *
 * Due to the differing sizes of the entries in differing
 * arenas, we have to ensure we ++ the correct pointer type.
 *
 * Also handles wrap.
 */
static inline vtw_t *
vtw_next(vtw_ctl_t *ctl, vtw_t *vtw)
{
	if (ctl->is_v4) {
		vtw_v4_t	*v4 = (void*)vtw;

		vtw = &(++v4)->common;
	} else {
		vtw_v6_t	*v6 = (void*)vtw;

		vtw = &(++v6)->common;
	}

	if (vtw > ctl->lim.v)
		vtw = ctl->base.v;

	return vtw;
}

/*!\brief	remove entry from FATP hash chains
 *
 * Clears the entry's slot in both the full hash (fat->hash) and the
 * lport-only hash (fat->port).  A fat pointer that becomes empty is
 * unlinked and freed, unless it is the sole element of its bucket, in
 * which case it is retained for reuse.
 */
static inline void
vtw_unhash(vtw_ctl_t *ctl, vtw_t *vtw)
{
	fatp_ctl_t	*fat	= ctl->fat;
	fatp_t		*fp;
	uint32_t	key = vtw->key;
	uint32_t	tag, slot, idx;
	vtw_v4_t	*v4 = (void*)vtw;
	vtw_v6_t	*v6 = (void*)vtw;

	if (!vtw->hashed) {
		KASSERT(0 && "unhashed");
		return;
	}

	if (fat->vtw->is_v4) {
		tag = v4_tag(v4->faddr, v4->fport, v4->laddr, v4->lport);
	} else if (fat->vtw->is_v6) {
		tag = v6_tag(&v6->faddr, v6->fport, &v6->laddr, v6->lport);
	} else {
		tag = 0;
		KASSERT(0 && "not reached");
	}

	/* Remove from fat->hash[]
	 */
	slot = fatp_slot_from_key(fat, key);
	fp   = fatp_from_key(fat, key);
	idx  = vtw_index(ctl, vtw);

	db_trace(KTR_VTW
		 , (fp, "fat: del inuse %5.5x slot %x idx %x key %x tag %x"
		    , fp->inuse, slot, idx, key, tag));

	KASSERT(fp->inuse & (1 << slot));
	KASSERT(fp->tag[slot] == (tag ^ idx_encode(ctl, idx)
				  ^ fatp_xtra[slot]));

	if ((fp->inuse & (1 << slot))
	    && fp->tag[slot] == (tag ^ idx_encode(ctl, idx)
				 ^ fatp_xtra[slot])) {
		fp->inuse ^= 1 << slot;
		fp->tag[slot] = 0;

		/* When we delete entries, we do not compact.  This is
		 * due to temporality.  We add entries, and they
		 * (eventually) expire. Older entries will be further
		 * down the chain.
		 */
		if (!fp->inuse) {
			uint32_t hi = tag & fat->mask;
			fatp_t	*fq = 0;
			fatp_t	*fr = fat->hash[hi];

			/* Walk the bucket; fq trails fr as predecessor. */
			while (fr && fr != fp) {
				fr = fatp_next(fat, fq = fr);
			}

			if (fr == fp) {
				if (fq) {
					fq->nxt = fp->nxt;
					fp->nxt = 0;
					fatp_free(fat, fp);
				} else {
					KASSERT(fat->hash[hi] == fp);

					if (fp->nxt) {
						fat->hash[hi]
							= fatp_next(fat, fp);
						fp->nxt = 0;
						fatp_free(fat, fp);
					} else {
						/* retain for next use.
						 */
						;
					}
				}
			} else {
				/* fp was not on the chain its tag selects:
				 * trace the chain, then assert.
				 */
				fr = fat->hash[hi];

				do {
					db_trace(KTR_VTW
						 , (fr
						    , "fat:*del inuse %5.5x"
						    " nxt %x"
						    , fr->inuse, fr->nxt));

					fr = fatp_next(fat, fq = fr);
				} while (fr && fr != fp);

				KASSERT(0 && "oops");
			}
		}
		/* Invalidate the stored key. */
		vtw->key ^= ~0;
	}

	if (fat->vtw->is_v4) {
		tag = v4_port_tag(v4->lport);
	} else if (fat->vtw->is_v6) {
		tag = v6_port_tag(v6->lport);
	}

	/* Remove from fat->port[]
	 */
	key  = vtw->port_key;
	slot = fatp_slot_from_key(fat, key);
	fp   = fatp_from_key(fat, key);
	idx  = vtw_index(ctl, vtw);

	db_trace(KTR_VTW
		 , (fp, "fatport: del inuse %5.5x"
		    " slot %x idx %x key %x tag %x"
		    , fp->inuse, slot, idx, key, tag));

	KASSERT(fp->inuse & (1 << slot));
	KASSERT(fp->tag[slot] == (tag ^ idx_encode(ctl, idx)
				  ^ fatp_xtra[slot]));

	if ((fp->inuse & (1 << slot))
	    && fp->tag[slot] == (tag ^ idx_encode(ctl, idx)
				 ^ fatp_xtra[slot])) {
		fp->inuse ^= 1 << slot;
		fp->tag[slot] = 0;

		if (!fp->inuse) {
			uint32_t hi = tag & fat->mask;
			fatp_t	*fq = 0;
			fatp_t	*fr = fat->port[hi];

			while (fr && fr != fp) {
				fr = fatp_next(fat, fq = fr);
			}

			if (fr == fp) {
				if (fq) {
					fq->nxt = fp->nxt;
					fp->nxt = 0;
					fatp_free(fat, fp);
				} else {
					KASSERT(fat->port[hi] == fp);

					if (fp->nxt) {
						fat->port[hi]
							= fatp_next(fat, fp);
						fp->nxt = 0;
						fatp_free(fat, fp);
					} else {
						/* retain for next use.
						 */
						;
					}
				}
			}
		}
		vtw->port_key ^= ~0;
	}

	vtw->hashed = 0;
}

/*!\brief	remove entry from hash, possibly free.
 *
 * The entry is always unhashed, but its slot is only returned to the
 * arena when it is the oldest entry of ctl.  Must be called with
 * softnet_lock held (asserted).
 */
void
vtw_del(vtw_ctl_t *ctl, vtw_t *vtw)
{
	KASSERT(mutex_owned(softnet_lock));

	if (vtw->hashed) {
		++vtw_stats.del;
		vtw_unhash(ctl, vtw);
	}

	/* We only delete the oldest entry.
	 */
	if (vtw != ctl->oldest.v)
		return;

	--ctl->nalloc;
	++ctl->nfree;

	vtw->expire.tv_sec = 0;
	vtw->expire.tv_usec = ~0;

	/* NOTE(review): this store is overwritten unconditionally by the
	 * assignment that follows, so oldest.v is never left at 0 here --
	 * confirm whether an `else' was intended.
	 */
	if (!ctl->nalloc)
		ctl->oldest.v = 0;

	ctl->oldest.v = vtw_next(ctl, vtw);
}

/*!\brief	insert vestigial timewait in hash chain
 *
 * Inserts the v4 entry in both the full hash and the lport-only hash,
 * recording the returned keys in vtw->key and vtw->port_key.
 */
static void
vtw_inshash_v4(vtw_ctl_t *ctl, vtw_t *vtw)
{
	uint32_t	idx	= vtw_index(ctl, vtw);
	uint32_t	tag;
	vtw_v4_t	*v4 = (void*)vtw;

	KASSERT(mutex_owned(softnet_lock));
	KASSERT(!vtw->hashed);
	KASSERT(ctl->clidx == vtw->msl_class);

	++vtw_stats.ins;

	tag = v4_tag(v4->faddr, v4->fport,
		     v4->laddr, v4->lport);

	vtw->key = fatp_vtw_inshash(ctl->fat, idx, tag, 0, vtw);

	db_trace(KTR_VTW, (ctl
			   , "vtw: ins %8.8x:%4.4x %8.8x:%4.4x"
			   " tag %8.8x key %8.8x"
			   , v4->faddr, v4->fport
			   , v4->laddr, v4->lport
			   , tag
			   , vtw->key));

	tag = v4_port_tag(v4->lport);
	vtw->port_key = fatp_vtw_inshash(ctl->fat, idx, tag, 1, vtw);

	db_trace(KTR_VTW, (ctl, "vtw: ins %P - %4.4x tag %8.8x key %8.8x"
			   , v4->lport, v4->lport
			   , tag
			   , vtw->key));

	vtw->hashed = 1;
}

/*!\brief	insert vestigial timewait in hash chain
 *
 * Inserts the v6 entry in both the full hash and the lport-only hash,
 * recording the returned keys in vtw->key and vtw->port_key.
 */
static void
vtw_inshash_v6(vtw_ctl_t *ctl, vtw_t *vtw)
{
	uint32_t	idx	= vtw_index(ctl, vtw);
	uint32_t	tag;
	vtw_v6_t	*v6	= (void*)vtw;

	KASSERT(mutex_owned(softnet_lock));
	KASSERT(!vtw->hashed);
	KASSERT(ctl->clidx == vtw->msl_class);

	++vtw_stats.ins;

	tag = v6_tag(&v6->faddr, v6->fport,
		     &v6->laddr, v6->lport);

	vtw->key = fatp_vtw_inshash(ctl->fat, idx, tag, 0, vtw);

	tag = v6_port_tag(v6->lport);
	vtw->port_key = fatp_vtw_inshash(ctl->fat, idx, tag, 1, vtw);

	db_trace(KTR_VTW, (ctl, "vtw: ins %P - %4.4x tag %8.8x key %8.8x"
			   , v6->lport, v6->lport
			   , tag
			   , vtw->key));

	vtw->hashed = 1;
}

/*!\brief look up a v4 vestigial entry.
 *
 *\param which	0: match the full 4-tuple via fat->hash;
 *		non-zero: match lport only via fat->port.
 *
 *\returns the first live matching entry, or 0.  Updates the look, probe,
 *	   hit, losing, miss and max_* counters in vtw_stats[which].
 */
static vtw_t *
vtw_lookup_hash_v4(vtw_ctl_t *ctl, uint32_t faddr, uint16_t fport
				 , uint32_t laddr, uint16_t lport
				 , int which)
{
	vtw_v4_t	*v4;
	vtw_t		*vtw;
	uint32_t	tag;
	fatp_t		*fp;
	int		i;
	uint32_t	fatps = 0, probes = 0, losings = 0;

	if (!ctl || !ctl->fat)
		return 0;

	++vtw_stats.look[which];

	if (which) {
		tag = v4_port_tag(lport);
		fp  = ctl->fat->port[tag & ctl->fat->mask];
	} else {
		tag = v4_tag(faddr, fport, laddr, lport);
		fp  = ctl->fat->hash[tag & ctl->fat->mask];
	}

	while (fp && fp->inuse) {
		uint32_t	inuse = fp->inuse;

		++fatps;

		for (i = 0; inuse && i < fatp_ntags(); ++i) {
			uint32_t	idx;

			if (!(inuse & (1 << i)))
				continue;

			/* Clear the bit so the loop stops once no used
			 * slots remain.
			 */
			inuse ^= 1 << i;

			++probes;
			++vtw_stats.probe[which];

			/* Undo the XORs applied at insertion; a stored tag
			 * from a different key decodes to an invalid index.
			 */
			idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
			vtw = vtw_from_index(ctl, idx);

			if (!vtw) {
				/* Hopefully fast path.
				 */
				db_trace(KTR_VTW
					 , (fp, "vtw: fast %A:%P %A:%P"
					    " idx %x tag %x"
					    , faddr, fport
					    , laddr, lport
					    , idx, tag));
				continue;
			}

			v4 = (void*)vtw;

			/* The de-referencing of vtw is what we want to avoid.
			 * Losing.
			 */
			if (vtw_alive(vtw)
			    && ((which ? vtw->port_key : vtw->key)
				== fatp_key(ctl->fat, fp, i))
			    && (which
				|| (v4->faddr == faddr && v4->laddr == laddr
				    && v4->fport == fport))
			    && v4->lport == lport) {
				++vtw_stats.hit[which];

				db_trace(KTR_VTW
					 , (fp, "vtw: hit %8.8x:%4.4x"
					    " %8.8x:%4.4x idx %x key %x"
					    , faddr, fport
					    , laddr, lport
					    , idx_decode(ctl, idx), vtw->key));

				KASSERT(vtw->hashed);

				goto out;
			}
			++vtw_stats.losing[which];
			++losings;

			if (vtw_alive(vtw)) {
				db_trace(KTR_VTW
					 , (fp, "vtw:!mis %8.8x:%4.4x"
					    " %8.8x:%4.4x key %x tag %x"
					    , faddr, fport
					    , laddr, lport
					    , fatp_key(ctl->fat, fp, i)
					    , v4_tag(faddr, fport
						     , laddr, lport)));
				db_trace(KTR_VTW
					 , (vtw, "vtw:!mis %8.8x:%4.4x"
					    " %8.8x:%4.4x key %x tag %x"
					    , v4->faddr, v4->fport
					    , v4->laddr, v4->lport
					    , vtw->key
					    , v4_tag(v4->faddr, v4->fport
						     , v4->laddr, v4->lport)));

				if (vtw->key == fatp_key(ctl->fat, fp, i)) {
					db_trace(KTR_VTW
						 , (vtw, "vtw:!mis %8.8x:%4.4x"
						    " %8.8x:%4.4x key %x"
						    " which %x"
						    , v4->faddr, v4->fport
						    , v4->laddr, v4->lport
						    , vtw->key
						    , which));

				} else {
					db_trace(KTR_VTW
						 , (vtw
						    , "vtw:!mis"
						    " key %8.8x != %8.8x"
						    " idx %x i %x which %x"
						    , vtw->key
						    , fatp_key(ctl->fat, fp, i)
						    , idx_decode(ctl, idx)
						    , i
						    , which));
				}
			} else {
				db_trace(KTR_VTW
					 , (fp
					    , "vtw:!mis free entry"
					    " idx %x vtw %p which %x"
					    , idx_decode(ctl, idx)
					    , vtw, which));
			}
		}

		if (fp->nxt) {
			fp = fatp_next(ctl->fat, fp);
		} else {
			break;
		}
	}
	++vtw_stats.miss[which];
	vtw = 0;
out:
	if (fatps > vtw_stats.max_chain[which])
		vtw_stats.max_chain[which] = fatps;
	if (probes > vtw_stats.max_probe[which])
		vtw_stats.max_probe[which] = probes;
	if (losings > vtw_stats.max_loss[which])
		vtw_stats.max_loss[which] = losings;

	return vtw;
}

static
vtw_t * 923 vtw_lookup_hash_v6(vtw_ctl_t *ctl, const struct in6_addr *faddr, uint16_t fport 924 , const struct in6_addr *laddr, uint16_t lport 925 , int which) 926 { 927 vtw_v6_t *v6; 928 vtw_t *vtw; 929 uint32_t tag; 930 fatp_t *fp; 931 int i; 932 uint32_t fatps = 0, probes = 0, losings = 0; 933 934 ++vtw_stats.look[which]; 935 936 if (!ctl || !ctl->fat) 937 return 0; 938 939 if (which) { 940 tag = v6_port_tag(lport); 941 fp = ctl->fat->port[tag & ctl->fat->mask]; 942 } else { 943 tag = v6_tag(faddr, fport, laddr, lport); 944 fp = ctl->fat->hash[tag & ctl->fat->mask]; 945 } 946 947 while (fp && fp->inuse) { 948 uint32_t inuse = fp->inuse; 949 950 ++fatps; 951 952 for (i = 0; inuse && i < fatp_ntags(); ++i) { 953 uint32_t idx; 954 955 if (!(inuse & (1 << i))) 956 continue; 957 958 inuse ^= 1 << i; 959 960 ++probes; 961 ++vtw_stats.probe[which]; 962 963 idx = fp->tag[i] ^ tag ^ fatp_xtra[i]; 964 vtw = vtw_from_index(ctl, idx); 965 966 db_trace(KTR_VTW 967 , (fp, "probe: %2d %6A:%4.4x %6A:%4.4x idx %x" 968 , i 969 , db_store(faddr, sizeof (*faddr)), fport 970 , db_store(laddr, sizeof (*laddr)), lport 971 , idx_decode(ctl, idx))); 972 973 if (!vtw) { 974 /* Hopefully fast path. 975 */ 976 continue; 977 } 978 979 v6 = (void*)vtw; 980 981 if (vtw_alive(vtw) 982 && ((which ? 
vtw->port_key : vtw->key) 983 == fatp_key(ctl->fat, fp, i)) 984 && v6->lport == lport 985 && (which 986 || (v6->fport == fport 987 && !bcmp(&v6->faddr, faddr, sizeof (*faddr)) 988 && !bcmp(&v6->laddr, laddr 989 , sizeof (*laddr))))) { 990 ++vtw_stats.hit[which]; 991 992 KASSERT(vtw->hashed); 993 goto out; 994 } else { 995 ++vtw_stats.losing[which]; 996 ++losings; 997 } 998 } 999 1000 if (fp->nxt) { 1001 fp = fatp_next(ctl->fat, fp); 1002 } else { 1003 break; 1004 } 1005 } 1006 ++vtw_stats.miss[which]; 1007 vtw = 0; 1008 out: 1009 if (fatps > vtw_stats.max_chain[which]) 1010 vtw_stats.max_chain[which] = fatps; 1011 if (probes > vtw_stats.max_probe[which]) 1012 vtw_stats.max_probe[which] = probes; 1013 if (losings > vtw_stats.max_loss[which]) 1014 vtw_stats.max_loss[which] = losings; 1015 1016 return vtw; 1017 } 1018 1019 /*!\brief port iterator 1020 */ 1021 static vtw_t * 1022 vtw_next_port_v4(struct tcp_ports_iterator *it) 1023 { 1024 vtw_ctl_t *ctl = it->ctl; 1025 vtw_v4_t *v4; 1026 vtw_t *vtw; 1027 uint32_t tag; 1028 uint16_t lport = it->port; 1029 fatp_t *fp; 1030 int i; 1031 uint32_t fatps = 0, probes = 0, losings = 0; 1032 1033 tag = v4_port_tag(lport); 1034 if (!it->fp) { 1035 it->fp = ctl->fat->port[tag & ctl->fat->mask]; 1036 it->slot_idx = 0; 1037 } 1038 fp = it->fp; 1039 1040 while (fp) { 1041 uint32_t inuse = fp->inuse; 1042 1043 ++fatps; 1044 1045 for (i = it->slot_idx; inuse && i < fatp_ntags(); ++i) { 1046 uint32_t idx; 1047 1048 if (!(inuse & (1 << i))) 1049 continue; 1050 1051 inuse &= ~0U << i; 1052 1053 if (i < it->slot_idx) 1054 continue; 1055 1056 ++vtw_stats.probe[1]; 1057 ++probes; 1058 1059 idx = fp->tag[i] ^ tag ^ fatp_xtra[i]; 1060 vtw = vtw_from_index(ctl, idx); 1061 1062 if (!vtw) { 1063 /* Hopefully fast path. 
1064 */ 1065 continue; 1066 } 1067 1068 v4 = (void*)vtw; 1069 1070 if (vtw_alive(vtw) 1071 && vtw->port_key == fatp_key(ctl->fat, fp, i) 1072 && v4->lport == lport) { 1073 ++vtw_stats.hit[1]; 1074 1075 it->slot_idx = i + 1; 1076 1077 goto out; 1078 } else if (vtw_alive(vtw)) { 1079 ++vtw_stats.losing[1]; 1080 ++losings; 1081 1082 db_trace(KTR_VTW 1083 , (vtw, "vtw:!mis" 1084 " port %8.8x:%4.4x %8.8x:%4.4x" 1085 " key %x port %x" 1086 , v4->faddr, v4->fport 1087 , v4->laddr, v4->lport 1088 , vtw->key 1089 , lport)); 1090 } else { 1091 /* Really losing here. We are coming 1092 * up with references to free entries. 1093 * Might find it better to use 1094 * traditional, or need another 1095 * add-hockery. The other add-hockery 1096 * would be to pul more into into the 1097 * cache line to reject the false 1098 * hits. 1099 */ 1100 ++vtw_stats.losing[1]; 1101 ++losings; 1102 db_trace(KTR_VTW 1103 , (fp, "vtw:!mis port %x" 1104 " - free entry idx %x vtw %p" 1105 , lport 1106 , idx_decode(ctl, idx) 1107 , vtw)); 1108 } 1109 } 1110 1111 if (fp->nxt) { 1112 it->fp = fp = fatp_next(ctl->fat, fp); 1113 it->slot_idx = 0; 1114 } else { 1115 it->fp = 0; 1116 break; 1117 } 1118 } 1119 ++vtw_stats.miss[1]; 1120 1121 vtw = 0; 1122 out: 1123 if (fatps > vtw_stats.max_chain[1]) 1124 vtw_stats.max_chain[1] = fatps; 1125 if (probes > vtw_stats.max_probe[1]) 1126 vtw_stats.max_probe[1] = probes; 1127 if (losings > vtw_stats.max_loss[1]) 1128 vtw_stats.max_loss[1] = losings; 1129 1130 return vtw; 1131 } 1132 1133 /*!\brief port iterator 1134 */ 1135 static vtw_t * 1136 vtw_next_port_v6(struct tcp_ports_iterator *it) 1137 { 1138 vtw_ctl_t *ctl = it->ctl; 1139 vtw_v6_t *v6; 1140 vtw_t *vtw; 1141 uint32_t tag; 1142 uint16_t lport = it->port; 1143 fatp_t *fp; 1144 int i; 1145 uint32_t fatps = 0, probes = 0, losings = 0; 1146 1147 tag = v6_port_tag(lport); 1148 if (!it->fp) { 1149 it->fp = ctl->fat->port[tag & ctl->fat->mask]; 1150 it->slot_idx = 0; 1151 } 1152 fp = it->fp; 1153 1154 while 
(fp) { 1155 uint32_t inuse = fp->inuse; 1156 1157 ++fatps; 1158 1159 for (i = it->slot_idx; inuse && i < fatp_ntags(); ++i) { 1160 uint32_t idx; 1161 1162 if (!(inuse & (1 << i))) 1163 continue; 1164 1165 inuse &= ~0U << i; 1166 1167 if (i < it->slot_idx) 1168 continue; 1169 1170 ++vtw_stats.probe[1]; 1171 ++probes; 1172 1173 idx = fp->tag[i] ^ tag ^ fatp_xtra[i]; 1174 vtw = vtw_from_index(ctl, idx); 1175 1176 if (!vtw) { 1177 /* Hopefully fast path. 1178 */ 1179 continue; 1180 } 1181 1182 v6 = (void*)vtw; 1183 1184 db_trace(KTR_VTW 1185 , (vtw, "vtw: i %x idx %x fp->tag %x" 1186 " tag %x xtra %x" 1187 , i, idx_decode(ctl, idx) 1188 , fp->tag[i], tag, fatp_xtra[i])); 1189 1190 if (vtw_alive(vtw) 1191 && vtw->port_key == fatp_key(ctl->fat, fp, i) 1192 && v6->lport == lport) { 1193 ++vtw_stats.hit[1]; 1194 1195 db_trace(KTR_VTW 1196 , (fp, "vtw: nxt port %P - %4.4x" 1197 " idx %x key %x" 1198 , lport, lport 1199 , idx_decode(ctl, idx), vtw->key)); 1200 1201 it->slot_idx = i + 1; 1202 goto out; 1203 } else if (vtw_alive(vtw)) { 1204 ++vtw_stats.losing[1]; 1205 1206 db_trace(KTR_VTW 1207 , (vtw, "vtw:!mis port %6A:%4.4x" 1208 " %6A:%4.4x key %x port %x" 1209 , db_store(&v6->faddr 1210 , sizeof (v6->faddr)) 1211 , v6->fport 1212 , db_store(&v6->laddr 1213 , sizeof (v6->faddr)) 1214 , v6->lport 1215 , vtw->key 1216 , lport)); 1217 } else { 1218 /* Really losing here. We are coming 1219 * up with references to free entries. 1220 * Might find it better to use 1221 * traditional, or need another 1222 * add-hockery. The other add-hockery 1223 * would be to pul more into into the 1224 * cache line to reject the false 1225 * hits. 
1226 */ 1227 ++vtw_stats.losing[1]; 1228 ++losings; 1229 1230 db_trace(KTR_VTW 1231 , (fp 1232 , "vtw:!mis port %x" 1233 " - free entry idx %x vtw %p" 1234 , lport, idx_decode(ctl, idx) 1235 , vtw)); 1236 } 1237 } 1238 1239 if (fp->nxt) { 1240 it->fp = fp = fatp_next(ctl->fat, fp); 1241 it->slot_idx = 0; 1242 } else { 1243 it->fp = 0; 1244 break; 1245 } 1246 } 1247 ++vtw_stats.miss[1]; 1248 1249 vtw = 0; 1250 out: 1251 if (fatps > vtw_stats.max_chain[1]) 1252 vtw_stats.max_chain[1] = fatps; 1253 if (probes > vtw_stats.max_probe[1]) 1254 vtw_stats.max_probe[1] = probes; 1255 if (losings > vtw_stats.max_loss[1]) 1256 vtw_stats.max_loss[1] = losings; 1257 1258 return vtw; 1259 } 1260 1261 /*!\brief initialise the VTW allocation arena 1262 * 1263 * There are 1+3 allocation classes: 1264 * 0 classless 1265 * {1,2,3} MSL-class based allocation 1266 * 1267 * The allocation arenas are all initialised. Classless gets all the 1268 * space. MSL-class based divides the arena, so that allocation 1269 * within a class can proceed without having to consider entries 1270 * (aka: cache lines) from different classes. 1271 * 1272 * Usually, we are completely classless or class-based, but there can be 1273 * transition periods, corresponding to dynamic adjustments in the config 1274 * by the operator. 
1275 */ 1276 static void 1277 vtw_init(fatp_ctl_t *fat, vtw_ctl_t *ctl, const uint32_t n, vtw_t *ctl_base_v) 1278 { 1279 int class_n, i; 1280 vtw_t *base; 1281 1282 ctl->base.v = ctl_base_v; 1283 1284 if (ctl->is_v4) { 1285 ctl->lim.v4 = ctl->base.v4 + n - 1; 1286 ctl->alloc.v4 = ctl->base.v4; 1287 } else { 1288 ctl->lim.v6 = ctl->base.v6 + n - 1; 1289 ctl->alloc.v6 = ctl->base.v6; 1290 } 1291 1292 ctl->nfree = n; 1293 ctl->ctl = ctl; 1294 1295 ctl->idx_bits = 32; 1296 for (ctl->idx_mask = ~0; (ctl->idx_mask & (n-1)) == n-1; ) { 1297 ctl->idx_mask >>= 1; 1298 ctl->idx_bits -= 1; 1299 } 1300 1301 ctl->idx_mask <<= 1; 1302 ctl->idx_mask |= 1; 1303 ctl->idx_bits += 1; 1304 1305 ctl->fat = fat; 1306 fat->vtw = ctl; 1307 1308 /* Divide the resources equally amongst the classes. 1309 * This is not optimal, as the different classes 1310 * arrive and leave at different rates, but it is 1311 * the best I can do for now. 1312 */ 1313 class_n = n / (VTW_NCLASS-1); 1314 base = ctl->base.v; 1315 1316 for (i = 1; i < VTW_NCLASS; ++i) { 1317 int j; 1318 1319 ctl[i] = ctl[0]; 1320 ctl[i].clidx = i; 1321 1322 ctl[i].base.v = base; 1323 ctl[i].alloc = ctl[i].base; 1324 1325 for (j = 0; j < class_n - 1; ++j) { 1326 if (tcp_msl_enable) 1327 base->msl_class = i; 1328 base = vtw_next(ctl, base); 1329 } 1330 1331 ctl[i].lim.v = base; 1332 base = vtw_next(ctl, base); 1333 ctl[i].nfree = class_n; 1334 } 1335 1336 vtw_debug_init(); 1337 } 1338 1339 /*!\brief map class to TCP MSL 1340 */ 1341 static inline uint32_t 1342 class_to_msl(int msl_class) 1343 { 1344 switch (msl_class) { 1345 case 0: 1346 case 1: 1347 return tcp_msl_remote ? tcp_msl_remote : (TCPTV_MSL >> 0); 1348 case 2: 1349 return tcp_msl_local ? tcp_msl_local : (TCPTV_MSL >> 1); 1350 default: 1351 return tcp_msl_loop ? 
tcp_msl_loop : (TCPTV_MSL >> 2); 1352 } 1353 } 1354 1355 /*!\brief map TCP MSL to class 1356 */ 1357 static inline uint32_t 1358 msl_to_class(int msl) 1359 { 1360 if (tcp_msl_enable) { 1361 if (msl <= (tcp_msl_loop ? tcp_msl_loop : (TCPTV_MSL >> 2))) 1362 return 1+2; 1363 if (msl <= (tcp_msl_local ? tcp_msl_local : (TCPTV_MSL >> 1))) 1364 return 1+1; 1365 return 1; 1366 } 1367 return 0; 1368 } 1369 1370 /*!\brief allocate a vtw entry 1371 */ 1372 static inline vtw_t * 1373 vtw_alloc(vtw_ctl_t *ctl) 1374 { 1375 vtw_t *vtw = 0; 1376 int stuck = 0; 1377 int avail = ctl ? (ctl->nalloc + ctl->nfree) : 0; 1378 int msl; 1379 1380 KASSERT(mutex_owned(softnet_lock)); 1381 1382 /* If no resources, we will not get far. 1383 */ 1384 if (!ctl || !ctl->base.v4 || avail <= 0) 1385 return 0; 1386 1387 /* Obtain a free one. 1388 */ 1389 while (!ctl->nfree) { 1390 vtw_age(ctl, 0); 1391 1392 if (++stuck > avail) { 1393 /* When in transition between 1394 * schemes (classless, classed) we 1395 * can be stuck having to await the 1396 * expiration of cross-allocated entries. 1397 * 1398 * Returning zero means we will fall back to the 1399 * traditional TIME_WAIT handling, except in the 1400 * case of a re-shed, in which case we cannot 1401 * perform the reshecd, but will retain the extant 1402 * entry. 1403 */ 1404 db_trace(KTR_VTW 1405 , (ctl, "vtw:!none free in class %x %x/%x" 1406 , ctl->clidx 1407 , ctl->nalloc, ctl->nfree)); 1408 1409 return 0; 1410 } 1411 } 1412 1413 vtw = ctl->alloc.v; 1414 1415 if (vtw->msl_class != ctl->clidx) { 1416 /* Usurping rules: 1417 * 0 -> {1,2,3} or {1,2,3} -> 0 1418 */ 1419 KASSERT(!vtw->msl_class || !ctl->clidx); 1420 1421 if (vtw->hashed || vtw->expire.tv_sec) { 1422 /* As this is owned by some other class, 1423 * we must wait for it to expire it. 1424 * This will only happen on class/classless 1425 * transitions, which are guaranteed to progress 1426 * to completion in small finite time, barring bugs. 
1427 */ 1428 db_trace(KTR_VTW 1429 , (ctl, "vtw:!%p class %x!=%x %x:%x%s" 1430 , vtw, vtw->msl_class, ctl->clidx 1431 , vtw->expire.tv_sec 1432 , vtw->expire.tv_usec 1433 , vtw->hashed ? " hashed" : "")); 1434 1435 return 0; 1436 } 1437 1438 db_trace(KTR_VTW 1439 , (ctl, "vtw:!%p usurped from %x to %x" 1440 , vtw, vtw->msl_class, ctl->clidx)); 1441 1442 vtw->msl_class = ctl->clidx; 1443 } 1444 1445 if (vtw_alive(vtw)) { 1446 KASSERT(0 && "next free not free"); 1447 return 0; 1448 } 1449 1450 /* Advance allocation poiter. 1451 */ 1452 ctl->alloc.v = vtw_next(ctl, vtw); 1453 1454 --ctl->nfree; 1455 ++ctl->nalloc; 1456 1457 msl = (2 * class_to_msl(ctl->clidx) * 1000) / PR_SLOWHZ; // msec 1458 1459 /* mark expiration 1460 */ 1461 getmicrouptime(&vtw->expire); 1462 1463 /* Move expiration into the future. 1464 */ 1465 vtw->expire.tv_sec += msl / 1000; 1466 vtw->expire.tv_usec += 1000 * (msl % 1000); 1467 1468 while (vtw->expire.tv_usec >= 1000*1000) { 1469 vtw->expire.tv_usec -= 1000*1000; 1470 vtw->expire.tv_sec += 1; 1471 } 1472 1473 if (!ctl->oldest.v) 1474 ctl->oldest.v = vtw; 1475 1476 return vtw; 1477 } 1478 1479 /*!\brief expiration 1480 */ 1481 static int 1482 vtw_age(vtw_ctl_t *ctl, struct timeval *_when) 1483 { 1484 vtw_t *vtw; 1485 struct timeval then, *when = _when; 1486 int maxtries = 0; 1487 1488 if (!ctl->oldest.v) { 1489 KASSERT(!ctl->nalloc); 1490 return 0; 1491 } 1492 1493 for (vtw = ctl->oldest.v; vtw && ctl->nalloc; ) { 1494 if (++maxtries > ctl->nalloc) 1495 break; 1496 1497 if (vtw->msl_class != ctl->clidx) { 1498 db_trace(KTR_VTW 1499 , (vtw, "vtw:!age class mismatch %x != %x" 1500 , vtw->msl_class, ctl->clidx)); 1501 /* XXXX 1502 * See if the appropriate action is to skip to the next. 1503 * XXXX 1504 */ 1505 ctl->oldest.v = vtw = vtw_next(ctl, vtw); 1506 continue; 1507 } 1508 if (!when) { 1509 /* Latch oldest timeval if none specified. 
1510 */ 1511 then = vtw->expire; 1512 when = &then; 1513 } 1514 1515 if (!timercmp(&vtw->expire, when, <=)) 1516 break; 1517 1518 db_trace(KTR_VTW 1519 , (vtw, "vtw: expire %x %8.8x:%8.8x %x/%x" 1520 , ctl->clidx 1521 , vtw->expire.tv_sec 1522 , vtw->expire.tv_usec 1523 , ctl->nalloc 1524 , ctl->nfree)); 1525 1526 if (!_when) 1527 ++vtw_stats.kill; 1528 1529 vtw_del(ctl, vtw); 1530 vtw = ctl->oldest.v; 1531 } 1532 1533 return ctl->nalloc; // # remaining allocated 1534 } 1535 1536 static callout_t vtw_cs; 1537 1538 /*!\brief notice the passage of time. 1539 * It seems to be getting faster. What happened to the year? 1540 */ 1541 static void 1542 vtw_tick(void *arg) 1543 { 1544 struct timeval now; 1545 int i, cnt = 0; 1546 1547 getmicrouptime(&now); 1548 1549 db_trace(KTR_VTW, (arg, "vtk: tick - now %8.8x:%8.8x" 1550 , now.tv_sec, now.tv_usec)); 1551 1552 mutex_enter(softnet_lock); 1553 1554 for (i = 0; i < VTW_NCLASS; ++i) { 1555 cnt += vtw_age(&vtw_tcpv4[i], &now); 1556 cnt += vtw_age(&vtw_tcpv6[i], &now); 1557 } 1558 1559 /* Keep ticks coming while we need them. 1560 */ 1561 if (cnt) 1562 callout_schedule(&vtw_cs, hz / 5); 1563 else { 1564 tcp_vtw_was_enabled = 0; 1565 tcbtable.vestige = 0; 1566 } 1567 mutex_exit(softnet_lock); 1568 } 1569 1570 /* in_pcblookup_ports assist for handling vestigial entries. 1571 */ 1572 static void * 1573 tcp_init_ports_v4(struct in_addr addr, u_int port, int wild) 1574 { 1575 struct tcp_ports_iterator *it = &tcp_ports_iterator_v4; 1576 1577 bzero(it, sizeof (*it)); 1578 1579 /* Note: the reference to vtw_tcpv4[0] is fine. 1580 * We do not need per-class iteration. We just 1581 * need to get to the fat, and there is one 1582 * shared fat. 1583 */ 1584 if (vtw_tcpv4[0].fat) { 1585 it->addr.v4 = addr; 1586 it->port = port; 1587 it->wild = !!wild; 1588 it->ctl = &vtw_tcpv4[0]; 1589 1590 ++vtw_stats.look[1]; 1591 } 1592 1593 return it; 1594 } 1595 1596 /*!\brief export an IPv4 vtw. 
1597 */ 1598 static int 1599 vtw_export_v4(vtw_ctl_t *ctl, vtw_t *vtw, vestigial_inpcb_t *res) 1600 { 1601 vtw_v4_t *v4 = (void*)vtw; 1602 1603 bzero(res, sizeof (*res)); 1604 1605 if (ctl && vtw) { 1606 if (!ctl->clidx && vtw->msl_class) 1607 ctl += vtw->msl_class; 1608 else 1609 KASSERT(ctl->clidx == vtw->msl_class); 1610 1611 res->valid = 1; 1612 res->v4 = 1; 1613 1614 res->faddr.v4.s_addr = v4->faddr; 1615 res->laddr.v4.s_addr = v4->laddr; 1616 res->fport = v4->fport; 1617 res->lport = v4->lport; 1618 res->vtw = vtw; // netlock held over call(s) 1619 res->ctl = ctl; 1620 res->reuse_addr = vtw->reuse_addr; 1621 res->reuse_port = vtw->reuse_port; 1622 res->snd_nxt = vtw->snd_nxt; 1623 res->rcv_nxt = vtw->rcv_nxt; 1624 res->rcv_wnd = vtw->rcv_wnd; 1625 res->uid = vtw->uid; 1626 } 1627 1628 return res->valid; 1629 } 1630 1631 /*!\brief return next port in the port iterator. yowza. 1632 */ 1633 static int 1634 tcp_next_port_v4(void *arg, struct vestigial_inpcb *res) 1635 { 1636 struct tcp_ports_iterator *it = arg; 1637 vtw_t *vtw = 0; 1638 1639 if (it->ctl) 1640 vtw = vtw_next_port_v4(it); 1641 1642 if (!vtw) 1643 it->ctl = 0; 1644 1645 return vtw_export_v4(it->ctl, vtw, res); 1646 } 1647 1648 static int 1649 tcp_lookup_v4(struct in_addr faddr, uint16_t fport, 1650 struct in_addr laddr, uint16_t lport, 1651 struct vestigial_inpcb *res) 1652 { 1653 vtw_t *vtw; 1654 vtw_ctl_t *ctl; 1655 1656 1657 db_trace(KTR_VTW 1658 , (res, "vtw: lookup %A:%P %A:%P" 1659 , faddr, fport 1660 , laddr, lport)); 1661 1662 vtw = vtw_lookup_hash_v4((ctl = &vtw_tcpv4[0]) 1663 , faddr.s_addr, fport 1664 , laddr.s_addr, lport, 0); 1665 1666 return vtw_export_v4(ctl, vtw, res); 1667 } 1668 1669 /* in_pcblookup_ports assist for handling vestigial entries. 
1670 */ 1671 static void * 1672 tcp_init_ports_v6(const struct in6_addr *addr, u_int port, int wild) 1673 { 1674 struct tcp_ports_iterator *it = &tcp_ports_iterator_v6; 1675 1676 bzero(it, sizeof (*it)); 1677 1678 /* Note: the reference to vtw_tcpv6[0] is fine. 1679 * We do not need per-class iteration. We just 1680 * need to get to the fat, and there is one 1681 * shared fat. 1682 */ 1683 if (vtw_tcpv6[0].fat) { 1684 it->addr.v6 = *addr; 1685 it->port = port; 1686 it->wild = !!wild; 1687 it->ctl = &vtw_tcpv6[0]; 1688 1689 ++vtw_stats.look[1]; 1690 } 1691 1692 return it; 1693 } 1694 1695 /*!\brief export an IPv6 vtw. 1696 */ 1697 static int 1698 vtw_export_v6(vtw_ctl_t *ctl, vtw_t *vtw, vestigial_inpcb_t *res) 1699 { 1700 vtw_v6_t *v6 = (void*)vtw; 1701 1702 bzero(res, sizeof (*res)); 1703 1704 if (ctl && vtw) { 1705 if (!ctl->clidx && vtw->msl_class) 1706 ctl += vtw->msl_class; 1707 else 1708 KASSERT(ctl->clidx == vtw->msl_class); 1709 1710 res->valid = 1; 1711 res->v4 = 0; 1712 1713 res->faddr.v6 = v6->faddr; 1714 res->laddr.v6 = v6->laddr; 1715 res->fport = v6->fport; 1716 res->lport = v6->lport; 1717 res->vtw = vtw; // netlock held over call(s) 1718 res->ctl = ctl; 1719 1720 res->v6only = vtw->v6only; 1721 res->reuse_addr = vtw->reuse_addr; 1722 res->reuse_port = vtw->reuse_port; 1723 1724 res->snd_nxt = vtw->snd_nxt; 1725 res->rcv_nxt = vtw->rcv_nxt; 1726 res->rcv_wnd = vtw->rcv_wnd; 1727 res->uid = vtw->uid; 1728 } 1729 1730 return res->valid; 1731 } 1732 1733 static int 1734 tcp_next_port_v6(void *arg, struct vestigial_inpcb *res) 1735 { 1736 struct tcp_ports_iterator *it = arg; 1737 vtw_t *vtw = 0; 1738 1739 if (it->ctl) 1740 vtw = vtw_next_port_v6(it); 1741 1742 if (!vtw) 1743 it->ctl = 0; 1744 1745 return vtw_export_v6(it->ctl, vtw, res); 1746 } 1747 1748 static int 1749 tcp_lookup_v6(const struct in6_addr *faddr, uint16_t fport, 1750 const struct in6_addr *laddr, uint16_t lport, 1751 struct vestigial_inpcb *res) 1752 { 1753 vtw_ctl_t *ctl; 1754 vtw_t 
*vtw; 1755 1756 db_trace(KTR_VTW 1757 , (res, "vtw: lookup %6A:%P %6A:%P" 1758 , db_store(faddr, sizeof (*faddr)), fport 1759 , db_store(laddr, sizeof (*laddr)), lport)); 1760 1761 vtw = vtw_lookup_hash_v6((ctl = &vtw_tcpv6[0]) 1762 , faddr, fport 1763 , laddr, lport, 0); 1764 1765 return vtw_export_v6(ctl, vtw, res); 1766 } 1767 1768 static vestigial_hooks_t tcp_hooks = { 1769 .init_ports4 = tcp_init_ports_v4, 1770 .next_port4 = tcp_next_port_v4, 1771 .lookup4 = tcp_lookup_v4, 1772 .init_ports6 = tcp_init_ports_v6, 1773 .next_port6 = tcp_next_port_v6, 1774 .lookup6 = tcp_lookup_v6, 1775 }; 1776 1777 static bool 1778 vtw_select(int af, fatp_ctl_t **fatp, vtw_ctl_t **ctlp) 1779 { 1780 fatp_ctl_t *fat; 1781 vtw_ctl_t *ctl; 1782 1783 switch (af) { 1784 case AF_INET: 1785 fat = &fat_tcpv4; 1786 ctl = &vtw_tcpv4[0]; 1787 break; 1788 case AF_INET6: 1789 fat = &fat_tcpv6; 1790 ctl = &vtw_tcpv6[0]; 1791 break; 1792 default: 1793 return false; 1794 } 1795 if (fatp != NULL) 1796 *fatp = fat; 1797 if (ctlp != NULL) 1798 *ctlp = ctl; 1799 return true; 1800 } 1801 1802 /*!\brief initialize controlling instance 1803 */ 1804 static int 1805 vtw_control_init(int af) 1806 { 1807 fatp_ctl_t *fat; 1808 vtw_ctl_t *ctl; 1809 fatp_t *fat_base; 1810 fatp_t **fat_hash; 1811 vtw_t *ctl_base_v; 1812 uint32_t n, m; 1813 size_t sz; 1814 1815 KASSERT(powerof2(tcp_vtw_entries)); 1816 1817 if (!vtw_select(af, &fat, &ctl)) 1818 return EAFNOSUPPORT; 1819 1820 if (fat->hash != NULL) { 1821 KASSERT(fat->base != NULL && ctl->base.v != NULL); 1822 return 0; 1823 } 1824 1825 /* Allocate 10% more capacity in the fat pointers. 1826 * We should only need ~#hash additional based on 1827 * how they age, but TIME_WAIT assassination could cause 1828 * sparse fat pointer utilisation. 1829 */ 1830 m = 512; 1831 n = 2*m + (11 * (tcp_vtw_entries / fatp_ntags())) / 10; 1832 sz = (ctl->is_v4 ? 
sizeof(vtw_v4_t) : sizeof(vtw_v6_t)); 1833 1834 fat_hash = kmem_zalloc(2*m * sizeof(fatp_t *), KM_SLEEP); 1835 fat_base = kmem_zalloc(2*n * sizeof(fatp_t), KM_SLEEP); 1836 ctl_base_v = kmem_zalloc(tcp_vtw_entries * sz, KM_SLEEP); 1837 fatp_init(fat, n, m, fat_base, fat_hash); 1838 vtw_init(fat, ctl, tcp_vtw_entries, ctl_base_v); 1839 1840 return 0; 1841 } 1842 1843 /*!\brief select controlling instance 1844 */ 1845 static vtw_ctl_t * 1846 vtw_control(int af, uint32_t msl) 1847 { 1848 fatp_ctl_t *fat; 1849 vtw_ctl_t *ctl; 1850 int msl_class = msl_to_class(msl); 1851 1852 if (!vtw_select(af, &fat, &ctl)) 1853 return NULL; 1854 1855 if (!fat->base || !ctl->base.v) 1856 return NULL; 1857 1858 if (!tcp_vtw_was_enabled) { 1859 /* This guarantees is timer ticks until we no longer need them. 1860 */ 1861 tcp_vtw_was_enabled = 1; 1862 1863 callout_schedule(&vtw_cs, hz / 5); 1864 1865 tcbtable.vestige = &tcp_hooks; 1866 } 1867 1868 return ctl + msl_class; 1869 } 1870 1871 /*!\brief add TCP pcb to vestigial timewait 1872 */ 1873 int 1874 vtw_add(int af, struct tcpcb *tp) 1875 { 1876 #ifdef VTW_DEBUG 1877 int enable; 1878 #endif 1879 vtw_ctl_t *ctl; 1880 vtw_t *vtw; 1881 1882 KASSERT(mutex_owned(softnet_lock)); 1883 1884 ctl = vtw_control(af, tp->t_msl); 1885 if (!ctl) 1886 return 0; 1887 1888 #ifdef VTW_DEBUG 1889 enable = (af == AF_INET) ? 
tcp4_vtw_enable : tcp6_vtw_enable; 1890 #endif 1891 1892 vtw = vtw_alloc(ctl); 1893 1894 if (vtw) { 1895 vtw->snd_nxt = tp->snd_nxt; 1896 vtw->rcv_nxt = tp->rcv_nxt; 1897 1898 switch (af) { 1899 case AF_INET: { 1900 struct inpcb *inp = tp->t_inpcb; 1901 vtw_v4_t *v4 = (void*)vtw; 1902 1903 v4->faddr = inp->inp_faddr.s_addr; 1904 v4->laddr = inp->inp_laddr.s_addr; 1905 v4->fport = inp->inp_fport; 1906 v4->lport = inp->inp_lport; 1907 1908 vtw->reuse_port = !!(inp->inp_socket->so_options 1909 & SO_REUSEPORT); 1910 vtw->reuse_addr = !!(inp->inp_socket->so_options 1911 & SO_REUSEADDR); 1912 vtw->v6only = 0; 1913 vtw->uid = inp->inp_socket->so_uidinfo->ui_uid; 1914 1915 vtw_inshash_v4(ctl, vtw); 1916 1917 1918 #ifdef VTW_DEBUG 1919 /* Immediate lookup (connected and port) to 1920 * ensure at least that works! 1921 */ 1922 if (enable & 4) { 1923 KASSERT(vtw_lookup_hash_v4 1924 (ctl 1925 , inp->inp_faddr.s_addr, inp->inp_fport 1926 , inp->inp_laddr.s_addr, inp->inp_lport 1927 , 0) 1928 == vtw); 1929 KASSERT(vtw_lookup_hash_v4 1930 (ctl 1931 , inp->inp_faddr.s_addr, inp->inp_fport 1932 , inp->inp_laddr.s_addr, inp->inp_lport 1933 , 1)); 1934 } 1935 /* Immediate port iterator functionality check: not wild 1936 */ 1937 if (enable & 8) { 1938 struct tcp_ports_iterator *it; 1939 struct vestigial_inpcb res; 1940 int cnt = 0; 1941 1942 it = tcp_init_ports_v4(inp->inp_laddr 1943 , inp->inp_lport, 0); 1944 1945 while (tcp_next_port_v4(it, &res)) { 1946 ++cnt; 1947 } 1948 KASSERT(cnt); 1949 } 1950 /* Immediate port iterator functionality check: wild 1951 */ 1952 if (enable & 16) { 1953 struct tcp_ports_iterator *it; 1954 struct vestigial_inpcb res; 1955 struct in_addr any; 1956 int cnt = 0; 1957 1958 any.s_addr = htonl(INADDR_ANY); 1959 1960 it = tcp_init_ports_v4(any, inp->inp_lport, 1); 1961 1962 while (tcp_next_port_v4(it, &res)) { 1963 ++cnt; 1964 } 1965 KASSERT(cnt); 1966 } 1967 #endif /* VTW_DEBUG */ 1968 break; 1969 } 1970 1971 case AF_INET6: { 1972 struct in6pcb *inp = 
tp->t_in6pcb; 1973 vtw_v6_t *v6 = (void*)vtw; 1974 1975 v6->faddr = inp->in6p_faddr; 1976 v6->laddr = inp->in6p_laddr; 1977 v6->fport = inp->in6p_fport; 1978 v6->lport = inp->in6p_lport; 1979 1980 vtw->reuse_port = !!(inp->in6p_socket->so_options 1981 & SO_REUSEPORT); 1982 vtw->reuse_addr = !!(inp->in6p_socket->so_options 1983 & SO_REUSEADDR); 1984 vtw->v6only = !!(inp->in6p_flags 1985 & IN6P_IPV6_V6ONLY); 1986 vtw->uid = inp->in6p_socket->so_uidinfo->ui_uid; 1987 1988 vtw_inshash_v6(ctl, vtw); 1989 #ifdef VTW_DEBUG 1990 /* Immediate lookup (connected and port) to 1991 * ensure at least that works! 1992 */ 1993 if (enable & 4) { 1994 KASSERT(vtw_lookup_hash_v6(ctl 1995 , &inp->in6p_faddr, inp->in6p_fport 1996 , &inp->in6p_laddr, inp->in6p_lport 1997 , 0) 1998 == vtw); 1999 KASSERT(vtw_lookup_hash_v6 2000 (ctl 2001 , &inp->in6p_faddr, inp->in6p_fport 2002 , &inp->in6p_laddr, inp->in6p_lport 2003 , 1)); 2004 } 2005 /* Immediate port iterator functionality check: not wild 2006 */ 2007 if (enable & 8) { 2008 struct tcp_ports_iterator *it; 2009 struct vestigial_inpcb res; 2010 int cnt = 0; 2011 2012 it = tcp_init_ports_v6(&inp->in6p_laddr 2013 , inp->in6p_lport, 0); 2014 2015 while (tcp_next_port_v6(it, &res)) { 2016 ++cnt; 2017 } 2018 KASSERT(cnt); 2019 } 2020 /* Immediate port iterator functionality check: wild 2021 */ 2022 if (enable & 16) { 2023 struct tcp_ports_iterator *it; 2024 struct vestigial_inpcb res; 2025 static struct in6_addr any = IN6ADDR_ANY_INIT; 2026 int cnt = 0; 2027 2028 it = tcp_init_ports_v6(&any 2029 , inp->in6p_lport, 1); 2030 2031 while (tcp_next_port_v6(it, &res)) { 2032 ++cnt; 2033 } 2034 KASSERT(cnt); 2035 } 2036 #endif /* VTW_DEBUG */ 2037 break; 2038 } 2039 } 2040 2041 tcp_canceltimers(tp); 2042 tp = tcp_close(tp); 2043 KASSERT(!tp); 2044 2045 return 1; 2046 } 2047 2048 return 0; 2049 } 2050 2051 /*!\brief restart timer for vestigial time-wait entry 2052 */ 2053 static void 2054 vtw_restart_v4(vestigial_inpcb_t *vp) 2055 { 2056 vtw_v4_t 
copy = *(vtw_v4_t*)vp->vtw; 2057 vtw_t *vtw; 2058 vtw_t *cp = ©.common; 2059 vtw_ctl_t *ctl; 2060 2061 KASSERT(mutex_owned(softnet_lock)); 2062 2063 db_trace(KTR_VTW 2064 , (vp->vtw, "vtw: restart %A:%P %A:%P" 2065 , vp->faddr.v4.s_addr, vp->fport 2066 , vp->laddr.v4.s_addr, vp->lport)); 2067 2068 /* Class might have changed, so have a squiz. 2069 */ 2070 ctl = vtw_control(AF_INET, class_to_msl(cp->msl_class)); 2071 vtw = vtw_alloc(ctl); 2072 2073 if (vtw) { 2074 vtw_v4_t *v4 = (void*)vtw; 2075 2076 /* Safe now to unhash the old entry 2077 */ 2078 vtw_del(vp->ctl, vp->vtw); 2079 2080 vtw->snd_nxt = cp->snd_nxt; 2081 vtw->rcv_nxt = cp->rcv_nxt; 2082 2083 v4->faddr = copy.faddr; 2084 v4->laddr = copy.laddr; 2085 v4->fport = copy.fport; 2086 v4->lport = copy.lport; 2087 2088 vtw->reuse_port = cp->reuse_port; 2089 vtw->reuse_addr = cp->reuse_addr; 2090 vtw->v6only = 0; 2091 vtw->uid = cp->uid; 2092 2093 vtw_inshash_v4(ctl, vtw); 2094 } 2095 2096 vp->valid = 0; 2097 } 2098 2099 /*!\brief restart timer for vestigial time-wait entry 2100 */ 2101 static void 2102 vtw_restart_v6(vestigial_inpcb_t *vp) 2103 { 2104 vtw_v6_t copy = *(vtw_v6_t*)vp->vtw; 2105 vtw_t *vtw; 2106 vtw_t *cp = ©.common; 2107 vtw_ctl_t *ctl; 2108 2109 KASSERT(mutex_owned(softnet_lock)); 2110 2111 db_trace(KTR_VTW 2112 , (vp->vtw, "vtw: restart %6A:%P %6A:%P" 2113 , db_store(&vp->faddr.v6, sizeof (vp->faddr.v6)) 2114 , vp->fport 2115 , db_store(&vp->laddr.v6, sizeof (vp->laddr.v6)) 2116 , vp->lport)); 2117 2118 /* Class might have changed, so have a squiz. 
2119 */ 2120 ctl = vtw_control(AF_INET6, class_to_msl(cp->msl_class)); 2121 vtw = vtw_alloc(ctl); 2122 2123 if (vtw) { 2124 vtw_v6_t *v6 = (void*)vtw; 2125 2126 /* Safe now to unhash the old entry 2127 */ 2128 vtw_del(vp->ctl, vp->vtw); 2129 2130 vtw->snd_nxt = cp->snd_nxt; 2131 vtw->rcv_nxt = cp->rcv_nxt; 2132 2133 v6->faddr = copy.faddr; 2134 v6->laddr = copy.laddr; 2135 v6->fport = copy.fport; 2136 v6->lport = copy.lport; 2137 2138 vtw->reuse_port = cp->reuse_port; 2139 vtw->reuse_addr = cp->reuse_addr; 2140 vtw->v6only = cp->v6only; 2141 vtw->uid = cp->uid; 2142 2143 vtw_inshash_v6(ctl, vtw); 2144 } 2145 2146 vp->valid = 0; 2147 } 2148 2149 /*!\brief restart timer for vestigial time-wait entry 2150 */ 2151 void 2152 vtw_restart(vestigial_inpcb_t *vp) 2153 { 2154 if (!vp || !vp->valid) 2155 return; 2156 2157 if (vp->v4) 2158 vtw_restart_v4(vp); 2159 else 2160 vtw_restart_v6(vp); 2161 } 2162 2163 int 2164 sysctl_tcp_vtw_enable(SYSCTLFN_ARGS) 2165 { 2166 int en, rc; 2167 struct sysctlnode node; 2168 2169 node = *rnode; 2170 en = *(int *)rnode->sysctl_data; 2171 node.sysctl_data = &en; 2172 2173 rc = sysctl_lookup(SYSCTLFN_CALL(&node)); 2174 if (rc != 0 || newp == NULL) 2175 return rc; 2176 2177 if (rnode->sysctl_data != &tcp4_vtw_enable && 2178 rnode->sysctl_data != &tcp6_vtw_enable) 2179 rc = ENOENT; 2180 else if ((en & 1) == 0) 2181 rc = 0; 2182 else if (rnode->sysctl_data == &tcp4_vtw_enable) 2183 rc = vtw_control_init(AF_INET); 2184 else /* rnode->sysctl_data == &tcp6_vtw_enable */ 2185 rc = vtw_control_init(AF_INET6); 2186 2187 if (rc == 0) 2188 *(int *)rnode->sysctl_data = en; 2189 2190 return rc; 2191 } 2192 2193 int 2194 vtw_earlyinit(void) 2195 { 2196 int i, rc; 2197 2198 callout_init(&vtw_cs, 0); 2199 callout_setfunc(&vtw_cs, vtw_tick, 0); 2200 2201 for (i = 0; i < VTW_NCLASS; ++i) { 2202 vtw_tcpv4[i].is_v4 = 1; 2203 vtw_tcpv6[i].is_v6 = 1; 2204 } 2205 2206 if ((tcp4_vtw_enable & 1) != 0 && 2207 (rc = vtw_control_init(AF_INET)) != 0) 2208 return rc; 2209 
2210 if ((tcp6_vtw_enable & 1) != 0 && 2211 (rc = vtw_control_init(AF_INET6)) != 0) 2212 return rc; 2213 2214 return 0; 2215 } 2216 2217 #ifdef VTW_DEBUG 2218 #include <sys/syscallargs.h> 2219 #include <sys/sysctl.h> 2220 2221 /*!\brief add lalp, fafp entries for debug 2222 */ 2223 int 2224 vtw_debug_add(int af, sin_either_t *la, sin_either_t *fa, int msl, int msl_class) 2225 { 2226 vtw_ctl_t *ctl; 2227 vtw_t *vtw; 2228 2229 ctl = vtw_control(af, msl ? msl : class_to_msl(msl_class)); 2230 if (!ctl) 2231 return 0; 2232 2233 vtw = vtw_alloc(ctl); 2234 2235 if (vtw) { 2236 vtw->snd_nxt = 0; 2237 vtw->rcv_nxt = 0; 2238 2239 switch (af) { 2240 case AF_INET: { 2241 vtw_v4_t *v4 = (void*)vtw; 2242 2243 v4->faddr = fa->sin_addr.v4.s_addr; 2244 v4->laddr = la->sin_addr.v4.s_addr; 2245 v4->fport = fa->sin_port; 2246 v4->lport = la->sin_port; 2247 2248 vtw->reuse_port = 1; 2249 vtw->reuse_addr = 1; 2250 vtw->v6only = 0; 2251 vtw->uid = 0; 2252 2253 vtw_inshash_v4(ctl, vtw); 2254 break; 2255 } 2256 2257 case AF_INET6: { 2258 vtw_v6_t *v6 = (void*)vtw; 2259 2260 v6->faddr = fa->sin_addr.v6; 2261 v6->laddr = la->sin_addr.v6; 2262 2263 v6->fport = fa->sin_port; 2264 v6->lport = la->sin_port; 2265 2266 vtw->reuse_port = 1; 2267 vtw->reuse_addr = 1; 2268 vtw->v6only = 0; 2269 vtw->uid = 0; 2270 2271 vtw_inshash_v6(ctl, vtw); 2272 break; 2273 } 2274 2275 default: 2276 break; 2277 } 2278 2279 return 1; 2280 } 2281 2282 return 0; 2283 } 2284 2285 static int vtw_syscall = 0; 2286 2287 static int 2288 vtw_debug_process(vtw_sysargs_t *ap) 2289 { 2290 struct vestigial_inpcb vestige; 2291 int rc = 0; 2292 2293 mutex_enter(softnet_lock); 2294 2295 switch (ap->op) { 2296 case 0: // insert 2297 vtw_debug_add(ap->la.sin_family 2298 , &ap->la 2299 , &ap->fa 2300 , TCPTV_MSL 2301 , 0); 2302 break; 2303 2304 case 1: // lookup 2305 case 2: // restart 2306 switch (ap->la.sin_family) { 2307 case AF_INET: 2308 if (tcp_lookup_v4(ap->fa.sin_addr.v4, ap->fa.sin_port, 2309 ap->la.sin_addr.v4, 
ap->la.sin_port, 2310 &vestige)) { 2311 if (ap->op == 2) { 2312 vtw_restart(&vestige); 2313 } 2314 rc = 0; 2315 } else 2316 rc = ESRCH; 2317 break; 2318 2319 case AF_INET6: 2320 if (tcp_lookup_v6(&ap->fa.sin_addr.v6, ap->fa.sin_port, 2321 &ap->la.sin_addr.v6, ap->la.sin_port, 2322 &vestige)) { 2323 if (ap->op == 2) { 2324 vtw_restart(&vestige); 2325 } 2326 rc = 0; 2327 } else 2328 rc = ESRCH; 2329 break; 2330 default: 2331 rc = EINVAL; 2332 } 2333 break; 2334 2335 default: 2336 rc = EINVAL; 2337 } 2338 2339 mutex_exit(softnet_lock); 2340 return rc; 2341 } 2342 2343 struct sys_vtw_args { 2344 syscallarg(const vtw_sysargs_t *) req; 2345 syscallarg(size_t) len; 2346 }; 2347 2348 static int 2349 vtw_sys(struct lwp *l, const void *_, register_t *retval) 2350 { 2351 const struct sys_vtw_args *uap = _; 2352 void *buf; 2353 int rc; 2354 size_t len = SCARG(uap, len); 2355 2356 if (len != sizeof (vtw_sysargs_t)) 2357 return EINVAL; 2358 2359 buf = kmem_alloc(len, KM_SLEEP); 2360 rc = copyin(SCARG(uap, req), buf, len); 2361 if (!rc) { 2362 rc = vtw_debug_process(buf); 2363 } 2364 kmem_free(buf, len); 2365 2366 return rc; 2367 } 2368 2369 static void 2370 vtw_sanity_check(void) 2371 { 2372 vtw_ctl_t *ctl; 2373 vtw_t *vtw; 2374 int i; 2375 int n; 2376 2377 for (i = 0; i < VTW_NCLASS; ++i) { 2378 ctl = &vtw_tcpv4[i]; 2379 2380 if (!ctl->base.v || ctl->nalloc) 2381 continue; 2382 2383 for (n = 0, vtw = ctl->base.v; ; ) { 2384 ++n; 2385 vtw = vtw_next(ctl, vtw); 2386 if (vtw == ctl->base.v) 2387 break; 2388 } 2389 db_trace(KTR_VTW 2390 , (ctl, "sanity: class %x n %x nfree %x" 2391 , i, n, ctl->nfree)); 2392 2393 KASSERT(n == ctl->nfree); 2394 } 2395 2396 for (i = 0; i < VTW_NCLASS; ++i) { 2397 ctl = &vtw_tcpv6[i]; 2398 2399 if (!ctl->base.v || ctl->nalloc) 2400 continue; 2401 2402 for (n = 0, vtw = ctl->base.v; ; ) { 2403 ++n; 2404 vtw = vtw_next(ctl, vtw); 2405 if (vtw == ctl->base.v) 2406 break; 2407 } 2408 db_trace(KTR_VTW 2409 , (ctl, "sanity: class %x n %x nfree %x" 2410 , i, 
n, ctl->nfree)); 2411 KASSERT(n == ctl->nfree); 2412 } 2413 } 2414 2415 /*!\brief Initialise debug support. 2416 */ 2417 static void 2418 vtw_debug_init(void) 2419 { 2420 int i; 2421 2422 vtw_sanity_check(); 2423 2424 if (vtw_syscall) 2425 return; 2426 2427 for (i = 511; i; --i) { 2428 if (sysent[i].sy_call == sys_nosys) { 2429 sysent[i].sy_call = vtw_sys; 2430 sysent[i].sy_narg = 2; 2431 sysent[i].sy_argsize = sizeof (struct sys_vtw_args); 2432 sysent[i].sy_flags = 0; 2433 2434 vtw_syscall = i; 2435 break; 2436 } 2437 } 2438 if (i) { 2439 const struct sysctlnode *node; 2440 uint32_t flags; 2441 2442 flags = sysctl_root.sysctl_flags; 2443 2444 sysctl_root.sysctl_flags |= CTLFLAG_READWRITE; 2445 sysctl_root.sysctl_flags &= ~CTLFLAG_PERMANENT; 2446 2447 sysctl_createv(0, 0, 0, &node, 2448 CTLFLAG_PERMANENT, CTLTYPE_NODE, 2449 "koff", 2450 SYSCTL_DESCR("Kernel Obscure Feature Finder"), 2451 0, 0, 0, 0, CTL_CREATE, CTL_EOL); 2452 2453 if (!node) { 2454 sysctl_createv(0, 0, 0, &node, 2455 CTLFLAG_PERMANENT, CTLTYPE_NODE, 2456 "koffka", 2457 SYSCTL_DESCR("The Real(tm) Kernel" 2458 " Obscure Feature Finder"), 2459 0, 0, 0, 0, CTL_CREATE, CTL_EOL); 2460 } 2461 if (node) { 2462 sysctl_createv(0, 0, 0, 0, 2463 CTLFLAG_PERMANENT|CTLFLAG_READONLY, 2464 CTLTYPE_INT, "vtw_debug_syscall", 2465 SYSCTL_DESCR("vtw debug" 2466 " system call number"), 2467 0, 0, &vtw_syscall, 0, node->sysctl_num, 2468 CTL_CREATE, CTL_EOL); 2469 } 2470 sysctl_root.sysctl_flags = flags; 2471 } 2472 } 2473 #else /* !VTW_DEBUG */ 2474 static void 2475 vtw_debug_init(void) 2476 { 2477 return; 2478 } 2479 #endif /* !VTW_DEBUG */ 2480