/*
 * Copyright (c) 2011 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Coyote Point Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Reduces the resources demanded by TCP sessions in TIME_WAIT-state using
 * methods called Vestigial Time-Wait (VTW) and Maximum Segment Lifetime
 * Truncation (MSLT).
 *
 * MSLT and VTW were contributed by Coyote Point Systems, Inc.
 *
 * Even after a TCP session enters the TIME_WAIT state, its corresponding
 * socket and protocol control blocks (PCBs) stick around until the TCP
 * Maximum Segment Lifetime (MSL) expires.  On a host whose workload
 * necessarily creates and closes down many TCP sockets, the sockets & PCBs
 * for TCP sessions in TIME_WAIT state amount to many megabytes of dead
 * weight in RAM.
 *
 * Maximum Segment Lifetime Truncation (MSLT) assigns each TCP session to
 * a class based on the nearness of the peer.  Corresponding to each class
 * is an MSL, and a session uses the MSL of its class.  The classes are
 * loopback (local host equals remote host), local (local host and remote
 * host are on the same link/subnet), and remote (local host and remote
 * host communicate via one or more gateways).  Classes corresponding to
 * nearer peers have lower MSLs by default: 2 seconds for loopback, 10
 * seconds for local, 60 seconds for remote.  Loopback and local sessions
 * expire more quickly when MSLT is used.
 *
 * Vestigial Time-Wait (VTW) replaces a TIME_WAIT session's PCB/socket
 * dead weight with a compact representation of the session, called a
 * "vestigial PCB".  VTW data structures are designed to be very fast and
 * memory-efficient: for fast insertion and lookup of vestigial PCBs,
 * the PCBs are stored in a hash table that is designed to minimize the
 * number of cacheline visits per lookup/insertion.  The memory both
 * for vestigial PCBs and for elements of the PCB hashtable comes from
 * fixed-size pools, and linked data structures exploit this to conserve
 * memory by representing references with a narrow index/offset from the
 * start of a pool instead of a pointer.  When space for new vestigial PCBs
 * runs out, VTW makes room by discarding old vestigial PCBs, oldest first.
 * VTW cooperates with MSLT.
 *
 * It may help to think of VTW as a "FIN cache" by analogy to the SYN
 * cache.
 *
 * A 2.8-GHz Pentium 4 running a test workload that creates TIME_WAIT
 * sessions as fast as it can is approximately 17% idle when VTW is active
 * versus 0% idle when VTW is inactive.  It has 103 megabytes more free RAM
 * when VTW is active (approximately 64k vestigial PCBs are created) than
 * when it is inactive.
 */
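
/* An illustrative sketch, not part of the build: a session lingers in
 * TIME_WAIT for twice its MSL, so the default class MSLs above give the
 * following lingering times.  TCPTV_MSL (30 seconds in stock kernels) is
 * what every session would use without MSLT:
 *
 *	loopback:	2 *  2s =   4s		(vs. 2 * 30s = 60s classless)
 *	local:		2 * 10s =  20s
 *	remote:		2 * 60s = 120s
 */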

#include <sys/cdefs.h>

#ifdef _KERNEL_OPT
#include "opt_ddb.h"
#include "opt_inet.h"
#include "opt_inet_csum.h"
#include "opt_tcp_debug.h"
#endif

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kmem.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/errno.h>
#include <sys/syslog.h>
#include <sys/pool.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <net/if.h>
#include <net/if_types.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
#include <netinet/in_offload.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/in6_var.h>
#include <netinet/icmp6.h>

#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_private.h>
#include <netinet/tcpip.h>

#include <netinet/tcp_vtw.h>

__KERNEL_RCSID(0, "$NetBSD: tcp_vtw.c,v 1.17 2016/12/13 08:29:03 ozaki-r Exp $");

#define db_trace(__a, __b)	do { } while (/*CONSTCOND*/0)

static void vtw_debug_init(void);

fatp_ctl_t fat_tcpv4;
fatp_ctl_t fat_tcpv6;
vtw_ctl_t  vtw_tcpv4[VTW_NCLASS];
vtw_ctl_t  vtw_tcpv6[VTW_NCLASS];
vtw_stats_t vtw_stats;

/* We provide state for the lookup_ports iterator.
 * As we are currently netlock-protected, there is one.
 * If we were finer-grained, we would have one per CPU.
 * I do not want to be in the business of alloc/free.
 * The best alternative would be to allocate on the caller's
 * stack, but that would require them to know the struct,
 * or at least the size.
 * See how she goes.
 */
struct tcp_ports_iterator {
	union {
		struct in_addr	v4;
		struct in6_addr	v6;
	}		addr;
	u_int		port;

	uint32_t	wild	: 1;

	vtw_ctl_t	*ctl;
	fatp_t		*fp;

	uint16_t	slot_idx;
	uint16_t	ctl_idx;
};

static struct tcp_ports_iterator tcp_ports_iterator_v4;
static struct tcp_ports_iterator tcp_ports_iterator_v6;

static int vtw_age(vtw_ctl_t *, struct timeval *);

/*!\brief allocate a fat pointer from a collection.
 */
static fatp_t *
fatp_alloc(fatp_ctl_t *fat)
{
	fatp_t	*fp = 0;

	if (fat->nfree) {
		fp = fat->free;
		if (fp) {
			fat->free = fatp_next(fat, fp);
			--fat->nfree;
			++fat->nalloc;
			fp->nxt = 0;

			KASSERT(!fp->inuse);
		}
	}

	return fp;
}
/*!\brief free a fat pointer.
 */
static void
fatp_free(fatp_ctl_t *fat, fatp_t *fp)
{
	if (fp) {
		KASSERT(!fp->inuse);
		KASSERT(!fp->nxt);

		fp->nxt = fatp_index(fat, fat->free);
		fat->free = fp;

		++fat->nfree;
		--fat->nalloc;
	}
}

/*!\brief initialise a collection of fat pointers.
 *
 *\param n	total # fat pointers to allocate
 *\param m	# hash buckets
 *
 * We allocate 2x as much, as we have two hashes: full and lport only.
 */
static void
fatp_init(fatp_ctl_t *fat, uint32_t n, uint32_t m,
    fatp_t *fat_base, fatp_t **fat_hash)
{
	fatp_t	*fp;

	KASSERT(n <= FATP_MAX / 2);

	fat->hash = fat_hash;
	fat->base = fat_base;

	fat->port = &fat->hash[m];

	fat->mask = m - 1;	// m must be a power of 2
	fat->lim  = fat->base + 2*n - 1;
	fat->nfree = 0;
	fat->nalloc = 2*n;

	/* Initialise the free list.
	 */
	for (fp = fat->lim; fp >= fat->base; --fp) {
		fatp_free(fat, fp);
	}
}

/*
 * The `xtra' is XORed into the tag stored.
 */
static uint32_t fatp_xtra[] = {
	0x11111111,0x22222222,0x33333333,0x44444444,
	0x55555555,0x66666666,0x77777777,0x88888888,
	0x12121212,0x21212121,0x34343434,0x43434343,
	0x56565656,0x65656565,0x78787878,0x87878787,
	0x11221122,0x22112211,0x33443344,0x44334433,
	0x55665566,0x66556655,0x77887788,0x88778877,
	0x11112222,0x22221111,0x33334444,0x44443333,
	0x55556666,0x66665555,0x77778888,0x88887777,
};

/*!\brief turn a {fatp_t*,slot} into an integral key.
 *
 * The key can be used to obtain the fatp_t, and the slot,
 * as it directly encodes them.
 */
static inline uint32_t
fatp_key(fatp_ctl_t *fat, fatp_t *fp, uint32_t slot)
{
	CTASSERT(CACHE_LINE_SIZE == 32 ||
		 CACHE_LINE_SIZE == 64 ||
		 CACHE_LINE_SIZE == 128);

	switch (fatp_ntags()) {
	case 7:
		return (fatp_index(fat, fp) << 3) | slot;
	case 15:
		return (fatp_index(fat, fp) << 4) | slot;
	case 31:
		return (fatp_index(fat, fp) << 5) | slot;
	default:
		KASSERT(0 && "no support, for no good reason");
		return ~0;
	}
}

static inline uint32_t
fatp_slot_from_key(fatp_ctl_t *fat, uint32_t key)
{
	CTASSERT(CACHE_LINE_SIZE == 32 ||
		 CACHE_LINE_SIZE == 64 ||
		 CACHE_LINE_SIZE == 128);

	switch (fatp_ntags()) {
	case 7:
		return key & 7;
	case 15:
		return key & 15;
	case 31:
		return key & 31;
	default:
		KASSERT(0 && "no support, for no good reason");
		return ~0;
	}
}

static inline fatp_t *
fatp_from_key(fatp_ctl_t *fat, uint32_t key)
{
	CTASSERT(CACHE_LINE_SIZE == 32 ||
		 CACHE_LINE_SIZE == 64 ||
		 CACHE_LINE_SIZE == 128);

	switch (fatp_ntags()) {
	case 7:
		key >>= 3;
		break;
	case 15:
		key >>= 4;
		break;
	case 31:
		key >>= 5;
		break;
	default:
		KASSERT(0 && "no support, for no good reason");
		return 0;
	}

	return key ? fat->base + key - 1 : 0;
}
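
/* An illustrative sketch, not part of the build: with 64-byte cache lines
 * there are 15 tags per fat pointer, so a key packs a 1-based fatp index
 * above a 4-bit slot number.  For a hypothetical fp at fat->base + 2
 * holding a tag in slot 5:
 *
 *	key  = fatp_key(fat, fp, 5);		// (3 << 4) | 5 == 0x35
 *	slot = fatp_slot_from_key(fat, key);	// 0x35 & 15  == 5
 *	fp   = fatp_from_key(fat, key);		// base + (0x35 >> 4) - 1
 *
 * A key of 0 is never generated, so 0 can serve as "no entry".
 */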

static inline uint32_t
idx_encode(vtw_ctl_t *ctl, uint32_t idx)
{
	return (idx << ctl->idx_bits) | idx;
}

static inline uint32_t
idx_decode(vtw_ctl_t *ctl, uint32_t bits)
{
	uint32_t	idx = bits & ctl->idx_mask;

	if (idx_encode(ctl, idx) == bits)
		return idx;
	else
		return ~0;
}

/*!\brief insert index into fatp hash
 *
 *\param idx - index of element being placed in hash chain
 *\param tag - 32-bit tag identifier
 *
 *\returns
 *	value which can be used to locate entry.
 *
 *\note
 *	we rely on the fact that there are unused high bits in the index
 *	for verification purposes on lookup.
 */
static inline uint32_t
fatp_vtw_inshash(fatp_ctl_t *fat, uint32_t idx, uint32_t tag, int which,
    void *dbg)
{
	fatp_t	*fp;
	fatp_t	**hash = (which ? fat->port : fat->hash);
	int	i;

	fp = hash[tag & fat->mask];

	while (!fp || fatp_full(fp)) {
		fatp_t	*fq;

		/* All entries are in use at the top level.
		 * We allocate a spare, and push the top level
		 * down one.  All entries in the fp we push down
		 * (think of a tapeworm here) will be expelled sooner than
		 * any entries added subsequently to this hash bucket.
		 * This is a property of the time waits we are exploiting.
		 */

		fq = fatp_alloc(fat);
		if (!fq) {
			vtw_age(fat->vtw, 0);
			fp = hash[tag & fat->mask];
			continue;
		}

		fq->inuse = 0;
		fq->nxt = fatp_index(fat, fp);

		hash[tag & fat->mask] = fq;

		fp = fq;
	}

	KASSERT(!fatp_full(fp));

	/* Fill highest index first.  Lookup is lowest first.
	 */
	for (i = fatp_ntags(); --i >= 0; ) {
		if (!((1 << i) & fp->inuse)) {
			break;
		}
	}

	fp->inuse |= 1 << i;
	fp->tag[i] = tag ^ idx_encode(fat->vtw, idx) ^ fatp_xtra[i];

	db_trace(KTR_VTW
		 , (fp, "fat: inuse %5.5x tag[%x] %8.8x"
		    , fp->inuse
		    , i, fp->tag[i]));

	return fatp_key(fat, fp, i);
}

static inline int
vtw_alive(const vtw_t *vtw)
{
	return vtw->hashed && vtw->expire.tv_sec;
}

static inline uint32_t
vtw_index_v4(vtw_ctl_t *ctl, vtw_v4_t *v4)
{
	if (ctl->base.v4 <= v4 && v4 <= ctl->lim.v4)
		return v4 - ctl->base.v4;

	KASSERT(0 && "vtw out of bounds");

	return ~0;
}

static inline uint32_t
vtw_index_v6(vtw_ctl_t *ctl, vtw_v6_t *v6)
{
	if (ctl->base.v6 <= v6 && v6 <= ctl->lim.v6)
		return v6 - ctl->base.v6;

	KASSERT(0 && "vtw out of bounds");

	return ~0;
}

static inline uint32_t
vtw_index(vtw_ctl_t *ctl, vtw_t *vtw)
{
	if (ctl->clidx)
		ctl = ctl->ctl;

	if (ctl->is_v4)
		return vtw_index_v4(ctl, (vtw_v4_t *)vtw);

	if (ctl->is_v6)
		return vtw_index_v6(ctl, (vtw_v6_t *)vtw);

	KASSERT(0 && "neither 4 nor 6.  most curious.");

	return ~0;
}

static inline vtw_t *
vtw_from_index(vtw_ctl_t *ctl, uint32_t idx)
{
	if (ctl->clidx)
		ctl = ctl->ctl;

	/* See if the index looks like it might be an index.
	 * Bits set outside the valid index bits are a giveaway.
	 */
	idx = idx_decode(ctl, idx);

	if (idx == ~0) {
		return 0;
	} else if (ctl->is_v4) {
		vtw_v4_t	*vtw = ctl->base.v4 + idx;

		return (ctl->base.v4 <= vtw && vtw <= ctl->lim.v4)
			? &vtw->common : 0;
	} else if (ctl->is_v6) {
		vtw_v6_t	*vtw = ctl->base.v6 + idx;

		return (ctl->base.v6 <= vtw && vtw <= ctl->lim.v6)
			? &vtw->common : 0;
	} else {
		KASSERT(0 && "badness");
		return 0;
	}
}
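
/* An illustrative sketch, not part of the build, of the self-verifying
 * encodings above, assuming a hypothetical arena where idx_bits == 16
 * and idx_mask == 0xffff.  The index is stored twice in one word, and
 * the stored tag word can be unwound only with the right tag and slot:
 *
 *	bits = idx_encode(ctl, 0x2a);		// 0x002a002a
 *	idx  = idx_decode(ctl, bits);		// 0x2a; junk decodes to ~0
 *
 *	stored = tag ^ idx_encode(ctl, idx) ^ fatp_xtra[slot];
 *	// on lookup, with the probe's tag in hand:
 *	bits   = stored ^ tag ^ fatp_xtra[slot];	// idx_encode(idx)
 *
 * A mismatched tag leaves garbage in the otherwise-unused high bits,
 * which idx_decode() rejects, so most false hits are turned away without
 * touching the vtw_t's cache line.
 */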

/*!\brief return the next vtw after this one.
 *
 * Due to the differing sizes of the entries in differing
 * arenas, we have to ensure we ++ the correct pointer type.
 *
 * Also handles wrap.
 */
static inline vtw_t *
vtw_next(vtw_ctl_t *ctl, vtw_t *vtw)
{
	if (ctl->is_v4) {
		vtw_v4_t	*v4 = (void*)vtw;

		vtw = &(++v4)->common;
	} else {
		vtw_v6_t	*v6 = (void*)vtw;

		vtw = &(++v6)->common;
	}

	if (vtw > ctl->lim.v)
		vtw = ctl->base.v;

	return vtw;
}

/*!\brief remove entry from FATP hash chains
 */
static inline void
vtw_unhash(vtw_ctl_t *ctl, vtw_t *vtw)
{
	fatp_ctl_t	*fat = ctl->fat;
	fatp_t		*fp;
	uint32_t	key = vtw->key;
	uint32_t	tag, slot, idx;
	vtw_v4_t	*v4 = (void*)vtw;
	vtw_v6_t	*v6 = (void*)vtw;

	if (!vtw->hashed) {
		KASSERT(0 && "unhashed");
		return;
	}

	if (fat->vtw->is_v4) {
		tag = v4_tag(v4->faddr, v4->fport, v4->laddr, v4->lport);
	} else if (fat->vtw->is_v6) {
		tag = v6_tag(&v6->faddr, v6->fport, &v6->laddr, v6->lport);
	} else {
		tag = 0;
		KASSERT(0 && "not reached");
	}

	/* Remove from fat->hash[]
	 */
	slot = fatp_slot_from_key(fat, key);
	fp   = fatp_from_key(fat, key);
	idx  = vtw_index(ctl, vtw);

	db_trace(KTR_VTW
		 , (fp, "fat: del inuse %5.5x slot %x idx %x key %x tag %x"
		    , fp->inuse, slot, idx, key, tag));

	KASSERT(fp->inuse & (1 << slot));
	KASSERT(fp->tag[slot] == (tag ^ idx_encode(ctl, idx)
				  ^ fatp_xtra[slot]));

	if ((fp->inuse & (1 << slot))
	    && fp->tag[slot] == (tag ^ idx_encode(ctl, idx)
				 ^ fatp_xtra[slot])) {
		fp->inuse ^= 1 << slot;
		fp->tag[slot] = 0;

		/* When we delete entries, we do not compact.  This is
		 * due to temporality.  We add entries, and they
		 * (eventually) expire.  Older entries will be further
		 * down the chain.
		 */
		if (!fp->inuse) {
			uint32_t hi = tag & fat->mask;
			fatp_t	*fq = 0;
			fatp_t	*fr = fat->hash[hi];

			while (fr && fr != fp) {
				fr = fatp_next(fat, fq = fr);
			}

			if (fr == fp) {
				if (fq) {
					fq->nxt = fp->nxt;
					fp->nxt = 0;
					fatp_free(fat, fp);
				} else {
					KASSERT(fat->hash[hi] == fp);

					if (fp->nxt) {
						fat->hash[hi]
							= fatp_next(fat, fp);
						fp->nxt = 0;
						fatp_free(fat, fp);
					} else {
						/* retain for next use.
						 */
						;
					}
				}
			} else {
				fr = fat->hash[hi];

				do {
					db_trace(KTR_VTW
						 , (fr
						    , "fat:*del inuse %5.5x"
						    " nxt %x"
						    , fr->inuse, fr->nxt));

					fr = fatp_next(fat, fq = fr);
				} while (fr && fr != fp);

				KASSERT(0 && "oops");
			}
		}
		vtw->key ^= ~0;
	}

	if (fat->vtw->is_v4) {
		tag = v4_port_tag(v4->lport);
	} else if (fat->vtw->is_v6) {
		tag = v6_port_tag(v6->lport);
	}

	/* Remove from fat->port[]
	 */
	key  = vtw->port_key;
	slot = fatp_slot_from_key(fat, key);
	fp   = fatp_from_key(fat, key);
	idx  = vtw_index(ctl, vtw);

	db_trace(KTR_VTW
		 , (fp, "fatport: del inuse %5.5x"
		    " slot %x idx %x key %x tag %x"
		    , fp->inuse, slot, idx, key, tag));

	KASSERT(fp->inuse & (1 << slot));
	KASSERT(fp->tag[slot] == (tag ^ idx_encode(ctl, idx)
				  ^ fatp_xtra[slot]));

	if ((fp->inuse & (1 << slot))
	    && fp->tag[slot] == (tag ^ idx_encode(ctl, idx)
				 ^ fatp_xtra[slot])) {
		fp->inuse ^= 1 << slot;
		fp->tag[slot] = 0;

		if (!fp->inuse) {
			uint32_t hi = tag & fat->mask;
			fatp_t	*fq = 0;
			fatp_t	*fr = fat->port[hi];

			while (fr && fr != fp) {
				fr = fatp_next(fat, fq = fr);
			}

			if (fr == fp) {
				if (fq) {
					fq->nxt = fp->nxt;
					fp->nxt = 0;
					fatp_free(fat, fp);
				} else {
					KASSERT(fat->port[hi] == fp);

					if (fp->nxt) {
						fat->port[hi]
							= fatp_next(fat, fp);
						fp->nxt = 0;
						fatp_free(fat, fp);
					} else {
						/* retain for next use.
						 */
						;
					}
				}
			}
		}
		vtw->port_key ^= ~0;
	}

	vtw->hashed = 0;
}

/*!\brief remove entry from hash, possibly free.
 */
void
vtw_del(vtw_ctl_t *ctl, vtw_t *vtw)
{
	KASSERT(mutex_owned(softnet_lock));

	if (vtw->hashed) {
		++vtw_stats.del;
		vtw_unhash(ctl, vtw);
	}

	/* We only delete the oldest entry.
	 */
	if (vtw != ctl->oldest.v)
		return;

	--ctl->nalloc;
	++ctl->nfree;

	vtw->expire.tv_sec  = 0;
	vtw->expire.tv_usec = ~0;

	if (!ctl->nalloc)
		ctl->oldest.v = 0;

	ctl->oldest.v = vtw_next(ctl, vtw);
}

/*!\brief insert vestigial timewait in hash chain
 */
static void
vtw_inshash_v4(vtw_ctl_t *ctl, vtw_t *vtw)
{
	uint32_t	idx = vtw_index(ctl, vtw);
	uint32_t	tag;
	vtw_v4_t	*v4 = (void*)vtw;

	KASSERT(mutex_owned(softnet_lock));
	KASSERT(!vtw->hashed);
	KASSERT(ctl->clidx == vtw->msl_class);

	++vtw_stats.ins;

	tag = v4_tag(v4->faddr, v4->fport,
		     v4->laddr, v4->lport);

	vtw->key = fatp_vtw_inshash(ctl->fat, idx, tag, 0, vtw);

	db_trace(KTR_VTW, (ctl
			   , "vtw: ins %8.8x:%4.4x %8.8x:%4.4x"
			   " tag %8.8x key %8.8x"
			   , v4->faddr, v4->fport
			   , v4->laddr, v4->lport
			   , tag
			   , vtw->key));

	tag = v4_port_tag(v4->lport);
	vtw->port_key = fatp_vtw_inshash(ctl->fat, idx, tag, 1, vtw);

	db_trace(KTR_VTW, (ctl, "vtw: ins %P - %4.4x tag %8.8x key %8.8x"
			   , v4->lport, v4->lport
			   , tag
			   , vtw->key));

	vtw->hashed = 1;
}

/*!\brief insert vestigial timewait in hash chain
 */
static void
vtw_inshash_v6(vtw_ctl_t *ctl, vtw_t *vtw)
{
	uint32_t	idx = vtw_index(ctl, vtw);
	uint32_t	tag;
	vtw_v6_t	*v6 = (void*)vtw;

	KASSERT(mutex_owned(softnet_lock));
	KASSERT(!vtw->hashed);
	KASSERT(ctl->clidx == vtw->msl_class);

	++vtw_stats.ins;

	tag = v6_tag(&v6->faddr, v6->fport,
		     &v6->laddr, v6->lport);

	vtw->key = fatp_vtw_inshash(ctl->fat, idx, tag, 0, vtw);

	tag = v6_port_tag(v6->lport);
	vtw->port_key = fatp_vtw_inshash(ctl->fat, idx, tag, 1, vtw);

	db_trace(KTR_VTW, (ctl, "vtw: ins %P - %4.4x tag %8.8x key %8.8x"
			   , v6->lport, v6->lport
			   , tag
			   , vtw->key));

	vtw->hashed = 1;
}
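
/* An illustrative sketch, not part of the build: each vestigial PCB is
 * threaded onto two chains, the full 4-tuple hash (fat->hash, which == 0)
 * and the local-port-only hash (fat->port, which == 1), and remembers one
 * key for each so it can be verified and unhashed later:
 *
 *	vtw->key      = fatp_vtw_inshash(fat, idx, v4_tag(...), 0, vtw);
 *	vtw->port_key = fatp_vtw_inshash(fat, idx, v4_port_tag(lport), 1, vtw);
 *
 * Full-tuple lookups (segment demux) walk the first chain; the
 * lookup_ports iterators (local port allocation) walk the second.
 */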

static vtw_t *
vtw_lookup_hash_v4(vtw_ctl_t *ctl, uint32_t faddr, uint16_t fport
		   , uint32_t laddr, uint16_t lport
		   , int which)
{
	vtw_v4_t	*v4;
	vtw_t		*vtw;
	uint32_t	tag;
	fatp_t		*fp;
	int		i;
	uint32_t	fatps = 0, probes = 0, losings = 0;

	if (!ctl || !ctl->fat)
		return 0;

	++vtw_stats.look[which];

	if (which) {
		tag = v4_port_tag(lport);
		fp  = ctl->fat->port[tag & ctl->fat->mask];
	} else {
		tag = v4_tag(faddr, fport, laddr, lport);
		fp  = ctl->fat->hash[tag & ctl->fat->mask];
	}

	while (fp && fp->inuse) {
		uint32_t	inuse = fp->inuse;

		++fatps;

		for (i = 0; inuse && i < fatp_ntags(); ++i) {
			uint32_t	idx;

			if (!(inuse & (1 << i)))
				continue;

			inuse ^= 1 << i;

			++probes;
			++vtw_stats.probe[which];

			idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
			vtw = vtw_from_index(ctl, idx);

			if (!vtw) {
				/* Hopefully fast path.
				 */
				db_trace(KTR_VTW
					 , (fp, "vtw: fast %A:%P %A:%P"
					    " idx %x tag %x"
					    , faddr, fport
					    , laddr, lport
					    , idx, tag));
				continue;
			}

			v4 = (void*)vtw;

			/* The de-referencing of vtw is what we want to avoid.
			 * Losing.
			 */
			if (vtw_alive(vtw)
			    && ((which ? vtw->port_key : vtw->key)
				== fatp_key(ctl->fat, fp, i))
			    && (which
				|| (v4->faddr == faddr && v4->laddr == laddr
				    && v4->fport == fport))
			    && v4->lport == lport) {
				++vtw_stats.hit[which];

				db_trace(KTR_VTW
					 , (fp, "vtw: hit %8.8x:%4.4x"
					    " %8.8x:%4.4x idx %x key %x"
					    , faddr, fport
					    , laddr, lport
					    , idx_decode(ctl, idx), vtw->key));

				KASSERT(vtw->hashed);

				goto out;
			}
			++vtw_stats.losing[which];
			++losings;

			if (vtw_alive(vtw)) {
				db_trace(KTR_VTW
					 , (fp, "vtw:!mis %8.8x:%4.4x"
					    " %8.8x:%4.4x key %x tag %x"
					    , faddr, fport
					    , laddr, lport
					    , fatp_key(ctl->fat, fp, i)
					    , v4_tag(faddr, fport
						     , laddr, lport)));
				db_trace(KTR_VTW
					 , (vtw, "vtw:!mis %8.8x:%4.4x"
					    " %8.8x:%4.4x key %x tag %x"
					    , v4->faddr, v4->fport
					    , v4->laddr, v4->lport
					    , vtw->key
					    , v4_tag(v4->faddr, v4->fport
						     , v4->laddr, v4->lport)));

				if (vtw->key == fatp_key(ctl->fat, fp, i)) {
					db_trace(KTR_VTW
						 , (vtw, "vtw:!mis %8.8x:%4.4x"
						    " %8.8x:%4.4x key %x"
						    " which %x"
						    , v4->faddr, v4->fport
						    , v4->laddr, v4->lport
						    , vtw->key
						    , which));

				} else {
					db_trace(KTR_VTW
						 , (vtw
						    , "vtw:!mis"
						    " key %8.8x != %8.8x"
						    " idx %x i %x which %x"
						    , vtw->key
						    , fatp_key(ctl->fat, fp, i)
						    , idx_decode(ctl, idx)
						    , i
						    , which));
				}
			} else {
				db_trace(KTR_VTW
					 , (fp
					    , "vtw:!mis free entry"
					    " idx %x vtw %p which %x"
					    , idx_decode(ctl, idx)
					    , vtw, which));
			}
		}

		if (fp->nxt) {
			fp = fatp_next(ctl->fat, fp);
		} else {
			break;
		}
	}
	++vtw_stats.miss[which];
	vtw = 0;
out:
	if (fatps > vtw_stats.max_chain[which])
		vtw_stats.max_chain[which] = fatps;
	if (probes > vtw_stats.max_probe[which])
		vtw_stats.max_probe[which] = probes;
	if (losings > vtw_stats.max_loss[which])
		vtw_stats.max_loss[which] = losings;

	return vtw;
}

static vtw_t *
vtw_lookup_hash_v6(vtw_ctl_t *ctl, const struct in6_addr *faddr, uint16_t fport
		   , const struct in6_addr *laddr, uint16_t lport
		   , int which)
{
	vtw_v6_t	*v6;
	vtw_t		*vtw;
	uint32_t	tag;
	fatp_t		*fp;
	int		i;
	uint32_t	fatps = 0, probes = 0, losings = 0;

	++vtw_stats.look[which];

	if (!ctl || !ctl->fat)
		return 0;

	if (which) {
		tag = v6_port_tag(lport);
		fp  = ctl->fat->port[tag & ctl->fat->mask];
	} else {
		tag = v6_tag(faddr, fport, laddr, lport);
		fp  = ctl->fat->hash[tag & ctl->fat->mask];
	}

	while (fp && fp->inuse) {
		uint32_t	inuse = fp->inuse;

		++fatps;

		for (i = 0; inuse && i < fatp_ntags(); ++i) {
			uint32_t	idx;

			if (!(inuse & (1 << i)))
				continue;

			inuse ^= 1 << i;

			++probes;
			++vtw_stats.probe[which];

			idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
			vtw = vtw_from_index(ctl, idx);

			db_trace(KTR_VTW
				 , (fp, "probe: %2d %6A:%4.4x %6A:%4.4x idx %x"
				    , i
				    , db_store(faddr, sizeof (*faddr)), fport
				    , db_store(laddr, sizeof (*laddr)), lport
				    , idx_decode(ctl, idx)));

			if (!vtw) {
				/* Hopefully fast path.
				 */
				continue;
			}

			v6 = (void*)vtw;

			if (vtw_alive(vtw)
			    && ((which ? vtw->port_key : vtw->key)
				== fatp_key(ctl->fat, fp, i))
			    && v6->lport == lport
			    && (which
				|| (v6->fport == fport
				    && !bcmp(&v6->faddr, faddr, sizeof (*faddr))
				    && !bcmp(&v6->laddr, laddr
					     , sizeof (*laddr))))) {
				++vtw_stats.hit[which];

				KASSERT(vtw->hashed);
				goto out;
			} else {
				++vtw_stats.losing[which];
				++losings;
			}
		}

		if (fp->nxt) {
			fp = fatp_next(ctl->fat, fp);
		} else {
			break;
		}
	}
	++vtw_stats.miss[which];
	vtw = 0;
out:
	if (fatps > vtw_stats.max_chain[which])
		vtw_stats.max_chain[which] = fatps;
	if (probes > vtw_stats.max_probe[which])
		vtw_stats.max_probe[which] = probes;
	if (losings > vtw_stats.max_loss[which])
		vtw_stats.max_loss[which] = losings;

	return vtw;
}

/*!\brief port iterator
 */
static vtw_t *
vtw_next_port_v4(struct tcp_ports_iterator *it)
{
	vtw_ctl_t	*ctl = it->ctl;
	vtw_v4_t	*v4;
	vtw_t		*vtw;
	uint32_t	tag;
	uint16_t	lport = it->port;
	fatp_t		*fp;
	int		i;
	uint32_t	fatps = 0, probes = 0, losings = 0;

	tag = v4_port_tag(lport);
	if (!it->fp) {
		it->fp = ctl->fat->port[tag & ctl->fat->mask];
		it->slot_idx = 0;
	}
	fp = it->fp;

	while (fp) {
		uint32_t	inuse = fp->inuse;

		++fatps;

		for (i = it->slot_idx; inuse && i < fatp_ntags(); ++i) {
			uint32_t	idx;

			if (!(inuse & (1 << i)))
				continue;

			inuse &= ~0U << i;

			if (i < it->slot_idx)
				continue;

			++vtw_stats.probe[1];
			++probes;

			idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
			vtw = vtw_from_index(ctl, idx);

			if (!vtw) {
				/* Hopefully fast path.
				 */
				continue;
			}

			v4 = (void*)vtw;

			if (vtw_alive(vtw)
			    && vtw->port_key == fatp_key(ctl->fat, fp, i)
			    && v4->lport == lport) {
				++vtw_stats.hit[1];

				it->slot_idx = i + 1;

				goto out;
			} else if (vtw_alive(vtw)) {
				++vtw_stats.losing[1];
				++losings;

				db_trace(KTR_VTW
					 , (vtw, "vtw:!mis"
					    " port %8.8x:%4.4x %8.8x:%4.4x"
					    " key %x port %x"
					    , v4->faddr, v4->fport
					    , v4->laddr, v4->lport
					    , vtw->key
					    , lport));
			} else {
				/* Really losing here.  We are coming
				 * up with references to free entries.
				 * Might find it better to use
				 * traditional, or need another
				 * ad-hockery.  The other ad-hockery
				 * would be to pull more into the
				 * cache line to reject the false
				 * hits.
				 */
				++vtw_stats.losing[1];
				++losings;
				db_trace(KTR_VTW
					 , (fp, "vtw:!mis port %x"
					    " - free entry idx %x vtw %p"
					    , lport
					    , idx_decode(ctl, idx)
					    , vtw));
			}
		}

		if (fp->nxt) {
			it->fp = fp = fatp_next(ctl->fat, fp);
			it->slot_idx = 0;
		} else {
			it->fp = 0;
			break;
		}
	}
	++vtw_stats.miss[1];

	vtw = 0;
out:
	if (fatps > vtw_stats.max_chain[1])
		vtw_stats.max_chain[1] = fatps;
	if (probes > vtw_stats.max_probe[1])
		vtw_stats.max_probe[1] = probes;
	if (losings > vtw_stats.max_loss[1])
		vtw_stats.max_loss[1] = losings;

	return vtw;
}

/*!\brief port iterator
 */
static vtw_t *
vtw_next_port_v6(struct tcp_ports_iterator *it)
{
	vtw_ctl_t	*ctl = it->ctl;
	vtw_v6_t	*v6;
	vtw_t		*vtw;
	uint32_t	tag;
	uint16_t	lport = it->port;
	fatp_t		*fp;
	int		i;
	uint32_t	fatps = 0, probes = 0, losings = 0;

	tag = v6_port_tag(lport);
	if (!it->fp) {
		it->fp = ctl->fat->port[tag & ctl->fat->mask];
		it->slot_idx = 0;
	}
	fp = it->fp;

	while (fp) {
		uint32_t	inuse = fp->inuse;

		++fatps;

		for (i = it->slot_idx; inuse && i < fatp_ntags(); ++i) {
			uint32_t	idx;

			if (!(inuse & (1 << i)))
				continue;

			inuse &= ~0U << i;

			if (i < it->slot_idx)
				continue;

			++vtw_stats.probe[1];
			++probes;

			idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
			vtw = vtw_from_index(ctl, idx);

			if (!vtw) {
				/* Hopefully fast path.
				 */
				continue;
			}

			v6 = (void*)vtw;

			db_trace(KTR_VTW
				 , (vtw, "vtw: i %x idx %x fp->tag %x"
				    " tag %x xtra %x"
				    , i, idx_decode(ctl, idx)
				    , fp->tag[i], tag, fatp_xtra[i]));

			if (vtw_alive(vtw)
			    && vtw->port_key == fatp_key(ctl->fat, fp, i)
			    && v6->lport == lport) {
				++vtw_stats.hit[1];

				db_trace(KTR_VTW
					 , (fp, "vtw: nxt port %P - %4.4x"
					    " idx %x key %x"
					    , lport, lport
					    , idx_decode(ctl, idx), vtw->key));

				it->slot_idx = i + 1;
				goto out;
			} else if (vtw_alive(vtw)) {
				++vtw_stats.losing[1];

				db_trace(KTR_VTW
					 , (vtw, "vtw:!mis port %6A:%4.4x"
					    " %6A:%4.4x key %x port %x"
					    , db_store(&v6->faddr
						       , sizeof (v6->faddr))
					    , v6->fport
					    , db_store(&v6->laddr
						       , sizeof (v6->faddr))
					    , v6->lport
					    , vtw->key
					    , lport));
			} else {
				/* Really losing here.  We are coming
				 * up with references to free entries.
				 * Might find it better to use
				 * traditional, or need another
				 * ad-hockery.  The other ad-hockery
				 * would be to pull more into the
				 * cache line to reject the false
				 * hits.
				 */
				++vtw_stats.losing[1];
				++losings;

				db_trace(KTR_VTW
					 , (fp
					    , "vtw:!mis port %x"
					    " - free entry idx %x vtw %p"
					    , lport, idx_decode(ctl, idx)
					    , vtw));
			}
		}

		if (fp->nxt) {
			it->fp = fp = fatp_next(ctl->fat, fp);
			it->slot_idx = 0;
		} else {
			it->fp = 0;
			break;
		}
	}
	++vtw_stats.miss[1];

	vtw = 0;
out:
	if (fatps > vtw_stats.max_chain[1])
		vtw_stats.max_chain[1] = fatps;
	if (probes > vtw_stats.max_probe[1])
		vtw_stats.max_probe[1] = probes;
	if (losings > vtw_stats.max_loss[1])
		vtw_stats.max_loss[1] = losings;

	return vtw;
}

/*!\brief initialise the VTW allocation arena
 *
 * There are 1+3 allocation classes:
 *	0	classless
 *	{1,2,3}	MSL-class based allocation
 *
 * The allocation arenas are all initialised.  Classless gets all the
 * space.  MSL-class based divides the arena, so that allocation
 * within a class can proceed without having to consider entries
 * (aka: cache lines) from different classes.
 *
 * Usually, we are completely classless or class-based, but there can be
 * transition periods, corresponding to dynamic adjustments in the config
 * by the operator.
 */
static void
vtw_init(fatp_ctl_t *fat, vtw_ctl_t *ctl, const uint32_t n, vtw_t *ctl_base_v)
{
	int class_n, i;
	vtw_t *base;

	ctl->base.v = ctl_base_v;

	if (ctl->is_v4) {
		ctl->lim.v4   = ctl->base.v4 + n - 1;
		ctl->alloc.v4 = ctl->base.v4;
	} else {
		ctl->lim.v6   = ctl->base.v6 + n - 1;
		ctl->alloc.v6 = ctl->base.v6;
	}

	ctl->nfree = n;
	ctl->ctl   = ctl;

	ctl->idx_bits = 32;
	for (ctl->idx_mask = ~0; (ctl->idx_mask & (n-1)) == n-1; ) {
		ctl->idx_mask >>= 1;
		ctl->idx_bits -= 1;
	}

	ctl->idx_mask <<= 1;
	ctl->idx_mask |= 1;
	ctl->idx_bits += 1;

	ctl->fat = fat;
	fat->vtw = ctl;

	/* Divide the resources equally amongst the classes.
	 * This is not optimal, as the different classes
	 * arrive and leave at different rates, but it is
	 * the best I can do for now.
	 */
	class_n = n / (VTW_NCLASS-1);
	base    = ctl->base.v;

	for (i = 1; i < VTW_NCLASS; ++i) {
		int j;

		ctl[i] = ctl[0];
		ctl[i].clidx = i;

		ctl[i].base.v = base;
		ctl[i].alloc  = ctl[i].base;

		for (j = 0; j < class_n - 1; ++j) {
			if (tcp_msl_enable)
				base->msl_class = i;
			base = vtw_next(ctl, base);
		}

		ctl[i].lim.v = base;
		base = vtw_next(ctl, base);
		ctl[i].nfree = class_n;
	}

	vtw_debug_init();
}

/*!\brief map class to TCP MSL
 */
static inline uint32_t
class_to_msl(int msl_class)
{
	switch (msl_class) {
	case 0:
	case 1:
		return tcp_msl_remote ? tcp_msl_remote : (TCPTV_MSL >> 0);
	case 2:
		return tcp_msl_local ? tcp_msl_local : (TCPTV_MSL >> 1);
	default:
		return tcp_msl_loop ? tcp_msl_loop : (TCPTV_MSL >> 2);
	}
}
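
/* An illustrative sketch, not part of the build: class_to_msl() and the
 * inverse mapping below agree with each other.  With the tcp_msl_* knobs
 * unset, the fallbacks are TCPTV_MSL, TCPTV_MSL/2 and TCPTV_MSL/4 for
 * classes 1 (remote), 2 (local) and 3 (loopback), so for example:
 *
 *	msl_to_class(class_to_msl(2))
 *	  == msl_to_class(TCPTV_MSL >> 1)	// above TCPTV_MSL >> 2 ...
 *	  == 2					// ... but <= TCPTV_MSL >> 1
 *
 * Class 0 (classless) is only produced when tcp_msl_enable is off.
 */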

/*!\brief map TCP MSL to class
 */
static inline uint32_t
msl_to_class(int msl)
{
	if (tcp_msl_enable) {
		if (msl <= (tcp_msl_loop ? tcp_msl_loop : (TCPTV_MSL >> 2)))
			return 1+2;
		if (msl <= (tcp_msl_local ? tcp_msl_local : (TCPTV_MSL >> 1)))
			return 1+1;
		return 1;
	}
	return 0;
}

/*!\brief allocate a vtw entry
 */
static inline vtw_t *
vtw_alloc(vtw_ctl_t *ctl)
{
	vtw_t	*vtw	= 0;
	int	stuck	= 0;
	int	avail	= ctl ? (ctl->nalloc + ctl->nfree) : 0;
	int	msl;

	KASSERT(mutex_owned(softnet_lock));

	/* If no resources, we will not get far.
	 */
	if (!ctl || !ctl->base.v4 || avail <= 0)
		return 0;

	/* Obtain a free one.
	 */
	while (!ctl->nfree) {
		vtw_age(ctl, 0);

		if (++stuck > avail) {
			/* When in transition between
			 * schemes (classless, classed) we
			 * can be stuck having to await the
			 * expiration of cross-allocated entries.
			 *
			 * Returning zero means we will fall back to the
			 * traditional TIME_WAIT handling, except in the
			 * case of a re-sched, in which case we cannot
			 * perform the re-sched, but will retain the extant
			 * entry.
			 */
			db_trace(KTR_VTW
				 , (ctl, "vtw:!none free in class %x %x/%x"
				    , ctl->clidx
				    , ctl->nalloc, ctl->nfree));

			return 0;
		}
	}

	vtw = ctl->alloc.v;

	if (vtw->msl_class != ctl->clidx) {
		/* Usurping rules:
		 *	0 -> {1,2,3} or {1,2,3} -> 0
		 */
		KASSERT(!vtw->msl_class || !ctl->clidx);

		if (vtw->hashed || vtw->expire.tv_sec) {
			/* As this is owned by some other class,
			 * we must wait for it to expire.
			 * This will only happen on class/classless
			 * transitions, which are guaranteed to progress
			 * to completion in small finite time, barring bugs.
			 */
			db_trace(KTR_VTW
				 , (ctl, "vtw:!%p class %x!=%x %x:%x%s"
				    , vtw, vtw->msl_class, ctl->clidx
				    , vtw->expire.tv_sec
				    , vtw->expire.tv_usec
				    , vtw->hashed ? " hashed" : ""));

			return 0;
		}

		db_trace(KTR_VTW
			 , (ctl, "vtw:!%p usurped from %x to %x"
			    , vtw, vtw->msl_class, ctl->clidx));

		vtw->msl_class = ctl->clidx;
	}

	if (vtw_alive(vtw)) {
		KASSERT(0 && "next free not free");
		return 0;
	}

	/* Advance allocation pointer.
	 */
	ctl->alloc.v = vtw_next(ctl, vtw);

	--ctl->nfree;
	++ctl->nalloc;

	msl = (2 * class_to_msl(ctl->clidx) * 1000) / PR_SLOWHZ;	// msec

	/* mark expiration
	 */
	getmicrouptime(&vtw->expire);

	/* Move expiration into the future.
	 */
	vtw->expire.tv_sec  += msl / 1000;
	vtw->expire.tv_usec += 1000 * (msl % 1000);

	while (vtw->expire.tv_usec >= 1000*1000) {
		vtw->expire.tv_usec -= 1000*1000;
		vtw->expire.tv_sec  += 1;
	}

	if (!ctl->oldest.v)
		ctl->oldest.v = vtw;

	return vtw;
}
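
/* An illustrative sketch, not part of the build, of the expiry arithmetic
 * above: class_to_msl() returns slow-timeout ticks, so with the loopback
 * default of 2 seconds (2 * PR_SLOWHZ ticks),
 *
 *	msl = (2 * (2 * PR_SLOWHZ) * 1000) / PR_SLOWHZ;	// 4000 msec
 *
 * and the entry expires 2*MSL == 4 seconds after it is allocated, at
 * which point vtw_age() may reclaim it.
 */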

/*!\brief expiration
 */
static int
vtw_age(vtw_ctl_t *ctl, struct timeval *_when)
{
	vtw_t	*vtw;
	struct timeval then, *when = _when;
	int	maxtries = 0;

	if (!ctl->oldest.v) {
		KASSERT(!ctl->nalloc);
		return 0;
	}

	for (vtw = ctl->oldest.v; vtw && ctl->nalloc; ) {
		if (++maxtries > ctl->nalloc)
			break;

		if (vtw->msl_class != ctl->clidx) {
			db_trace(KTR_VTW
				 , (vtw, "vtw:!age class mismatch %x != %x"
				    , vtw->msl_class, ctl->clidx));
			/* XXXX
			 * See if the appropriate action is to skip to the next.
			 * XXXX
			 */
			ctl->oldest.v = vtw = vtw_next(ctl, vtw);
			continue;
		}
		if (!when) {
			/* Latch oldest timeval if none specified.
			 */
			then = vtw->expire;
			when = &then;
		}

		if (!timercmp(&vtw->expire, when, <=))
			break;

		db_trace(KTR_VTW
			 , (vtw, "vtw: expire %x %8.8x:%8.8x %x/%x"
			    , ctl->clidx
			    , vtw->expire.tv_sec
			    , vtw->expire.tv_usec
			    , ctl->nalloc
			    , ctl->nfree));

		if (!_when)
			++vtw_stats.kill;

		vtw_del(ctl, vtw);
		vtw = ctl->oldest.v;
	}

	return ctl->nalloc;	// # remaining allocated
}

static callout_t vtw_cs;

/*!\brief notice the passage of time.
 * It seems to be getting faster.  What happened to the year?
 */
static void
vtw_tick(void *arg)
{
	struct timeval now;
	int i, cnt = 0;

	getmicrouptime(&now);

	db_trace(KTR_VTW, (arg, "vtk: tick - now %8.8x:%8.8x"
			   , now.tv_sec, now.tv_usec));

	mutex_enter(softnet_lock);

	for (i = 0; i < VTW_NCLASS; ++i) {
		cnt += vtw_age(&vtw_tcpv4[i], &now);
		cnt += vtw_age(&vtw_tcpv6[i], &now);
	}

	/* Keep ticks coming while we need them.
	 */
	if (cnt)
		callout_schedule(&vtw_cs, hz / 5);
	else {
		tcp_vtw_was_enabled = 0;
		tcbtable.vestige    = 0;
	}
	mutex_exit(softnet_lock);
}

/* in_pcblookup_ports assist for handling vestigial entries.
 */
static void *
tcp_init_ports_v4(struct in_addr addr, u_int port, int wild)
{
	struct tcp_ports_iterator *it = &tcp_ports_iterator_v4;

	bzero(it, sizeof (*it));

	/* Note: the reference to vtw_tcpv4[0] is fine.
	 * We do not need per-class iteration.  We just
	 * need to get to the fat, and there is one
	 * shared fat.
	 */
	if (vtw_tcpv4[0].fat) {
		it->addr.v4 = addr;
		it->port = port;
		it->wild = !!wild;
		it->ctl  = &vtw_tcpv4[0];

		++vtw_stats.look[1];
	}

	return it;
}

/*!\brief export an IPv4 vtw.
 */
static int
vtw_export_v4(vtw_ctl_t *ctl, vtw_t *vtw, vestigial_inpcb_t *res)
{
	vtw_v4_t	*v4 = (void*)vtw;

	bzero(res, sizeof (*res));

	if (ctl && vtw) {
		if (!ctl->clidx && vtw->msl_class)
			ctl += vtw->msl_class;
		else
			KASSERT(ctl->clidx == vtw->msl_class);

		res->valid = 1;
		res->v4    = 1;

		res->faddr.v4.s_addr = v4->faddr;
		res->laddr.v4.s_addr = v4->laddr;
		res->fport	= v4->fport;
		res->lport	= v4->lport;
		res->vtw	= vtw;		// netlock held over call(s)
		res->ctl	= ctl;
		res->reuse_addr = vtw->reuse_addr;
		res->reuse_port = vtw->reuse_port;
		res->snd_nxt	= vtw->snd_nxt;
		res->rcv_nxt	= vtw->rcv_nxt;
		res->rcv_wnd	= vtw->rcv_wnd;
		res->uid	= vtw->uid;
	}

	return res->valid;
}
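
/* An illustrative sketch, not part of the build, of how in_pcblookup_ports
 * consumes the init/next pair via the vestige hooks; the same pattern
 * appears in the VTW_DEBUG self-checks in vtw_add() below:
 *
 *	struct vestigial_inpcb res;
 *	void *it = tcp_init_ports_v4(laddr, lport, 0);
 *
 *	while (tcp_next_port_v4(it, &res)) {
 *		// res.lport, res.uid, etc. describe one vestigial PCB
 *	}
 */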

/*!\brief return next port in the port iterator.  yowza.
 */
static int
tcp_next_port_v4(void *arg, struct vestigial_inpcb *res)
{
	struct tcp_ports_iterator *it = arg;
	vtw_t		*vtw = 0;

	if (it->ctl)
		vtw = vtw_next_port_v4(it);

	if (!vtw)
		it->ctl = 0;

	return vtw_export_v4(it->ctl, vtw, res);
}

static int
tcp_lookup_v4(struct in_addr faddr, uint16_t fport,
	      struct in_addr laddr, uint16_t lport,
	      struct vestigial_inpcb *res)
{
	vtw_t		*vtw;
	vtw_ctl_t	*ctl;

	db_trace(KTR_VTW
		 , (res, "vtw: lookup %A:%P %A:%P"
		    , faddr, fport
		    , laddr, lport));

	vtw = vtw_lookup_hash_v4((ctl = &vtw_tcpv4[0])
				 , faddr.s_addr, fport
				 , laddr.s_addr, lport, 0);

	return vtw_export_v4(ctl, vtw, res);
}

/* in_pcblookup_ports assist for handling vestigial entries.
 */
static void *
tcp_init_ports_v6(const struct in6_addr *addr, u_int port, int wild)
{
	struct tcp_ports_iterator *it = &tcp_ports_iterator_v6;

	bzero(it, sizeof (*it));

	/* Note: the reference to vtw_tcpv6[0] is fine.
	 * We do not need per-class iteration.  We just
	 * need to get to the fat, and there is one
	 * shared fat.
	 */
	if (vtw_tcpv6[0].fat) {
		it->addr.v6 = *addr;
		it->port = port;
		it->wild = !!wild;
		it->ctl  = &vtw_tcpv6[0];

		++vtw_stats.look[1];
	}

	return it;
}

/*!\brief export an IPv6 vtw.
 */
static int
vtw_export_v6(vtw_ctl_t *ctl, vtw_t *vtw, vestigial_inpcb_t *res)
{
	vtw_v6_t	*v6 = (void*)vtw;

	bzero(res, sizeof (*res));

	if (ctl && vtw) {
		if (!ctl->clidx && vtw->msl_class)
			ctl += vtw->msl_class;
		else
			KASSERT(ctl->clidx == vtw->msl_class);

		res->valid = 1;
		res->v4    = 0;

		res->faddr.v6	= v6->faddr;
		res->laddr.v6	= v6->laddr;
		res->fport	= v6->fport;
		res->lport	= v6->lport;
		res->vtw	= vtw;		// netlock held over call(s)
		res->ctl	= ctl;

		res->v6only	= vtw->v6only;
		res->reuse_addr	= vtw->reuse_addr;
		res->reuse_port	= vtw->reuse_port;

		res->snd_nxt	= vtw->snd_nxt;
		res->rcv_nxt	= vtw->rcv_nxt;
		res->rcv_wnd	= vtw->rcv_wnd;
		res->uid	= vtw->uid;
	}

	return res->valid;
}

static int
tcp_next_port_v6(void *arg, struct vestigial_inpcb *res)
{
	struct tcp_ports_iterator *it = arg;
	vtw_t		*vtw = 0;

	if (it->ctl)
		vtw = vtw_next_port_v6(it);

	if (!vtw)
		it->ctl = 0;

	return vtw_export_v6(it->ctl, vtw, res);
}

static int
tcp_lookup_v6(const struct in6_addr *faddr, uint16_t fport,
	      const struct in6_addr *laddr, uint16_t lport,
	      struct vestigial_inpcb *res)
{
	vtw_ctl_t	*ctl;
	vtw_t		*vtw;

	db_trace(KTR_VTW
		 , (res, "vtw: lookup %6A:%P %6A:%P"
		    , db_store(faddr, sizeof (*faddr)), fport
		    , db_store(laddr, sizeof (*laddr)), lport));

	vtw = vtw_lookup_hash_v6((ctl = &vtw_tcpv6[0])
				 , faddr, fport
				 , laddr, lport, 0);

	return vtw_export_v6(ctl, vtw, res);
}

static vestigial_hooks_t tcp_hooks = {
	.init_ports4	= tcp_init_ports_v4,
	.next_port4	= tcp_next_port_v4,
	.lookup4	= tcp_lookup_v4,
	.init_ports6	= tcp_init_ports_v6,
	.next_port6	= tcp_next_port_v6,
	.lookup6	= tcp_lookup_v6,
};

static bool
vtw_select(int af, fatp_ctl_t **fatp, vtw_ctl_t **ctlp)
{
	fatp_ctl_t	*fat;
	vtw_ctl_t	*ctl;

	switch (af) {
	case AF_INET:
		fat = &fat_tcpv4;
		ctl = &vtw_tcpv4[0];
		break;
	case AF_INET6:
		fat = &fat_tcpv6;
		ctl = &vtw_tcpv6[0];
		break;
	default:
		return false;
	}
	if (fatp != NULL)
		*fatp = fat;
	if (ctlp != NULL)
		*ctlp = ctl;
	return true;
}

/*!\brief initialize controlling instance
 */
static int
vtw_control_init(int af)
{
	fatp_ctl_t	*fat;
	vtw_ctl_t	*ctl;
	fatp_t		*fat_base;
	fatp_t		**fat_hash;
	vtw_t		*ctl_base_v;
	uint32_t	n, m;
	size_t		sz;

	KASSERT(powerof2(tcp_vtw_entries));

	if (!vtw_select(af, &fat, &ctl))
		return EAFNOSUPPORT;

	if (fat->hash != NULL) {
		KASSERT(fat->base != NULL && ctl->base.v != NULL);
		return 0;
	}

	/* Allocate 10% more capacity in the fat pointers.
	 * We should only need ~#hash additional based on
	 * how they age, but TIME_WAIT assassination could cause
	 * sparse fat pointer utilisation.
	 */
	m = 512;
	n = 2*m + (11 * (tcp_vtw_entries / fatp_ntags())) / 10;
	sz = (ctl->is_v4 ? sizeof(vtw_v4_t) : sizeof(vtw_v6_t));

	fat_hash = kmem_zalloc(2*m * sizeof(fatp_t *), KM_NOSLEEP);

	if (fat_hash == NULL) {
		printf("%s: could not allocate %zu bytes for "
		    "hash anchors", __func__, 2*m * sizeof(fatp_t *));
		return ENOMEM;
	}

	fat_base = kmem_zalloc(2*n * sizeof(fatp_t), KM_NOSLEEP);

	if (fat_base == NULL) {
		kmem_free(fat_hash, 2*m * sizeof (fatp_t *));
		printf("%s: could not allocate %zu bytes for "
		    "fatp_t array", __func__, 2*n * sizeof(fatp_t));
		return ENOMEM;
	}

	ctl_base_v = kmem_zalloc(tcp_vtw_entries * sz, KM_NOSLEEP);

	if (ctl_base_v == NULL) {
		kmem_free(fat_hash, 2*m * sizeof (fatp_t *));
		kmem_free(fat_base, 2*n * sizeof(fatp_t));
		printf("%s: could not allocate %zu bytes for "
		    "vtw_t array", __func__, tcp_vtw_entries * sz);
		return ENOMEM;
	}

	fatp_init(fat, n, m, fat_base, fat_hash);

	vtw_init(fat, ctl, tcp_vtw_entries, ctl_base_v);

	return 0;
}

/*!\brief select controlling instance
 */
static vtw_ctl_t *
vtw_control(int af, uint32_t msl)
{
	fatp_ctl_t	*fat;
	vtw_ctl_t	*ctl;
	int		msl_class = msl_to_class(msl);

	if (!vtw_select(af, &fat, &ctl))
		return NULL;

	if (!fat->base || !ctl->base.v)
		return NULL;

	if (!tcp_vtw_was_enabled) {
		/* This guarantees us timer ticks until we no longer
		 * need them.
		 */
		tcp_vtw_was_enabled = 1;

		callout_schedule(&vtw_cs, hz / 5);

		tcbtable.vestige = &tcp_hooks;
	}

	return ctl + msl_class;
}
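
/* An illustrative sketch, not part of the build, of the sizing above,
 * assuming a hypothetical tcp_vtw_entries of 16384 and 15 tags per fat
 * pointer (64-byte cache lines):
 *
 *	m = 512;				// hash buckets per hash
 *	n = 2*512 + (11 * (16384 / 15)) / 10;	// 1024 + 1201 == 2225
 *
 * fatp_init() then doubles both: 2*m anchors (full-tuple plus port
 * hashes) and 2*n fat pointers, since every entry lives in two chains.
 */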

/*!\brief add TCP pcb to vestigial timewait
 */
int
vtw_add(int af, struct tcpcb *tp)
{
#ifdef VTW_DEBUG
	int	enable;
#endif
	vtw_ctl_t	*ctl;
	vtw_t		*vtw;

	KASSERT(mutex_owned(softnet_lock));

	ctl = vtw_control(af, tp->t_msl);
	if (!ctl)
		return 0;

#ifdef VTW_DEBUG
	enable = (af == AF_INET) ? tcp4_vtw_enable : tcp6_vtw_enable;
#endif

	vtw = vtw_alloc(ctl);

	if (vtw) {
		vtw->snd_nxt = tp->snd_nxt;
		vtw->rcv_nxt = tp->rcv_nxt;

		switch (af) {
		case AF_INET: {
			struct inpcb	*inp = tp->t_inpcb;
			vtw_v4_t	*v4  = (void*)vtw;

			v4->faddr = inp->inp_faddr.s_addr;
			v4->laddr = inp->inp_laddr.s_addr;
			v4->fport = inp->inp_fport;
			v4->lport = inp->inp_lport;

			vtw->reuse_port = !!(inp->inp_socket->so_options
					     & SO_REUSEPORT);
			vtw->reuse_addr = !!(inp->inp_socket->so_options
					     & SO_REUSEADDR);
			vtw->v6only	= 0;
			vtw->uid	= inp->inp_socket->so_uidinfo->ui_uid;

			vtw_inshash_v4(ctl, vtw);

#ifdef VTW_DEBUG
			/* Immediate lookup (connected and port) to
			 * ensure at least that works!
			 */
			if (enable & 4) {
				KASSERT(vtw_lookup_hash_v4
					(ctl
					 , inp->inp_faddr.s_addr, inp->inp_fport
					 , inp->inp_laddr.s_addr, inp->inp_lport
					 , 0)
					== vtw);
				KASSERT(vtw_lookup_hash_v4
					(ctl
					 , inp->inp_faddr.s_addr, inp->inp_fport
					 , inp->inp_laddr.s_addr, inp->inp_lport
					 , 1));
			}
			/* Immediate port iterator functionality check: not wild
			 */
			if (enable & 8) {
				struct tcp_ports_iterator *it;
				struct vestigial_inpcb res;
				int cnt = 0;

				it = tcp_init_ports_v4(inp->inp_laddr
						       , inp->inp_lport, 0);

				while (tcp_next_port_v4(it, &res)) {
					++cnt;
				}
				KASSERT(cnt);
			}
			/* Immediate port iterator functionality check: wild
			 */
			if (enable & 16) {
				struct tcp_ports_iterator *it;
				struct vestigial_inpcb res;
				struct in_addr any;
				int cnt = 0;

				any.s_addr = htonl(INADDR_ANY);

				it = tcp_init_ports_v4(any, inp->inp_lport, 1);

				while (tcp_next_port_v4(it, &res)) {
					++cnt;
				}
				KASSERT(cnt);
			}
#endif /* VTW_DEBUG */
			break;
		}

		case AF_INET6: {
			struct in6pcb	*inp = tp->t_in6pcb;
			vtw_v6_t	*v6  = (void*)vtw;

			v6->faddr = inp->in6p_faddr;
			v6->laddr = inp->in6p_laddr;
			v6->fport = inp->in6p_fport;
			v6->lport = inp->in6p_lport;

			vtw->reuse_port = !!(inp->in6p_socket->so_options
					     & SO_REUSEPORT);
			vtw->reuse_addr = !!(inp->in6p_socket->so_options
					     & SO_REUSEADDR);
			vtw->v6only	= !!(inp->in6p_flags
					     & IN6P_IPV6_V6ONLY);
			vtw->uid	= inp->in6p_socket->so_uidinfo->ui_uid;

			vtw_inshash_v6(ctl, vtw);
#ifdef VTW_DEBUG
			/* Immediate lookup (connected and port) to
			 * ensure at least that works!
			 */
			if (enable & 4) {
				KASSERT(vtw_lookup_hash_v6(ctl
					 , &inp->in6p_faddr, inp->in6p_fport
					 , &inp->in6p_laddr, inp->in6p_lport
					 , 0)
					== vtw);
				KASSERT(vtw_lookup_hash_v6
					(ctl
					 , &inp->in6p_faddr, inp->in6p_fport
					 , &inp->in6p_laddr, inp->in6p_lport
					 , 1));
			}
			/* Immediate port iterator functionality check: not wild
			 */
			if (enable & 8) {
				struct tcp_ports_iterator *it;
				struct vestigial_inpcb res;
				int cnt = 0;

				it = tcp_init_ports_v6(&inp->in6p_laddr
						       , inp->in6p_lport, 0);

				while (tcp_next_port_v6(it, &res)) {
					++cnt;
				}
				KASSERT(cnt);
			}
			/* Immediate port iterator functionality check: wild
			 */
			if (enable & 16) {
				struct tcp_ports_iterator *it;
				struct vestigial_inpcb res;
				static struct in6_addr any = IN6ADDR_ANY_INIT;
				int cnt = 0;

				it = tcp_init_ports_v6(&any
						       , inp->in6p_lport, 1);

				while (tcp_next_port_v6(it, &res)) {
					++cnt;
				}
				KASSERT(cnt);
			}
#endif /* VTW_DEBUG */
			break;
		}
		}

		tcp_canceltimers(tp);
		tp = tcp_close(tp);
		KASSERT(!tp);

		return 1;
	}

	return 0;
}

/*!\brief restart timer for vestigial time-wait entry
 */
static void
vtw_restart_v4(vestigial_inpcb_t *vp)
{
	vtw_v4_t	copy = *(vtw_v4_t*)vp->vtw;
	vtw_t		*vtw;
	vtw_t		*cp  = &copy.common;
	vtw_ctl_t	*ctl;

	KASSERT(mutex_owned(softnet_lock));

	db_trace(KTR_VTW
		 , (vp->vtw, "vtw: restart %A:%P %A:%P"
		    , vp->faddr.v4.s_addr, vp->fport
		    , vp->laddr.v4.s_addr, vp->lport));

	/* Class might have changed, so have a squiz.
	 */
	ctl = vtw_control(AF_INET, class_to_msl(cp->msl_class));
	vtw = vtw_alloc(ctl);

	if (vtw) {
		vtw_v4_t	*v4 = (void*)vtw;

		/* Safe now to unhash the old entry
		 */
		vtw_del(vp->ctl, vp->vtw);

		vtw->snd_nxt = cp->snd_nxt;
		vtw->rcv_nxt = cp->rcv_nxt;

		v4->faddr = copy.faddr;
		v4->laddr = copy.laddr;
		v4->fport = copy.fport;
		v4->lport = copy.lport;

		vtw->reuse_port = cp->reuse_port;
		vtw->reuse_addr = cp->reuse_addr;
		vtw->v6only	= 0;
		vtw->uid	= cp->uid;

		vtw_inshash_v4(ctl, vtw);
	}

	vp->valid = 0;
}
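
/* A note on the pattern above (and in the v6 variant below): the old
 * entry is copied to the stack before vtw_alloc() is called, because
 * allocation may age out the oldest entries, conceivably including the
 * very entry being restarted.  The copy keeps its fields safe, and only
 * once the new entry is secured is the old one deleted.
 */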

/*!\brief restart timer for vestigial time-wait entry
 */
static void
vtw_restart_v6(vestigial_inpcb_t *vp)
{
	vtw_v6_t	copy = *(vtw_v6_t*)vp->vtw;
	vtw_t		*vtw;
	vtw_t		*cp  = &copy.common;
	vtw_ctl_t	*ctl;

	KASSERT(mutex_owned(softnet_lock));

	db_trace(KTR_VTW
		 , (vp->vtw, "vtw: restart %6A:%P %6A:%P"
		    , db_store(&vp->faddr.v6, sizeof (vp->faddr.v6))
		    , vp->fport
		    , db_store(&vp->laddr.v6, sizeof (vp->laddr.v6))
		    , vp->lport));

	/* Class might have changed, so have a squiz.
	 */
	ctl = vtw_control(AF_INET6, class_to_msl(cp->msl_class));
	vtw = vtw_alloc(ctl);

	if (vtw) {
		vtw_v6_t	*v6 = (void*)vtw;

		/* Safe now to unhash the old entry
		 */
		vtw_del(vp->ctl, vp->vtw);

		vtw->snd_nxt = cp->snd_nxt;
		vtw->rcv_nxt = cp->rcv_nxt;

		v6->faddr = copy.faddr;
		v6->laddr = copy.laddr;
		v6->fport = copy.fport;
		v6->lport = copy.lport;

		vtw->reuse_port = cp->reuse_port;
		vtw->reuse_addr = cp->reuse_addr;
		vtw->v6only	= cp->v6only;
		vtw->uid	= cp->uid;

		vtw_inshash_v6(ctl, vtw);
	}

	vp->valid = 0;
}

/*!\brief restart timer for vestigial time-wait entry
 */
void
vtw_restart(vestigial_inpcb_t *vp)
{
	if (!vp || !vp->valid)
		return;

	if (vp->v4)
		vtw_restart_v4(vp);
	else
		vtw_restart_v6(vp);
}

int
sysctl_tcp_vtw_enable(SYSCTLFN_ARGS)
{
	int en, rc;
	struct sysctlnode node;

	node = *rnode;
	en = *(int *)rnode->sysctl_data;
	node.sysctl_data = &en;

	rc = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (rc != 0 || newp == NULL)
		return rc;

	if (rnode->sysctl_data != &tcp4_vtw_enable &&
	    rnode->sysctl_data != &tcp6_vtw_enable)
		rc = ENOENT;
	else if ((en & 1) == 0)
		rc = 0;
	else if (rnode->sysctl_data == &tcp4_vtw_enable)
		rc = vtw_control_init(AF_INET);
	else /* rnode->sysctl_data == &tcp6_vtw_enable */
		rc = vtw_control_init(AF_INET6);

	if (rc == 0)
		*(int *)rnode->sysctl_data = en;

	return rc;
}

int
vtw_earlyinit(void)
{
	int i, rc;

	callout_init(&vtw_cs, 0);
	callout_setfunc(&vtw_cs, vtw_tick, 0);

	for (i = 0; i < VTW_NCLASS; ++i) {
		vtw_tcpv4[i].is_v4 = 1;
		vtw_tcpv6[i].is_v6 = 1;
	}

	if ((tcp4_vtw_enable & 1) != 0 &&
	    (rc = vtw_control_init(AF_INET)) != 0)
		return rc;

	if ((tcp6_vtw_enable & 1) != 0 &&
	    (rc = vtw_control_init(AF_INET6)) != 0)
		return rc;

	return 0;
}
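
/* An illustrative note, not normative: the handler above is hooked to the
 * sysctl nodes backed by tcp4_vtw_enable and tcp6_vtw_enable (believed to
 * be net.inet.tcp.vtw_enable and net.inet6.tcp6.vtw_enable in stock
 * kernels), so setting bit 0 arms VTW and triggers the lazy arena
 * allocation in vtw_control_init():
 *
 *	# sysctl -w net.inet.tcp.vtw_enable=1
 *
 * Under VTW_DEBUG, the 4, 8 and 16 bits additionally enable the immediate
 * lookup and port-iterator self-checks in vtw_add().
 */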

#ifdef VTW_DEBUG
#include <sys/syscallargs.h>
#include <sys/sysctl.h>

/*!\brief add lalp, fafp entries for debug
 */
int
vtw_debug_add(int af, sin_either_t *la, sin_either_t *fa, int msl, int msl_class)
{
	vtw_ctl_t	*ctl;
	vtw_t		*vtw;

	ctl = vtw_control(af, msl ? msl : class_to_msl(msl_class));
	if (!ctl)
		return 0;

	vtw = vtw_alloc(ctl);

	if (vtw) {
		vtw->snd_nxt = 0;
		vtw->rcv_nxt = 0;

		switch (af) {
		case AF_INET: {
			vtw_v4_t	*v4 = (void*)vtw;

			v4->faddr = fa->sin_addr.v4.s_addr;
			v4->laddr = la->sin_addr.v4.s_addr;
			v4->fport = fa->sin_port;
			v4->lport = la->sin_port;

			vtw->reuse_port = 1;
			vtw->reuse_addr = 1;
			vtw->v6only	= 0;
			vtw->uid	= 0;

			vtw_inshash_v4(ctl, vtw);
			break;
		}

		case AF_INET6: {
			vtw_v6_t	*v6 = (void*)vtw;

			v6->faddr = fa->sin_addr.v6;
			v6->laddr = la->sin_addr.v6;

			v6->fport = fa->sin_port;
			v6->lport = la->sin_port;

			vtw->reuse_port = 1;
			vtw->reuse_addr = 1;
			vtw->v6only	= 0;
			vtw->uid	= 0;

			vtw_inshash_v6(ctl, vtw);
			break;
		}

		default:
			break;
		}

		return 1;
	}

	return 0;
}

static int vtw_syscall = 0;

static int
vtw_debug_process(vtw_sysargs_t *ap)
{
	struct vestigial_inpcb vestige;
	int	rc = 0;

	mutex_enter(softnet_lock);

	switch (ap->op) {
	case 0:		// insert
		vtw_debug_add(ap->la.sin_family
			      , &ap->la
			      , &ap->fa
			      , TCPTV_MSL
			      , 0);
		break;

	case 1:		// lookup
	case 2:		// restart
		switch (ap->la.sin_family) {
		case AF_INET:
			if (tcp_lookup_v4(ap->fa.sin_addr.v4, ap->fa.sin_port,
					  ap->la.sin_addr.v4, ap->la.sin_port,
					  &vestige)) {
				if (ap->op == 2) {
					vtw_restart(&vestige);
				}
				rc = 0;
			} else
				rc = ESRCH;
			break;

		case AF_INET6:
			if (tcp_lookup_v6(&ap->fa.sin_addr.v6, ap->fa.sin_port,
					  &ap->la.sin_addr.v6, ap->la.sin_port,
					  &vestige)) {
				if (ap->op == 2) {
					vtw_restart(&vestige);
				}
				rc = 0;
			} else
				rc = ESRCH;
			break;
		default:
			rc = EINVAL;
		}
		break;

	default:
		rc = EINVAL;
	}

	mutex_exit(softnet_lock);
	return rc;
}

struct sys_vtw_args {
	syscallarg(const vtw_sysargs_t *) req;
	syscallarg(size_t) len;
};

static int
vtw_sys(struct lwp *l, const void *_, register_t *retval)
{
	const struct sys_vtw_args *uap = _;
	void	*buf;
	int	rc;
	size_t	len = SCARG(uap, len);

	if (len != sizeof (vtw_sysargs_t))
		return EINVAL;

	buf = kmem_alloc(len, KM_SLEEP);
	if (!buf)
		return ENOMEM;

	rc = copyin(SCARG(uap, req), buf, len);
	if (!rc) {
		rc = vtw_debug_process(buf);
	}
	kmem_free(buf, len);

	return rc;
}

static void
vtw_sanity_check(void)
{
	vtw_ctl_t	*ctl;
	vtw_t		*vtw;
	int		i;
	int		n;

	for (i = 0; i < VTW_NCLASS; ++i) {
		ctl = &vtw_tcpv4[i];

		if (!ctl->base.v || ctl->nalloc)
			continue;

		for (n = 0, vtw = ctl->base.v; ; ) {
			++n;
			vtw = vtw_next(ctl, vtw);
			if (vtw == ctl->base.v)
				break;
		}
		db_trace(KTR_VTW
			 , (ctl, "sanity: class %x n %x nfree %x"
			    , i, n, ctl->nfree));

		KASSERT(n == ctl->nfree);
	}

	for (i = 0; i < VTW_NCLASS; ++i) {
		ctl = &vtw_tcpv6[i];

		if (!ctl->base.v || ctl->nalloc)
			continue;

		for (n = 0, vtw = ctl->base.v; ; ) {
			++n;
			vtw = vtw_next(ctl, vtw);
			if (vtw == ctl->base.v)
				break;
		}
		db_trace(KTR_VTW
			 , (ctl, "sanity: class %x n %x nfree %x"
			    , i, n, ctl->nfree));
		KASSERT(n == ctl->nfree);
	}
}

/*!\brief Initialise debug support.
 */
static void
vtw_debug_init(void)
{
	int	i;

	vtw_sanity_check();

	if (vtw_syscall)
		return;

	for (i = 511; i; --i) {
		if (sysent[i].sy_call == sys_nosys) {
			sysent[i].sy_call    = vtw_sys;
			sysent[i].sy_narg    = 2;
			sysent[i].sy_argsize = sizeof (struct sys_vtw_args);
			sysent[i].sy_flags   = 0;

			vtw_syscall = i;
			break;
		}
	}
	if (i) {
		const struct sysctlnode *node;
		uint32_t	flags;

		flags = sysctl_root.sysctl_flags;

		sysctl_root.sysctl_flags |= CTLFLAG_READWRITE;
		sysctl_root.sysctl_flags &= ~CTLFLAG_PERMANENT;

		sysctl_createv(0, 0, 0, &node,
			       CTLFLAG_PERMANENT, CTLTYPE_NODE,
			       "koff",
			       SYSCTL_DESCR("Kernel Obscure Feature Finder"),
			       0, 0, 0, 0, CTL_CREATE, CTL_EOL);

		if (!node) {
			sysctl_createv(0, 0, 0, &node,
				       CTLFLAG_PERMANENT, CTLTYPE_NODE,
				       "koffka",
				       SYSCTL_DESCR("The Real(tm) Kernel"
						    " Obscure Feature Finder"),
				       0, 0, 0, 0, CTL_CREATE, CTL_EOL);
		}
		if (node) {
			sysctl_createv(0, 0, 0, 0,
				       CTLFLAG_PERMANENT|CTLFLAG_READONLY,
				       CTLTYPE_INT, "vtw_debug_syscall",
				       SYSCTL_DESCR("vtw debug"
						    " system call number"),
				       0, 0, &vtw_syscall, 0, node->sysctl_num,
				       CTL_CREATE, CTL_EOL);
		}
		sysctl_root.sysctl_flags = flags;
	}
}
#else /* !VTW_DEBUG */
static void
vtw_debug_init(void)
{
	return;
}
#endif /* !VTW_DEBUG */