1 /* 2 * Copyright (c) 2011 The NetBSD Foundation, Inc. 3 * All rights reserved. 4 * 5 * This code is derived from software contributed to The NetBSD Foundation 6 * by Coyote Point Systems, Inc. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 18 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 19 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 20 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 21 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 27 * POSSIBILITY OF SUCH DAMAGE. 28 */ 29 #include <sys/cdefs.h> 30 31 #include "opt_ddb.h" 32 #include "opt_inet.h" 33 #include "opt_ipsec.h" 34 #include "opt_inet_csum.h" 35 #include "opt_tcp_debug.h" 36 37 #include <sys/param.h> 38 #include <sys/systm.h> 39 #include <sys/malloc.h> 40 #include <sys/kmem.h> 41 #include <sys/mbuf.h> 42 #include <sys/protosw.h> 43 #include <sys/socket.h> 44 #include <sys/socketvar.h> 45 #include <sys/errno.h> 46 #include <sys/syslog.h> 47 #include <sys/pool.h> 48 #include <sys/domain.h> 49 #include <sys/kernel.h> 50 #include <net/if.h> 51 #include <net/route.h> 52 #include <net/if_types.h> 53 54 #include <netinet/in.h> 55 #include <netinet/in_systm.h> 56 #include <netinet/ip.h> 57 #include <netinet/in_pcb.h> 58 #include <netinet/in_var.h> 59 #include <netinet/ip_var.h> 60 #include <netinet/in_offload.h> 61 #include <netinet/ip6.h> 62 #include <netinet6/ip6_var.h> 63 #include <netinet6/in6_pcb.h> 64 #include <netinet6/ip6_var.h> 65 #include <netinet6/in6_var.h> 66 #include <netinet/icmp6.h> 67 #include <netinet6/nd6.h> 68 69 #include <netinet/tcp.h> 70 #include <netinet/tcp_fsm.h> 71 #include <netinet/tcp_seq.h> 72 #include <netinet/tcp_timer.h> 73 #include <netinet/tcp_var.h> 74 #include <netinet/tcp_private.h> 75 #include <netinet/tcpip.h> 76 77 #include <machine/stdarg.h> 78 #include <netinet/tcp_vtw.h> 79 80 __KERNEL_RCSID(0, "$NetBSD: tcp_vtw.c,v 1.7 2011/06/06 19:15:43 dyoung Exp $"); 81 82 #define db_trace(__a, __b) do { } while (/*CONSTCOND*/0) 83 84 static void vtw_debug_init(void); 85 86 fatp_ctl_t fat_tcpv4; 87 fatp_ctl_t fat_tcpv6; 88 vtw_ctl_t vtw_tcpv4[VTW_NCLASS]; 89 vtw_ctl_t vtw_tcpv6[VTW_NCLASS]; 90 vtw_stats_t vtw_stats; 91 92 /* We provide state for the lookup_ports iterator. 93 * As currently we are netlock-protected, there is one. 94 * If we were finer-grain, we would have one per CPU. 95 * I do not want to be in the business of alloc/free. 96 * The best alternate would be allocate on the caller's 97 * stack, but that would require them to know the struct, 98 * or at least the size. 99 * See how she goes. 100 */ 101 struct tcp_ports_iterator { 102 union { 103 struct in_addr v4; 104 struct in6_addr v6; 105 } addr; 106 u_int port; 107 108 uint32_t wild : 1; 109 110 vtw_ctl_t *ctl; 111 fatp_t *fp; 112 113 uint16_t slot_idx; 114 uint16_t ctl_idx; 115 }; 116 117 static struct tcp_ports_iterator tcp_ports_iterator_v4; 118 static struct tcp_ports_iterator tcp_ports_iterator_v6; 119 120 static int vtw_age(vtw_ctl_t *, struct timeval *); 121 122 /*!\brief allocate a fat pointer from a collection. 123 */ 124 static fatp_t * 125 fatp_alloc(fatp_ctl_t *fat) 126 { 127 fatp_t *fp = 0; 128 129 if (fat->nfree) { 130 fp = fat->free; 131 if (fp) { 132 fat->free = fatp_next(fat, fp); 133 --fat->nfree; 134 ++fat->nalloc; 135 fp->nxt = 0; 136 137 KASSERT(!fp->inuse); 138 } 139 } 140 141 return fp; 142 } 143 144 /*!\brief free a fat pointer. 145 */ 146 static void 147 fatp_free(fatp_ctl_t *fat, fatp_t *fp) 148 { 149 if (fp) { 150 KASSERT(!fp->inuse); 151 KASSERT(!fp->nxt); 152 153 fp->nxt = fatp_index(fat, fat->free); 154 fat->free = fp; 155 156 ++fat->nfree; 157 --fat->nalloc; 158 } 159 } 160 161 /*!\brief initialise a collection of fat pointers. 162 * 163 *\param n # hash buckets 164 *\param m total # fat pointers to allocate 165 * 166 * We allocate 2x as much, as we have two hashes: full and lport only. 167 */ 168 static void 169 fatp_init(fatp_ctl_t *fat, uint32_t n, uint32_t m, 170 fatp_t *fat_base, fatp_t **fat_hash) 171 { 172 fatp_t *fp; 173 174 KASSERT(n <= FATP_MAX / 2); 175 176 fat->hash = fat_hash; 177 fat->base = fat_base; 178 179 fat->port = &fat->hash[m]; 180 181 fat->mask = m - 1; // ASSERT is power of 2 (m) 182 fat->lim = fat->base + 2*n - 1; 183 fat->nfree = 0; 184 fat->nalloc = 2*n; 185 186 /* Initialise the free list. 187 */ 188 for (fp = fat->lim; fp >= fat->base; --fp) { 189 fatp_free(fat, fp); 190 } 191 } 192 193 /* 194 * The `xtra' is XORed into the tag stored. 195 */ 196 static uint32_t fatp_xtra[] = { 197 0x11111111,0x22222222,0x33333333,0x44444444, 198 0x55555555,0x66666666,0x77777777,0x88888888, 199 0x12121212,0x21212121,0x34343434,0x43434343, 200 0x56565656,0x65656565,0x78787878,0x87878787, 201 0x11221122,0x22112211,0x33443344,0x44334433, 202 0x55665566,0x66556655,0x77887788,0x88778877, 203 0x11112222,0x22221111,0x33334444,0x44443333, 204 0x55556666,0x66665555,0x77778888,0x88887777, 205 }; 206 207 /*!\brief turn a {fatp_t*,slot} into an integral key. 208 * 209 * The key can be used to obtain the fatp_t, and the slot, 210 * as it directly encodes them. 211 */ 212 static inline uint32_t 213 fatp_key(fatp_ctl_t *fat, fatp_t *fp, uint32_t slot) 214 { 215 CTASSERT(CACHE_LINE_SIZE == 32 || 216 CACHE_LINE_SIZE == 64 || 217 CACHE_LINE_SIZE == 128); 218 219 switch (fatp_ntags()) { 220 case 7: 221 return (fatp_index(fat, fp) << 3) | slot; 222 case 15: 223 return (fatp_index(fat, fp) << 4) | slot; 224 case 31: 225 return (fatp_index(fat, fp) << 5) | slot; 226 default: 227 KASSERT(0 && "no support, for no good reason"); 228 return ~0; 229 } 230 } 231 232 static inline uint32_t 233 fatp_slot_from_key(fatp_ctl_t *fat, uint32_t key) 234 { 235 CTASSERT(CACHE_LINE_SIZE == 32 || 236 CACHE_LINE_SIZE == 64 || 237 CACHE_LINE_SIZE == 128); 238 239 switch (fatp_ntags()) { 240 case 7: 241 return key & 7; 242 case 15: 243 return key & 15; 244 case 31: 245 return key & 31; 246 default: 247 KASSERT(0 && "no support, for no good reason"); 248 return ~0; 249 } 250 } 251 252 static inline fatp_t * 253 fatp_from_key(fatp_ctl_t *fat, uint32_t key) 254 { 255 CTASSERT(CACHE_LINE_SIZE == 32 || 256 CACHE_LINE_SIZE == 64 || 257 CACHE_LINE_SIZE == 128); 258 259 switch (fatp_ntags()) { 260 case 7: 261 key >>= 3; 262 break; 263 case 15: 264 key >>= 4; 265 break; 266 case 31: 267 key >>= 5; 268 break; 269 default: 270 KASSERT(0 && "no support, for no good reason"); 271 return 0; 272 } 273 274 return key ? fat->base + key - 1 : 0; 275 } 276 277 static inline uint32_t 278 idx_encode(vtw_ctl_t *ctl, uint32_t idx) 279 { 280 return (idx << ctl->idx_bits) | idx; 281 } 282 283 static inline uint32_t 284 idx_decode(vtw_ctl_t *ctl, uint32_t bits) 285 { 286 uint32_t idx = bits & ctl->idx_mask; 287 288 if (idx_encode(ctl, idx) == bits) 289 return idx; 290 else 291 return ~0; 292 } 293 294 /*!\brief insert index into fatp hash 295 * 296 *\param idx - index of element being placed in hash chain 297 *\param tag - 32-bit tag identifier 298 * 299 *\returns 300 * value which can be used to locate entry. 301 * 302 *\note 303 * we rely on the fact that there are unused high bits in the index 304 * for verification purposes on lookup. 305 */ 306 307 static inline uint32_t 308 fatp_vtw_inshash(fatp_ctl_t *fat, uint32_t idx, uint32_t tag, int which, 309 void *dbg) 310 { 311 fatp_t *fp; 312 fatp_t **hash = (which ? fat->port : fat->hash); 313 int i; 314 315 fp = hash[tag & fat->mask]; 316 317 while (!fp || fatp_full(fp)) { 318 fatp_t *fq; 319 320 /* All entries are inuse at the top level. 321 * We allocate a spare, and push the top level 322 * down one. All entries in the fp we push down 323 * (think of a tape worm here) will be expelled sooner than 324 * any entries added subsequently to this hash bucket. 325 * This is a property of the time waits we are exploiting. 326 */ 327 328 fq = fatp_alloc(fat); 329 if (!fq) { 330 vtw_age(fat->vtw, 0); 331 fp = hash[tag & fat->mask]; 332 continue; 333 } 334 335 fq->inuse = 0; 336 fq->nxt = fatp_index(fat, fp); 337 338 hash[tag & fat->mask] = fq; 339 340 fp = fq; 341 } 342 343 KASSERT(!fatp_full(fp)); 344 345 /* Fill highest index first. Lookup is lowest first. 346 */ 347 for (i = fatp_ntags(); --i >= 0; ) { 348 if (!((1 << i) & fp->inuse)) { 349 break; 350 } 351 } 352 353 fp->inuse |= 1 << i; 354 fp->tag[i] = tag ^ idx_encode(fat->vtw, idx) ^ fatp_xtra[i]; 355 356 db_trace(KTR_VTW 357 , (fp, "fat: inuse %5.5x tag[%x] %8.8x" 358 , fp->inuse 359 , i, fp->tag[i])); 360 361 return fatp_key(fat, fp, i); 362 } 363 364 static inline int 365 vtw_alive(const vtw_t *vtw) 366 { 367 return vtw->hashed && vtw->expire.tv_sec; 368 } 369 370 static inline uint32_t 371 vtw_index_v4(vtw_ctl_t *ctl, vtw_v4_t *v4) 372 { 373 if (ctl->base.v4 <= v4 && v4 <= ctl->lim.v4) 374 return v4 - ctl->base.v4; 375 376 KASSERT(0 && "vtw out of bounds"); 377 378 return ~0; 379 } 380 381 static inline uint32_t 382 vtw_index_v6(vtw_ctl_t *ctl, vtw_v6_t *v6) 383 { 384 if (ctl->base.v6 <= v6 && v6 <= ctl->lim.v6) 385 return v6 - ctl->base.v6; 386 387 KASSERT(0 && "vtw out of bounds"); 388 389 return ~0; 390 } 391 392 static inline uint32_t 393 vtw_index(vtw_ctl_t *ctl, vtw_t *vtw) 394 { 395 if (ctl->clidx) 396 ctl = ctl->ctl; 397 398 if (ctl->is_v4) 399 return vtw_index_v4(ctl, (vtw_v4_t *)vtw); 400 401 if (ctl->is_v6) 402 return vtw_index_v6(ctl, (vtw_v6_t *)vtw); 403 404 KASSERT(0 && "neither 4 nor 6. most curious."); 405 406 return ~0; 407 } 408 409 static inline vtw_t * 410 vtw_from_index(vtw_ctl_t *ctl, uint32_t idx) 411 { 412 if (ctl->clidx) 413 ctl = ctl->ctl; 414 415 /* See if the index looks like it might be an index. 416 * Bits on outside of the valid index bits is a give away. 417 */ 418 idx = idx_decode(ctl, idx); 419 420 if (idx == ~0) { 421 return 0; 422 } else if (ctl->is_v4) { 423 vtw_v4_t *vtw = ctl->base.v4 + idx; 424 425 return (ctl->base.v4 <= vtw && vtw <= ctl->lim.v4) 426 ? &vtw->common : 0; 427 } else if (ctl->is_v6) { 428 vtw_v6_t *vtw = ctl->base.v6 + idx; 429 430 return (ctl->base.v6 <= vtw && vtw <= ctl->lim.v6) 431 ? &vtw->common : 0; 432 } else { 433 KASSERT(0 && "badness"); 434 return 0; 435 } 436 } 437 438 /*!\brief return the next vtw after this one. 439 * 440 * Due to the differing sizes of the entries in differing 441 * arenas, we have to ensure we ++ the correct pointer type. 442 * 443 * Also handles wrap. 444 */ 445 static inline vtw_t * 446 vtw_next(vtw_ctl_t *ctl, vtw_t *vtw) 447 { 448 if (ctl->is_v4) { 449 vtw_v4_t *v4 = (void*)vtw; 450 451 vtw = &(++v4)->common; 452 } else { 453 vtw_v6_t *v6 = (void*)vtw; 454 455 vtw = &(++v6)->common; 456 } 457 458 if (vtw > ctl->lim.v) 459 vtw = ctl->base.v; 460 461 return vtw; 462 } 463 464 /*!\brief remove entry from FATP hash chains 465 */ 466 static inline void 467 vtw_unhash(vtw_ctl_t *ctl, vtw_t *vtw) 468 { 469 fatp_ctl_t *fat = ctl->fat; 470 fatp_t *fp; 471 uint32_t key = vtw->key; 472 uint32_t tag, slot, idx; 473 vtw_v4_t *v4 = (void*)vtw; 474 vtw_v6_t *v6 = (void*)vtw; 475 476 if (!vtw->hashed) { 477 KASSERT(0 && "unhashed"); 478 return; 479 } 480 481 if (fat->vtw->is_v4) { 482 tag = v4_tag(v4->faddr, v4->fport, v4->laddr, v4->lport); 483 } else if (fat->vtw->is_v6) { 484 tag = v6_tag(&v6->faddr, v6->fport, &v6->laddr, v6->lport); 485 } else { 486 tag = 0; 487 KASSERT(0 && "not reached"); 488 } 489 490 /* Remove from fat->hash[] 491 */ 492 slot = fatp_slot_from_key(fat, key); 493 fp = fatp_from_key(fat, key); 494 idx = vtw_index(ctl, vtw); 495 496 db_trace(KTR_VTW 497 , (fp, "fat: del inuse %5.5x slot %x idx %x key %x tag %x" 498 , fp->inuse, slot, idx, key, tag)); 499 500 KASSERT(fp->inuse & (1 << slot)); 501 KASSERT(fp->tag[slot] == (tag ^ idx_encode(ctl, idx) 502 ^ fatp_xtra[slot])); 503 504 if ((fp->inuse & (1 << slot)) 505 && fp->tag[slot] == (tag ^ idx_encode(ctl, idx) 506 ^ fatp_xtra[slot])) { 507 fp->inuse ^= 1 << slot; 508 fp->tag[slot] = 0; 509 510 /* When we delete entries, we do not compact. This is 511 * due to temporality. We add entries, and they 512 * (eventually) expire. Older entries will be further 513 * down the chain. 514 */ 515 if (!fp->inuse) { 516 uint32_t hi = tag & fat->mask; 517 fatp_t *fq = 0; 518 fatp_t *fr = fat->hash[hi]; 519 520 while (fr && fr != fp) { 521 fr = fatp_next(fat, fq = fr); 522 } 523 524 if (fr == fp) { 525 if (fq) { 526 fq->nxt = fp->nxt; 527 fp->nxt = 0; 528 fatp_free(fat, fp); 529 } else { 530 KASSERT(fat->hash[hi] == fp); 531 532 if (fp->nxt) { 533 fat->hash[hi] 534 = fatp_next(fat, fp); 535 fp->nxt = 0; 536 fatp_free(fat, fp); 537 } else { 538 /* retain for next use. 539 */ 540 ; 541 } 542 } 543 } else { 544 fr = fat->hash[hi]; 545 546 do { 547 db_trace(KTR_VTW 548 , (fr 549 , "fat:*del inuse %5.5x" 550 " nxt %x" 551 , fr->inuse, fr->nxt)); 552 553 fr = fatp_next(fat, fq = fr); 554 } while (fr && fr != fp); 555 556 KASSERT(0 && "oops"); 557 } 558 } 559 vtw->key ^= ~0; 560 } 561 562 if (fat->vtw->is_v4) { 563 tag = v4_port_tag(v4->lport); 564 } else if (fat->vtw->is_v6) { 565 tag = v6_port_tag(v6->lport); 566 } 567 568 /* Remove from fat->port[] 569 */ 570 key = vtw->port_key; 571 slot = fatp_slot_from_key(fat, key); 572 fp = fatp_from_key(fat, key); 573 idx = vtw_index(ctl, vtw); 574 575 db_trace(KTR_VTW 576 , (fp, "fatport: del inuse %5.5x" 577 " slot %x idx %x key %x tag %x" 578 , fp->inuse, slot, idx, key, tag)); 579 580 KASSERT(fp->inuse & (1 << slot)); 581 KASSERT(fp->tag[slot] == (tag ^ idx_encode(ctl, idx) 582 ^ fatp_xtra[slot])); 583 584 if ((fp->inuse & (1 << slot)) 585 && fp->tag[slot] == (tag ^ idx_encode(ctl, idx) 586 ^ fatp_xtra[slot])) { 587 fp->inuse ^= 1 << slot; 588 fp->tag[slot] = 0; 589 590 if (!fp->inuse) { 591 uint32_t hi = tag & fat->mask; 592 fatp_t *fq = 0; 593 fatp_t *fr = fat->port[hi]; 594 595 while (fr && fr != fp) { 596 fr = fatp_next(fat, fq = fr); 597 } 598 599 if (fr == fp) { 600 if (fq) { 601 fq->nxt = fp->nxt; 602 fp->nxt = 0; 603 fatp_free(fat, fp); 604 } else { 605 KASSERT(fat->port[hi] == fp); 606 607 if (fp->nxt) { 608 fat->port[hi] 609 = fatp_next(fat, fp); 610 fp->nxt = 0; 611 fatp_free(fat, fp); 612 } else { 613 /* retain for next use. 614 */ 615 ; 616 } 617 } 618 } 619 } 620 vtw->port_key ^= ~0; 621 } 622 623 vtw->hashed = 0; 624 } 625 626 /*!\brief remove entry from hash, possibly free. 627 */ 628 void 629 vtw_del(vtw_ctl_t *ctl, vtw_t *vtw) 630 { 631 KASSERT(mutex_owned(softnet_lock)); 632 633 if (vtw->hashed) { 634 ++vtw_stats.del; 635 vtw_unhash(ctl, vtw); 636 } 637 638 /* We only delete the oldest entry. 639 */ 640 if (vtw != ctl->oldest.v) 641 return; 642 643 --ctl->nalloc; 644 ++ctl->nfree; 645 646 vtw->expire.tv_sec = 0; 647 vtw->expire.tv_usec = ~0; 648 649 if (!ctl->nalloc) 650 ctl->oldest.v = 0; 651 652 ctl->oldest.v = vtw_next(ctl, vtw); 653 } 654 655 /*!\brief insert vestigial timewait in hash chain 656 */ 657 static void 658 vtw_inshash_v4(vtw_ctl_t *ctl, vtw_t *vtw) 659 { 660 uint32_t idx = vtw_index(ctl, vtw); 661 uint32_t tag; 662 vtw_v4_t *v4 = (void*)vtw; 663 664 KASSERT(mutex_owned(softnet_lock)); 665 KASSERT(!vtw->hashed); 666 KASSERT(ctl->clidx == vtw->msl_class); 667 668 ++vtw_stats.ins; 669 670 tag = v4_tag(v4->faddr, v4->fport, 671 v4->laddr, v4->lport); 672 673 vtw->key = fatp_vtw_inshash(ctl->fat, idx, tag, 0, vtw); 674 675 db_trace(KTR_VTW, (ctl 676 , "vtw: ins %8.8x:%4.4x %8.8x:%4.4x" 677 " tag %8.8x key %8.8x" 678 , v4->faddr, v4->fport 679 , v4->laddr, v4->lport 680 , tag 681 , vtw->key)); 682 683 tag = v4_port_tag(v4->lport); 684 vtw->port_key = fatp_vtw_inshash(ctl->fat, idx, tag, 1, vtw); 685 686 db_trace(KTR_VTW, (ctl, "vtw: ins %P - %4.4x tag %8.8x key %8.8x" 687 , v4->lport, v4->lport 688 , tag 689 , vtw->key)); 690 691 vtw->hashed = 1; 692 } 693 694 /*!\brief insert vestigial timewait in hash chain 695 */ 696 static void 697 vtw_inshash_v6(vtw_ctl_t *ctl, vtw_t *vtw) 698 { 699 uint32_t idx = vtw_index(ctl, vtw); 700 uint32_t tag; 701 vtw_v6_t *v6 = (void*)vtw; 702 703 KASSERT(mutex_owned(softnet_lock)); 704 KASSERT(!vtw->hashed); 705 KASSERT(ctl->clidx == vtw->msl_class); 706 707 ++vtw_stats.ins; 708 709 tag = v6_tag(&v6->faddr, v6->fport, 710 &v6->laddr, v6->lport); 711 712 vtw->key = fatp_vtw_inshash(ctl->fat, idx, tag, 0, vtw); 713 714 tag = v6_port_tag(v6->lport); 715 vtw->port_key = fatp_vtw_inshash(ctl->fat, idx, tag, 1, vtw); 716 717 db_trace(KTR_VTW, (ctl, "vtw: ins %P - %4.4x tag %8.8x key %8.8x" 718 , v6->lport, v6->lport 719 , tag 720 , vtw->key)); 721 722 vtw->hashed = 1; 723 } 724 725 static vtw_t * 726 vtw_lookup_hash_v4(vtw_ctl_t *ctl, uint32_t faddr, uint16_t fport 727 , uint32_t laddr, uint16_t lport 728 , int which) 729 { 730 vtw_v4_t *v4; 731 vtw_t *vtw; 732 uint32_t tag; 733 fatp_t *fp; 734 int i; 735 uint32_t fatps = 0, probes = 0, losings = 0; 736 737 if (!ctl || !ctl->fat) 738 return 0; 739 740 ++vtw_stats.look[which]; 741 742 if (which) { 743 tag = v4_port_tag(lport); 744 fp = ctl->fat->port[tag & ctl->fat->mask]; 745 } else { 746 tag = v4_tag(faddr, fport, laddr, lport); 747 fp = ctl->fat->hash[tag & ctl->fat->mask]; 748 } 749 750 while (fp && fp->inuse) { 751 uint32_t inuse = fp->inuse; 752 753 ++fatps; 754 755 for (i = 0; inuse && i < fatp_ntags(); ++i) { 756 uint32_t idx; 757 758 if (!(inuse & (1 << i))) 759 continue; 760 761 inuse ^= 1 << i; 762 763 ++probes; 764 ++vtw_stats.probe[which]; 765 766 idx = fp->tag[i] ^ tag ^ fatp_xtra[i]; 767 vtw = vtw_from_index(ctl, idx); 768 769 if (!vtw) { 770 /* Hopefully fast path. 771 */ 772 db_trace(KTR_VTW 773 , (fp, "vtw: fast %A:%P %A:%P" 774 " idx %x tag %x" 775 , faddr, fport 776 , laddr, lport 777 , idx, tag)); 778 continue; 779 } 780 781 v4 = (void*)vtw; 782 783 /* The de-referencing of vtw is what we want to avoid. 784 * Losing. 785 */ 786 if (vtw_alive(vtw) 787 && ((which ? vtw->port_key : vtw->key) 788 == fatp_key(ctl->fat, fp, i)) 789 && (which 790 || (v4->faddr == faddr && v4->laddr == laddr 791 && v4->fport == fport)) 792 && v4->lport == lport) { 793 ++vtw_stats.hit[which]; 794 795 db_trace(KTR_VTW 796 , (fp, "vtw: hit %8.8x:%4.4x" 797 " %8.8x:%4.4x idx %x key %x" 798 , faddr, fport 799 , laddr, lport 800 , idx_decode(ctl, idx), vtw->key)); 801 802 KASSERT(vtw->hashed); 803 804 goto out; 805 } 806 ++vtw_stats.losing[which]; 807 ++losings; 808 809 if (vtw_alive(vtw)) { 810 db_trace(KTR_VTW 811 , (fp, "vtw:!mis %8.8x:%4.4x" 812 " %8.8x:%4.4x key %x tag %x" 813 , faddr, fport 814 , laddr, lport 815 , fatp_key(ctl->fat, fp, i) 816 , v4_tag(faddr, fport 817 , laddr, lport))); 818 db_trace(KTR_VTW 819 , (vtw, "vtw:!mis %8.8x:%4.4x" 820 " %8.8x:%4.4x key %x tag %x" 821 , v4->faddr, v4->fport 822 , v4->laddr, v4->lport 823 , vtw->key 824 , v4_tag(v4->faddr, v4->fport 825 , v4->laddr, v4->lport))); 826 827 if (vtw->key == fatp_key(ctl->fat, fp, i)) { 828 db_trace(KTR_VTW 829 , (vtw, "vtw:!mis %8.8x:%4.4x" 830 " %8.8x:%4.4x key %x" 831 " which %x" 832 , v4->faddr, v4->fport 833 , v4->laddr, v4->lport 834 , vtw->key 835 , which)); 836 837 } else { 838 db_trace(KTR_VTW 839 , (vtw 840 , "vtw:!mis" 841 " key %8.8x != %8.8x" 842 " idx %x i %x which %x" 843 , vtw->key 844 , fatp_key(ctl->fat, fp, i) 845 , idx_decode(ctl, idx) 846 , i 847 , which)); 848 } 849 } else { 850 db_trace(KTR_VTW 851 , (fp 852 , "vtw:!mis free entry" 853 " idx %x vtw %p which %x" 854 , idx_decode(ctl, idx) 855 , vtw, which)); 856 } 857 } 858 859 if (fp->nxt) { 860 fp = fatp_next(ctl->fat, fp); 861 } else { 862 break; 863 } 864 } 865 ++vtw_stats.miss[which]; 866 vtw = 0; 867 out: 868 if (fatps > vtw_stats.max_chain[which]) 869 vtw_stats.max_chain[which] = fatps; 870 if (probes > vtw_stats.max_probe[which]) 871 vtw_stats.max_probe[which] = probes; 872 if (losings > vtw_stats.max_loss[which]) 873 vtw_stats.max_loss[which] = losings; 874 875 return vtw; 876 } 877 878 static vtw_t * 879 vtw_lookup_hash_v6(vtw_ctl_t *ctl, const struct in6_addr *faddr, uint16_t fport 880 , const struct in6_addr *laddr, uint16_t lport 881 , int which) 882 { 883 vtw_v6_t *v6; 884 vtw_t *vtw; 885 uint32_t tag; 886 fatp_t *fp; 887 int i; 888 uint32_t fatps = 0, probes = 0, losings = 0; 889 890 ++vtw_stats.look[which]; 891 892 if (!ctl || !ctl->fat) 893 return 0; 894 895 if (which) { 896 tag = v6_port_tag(lport); 897 fp = ctl->fat->port[tag & ctl->fat->mask]; 898 } else { 899 tag = v6_tag(faddr, fport, laddr, lport); 900 fp = ctl->fat->hash[tag & ctl->fat->mask]; 901 } 902 903 while (fp && fp->inuse) { 904 uint32_t inuse = fp->inuse; 905 906 ++fatps; 907 908 for (i = 0; inuse && i < fatp_ntags(); ++i) { 909 uint32_t idx; 910 911 if (!(inuse & (1 << i))) 912 continue; 913 914 inuse ^= 1 << i; 915 916 ++probes; 917 ++vtw_stats.probe[which]; 918 919 idx = fp->tag[i] ^ tag ^ fatp_xtra[i]; 920 vtw = vtw_from_index(ctl, idx); 921 922 db_trace(KTR_VTW 923 , (fp, "probe: %2d %6A:%4.4x %6A:%4.4x idx %x" 924 , i 925 , db_store(faddr, sizeof (*faddr)), fport 926 , db_store(laddr, sizeof (*laddr)), lport 927 , idx_decode(ctl, idx))); 928 929 if (!vtw) { 930 /* Hopefully fast path. 931 */ 932 continue; 933 } 934 935 v6 = (void*)vtw; 936 937 if (vtw_alive(vtw) 938 && ((which ? vtw->port_key : vtw->key) 939 == fatp_key(ctl->fat, fp, i)) 940 && v6->lport == lport 941 && (which 942 || (v6->fport == fport 943 && !bcmp(&v6->faddr, faddr, sizeof (*faddr)) 944 && !bcmp(&v6->laddr, laddr 945 , sizeof (*laddr))))) { 946 ++vtw_stats.hit[which]; 947 948 KASSERT(vtw->hashed); 949 goto out; 950 } else { 951 ++vtw_stats.losing[which]; 952 ++losings; 953 } 954 } 955 956 if (fp->nxt) { 957 fp = fatp_next(ctl->fat, fp); 958 } else { 959 break; 960 } 961 } 962 ++vtw_stats.miss[which]; 963 vtw = 0; 964 out: 965 if (fatps > vtw_stats.max_chain[which]) 966 vtw_stats.max_chain[which] = fatps; 967 if (probes > vtw_stats.max_probe[which]) 968 vtw_stats.max_probe[which] = probes; 969 if (losings > vtw_stats.max_loss[which]) 970 vtw_stats.max_loss[which] = losings; 971 972 return vtw; 973 } 974 975 /*!\brief port iterator 976 */ 977 static vtw_t * 978 vtw_next_port_v4(struct tcp_ports_iterator *it) 979 { 980 vtw_ctl_t *ctl = it->ctl; 981 vtw_v4_t *v4; 982 vtw_t *vtw; 983 uint32_t tag; 984 uint16_t lport = it->port; 985 fatp_t *fp; 986 int i; 987 uint32_t fatps = 0, probes = 0, losings = 0; 988 989 tag = v4_port_tag(lport); 990 if (!it->fp) { 991 it->fp = ctl->fat->port[tag & ctl->fat->mask]; 992 it->slot_idx = 0; 993 } 994 fp = it->fp; 995 996 while (fp) { 997 uint32_t inuse = fp->inuse; 998 999 ++fatps; 1000 1001 for (i = it->slot_idx; inuse && i < fatp_ntags(); ++i) { 1002 uint32_t idx; 1003 1004 if (!(inuse & (1 << i))) 1005 continue; 1006 1007 inuse &= ~0 << i; 1008 1009 if (i < it->slot_idx) 1010 continue; 1011 1012 ++vtw_stats.probe[1]; 1013 ++probes; 1014 1015 idx = fp->tag[i] ^ tag ^ fatp_xtra[i]; 1016 vtw = vtw_from_index(ctl, idx); 1017 1018 if (!vtw) { 1019 /* Hopefully fast path. 1020 */ 1021 continue; 1022 } 1023 1024 v4 = (void*)vtw; 1025 1026 if (vtw_alive(vtw) 1027 && vtw->port_key == fatp_key(ctl->fat, fp, i) 1028 && v4->lport == lport) { 1029 ++vtw_stats.hit[1]; 1030 1031 it->slot_idx = i + 1; 1032 1033 goto out; 1034 } else if (vtw_alive(vtw)) { 1035 ++vtw_stats.losing[1]; 1036 ++losings; 1037 1038 db_trace(KTR_VTW 1039 , (vtw, "vtw:!mis" 1040 " port %8.8x:%4.4x %8.8x:%4.4x" 1041 " key %x port %x" 1042 , v4->faddr, v4->fport 1043 , v4->laddr, v4->lport 1044 , vtw->key 1045 , lport)); 1046 } else { 1047 /* Really losing here. We are coming 1048 * up with references to free entries. 1049 * Might find it better to use 1050 * traditional, or need another 1051 * add-hockery. The other add-hockery 1052 * would be to pul more into into the 1053 * cache line to reject the false 1054 * hits. 1055 */ 1056 ++vtw_stats.losing[1]; 1057 ++losings; 1058 db_trace(KTR_VTW 1059 , (fp, "vtw:!mis port %x" 1060 " - free entry idx %x vtw %p" 1061 , lport 1062 , idx_decode(ctl, idx) 1063 , vtw)); 1064 } 1065 } 1066 1067 if (fp->nxt) { 1068 it->fp = fp = fatp_next(ctl->fat, fp); 1069 it->slot_idx = 0; 1070 } else { 1071 it->fp = 0; 1072 break; 1073 } 1074 } 1075 ++vtw_stats.miss[1]; 1076 1077 vtw = 0; 1078 out: 1079 if (fatps > vtw_stats.max_chain[1]) 1080 vtw_stats.max_chain[1] = fatps; 1081 if (probes > vtw_stats.max_probe[1]) 1082 vtw_stats.max_probe[1] = probes; 1083 if (losings > vtw_stats.max_loss[1]) 1084 vtw_stats.max_loss[1] = losings; 1085 1086 return vtw; 1087 } 1088 1089 /*!\brief port iterator 1090 */ 1091 static vtw_t * 1092 vtw_next_port_v6(struct tcp_ports_iterator *it) 1093 { 1094 vtw_ctl_t *ctl = it->ctl; 1095 vtw_v6_t *v6; 1096 vtw_t *vtw; 1097 uint32_t tag; 1098 uint16_t lport = it->port; 1099 fatp_t *fp; 1100 int i; 1101 uint32_t fatps = 0, probes = 0, losings = 0; 1102 1103 tag = v6_port_tag(lport); 1104 if (!it->fp) { 1105 it->fp = ctl->fat->port[tag & ctl->fat->mask]; 1106 it->slot_idx = 0; 1107 } 1108 fp = it->fp; 1109 1110 while (fp) { 1111 uint32_t inuse = fp->inuse; 1112 1113 ++fatps; 1114 1115 for (i = it->slot_idx; inuse && i < fatp_ntags(); ++i) { 1116 uint32_t idx; 1117 1118 if (!(inuse & (1 << i))) 1119 continue; 1120 1121 inuse &= ~0 << i; 1122 1123 if (i < it->slot_idx) 1124 continue; 1125 1126 ++vtw_stats.probe[1]; 1127 ++probes; 1128 1129 idx = fp->tag[i] ^ tag ^ fatp_xtra[i]; 1130 vtw = vtw_from_index(ctl, idx); 1131 1132 if (!vtw) { 1133 /* Hopefully fast path. 1134 */ 1135 continue; 1136 } 1137 1138 v6 = (void*)vtw; 1139 1140 db_trace(KTR_VTW 1141 , (vtw, "vtw: i %x idx %x fp->tag %x" 1142 " tag %x xtra %x" 1143 , i, idx_decode(ctl, idx) 1144 , fp->tag[i], tag, fatp_xtra[i])); 1145 1146 if (vtw_alive(vtw) 1147 && vtw->port_key == fatp_key(ctl->fat, fp, i) 1148 && v6->lport == lport) { 1149 ++vtw_stats.hit[1]; 1150 1151 db_trace(KTR_VTW 1152 , (fp, "vtw: nxt port %P - %4.4x" 1153 " idx %x key %x" 1154 , lport, lport 1155 , idx_decode(ctl, idx), vtw->key)); 1156 1157 it->slot_idx = i + 1; 1158 goto out; 1159 } else if (vtw_alive(vtw)) { 1160 ++vtw_stats.losing[1]; 1161 1162 db_trace(KTR_VTW 1163 , (vtw, "vtw:!mis port %6A:%4.4x" 1164 " %6A:%4.4x key %x port %x" 1165 , db_store(&v6->faddr 1166 , sizeof (v6->faddr)) 1167 , v6->fport 1168 , db_store(&v6->laddr 1169 , sizeof (v6->faddr)) 1170 , v6->lport 1171 , vtw->key 1172 , lport)); 1173 } else { 1174 /* Really losing here. We are coming 1175 * up with references to free entries. 1176 * Might find it better to use 1177 * traditional, or need another 1178 * add-hockery. The other add-hockery 1179 * would be to pul more into into the 1180 * cache line to reject the false 1181 * hits. 1182 */ 1183 ++vtw_stats.losing[1]; 1184 ++losings; 1185 1186 db_trace(KTR_VTW 1187 , (fp 1188 , "vtw:!mis port %x" 1189 " - free entry idx %x vtw %p" 1190 , lport, idx_decode(ctl, idx) 1191 , vtw)); 1192 } 1193 } 1194 1195 if (fp->nxt) { 1196 it->fp = fp = fatp_next(ctl->fat, fp); 1197 it->slot_idx = 0; 1198 } else { 1199 it->fp = 0; 1200 break; 1201 } 1202 } 1203 ++vtw_stats.miss[1]; 1204 1205 vtw = 0; 1206 out: 1207 if (fatps > vtw_stats.max_chain[1]) 1208 vtw_stats.max_chain[1] = fatps; 1209 if (probes > vtw_stats.max_probe[1]) 1210 vtw_stats.max_probe[1] = probes; 1211 if (losings > vtw_stats.max_loss[1]) 1212 vtw_stats.max_loss[1] = losings; 1213 1214 return vtw; 1215 } 1216 1217 /*!\brief initialise the VTW allocation arena 1218 * 1219 * There are 1+3 allocation classes: 1220 * 0 classless 1221 * {1,2,3} MSL-class based allocation 1222 * 1223 * The allocation arenas are all initialised. Classless gets all the 1224 * space. MSL-class based divides the arena, so that allocation 1225 * within a class can proceed without having to consider entries 1226 * (aka: cache lines) from different classes. 1227 * 1228 * Usually, we are completely classless or class-based, but there can be 1229 * transition periods, corresponding to dynamic adjustments in the config 1230 * by the operator. 1231 */ 1232 static void 1233 vtw_init(fatp_ctl_t *fat, vtw_ctl_t *ctl, const uint32_t n, vtw_t *ctl_base_v) 1234 { 1235 int class_n, i; 1236 vtw_t *base; 1237 1238 ctl->base.v = ctl_base_v; 1239 1240 if (ctl->is_v4) { 1241 ctl->lim.v4 = ctl->base.v4 + n - 1; 1242 ctl->alloc.v4 = ctl->base.v4; 1243 } else { 1244 ctl->lim.v6 = ctl->base.v6 + n - 1; 1245 ctl->alloc.v6 = ctl->base.v6; 1246 } 1247 1248 ctl->nfree = n; 1249 ctl->ctl = ctl; 1250 1251 ctl->idx_bits = 32; 1252 for (ctl->idx_mask = ~0; (ctl->idx_mask & (n-1)) == n-1; ) { 1253 ctl->idx_mask >>= 1; 1254 ctl->idx_bits -= 1; 1255 } 1256 1257 ctl->idx_mask <<= 1; 1258 ctl->idx_mask |= 1; 1259 ctl->idx_bits += 1; 1260 1261 ctl->fat = fat; 1262 fat->vtw = ctl; 1263 1264 /* Divide the resources equally amongst the classes. 1265 * This is not optimal, as the different classes 1266 * arrive and leave at different rates, but it is 1267 * the best I can do for now. 1268 */ 1269 class_n = n / (VTW_NCLASS-1); 1270 base = ctl->base.v; 1271 1272 for (i = 1; i < VTW_NCLASS; ++i) { 1273 int j; 1274 1275 ctl[i] = ctl[0]; 1276 ctl[i].clidx = i; 1277 1278 ctl[i].base.v = base; 1279 ctl[i].alloc = ctl[i].base; 1280 1281 for (j = 0; j < class_n - 1; ++j) { 1282 if (tcp_msl_enable) 1283 base->msl_class = i; 1284 base = vtw_next(ctl, base); 1285 } 1286 1287 ctl[i].lim.v = base; 1288 base = vtw_next(ctl, base); 1289 ctl[i].nfree = class_n; 1290 } 1291 1292 vtw_debug_init(); 1293 } 1294 1295 /*!\brief map class to TCP MSL 1296 */ 1297 static inline uint32_t 1298 class_to_msl(int class) 1299 { 1300 switch (class) { 1301 case 0: 1302 case 1: 1303 return tcp_msl_remote ? tcp_msl_remote : (TCPTV_MSL >> 0); 1304 case 2: 1305 return tcp_msl_local ? tcp_msl_local : (TCPTV_MSL >> 1); 1306 default: 1307 return tcp_msl_loop ? tcp_msl_loop : (TCPTV_MSL >> 2); 1308 } 1309 } 1310 1311 /*!\brief map TCP MSL to class 1312 */ 1313 static inline uint32_t 1314 msl_to_class(int msl) 1315 { 1316 if (tcp_msl_enable) { 1317 if (msl <= (tcp_msl_loop ? tcp_msl_loop : (TCPTV_MSL >> 2))) 1318 return 1+2; 1319 if (msl <= (tcp_msl_local ? tcp_msl_local : (TCPTV_MSL >> 1))) 1320 return 1+1; 1321 return 1; 1322 } 1323 return 0; 1324 } 1325 1326 /*!\brief allocate a vtw entry 1327 */ 1328 static inline vtw_t * 1329 vtw_alloc(vtw_ctl_t *ctl) 1330 { 1331 vtw_t *vtw = 0; 1332 int stuck = 0; 1333 int avail = ctl ? (ctl->nalloc + ctl->nfree) : 0; 1334 int msl; 1335 1336 KASSERT(mutex_owned(softnet_lock)); 1337 1338 /* If no resources, we will not get far. 1339 */ 1340 if (!ctl || !ctl->base.v4 || avail <= 0) 1341 return 0; 1342 1343 /* Obtain a free one. 1344 */ 1345 while (!ctl->nfree) { 1346 vtw_age(ctl, 0); 1347 1348 if (++stuck > avail) { 1349 /* When in transition between 1350 * schemes (classless, classed) we 1351 * can be stuck having to await the 1352 * expiration of cross-allocated entries. 1353 * 1354 * Returning zero means we will fall back to the 1355 * traditional TIME_WAIT handling, except in the 1356 * case of a re-shed, in which case we cannot 1357 * perform the reshecd, but will retain the extant 1358 * entry. 1359 */ 1360 db_trace(KTR_VTW 1361 , (ctl, "vtw:!none free in class %x %x/%x" 1362 , ctl->clidx 1363 , ctl->nalloc, ctl->nfree)); 1364 1365 return 0; 1366 } 1367 } 1368 1369 vtw = ctl->alloc.v; 1370 1371 if (vtw->msl_class != ctl->clidx) { 1372 /* Usurping rules: 1373 * 0 -> {1,2,3} or {1,2,3} -> 0 1374 */ 1375 KASSERT(!vtw->msl_class || !ctl->clidx); 1376 1377 if (vtw->hashed || vtw->expire.tv_sec) { 1378 /* As this is owned by some other class, 1379 * we must wait for it to expire it. 1380 * This will only happen on class/classless 1381 * transitions, which are guaranteed to progress 1382 * to completion in small finite time, barring bugs. 1383 */ 1384 db_trace(KTR_VTW 1385 , (ctl, "vtw:!%p class %x!=%x %x:%x%s" 1386 , vtw, vtw->msl_class, ctl->clidx 1387 , vtw->expire.tv_sec 1388 , vtw->expire.tv_usec 1389 , vtw->hashed ? " hashed" : "")); 1390 1391 return 0; 1392 } 1393 1394 db_trace(KTR_VTW 1395 , (ctl, "vtw:!%p usurped from %x to %x" 1396 , vtw, vtw->msl_class, ctl->clidx)); 1397 1398 vtw->msl_class = ctl->clidx; 1399 } 1400 1401 if (vtw_alive(vtw)) { 1402 KASSERT(0 && "next free not free"); 1403 return 0; 1404 } 1405 1406 /* Advance allocation poiter. 1407 */ 1408 ctl->alloc.v = vtw_next(ctl, vtw); 1409 1410 --ctl->nfree; 1411 ++ctl->nalloc; 1412 1413 msl = (2 * class_to_msl(ctl->clidx) * 1000) / PR_SLOWHZ; // msec 1414 1415 /* mark expiration 1416 */ 1417 getmicrouptime(&vtw->expire); 1418 1419 /* Move expiration into the future. 1420 */ 1421 vtw->expire.tv_sec += msl / 1000; 1422 vtw->expire.tv_usec += 1000 * (msl % 1000); 1423 1424 while (vtw->expire.tv_usec >= 1000*1000) { 1425 vtw->expire.tv_usec -= 1000*1000; 1426 vtw->expire.tv_sec += 1; 1427 } 1428 1429 if (!ctl->oldest.v) 1430 ctl->oldest.v = vtw; 1431 1432 return vtw; 1433 } 1434 1435 /*!\brief expiration 1436 */ 1437 static int 1438 vtw_age(vtw_ctl_t *ctl, struct timeval *_when) 1439 { 1440 vtw_t *vtw; 1441 struct timeval then, *when = _when; 1442 int maxtries = 0; 1443 1444 if (!ctl->oldest.v) { 1445 KASSERT(!ctl->nalloc); 1446 return 0; 1447 } 1448 1449 for (vtw = ctl->oldest.v; vtw && ctl->nalloc; ) { 1450 if (++maxtries > ctl->nalloc) 1451 break; 1452 1453 if (vtw->msl_class != ctl->clidx) { 1454 db_trace(KTR_VTW 1455 , (vtw, "vtw:!age class mismatch %x != %x" 1456 , vtw->msl_class, ctl->clidx)); 1457 /* XXXX 1458 * See if the appropriate action is to skip to the next. 1459 * XXXX 1460 */ 1461 ctl->oldest.v = vtw = vtw_next(ctl, vtw); 1462 continue; 1463 } 1464 if (!when) { 1465 /* Latch oldest timeval if none specified. 1466 */ 1467 then = vtw->expire; 1468 when = &then; 1469 } 1470 1471 if (!timercmp(&vtw->expire, when, <=)) 1472 break; 1473 1474 db_trace(KTR_VTW 1475 , (vtw, "vtw: expire %x %8.8x:%8.8x %x/%x" 1476 , ctl->clidx 1477 , vtw->expire.tv_sec 1478 , vtw->expire.tv_usec 1479 , ctl->nalloc 1480 , ctl->nfree)); 1481 1482 if (!_when) 1483 ++vtw_stats.kill; 1484 1485 vtw_del(ctl, vtw); 1486 vtw = ctl->oldest.v; 1487 } 1488 1489 return ctl->nalloc; // # remaining allocated 1490 } 1491 1492 static callout_t vtw_cs; 1493 1494 /*!\brief notice the passage of time. 1495 * It seems to be getting faster. What happened to the year? 1496 */ 1497 static void 1498 vtw_tick(void *arg) 1499 { 1500 struct timeval now; 1501 int i, cnt = 0; 1502 1503 getmicrouptime(&now); 1504 1505 db_trace(KTR_VTW, (arg, "vtk: tick - now %8.8x:%8.8x" 1506 , now.tv_sec, now.tv_usec)); 1507 1508 mutex_enter(softnet_lock); 1509 1510 for (i = 0; i < VTW_NCLASS; ++i) { 1511 cnt += vtw_age(&vtw_tcpv4[i], &now); 1512 cnt += vtw_age(&vtw_tcpv6[i], &now); 1513 } 1514 1515 /* Keep ticks coming while we need them. 1516 */ 1517 if (cnt) 1518 callout_schedule(&vtw_cs, hz / 5); 1519 else { 1520 tcp_vtw_was_enabled = 0; 1521 tcbtable.vestige = 0; 1522 } 1523 mutex_exit(softnet_lock); 1524 } 1525 1526 /* in_pcblookup_ports assist for handling vestigial entries. 1527 */ 1528 static void * 1529 tcp_init_ports_v4(struct in_addr addr, u_int port, int wild) 1530 { 1531 struct tcp_ports_iterator *it = &tcp_ports_iterator_v4; 1532 1533 bzero(it, sizeof (*it)); 1534 1535 /* Note: the reference to vtw_tcpv4[0] is fine. 1536 * We do not need per-class iteration. We just 1537 * need to get to the fat, and there is one 1538 * shared fat. 1539 */ 1540 if (vtw_tcpv4[0].fat) { 1541 it->addr.v4 = addr; 1542 it->port = port; 1543 it->wild = !!wild; 1544 it->ctl = &vtw_tcpv4[0]; 1545 1546 ++vtw_stats.look[1]; 1547 } 1548 1549 return it; 1550 } 1551 1552 /*!\brief export an IPv4 vtw. 1553 */ 1554 static int 1555 vtw_export_v4(vtw_ctl_t *ctl, vtw_t *vtw, vestigial_inpcb_t *res) 1556 { 1557 vtw_v4_t *v4 = (void*)vtw; 1558 1559 bzero(res, sizeof (*res)); 1560 1561 if (ctl && vtw) { 1562 if (!ctl->clidx && vtw->msl_class) 1563 ctl += vtw->msl_class; 1564 else 1565 KASSERT(ctl->clidx == vtw->msl_class); 1566 1567 res->valid = 1; 1568 res->v4 = 1; 1569 1570 res->faddr.v4.s_addr = v4->faddr; 1571 res->laddr.v4.s_addr = v4->laddr; 1572 res->fport = v4->fport; 1573 res->lport = v4->lport; 1574 res->vtw = vtw; // netlock held over call(s) 1575 res->ctl = ctl; 1576 res->reuse_addr = vtw->reuse_addr; 1577 res->reuse_port = vtw->reuse_port; 1578 res->snd_nxt = vtw->snd_nxt; 1579 res->rcv_nxt = vtw->rcv_nxt; 1580 res->rcv_wnd = vtw->rcv_wnd; 1581 res->uid = vtw->uid; 1582 } 1583 1584 return res->valid; 1585 } 1586 1587 /*!\brief return next port in the port iterator. yowza. 1588 */ 1589 static int 1590 tcp_next_port_v4(void *arg, struct vestigial_inpcb *res) 1591 { 1592 struct tcp_ports_iterator *it = arg; 1593 vtw_t *vtw = 0; 1594 1595 if (it->ctl) 1596 vtw = vtw_next_port_v4(it); 1597 1598 if (!vtw) 1599 it->ctl = 0; 1600 1601 return vtw_export_v4(it->ctl, vtw, res); 1602 } 1603 1604 static int 1605 tcp_lookup_v4(struct in_addr faddr, uint16_t fport, 1606 struct in_addr laddr, uint16_t lport, 1607 struct vestigial_inpcb *res) 1608 { 1609 vtw_t *vtw; 1610 vtw_ctl_t *ctl; 1611 1612 1613 db_trace(KTR_VTW 1614 , (res, "vtw: lookup %A:%P %A:%P" 1615 , faddr, fport 1616 , laddr, lport)); 1617 1618 vtw = vtw_lookup_hash_v4((ctl = &vtw_tcpv4[0]) 1619 , faddr.s_addr, fport 1620 , laddr.s_addr, lport, 0); 1621 1622 return vtw_export_v4(ctl, vtw, res); 1623 } 1624 1625 /* in_pcblookup_ports assist for handling vestigial entries. 1626 */ 1627 static void * 1628 tcp_init_ports_v6(const struct in6_addr *addr, u_int port, int wild) 1629 { 1630 struct tcp_ports_iterator *it = &tcp_ports_iterator_v6; 1631 1632 bzero(it, sizeof (*it)); 1633 1634 /* Note: the reference to vtw_tcpv6[0] is fine. 1635 * We do not need per-class iteration. We just 1636 * need to get to the fat, and there is one 1637 * shared fat. 1638 */ 1639 if (vtw_tcpv6[0].fat) { 1640 it->addr.v6 = *addr; 1641 it->port = port; 1642 it->wild = !!wild; 1643 it->ctl = &vtw_tcpv6[0]; 1644 1645 ++vtw_stats.look[1]; 1646 } 1647 1648 return it; 1649 } 1650 1651 /*!\brief export an IPv6 vtw. 1652 */ 1653 static int 1654 vtw_export_v6(vtw_ctl_t *ctl, vtw_t *vtw, vestigial_inpcb_t *res) 1655 { 1656 vtw_v6_t *v6 = (void*)vtw; 1657 1658 bzero(res, sizeof (*res)); 1659 1660 if (ctl && vtw) { 1661 if (!ctl->clidx && vtw->msl_class) 1662 ctl += vtw->msl_class; 1663 else 1664 KASSERT(ctl->clidx == vtw->msl_class); 1665 1666 res->valid = 1; 1667 res->v4 = 0; 1668 1669 res->faddr.v6 = v6->faddr; 1670 res->laddr.v6 = v6->laddr; 1671 res->fport = v6->fport; 1672 res->lport = v6->lport; 1673 res->vtw = vtw; // netlock held over call(s) 1674 res->ctl = ctl; 1675 1676 res->v6only = vtw->v6only; 1677 res->reuse_addr = vtw->reuse_addr; 1678 res->reuse_port = vtw->reuse_port; 1679 1680 res->snd_nxt = vtw->snd_nxt; 1681 res->rcv_nxt = vtw->rcv_nxt; 1682 res->rcv_wnd = vtw->rcv_wnd; 1683 res->uid = vtw->uid; 1684 } 1685 1686 return res->valid; 1687 } 1688 1689 static int 1690 tcp_next_port_v6(void *arg, struct vestigial_inpcb *res) 1691 { 1692 struct tcp_ports_iterator *it = arg; 1693 vtw_t *vtw = 0; 1694 1695 if (it->ctl) 1696 vtw = vtw_next_port_v6(it); 1697 1698 if (!vtw) 1699 it->ctl = 0; 1700 1701 return vtw_export_v6(it->ctl, vtw, res); 1702 } 1703 1704 static int 1705 tcp_lookup_v6(const struct in6_addr *faddr, uint16_t fport, 1706 const struct in6_addr *laddr, uint16_t lport, 1707 struct vestigial_inpcb *res) 1708 { 1709 vtw_ctl_t *ctl; 1710 vtw_t *vtw; 1711 1712 db_trace(KTR_VTW 1713 , (res, "vtw: lookup %6A:%P %6A:%P" 1714 , db_store(faddr, sizeof (*faddr)), fport 1715 , db_store(laddr, sizeof (*laddr)), lport)); 1716 1717 vtw = vtw_lookup_hash_v6((ctl = &vtw_tcpv6[0]) 1718 , faddr, fport 1719 , laddr, lport, 0); 1720 1721 return vtw_export_v6(ctl, vtw, res); 1722 } 1723 1724 static vestigial_hooks_t tcp_hooks = { 1725 .init_ports4 = tcp_init_ports_v4, 1726 .next_port4 = tcp_next_port_v4, 1727 .lookup4 = tcp_lookup_v4, 1728 .init_ports6 = tcp_init_ports_v6, 1729 .next_port6 = tcp_next_port_v6, 1730 .lookup6 = tcp_lookup_v6, 1731 }; 1732 1733 static bool 1734 vtw_select(int af, fatp_ctl_t **fatp, vtw_ctl_t **ctlp) 1735 { 1736 fatp_ctl_t *fat; 1737 vtw_ctl_t *ctl; 1738 1739 switch (af) { 1740 case AF_INET: 1741 fat = &fat_tcpv4; 1742 ctl = &vtw_tcpv4[0]; 1743 break; 1744 case AF_INET6: 1745 fat = &fat_tcpv6; 1746 ctl = &vtw_tcpv6[0]; 1747 break; 1748 default: 1749 return false; 1750 } 1751 if (fatp != NULL) 1752 *fatp = fat; 1753 if (ctlp != NULL) 1754 *ctlp = ctl; 1755 return true; 1756 } 1757 1758 /*!\brief initialize controlling instance 1759 */ 1760 static int 1761 vtw_control_init(int af) 1762 { 1763 fatp_ctl_t *fat; 1764 vtw_ctl_t *ctl; 1765 fatp_t *fat_base; 1766 fatp_t **fat_hash; 1767 vtw_t *ctl_base_v; 1768 uint32_t n, m; 1769 size_t sz; 1770 1771 KASSERT(powerof2(tcp_vtw_entries)); 1772 1773 if (!vtw_select(af, &fat, &ctl)) 1774 return EAFNOSUPPORT; 1775 1776 if (fat->hash != NULL) { 1777 KASSERT(fat->base != NULL && ctl->base.v != NULL); 1778 return 0; 1779 } 1780 1781 /* Allocate 10% more capacity in the fat pointers. 1782 * We should only need ~#hash additional based on 1783 * how they age, but TIME_WAIT assassination could cause 1784 * sparse fat pointer utilisation. 1785 */ 1786 m = 512; 1787 n = 2*m + (11 * (tcp_vtw_entries / fatp_ntags())) / 10; 1788 sz = (ctl->is_v4 ? sizeof(vtw_v4_t) : sizeof(vtw_v6_t)); 1789 1790 fat_hash = kmem_zalloc(2*m * sizeof(fatp_t *), KM_NOSLEEP); 1791 1792 if (fat_hash == NULL) { 1793 printf("%s: could not allocate %zu bytes for " 1794 "hash anchors", __func__, 2*m * sizeof(fatp_t *)); 1795 return ENOMEM; 1796 } 1797 1798 fat_base = kmem_zalloc(2*n * sizeof(fatp_t), KM_NOSLEEP); 1799 1800 if (fat_base == NULL) { 1801 kmem_free(fat_hash, 2*m * sizeof (fatp_t *)); 1802 printf("%s: could not allocate %zu bytes for " 1803 "fatp_t array", __func__, 2*n * sizeof(fatp_t)); 1804 return ENOMEM; 1805 } 1806 1807 ctl_base_v = kmem_zalloc(tcp_vtw_entries * sz, KM_NOSLEEP); 1808 1809 if (ctl_base_v == NULL) { 1810 kmem_free(fat_hash, 2*m * sizeof (fatp_t *)); 1811 kmem_free(fat_base, 2*n * sizeof(fatp_t)); 1812 printf("%s: could not allocate %zu bytes for " 1813 "vtw_t array", __func__, tcp_vtw_entries * sz); 1814 return ENOMEM; 1815 } 1816 1817 fatp_init(fat, n, m, fat_base, fat_hash); 1818 1819 vtw_init(fat, ctl, tcp_vtw_entries, ctl_base_v); 1820 1821 return 0; 1822 } 1823 1824 /*!\brief select controlling instance 1825 */ 1826 static vtw_ctl_t * 1827 vtw_control(int af, uint32_t msl) 1828 { 1829 fatp_ctl_t *fat; 1830 vtw_ctl_t *ctl; 1831 int class = msl_to_class(msl); 1832 1833 if (!vtw_select(af, &fat, &ctl)) 1834 return NULL; 1835 1836 if (!fat->base || !ctl->base.v) 1837 return NULL; 1838 1839 if (!tcp_vtw_was_enabled) { 1840 /* This guarantees is timer ticks until we no longer need them. 1841 */ 1842 tcp_vtw_was_enabled = 1; 1843 1844 callout_schedule(&vtw_cs, hz / 5); 1845 1846 tcbtable.vestige = &tcp_hooks; 1847 } 1848 1849 return ctl + class; 1850 } 1851 1852 /*!\brief add TCP pcb to vestigial timewait 1853 */ 1854 int 1855 vtw_add(int af, struct tcpcb *tp) 1856 { 1857 int enable; 1858 vtw_ctl_t *ctl; 1859 vtw_t *vtw; 1860 1861 KASSERT(mutex_owned(softnet_lock)); 1862 1863 ctl = vtw_control(af, tp->t_msl); 1864 if (!ctl) 1865 return 0; 1866 1867 enable = (af == AF_INET) ? tcp4_vtw_enable : tcp6_vtw_enable; 1868 1869 vtw = vtw_alloc(ctl); 1870 1871 if (vtw) { 1872 vtw->snd_nxt = tp->snd_nxt; 1873 vtw->rcv_nxt = tp->rcv_nxt; 1874 1875 switch (af) { 1876 case AF_INET: { 1877 struct inpcb *inp = tp->t_inpcb; 1878 vtw_v4_t *v4 = (void*)vtw; 1879 1880 v4->faddr = inp->inp_faddr.s_addr; 1881 v4->laddr = inp->inp_laddr.s_addr; 1882 v4->fport = inp->inp_fport; 1883 v4->lport = inp->inp_lport; 1884 1885 vtw->reuse_port = !!(inp->inp_socket->so_options 1886 & SO_REUSEPORT); 1887 vtw->reuse_addr = !!(inp->inp_socket->so_options 1888 & SO_REUSEADDR); 1889 vtw->v6only = 0; 1890 vtw->uid = inp->inp_socket->so_uidinfo->ui_uid; 1891 1892 vtw_inshash_v4(ctl, vtw); 1893 1894 1895 #ifdef VTW_DEBUG 1896 /* Immediate lookup (connected and port) to 1897 * ensure at least that works! 1898 */ 1899 if (enable & 4) { 1900 KASSERT(vtw_lookup_hash_v4 1901 (ctl 1902 , inp->inp_faddr.s_addr, inp->inp_fport 1903 , inp->inp_laddr.s_addr, inp->inp_lport 1904 , 0) 1905 == vtw); 1906 KASSERT(vtw_lookup_hash_v4 1907 (ctl 1908 , inp->inp_faddr.s_addr, inp->inp_fport 1909 , inp->inp_laddr.s_addr, inp->inp_lport 1910 , 1)); 1911 } 1912 /* Immediate port iterator functionality check: not wild 1913 */ 1914 if (enable & 8) { 1915 struct tcp_ports_iterator *it; 1916 struct vestigial_inpcb res; 1917 int cnt = 0; 1918 1919 it = tcp_init_ports_v4(inp->inp_laddr 1920 , inp->inp_lport, 0); 1921 1922 while (tcp_next_port_v4(it, &res)) { 1923 ++cnt; 1924 } 1925 KASSERT(cnt); 1926 } 1927 /* Immediate port iterator functionality check: wild 1928 */ 1929 if (enable & 16) { 1930 struct tcp_ports_iterator *it; 1931 struct vestigial_inpcb res; 1932 struct in_addr any; 1933 int cnt = 0; 1934 1935 any.s_addr = htonl(INADDR_ANY); 1936 1937 it = tcp_init_ports_v4(any, inp->inp_lport, 1); 1938 1939 while (tcp_next_port_v4(it, &res)) { 1940 ++cnt; 1941 } 1942 KASSERT(cnt); 1943 } 1944 #endif /* VTW_DEBUG */ 1945 break; 1946 } 1947 1948 case AF_INET6: { 1949 struct in6pcb *inp = tp->t_in6pcb; 1950 vtw_v6_t *v6 = (void*)vtw; 1951 1952 v6->faddr = inp->in6p_faddr; 1953 v6->laddr = inp->in6p_laddr; 1954 v6->fport = inp->in6p_fport; 1955 v6->lport = inp->in6p_lport; 1956 1957 vtw->reuse_port = !!(inp->in6p_socket->so_options 1958 & SO_REUSEPORT); 1959 vtw->reuse_addr = !!(inp->in6p_socket->so_options 1960 & SO_REUSEADDR); 1961 vtw->v6only = !!(inp->in6p_flags 1962 & IN6P_IPV6_V6ONLY); 1963 vtw->uid = inp->in6p_socket->so_uidinfo->ui_uid; 1964 1965 vtw_inshash_v6(ctl, vtw); 1966 #ifdef VTW_DEBUG 1967 /* Immediate lookup (connected and port) to 1968 * ensure at least that works! 1969 */ 1970 if (enable & 4) { 1971 KASSERT(vtw_lookup_hash_v6(ctl 1972 , &inp->in6p_faddr, inp->in6p_fport 1973 , &inp->in6p_laddr, inp->in6p_lport 1974 , 0) 1975 == vtw); 1976 KASSERT(vtw_lookup_hash_v6 1977 (ctl 1978 , &inp->in6p_faddr, inp->in6p_fport 1979 , &inp->in6p_laddr, inp->in6p_lport 1980 , 1)); 1981 } 1982 /* Immediate port iterator functionality check: not wild 1983 */ 1984 if (enable & 8) { 1985 struct tcp_ports_iterator *it; 1986 struct vestigial_inpcb res; 1987 int cnt = 0; 1988 1989 it = tcp_init_ports_v6(&inp->in6p_laddr 1990 , inp->in6p_lport, 0); 1991 1992 while (tcp_next_port_v6(it, &res)) { 1993 ++cnt; 1994 } 1995 KASSERT(cnt); 1996 } 1997 /* Immediate port iterator functionality check: wild 1998 */ 1999 if (enable & 16) { 2000 struct tcp_ports_iterator *it; 2001 struct vestigial_inpcb res; 2002 static struct in6_addr any = IN6ADDR_ANY_INIT; 2003 int cnt = 0; 2004 2005 it = tcp_init_ports_v6(&any 2006 , inp->in6p_lport, 1); 2007 2008 while (tcp_next_port_v6(it, &res)) { 2009 ++cnt; 2010 } 2011 KASSERT(cnt); 2012 } 2013 #endif /* VTW_DEBUG */ 2014 break; 2015 } 2016 } 2017 2018 tcp_canceltimers(tp); 2019 tp = tcp_close(tp); 2020 KASSERT(!tp); 2021 2022 return 1; 2023 } 2024 2025 return 0; 2026 } 2027 2028 /*!\brief restart timer for vestigial time-wait entry 2029 */ 2030 static void 2031 vtw_restart_v4(vestigial_inpcb_t *vp) 2032 { 2033 vtw_v4_t copy = *(vtw_v4_t*)vp->vtw; 2034 vtw_t *vtw; 2035 vtw_t *cp = ©.common; 2036 vtw_ctl_t *ctl; 2037 2038 KASSERT(mutex_owned(softnet_lock)); 2039 2040 db_trace(KTR_VTW 2041 , (vp->vtw, "vtw: restart %A:%P %A:%P" 2042 , vp->faddr.v4.s_addr, vp->fport 2043 , vp->laddr.v4.s_addr, vp->lport)); 2044 2045 /* Class might have changed, so have a squiz. 2046 */ 2047 ctl = vtw_control(AF_INET, class_to_msl(cp->msl_class)); 2048 vtw = vtw_alloc(ctl); 2049 2050 if (vtw) { 2051 vtw_v4_t *v4 = (void*)vtw; 2052 2053 /* Safe now to unhash the old entry 2054 */ 2055 vtw_del(vp->ctl, vp->vtw); 2056 2057 vtw->snd_nxt = cp->snd_nxt; 2058 vtw->rcv_nxt = cp->rcv_nxt; 2059 2060 v4->faddr = copy.faddr; 2061 v4->laddr = copy.laddr; 2062 v4->fport = copy.fport; 2063 v4->lport = copy.lport; 2064 2065 vtw->reuse_port = cp->reuse_port; 2066 vtw->reuse_addr = cp->reuse_addr; 2067 vtw->v6only = 0; 2068 vtw->uid = cp->uid; 2069 2070 vtw_inshash_v4(ctl, vtw); 2071 } 2072 2073 vp->valid = 0; 2074 } 2075 2076 /*!\brief restart timer for vestigial time-wait entry 2077 */ 2078 static void 2079 vtw_restart_v6(vestigial_inpcb_t *vp) 2080 { 2081 vtw_v6_t copy = *(vtw_v6_t*)vp->vtw; 2082 vtw_t *vtw; 2083 vtw_t *cp = ©.common; 2084 vtw_ctl_t *ctl; 2085 2086 KASSERT(mutex_owned(softnet_lock)); 2087 2088 db_trace(KTR_VTW 2089 , (vp->vtw, "vtw: restart %6A:%P %6A:%P" 2090 , db_store(&vp->faddr.v6, sizeof (vp->faddr.v6)) 2091 , vp->fport 2092 , db_store(&vp->laddr.v6, sizeof (vp->laddr.v6)) 2093 , vp->lport)); 2094 2095 /* Class might have changed, so have a squiz. 2096 */ 2097 ctl = vtw_control(AF_INET6, class_to_msl(cp->msl_class)); 2098 vtw = vtw_alloc(ctl); 2099 2100 if (vtw) { 2101 vtw_v6_t *v6 = (void*)vtw; 2102 2103 /* Safe now to unhash the old entry 2104 */ 2105 vtw_del(vp->ctl, vp->vtw); 2106 2107 vtw->snd_nxt = cp->snd_nxt; 2108 vtw->rcv_nxt = cp->rcv_nxt; 2109 2110 v6->faddr = copy.faddr; 2111 v6->laddr = copy.laddr; 2112 v6->fport = copy.fport; 2113 v6->lport = copy.lport; 2114 2115 vtw->reuse_port = cp->reuse_port; 2116 vtw->reuse_addr = cp->reuse_addr; 2117 vtw->v6only = cp->v6only; 2118 vtw->uid = cp->uid; 2119 2120 vtw_inshash_v6(ctl, vtw); 2121 } 2122 2123 vp->valid = 0; 2124 } 2125 2126 /*!\brief restart timer for vestigial time-wait entry 2127 */ 2128 void 2129 vtw_restart(vestigial_inpcb_t *vp) 2130 { 2131 if (!vp || !vp->valid) 2132 return; 2133 2134 if (vp->v4) 2135 vtw_restart_v4(vp); 2136 else 2137 vtw_restart_v6(vp); 2138 } 2139 2140 int 2141 sysctl_tcp_vtw_enable(SYSCTLFN_ARGS) 2142 { 2143 int en, rc; 2144 struct sysctlnode node; 2145 2146 node = *rnode; 2147 en = *(int *)rnode->sysctl_data; 2148 node.sysctl_data = &en; 2149 2150 rc = sysctl_lookup(SYSCTLFN_CALL(&node)); 2151 if (rc != 0 || newp == NULL) 2152 return rc; 2153 2154 if (rnode->sysctl_data != &tcp4_vtw_enable && 2155 rnode->sysctl_data != &tcp6_vtw_enable) 2156 rc = ENOENT; 2157 else if ((en & 1) == 0) 2158 rc = 0; 2159 else if (rnode->sysctl_data == &tcp4_vtw_enable) 2160 rc = vtw_control_init(AF_INET); 2161 else /* rnode->sysctl_data == &tcp6_vtw_enable */ 2162 rc = vtw_control_init(AF_INET6); 2163 2164 if (rc == 0) 2165 *(int *)rnode->sysctl_data = en; 2166 2167 return rc; 2168 } 2169 2170 int 2171 vtw_earlyinit(void) 2172 { 2173 int i, rc; 2174 2175 callout_init(&vtw_cs, 0); 2176 callout_setfunc(&vtw_cs, vtw_tick, 0); 2177 2178 for (i = 0; i < VTW_NCLASS; ++i) { 2179 vtw_tcpv4[i].is_v4 = 1; 2180 vtw_tcpv6[i].is_v6 = 1; 2181 } 2182 2183 if ((tcp4_vtw_enable & 1) != 0 && 2184 (rc = vtw_control_init(AF_INET)) != 0) 2185 return rc; 2186 2187 if ((tcp6_vtw_enable & 1) != 0 && 2188 (rc = vtw_control_init(AF_INET6)) != 0) 2189 return rc; 2190 2191 return 0; 2192 } 2193 2194 #ifdef VTW_DEBUG 2195 #include <sys/syscallargs.h> 2196 #include <sys/sysctl.h> 2197 2198 /*!\brief add lalp, fafp entries for debug 2199 */ 2200 int 2201 vtw_debug_add(int af, sin_either_t *la, sin_either_t *fa, int msl, int class) 2202 { 2203 vtw_ctl_t *ctl; 2204 vtw_t *vtw; 2205 2206 ctl = vtw_control(af, msl ? msl : class_to_msl(class)); 2207 if (!ctl) 2208 return 0; 2209 2210 vtw = vtw_alloc(ctl); 2211 2212 if (vtw) { 2213 vtw->snd_nxt = 0; 2214 vtw->rcv_nxt = 0; 2215 2216 switch (af) { 2217 case AF_INET: { 2218 vtw_v4_t *v4 = (void*)vtw; 2219 2220 v4->faddr = fa->sin_addr.v4.s_addr; 2221 v4->laddr = la->sin_addr.v4.s_addr; 2222 v4->fport = fa->sin_port; 2223 v4->lport = la->sin_port; 2224 2225 vtw->reuse_port = 1; 2226 vtw->reuse_addr = 1; 2227 vtw->v6only = 0; 2228 vtw->uid = 0; 2229 2230 vtw_inshash_v4(ctl, vtw); 2231 break; 2232 } 2233 2234 case AF_INET6: { 2235 vtw_v6_t *v6 = (void*)vtw; 2236 2237 v6->faddr = fa->sin_addr.v6; 2238 v6->laddr = la->sin_addr.v6; 2239 2240 v6->fport = fa->sin_port; 2241 v6->lport = la->sin_port; 2242 2243 vtw->reuse_port = 1; 2244 vtw->reuse_addr = 1; 2245 vtw->v6only = 0; 2246 vtw->uid = 0; 2247 2248 vtw_inshash_v6(ctl, vtw); 2249 break; 2250 } 2251 2252 default: 2253 break; 2254 } 2255 2256 return 1; 2257 } 2258 2259 return 0; 2260 } 2261 2262 static int vtw_syscall = 0; 2263 2264 static int 2265 vtw_debug_process(vtw_sysargs_t *ap) 2266 { 2267 struct vestigial_inpcb vestige; 2268 int rc = 0; 2269 2270 mutex_enter(softnet_lock); 2271 2272 switch (ap->op) { 2273 case 0: // insert 2274 vtw_debug_add(ap->la.sin_family 2275 , &ap->la 2276 , &ap->fa 2277 , TCPTV_MSL 2278 , 0); 2279 break; 2280 2281 case 1: // lookup 2282 case 2: // restart 2283 switch (ap->la.sin_family) { 2284 case AF_INET: 2285 if (tcp_lookup_v4(ap->fa.sin_addr.v4, ap->fa.sin_port, 2286 ap->la.sin_addr.v4, ap->la.sin_port, 2287 &vestige)) { 2288 if (ap->op == 2) { 2289 vtw_restart(&vestige); 2290 } 2291 rc = 0; 2292 } else 2293 rc = ESRCH; 2294 break; 2295 2296 case AF_INET6: 2297 if (tcp_lookup_v6(&ap->fa.sin_addr.v6, ap->fa.sin_port, 2298 &ap->la.sin_addr.v6, ap->la.sin_port, 2299 &vestige)) { 2300 if (ap->op == 2) { 2301 vtw_restart(&vestige); 2302 } 2303 rc = 0; 2304 } else 2305 rc = ESRCH; 2306 break; 2307 default: 2308 rc = EINVAL; 2309 } 2310 break; 2311 2312 default: 2313 rc = EINVAL; 2314 } 2315 2316 mutex_exit(softnet_lock); 2317 return rc; 2318 } 2319 2320 struct sys_vtw_args { 2321 syscallarg(const vtw_sysargs_t *) req; 2322 syscallarg(size_t) len; 2323 }; 2324 2325 static int 2326 vtw_sys(struct lwp *l, const void *_, register_t *retval) 2327 { 2328 const struct sys_vtw_args *uap = _; 2329 void *buf; 2330 int rc; 2331 size_t len = SCARG(uap, len); 2332 2333 if (len != sizeof (vtw_sysargs_t)) 2334 return EINVAL; 2335 2336 buf = kmem_alloc(len, KM_SLEEP); 2337 if (!buf) 2338 return ENOMEM; 2339 2340 rc = copyin(SCARG(uap, req), buf, len); 2341 if (!rc) { 2342 rc = vtw_debug_process(buf); 2343 } 2344 kmem_free(buf, len); 2345 2346 return rc; 2347 } 2348 2349 static void 2350 vtw_sanity_check(void) 2351 { 2352 vtw_ctl_t *ctl; 2353 vtw_t *vtw; 2354 int i; 2355 int n; 2356 2357 for (i = 0; i < VTW_NCLASS; ++i) { 2358 ctl = &vtw_tcpv4[i]; 2359 2360 if (!ctl->base.v || ctl->nalloc) 2361 continue; 2362 2363 for (n = 0, vtw = ctl->base.v; ; ) { 2364 ++n; 2365 vtw = vtw_next(ctl, vtw); 2366 if (vtw == ctl->base.v) 2367 break; 2368 } 2369 db_trace(KTR_VTW 2370 , (ctl, "sanity: class %x n %x nfree %x" 2371 , i, n, ctl->nfree)); 2372 2373 KASSERT(n == ctl->nfree); 2374 } 2375 2376 for (i = 0; i < VTW_NCLASS; ++i) { 2377 ctl = &vtw_tcpv6[i]; 2378 2379 if (!ctl->base.v || ctl->nalloc) 2380 continue; 2381 2382 for (n = 0, vtw = ctl->base.v; ; ) { 2383 ++n; 2384 vtw = vtw_next(ctl, vtw); 2385 if (vtw == ctl->base.v) 2386 break; 2387 } 2388 db_trace(KTR_VTW 2389 , (ctl, "sanity: class %x n %x nfree %x" 2390 , i, n, ctl->nfree)); 2391 KASSERT(n == ctl->nfree); 2392 } 2393 } 2394 2395 /*!\brief Initialise debug support. 2396 */ 2397 static void 2398 vtw_debug_init(void) 2399 { 2400 int i; 2401 2402 vtw_sanity_check(); 2403 2404 if (vtw_syscall) 2405 return; 2406 2407 for (i = 511; i; --i) { 2408 if (sysent[i].sy_call == sys_nosys) { 2409 sysent[i].sy_call = vtw_sys; 2410 sysent[i].sy_narg = 2; 2411 sysent[i].sy_argsize = sizeof (struct sys_vtw_args); 2412 sysent[i].sy_flags = 0; 2413 2414 vtw_syscall = i; 2415 break; 2416 } 2417 } 2418 if (i) { 2419 const struct sysctlnode *node; 2420 uint32_t flags; 2421 2422 flags = sysctl_root.sysctl_flags; 2423 2424 sysctl_root.sysctl_flags |= CTLFLAG_READWRITE; 2425 sysctl_root.sysctl_flags &= ~CTLFLAG_PERMANENT; 2426 2427 sysctl_createv(0, 0, 0, &node, 2428 CTLFLAG_PERMANENT, CTLTYPE_NODE, 2429 "koff", 2430 SYSCTL_DESCR("Kernel Obscure Feature Finder"), 2431 0, 0, 0, 0, CTL_CREATE, CTL_EOL); 2432 2433 if (!node) { 2434 sysctl_createv(0, 0, 0, &node, 2435 CTLFLAG_PERMANENT, CTLTYPE_NODE, 2436 "koffka", 2437 SYSCTL_DESCR("The Real(tm) Kernel" 2438 " Obscure Feature Finder"), 2439 0, 0, 0, 0, CTL_CREATE, CTL_EOL); 2440 } 2441 if (node) { 2442 sysctl_createv(0, 0, 0, 0, 2443 CTLFLAG_PERMANENT|CTLFLAG_READONLY, 2444 CTLTYPE_INT, "vtw_debug_syscall", 2445 SYSCTL_DESCR("vtw debug" 2446 " system call number"), 2447 0, 0, &vtw_syscall, 0, node->sysctl_num, 2448 CTL_CREATE, CTL_EOL); 2449 } 2450 sysctl_root.sysctl_flags = flags; 2451 } 2452 } 2453 #else /* !VTW_DEBUG */ 2454 static void 2455 vtw_debug_init(void) 2456 { 2457 return; 2458 } 2459 #endif /* !VTW_DEBUG */ 2460