/*
 * Copyright (c) 2011 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Coyote Point Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
#include <sys/cdefs.h>

#include "opt_ddb.h"
#include "opt_inet.h"
#include "opt_ipsec.h"
#include "opt_inet_csum.h"
#include "opt_tcp_debug.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/kmem.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/errno.h>
#include <sys/syslog.h>
#include <sys/pool.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <net/if.h>
#include <net/route.h>
#include <net/if_types.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
#include <netinet/in_offload.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/in6_var.h>
#include <netinet/icmp6.h>
#include <netinet6/nd6.h>

#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_private.h>
#include <netinet/tcpip.h>

#include <machine/stdarg.h>
#include <netinet/tcp_vtw.h>

__KERNEL_RCSID(0, "$NetBSD: tcp_vtw.c,v 1.3 2011/05/11 15:08:59 drochner Exp $");

#define db_trace(__a, __b) do { } while (/*CONSTCOND*/0)

static void vtw_debug_init(void);

fatp_ctl_t fat_tcpv4;
fatp_ctl_t fat_tcpv6;
vtw_ctl_t vtw_tcpv4[VTW_NCLASS];
vtw_ctl_t vtw_tcpv6[VTW_NCLASS];
vtw_stats_t vtw_stats;

/* We provide state for the lookup_ports iterator.
 * As we are currently netlock-protected, there is one.
 * If we were finer-grained, we would have one per CPU.
 * I do not want to be in the business of alloc/free.
 * The best alternative would be to allocate on the caller's
 * stack, but that would require them to know the struct,
 * or at least the size.
 * See how she goes.
 */
struct tcp_ports_iterator {
    union {
        struct in_addr v4;
        struct in6_addr v6;
    } addr;
    u_int port;

    uint32_t wild : 1;

    vtw_ctl_t *ctl;
    fatp_t *fp;

    uint16_t slot_idx;
    uint16_t ctl_idx;
};

static struct tcp_ports_iterator tcp_ports_iterator_v4;
static struct tcp_ports_iterator tcp_ports_iterator_v6;

static int vtw_age(vtw_ctl_t *, struct timeval *);

/*!\brief allocate a fat pointer from a collection.
 */
static fatp_t *
fatp_alloc(fatp_ctl_t *fat)
{
    fatp_t *fp = 0;

    if (fat->nfree) {
        fp = fat->free;
        if (fp) {
            fat->free = fatp_next(fat, fp);
            --fat->nfree;
            ++fat->nalloc;
            fp->nxt = 0;

            KASSERT(!fp->inuse);
        }
    }

    return fp;
}

/*!\brief free a fat pointer.
 */
static void
fatp_free(fatp_ctl_t *fat, fatp_t *fp)
{
    if (fp) {
        KASSERT(!fp->inuse);
        KASSERT(!fp->nxt);

        fp->nxt = fatp_index(fat, fat->free);
        fat->free = fp;

        ++fat->nfree;
        --fat->nalloc;
    }
}

/*!\brief initialise a collection of fat pointers.
 *
 *\param n	total # fat pointers to allocate
 *\param m	# hash buckets (must be a power of 2)
 *
 * We allocate 2x as much, as we have two hashes: full and lport only.
 */
static void
fatp_init(fatp_ctl_t *fat, uint32_t n, uint32_t m)
{
    fatp_t *fp;

    KASSERT(n <= FATP_MAX / 2);

    fat->hash = kmem_alloc(2*m * sizeof (fatp_t *), KM_SLEEP);
    fat->base = kmem_alloc(2*n * sizeof (fatp_t), KM_SLEEP);

    if (!fat->base) {
        if (fat->hash)
            kmem_free(fat->hash, 2*m * sizeof (fatp_t *));

        bzero(fat, sizeof (*fat));
        return;
    }

    fat->port = &fat->hash[m];

    fat->mask = m - 1;		// m must be a power of 2
    fat->lim = fat->base + 2*n - 1;
    fat->nfree = 0;
    fat->nalloc = 2*n;

    bzero(fat->hash, 2*m * sizeof (fatp_t *));
    bzero(fat->base, 2*n * sizeof (fatp_t));

    /* Initialise the free list.
     */
    for (fp = fat->lim; fp >= fat->base; --fp) {
        fatp_free(fat, fp);
    }
}
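/* Illustrative sketch (not part of the original code): the layout that
 * results from fatp_init(fat, n, m).  fat->hash[0..m-1] heads the
 * full-tuple chains and fat->hash[m..2m-1], aliased as fat->port[],
 * heads the local-port-only chains; both are indexed with tag & (m-1).
 * All 2*n fatp_t cells start on the free list, threaded through
 * fp->nxt as 1-based indices (0 terminates), so allocation is a pop:
 *
 *	fatp_t *fp = fatp_alloc(fat);
 *	if (!fp)
 *		vtw_age(fat->vtw, 0);	// expire entries, then retry
 *
 * which is exactly the recovery fatp_vtw_inshash() below performs.
 */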
/*
 * The `xtra' is XORed into the tag stored.
 */
static uint32_t fatp_xtra[] = {
    0x11111111, 0x22222222, 0x33333333, 0x44444444,
    0x55555555, 0x66666666, 0x77777777, 0x88888888,
    0x12121212, 0x21212121, 0x34343434, 0x43434343,
    0x56565656, 0x65656565, 0x78787878, 0x87878787,
    0x11221122, 0x22112211, 0x33443344, 0x44334433,
    0x55665566, 0x66556655, 0x77887788, 0x88778877,
    0x11112222, 0x22221111, 0x33334444, 0x44443333,
    0x55556666, 0x66665555, 0x77778888, 0x88887777,
};

/*!\brief turn a {fatp_t*,slot} into an integral key.
 *
 * The key can be used to obtain the fatp_t, and the slot,
 * as it directly encodes them.
 */
static inline uint32_t
fatp_key(fatp_ctl_t *fat, fatp_t *fp, uint32_t slot)
{
    CTASSERT(CACHE_LINE_SIZE == 32 ||
        CACHE_LINE_SIZE == 64 ||
        CACHE_LINE_SIZE == 128);

    switch (fatp_ntags()) {
    case 7:
        return (fatp_index(fat, fp) << 3) | slot;
    case 15:
        return (fatp_index(fat, fp) << 4) | slot;
    case 31:
        return (fatp_index(fat, fp) << 5) | slot;
    default:
        KASSERT(0 && "no support, for no good reason");
        return ~0;
    }
}

static inline uint32_t
fatp_slot_from_key(fatp_ctl_t *fat, uint32_t key)
{
    CTASSERT(CACHE_LINE_SIZE == 32 ||
        CACHE_LINE_SIZE == 64 ||
        CACHE_LINE_SIZE == 128);

    switch (fatp_ntags()) {
    case 7:
        return key & 7;
    case 15:
        return key & 15;
    case 31:
        return key & 31;
    default:
        KASSERT(0 && "no support, for no good reason");
        return ~0;
    }
}

static inline fatp_t *
fatp_from_key(fatp_ctl_t *fat, uint32_t key)
{
    CTASSERT(CACHE_LINE_SIZE == 32 ||
        CACHE_LINE_SIZE == 64 ||
        CACHE_LINE_SIZE == 128);

    switch (fatp_ntags()) {
    case 7:
        key >>= 3;
        break;
    case 15:
        key >>= 4;
        break;
    case 31:
        key >>= 5;
        break;
    default:
        KASSERT(0 && "no support, for no good reason");
        return 0;
    }

    return key ? fat->base + key - 1 : 0;
}

static inline uint32_t
idx_encode(vtw_ctl_t *ctl, uint32_t idx)
{
    return (idx << ctl->idx_bits) | idx;
}

static inline uint32_t
idx_decode(vtw_ctl_t *ctl, uint32_t bits)
{
    uint32_t idx = bits & ctl->idx_mask;

    if (idx_encode(ctl, idx) == bits)
        return idx;
    else
        return ~0;
}
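/* Worked example (illustrative): with 64-byte cache lines,
 * fatp_ntags() == 7 and three slot bits.  For fp == fat->base + 41,
 * fatp_index() yields the 1-based index 42, so slot 5 encodes as
 *
 *	key = (42 << 3) | 5 == 0x155
 *	fatp_slot_from_key(fat, 0x155) == 5
 *	fatp_from_key(fat, 0x155) == fat->base + (0x155 >> 3) - 1
 *
 * The 1-based index lets key 0 mean "no entry".  idx_encode() plays a
 * similar trick for vtw indices: it stores idx twice, above and below
 * idx_bits, and idx_decode() accepts a value only if the two copies
 * agree, which cheaply rejects most stale or foreign bit patterns.
 */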
/*!\brief insert index into fatp hash
 *
 *\param idx - index of element being placed in hash chain
 *\param tag - 32-bit tag identifier
 *
 *\returns
 *	value which can be used to locate entry.
 *
 *\note
 *	we rely on the fact that there are unused high bits in the index
 *	for verification purposes on lookup.
 */
static inline uint32_t
fatp_vtw_inshash(fatp_ctl_t *fat, uint32_t idx, uint32_t tag, int which,
    void *dbg)
{
    fatp_t *fp;
    fatp_t **hash = (which ? fat->port : fat->hash);
    int i;

    fp = hash[tag & fat->mask];

    while (!fp || fatp_full(fp)) {
        fatp_t *fq;

        /* All entries are inuse at the top level.
         * We allocate a spare, and push the top level
         * down one. All entries in the fp we push down
         * (think of a tape worm here) will be expelled sooner than
         * any entries added subsequently to this hash bucket.
         * This is a property of the time waits we are exploiting.
         */

        fq = fatp_alloc(fat);
        if (!fq) {
            vtw_age(fat->vtw, 0);
            fp = hash[tag & fat->mask];
            continue;
        }

        fq->inuse = 0;
        fq->nxt = fatp_index(fat, fp);

        hash[tag & fat->mask] = fq;

        fp = fq;
    }

    KASSERT(!fatp_full(fp));

    /* Fill highest index first. Lookup is lowest first.
     */
    for (i = fatp_ntags(); --i >= 0; ) {
        if (!((1 << i) & fp->inuse)) {
            break;
        }
    }

    fp->inuse |= 1 << i;
    fp->tag[i] = tag ^ idx_encode(fat->vtw, idx) ^ fatp_xtra[i];

    db_trace(KTR_VTW
        , (fp, "fat: inuse %5.5x tag[%x] %8.8x"
        , fp->inuse
        , i, fp->tag[i]));

    return fatp_key(fat, fp, i);
}
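/* Worked example (illustrative): a slot does not store the index in
 * the clear, it stores
 *
 *	fp->tag[i] = tag ^ idx_encode(ctl, idx) ^ fatp_xtra[i];
 *
 * A lookup recomputes the tag from the segment and reverses the XOR:
 *
 *	idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
 *	vtw = vtw_from_index(ctl, idx);	// NULL unless idx self-verifies
 *
 * A true match recovers its own index; a mismatch yields noise that
 * idx_decode() almost always rejects, so most misses are resolved
 * without ever touching the vtw_t cache line.
 */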
static inline int
vtw_alive(const vtw_t *vtw)
{
    return vtw->hashed && vtw->expire.tv_sec;
}

static inline uint32_t
vtw_index_v4(vtw_ctl_t *ctl, vtw_v4_t *v4)
{
    if (ctl->base.v4 <= v4 && v4 <= ctl->lim.v4)
        return v4 - ctl->base.v4;

    KASSERT(0 && "vtw out of bounds");

    return ~0;
}

static inline uint32_t
vtw_index_v6(vtw_ctl_t *ctl, vtw_v6_t *v6)
{
    if (ctl->base.v6 <= v6 && v6 <= ctl->lim.v6)
        return v6 - ctl->base.v6;

    KASSERT(0 && "vtw out of bounds");

    return ~0;
}

static inline uint32_t
vtw_index(vtw_ctl_t *ctl, vtw_t *vtw)
{
    if (ctl->clidx)
        ctl = ctl->ctl;

    if (ctl->is_v4)
        return vtw_index_v4(ctl, (vtw_v4_t *)vtw);

    if (ctl->is_v6)
        return vtw_index_v6(ctl, (vtw_v6_t *)vtw);

    KASSERT(0 && "neither 4 nor 6. most curious.");

    return ~0;
}

static inline vtw_t *
vtw_from_index(vtw_ctl_t *ctl, uint32_t idx)
{
    if (ctl->clidx)
        ctl = ctl->ctl;

    /* See if the index looks like it might be an index.
     * Bits set outside the valid index bits are a giveaway.
     */
    idx = idx_decode(ctl, idx);

    if (idx == ~0) {
        return 0;
    } else if (ctl->is_v4) {
        vtw_v4_t *vtw = ctl->base.v4 + idx;

        return (ctl->base.v4 <= vtw && vtw <= ctl->lim.v4)
            ? &vtw->common : 0;
    } else if (ctl->is_v6) {
        vtw_v6_t *vtw = ctl->base.v6 + idx;

        return (ctl->base.v6 <= vtw && vtw <= ctl->lim.v6)
            ? &vtw->common : 0;
    } else {
        KASSERT(0 && "badness");
        return 0;
    }
}

/*!\brief return the next vtw after this one.
 *
 * Due to the differing sizes of the entries in differing
 * arenas, we have to ensure we ++ the correct pointer type.
 *
 * Also handles wrap.
 */
static inline vtw_t *
vtw_next(vtw_ctl_t *ctl, vtw_t *vtw)
{
    if (ctl->is_v4) {
        vtw_v4_t *v4 = (void*)vtw;

        vtw = &(++v4)->common;
    } else {
        vtw_v6_t *v6 = (void*)vtw;

        vtw = &(++v6)->common;
    }

    if (vtw > ctl->lim.v)
        vtw = ctl->base.v;

    return vtw;
}
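/* Note (illustrative): the arena is walked with pointer arithmetic, so
 * the step must use the concrete element type -- sizeof (vtw_v6_t)
 * differs from sizeof (vtw_v4_t), and ++ on a plain vtw_t * would
 * stride wrongly.  A full circular sweep, as vtw_sanity_check() below
 * does, is simply:
 *
 *	vtw_t *vtw = ctl->base.v;
 *	do {
 *		vtw = vtw_next(ctl, vtw);	// wraps at lim to base
 *	} while (vtw != ctl->base.v);
 */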
/*!\brief remove entry from FATP hash chains
 */
static inline void
vtw_unhash(vtw_ctl_t *ctl, vtw_t *vtw)
{
    fatp_ctl_t *fat = ctl->fat;
    fatp_t *fp;
    uint32_t key = vtw->key;
    uint32_t tag, slot, idx;
    vtw_v4_t *v4 = (void*)vtw;
    vtw_v6_t *v6 = (void*)vtw;

    if (!vtw->hashed) {
        KASSERT(0 && "unhashed");
        return;
    }

    if (fat->vtw->is_v4) {
        tag = v4_tag(v4->faddr, v4->fport, v4->laddr, v4->lport);
    } else if (fat->vtw->is_v6) {
        tag = v6_tag(&v6->faddr, v6->fport, &v6->laddr, v6->lport);
    } else {
        tag = 0;
        KASSERT(0 && "not reached");
    }

    /* Remove from fat->hash[]
     */
    slot = fatp_slot_from_key(fat, key);
    fp = fatp_from_key(fat, key);
    idx = vtw_index(ctl, vtw);

    db_trace(KTR_VTW
        , (fp, "fat: del inuse %5.5x slot %x idx %x key %x tag %x"
        , fp->inuse, slot, idx, key, tag));

    KASSERT(fp->inuse & (1 << slot));
    KASSERT(fp->tag[slot] == (tag ^ idx_encode(ctl, idx)
        ^ fatp_xtra[slot]));

    if ((fp->inuse & (1 << slot))
        && fp->tag[slot] == (tag ^ idx_encode(ctl, idx)
        ^ fatp_xtra[slot])) {
        fp->inuse ^= 1 << slot;
        fp->tag[slot] = 0;

        /* When we delete entries, we do not compact. This is
         * due to temporality. We add entries, and they
         * (eventually) expire. Older entries will be further
         * down the chain.
         */
        if (!fp->inuse) {
            uint32_t hi = tag & fat->mask;
            fatp_t *fq = 0;
            fatp_t *fr = fat->hash[hi];

            while (fr && fr != fp) {
                fr = fatp_next(fat, fq = fr);
            }

            if (fr == fp) {
                if (fq) {
                    fq->nxt = fp->nxt;
                    fp->nxt = 0;
                    fatp_free(fat, fp);
                } else {
                    KASSERT(fat->hash[hi] == fp);

                    if (fp->nxt) {
                        fat->hash[hi]
                            = fatp_next(fat, fp);
                        fp->nxt = 0;
                        fatp_free(fat, fp);
                    } else {
                        /* retain for next use.
                         */
                        ;
                    }
                }
            } else {
                fr = fat->hash[hi];

                do {
                    db_trace(KTR_VTW
                        , (fr
                        , "fat:*del inuse %5.5x"
                        " nxt %x"
                        , fr->inuse, fr->nxt));

                    fr = fatp_next(fat, fq = fr);
                } while (fr && fr != fp);

                KASSERT(0 && "oops");
            }
        }
        vtw->key ^= ~0;
    }

    if (fat->vtw->is_v4) {
        tag = v4_port_tag(v4->lport);
    } else if (fat->vtw->is_v6) {
        tag = v6_port_tag(v6->lport);
    }

    /* Remove from fat->port[]
     */
    key = vtw->port_key;
    slot = fatp_slot_from_key(fat, key);
    fp = fatp_from_key(fat, key);
    idx = vtw_index(ctl, vtw);

    db_trace(KTR_VTW
        , (fp, "fatport: del inuse %5.5x"
        " slot %x idx %x key %x tag %x"
        , fp->inuse, slot, idx, key, tag));

    KASSERT(fp->inuse & (1 << slot));
    KASSERT(fp->tag[slot] == (tag ^ idx_encode(ctl, idx)
        ^ fatp_xtra[slot]));

    if ((fp->inuse & (1 << slot))
        && fp->tag[slot] == (tag ^ idx_encode(ctl, idx)
        ^ fatp_xtra[slot])) {
        fp->inuse ^= 1 << slot;
        fp->tag[slot] = 0;

        if (!fp->inuse) {
            uint32_t hi = tag & fat->mask;
            fatp_t *fq = 0;
            fatp_t *fr = fat->port[hi];

            while (fr && fr != fp) {
                fr = fatp_next(fat, fq = fr);
            }

            if (fr == fp) {
                if (fq) {
                    fq->nxt = fp->nxt;
                    fp->nxt = 0;
                    fatp_free(fat, fp);
                } else {
                    KASSERT(fat->port[hi] == fp);

                    if (fp->nxt) {
                        fat->port[hi]
                            = fatp_next(fat, fp);
                        fp->nxt = 0;
                        fatp_free(fat, fp);
                    } else {
                        /* retain for next use.
                         */
                        ;
                    }
                }
            }
        }
        vtw->port_key ^= ~0;
    }

    vtw->hashed = 0;
}
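/* Note (illustrative): each entry lives in two chains, so both the
 * full-tuple key and the port key are retired above.  The closing
 * `key ^= ~0' / `port_key ^= ~0' stores deliberately corrupt the saved
 * keys: even if a stale probe XOR-recovers this entry's index later,
 * it can no longer pass the `key == fatp_key(...)' comparison the
 * lookup routines make, so a recycled slot never satisfies old state.
 */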
/*!\brief remove entry from hash, possibly free.
 */
void
vtw_del(vtw_ctl_t *ctl, vtw_t *vtw)
{
    KASSERT(mutex_owned(softnet_lock));

    if (vtw->hashed) {
        ++vtw_stats.del;
        vtw_unhash(ctl, vtw);
    }

    /* We only delete the oldest entry.
     */
    if (vtw != ctl->oldest.v)
        return;

    --ctl->nalloc;
    ++ctl->nfree;

    vtw->expire.tv_sec = 0;
    vtw->expire.tv_usec = ~0;

    if (!ctl->nalloc)
        ctl->oldest.v = 0;
    else
        ctl->oldest.v = vtw_next(ctl, vtw);
}

/*!\brief insert vestigial timewait in hash chain
 */
static void
vtw_inshash_v4(vtw_ctl_t *ctl, vtw_t *vtw)
{
    uint32_t idx = vtw_index(ctl, vtw);
    uint32_t tag;
    vtw_v4_t *v4 = (void*)vtw;

    KASSERT(mutex_owned(softnet_lock));
    KASSERT(!vtw->hashed);
    KASSERT(ctl->clidx == vtw->msl_class);

    ++vtw_stats.ins;

    tag = v4_tag(v4->faddr, v4->fport,
        v4->laddr, v4->lport);

    vtw->key = fatp_vtw_inshash(ctl->fat, idx, tag, 0, vtw);

    db_trace(KTR_VTW, (ctl
        , "vtw: ins %8.8x:%4.4x %8.8x:%4.4x"
        " tag %8.8x key %8.8x"
        , v4->faddr, v4->fport
        , v4->laddr, v4->lport
        , tag
        , vtw->key));

    tag = v4_port_tag(v4->lport);
    vtw->port_key = fatp_vtw_inshash(ctl->fat, idx, tag, 1, vtw);

    db_trace(KTR_VTW, (ctl, "vtw: ins %P - %4.4x tag %8.8x key %8.8x"
        , v4->lport, v4->lport
        , tag
        , vtw->key));

    vtw->hashed = 1;
}

/*!\brief insert vestigial timewait in hash chain
 */
static void
vtw_inshash_v6(vtw_ctl_t *ctl, vtw_t *vtw)
{
    uint32_t idx = vtw_index(ctl, vtw);
    uint32_t tag;
    vtw_v6_t *v6 = (void*)vtw;

    KASSERT(mutex_owned(softnet_lock));
    KASSERT(!vtw->hashed);
    KASSERT(ctl->clidx == vtw->msl_class);

    ++vtw_stats.ins;

    tag = v6_tag(&v6->faddr, v6->fport,
        &v6->laddr, v6->lport);

    vtw->key = fatp_vtw_inshash(ctl->fat, idx, tag, 0, vtw);

    tag = v6_port_tag(v6->lport);
    vtw->port_key = fatp_vtw_inshash(ctl->fat, idx, tag, 1, vtw);

    db_trace(KTR_VTW, (ctl, "vtw: ins %P - %4.4x tag %8.8x key %8.8x"
        , v6->lport, v6->lport
        , tag
        , vtw->key));

    vtw->hashed = 1;
}

static vtw_t *
vtw_lookup_hash_v4(vtw_ctl_t *ctl, uint32_t faddr, uint16_t fport
    , uint32_t laddr, uint16_t lport
    , int which)
{
    vtw_v4_t *v4;
    vtw_t *vtw;
    uint32_t tag;
    fatp_t *fp;
    int i;
    uint32_t fatps = 0, probes = 0, losings = 0;

    if (!ctl || !ctl->fat)
        return 0;

    ++vtw_stats.look[which];

    if (which) {
        tag = v4_port_tag(lport);
        fp = ctl->fat->port[tag & ctl->fat->mask];
    } else {
        tag = v4_tag(faddr, fport, laddr, lport);
        fp = ctl->fat->hash[tag & ctl->fat->mask];
    }

    while (fp && fp->inuse) {
        uint32_t inuse = fp->inuse;

        ++fatps;

        for (i = 0; inuse && i < fatp_ntags(); ++i) {
            uint32_t idx;

            if (!(inuse & (1 << i)))
                continue;

            inuse ^= 1 << i;

            ++probes;
            ++vtw_stats.probe[which];

            idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
            vtw = vtw_from_index(ctl, idx);

            if (!vtw) {
                /* Hopefully fast path.
                 */
                db_trace(KTR_VTW
                    , (fp, "vtw: fast %A:%P %A:%P"
                    " idx %x tag %x"
                    , faddr, fport
                    , laddr, lport
                    , idx, tag));
                continue;
            }

            v4 = (void*)vtw;

            /* The de-referencing of vtw is what we want to avoid.
             * Losing.
             */
            if (vtw_alive(vtw)
                && ((which ? vtw->port_key : vtw->key)
                    == fatp_key(ctl->fat, fp, i))
                && (which
                    || (v4->faddr == faddr && v4->laddr == laddr
                        && v4->fport == fport))
                && v4->lport == lport) {
                ++vtw_stats.hit[which];

                db_trace(KTR_VTW
                    , (fp, "vtw: hit %8.8x:%4.4x"
                    " %8.8x:%4.4x idx %x key %x"
                    , faddr, fport
                    , laddr, lport
                    , idx_decode(ctl, idx), vtw->key));

                KASSERT(vtw->hashed);

                goto out;
            }
            ++vtw_stats.losing[which];
            ++losings;

            if (vtw_alive(vtw)) {
                db_trace(KTR_VTW
                    , (fp, "vtw:!mis %8.8x:%4.4x"
                    " %8.8x:%4.4x key %x tag %x"
                    , faddr, fport
                    , laddr, lport
                    , fatp_key(ctl->fat, fp, i)
                    , v4_tag(faddr, fport
                        , laddr, lport)));
                db_trace(KTR_VTW
                    , (vtw, "vtw:!mis %8.8x:%4.4x"
                    " %8.8x:%4.4x key %x tag %x"
                    , v4->faddr, v4->fport
                    , v4->laddr, v4->lport
                    , vtw->key
                    , v4_tag(v4->faddr, v4->fport
                        , v4->laddr, v4->lport)));

                if (vtw->key == fatp_key(ctl->fat, fp, i)) {
                    db_trace(KTR_VTW
                        , (vtw, "vtw:!mis %8.8x:%4.4x"
                        " %8.8x:%4.4x key %x"
                        " which %x"
                        , v4->faddr, v4->fport
                        , v4->laddr, v4->lport
                        , vtw->key
                        , which));
                } else {
                    db_trace(KTR_VTW
                        , (vtw
                        , "vtw:!mis"
                        " key %8.8x != %8.8x"
                        " idx %x i %x which %x"
                        , vtw->key
                        , fatp_key(ctl->fat, fp, i)
                        , idx_decode(ctl, idx)
                        , i
                        , which));
                }
            } else {
                db_trace(KTR_VTW
                    , (fp
                    , "vtw:!mis free entry"
                    " idx %x vtw %p which %x"
                    , idx_decode(ctl, idx)
                    , vtw, which));
            }
        }

        if (fp->nxt) {
            fp = fatp_next(ctl->fat, fp);
        } else {
            break;
        }
    }
    ++vtw_stats.miss[which];
    vtw = 0;
out:
    if (fatps > vtw_stats.max_chain[which])
        vtw_stats.max_chain[which] = fatps;
    if (probes > vtw_stats.max_probe[which])
        vtw_stats.max_probe[which] = probes;
    if (losings > vtw_stats.max_loss[which])
        vtw_stats.max_loss[which] = losings;

    return vtw;
}
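/* Note (illustrative): `which' selects the hash being probed: 0 is the
 * full four-tuple hash, where faddr/fport/laddr/lport must all match,
 * and 1 is the local-port-only hash, where only lport is compared.
 * The "losing" statistics count probes that had to dereference a
 * vtw_t merely to reject it -- the cache miss the tag scheme tries to
 * avoid.
 */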
static vtw_t *
vtw_lookup_hash_v6(vtw_ctl_t *ctl, const struct in6_addr *faddr, uint16_t fport
    , const struct in6_addr *laddr, uint16_t lport
    , int which)
{
    vtw_v6_t *v6;
    vtw_t *vtw;
    uint32_t tag;
    fatp_t *fp;
    int i;
    uint32_t fatps = 0, probes = 0, losings = 0;

    ++vtw_stats.look[which];

    if (!ctl || !ctl->fat)
        return 0;

    if (which) {
        tag = v6_port_tag(lport);
        fp = ctl->fat->port[tag & ctl->fat->mask];
    } else {
        tag = v6_tag(faddr, fport, laddr, lport);
        fp = ctl->fat->hash[tag & ctl->fat->mask];
    }

    while (fp && fp->inuse) {
        uint32_t inuse = fp->inuse;

        ++fatps;

        for (i = 0; inuse && i < fatp_ntags(); ++i) {
            uint32_t idx;

            if (!(inuse & (1 << i)))
                continue;

            inuse ^= 1 << i;

            ++probes;
            ++vtw_stats.probe[which];

            idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
            vtw = vtw_from_index(ctl, idx);

            db_trace(KTR_VTW
                , (fp, "probe: %2d %6A:%4.4x %6A:%4.4x idx %x"
                , i
                , db_store(faddr, sizeof (*faddr)), fport
                , db_store(laddr, sizeof (*laddr)), lport
                , idx_decode(ctl, idx)));

            if (!vtw) {
                /* Hopefully fast path.
                 */
                continue;
            }

            v6 = (void*)vtw;

            if (vtw_alive(vtw)
                && ((which ? vtw->port_key : vtw->key)
                    == fatp_key(ctl->fat, fp, i))
                && v6->lport == lport
                && (which
                    || (v6->fport == fport
                        && !bcmp(&v6->faddr, faddr, sizeof (*faddr))
                        && !bcmp(&v6->laddr, laddr
                            , sizeof (*laddr))))) {
                ++vtw_stats.hit[which];

                KASSERT(vtw->hashed);
                goto out;
            } else {
                ++vtw_stats.losing[which];
                ++losings;
            }
        }

        if (fp->nxt) {
            fp = fatp_next(ctl->fat, fp);
        } else {
            break;
        }
    }
    ++vtw_stats.miss[which];
    vtw = 0;
out:
    if (fatps > vtw_stats.max_chain[which])
        vtw_stats.max_chain[which] = fatps;
    if (probes > vtw_stats.max_probe[which])
        vtw_stats.max_probe[which] = probes;
    if (losings > vtw_stats.max_loss[which])
        vtw_stats.max_loss[which] = losings;

    return vtw;
}

/*!\brief port iterator
 */
static vtw_t *
vtw_next_port_v4(struct tcp_ports_iterator *it)
{
    vtw_ctl_t *ctl = it->ctl;
    vtw_v4_t *v4;
    vtw_t *vtw;
    uint32_t tag;
    uint16_t lport = it->port;
    fatp_t *fp;
    int i;
    uint32_t fatps = 0, probes = 0, losings = 0;

    tag = v4_port_tag(lport);
    if (!it->fp) {
        it->fp = ctl->fat->port[tag & ctl->fat->mask];
        it->slot_idx = 0;
    }
    fp = it->fp;

    while (fp) {
        uint32_t inuse = fp->inuse;

        ++fatps;

        for (i = it->slot_idx; inuse && i < fatp_ntags(); ++i) {
            uint32_t idx;

            if (!(inuse & (1 << i)))
                continue;

            inuse &= ~0 << i;

            if (i < it->slot_idx)
                continue;

            ++vtw_stats.probe[1];
            ++probes;

            idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
            vtw = vtw_from_index(ctl, idx);

            if (!vtw) {
                /* Hopefully fast path.
                 */
                continue;
            }

            v4 = (void*)vtw;

            if (vtw_alive(vtw)
                && vtw->port_key == fatp_key(ctl->fat, fp, i)
                && v4->lport == lport) {
                ++vtw_stats.hit[1];

                it->slot_idx = i + 1;

                goto out;
            } else if (vtw_alive(vtw)) {
                ++vtw_stats.losing[1];
                ++losings;

                db_trace(KTR_VTW
                    , (vtw, "vtw:!mis"
                    " port %8.8x:%4.4x %8.8x:%4.4x"
                    " key %x port %x"
                    , v4->faddr, v4->fport
                    , v4->laddr, v4->lport
                    , vtw->key
                    , lport));
            } else {
                /* Really losing here. We are coming
                 * up with references to free entries.
                 * Might find it better to use
                 * traditional, or need another
                 * ad-hockery. The other ad-hockery
                 * would be to pull more into the
                 * cache line to reject the false
                 * hits.
                 */
                ++vtw_stats.losing[1];
                ++losings;
                db_trace(KTR_VTW
                    , (fp, "vtw:!mis port %x"
                    " - free entry idx %x vtw %p"
                    , lport
                    , idx_decode(ctl, idx)
                    , vtw));
            }
        }

        if (fp->nxt) {
            it->fp = fp = fatp_next(ctl->fat, fp);
            it->slot_idx = 0;
        } else {
            it->fp = 0;
            break;
        }
    }
    ++vtw_stats.miss[1];

    vtw = 0;
out:
    if (fatps > vtw_stats.max_chain[1])
        vtw_stats.max_chain[1] = fatps;
    if (probes > vtw_stats.max_probe[1])
        vtw_stats.max_probe[1] = probes;
    if (losings > vtw_stats.max_loss[1])
        vtw_stats.max_loss[1] = losings;

    return vtw;
}

/*!\brief port iterator
 */
static vtw_t *
vtw_next_port_v6(struct tcp_ports_iterator *it)
{
    vtw_ctl_t *ctl = it->ctl;
    vtw_v6_t *v6;
    vtw_t *vtw;
    uint32_t tag;
    uint16_t lport = it->port;
    fatp_t *fp;
    int i;
    uint32_t fatps = 0, probes = 0, losings = 0;

    tag = v6_port_tag(lport);
    if (!it->fp) {
        it->fp = ctl->fat->port[tag & ctl->fat->mask];
        it->slot_idx = 0;
    }
    fp = it->fp;

    while (fp) {
        uint32_t inuse = fp->inuse;

        ++fatps;

        for (i = it->slot_idx; inuse && i < fatp_ntags(); ++i) {
            uint32_t idx;

            if (!(inuse & (1 << i)))
                continue;

            inuse &= ~0 << i;

            if (i < it->slot_idx)
                continue;

            ++vtw_stats.probe[1];
            ++probes;

            idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
            vtw = vtw_from_index(ctl, idx);

            if (!vtw) {
                /* Hopefully fast path.
                 */
                continue;
            }

            v6 = (void*)vtw;

            db_trace(KTR_VTW
                , (vtw, "vtw: i %x idx %x fp->tag %x"
                " tag %x xtra %x"
                , i, idx_decode(ctl, idx)
                , fp->tag[i], tag, fatp_xtra[i]));

            if (vtw_alive(vtw)
                && vtw->port_key == fatp_key(ctl->fat, fp, i)
                && v6->lport == lport) {
                ++vtw_stats.hit[1];

                db_trace(KTR_VTW
                    , (fp, "vtw: nxt port %P - %4.4x"
                    " idx %x key %x"
                    , lport, lport
                    , idx_decode(ctl, idx), vtw->key));

                it->slot_idx = i + 1;
                goto out;
            } else if (vtw_alive(vtw)) {
                ++vtw_stats.losing[1];

                db_trace(KTR_VTW
                    , (vtw, "vtw:!mis port %6A:%4.4x"
                    " %6A:%4.4x key %x port %x"
                    , db_store(&v6->faddr
                        , sizeof (v6->faddr))
                    , v6->fport
                    , db_store(&v6->laddr
                        , sizeof (v6->laddr))
                    , v6->lport
                    , vtw->key
                    , lport));
            } else {
                /* Really losing here. We are coming
                 * up with references to free entries.
                 * Might find it better to use
                 * traditional, or need another
                 * ad-hockery. The other ad-hockery
                 * would be to pull more into the
                 * cache line to reject the false
                 * hits.
                 */
                ++vtw_stats.losing[1];
                ++losings;

                db_trace(KTR_VTW
                    , (fp
                    , "vtw:!mis port %x"
                    " - free entry idx %x vtw %p"
                    , lport, idx_decode(ctl, idx)
                    , vtw));
            }
        }

        if (fp->nxt) {
            it->fp = fp = fatp_next(ctl->fat, fp);
            it->slot_idx = 0;
        } else {
            it->fp = 0;
            break;
        }
    }
    ++vtw_stats.miss[1];

    vtw = 0;
out:
    if (fatps > vtw_stats.max_chain[1])
        vtw_stats.max_chain[1] = fatps;
    if (probes > vtw_stats.max_probe[1])
        vtw_stats.max_probe[1] = probes;
    if (losings > vtw_stats.max_loss[1])
        vtw_stats.max_loss[1] = losings;

    return vtw;
}
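/* Usage sketch (illustrative; it mirrors the VTW_DEBUG self-checks in
 * vtw_add() below): the iterators are driven through the
 * tcp_init_ports_*() / tcp_next_port_*() pairs, resuming from
 * it->fp and it->slot_idx on each call:
 *
 *	struct tcp_ports_iterator *it;
 *	struct vestigial_inpcb res;
 *
 *	it = tcp_init_ports_v4(laddr, lport, 0);
 *	while (tcp_next_port_v4(it, &res)) {
 *		// each res is one vestigial TIME_WAIT on lport
 *	}
 */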
/*!\brief initialise the VTW allocation arena
 *
 * There are 1+3 allocation classes:
 *	0	classless
 *	{1,2,3}	MSL-class based allocation
 *
 * The allocation arenas are all initialised. Classless gets all the
 * space. MSL-class based divides the arena, so that allocation
 * within a class can proceed without having to consider entries
 * (aka: cache lines) from different classes.
 *
 * Usually, we are completely classless or class-based, but there can be
 * transition periods, corresponding to dynamic adjustments in the config
 * by the operator.
 */
static void
vtw_init(fatp_ctl_t *fat, vtw_ctl_t *ctl, uint32_t n)
{
    int i;
    int sz = (ctl->is_v4 ? sizeof (vtw_v4_t) : sizeof (vtw_v6_t));

    ctl->base.v4 = kmem_alloc(n * sz, KM_SLEEP);
    if (ctl->base.v4) {
        vtw_t *base;
        int class_n;

        bzero(ctl->base.v4, n * sz);

        if (ctl->is_v4) {
            ctl->lim.v4 = ctl->base.v4 + n - 1;
            ctl->alloc.v4 = ctl->base.v4;
        } else {
            ctl->lim.v6 = ctl->base.v6 + n - 1;
            ctl->alloc.v6 = ctl->base.v6;
        }

        ctl->nfree = n;
        ctl->ctl = ctl;

        ctl->idx_bits = 32;
        for (ctl->idx_mask = ~0; (ctl->idx_mask & (n-1)) == n-1; ) {
            ctl->idx_mask >>= 1;
            ctl->idx_bits -= 1;
        }

        ctl->idx_mask <<= 1;
        ctl->idx_mask |= 1;
        ctl->idx_bits += 1;

        ctl->fat = fat;
        fat->vtw = ctl;

        /* Divide the resources equally amongst the classes.
         * This is not optimal, as the different classes
         * arrive and leave at different rates, but it is
         * the best I can do for now.
         */
        class_n = n / (VTW_NCLASS-1);
        base = ctl->base.v;

        for (i = 1; i < VTW_NCLASS; ++i) {
            int j;

            ctl[i] = ctl[0];
            ctl[i].clidx = i;

            ctl[i].base.v = base;
            ctl[i].alloc = ctl[i].base;

            for (j = 0; j < class_n - 1; ++j) {
                if (tcp_msl_enable)
                    base->msl_class = i;
                base = vtw_next(ctl, base);
            }

            ctl[i].lim.v = base;
            base = vtw_next(ctl, base);
            ctl[i].nfree = class_n;
        }
    }

    vtw_debug_init();
}

/*!\brief map class to TCP MSL
 */
static inline uint32_t
class_to_msl(int class)
{
    switch (class) {
    case 0:
    case 1:
        return tcp_msl_remote ? tcp_msl_remote : (TCPTV_MSL >> 0);
    case 2:
        return tcp_msl_local ? tcp_msl_local : (TCPTV_MSL >> 1);
    default:
        return tcp_msl_loop ? tcp_msl_loop : (TCPTV_MSL >> 2);
    }
}

/*!\brief map TCP MSL to class
 */
static inline uint32_t
msl_to_class(int msl)
{
    if (tcp_msl_enable) {
        if (msl <= (tcp_msl_loop ? tcp_msl_loop : (TCPTV_MSL >> 2)))
            return 1+2;
        if (msl <= (tcp_msl_local ? tcp_msl_local : (TCPTV_MSL >> 1)))
            return 1+1;
        return 1;
    }
    return 0;
}
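/* Worked example (illustrative, stock TCPTV_MSL, no tcp_msl_*
 * overrides): class_to_msl() gives classes {0,1} the full MSL for
 * remote peers, class 2 half of it for local peers, and class 3 a
 * quarter for loopback.  msl_to_class() inverts that by picking the
 * smallest interval the msl fits:
 *
 *	msl <= TCPTV_MSL >> 2	-> class 3 (loop)
 *	msl <= TCPTV_MSL >> 1	-> class 2 (local)
 *	otherwise		-> class 1 (remote)
 *
 * and everything collapses to class 0 while tcp_msl_enable is off.
 */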
/*!\brief allocate a vtw entry
 */
static inline vtw_t *
vtw_alloc(vtw_ctl_t *ctl)
{
    vtw_t *vtw = 0;
    int stuck = 0;
    int avail = ctl ? (ctl->nalloc + ctl->nfree) : 0;
    int msl;

    KASSERT(mutex_owned(softnet_lock));

    /* If no resources, we will not get far.
     */
    if (!ctl || !ctl->base.v4 || avail <= 0)
        return 0;

    /* Obtain a free one.
     */
    while (!ctl->nfree) {
        vtw_age(ctl, 0);

        if (++stuck > avail) {
            /* When in transition between
             * schemes (classless, classed) we
             * can be stuck having to await the
             * expiration of cross-allocated entries.
             *
             * Returning zero means we will fall back to the
             * traditional TIME_WAIT handling, except in the
             * case of a re-sched, in which case we cannot
             * perform the re-sched, but will retain the extant
             * entry.
             */
            db_trace(KTR_VTW
                , (ctl, "vtw:!none free in class %x %x/%x"
                , ctl->clidx
                , ctl->nalloc, ctl->nfree));

            return 0;
        }
    }

    vtw = ctl->alloc.v;

    if (vtw->msl_class != ctl->clidx) {
        /* Usurping rules:
         *	0 -> {1,2,3} or {1,2,3} -> 0
         */
        KASSERT(!vtw->msl_class || !ctl->clidx);

        if (vtw->hashed || vtw->expire.tv_sec) {
            /* As this is owned by some other class,
             * we must wait for it to expire.
             * This will only happen on class/classless
             * transitions, which are guaranteed to progress
             * to completion in small finite time, barring bugs.
             */
            db_trace(KTR_VTW
                , (ctl, "vtw:!%p class %x!=%x %x:%x%s"
                , vtw, vtw->msl_class, ctl->clidx
                , vtw->expire.tv_sec
                , vtw->expire.tv_usec
                , vtw->hashed ? " hashed" : ""));

            return 0;
        }

        db_trace(KTR_VTW
            , (ctl, "vtw:!%p usurped from %x to %x"
            , vtw, vtw->msl_class, ctl->clidx));

        vtw->msl_class = ctl->clidx;
    }

    if (vtw_alive(vtw)) {
        KASSERT(0 && "next free not free");
        return 0;
    }

    /* Advance allocation pointer.
     */
    ctl->alloc.v = vtw_next(ctl, vtw);

    --ctl->nfree;
    ++ctl->nalloc;

    msl = (2 * class_to_msl(ctl->clidx) * 1000) / PR_SLOWHZ;	// msec

    /* mark expiration
     */
    getmicrouptime(&vtw->expire);

    /* Move expiration into the future.
     */
    vtw->expire.tv_sec += msl / 1000;
    vtw->expire.tv_usec += 1000 * (msl % 1000);

    while (vtw->expire.tv_usec >= 1000*1000) {
        vtw->expire.tv_usec -= 1000*1000;
        vtw->expire.tv_sec += 1;
    }

    if (!ctl->oldest.v)
        ctl->oldest.v = vtw;

    return vtw;
}
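/* Worked example (illustrative): the lifetime armed above is 2*MSL.
 * class_to_msl() returns slow-timer ticks, so with PR_SLOWHZ == 2 and
 * an effective MSL of TCPTV_MSL (60 ticks, i.e. 30 seconds):
 *
 *	msl = (2 * 60 * 1000) / PR_SLOWHZ == 60000	// msec
 *
 * and vtw->expire lands 60 seconds past getmicrouptime() -- the
 * classical 2*MSL TIME_WAIT hold.  Class 2 and class 3 entries expire
 * after 30 and 15 seconds respectively.
 */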
/*!\brief expiration
 */
static int
vtw_age(vtw_ctl_t *ctl, struct timeval *_when)
{
    vtw_t *vtw;
    struct timeval then, *when = _when;
    int maxtries = 0;

    if (!ctl->oldest.v) {
        KASSERT(!ctl->nalloc);
        return 0;
    }

    for (vtw = ctl->oldest.v; vtw && ctl->nalloc; ) {
        if (++maxtries > ctl->nalloc)
            break;

        if (vtw->msl_class != ctl->clidx) {
            db_trace(KTR_VTW
                , (vtw, "vtw:!age class mismatch %x != %x"
                , vtw->msl_class, ctl->clidx));
            /* XXXX
             * See if the appropriate action is to skip to the next.
             * XXXX
             */
            ctl->oldest.v = vtw = vtw_next(ctl, vtw);
            continue;
        }
        if (!when) {
            /* Latch oldest timeval if none specified.
             */
            then = vtw->expire;
            when = &then;
        }

        if (!timercmp(&vtw->expire, when, <=))
            break;

        db_trace(KTR_VTW
            , (vtw, "vtw: expire %x %8.8x:%8.8x %x/%x"
            , ctl->clidx
            , vtw->expire.tv_sec
            , vtw->expire.tv_usec
            , ctl->nalloc
            , ctl->nfree));

        if (!_when)
            ++vtw_stats.kill;

        vtw_del(ctl, vtw);
        vtw = ctl->oldest.v;
    }

    return ctl->nalloc;		// # remaining allocated
}

static callout_t vtw_cs;

/*!\brief notice the passage of time.
 * It seems to be getting faster. What happened to the year?
 */
static void
vtw_tick(void *arg)
{
    struct timeval now;
    int i, cnt = 0;

    getmicrouptime(&now);

    db_trace(KTR_VTW, (arg, "vtk: tick - now %8.8x:%8.8x"
        , now.tv_sec, now.tv_usec));

    mutex_enter(softnet_lock);

    for (i = 0; i < VTW_NCLASS; ++i) {
        cnt += vtw_age(&vtw_tcpv4[i], &now);
        cnt += vtw_age(&vtw_tcpv6[i], &now);
    }

    /* Keep ticks coming while we need them.
     */
    if (cnt)
        callout_schedule(&vtw_cs, hz / 5);
    else {
        tcp_vtw_was_enabled = 0;
        tcbtable.vestige = 0;
    }
    mutex_exit(softnet_lock);
}

/* in_pcblookup_ports assist for handling vestigial entries.
 */
static void *
tcp_init_ports_v4(struct in_addr addr, u_int port, int wild)
{
    struct tcp_ports_iterator *it = &tcp_ports_iterator_v4;

    bzero(it, sizeof (*it));

    /* Note: the reference to vtw_tcpv4[0] is fine.
     * We do not need per-class iteration. We just
     * need to get to the fat, and there is one
     * shared fat.
     */
    if (vtw_tcpv4[0].fat) {
        it->addr.v4 = addr;
        it->port = port;
        it->wild = !!wild;
        it->ctl = &vtw_tcpv4[0];

        ++vtw_stats.look[1];
    }

    return it;
}

/*!\brief export an IPv4 vtw.
 */
static int
vtw_export_v4(vtw_ctl_t *ctl, vtw_t *vtw, vestigial_inpcb_t *res)
{
    vtw_v4_t *v4 = (void*)vtw;

    bzero(res, sizeof (*res));

    if (ctl && vtw) {
        if (!ctl->clidx && vtw->msl_class)
            ctl += vtw->msl_class;
        else
            KASSERT(ctl->clidx == vtw->msl_class);

        res->valid = 1;
        res->v4 = 1;

        res->faddr.v4.s_addr = v4->faddr;
        res->laddr.v4.s_addr = v4->laddr;
        res->fport = v4->fport;
        res->lport = v4->lport;
        res->vtw = vtw;		// netlock held over call(s)
        res->ctl = ctl;
        res->reuse_addr = vtw->reuse_addr;
        res->reuse_port = vtw->reuse_port;
        res->snd_nxt = vtw->snd_nxt;
        res->rcv_nxt = vtw->rcv_nxt;
        res->rcv_wnd = vtw->rcv_wnd;
        res->uid = vtw->uid;
    }

    return res->valid;
}
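/* Note (illustrative): res->vtw and res->ctl point straight into the
 * arena, not at a copy -- hence the `netlock held over call(s)'
 * remark above.  They remain valid only while softnet_lock is held,
 * so consumers such as vtw_restart() must finish with the vestigial
 * entry before the lock is dropped.
 */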
/*!\brief return next port in the port iterator. yowza.
 */
static int
tcp_next_port_v4(void *arg, struct vestigial_inpcb *res)
{
    struct tcp_ports_iterator *it = arg;
    vtw_t *vtw = 0;

    if (it->ctl)
        vtw = vtw_next_port_v4(it);

    if (!vtw)
        it->ctl = 0;

    return vtw_export_v4(it->ctl, vtw, res);
}

static int
tcp_lookup_v4(struct in_addr faddr, uint16_t fport,
    struct in_addr laddr, uint16_t lport,
    struct vestigial_inpcb *res)
{
    vtw_t *vtw;
    vtw_ctl_t *ctl;

    db_trace(KTR_VTW
        , (res, "vtw: lookup %A:%P %A:%P"
        , faddr, fport
        , laddr, lport));

    vtw = vtw_lookup_hash_v4((ctl = &vtw_tcpv4[0])
        , faddr.s_addr, fport
        , laddr.s_addr, lport, 0);

    return vtw_export_v4(ctl, vtw, res);
}

/* in_pcblookup_ports assist for handling vestigial entries.
 */
static void *
tcp_init_ports_v6(const struct in6_addr *addr, u_int port, int wild)
{
    struct tcp_ports_iterator *it = &tcp_ports_iterator_v6;

    bzero(it, sizeof (*it));

    /* Note: the reference to vtw_tcpv6[0] is fine.
     * We do not need per-class iteration. We just
     * need to get to the fat, and there is one
     * shared fat.
     */
    if (vtw_tcpv6[0].fat) {
        it->addr.v6 = *addr;
        it->port = port;
        it->wild = !!wild;
        it->ctl = &vtw_tcpv6[0];

        ++vtw_stats.look[1];
    }

    return it;
}

/*!\brief export an IPv6 vtw.
 */
static int
vtw_export_v6(vtw_ctl_t *ctl, vtw_t *vtw, vestigial_inpcb_t *res)
{
    vtw_v6_t *v6 = (void*)vtw;

    bzero(res, sizeof (*res));

    if (ctl && vtw) {
        if (!ctl->clidx && vtw->msl_class)
            ctl += vtw->msl_class;
        else
            KASSERT(ctl->clidx == vtw->msl_class);

        res->valid = 1;
        res->v4 = 0;

        res->faddr.v6 = v6->faddr;
        res->laddr.v6 = v6->laddr;
        res->fport = v6->fport;
        res->lport = v6->lport;
        res->vtw = vtw;		// netlock held over call(s)
        res->ctl = ctl;

        res->v6only = vtw->v6only;
        res->reuse_addr = vtw->reuse_addr;
        res->reuse_port = vtw->reuse_port;

        res->snd_nxt = vtw->snd_nxt;
        res->rcv_nxt = vtw->rcv_nxt;
        res->rcv_wnd = vtw->rcv_wnd;
        res->uid = vtw->uid;
    }

    return res->valid;
}

static int
tcp_next_port_v6(void *arg, struct vestigial_inpcb *res)
{
    struct tcp_ports_iterator *it = arg;
    vtw_t *vtw = 0;

    if (it->ctl)
        vtw = vtw_next_port_v6(it);

    if (!vtw)
        it->ctl = 0;

    return vtw_export_v6(it->ctl, vtw, res);
}

static int
tcp_lookup_v6(const struct in6_addr *faddr, uint16_t fport,
    const struct in6_addr *laddr, uint16_t lport,
    struct vestigial_inpcb *res)
{
    vtw_ctl_t *ctl;
    vtw_t *vtw;

    db_trace(KTR_VTW
        , (res, "vtw: lookup %6A:%P %6A:%P"
        , db_store(faddr, sizeof (*faddr)), fport
        , db_store(laddr, sizeof (*laddr)), lport));

    vtw = vtw_lookup_hash_v6((ctl = &vtw_tcpv6[0])
        , faddr, fport
        , laddr, lport, 0);

    return vtw_export_v6(ctl, vtw, res);
}

static vestigial_hooks_t tcp_hooks = {
    .init_ports4 = tcp_init_ports_v4,
    .next_port4 = tcp_next_port_v4,
    .lookup4 = tcp_lookup_v4,
    .init_ports6 = tcp_init_ports_v6,
    .next_port6 = tcp_next_port_v6,
    .lookup6 = tcp_lookup_v6,
};

static bool
vtw_select(int af, fatp_ctl_t **fatp, vtw_ctl_t **ctlp)
{
    fatp_ctl_t *fat;
    vtw_ctl_t *ctl;

    switch (af) {
    case AF_INET:
        fat = &fat_tcpv4;
        ctl = &vtw_tcpv4[0];
        break;
    case AF_INET6:
        fat = &fat_tcpv6;
        ctl = &vtw_tcpv6[0];
        break;
    default:
        return false;
    }
    if (fatp != NULL)
        *fatp = fat;
    if (ctlp != NULL)
        *ctlp = ctl;
    return true;
}

/*!\brief initialize controlling instance
 */
static int
vtw_control_init(int af)
{
    fatp_ctl_t *fat;
    vtw_ctl_t *ctl;

    if (!vtw_select(af, &fat, &ctl))
        return EAFNOSUPPORT;

    if (!fat->base) {
        uint32_t n, m;

        KASSERT(powerof2(tcp_vtw_entries));

        /* Allocate 10% more capacity in the fat pointers.
         * We should only need ~#hash additional based on
         * how they age, but TIME_WAIT assassination could cause
         * sparse fat pointer utilisation.
         */
        m = 512;
        n = 2*m + (11 * (tcp_vtw_entries / fatp_ntags())) / 10;

        fatp_init(fat, n, m);

        if (!fat->base)
            return ENOMEM;
    }

    if (!ctl->base.v) {
        vtw_init(fat, ctl, tcp_vtw_entries);
        if (!ctl->base.v)
            return ENOMEM;
    }

    return 0;
}

/*!\brief select controlling instance
 */
static vtw_ctl_t *
vtw_control(int af, uint32_t msl)
{
    fatp_ctl_t *fat;
    vtw_ctl_t *ctl;
    int class = msl_to_class(msl);

    if (!vtw_select(af, &fat, &ctl))
        return NULL;

    if (!fat->base || !ctl->base.v)
        return NULL;

    return ctl + class;
}
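/* Worked example (illustrative): the sizing in vtw_control_init()
 * above, with m == 512 buckets, 64-byte cache lines (fatp_ntags() ==
 * 7), and, say, tcp_vtw_entries == 8192:
 *
 *	n = 2*512 + (11 * (8192 / 7)) / 10 == 1024 + 1287 == 2311
 *
 * i.e. one spare chain head per bucket of both hashes, plus enough
 * fat pointers to tag every entry, plus the 10% slack reserved
 * against sparse utilisation after TIME_WAIT assassination.
 */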
/*!\brief add TCP pcb to vestigial timewait
 */
int
vtw_add(int af, struct tcpcb *tp)
{
    int enable;
    vtw_ctl_t *ctl;
    vtw_t *vtw;

    KASSERT(mutex_owned(softnet_lock));

    ctl = vtw_control(af, tp->t_msl);
    if (!ctl)
        return 0;

    enable = (af == AF_INET) ? tcp4_vtw_enable : tcp6_vtw_enable;

    vtw = vtw_alloc(ctl);

    if (vtw) {
        vtw->snd_nxt = tp->snd_nxt;
        vtw->rcv_nxt = tp->rcv_nxt;

        switch (af) {
        case AF_INET: {
            struct inpcb *inp = tp->t_inpcb;
            vtw_v4_t *v4 = (void*)vtw;

            v4->faddr = inp->inp_faddr.s_addr;
            v4->laddr = inp->inp_laddr.s_addr;
            v4->fport = inp->inp_fport;
            v4->lport = inp->inp_lport;

            vtw->reuse_port = !!(inp->inp_socket->so_options
                & SO_REUSEPORT);
            vtw->reuse_addr = !!(inp->inp_socket->so_options
                & SO_REUSEADDR);
            vtw->v6only = 0;
            vtw->uid = inp->inp_socket->so_uidinfo->ui_uid;

            vtw_inshash_v4(ctl, vtw);

#ifdef VTW_DEBUG
            /* Immediate lookup (connected and port) to
             * ensure at least that works!
             */
            if (enable & 4) {
                KASSERT(vtw_lookup_hash_v4
                    (ctl
                    , inp->inp_faddr.s_addr, inp->inp_fport
                    , inp->inp_laddr.s_addr, inp->inp_lport
                    , 0)
                    == vtw);
                KASSERT(vtw_lookup_hash_v4
                    (ctl
                    , inp->inp_faddr.s_addr, inp->inp_fport
                    , inp->inp_laddr.s_addr, inp->inp_lport
                    , 1));
            }
            /* Immediate port iterator functionality check: not wild
             */
            if (enable & 8) {
                struct tcp_ports_iterator *it;
                struct vestigial_inpcb res;
                int cnt = 0;

                it = tcp_init_ports_v4(inp->inp_laddr
                    , inp->inp_lport, 0);

                while (tcp_next_port_v4(it, &res)) {
                    ++cnt;
                }
                KASSERT(cnt);
            }
            /* Immediate port iterator functionality check: wild
             */
            if (enable & 16) {
                struct tcp_ports_iterator *it;
                struct vestigial_inpcb res;
                struct in_addr any;
                int cnt = 0;

                any.s_addr = htonl(INADDR_ANY);

                it = tcp_init_ports_v4(any, inp->inp_lport, 1);

                while (tcp_next_port_v4(it, &res)) {
                    ++cnt;
                }
                KASSERT(cnt);
            }
#endif /* VTW_DEBUG */
            break;
        }

        case AF_INET6: {
            struct in6pcb *inp = tp->t_in6pcb;
            vtw_v6_t *v6 = (void*)vtw;

            v6->faddr = inp->in6p_faddr;
            v6->laddr = inp->in6p_laddr;
            v6->fport = inp->in6p_fport;
            v6->lport = inp->in6p_lport;

            vtw->reuse_port = !!(inp->in6p_socket->so_options
                & SO_REUSEPORT);
            vtw->reuse_addr = !!(inp->in6p_socket->so_options
                & SO_REUSEADDR);
            vtw->v6only = !!(inp->in6p_flags
                & IN6P_IPV6_V6ONLY);
            vtw->uid = inp->in6p_socket->so_uidinfo->ui_uid;

            vtw_inshash_v6(ctl, vtw);
#ifdef VTW_DEBUG
            /* Immediate lookup (connected and port) to
             * ensure at least that works!
             */
            if (enable & 4) {
                KASSERT(vtw_lookup_hash_v6(ctl
                    , &inp->in6p_faddr, inp->in6p_fport
                    , &inp->in6p_laddr, inp->in6p_lport
                    , 0)
                    == vtw);
                KASSERT(vtw_lookup_hash_v6
                    (ctl
                    , &inp->in6p_faddr, inp->in6p_fport
                    , &inp->in6p_laddr, inp->in6p_lport
                    , 1));
            }
            /* Immediate port iterator functionality check: not wild
             */
            if (enable & 8) {
                struct tcp_ports_iterator *it;
                struct vestigial_inpcb res;
                int cnt = 0;

                it = tcp_init_ports_v6(&inp->in6p_laddr
                    , inp->in6p_lport, 0);

                while (tcp_next_port_v6(it, &res)) {
                    ++cnt;
                }
                KASSERT(cnt);
            }
            /* Immediate port iterator functionality check: wild
             */
            if (enable & 16) {
                struct tcp_ports_iterator *it;
                struct vestigial_inpcb res;
                static struct in6_addr any = IN6ADDR_ANY_INIT;
                int cnt = 0;

                it = tcp_init_ports_v6(&any
                    , inp->in6p_lport, 1);

                while (tcp_next_port_v6(it, &res)) {
                    ++cnt;
                }
                KASSERT(cnt);
            }
#endif /* VTW_DEBUG */
            break;
        }
        }

        tcp_canceltimers(tp);
        tp = tcp_close(tp);
        KASSERT(!tp);

        return 1;
    }

    return 0;
}

/*!\brief restart timer for vestigial time-wait entry
 */
static void
vtw_restart_v4(vestigial_inpcb_t *vp)
{
    vtw_v4_t copy = *(vtw_v4_t*)vp->vtw;
    vtw_t *vtw;
    vtw_t *cp = &copy.common;
    vtw_ctl_t *ctl;

    KASSERT(mutex_owned(softnet_lock));

    db_trace(KTR_VTW
        , (vp->vtw, "vtw: restart %A:%P %A:%P"
        , vp->faddr.v4.s_addr, vp->fport
        , vp->laddr.v4.s_addr, vp->lport));

    /* Class might have changed, so have a squiz.
     */
    ctl = vtw_control(AF_INET, class_to_msl(cp->msl_class));
    vtw = vtw_alloc(ctl);

    if (vtw) {
        vtw_v4_t *v4 = (void*)vtw;

        /* Safe now to unhash the old entry
         */
        vtw_del(vp->ctl, vp->vtw);

        vtw->snd_nxt = cp->snd_nxt;
        vtw->rcv_nxt = cp->rcv_nxt;

        v4->faddr = copy.faddr;
        v4->laddr = copy.laddr;
        v4->fport = copy.fport;
        v4->lport = copy.lport;

        vtw->reuse_port = cp->reuse_port;
        vtw->reuse_addr = cp->reuse_addr;
        vtw->v6only = 0;
        vtw->uid = cp->uid;

        vtw_inshash_v4(ctl, vtw);
    }

    vp->valid = 0;
}

/*!\brief restart timer for vestigial time-wait entry
 */
static void
vtw_restart_v6(vestigial_inpcb_t *vp)
{
    vtw_v6_t copy = *(vtw_v6_t*)vp->vtw;
    vtw_t *vtw;
    vtw_t *cp = &copy.common;
    vtw_ctl_t *ctl;

    KASSERT(mutex_owned(softnet_lock));

    db_trace(KTR_VTW
        , (vp->vtw, "vtw: restart %6A:%P %6A:%P"
        , db_store(&vp->faddr.v6, sizeof (vp->faddr.v6))
        , vp->fport
        , db_store(&vp->laddr.v6, sizeof (vp->laddr.v6))
        , vp->lport));

    /* Class might have changed, so have a squiz.
     */
    ctl = vtw_control(AF_INET6, class_to_msl(cp->msl_class));
    vtw = vtw_alloc(ctl);

    if (vtw) {
        vtw_v6_t *v6 = (void*)vtw;

        /* Safe now to unhash the old entry
         */
        vtw_del(vp->ctl, vp->vtw);

        vtw->snd_nxt = cp->snd_nxt;
        vtw->rcv_nxt = cp->rcv_nxt;

        v6->faddr = copy.faddr;
        v6->laddr = copy.laddr;
        v6->fport = copy.fport;
        v6->lport = copy.lport;

        vtw->reuse_port = cp->reuse_port;
        vtw->reuse_addr = cp->reuse_addr;
        vtw->v6only = cp->v6only;
        vtw->uid = cp->uid;

        vtw_inshash_v6(ctl, vtw);
    }

    vp->valid = 0;
}

/*!\brief restart timer for vestigial time-wait entry
 */
void
vtw_restart(vestigial_inpcb_t *vp)
{
    if (!vp || !vp->valid)
        return;

    if (vp->v4)
        vtw_restart_v4(vp);
    else
        vtw_restart_v6(vp);
}

int
vtw_earlyinit(void)
{
    int rc;

    if (!tcp_vtw_was_enabled) {
        int i;

        /* This guarantees timer ticks until we no longer need them.
         */
        tcp_vtw_was_enabled = 1;

        callout_init(&vtw_cs, 0);
        callout_setfunc(&vtw_cs, vtw_tick, 0);
        callout_schedule(&vtw_cs, hz / 5);

        for (i = 0; i < VTW_NCLASS; ++i) {
            vtw_tcpv4[i].is_v4 = 1;
            vtw_tcpv6[i].is_v6 = 1;
        }

        tcbtable.vestige = &tcp_hooks;
    }

    if ((rc = vtw_control_init(AF_INET)) != 0 ||
        (rc = vtw_control_init(AF_INET6)) != 0)
        return rc;

    return 0;
}
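/* Lifecycle sketch (illustrative): once a tcp*_vtw_enable flag is set
 * and vtw_earlyinit() has run,
 *
 *	vtw_add(af, tp)		caches the TIME_WAIT state and
 *				tcp_close()s the tcpcb;
 *	tcbtable.vestige	lets the pcb lookup path see the
 *				vestigial entries;
 *	vtw_tick()		fires every hz/5 ticks and expires
 *				entries via vtw_age();
 *	vtw_restart(vp)		re-arms 2*MSL when a segment arrives.
 *
 * When the last entry expires, vtw_tick() stops rescheduling itself
 * and clears tcbtable.vestige until vtw_earlyinit() runs again.
 */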
#ifdef VTW_DEBUG
#include <sys/syscallargs.h>
#include <sys/sysctl.h>

/*!\brief add lalp, fafp entries for debug
 */
int
vtw_debug_add(int af, sin_either_t *la, sin_either_t *fa, int msl, int class)
{
    vtw_ctl_t *ctl;
    vtw_t *vtw;

    ctl = vtw_control(af, msl ? msl : class_to_msl(class));
    if (!ctl)
        return 0;

    vtw = vtw_alloc(ctl);

    if (vtw) {
        vtw->snd_nxt = 0;
        vtw->rcv_nxt = 0;

        switch (af) {
        case AF_INET: {
            vtw_v4_t *v4 = (void*)vtw;

            v4->faddr = fa->sin_addr.v4.s_addr;
            v4->laddr = la->sin_addr.v4.s_addr;
            v4->fport = fa->sin_port;
            v4->lport = la->sin_port;

            vtw->reuse_port = 1;
            vtw->reuse_addr = 1;
            vtw->v6only = 0;
            vtw->uid = 0;

            vtw_inshash_v4(ctl, vtw);
            break;
        }

        case AF_INET6: {
            vtw_v6_t *v6 = (void*)vtw;

            v6->faddr = fa->sin_addr.v6;
            v6->laddr = la->sin_addr.v6;

            v6->fport = fa->sin_port;
            v6->lport = la->sin_port;

            vtw->reuse_port = 1;
            vtw->reuse_addr = 1;
            vtw->v6only = 0;
            vtw->uid = 0;

            vtw_inshash_v6(ctl, vtw);
            break;
        }

        default:
            break;
        }

        return 1;
    }

    return 0;
}

static int vtw_syscall = 0;

static int
vtw_debug_process(vtw_sysargs_t *ap)
{
    struct vestigial_inpcb vestige;
    int rc = 0;

    mutex_enter(softnet_lock);

    switch (ap->op) {
    case 0:		// insert
        vtw_debug_add(ap->la.sin_family
            , &ap->la
            , &ap->fa
            , TCPTV_MSL
            , 0);
        break;

    case 1:		// lookup
    case 2:		// restart
        switch (ap->la.sin_family) {
        case AF_INET:
            if (tcp_lookup_v4(ap->fa.sin_addr.v4, ap->fa.sin_port,
                    ap->la.sin_addr.v4, ap->la.sin_port,
                    &vestige)) {
                if (ap->op == 2) {
                    vtw_restart(&vestige);
                }
                rc = 0;
            } else
                rc = ESRCH;
            break;

        case AF_INET6:
            if (tcp_lookup_v6(&ap->fa.sin_addr.v6, ap->fa.sin_port,
                    &ap->la.sin_addr.v6, ap->la.sin_port,
                    &vestige)) {
                if (ap->op == 2) {
                    vtw_restart(&vestige);
                }
                rc = 0;
            } else
                rc = ESRCH;
            break;
        default:
            rc = EINVAL;
        }
        break;

    default:
        rc = EINVAL;
    }

    mutex_exit(softnet_lock);
    return rc;
}

struct sys_vtw_args {
    syscallarg(const vtw_sysargs_t *) req;
    syscallarg(size_t) len;
};

static int
vtw_sys(struct lwp *l, const void *_, register_t *retval)
{
    const struct sys_vtw_args *uap = _;
    void *buf;
    int rc;
    size_t len = SCARG(uap, len);

    if (len != sizeof (vtw_sysargs_t))
        return EINVAL;

    buf = kmem_alloc(len, KM_SLEEP);
    if (!buf)
        return ENOMEM;

    rc = copyin(SCARG(uap, req), buf, len);
    if (!rc) {
        rc = vtw_debug_process(buf);
    }
    kmem_free(buf, len);

    return rc;
}

static void
vtw_sanity_check(void)
{
    vtw_ctl_t *ctl;
    vtw_t *vtw;
    int i;
    int n;

    for (i = 0; i < VTW_NCLASS; ++i) {
        ctl = &vtw_tcpv4[i];

        if (!ctl->base.v || ctl->nalloc)
            continue;

        for (n = 0, vtw = ctl->base.v; ; ) {
            ++n;
            vtw = vtw_next(ctl, vtw);
            if (vtw == ctl->base.v)
                break;
        }
        db_trace(KTR_VTW
            , (ctl, "sanity: class %x n %x nfree %x"
            , i, n, ctl->nfree));

        KASSERT(n == ctl->nfree);
    }

    for (i = 0; i < VTW_NCLASS; ++i) {
        ctl = &vtw_tcpv6[i];

        if (!ctl->base.v || ctl->nalloc)
            continue;

        for (n = 0, vtw = ctl->base.v; ; ) {
            ++n;
            vtw = vtw_next(ctl, vtw);
            if (vtw == ctl->base.v)
                break;
        }
        db_trace(KTR_VTW
            , (ctl, "sanity: class %x n %x nfree %x"
            , i, n, ctl->nfree));
        KASSERT(n == ctl->nfree);
    }
}

/*!\brief Initialise debug support.
 */
static void
vtw_debug_init(void)
{
    int i;

    vtw_sanity_check();

    if (vtw_syscall)
        return;

    for (i = 511; i; --i) {
        if (sysent[i].sy_call == sys_nosys) {
            sysent[i].sy_call = vtw_sys;
            sysent[i].sy_narg = 2;
            sysent[i].sy_argsize = sizeof (struct sys_vtw_args);
            sysent[i].sy_flags = 0;

            vtw_syscall = i;
            break;
        }
    }
    if (i) {
        const struct sysctlnode *node;
        uint32_t flags;

        flags = sysctl_root.sysctl_flags;

        sysctl_root.sysctl_flags |= CTLFLAG_READWRITE;
        sysctl_root.sysctl_flags &= ~CTLFLAG_PERMANENT;

        sysctl_createv(0, 0, 0, &node,
            CTLFLAG_PERMANENT, CTLTYPE_NODE,
            "koff",
            SYSCTL_DESCR("Kernel Obscure Feature Finder"),
            0, 0, 0, 0, CTL_CREATE, CTL_EOL);

        if (!node) {
            sysctl_createv(0, 0, 0, &node,
                CTLFLAG_PERMANENT, CTLTYPE_NODE,
                "koffka",
                SYSCTL_DESCR("The Real(tm) Kernel"
                    " Obscure Feature Finder"),
                0, 0, 0, 0, CTL_CREATE, CTL_EOL);
        }
        if (node) {
            sysctl_createv(0, 0, 0, 0,
                CTLFLAG_PERMANENT|CTLFLAG_READONLY,
                CTLTYPE_INT, "vtw_debug_syscall",
                SYSCTL_DESCR("vtw debug"
                    " system call number"),
                0, 0, &vtw_syscall, 0, node->sysctl_num,
                CTL_CREATE, CTL_EOL);
        }
        sysctl_root.sysctl_flags = flags;
    }
}
#else /* !VTW_DEBUG */
static void
vtw_debug_init(void)
{
    return;
}
#endif /* !VTW_DEBUG */