1 /*
2  * Copyright (c) 2011 The NetBSD Foundation, Inc.
3  * All rights reserved.
4  *
5  * This code is derived from software contributed to The NetBSD Foundation
6  * by Coyote Point Systems, Inc.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
18  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
19  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
20  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
21  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27  * POSSIBILITY OF SUCH DAMAGE.
28  */
29 #include <sys/cdefs.h>
30 
31 #include "opt_ddb.h"
32 #include "opt_inet.h"
33 #include "opt_ipsec.h"
34 #include "opt_inet_csum.h"
35 #include "opt_tcp_debug.h"
36 
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/malloc.h>
40 #include <sys/kmem.h>
41 #include <sys/mbuf.h>
42 #include <sys/protosw.h>
43 #include <sys/socket.h>
44 #include <sys/socketvar.h>
45 #include <sys/errno.h>
46 #include <sys/syslog.h>
47 #include <sys/pool.h>
48 #include <sys/domain.h>
49 #include <sys/kernel.h>
50 #include <net/if.h>
51 #include <net/route.h>
52 #include <net/if_types.h>
53 
54 #include <netinet/in.h>
55 #include <netinet/in_systm.h>
56 #include <netinet/ip.h>
57 #include <netinet/in_pcb.h>
58 #include <netinet/in_var.h>
59 #include <netinet/ip_var.h>
60 #include <netinet/in_offload.h>
61 #include <netinet/ip6.h>
62 #include <netinet6/ip6_var.h>
63 #include <netinet6/in6_pcb.h>
65 #include <netinet6/in6_var.h>
66 #include <netinet/icmp6.h>
67 #include <netinet6/nd6.h>
68 
69 #include <netinet/tcp.h>
70 #include <netinet/tcp_fsm.h>
71 #include <netinet/tcp_seq.h>
72 #include <netinet/tcp_timer.h>
73 #include <netinet/tcp_var.h>
74 #include <netinet/tcp_private.h>
75 #include <netinet/tcpip.h>
76 
77 #include <netinet/tcp_vtw.h>
78 
79 __KERNEL_RCSID(0, "$NetBSD: tcp_vtw.c,v 1.8 2011/07/17 20:54:53 joerg Exp $");
80 
81 #define db_trace(__a, __b)	do { } while (/*CONSTCOND*/0)
82 
83 static void vtw_debug_init(void);
84 
85 fatp_ctl_t fat_tcpv4;
86 fatp_ctl_t fat_tcpv6;
87 vtw_ctl_t  vtw_tcpv4[VTW_NCLASS];
88 vtw_ctl_t  vtw_tcpv6[VTW_NCLASS];
89 vtw_stats_t vtw_stats;
90 
91 /* We provide state for the lookup_ports iterator.
92  * As we are currently netlock-protected, there is only one.
93  * If we were finer-grained, we would have one per CPU.
94  * I do not want to be in the business of alloc/free.
95  * The best alternative would be to allocate on the caller's
96  * stack, but that would require them to know the struct,
97  * or at least the size.
98  * See how she goes.
99  */
100 struct tcp_ports_iterator {
101 	union {
102 		struct in_addr	v4;
103 		struct in6_addr	v6;
104 	}		addr;
105 	u_int		port;
106 
107 	uint32_t	wild	: 1;
108 
109 	vtw_ctl_t	*ctl;
110 	fatp_t		*fp;
111 
112 	uint16_t	slot_idx;
113 	uint16_t	ctl_idx;
114 };
115 
116 static struct tcp_ports_iterator tcp_ports_iterator_v4;
117 static struct tcp_ports_iterator tcp_ports_iterator_v6;
118 
119 static int vtw_age(vtw_ctl_t *, struct timeval *);
120 
121 /*!\brief allocate a fat pointer from a collection.
122  */
123 static fatp_t *
124 fatp_alloc(fatp_ctl_t *fat)
125 {
126 	fatp_t	*fp	= 0;
127 
128 	if (fat->nfree) {
129 		fp = fat->free;
130 		if (fp) {
131 			fat->free = fatp_next(fat, fp);
132 			--fat->nfree;
133 			++fat->nalloc;
134 			fp->nxt = 0;
135 
136 			KASSERT(!fp->inuse);
137 		}
138 	}
139 
140 	return fp;
141 }
142 
143 /*!\brief free a fat pointer.
144  */
145 static void
146 fatp_free(fatp_ctl_t *fat, fatp_t *fp)
147 {
148 	if (fp) {
149 		KASSERT(!fp->inuse);
150 		KASSERT(!fp->nxt);
151 
152 		fp->nxt = fatp_index(fat, fat->free);
153 		fat->free = fp;
154 
155 		++fat->nfree;
156 		--fat->nalloc;
157 	}
158 }
159 
160 /*!\brief initialise a collection of fat pointers.
161  *
162  *\param n	total # fat pointers to allocate
163  *\param m	# hash buckets
164  *
165  * We allocate 2x as much, as we have two hashes: full and lport only.
166  */
167 static void
168 fatp_init(fatp_ctl_t *fat, uint32_t n, uint32_t m,
169     fatp_t *fat_base, fatp_t **fat_hash)
170 {
171 	fatp_t	*fp;
172 
173 	KASSERT(n <= FATP_MAX / 2);
174 
175 	fat->hash = fat_hash;
176 	fat->base = fat_base;
177 
178 	fat->port = &fat->hash[m];
179 
180 	fat->mask   = m - 1;	// m must be a power of 2
181 	fat->lim    = fat->base + 2*n - 1;
182 	fat->nfree  = 0;
183 	fat->nalloc = 2*n;
184 
185 	/* Initialise the free list.
186 	 */
187 	for (fp = fat->lim; fp >= fat->base; --fp) {
188 		fatp_free(fat, fp);
189 	}
190 }
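/* Layout note: fat_hash is a single allocation of 2*m anchors; the
 * first m are fat->hash[] (the full 4-tuple hash) and the next m,
 * reached via fat->port = &fat->hash[m], are fat->port[] (the
 * lport-only hash).  fat_base holds the 2*n fatp_t entries, all of
 * which begin life on the free list built by the loop above.
 */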
191 
192 /*
193  * The `xtra' is XORed into the tag stored.
194  */
195 static uint32_t fatp_xtra[] = {
196 	0x11111111,0x22222222,0x33333333,0x44444444,
197 	0x55555555,0x66666666,0x77777777,0x88888888,
198 	0x12121212,0x21212121,0x34343434,0x43434343,
199 	0x56565656,0x65656565,0x78787878,0x87878787,
200 	0x11221122,0x22112211,0x33443344,0x44334433,
201 	0x55665566,0x66556655,0x77887788,0x88778877,
202 	0x11112222,0x22221111,0x33334444,0x44443333,
203 	0x55556666,0x66665555,0x77778888,0x88887777,
204 };
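/* Illustrative sketch of the tag round-trip: insertion stores
 *
 *	fp->tag[slot] = tag ^ idx_encode(vtw, idx) ^ fatp_xtra[slot];
 *
 * so lookup can recover a candidate index without dereferencing
 * anything:
 *
 *	idx = fp->tag[slot] ^ tag ^ fatp_xtra[slot];
 *
 * The per-slot xtra constants decorrelate slots holding entries with
 * the same tag; a colliding slot decodes to a bogus index, which
 * idx_decode() then rejects.
 */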
205 
206 /*!\brief turn a {fatp_t*,slot} into an integral key.
207  *
208  * The key can be used to obtain the fatp_t, and the slot,
209  * as it directly encodes them.
210  */
211 static inline uint32_t
212 fatp_key(fatp_ctl_t *fat, fatp_t *fp, uint32_t slot)
213 {
214 	CTASSERT(CACHE_LINE_SIZE == 32 ||
215 	         CACHE_LINE_SIZE == 64 ||
216 		 CACHE_LINE_SIZE == 128);
217 
218 	switch (fatp_ntags()) {
219 	case 7:
220 		return (fatp_index(fat, fp) << 3) | slot;
221 	case 15:
222 		return (fatp_index(fat, fp) << 4) | slot;
223 	case 31:
224 		return (fatp_index(fat, fp) << 5) | slot;
225 	default:
226 		KASSERT(0 && "no support, for no good reason");
227 		return ~0;
228 	}
229 }
230 
231 static inline uint32_t
232 fatp_slot_from_key(fatp_ctl_t *fat, uint32_t key)
233 {
234 	CTASSERT(CACHE_LINE_SIZE == 32 ||
235 	         CACHE_LINE_SIZE == 64 ||
236 		 CACHE_LINE_SIZE == 128);
237 
238 	switch (fatp_ntags()) {
239 	case 7:
240 		return key & 7;
241 	case 15:
242 		return key & 15;
243 	case 31:
244 		return key & 31;
245 	default:
246 		KASSERT(0 && "no support, for no good reason");
247 		return ~0;
248 	}
249 }
250 
251 static inline fatp_t *
252 fatp_from_key(fatp_ctl_t *fat, uint32_t key)
253 {
254 	CTASSERT(CACHE_LINE_SIZE == 32 ||
255 	         CACHE_LINE_SIZE == 64 ||
256 		 CACHE_LINE_SIZE == 128);
257 
258 	switch (fatp_ntags()) {
259 	case 7:
260 		key >>= 3;
261 		break;
262 	case 15:
263 		key >>= 4;
264 		break;
265 	case 31:
266 		key >>= 5;
267 		break;
268 	default:
269 		KASSERT(0 && "no support, for no good reason");
270 		return 0;
271 	}
272 
273 	return key ? fat->base + key - 1 : 0;
274 }
275 
276 static inline uint32_t
277 idx_encode(vtw_ctl_t *ctl, uint32_t idx)
278 {
279 	return (idx << ctl->idx_bits) | idx;
280 }
281 
282 static inline uint32_t
283 idx_decode(vtw_ctl_t *ctl, uint32_t bits)
284 {
285 	uint32_t	idx	= bits & ctl->idx_mask;
286 
287 	if (idx_encode(ctl, idx) == bits)
288 		return idx;
289 	else
290 		return ~0;
291 }
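/* Illustrative self-check, with a hypothetical idx_bits of 12:
 *
 *	idx_encode(ctl, 0x123) == 0x123123
 *	idx_decode(ctl, 0x123123) == 0x123
 *	idx_decode(ctl, 0x123124) == ~0		(copies disagree)
 *
 * Storing the index twice lets idx_decode() reject bit patterns that
 * were never produced by idx_encode(), rather than return a bogus
 * index.
 */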
292 
293 /*!\brief	insert index into fatp hash
294  *
295  *\param	idx	-	index of element being placed in hash chain
296  *\param	tag	-	32-bit tag identifier
297  *
298  *\returns
299  *	value which can be used to locate entry.
300  *
301  *\note
302  *	we rely on the fact that there are unused high bits in the index
303  *	for verification purposes on lookup.
304  */
305 
306 static inline uint32_t
307 fatp_vtw_inshash(fatp_ctl_t *fat, uint32_t idx, uint32_t tag, int which,
308     void *dbg)
309 {
310 	fatp_t	*fp;
311 	fatp_t	**hash = (which ? fat->port : fat->hash);
312 	int	i;
313 
314 	fp = hash[tag & fat->mask];
315 
316 	while (!fp || fatp_full(fp)) {
317 		fatp_t	*fq;
318 
319 		/* All entries are inuse at the top level.
320 		 * We allocate a spare, and push the top level
321 		 * down one.  All entries in the fp we push down
322 		 * (think of a tape worm here) will be expelled sooner than
323 		 * any entries added subsequently to this hash bucket.
324 		 * This is a property of the time waits we are exploiting.
325 		 */
326 
327 		fq = fatp_alloc(fat);
328 		if (!fq) {
329 			vtw_age(fat->vtw, 0);
330 			fp = hash[tag & fat->mask];
331 			continue;
332 		}
333 
334 		fq->inuse = 0;
335 		fq->nxt   = fatp_index(fat, fp);
336 
337 		hash[tag & fat->mask] = fq;
338 
339 		fp = fq;
340 	}
341 
342 	KASSERT(!fatp_full(fp));
343 
344 	/* Fill highest index first.  Lookup is lowest first.
345 	 */
346 	for (i = fatp_ntags(); --i >= 0; ) {
347 		if (!((1 << i) & fp->inuse)) {
348 			break;
349 		}
350 	}
351 
352 	fp->inuse |= 1 << i;
353 	fp->tag[i] = tag ^ idx_encode(fat->vtw, idx) ^ fatp_xtra[i];
354 
355 	db_trace(KTR_VTW
356 		 , (fp, "fat: inuse %5.5x tag[%x] %8.8x"
357 		    , fp->inuse
358 		    , i, fp->tag[i]));
359 
360 	return fatp_key(fat, fp, i);
361 }
362 
363 static inline int
364 vtw_alive(const vtw_t *vtw)
365 {
366 	return vtw->hashed && vtw->expire.tv_sec;
367 }
368 
369 static inline uint32_t
370 vtw_index_v4(vtw_ctl_t *ctl, vtw_v4_t *v4)
371 {
372 	if (ctl->base.v4 <= v4 && v4 <= ctl->lim.v4)
373 		return v4 - ctl->base.v4;
374 
375 	KASSERT(0 && "vtw out of bounds");
376 
377 	return ~0;
378 }
379 
380 static inline uint32_t
381 vtw_index_v6(vtw_ctl_t *ctl, vtw_v6_t *v6)
382 {
383 	if (ctl->base.v6 <= v6 && v6 <= ctl->lim.v6)
384 		return v6 - ctl->base.v6;
385 
386 	KASSERT(0 && "vtw out of bounds");
387 
388 	return ~0;
389 }
390 
391 static inline uint32_t
392 vtw_index(vtw_ctl_t *ctl, vtw_t *vtw)
393 {
394 	if (ctl->clidx)
395 		ctl = ctl->ctl;
396 
397 	if (ctl->is_v4)
398 		return vtw_index_v4(ctl, (vtw_v4_t *)vtw);
399 
400 	if (ctl->is_v6)
401 		return vtw_index_v6(ctl, (vtw_v6_t *)vtw);
402 
403 	KASSERT(0 && "neither 4 nor 6.  most curious.");
404 
405 	return ~0;
406 }
407 
408 static inline vtw_t *
409 vtw_from_index(vtw_ctl_t *ctl, uint32_t idx)
410 {
411 	if (ctl->clidx)
412 		ctl = ctl->ctl;
413 
414 	/* See if the index looks like it might be an index.
415 	 * Bits outside of the valid index bits are a giveaway.
416 	 */
417 	idx = idx_decode(ctl, idx);
418 
419 	if (idx == ~0) {
420 		return 0;
421 	} else if (ctl->is_v4) {
422 		vtw_v4_t	*vtw = ctl->base.v4 + idx;
423 
424 		return (ctl->base.v4 <= vtw && vtw <= ctl->lim.v4)
425 			? &vtw->common : 0;
426 	} else if (ctl->is_v6) {
427 		vtw_v6_t	*vtw = ctl->base.v6 + idx;
428 
429 		return (ctl->base.v6 <= vtw && vtw <= ctl->lim.v6)
430 			? &vtw->common : 0;
431 	} else {
432 		KASSERT(0 && "badness");
433 		return 0;
434 	}
435 }
436 
437 /*!\brief return the next vtw after this one.
438  *
439  * Due to the differing sizes of the entries in differing
440  * arenas, we have to ensure we ++ the correct pointer type.
441  *
442  * Also handles wrap.
443  */
444 static inline vtw_t *
445 vtw_next(vtw_ctl_t *ctl, vtw_t *vtw)
446 {
447 	if (ctl->is_v4) {
448 		vtw_v4_t	*v4 = (void*)vtw;
449 
450 		vtw = &(++v4)->common;
451 	} else {
452 		vtw_v6_t	*v6 = (void*)vtw;
453 
454 		vtw = &(++v6)->common;
455 	}
456 
457 	if (vtw > ctl->lim.v)
458 		vtw = ctl->base.v;
459 
460 	return vtw;
461 }
462 
463 /*!\brief	remove entry from FATP hash chains
464  */
465 static inline void
466 vtw_unhash(vtw_ctl_t *ctl, vtw_t *vtw)
467 {
468 	fatp_ctl_t	*fat	= ctl->fat;
469 	fatp_t		*fp;
470 	uint32_t	key = vtw->key;
471 	uint32_t	tag, slot, idx;
472 	vtw_v4_t	*v4 = (void*)vtw;
473 	vtw_v6_t	*v6 = (void*)vtw;
474 
475 	if (!vtw->hashed) {
476 		KASSERT(0 && "unhashed");
477 		return;
478 	}
479 
480 	if (fat->vtw->is_v4) {
481 		tag = v4_tag(v4->faddr, v4->fport, v4->laddr, v4->lport);
482 	} else if (fat->vtw->is_v6) {
483 		tag = v6_tag(&v6->faddr, v6->fport, &v6->laddr, v6->lport);
484 	} else {
485 		tag = 0;
486 		KASSERT(0 && "not reached");
487 	}
488 
489 	/* Remove from fat->hash[]
490 	 */
491 	slot = fatp_slot_from_key(fat, key);
492 	fp   = fatp_from_key(fat, key);
493 	idx  = vtw_index(ctl, vtw);
494 
495 	db_trace(KTR_VTW
496 		 , (fp, "fat: del inuse %5.5x slot %x idx %x key %x tag %x"
497 		    , fp->inuse, slot, idx, key, tag));
498 
499 	KASSERT(fp->inuse & (1 << slot));
500 	KASSERT(fp->tag[slot] == (tag ^ idx_encode(ctl, idx)
501 				  ^ fatp_xtra[slot]));
502 
503 	if ((fp->inuse & (1 << slot))
504 	    && fp->tag[slot] == (tag ^ idx_encode(ctl, idx)
505 				 ^ fatp_xtra[slot])) {
506 		fp->inuse ^= 1 << slot;
507 		fp->tag[slot] = 0;
508 
509 		/* When we delete entries, we do not compact.  This is
510 		 * due to temporality.  We add entries, and they
511 		 * (eventually) expire. Older entries will be further
512 		 * down the chain.
513 		 */
514 		if (!fp->inuse) {
515 			uint32_t hi = tag & fat->mask;
516 			fatp_t	*fq = 0;
517 			fatp_t	*fr = fat->hash[hi];
518 
519 			while (fr && fr != fp) {
520 				fr = fatp_next(fat, fq = fr);
521 			}
522 
523 			if (fr == fp) {
524 				if (fq) {
525 					fq->nxt = fp->nxt;
526 					fp->nxt = 0;
527 					fatp_free(fat, fp);
528 				} else {
529 					KASSERT(fat->hash[hi] == fp);
530 
531 					if (fp->nxt) {
532 						fat->hash[hi]
533 							= fatp_next(fat, fp);
534 						fp->nxt = 0;
535 						fatp_free(fat, fp);
536 					} else {
537 						/* retain for next use.
538 						 */
539 						;
540 					}
541 				}
542 			} else {
543 				fr = fat->hash[hi];
544 
545 				do {
546 					db_trace(KTR_VTW
547 						 , (fr
548 						    , "fat:*del inuse %5.5x"
549 						    " nxt %x"
550 						    , fr->inuse, fr->nxt));
551 
552 					fr = fatp_next(fat, fq = fr);
553 				} while (fr && fr != fp);
554 
555 				KASSERT(0 && "oops");
556 			}
557 		}
558 		vtw->key ^= ~0;
559 	}
560 
561 	if (fat->vtw->is_v4) {
562 		tag = v4_port_tag(v4->lport);
563 	} else if (fat->vtw->is_v6) {
564 		tag = v6_port_tag(v6->lport);
565 	}
566 
567 	/* Remove from fat->port[]
568 	 */
569 	key  = vtw->port_key;
570 	slot = fatp_slot_from_key(fat, key);
571 	fp   = fatp_from_key(fat, key);
572 	idx  = vtw_index(ctl, vtw);
573 
574 	db_trace(KTR_VTW
575 		 , (fp, "fatport: del inuse %5.5x"
576 		    " slot %x idx %x key %x tag %x"
577 		    , fp->inuse, slot, idx, key, tag));
578 
579 	KASSERT(fp->inuse & (1 << slot));
580 	KASSERT(fp->tag[slot] == (tag ^ idx_encode(ctl, idx)
581 				  ^ fatp_xtra[slot]));
582 
583 	if ((fp->inuse & (1 << slot))
584 	    && fp->tag[slot] == (tag ^ idx_encode(ctl, idx)
585 				 ^ fatp_xtra[slot])) {
586 		fp->inuse ^= 1 << slot;
587 		fp->tag[slot] = 0;
588 
589 		if (!fp->inuse) {
590 			uint32_t hi = tag & fat->mask;
591 			fatp_t	*fq = 0;
592 			fatp_t	*fr = fat->port[hi];
593 
594 			while (fr && fr != fp) {
595 				fr = fatp_next(fat, fq = fr);
596 			}
597 
598 			if (fr == fp) {
599 				if (fq) {
600 					fq->nxt = fp->nxt;
601 					fp->nxt = 0;
602 					fatp_free(fat, fp);
603 				} else {
604 					KASSERT(fat->port[hi] == fp);
605 
606 					if (fp->nxt) {
607 						fat->port[hi]
608 							= fatp_next(fat, fp);
609 						fp->nxt = 0;
610 						fatp_free(fat, fp);
611 					} else {
612 						/* retain for next use.
613 						 */
614 						;
615 					}
616 				}
617 			}
618 		}
619 		vtw->port_key ^= ~0;
620 	}
621 
622 	vtw->hashed = 0;
623 }
624 
625 /*!\brief	remove entry from hash, possibly free.
626  */
627 void
628 vtw_del(vtw_ctl_t *ctl, vtw_t *vtw)
629 {
630 	KASSERT(mutex_owned(softnet_lock));
631 
632 	if (vtw->hashed) {
633 		++vtw_stats.del;
634 		vtw_unhash(ctl, vtw);
635 	}
636 
637 	/* We only delete the oldest entry.
638 	 */
639 	if (vtw != ctl->oldest.v)
640 		return;
641 
642 	--ctl->nalloc;
643 	++ctl->nfree;
644 
645 	vtw->expire.tv_sec  = 0;
646 	vtw->expire.tv_usec = ~0;
647 
648 	if (!ctl->nalloc)
649 		ctl->oldest.v = 0;
650 
651 	ctl->oldest.v = vtw_next(ctl, vtw);
652 }
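/* Note the FIFO discipline: vtw_unhash() can remove an entry from the
 * hash chains at any time, but its arena slot is only reclaimed here
 * once it has aged to ctl->oldest.v, so slots recycle strictly in
 * allocation (and hence expiration) order.
 */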
653 
654 /*!\brief	insert vestigial timewait in hash chain
655  */
656 static void
657 vtw_inshash_v4(vtw_ctl_t *ctl, vtw_t *vtw)
658 {
659 	uint32_t	idx	= vtw_index(ctl, vtw);
660 	uint32_t	tag;
661 	vtw_v4_t	*v4 = (void*)vtw;
662 
663 	KASSERT(mutex_owned(softnet_lock));
664 	KASSERT(!vtw->hashed);
665 	KASSERT(ctl->clidx == vtw->msl_class);
666 
667 	++vtw_stats.ins;
668 
669 	tag = v4_tag(v4->faddr, v4->fport,
670 		     v4->laddr, v4->lport);
671 
672 	vtw->key = fatp_vtw_inshash(ctl->fat, idx, tag, 0, vtw);
673 
674 	db_trace(KTR_VTW, (ctl
675 			   , "vtw: ins %8.8x:%4.4x %8.8x:%4.4x"
676 			   " tag %8.8x key %8.8x"
677 			   , v4->faddr, v4->fport
678 			   , v4->laddr, v4->lport
679 			   , tag
680 			   , vtw->key));
681 
682 	tag = v4_port_tag(v4->lport);
683 	vtw->port_key = fatp_vtw_inshash(ctl->fat, idx, tag, 1, vtw);
684 
685 	db_trace(KTR_VTW, (ctl, "vtw: ins %P - %4.4x tag %8.8x key %8.8x"
686 			   , v4->lport, v4->lport
687 			   , tag
688 			   , vtw->key));
689 
690 	vtw->hashed = 1;
691 }
692 
693 /*!\brief	insert vestigial timewait in hash chain
694  */
695 static void
696 vtw_inshash_v6(vtw_ctl_t *ctl, vtw_t *vtw)
697 {
698 	uint32_t	idx	= vtw_index(ctl, vtw);
699 	uint32_t	tag;
700 	vtw_v6_t	*v6	= (void*)vtw;
701 
702 	KASSERT(mutex_owned(softnet_lock));
703 	KASSERT(!vtw->hashed);
704 	KASSERT(ctl->clidx == vtw->msl_class);
705 
706 	++vtw_stats.ins;
707 
708 	tag = v6_tag(&v6->faddr, v6->fport,
709 		     &v6->laddr, v6->lport);
710 
711 	vtw->key = fatp_vtw_inshash(ctl->fat, idx, tag, 0, vtw);
712 
713 	tag = v6_port_tag(v6->lport);
714 	vtw->port_key = fatp_vtw_inshash(ctl->fat, idx, tag, 1, vtw);
715 
716 	db_trace(KTR_VTW, (ctl, "vtw: ins %P - %4.4x tag %8.8x key %8.8x"
717 			   , v6->lport, v6->lport
718 			   , tag
719 			   , vtw->key));
720 
721 	vtw->hashed = 1;
722 }
723 
724 static vtw_t *
725 vtw_lookup_hash_v4(vtw_ctl_t *ctl, uint32_t faddr, uint16_t fport
726 				 , uint32_t laddr, uint16_t lport
727 				 , int which)
728 {
729 	vtw_v4_t	*v4;
730 	vtw_t		*vtw;
731 	uint32_t	tag;
732 	fatp_t		*fp;
733 	int		i;
734 	uint32_t	fatps = 0, probes = 0, losings = 0;
735 
736 	if (!ctl || !ctl->fat)
737 		return 0;
738 
739 	++vtw_stats.look[which];
740 
741 	if (which) {
742 		tag = v4_port_tag(lport);
743 		fp  = ctl->fat->port[tag & ctl->fat->mask];
744 	} else {
745 		tag = v4_tag(faddr, fport, laddr, lport);
746 		fp  = ctl->fat->hash[tag & ctl->fat->mask];
747 	}
748 
749 	while (fp && fp->inuse) {
750 		uint32_t	inuse = fp->inuse;
751 
752 		++fatps;
753 
754 		for (i = 0; inuse && i < fatp_ntags(); ++i) {
755 			uint32_t	idx;
756 
757 			if (!(inuse & (1 << i)))
758 				continue;
759 
760 			inuse ^= 1 << i;
761 
762 			++probes;
763 			++vtw_stats.probe[which];
764 
765 			idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
766 			vtw = vtw_from_index(ctl, idx);
767 
768 			if (!vtw) {
769 				/* Hopefully fast path.
770 				 */
771 				db_trace(KTR_VTW
772 					 , (fp, "vtw: fast %A:%P %A:%P"
773 					    " idx %x tag %x"
774 					    , faddr, fport
775 					    , laddr, lport
776 					    , idx, tag));
777 				continue;
778 			}
779 
780 			v4 = (void*)vtw;
781 
782 			/* The de-referencing of vtw is what we want to avoid.
783 			 * Losing.
784 			 */
785 			if (vtw_alive(vtw)
786 			    && ((which ? vtw->port_key : vtw->key)
787 				== fatp_key(ctl->fat, fp, i))
788 			    && (which
789 				|| (v4->faddr == faddr && v4->laddr == laddr
790 				    && v4->fport == fport))
791 			    && v4->lport == lport) {
792 				++vtw_stats.hit[which];
793 
794 				db_trace(KTR_VTW
795 					 , (fp, "vtw: hit %8.8x:%4.4x"
796 					    " %8.8x:%4.4x idx %x key %x"
797 					    , faddr, fport
798 					    , laddr, lport
799 					    , idx_decode(ctl, idx), vtw->key));
800 
801 				KASSERT(vtw->hashed);
802 
803 				goto out;
804 			}
805 			++vtw_stats.losing[which];
806 			++losings;
807 
808 			if (vtw_alive(vtw)) {
809 				db_trace(KTR_VTW
810 					 , (fp, "vtw:!mis %8.8x:%4.4x"
811 					    " %8.8x:%4.4x key %x tag %x"
812 					    , faddr, fport
813 					    , laddr, lport
814 					    , fatp_key(ctl->fat, fp, i)
815 					    , v4_tag(faddr, fport
816 						     , laddr, lport)));
817 				db_trace(KTR_VTW
818 					 , (vtw, "vtw:!mis %8.8x:%4.4x"
819 					    " %8.8x:%4.4x key %x tag %x"
820 					    , v4->faddr, v4->fport
821 					    , v4->laddr, v4->lport
822 					    , vtw->key
823 					    , v4_tag(v4->faddr, v4->fport
824 						     , v4->laddr, v4->lport)));
825 
826 				if (vtw->key == fatp_key(ctl->fat, fp, i)) {
827 					db_trace(KTR_VTW
828 						 , (vtw, "vtw:!mis %8.8x:%4.4x"
829 						    " %8.8x:%4.4x key %x"
830 						    " which %x"
831 						    , v4->faddr, v4->fport
832 						    , v4->laddr, v4->lport
833 						    , vtw->key
834 						    , which));
835 
836 				} else {
837 					db_trace(KTR_VTW
838 						 , (vtw
839 						    , "vtw:!mis"
840 						    " key %8.8x != %8.8x"
841 						    " idx %x i %x which %x"
842 						    , vtw->key
843 						    , fatp_key(ctl->fat, fp, i)
844 						    , idx_decode(ctl, idx)
845 						    , i
846 						    , which));
847 				}
848 			} else {
849 				db_trace(KTR_VTW
850 					 , (fp
851 					    , "vtw:!mis free entry"
852 					    " idx %x vtw %p which %x"
853 					    , idx_decode(ctl, idx)
854 					    , vtw, which));
855 			}
856 		}
857 
858 		if (fp->nxt) {
859 			fp = fatp_next(ctl->fat, fp);
860 		} else {
861 			break;
862 		}
863 	}
864 	++vtw_stats.miss[which];
865 	vtw = 0;
866 out:
867 	if (fatps > vtw_stats.max_chain[which])
868 		vtw_stats.max_chain[which] = fatps;
869 	if (probes > vtw_stats.max_probe[which])
870 		vtw_stats.max_probe[which] = probes;
871 	if (losings > vtw_stats.max_loss[which])
872 		vtw_stats.max_loss[which] = losings;
873 
874 	return vtw;
875 }
876 
877 static vtw_t *
878 vtw_lookup_hash_v6(vtw_ctl_t *ctl, const struct in6_addr *faddr, uint16_t fport
879 				 , const struct in6_addr *laddr, uint16_t lport
880 				 , int which)
881 {
882 	vtw_v6_t	*v6;
883 	vtw_t		*vtw;
884 	uint32_t	tag;
885 	fatp_t		*fp;
886 	int		i;
887 	uint32_t	fatps = 0, probes = 0, losings = 0;
888 
889 	++vtw_stats.look[which];
890 
891 	if (!ctl || !ctl->fat)
892 		return 0;
893 
894 	if (which) {
895 		tag = v6_port_tag(lport);
896 		fp  = ctl->fat->port[tag & ctl->fat->mask];
897 	} else {
898 		tag = v6_tag(faddr, fport, laddr, lport);
899 		fp  = ctl->fat->hash[tag & ctl->fat->mask];
900 	}
901 
902 	while (fp && fp->inuse) {
903 		uint32_t	inuse = fp->inuse;
904 
905 		++fatps;
906 
907 		for (i = 0; inuse && i < fatp_ntags(); ++i) {
908 			uint32_t	idx;
909 
910 			if (!(inuse & (1 << i)))
911 				continue;
912 
913 			inuse ^= 1 << i;
914 
915 			++probes;
916 			++vtw_stats.probe[which];
917 
918 			idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
919 			vtw = vtw_from_index(ctl, idx);
920 
921 			db_trace(KTR_VTW
922 				 , (fp, "probe: %2d %6A:%4.4x %6A:%4.4x idx %x"
923 				    , i
924 				    , db_store(faddr, sizeof (*faddr)), fport
925 				    , db_store(laddr, sizeof (*laddr)), lport
926 				    , idx_decode(ctl, idx)));
927 
928 			if (!vtw) {
929 				/* Hopefully fast path.
930 				 */
931 				continue;
932 			}
933 
934 			v6 = (void*)vtw;
935 
936 			if (vtw_alive(vtw)
937 			    && ((which ? vtw->port_key : vtw->key)
938 				== fatp_key(ctl->fat, fp, i))
939 			    && v6->lport == lport
940 			    && (which
941 				|| (v6->fport == fport
942 				    && !bcmp(&v6->faddr, faddr, sizeof (*faddr))
943 				    && !bcmp(&v6->laddr, laddr
944 					     , sizeof (*laddr))))) {
945 				++vtw_stats.hit[which];
946 
947 				KASSERT(vtw->hashed);
948 				goto out;
949 			} else {
950 				++vtw_stats.losing[which];
951 				++losings;
952 			}
953 		}
954 
955 		if (fp->nxt) {
956 			fp = fatp_next(ctl->fat, fp);
957 		} else {
958 			break;
959 		}
960 	}
961 	++vtw_stats.miss[which];
962 	vtw = 0;
963 out:
964 	if (fatps > vtw_stats.max_chain[which])
965 		vtw_stats.max_chain[which] = fatps;
966 	if (probes > vtw_stats.max_probe[which])
967 		vtw_stats.max_probe[which] = probes;
968 	if (losings > vtw_stats.max_loss[which])
969 		vtw_stats.max_loss[which] = losings;
970 
971 	return vtw;
972 }
973 
974 /*!\brief port iterator
975  */
976 static vtw_t *
977 vtw_next_port_v4(struct tcp_ports_iterator *it)
978 {
979 	vtw_ctl_t	*ctl = it->ctl;
980 	vtw_v4_t	*v4;
981 	vtw_t		*vtw;
982 	uint32_t	tag;
983 	uint16_t	lport = it->port;
984 	fatp_t		*fp;
985 	int		i;
986 	uint32_t	fatps = 0, probes = 0, losings = 0;
987 
988 	tag = v4_port_tag(lport);
989 	if (!it->fp) {
990 		it->fp = ctl->fat->port[tag & ctl->fat->mask];
991 		it->slot_idx = 0;
992 	}
993 	fp  = it->fp;
994 
995 	while (fp) {
996 		uint32_t	inuse = fp->inuse;
997 
998 		++fatps;
999 
1000 		for (i = it->slot_idx; inuse && i < fatp_ntags(); ++i) {
1001 			uint32_t	idx;
1002 
1003 			if (!(inuse & (1 << i)))
1004 				continue;
1005 
1006 			inuse &= ~0 << i;
1007 
1008 			if (i < it->slot_idx)
1009 				continue;
1010 
1011 			++vtw_stats.probe[1];
1012 			++probes;
1013 
1014 			idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
1015 			vtw = vtw_from_index(ctl, idx);
1016 
1017 			if (!vtw) {
1018 				/* Hopefully fast path.
1019 				 */
1020 				continue;
1021 			}
1022 
1023 			v4 = (void*)vtw;
1024 
1025 			if (vtw_alive(vtw)
1026 			    && vtw->port_key == fatp_key(ctl->fat, fp, i)
1027 			    && v4->lport == lport) {
1028 				++vtw_stats.hit[1];
1029 
1030 				it->slot_idx = i + 1;
1031 
1032 				goto out;
1033 			} else if (vtw_alive(vtw)) {
1034 				++vtw_stats.losing[1];
1035 				++losings;
1036 
1037 				db_trace(KTR_VTW
1038 					 , (vtw, "vtw:!mis"
1039 					    " port %8.8x:%4.4x %8.8x:%4.4x"
1040 					    " key %x port %x"
1041 					    , v4->faddr, v4->fport
1042 					    , v4->laddr, v4->lport
1043 					    , vtw->key
1044 					    , lport));
1045 			} else {
1046 				/* Really losing here.  We are coming
1047 				 * up with references to free entries.
1048 				 * Might find it better to use
1049 				 * traditional, or need another
1050 				 * ad-hockery.  The other ad-hockery
1051 				 * would be to pull more into the
1052 				 * cache line to reject the false
1053 				 * hits.
1054 				 */
1055 				++vtw_stats.losing[1];
1056 				++losings;
1057 				db_trace(KTR_VTW
1058 					 , (fp, "vtw:!mis port %x"
1059 					    " - free entry idx %x vtw %p"
1060 					    , lport
1061 					    , idx_decode(ctl, idx)
1062 					    , vtw));
1063 			}
1064 		}
1065 
1066 		if (fp->nxt) {
1067 			it->fp = fp = fatp_next(ctl->fat, fp);
1068 			it->slot_idx = 0;
1069 		} else {
1070 			it->fp = 0;
1071 			break;
1072 		}
1073 	}
1074 	++vtw_stats.miss[1];
1075 
1076 	vtw = 0;
1077 out:
1078 	if (fatps > vtw_stats.max_chain[1])
1079 		vtw_stats.max_chain[1] = fatps;
1080 	if (probes > vtw_stats.max_probe[1])
1081 		vtw_stats.max_probe[1] = probes;
1082 	if (losings > vtw_stats.max_loss[1])
1083 		vtw_stats.max_loss[1] = losings;
1084 
1085 	return vtw;
1086 }
1087 
1088 /*!\brief port iterator
1089  */
1090 static vtw_t *
1091 vtw_next_port_v6(struct tcp_ports_iterator *it)
1092 {
1093 	vtw_ctl_t	*ctl = it->ctl;
1094 	vtw_v6_t	*v6;
1095 	vtw_t		*vtw;
1096 	uint32_t	tag;
1097 	uint16_t	lport = it->port;
1098 	fatp_t		*fp;
1099 	int		i;
1100 	uint32_t	fatps = 0, probes = 0, losings = 0;
1101 
1102 	tag = v6_port_tag(lport);
1103 	if (!it->fp) {
1104 		it->fp = ctl->fat->port[tag & ctl->fat->mask];
1105 		it->slot_idx = 0;
1106 	}
1107 	fp  = it->fp;
1108 
1109 	while (fp) {
1110 		uint32_t	inuse = fp->inuse;
1111 
1112 		++fatps;
1113 
1114 		for (i = it->slot_idx; inuse && i < fatp_ntags(); ++i) {
1115 			uint32_t	idx;
1116 
1117 			if (!(inuse & (1 << i)))
1118 				continue;
1119 
1120 			inuse &= ~0 << i;
1121 
1122 			if (i < it->slot_idx)
1123 				continue;
1124 
1125 			++vtw_stats.probe[1];
1126 			++probes;
1127 
1128 			idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
1129 			vtw = vtw_from_index(ctl, idx);
1130 
1131 			if (!vtw) {
1132 				/* Hopefully fast path.
1133 				 */
1134 				continue;
1135 			}
1136 
1137 			v6 = (void*)vtw;
1138 
1139 			db_trace(KTR_VTW
1140 				 , (vtw, "vtw: i %x idx %x fp->tag %x"
1141 				    " tag %x xtra %x"
1142 				    , i, idx_decode(ctl, idx)
1143 				    , fp->tag[i], tag, fatp_xtra[i]));
1144 
1145 			if (vtw_alive(vtw)
1146 			    && vtw->port_key == fatp_key(ctl->fat, fp, i)
1147 			    && v6->lport == lport) {
1148 				++vtw_stats.hit[1];
1149 
1150 				db_trace(KTR_VTW
1151 					 , (fp, "vtw: nxt port %P - %4.4x"
1152 					    " idx %x key %x"
1153 					    , lport, lport
1154 					    , idx_decode(ctl, idx), vtw->key));
1155 
1156 				it->slot_idx = i + 1;
1157 				goto out;
1158 			} else if (vtw_alive(vtw)) {
1159 				++vtw_stats.losing[1];
1160 
1161 				db_trace(KTR_VTW
1162 					 , (vtw, "vtw:!mis port %6A:%4.4x"
1163 					    " %6A:%4.4x key %x port %x"
1164 					    , db_store(&v6->faddr
1165 						       , sizeof (v6->faddr))
1166 					    , v6->fport
1167 					    , db_store(&v6->laddr
1168 						       , sizeof (v6->faddr))
1169 					    , v6->lport
1170 					    , vtw->key
1171 					    , lport));
1172 			} else {
1173 				/* Really losing here.  We are coming
1174 				 * up with references to free entries.
1175 				 * Might find it better to use
1176 				 * traditional, or need another
1177 				 * ad-hockery.  The other ad-hockery
1178 				 * would be to pull more into the
1179 				 * cache line to reject the false
1180 				 * hits.
1181 				 */
1182 				++vtw_stats.losing[1];
1183 				++losings;
1184 
1185 				db_trace(KTR_VTW
1186 					 , (fp
1187 					    , "vtw:!mis port %x"
1188 					    " - free entry idx %x vtw %p"
1189 					    , lport, idx_decode(ctl, idx)
1190 					    , vtw));
1191 			}
1192 		}
1193 
1194 		if (fp->nxt) {
1195 			it->fp = fp = fatp_next(ctl->fat, fp);
1196 			it->slot_idx = 0;
1197 		} else {
1198 			it->fp = 0;
1199 			break;
1200 		}
1201 	}
1202 	++vtw_stats.miss[1];
1203 
1204 	vtw = 0;
1205 out:
1206 	if (fatps > vtw_stats.max_chain[1])
1207 		vtw_stats.max_chain[1] = fatps;
1208 	if (probes > vtw_stats.max_probe[1])
1209 		vtw_stats.max_probe[1] = probes;
1210 	if (losings > vtw_stats.max_loss[1])
1211 		vtw_stats.max_loss[1] = losings;
1212 
1213 	return vtw;
1214 }
1215 
1216 /*!\brief initialise the VTW allocation arena
1217  *
1218  * There are 1+3 allocation classes:
1219  *	0	classless
1220  *	{1,2,3}	MSL-class based allocation
1221  *
1222  * The allocation arenas are all initialised.  Classless gets all the
1223  * space.  MSL-class based divides the arena, so that allocation
1224  * within a class can proceed without having to consider entries
1225  * (aka: cache lines) from different classes.
1226  *
1227  * Usually, we are completely classless or class-based, but there can be
1228  * transition periods, corresponding to dynamic adjustments in the config
1229  * by the operator.
1230  */
1231 static void
1232 vtw_init(fatp_ctl_t *fat, vtw_ctl_t *ctl, const uint32_t n, vtw_t *ctl_base_v)
1233 {
1234 	int class_n, i;
1235 	vtw_t	*base;
1236 
1237 	ctl->base.v = ctl_base_v;
1238 
1239 	if (ctl->is_v4) {
1240 		ctl->lim.v4    = ctl->base.v4 + n - 1;
1241 		ctl->alloc.v4  = ctl->base.v4;
1242 	} else {
1243 		ctl->lim.v6    = ctl->base.v6 + n - 1;
1244 		ctl->alloc.v6  = ctl->base.v6;
1245 	}
1246 
1247 	ctl->nfree  = n;
1248 	ctl->ctl    = ctl;
1249 
1250 	ctl->idx_bits = 32;
1251 	for (ctl->idx_mask = ~0; (ctl->idx_mask & (n-1)) == n-1; ) {
1252 		ctl->idx_mask >>= 1;
1253 		ctl->idx_bits  -= 1;
1254 	}
1255 
1256 	ctl->idx_mask <<= 1;
1257 	ctl->idx_mask  |= 1;
1258 	ctl->idx_bits  += 1;
1259 
1260 	ctl->fat = fat;
1261 	fat->vtw = ctl;
1262 
1263 	/* Divide the resources equally amongst the classes.
1264 	 * This is not optimal, as the different classes
1265 	 * arrive and leave at different rates, but it is
1266 	 * the best I can do for now.
1267 	 */
1268 	class_n = n / (VTW_NCLASS-1);
1269 	base    = ctl->base.v;
1270 
1271 	for (i = 1; i < VTW_NCLASS; ++i) {
1272 		int j;
1273 
1274 		ctl[i] = ctl[0];
1275 		ctl[i].clidx = i;
1276 
1277 		ctl[i].base.v = base;
1278 		ctl[i].alloc  = ctl[i].base;
1279 
1280 		for (j = 0; j < class_n - 1; ++j) {
1281 			if (tcp_msl_enable)
1282 				base->msl_class = i;
1283 			base = vtw_next(ctl, base);
1284 		}
1285 
1286 		ctl[i].lim.v = base;
1287 		base = vtw_next(ctl, base);
1288 		ctl[i].nfree = class_n;
1289 	}
1290 
1291 	vtw_debug_init();
1292 }
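/* Worked example with hypothetical sizes: for n == 30000 and
 * VTW_NCLASS == 4, class_n == 10000, so classes 1..3 each receive
 * 10000 contiguous entries while ctl[0] (classless) spans the whole
 * arena.  Allocation within one class thus never touches entries
 * (cache lines) belonging to another class.
 */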
1293 
1294 /*!\brief	map class to TCP MSL
1295  */
1296 static inline uint32_t
1297 class_to_msl(int class)
1298 {
1299 	switch (class) {
1300 	case 0:
1301 	case 1:
1302 		return tcp_msl_remote ? tcp_msl_remote : (TCPTV_MSL >> 0);
1303 	case 2:
1304 		return tcp_msl_local ? tcp_msl_local : (TCPTV_MSL >> 1);
1305 	default:
1306 		return tcp_msl_loop ? tcp_msl_loop : (TCPTV_MSL >> 2);
1307 	}
1308 }
1309 
1310 /*!\brief	map TCP MSL to class
1311  */
1312 static inline uint32_t
1313 msl_to_class(int msl)
1314 {
1315 	if (tcp_msl_enable) {
1316 		if (msl <= (tcp_msl_loop ? tcp_msl_loop : (TCPTV_MSL >> 2)))
1317 			return 1+2;
1318 		if (msl <= (tcp_msl_local ? tcp_msl_local : (TCPTV_MSL >> 1)))
1319 			return 1+1;
1320 		return 1;
1321 	}
1322 	return 0;
1323 }
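/* Worked example with tcp_msl_enable on and the tcp_msl_* sysctls at
 * their zero defaults: class 1 (remote) maps to TCPTV_MSL, class 2
 * (local) to TCPTV_MSL/2, class 3 (loopback) to TCPTV_MSL/4, and
 * msl_to_class() inverts this, e.g. msl_to_class(TCPTV_MSL >> 1) == 2.
 * Class 0 is the classless arena used while tcp_msl_enable is off.
 */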
1324 
1325 /*!\brief allocate a vtw entry
1326  */
1327 static inline vtw_t *
1328 vtw_alloc(vtw_ctl_t *ctl)
1329 {
1330 	vtw_t	*vtw	= 0;
1331 	int	stuck	= 0;
1332 	int	avail	= ctl ? (ctl->nalloc + ctl->nfree) : 0;
1333 	int	msl;
1334 
1335 	KASSERT(mutex_owned(softnet_lock));
1336 
1337 	/* If no resources, we will not get far.
1338 	 */
1339 	if (!ctl || !ctl->base.v4 || avail <= 0)
1340 		return 0;
1341 
1342 	/* Obtain a free one.
1343 	 */
1344 	while (!ctl->nfree) {
1345 		vtw_age(ctl, 0);
1346 
1347 		if (++stuck > avail) {
1348 			/* When in transition between
1349 			 * schemes (classless, classed) we
1350 			 * can be stuck having to await the
1351 			 * expiration of cross-allocated entries.
1352 			 *
1353 			 * Returning zero means we will fall back to the
1354 			 * traditional TIME_WAIT handling, except in the
1355 			 * case of a re-sched, in which case we cannot
1356 			 * perform the re-sched, but will retain the extant
1357 			 * entry.
1358 			 */
1359 			db_trace(KTR_VTW
1360 				 , (ctl, "vtw:!none free in class %x %x/%x"
1361 				    , ctl->clidx
1362 				    , ctl->nalloc, ctl->nfree));
1363 
1364 			return 0;
1365 		}
1366 	}
1367 
1368 	vtw = ctl->alloc.v;
1369 
1370 	if (vtw->msl_class != ctl->clidx) {
1371 		/* Usurping rules:
1372 		 * 	0 -> {1,2,3} or {1,2,3} -> 0
1373 		 */
1374 		KASSERT(!vtw->msl_class || !ctl->clidx);
1375 
1376 		if (vtw->hashed || vtw->expire.tv_sec) {
1377 		    /* As this is owned by some other class,
1378 		     * we must wait for it to expire.
1379 		     * This will only happen on class/classless
1380 		     * transitions, which are guaranteed to progress
1381 		     * to completion in small finite time, barring bugs.
1382 		     */
1383 		    db_trace(KTR_VTW
1384 			     , (ctl, "vtw:!%p class %x!=%x %x:%x%s"
1385 				, vtw, vtw->msl_class, ctl->clidx
1386 				, vtw->expire.tv_sec
1387 				, vtw->expire.tv_usec
1388 				, vtw->hashed ? " hashed" : ""));
1389 
1390 		    return 0;
1391 		}
1392 
1393 		db_trace(KTR_VTW
1394 			 , (ctl, "vtw:!%p usurped from %x to %x"
1395 			    , vtw, vtw->msl_class, ctl->clidx));
1396 
1397 		vtw->msl_class = ctl->clidx;
1398 	}
1399 
1400 	if (vtw_alive(vtw)) {
1401 		KASSERT(0 && "next free not free");
1402 		return 0;
1403 	}
1404 
1405 	/* Advance allocation pointer.
1406 	 */
1407 	ctl->alloc.v = vtw_next(ctl, vtw);
1408 
1409 	--ctl->nfree;
1410 	++ctl->nalloc;
1411 
1412 	msl = (2 * class_to_msl(ctl->clidx) * 1000) / PR_SLOWHZ;	// msec
1413 
1414 	/* mark expiration
1415 	 */
1416 	getmicrouptime(&vtw->expire);
1417 
1418 	/* Move expiration into the future.
1419 	 */
1420 	vtw->expire.tv_sec  += msl / 1000;
1421 	vtw->expire.tv_usec += 1000 * (msl % 1000);
1422 
1423 	while (vtw->expire.tv_usec >= 1000*1000) {
1424 		vtw->expire.tv_usec -= 1000*1000;
1425 		vtw->expire.tv_sec  += 1;
1426 	}
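	/* Arithmetic check, assuming the usual PR_SLOWHZ of 2 and the
	 * class-1 default of TCPTV_MSL (30 seconds, i.e. 60 ticks):
	 * msl above is 2 * 60 * 1000 / 2 == 60000 msec, so the entry
	 * expires 2*MSL after allocation -- the classical TIME_WAIT
	 * duration.
	 */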
1427 
1428 	if (!ctl->oldest.v)
1429 		ctl->oldest.v = vtw;
1430 
1431 	return vtw;
1432 }
1433 
1434 /*!\brief expiration
1435  */
1436 static int
1437 vtw_age(vtw_ctl_t *ctl, struct timeval *_when)
1438 {
1439 	vtw_t	*vtw;
1440 	struct timeval then, *when = _when;
1441 	int	maxtries = 0;
1442 
1443 	if (!ctl->oldest.v) {
1444 		KASSERT(!ctl->nalloc);
1445 		return 0;
1446 	}
1447 
1448 	for (vtw = ctl->oldest.v; vtw && ctl->nalloc; ) {
1449 		if (++maxtries > ctl->nalloc)
1450 			break;
1451 
1452 		if (vtw->msl_class != ctl->clidx) {
1453 			db_trace(KTR_VTW
1454 				 , (vtw, "vtw:!age class mismatch %x != %x"
1455 				    , vtw->msl_class, ctl->clidx));
1456 			/* XXXX
1457 			 * See if the appropriate action is to skip to the next.
1458 			 * XXXX
1459 			 */
1460 			ctl->oldest.v = vtw = vtw_next(ctl, vtw);
1461 			continue;
1462 		}
1463 		if (!when) {
1464 			/* Latch oldest timeval if none specified.
1465 			 */
1466 			then = vtw->expire;
1467 			when = &then;
1468 		}
1469 
1470 		if (!timercmp(&vtw->expire, when, <=))
1471 			break;
1472 
1473 		db_trace(KTR_VTW
1474 			 , (vtw, "vtw: expire %x %8.8x:%8.8x %x/%x"
1475 			    , ctl->clidx
1476 			    , vtw->expire.tv_sec
1477 			    , vtw->expire.tv_usec
1478 			    , ctl->nalloc
1479 			    , ctl->nfree));
1480 
1481 		if (!_when)
1482 			++vtw_stats.kill;
1483 
1484 		vtw_del(ctl, vtw);
1485 		vtw = ctl->oldest.v;
1486 	}
1487 
1488 	return ctl->nalloc;	// # remaining allocated
1489 }
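/* vtw_age() is used two ways: the allocation paths pass a NULL
 * timeval, which latches the oldest entry's own expiry and therefore
 * reclaims at least that entry, while vtw_tick() passes the current
 * time and reclaims everything already expired.
 */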
1490 
1491 static callout_t vtw_cs;
1492 
1493 /*!\brief notice the passage of time.
1494  * It seems to be getting faster.  What happened to the year?
1495  */
1496 static void
1497 vtw_tick(void *arg)
1498 {
1499 	struct timeval now;
1500 	int i, cnt = 0;
1501 
1502 	getmicrouptime(&now);
1503 
1504 	db_trace(KTR_VTW, (arg, "vtk: tick - now %8.8x:%8.8x"
1505 			   , now.tv_sec, now.tv_usec));
1506 
1507 	mutex_enter(softnet_lock);
1508 
1509 	for (i = 0; i < VTW_NCLASS; ++i) {
1510 		cnt += vtw_age(&vtw_tcpv4[i], &now);
1511 		cnt += vtw_age(&vtw_tcpv6[i], &now);
1512 	}
1513 
1514 	/* Keep ticks coming while we need them.
1515 	 */
1516 	if (cnt)
1517 		callout_schedule(&vtw_cs, hz / 5);
1518 	else {
1519 		tcp_vtw_was_enabled = 0;
1520 		tcbtable.vestige    = 0;
1521 	}
1522 	mutex_exit(softnet_lock);
1523 }
1524 
1525 /* in_pcblookup_ports assist for handling vestigial entries.
1526  */
1527 static void *
1528 tcp_init_ports_v4(struct in_addr addr, u_int port, int wild)
1529 {
1530 	struct tcp_ports_iterator *it = &tcp_ports_iterator_v4;
1531 
1532 	bzero(it, sizeof (*it));
1533 
1534 	/* Note: the reference to vtw_tcpv4[0] is fine.
1535 	 * We do not need per-class iteration.  We just
1536 	 * need to get to the fat, and there is one
1537 	 * shared fat.
1538 	 */
1539 	if (vtw_tcpv4[0].fat) {
1540 		it->addr.v4 = addr;
1541 		it->port = port;
1542 		it->wild = !!wild;
1543 		it->ctl  = &vtw_tcpv4[0];
1544 
1545 		++vtw_stats.look[1];
1546 	}
1547 
1548 	return it;
1549 }
1550 
1551 /*!\brief export an IPv4 vtw.
1552  */
1553 static int
1554 vtw_export_v4(vtw_ctl_t *ctl, vtw_t *vtw, vestigial_inpcb_t *res)
1555 {
1556 	vtw_v4_t	*v4 = (void*)vtw;
1557 
1558 	bzero(res, sizeof (*res));
1559 
1560 	if (ctl && vtw) {
1561 		if (!ctl->clidx && vtw->msl_class)
1562 			ctl += vtw->msl_class;
1563 		else
1564 			KASSERT(ctl->clidx == vtw->msl_class);
1565 
1566 		res->valid = 1;
1567 		res->v4    = 1;
1568 
1569 		res->faddr.v4.s_addr = v4->faddr;
1570 		res->laddr.v4.s_addr = v4->laddr;
1571 		res->fport	= v4->fport;
1572 		res->lport	= v4->lport;
1573 		res->vtw	= vtw;		// netlock held over call(s)
1574 		res->ctl	= ctl;
1575 		res->reuse_addr = vtw->reuse_addr;
1576 		res->reuse_port = vtw->reuse_port;
1577 		res->snd_nxt    = vtw->snd_nxt;
1578 		res->rcv_nxt	= vtw->rcv_nxt;
1579 		res->rcv_wnd	= vtw->rcv_wnd;
1580 		res->uid	= vtw->uid;
1581 	}
1582 
1583 	return res->valid;
1584 }
1585 
1586 /*!\brief return next port in the port iterator.  yowza.
1587  */
1588 static int
1589 tcp_next_port_v4(void *arg, struct vestigial_inpcb *res)
1590 {
1591 	struct tcp_ports_iterator *it = arg;
1592 	vtw_t		*vtw = 0;
1593 
1594 	if (it->ctl)
1595 		vtw = vtw_next_port_v4(it);
1596 
1597 	if (!vtw)
1598 		it->ctl = 0;
1599 
1600 	return vtw_export_v4(it->ctl, vtw, res);
1601 }
1602 
1603 static int
1604 tcp_lookup_v4(struct in_addr faddr, uint16_t fport,
1605               struct in_addr laddr, uint16_t lport,
1606 	      struct vestigial_inpcb *res)
1607 {
1608 	vtw_t		*vtw;
1609 	vtw_ctl_t	*ctl;
1610 
1611 
1612 	db_trace(KTR_VTW
1613 		 , (res, "vtw: lookup %A:%P %A:%P"
1614 		    , faddr, fport
1615 		    , laddr, lport));
1616 
1617 	vtw = vtw_lookup_hash_v4((ctl = &vtw_tcpv4[0])
1618 				 , faddr.s_addr, fport
1619 				 , laddr.s_addr, lport, 0);
1620 
1621 	return vtw_export_v4(ctl, vtw, res);
1622 }
1623 
1624 /* in_pcblookup_ports assist for handling vestigial entries.
1625  */
1626 static void *
1627 tcp_init_ports_v6(const struct in6_addr *addr, u_int port, int wild)
1628 {
1629 	struct tcp_ports_iterator *it = &tcp_ports_iterator_v6;
1630 
1631 	bzero(it, sizeof (*it));
1632 
1633 	/* Note: the reference to vtw_tcpv6[0] is fine.
1634 	 * We do not need per-class iteration.  We just
1635 	 * need to get to the fat, and there is one
1636 	 * shared fat.
1637 	 */
1638 	if (vtw_tcpv6[0].fat) {
1639 		it->addr.v6 = *addr;
1640 		it->port = port;
1641 		it->wild = !!wild;
1642 		it->ctl  = &vtw_tcpv6[0];
1643 
1644 		++vtw_stats.look[1];
1645 	}
1646 
1647 	return it;
1648 }
1649 
1650 /*!\brief export an IPv6 vtw.
1651  */
1652 static int
1653 vtw_export_v6(vtw_ctl_t *ctl, vtw_t *vtw, vestigial_inpcb_t *res)
1654 {
1655 	vtw_v6_t	*v6 = (void*)vtw;
1656 
1657 	bzero(res, sizeof (*res));
1658 
1659 	if (ctl && vtw) {
1660 		if (!ctl->clidx && vtw->msl_class)
1661 			ctl += vtw->msl_class;
1662 		else
1663 			KASSERT(ctl->clidx == vtw->msl_class);
1664 
1665 		res->valid = 1;
1666 		res->v4    = 0;
1667 
1668 		res->faddr.v6	= v6->faddr;
1669 		res->laddr.v6	= v6->laddr;
1670 		res->fport	= v6->fport;
1671 		res->lport	= v6->lport;
1672 		res->vtw	= vtw;		// netlock held over call(s)
1673 		res->ctl	= ctl;
1674 
1675 		res->v6only	= vtw->v6only;
1676 		res->reuse_addr = vtw->reuse_addr;
1677 		res->reuse_port = vtw->reuse_port;
1678 
1679 		res->snd_nxt    = vtw->snd_nxt;
1680 		res->rcv_nxt	= vtw->rcv_nxt;
1681 		res->rcv_wnd	= vtw->rcv_wnd;
1682 		res->uid	= vtw->uid;
1683 	}
1684 
1685 	return res->valid;
1686 }
1687 
1688 static int
1689 tcp_next_port_v6(void *arg, struct vestigial_inpcb *res)
1690 {
1691 	struct tcp_ports_iterator *it = arg;
1692 	vtw_t		*vtw = 0;
1693 
1694 	if (it->ctl)
1695 		vtw = vtw_next_port_v6(it);
1696 
1697 	if (!vtw)
1698 		it->ctl = 0;
1699 
1700 	return vtw_export_v6(it->ctl, vtw, res);
1701 }
1702 
1703 static int
1704 tcp_lookup_v6(const struct in6_addr *faddr, uint16_t fport,
1705               const struct in6_addr *laddr, uint16_t lport,
1706 	      struct vestigial_inpcb *res)
1707 {
1708 	vtw_ctl_t	*ctl;
1709 	vtw_t		*vtw;
1710 
1711 	db_trace(KTR_VTW
1712 		 , (res, "vtw: lookup %6A:%P %6A:%P"
1713 		    , db_store(faddr, sizeof (*faddr)), fport
1714 		    , db_store(laddr, sizeof (*laddr)), lport));
1715 
1716 	vtw = vtw_lookup_hash_v6((ctl = &vtw_tcpv6[0])
1717 				 , faddr, fport
1718 				 , laddr, lport, 0);
1719 
1720 	return vtw_export_v6(ctl, vtw, res);
1721 }
1722 
1723 static vestigial_hooks_t tcp_hooks = {
1724 	.init_ports4	= tcp_init_ports_v4,
1725 	.next_port4	= tcp_next_port_v4,
1726 	.lookup4	= tcp_lookup_v4,
1727 	.init_ports6	= tcp_init_ports_v6,
1728 	.next_port6	= tcp_next_port_v6,
1729 	.lookup6	= tcp_lookup_v6,
1730 };
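/* Usage sketch (hypothetical caller, mirroring the VTW_DEBUG checks in
 * vtw_add() below): a PCB lookup that finds no live inpcb can consult
 * these hooks through tcbtable.vestige, e.g.
 *
 *	struct vestigial_inpcb res;
 *	void *it = (*tcbtable.vestige->init_ports4)(laddr, lport, 0);
 *
 *	while ((*tcbtable.vestige->next_port4)(it, &res)) {
 *		... res describes one vestigial TIME_WAIT pcb ...
 *	}
 */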
1731 
1732 static bool
1733 vtw_select(int af, fatp_ctl_t **fatp, vtw_ctl_t **ctlp)
1734 {
1735 	fatp_ctl_t	*fat;
1736 	vtw_ctl_t	*ctl;
1737 
1738 	switch (af) {
1739 	case AF_INET:
1740 		fat = &fat_tcpv4;
1741 		ctl = &vtw_tcpv4[0];
1742 		break;
1743 	case AF_INET6:
1744 		fat = &fat_tcpv6;
1745 		ctl = &vtw_tcpv6[0];
1746 		break;
1747 	default:
1748 		return false;
1749 	}
1750 	if (fatp != NULL)
1751 		*fatp = fat;
1752 	if (ctlp != NULL)
1753 		*ctlp = ctl;
1754 	return true;
1755 }
1756 
1757 /*!\brief	initialize controlling instance
1758  */
1759 static int
1760 vtw_control_init(int af)
1761 {
1762 	fatp_ctl_t	*fat;
1763 	vtw_ctl_t	*ctl;
1764 	fatp_t		*fat_base;
1765 	fatp_t		**fat_hash;
1766 	vtw_t		*ctl_base_v;
1767 	uint32_t	n, m;
1768 	size_t sz;
1769 
1770 	KASSERT(powerof2(tcp_vtw_entries));
1771 
1772 	if (!vtw_select(af, &fat, &ctl))
1773 		return EAFNOSUPPORT;
1774 
1775 	if (fat->hash != NULL) {
1776 		KASSERT(fat->base != NULL && ctl->base.v != NULL);
1777 		return 0;
1778 	}
1779 
1780 	/* Allocate 10% more capacity in the fat pointers.
1781 	 * We should only need ~#hash additional based on
1782 	 * how they age, but TIME_WAIT assassination could cause
1783 	 * sparse fat pointer utilisation.
1784 	 */
1785 	m = 512;
1786 	n = 2*m + (11 * (tcp_vtw_entries / fatp_ntags())) / 10;
1787 	sz = (ctl->is_v4 ? sizeof(vtw_v4_t) : sizeof(vtw_v6_t));
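	/* Worked example with hypothetical values (tcp_vtw_entries of
	 * 32768, fatp_ntags() of 15): n = 1024 + (11 * 2184) / 10
	 * == 3426 fat pointers per hash -- two per hash bucket plus
	 * ~10% over the 2184 that would suffice if every fatp_t were
	 * filled completely.
	 */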
1788 
1789 	fat_hash = kmem_zalloc(2*m * sizeof(fatp_t *), KM_NOSLEEP);
1790 
1791 	if (fat_hash == NULL) {
1792 		printf("%s: could not allocate %zu bytes for "
1793 		    "hash anchors", __func__, 2*m * sizeof(fatp_t *));
1794 		return ENOMEM;
1795 	}
1796 
1797 	fat_base = kmem_zalloc(2*n * sizeof(fatp_t), KM_NOSLEEP);
1798 
1799 	if (fat_base == NULL) {
1800 		kmem_free(fat_hash, 2*m * sizeof (fatp_t *));
1801 		printf("%s: could not allocate %zu bytes for "
1802 		    "fatp_t array", __func__, 2*n * sizeof(fatp_t));
1803 		return ENOMEM;
1804 	}
1805 
1806 	ctl_base_v = kmem_zalloc(tcp_vtw_entries * sz, KM_NOSLEEP);
1807 
1808 	if (ctl_base_v == NULL) {
1809 		kmem_free(fat_hash, 2*m * sizeof (fatp_t *));
1810 		kmem_free(fat_base, 2*n * sizeof(fatp_t));
1811 		printf("%s: could not allocate %zu bytes for "
1812 		    "vtw_t array", __func__, tcp_vtw_entries * sz);
1813 		return ENOMEM;
1814 	}
1815 
1816 	fatp_init(fat, n, m, fat_base, fat_hash);
1817 
1818 	vtw_init(fat, ctl, tcp_vtw_entries, ctl_base_v);
1819 
1820 	return 0;
1821 }
1822 
1823 /*!\brief	select controlling instance
1824  */
1825 static vtw_ctl_t *
1826 vtw_control(int af, uint32_t msl)
1827 {
1828 	fatp_ctl_t	*fat;
1829 	vtw_ctl_t	*ctl;
1830 	int		class	= msl_to_class(msl);
1831 
1832 	if (!vtw_select(af, &fat, &ctl))
1833 		return NULL;
1834 
1835 	if (!fat->base || !ctl->base.v)
1836 		return NULL;
1837 
1838 	if (!tcp_vtw_was_enabled) {
1839 		/* This guarantees timer ticks until we no longer need them.
1840 		 */
1841 		tcp_vtw_was_enabled = 1;
1842 
1843 		callout_schedule(&vtw_cs, hz / 5);
1844 
1845 		tcbtable.vestige = &tcp_hooks;
1846 	}
1847 
1848 	return ctl + class;
1849 }
1850 
1851 /*!\brief	add TCP pcb to vestigial timewait
1852  */
1853 int
1854 vtw_add(int af, struct tcpcb *tp)
1855 {
1856 	int		enable;
1857 	vtw_ctl_t	*ctl;
1858 	vtw_t		*vtw;
1859 
1860 	KASSERT(mutex_owned(softnet_lock));
1861 
1862 	ctl = vtw_control(af, tp->t_msl);
1863 	if (!ctl)
1864 		return 0;
1865 
1866 	enable = (af == AF_INET) ? tcp4_vtw_enable : tcp6_vtw_enable;
1867 
1868 	vtw = vtw_alloc(ctl);
1869 
1870 	if (vtw) {
1871 		vtw->snd_nxt = tp->snd_nxt;
1872 		vtw->rcv_nxt = tp->rcv_nxt;
1873 
1874 		switch (af) {
1875 		case AF_INET: {
1876 			struct inpcb	*inp = tp->t_inpcb;
1877 			vtw_v4_t	*v4  = (void*)vtw;
1878 
1879 			v4->faddr = inp->inp_faddr.s_addr;
1880 			v4->laddr = inp->inp_laddr.s_addr;
1881 			v4->fport = inp->inp_fport;
1882 			v4->lport = inp->inp_lport;
1883 
1884 			vtw->reuse_port = !!(inp->inp_socket->so_options
1885 					     & SO_REUSEPORT);
1886 			vtw->reuse_addr = !!(inp->inp_socket->so_options
1887 					     & SO_REUSEADDR);
1888 			vtw->v6only	= 0;
1889 			vtw->uid	= inp->inp_socket->so_uidinfo->ui_uid;
1890 
1891 			vtw_inshash_v4(ctl, vtw);
1892 
1893 
1894 #ifdef VTW_DEBUG
1895 			/* Immediate lookup (connected and port) to
1896 			 * ensure at least that works!
1897 			 */
1898 			if (enable & 4) {
1899 				KASSERT(vtw_lookup_hash_v4
1900 					(ctl
1901 					 , inp->inp_faddr.s_addr, inp->inp_fport
1902 					 , inp->inp_laddr.s_addr, inp->inp_lport
1903 					 , 0)
1904 					== vtw);
1905 				KASSERT(vtw_lookup_hash_v4
1906 					(ctl
1907 					 , inp->inp_faddr.s_addr, inp->inp_fport
1908 					 , inp->inp_laddr.s_addr, inp->inp_lport
1909 					 , 1));
1910 			}
1911 			/* Immediate port iterator functionality check: not wild
1912 			 */
1913 			if (enable & 8) {
1914 				struct tcp_ports_iterator *it;
1915 				struct vestigial_inpcb res;
1916 				int cnt = 0;
1917 
1918 				it = tcp_init_ports_v4(inp->inp_laddr
1919 						       , inp->inp_lport, 0);
1920 
1921 				while (tcp_next_port_v4(it, &res)) {
1922 					++cnt;
1923 				}
1924 				KASSERT(cnt);
1925 			}
1926 			/* Immediate port iterator functionality check: wild
1927 			 */
1928 			if (enable & 16) {
1929 				struct tcp_ports_iterator *it;
1930 				struct vestigial_inpcb res;
1931 				struct in_addr any;
1932 				int cnt = 0;
1933 
1934 				any.s_addr = htonl(INADDR_ANY);
1935 
1936 				it = tcp_init_ports_v4(any, inp->inp_lport, 1);
1937 
1938 				while (tcp_next_port_v4(it, &res)) {
1939 					++cnt;
1940 				}
1941 				KASSERT(cnt);
1942 			}
1943 #endif /* VTW_DEBUG */
1944 			break;
1945 		}
1946 
1947 		case AF_INET6: {
1948 			struct in6pcb	*inp = tp->t_in6pcb;
1949 			vtw_v6_t	*v6  = (void*)vtw;
1950 
1951 			v6->faddr = inp->in6p_faddr;
1952 			v6->laddr = inp->in6p_laddr;
1953 			v6->fport = inp->in6p_fport;
1954 			v6->lport = inp->in6p_lport;
1955 
1956 			vtw->reuse_port = !!(inp->in6p_socket->so_options
1957 					     & SO_REUSEPORT);
1958 			vtw->reuse_addr = !!(inp->in6p_socket->so_options
1959 					     & SO_REUSEADDR);
1960 			vtw->v6only	= !!(inp->in6p_flags
1961 					     & IN6P_IPV6_V6ONLY);
1962 			vtw->uid	= inp->in6p_socket->so_uidinfo->ui_uid;
1963 
1964 			vtw_inshash_v6(ctl, vtw);
1965 #ifdef VTW_DEBUG
1966 			/* Immediate lookup (connected and port) to
1967 			 * ensure at least that works!
1968 			 */
1969 			if (enable & 4) {
1970 				KASSERT(vtw_lookup_hash_v6(ctl
1971 					 , &inp->in6p_faddr, inp->in6p_fport
1972 					 , &inp->in6p_laddr, inp->in6p_lport
1973 					 , 0)
1974 					== vtw);
1975 				KASSERT(vtw_lookup_hash_v6
1976 					(ctl
1977 					 , &inp->in6p_faddr, inp->in6p_fport
1978 					 , &inp->in6p_laddr, inp->in6p_lport
1979 					 , 1));
1980 			}
1981 			/* Immediate port iterator functionality check: not wild
1982 			 */
1983 			if (enable & 8) {
1984 				struct tcp_ports_iterator *it;
1985 				struct vestigial_inpcb res;
1986 				int cnt = 0;
1987 
1988 				it = tcp_init_ports_v6(&inp->in6p_laddr
1989 						       , inp->in6p_lport, 0);
1990 
1991 				while (tcp_next_port_v6(it, &res)) {
1992 					++cnt;
1993 				}
1994 				KASSERT(cnt);
1995 			}
1996 			/* Immediate port iterator functionality check: wild
1997 			 */
1998 			if (enable & 16) {
1999 				struct tcp_ports_iterator *it;
2000 				struct vestigial_inpcb res;
2001 				static struct in6_addr any = IN6ADDR_ANY_INIT;
2002 				int cnt = 0;
2003 
2004 				it = tcp_init_ports_v6(&any
2005 						       , inp->in6p_lport, 1);
2006 
2007 				while (tcp_next_port_v6(it, &res)) {
2008 					++cnt;
2009 				}
2010 				KASSERT(cnt);
2011 			}
2012 #endif /* VTW_DEBUG */
2013 			break;
2014 		}
2015 		}
2016 
2017 		tcp_canceltimers(tp);
2018 		tp = tcp_close(tp);
2019 		KASSERT(!tp);
2020 
2021 		return 1;
2022 	}
2023 
2024 	return 0;
2025 }
2026 
2027 /*!\brief	restart timer for vestigial time-wait entry
2028  */
2029 static void
2030 vtw_restart_v4(vestigial_inpcb_t *vp)
2031 {
2032 	vtw_v4_t	copy = *(vtw_v4_t*)vp->vtw;
2033 	vtw_t		*vtw;
2034 	vtw_t		*cp  = &copy.common;
2035 	vtw_ctl_t	*ctl;
2036 
2037 	KASSERT(mutex_owned(softnet_lock));
2038 
2039 	db_trace(KTR_VTW
2040 		 , (vp->vtw, "vtw: restart %A:%P %A:%P"
2041 		    , vp->faddr.v4.s_addr, vp->fport
2042 		    , vp->laddr.v4.s_addr, vp->lport));
2043 
2044 	/* Class might have changed, so have a squiz.
2045 	 */
2046 	ctl = vtw_control(AF_INET, class_to_msl(cp->msl_class));
2047 	vtw = vtw_alloc(ctl);
2048 
2049 	if (vtw) {
2050 		vtw_v4_t	*v4  = (void*)vtw;
2051 
2052 		/* Safe now to unhash the old entry
2053 		 */
2054 		vtw_del(vp->ctl, vp->vtw);
2055 
2056 		vtw->snd_nxt = cp->snd_nxt;
2057 		vtw->rcv_nxt = cp->rcv_nxt;
2058 
2059 		v4->faddr = copy.faddr;
2060 		v4->laddr = copy.laddr;
2061 		v4->fport = copy.fport;
2062 		v4->lport = copy.lport;
2063 
2064 		vtw->reuse_port = cp->reuse_port;
2065 		vtw->reuse_addr = cp->reuse_addr;
2066 		vtw->v6only	= 0;
2067 		vtw->uid	= cp->uid;
2068 
2069 		vtw_inshash_v4(ctl, vtw);
2070 	}
2071 
2072 	vp->valid = 0;
2073 }
2074 
2075 /*!\brief	restart timer for vestigial time-wait entry
2076  */
2077 static void
2078 vtw_restart_v6(vestigial_inpcb_t *vp)
2079 {
2080 	vtw_v6_t	copy = *(vtw_v6_t*)vp->vtw;
2081 	vtw_t		*vtw;
2082 	vtw_t		*cp  = &copy.common;
2083 	vtw_ctl_t	*ctl;
2084 
2085 	KASSERT(mutex_owned(softnet_lock));
2086 
2087 	db_trace(KTR_VTW
2088 		 , (vp->vtw, "vtw: restart %6A:%P %6A:%P"
2089 		    , db_store(&vp->faddr.v6, sizeof (vp->faddr.v6))
2090 		    , vp->fport
2091 		    , db_store(&vp->laddr.v6, sizeof (vp->laddr.v6))
2092 		    , vp->lport));
2093 
2094 	/* Class might have changed, so have a squiz.
2095 	 */
2096 	ctl = vtw_control(AF_INET6, class_to_msl(cp->msl_class));
2097 	vtw = vtw_alloc(ctl);
2098 
2099 	if (vtw) {
2100 		vtw_v6_t	*v6  = (void*)vtw;
2101 
2102 		/* Safe now to unhash the old entry
2103 		 */
2104 		vtw_del(vp->ctl, vp->vtw);
2105 
2106 		vtw->snd_nxt = cp->snd_nxt;
2107 		vtw->rcv_nxt = cp->rcv_nxt;
2108 
2109 		v6->faddr = copy.faddr;
2110 		v6->laddr = copy.laddr;
2111 		v6->fport = copy.fport;
2112 		v6->lport = copy.lport;
2113 
2114 		vtw->reuse_port = cp->reuse_port;
2115 		vtw->reuse_addr = cp->reuse_addr;
2116 		vtw->v6only	= cp->v6only;
2117 		vtw->uid	= cp->uid;
2118 
2119 		vtw_inshash_v6(ctl, vtw);
2120 	}
2121 
2122 	vp->valid = 0;
2123 }
2124 
2125 /*!\brief	restart timer for vestigial time-wait entry
2126  */
2127 void
2128 vtw_restart(vestigial_inpcb_t *vp)
2129 {
2130 	if (!vp || !vp->valid)
2131 		return;
2132 
2133 	if (vp->v4)
2134 		vtw_restart_v4(vp);
2135 	else
2136 		vtw_restart_v6(vp);
2137 }
2138 
2139 int
2140 sysctl_tcp_vtw_enable(SYSCTLFN_ARGS)
2141 {
2142 	int en, rc;
2143 	struct sysctlnode node;
2144 
2145 	node = *rnode;
2146 	en = *(int *)rnode->sysctl_data;
2147 	node.sysctl_data = &en;
2148 
2149 	rc = sysctl_lookup(SYSCTLFN_CALL(&node));
2150 	if (rc != 0 || newp == NULL)
2151 		return rc;
2152 
2153 	if (rnode->sysctl_data != &tcp4_vtw_enable &&
2154 	    rnode->sysctl_data != &tcp6_vtw_enable)
2155 		rc = ENOENT;
2156 	else if ((en & 1) == 0)
2157 		rc = 0;
2158 	else if (rnode->sysctl_data == &tcp4_vtw_enable)
2159 		rc = vtw_control_init(AF_INET);
2160 	else /* rnode->sysctl_data == &tcp6_vtw_enable */
2161 		rc = vtw_control_init(AF_INET6);
2162 
2163 	if (rc == 0)
2164 		*(int *)rnode->sysctl_data = en;
2165 
2166 	return rc;
2167 }
2168 
2169 int
2170 vtw_earlyinit(void)
2171 {
2172 	int i, rc;
2173 
2174 	callout_init(&vtw_cs, 0);
2175 	callout_setfunc(&vtw_cs, vtw_tick, 0);
2176 
2177 	for (i = 0; i < VTW_NCLASS; ++i) {
2178 		vtw_tcpv4[i].is_v4 = 1;
2179 		vtw_tcpv6[i].is_v6 = 1;
2180 	}
2181 
2182 	if ((tcp4_vtw_enable & 1) != 0 &&
2183 	    (rc = vtw_control_init(AF_INET)) != 0)
2184 		return rc;
2185 
2186 	if ((tcp6_vtw_enable & 1) != 0 &&
2187 	    (rc = vtw_control_init(AF_INET6)) != 0)
2188 		return rc;
2189 
2190 	return 0;
2191 }
2192 
2193 #ifdef VTW_DEBUG
2194 #include <sys/syscallargs.h>
2195 #include <sys/sysctl.h>
2196 
2197 /*!\brief	add lalp, fafp entries for debug
2198  */
2199 int
2200 vtw_debug_add(int af, sin_either_t *la, sin_either_t *fa, int msl, int class)
2201 {
2202 	vtw_ctl_t	*ctl;
2203 	vtw_t		*vtw;
2204 
2205 	ctl = vtw_control(af, msl ? msl : class_to_msl(class));
2206 	if (!ctl)
2207 		return 0;
2208 
2209 	vtw = vtw_alloc(ctl);
2210 
2211 	if (vtw) {
2212 		vtw->snd_nxt = 0;
2213 		vtw->rcv_nxt = 0;
2214 
2215 		switch (af) {
2216 		case AF_INET: {
2217 			vtw_v4_t	*v4  = (void*)vtw;
2218 
2219 			v4->faddr = fa->sin_addr.v4.s_addr;
2220 			v4->laddr = la->sin_addr.v4.s_addr;
2221 			v4->fport = fa->sin_port;
2222 			v4->lport = la->sin_port;
2223 
2224 			vtw->reuse_port = 1;
2225 			vtw->reuse_addr = 1;
2226 			vtw->v6only	= 0;
2227 			vtw->uid	= 0;
2228 
2229 			vtw_inshash_v4(ctl, vtw);
2230 			break;
2231 		}
2232 
2233 		case AF_INET6: {
2234 			vtw_v6_t	*v6  = (void*)vtw;
2235 
2236 			v6->faddr = fa->sin_addr.v6;
2237 			v6->laddr = la->sin_addr.v6;
2238 
2239 			v6->fport = fa->sin_port;
2240 			v6->lport = la->sin_port;
2241 
2242 			vtw->reuse_port = 1;
2243 			vtw->reuse_addr = 1;
2244 			vtw->v6only	= 0;
2245 			vtw->uid	= 0;
2246 
2247 			vtw_inshash_v6(ctl, vtw);
2248 			break;
2249 		}
2250 
2251 		default:
2252 			break;
2253 		}
2254 
2255 		return 1;
2256 	}
2257 
2258 	return 0;
2259 }
2260 
2261 static int vtw_syscall = 0;
2262 
2263 static int
2264 vtw_debug_process(vtw_sysargs_t *ap)
2265 {
2266 	struct vestigial_inpcb vestige;
2267 	int	rc = 0;
2268 
2269 	mutex_enter(softnet_lock);
2270 
2271 	switch (ap->op) {
2272 	case 0:		// insert
2273 		vtw_debug_add(ap->la.sin_family
2274 			      , &ap->la
2275 			      , &ap->fa
2276 			      , TCPTV_MSL
2277 			      , 0);
2278 		break;
2279 
2280 	case 1:		// lookup
2281 	case 2:		// restart
2282 		switch (ap->la.sin_family) {
2283 		case AF_INET:
2284 			if (tcp_lookup_v4(ap->fa.sin_addr.v4, ap->fa.sin_port,
2285 					  ap->la.sin_addr.v4, ap->la.sin_port,
2286 					  &vestige)) {
2287 				if (ap->op == 2) {
2288 					vtw_restart(&vestige);
2289 				}
2290 				rc = 0;
2291 			} else
2292 				rc = ESRCH;
2293 			break;
2294 
2295 		case AF_INET6:
2296 			if (tcp_lookup_v6(&ap->fa.sin_addr.v6, ap->fa.sin_port,
2297 					  &ap->la.sin_addr.v6, ap->la.sin_port,
2298 					  &vestige)) {
2299 				if (ap->op == 2) {
2300 					vtw_restart(&vestige);
2301 				}
2302 				rc = 0;
2303 			} else
2304 				rc = ESRCH;
2305 			break;
2306 		default:
2307 			rc = EINVAL;
2308 		}
2309 		break;
2310 
2311 	default:
2312 		rc = EINVAL;
2313 	}
2314 
2315 	mutex_exit(softnet_lock);
2316 	return rc;
2317 }
2318 
2319 struct sys_vtw_args {
2320 	syscallarg(const vtw_sysargs_t *) req;
2321 	syscallarg(size_t) len;
2322 };
2323 
2324 static int
2325 vtw_sys(struct lwp *l, const void *_, register_t *retval)
2326 {
2327 	const struct sys_vtw_args *uap = _;
2328 	void	*buf;
2329 	int	rc;
2330 	size_t	len	= SCARG(uap, len);
2331 
2332 	if (len != sizeof (vtw_sysargs_t))
2333 		return EINVAL;
2334 
2335 	buf = kmem_alloc(len, KM_SLEEP);
2336 	if (!buf)
2337 		return ENOMEM;
2338 
2339 	rc = copyin(SCARG(uap, req), buf, len);
2340 	if (!rc) {
2341 		rc = vtw_debug_process(buf);
2342 	}
2343 	kmem_free(buf, len);
2344 
2345 	return rc;
2346 }
2347 
2348 static void
2349 vtw_sanity_check(void)
2350 {
2351 	vtw_ctl_t	*ctl;
2352 	vtw_t		*vtw;
2353 	int		i;
2354 	int		n;
2355 
2356 	for (i = 0; i < VTW_NCLASS; ++i) {
2357 		ctl = &vtw_tcpv4[i];
2358 
2359 		if (!ctl->base.v || ctl->nalloc)
2360 			continue;
2361 
2362 		for (n = 0, vtw = ctl->base.v; ; ) {
2363 			++n;
2364 			vtw = vtw_next(ctl, vtw);
2365 			if (vtw == ctl->base.v)
2366 				break;
2367 		}
2368 		db_trace(KTR_VTW
2369 			 , (ctl, "sanity: class %x n %x nfree %x"
2370 			    , i, n, ctl->nfree));
2371 
2372 		KASSERT(n == ctl->nfree);
2373 	}
2374 
2375 	for (i = 0; i < VTW_NCLASS; ++i) {
2376 		ctl = &vtw_tcpv6[i];
2377 
2378 		if (!ctl->base.v || ctl->nalloc)
2379 			continue;
2380 
2381 		for (n = 0, vtw = ctl->base.v; ; ) {
2382 			++n;
2383 			vtw = vtw_next(ctl, vtw);
2384 			if (vtw == ctl->base.v)
2385 				break;
2386 		}
2387 		db_trace(KTR_VTW
2388 			 , (ctl, "sanity: class %x n %x nfree %x"
2389 			    , i, n, ctl->nfree));
2390 		KASSERT(n == ctl->nfree);
2391 	}
2392 }
2393 
2394 /*!\brief	Initialise debug support.
2395  */
2396 static void
2397 vtw_debug_init(void)
2398 {
2399 	int	i;
2400 
2401 	vtw_sanity_check();
2402 
2403 	if (vtw_syscall)
2404 		return;
2405 
2406 	for (i = 511; i; --i) {
2407 		if (sysent[i].sy_call == sys_nosys) {
2408 			sysent[i].sy_call    = vtw_sys;
2409 			sysent[i].sy_narg    = 2;
2410 			sysent[i].sy_argsize = sizeof (struct sys_vtw_args);
2411 			sysent[i].sy_flags   = 0;
2412 
2413 			vtw_syscall = i;
2414 			break;
2415 		}
2416 	}
2417 	if (i) {
2418 		const struct sysctlnode *node;
2419 		uint32_t	flags;
2420 
2421 		flags = sysctl_root.sysctl_flags;
2422 
2423 		sysctl_root.sysctl_flags |= CTLFLAG_READWRITE;
2424 		sysctl_root.sysctl_flags &= ~CTLFLAG_PERMANENT;
2425 
2426 		sysctl_createv(0, 0, 0, &node,
2427 			       CTLFLAG_PERMANENT, CTLTYPE_NODE,
2428 			       "koff",
2429 			       SYSCTL_DESCR("Kernel Obscure Feature Finder"),
2430 			       0, 0, 0, 0, CTL_CREATE, CTL_EOL);
2431 
2432 		if (!node) {
2433 			sysctl_createv(0, 0, 0, &node,
2434 				       CTLFLAG_PERMANENT, CTLTYPE_NODE,
2435 				       "koffka",
2436 				       SYSCTL_DESCR("The Real(tm) Kernel"
2437 						    " Obscure Feature Finder"),
2438 				       0, 0, 0, 0, CTL_CREATE, CTL_EOL);
2439 		}
2440 		if (node) {
2441 			sysctl_createv(0, 0, 0, 0,
2442 				       CTLFLAG_PERMANENT|CTLFLAG_READONLY,
2443 				       CTLTYPE_INT, "vtw_debug_syscall",
2444 				       SYSCTL_DESCR("vtw debug"
2445 						    " system call number"),
2446 				       0, 0, &vtw_syscall, 0, node->sysctl_num,
2447 				       CTL_CREATE, CTL_EOL);
2448 		}
2449 		sysctl_root.sysctl_flags = flags;
2450 	}
2451 }
2452 #else /* !VTW_DEBUG */
2453 static void
2454 vtw_debug_init(void)
2455 {
2456 	return;
2457 }
2458 #endif /* !VTW_DEBUG */
2459