1 /*
2  * Copyright (c) 2011 The NetBSD Foundation, Inc.
3  * All rights reserved.
4  *
5  * This code is derived from software contributed to The NetBSD Foundation
6  * by Coyote Point Systems, Inc.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
18  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
19  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
20  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
21  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27  * POSSIBILITY OF SUCH DAMAGE.
28  */
29 #include <sys/cdefs.h>
30 
31 #include "opt_ddb.h"
32 #include "opt_inet.h"
33 #include "opt_ipsec.h"
34 #include "opt_inet_csum.h"
35 #include "opt_tcp_debug.h"
36 
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/malloc.h>
40 #include <sys/kmem.h>
41 #include <sys/mbuf.h>
42 #include <sys/protosw.h>
43 #include <sys/socket.h>
44 #include <sys/socketvar.h>
45 #include <sys/errno.h>
46 #include <sys/syslog.h>
47 #include <sys/pool.h>
48 #include <sys/domain.h>
49 #include <sys/kernel.h>
50 #include <net/if.h>
51 #include <net/route.h>
52 #include <net/if_types.h>
53 
54 #include <netinet/in.h>
55 #include <netinet/in_systm.h>
56 #include <netinet/ip.h>
57 #include <netinet/in_pcb.h>
58 #include <netinet/in_var.h>
59 #include <netinet/ip_var.h>
60 #include <netinet/in_offload.h>
61 #include <netinet/ip6.h>
62 #include <netinet6/ip6_var.h>
63 #include <netinet6/in6_pcb.h>
64 #include <netinet6/ip6_var.h>
65 #include <netinet6/in6_var.h>
66 #include <netinet/icmp6.h>
67 #include <netinet6/nd6.h>
68 
69 #include <netinet/tcp.h>
70 #include <netinet/tcp_fsm.h>
71 #include <netinet/tcp_seq.h>
72 #include <netinet/tcp_timer.h>
73 #include <netinet/tcp_var.h>
74 #include <netinet/tcp_private.h>
75 #include <netinet/tcpip.h>
76 
77 #include <machine/stdarg.h>
78 #include <netinet/tcp_vtw.h>
79 
80 __KERNEL_RCSID(0, "$NetBSD: tcp_vtw.c,v 1.3 2011/05/11 15:08:59 drochner Exp $");
81 
82 #define db_trace(__a, __b)	do { } while (/*CONSTCOND*/0)
83 
84 static void vtw_debug_init(void);
85 
86 fatp_ctl_t fat_tcpv4;
87 fatp_ctl_t fat_tcpv6;
88 vtw_ctl_t  vtw_tcpv4[VTW_NCLASS];
89 vtw_ctl_t  vtw_tcpv6[VTW_NCLASS];
90 vtw_stats_t vtw_stats;
91 
92 /* We provide state for the lookup_ports iterator.
93  * As we are currently netlock-protected, one instance suffices.
94  * If we were finer-grain, we would have one per CPU.
95  * I do not want to be in the business of alloc/free.
96  * The best alternate would be allocate on the caller's
97  * stack, but that would require them to know the struct,
98  * or at least the size.
99  * See how she goes.
100  */
101 struct tcp_ports_iterator {
102 	union {
103 		struct in_addr	v4;
104 		struct in6_addr	v6;
105 	}		addr;
106 	u_int		port;
107 
108 	uint32_t	wild	: 1;
109 
110 	vtw_ctl_t	*ctl;
111 	fatp_t		*fp;
112 
113 	uint16_t	slot_idx;
114 	uint16_t	ctl_idx;
115 };
116 
117 static struct tcp_ports_iterator tcp_ports_iterator_v4;
118 static struct tcp_ports_iterator tcp_ports_iterator_v6;
119 
120 static int vtw_age(vtw_ctl_t *, struct timeval *);
121 
122 /*!\brief allocate a fat pointer from a collection.
123  */
124 static fatp_t *
125 fatp_alloc(fatp_ctl_t *fat)
126 {
127 	fatp_t	*fp	= 0;
128 
129 	if (fat->nfree) {
130 		fp = fat->free;
131 		if (fp) {
132 			fat->free = fatp_next(fat, fp);
133 			--fat->nfree;
134 			++fat->nalloc;
135 			fp->nxt = 0;
136 
137 			KASSERT(!fp->inuse);
138 		}
139 	}
140 
141 	return fp;
142 }
143 
144 /*!\brief free a fat pointer.
145  */
146 static void
147 fatp_free(fatp_ctl_t *fat, fatp_t *fp)
148 {
149 	if (fp) {
150 		KASSERT(!fp->inuse);
151 		KASSERT(!fp->nxt);
152 
153 		fp->nxt = fatp_index(fat, fat->free);
154 		fat->free = fp;
155 
156 		++fat->nfree;
157 		--fat->nalloc;
158 	}
159 }
160 
161 /*!\brief initialise a collection of fat pointers.
162  *
163  *\param n	total # fat pointers to allocate
164  *\param m	# hash buckets
165  *
166  * We allocate 2x as much, as we have two hashes: full and lport only.
167  */
168 static void
169 fatp_init(fatp_ctl_t *fat, uint32_t n, uint32_t m)
170 {
171 	fatp_t	*fp;
172 
173 	KASSERT(n <= FATP_MAX / 2);
174 
175 	fat->hash   = kmem_alloc(2*m * sizeof (fatp_t *), KM_SLEEP);
176 	fat->base   = kmem_alloc(2*n * sizeof (fatp_t), KM_SLEEP);
177 
178 	if (!fat->base) {
179 		if (fat->hash)
180 			kmem_free(fat->hash, 2*m * sizeof (fatp_t *));
181 
182 		bzero(fat, sizeof (*fat));
183 		return;
184 	}
185 
186 	fat->port = &fat->hash[m];
187 
188 	fat->mask   = m - 1;	// m must be a power of 2
189 	fat->lim    = fat->base + 2*n - 1;
190 	fat->nfree  = 0;
191 	fat->nalloc = 2*n;
192 
193 	bzero(fat->hash, 2*m * sizeof (fatp_t *));
194 	bzero(fat->base, 2*n * sizeof (fatp_t));
195 
196 	/* Initialise the free list.
197 	 */
198 	for (fp = fat->lim; fp >= fat->base; --fp) {
199 		fatp_free(fat, fp);
200 	}
201 }
202 
203 /*
204  * The `xtra' is XORed into the tag stored.
205  */
206 static uint32_t fatp_xtra[] = {
207 	0x11111111,0x22222222,0x33333333,0x44444444,
208 	0x55555555,0x66666666,0x77777777,0x88888888,
209 	0x12121212,0x21212121,0x34343434,0x43434343,
210 	0x56565656,0x65656565,0x78787878,0x87878787,
211 	0x11221122,0x22112211,0x33443344,0x44334433,
212 	0x55665566,0x66556655,0x77887788,0x88778877,
213 	0x11112222,0x22221111,0x33334444,0x44443333,
214 	0x55556666,0x66665555,0x77778888,0x88887777,
215 };
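
/* An illustration of what the xtra values buy us, assuming 64-byte
 * cache lines (fatp_ntags() == 15): two entries with the same tag
 * 0xdeadbeef stored in slots 0 and 1 of one fatp_t are remembered as
 *
 *	fp->tag[0] = 0xdeadbeef ^ idx_encode(fat->vtw, idx0) ^ 0x11111111;
 *	fp->tag[1] = 0xdeadbeef ^ idx_encode(fat->vtw, idx1) ^ 0x22222222;
 *
 * so equal {tag, idx} pairs in different slots still store distinct
 * values, and probing the wrong slot decodes to a bogus index which
 * idx_decode() rejects.
 */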
216 
217 /*!\brief turn a {fatp_t*,slot} into an integral key.
218  *
219  * The key can be used to obtain the fatp_t, and the slot,
220  * as it directly encodes them.
221  */
222 static inline uint32_t
223 fatp_key(fatp_ctl_t *fat, fatp_t *fp, uint32_t slot)
224 {
225 	CTASSERT(CACHE_LINE_SIZE == 32 ||
226 	         CACHE_LINE_SIZE == 64 ||
227 		 CACHE_LINE_SIZE == 128);
228 
229 	switch (fatp_ntags()) {
230 	case 7:
231 		return (fatp_index(fat, fp) << 3) | slot;
232 	case 15:
233 		return (fatp_index(fat, fp) << 4) | slot;
234 	case 31:
235 		return (fatp_index(fat, fp) << 5) | slot;
236 	default:
237 		KASSERT(0 && "no support, for no good reason");
238 		return ~0;
239 	}
240 }
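
/* A worked example, assuming 64-byte cache lines (fatp_ntags() == 15):
 * the fatp_t with fatp_index() == 3, holding a tag in slot 5, yields
 *
 *	key = (3 << 4) | 5 == 0x35
 *
 * from which fatp_slot_from_key() recovers 0x35 & 15 == 5, and
 * fatp_from_key() recovers fat->base + (0x35 >> 4) - 1.
 */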
241 
242 static inline uint32_t
243 fatp_slot_from_key(fatp_ctl_t *fat, uint32_t key)
244 {
245 	CTASSERT(CACHE_LINE_SIZE == 32 ||
246 	         CACHE_LINE_SIZE == 64 ||
247 		 CACHE_LINE_SIZE == 128);
248 
249 	switch (fatp_ntags()) {
250 	case 7:
251 		return key & 7;
252 	case 15:
253 		return key & 15;
254 	case 31:
255 		return key & 31;
256 	default:
257 		KASSERT(0 && "no support, for no good reason");
258 		return ~0;
259 	}
260 }
261 
262 static inline fatp_t *
263 fatp_from_key(fatp_ctl_t *fat, uint32_t key)
264 {
265 	CTASSERT(CACHE_LINE_SIZE == 32 ||
266 	         CACHE_LINE_SIZE == 64 ||
267 		 CACHE_LINE_SIZE == 128);
268 
269 	switch (fatp_ntags()) {
270 	case 7:
271 		key >>= 3;
272 		break;
273 	case 15:
274 		key >>= 4;
275 		break;
276 	case 31:
277 		key >>= 5;
278 		break;
279 	default:
280 		KASSERT(0 && "no support, for no good reason");
281 		return 0;
282 	}
283 
284 	return key ? fat->base + key - 1 : 0;
285 }
286 
287 static inline uint32_t
288 idx_encode(vtw_ctl_t *ctl, uint32_t idx)
289 {
290 	return (idx << ctl->idx_bits) | idx;
291 }
292 
293 static inline uint32_t
294 idx_decode(vtw_ctl_t *ctl, uint32_t bits)
295 {
296 	uint32_t	idx	= bits & ctl->idx_mask;
297 
298 	if (idx_encode(ctl, idx) == bits)
299 		return idx;
300 	else
301 		return ~0;
302 }
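
/* The encode/decode pair is a self-check: the arena index is stored
 * twice, once in the low idx_bits and once shifted above them.  For
 * example, with tcp_vtw_entries == 1024 (idx_bits == 10, idx_mask ==
 * 0x3ff), index 5 encodes as
 *
 *	(5 << 10) | 5 == 0x1405
 *
 * A candidate produced by XORing an unrelated tag out of a fatp slot
 * is overwhelmingly unlikely to carry the duplication, so idx_decode()
 * returns ~0 and the probe is rejected cheaply.
 */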
303 
304 /*!\brief	insert index into fatp hash
305  *
306  *\param	idx	-	index of element being placed in hash chain
307  *\param	tag	-	32-bit tag identifier
308  *
309  *\returns
310  *	value which can be used to locate entry.
311  *
312  *\note
313  *	we rely on the fact that there are unused high bits in the index
314  *	for verification purposes on lookup.
315  */
316 
317 static inline uint32_t
318 fatp_vtw_inshash(fatp_ctl_t *fat, uint32_t idx, uint32_t tag, int which,
319     void *dbg)
320 {
321 	fatp_t	*fp;
322 	fatp_t	**hash = (which ? fat->port : fat->hash);
323 	int	i;
324 
325 	fp = hash[tag & fat->mask];
326 
327 	while (!fp || fatp_full(fp)) {
328 		fatp_t	*fq;
329 
330 		/* All entries are inuse at the top level.
331 		 * We allocate a spare, and push the top level
332 		 * down one.  All entries in the fp we push down
333 		 * (think of a tapeworm here) will be expelled sooner than
334 		 * any entries added subsequently to this hash bucket.
335 		 * This is a property of the time waits we are exploiting.
336 		 */
337 
338 		fq = fatp_alloc(fat);
339 		if (!fq) {
340 			vtw_age(fat->vtw, 0);
341 			fp = hash[tag & fat->mask];
342 			continue;
343 		}
344 
345 		fq->inuse = 0;
346 		fq->nxt   = fatp_index(fat, fp);
347 
348 		hash[tag & fat->mask] = fq;
349 
350 		fp = fq;
351 	}
352 
353 	KASSERT(!fatp_full(fp));
354 
355 	/* Fill highest index first.  Lookup is lowest first.
356 	 */
357 	for (i = fatp_ntags(); --i >= 0; ) {
358 		if (!((1 << i) & fp->inuse)) {
359 			break;
360 		}
361 	}
362 
363 	fp->inuse |= 1 << i;
364 	fp->tag[i] = tag ^ idx_encode(fat->vtw, idx) ^ fatp_xtra[i];
365 
366 	db_trace(KTR_VTW
367 		 , (fp, "fat: inuse %5.5x tag[%x] %8.8x"
368 		    , fp->inuse
369 		    , i, fp->tag[i]));
370 
371 	return fatp_key(fat, fp, i);
372 }
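
/* Lookup inverts the stored-tag arithmetic.  A sketch of the probe, as
 * performed in vtw_lookup_hash_v4() below:
 *
 *	idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
 *	vtw = vtw_from_index(ctl, idx);	// NULL unless idx self-checks
 *
 * If the slot held this tag, the XORs cancel and idx is the encoded
 * arena index; otherwise idx is noise and idx_decode() rejects it.
 */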
373 
374 static inline int
375 vtw_alive(const vtw_t *vtw)
376 {
377 	return vtw->hashed && vtw->expire.tv_sec;
378 }
379 
380 static inline uint32_t
381 vtw_index_v4(vtw_ctl_t *ctl, vtw_v4_t *v4)
382 {
383 	if (ctl->base.v4 <= v4 && v4 <= ctl->lim.v4)
384 		return v4 - ctl->base.v4;
385 
386 	KASSERT(0 && "vtw out of bounds");
387 
388 	return ~0;
389 }
390 
391 static inline uint32_t
392 vtw_index_v6(vtw_ctl_t *ctl, vtw_v6_t *v6)
393 {
394 	if (ctl->base.v6 <= v6 && v6 <= ctl->lim.v6)
395 		return v6 - ctl->base.v6;
396 
397 	KASSERT(0 && "vtw out of bounds");
398 
399 	return ~0;
400 }
401 
402 static inline uint32_t
403 vtw_index(vtw_ctl_t *ctl, vtw_t *vtw)
404 {
405 	if (ctl->clidx)
406 		ctl = ctl->ctl;
407 
408 	if (ctl->is_v4)
409 		return vtw_index_v4(ctl, (vtw_v4_t *)vtw);
410 
411 	if (ctl->is_v6)
412 		return vtw_index_v6(ctl, (vtw_v6_t *)vtw);
413 
414 	KASSERT(0 && "neither 4 nor 6.  most curious.");
415 
416 	return ~0;
417 }
418 
419 static inline vtw_t *
420 vtw_from_index(vtw_ctl_t *ctl, uint32_t idx)
421 {
422 	if (ctl->clidx)
423 		ctl = ctl->ctl;
424 
425 	/* See if the index looks like it might be a valid index.
426 	 * Bits set outside of the valid index bits are a giveaway.
427 	 */
428 	idx = idx_decode(ctl, idx);
429 
430 	if (idx == ~0) {
431 		return 0;
432 	} else if (ctl->is_v4) {
433 		vtw_v4_t	*vtw = ctl->base.v4 + idx;
434 
435 		return (ctl->base.v4 <= vtw && vtw <= ctl->lim.v4)
436 			? &vtw->common : 0;
437 	} else if (ctl->is_v6) {
438 		vtw_v6_t	*vtw = ctl->base.v6 + idx;
439 
440 		return (ctl->base.v6 <= vtw && vtw <= ctl->lim.v6)
441 			? &vtw->common : 0;
442 	} else {
443 		KASSERT(0 && "badness");
444 		return 0;
445 	}
446 }
447 
448 /*!\brief return the next vtw after this one.
449  *
450  * Due to the differing sizes of the entries in differing
451  * arenas, we have to ensure we ++ the correct pointer type.
452  *
453  * Also handles wrap.
454  */
455 static inline vtw_t *
456 vtw_next(vtw_ctl_t *ctl, vtw_t *vtw)
457 {
458 	if (ctl->is_v4) {
459 		vtw_v4_t	*v4 = (void*)vtw;
460 
461 		vtw = &(++v4)->common;
462 	} else {
463 		vtw_v6_t	*v6 = (void*)vtw;
464 
465 		vtw = &(++v6)->common;
466 	}
467 
468 	if (vtw > ctl->lim.v)
469 		vtw = ctl->base.v;
470 
471 	return vtw;
472 }
473 
474 /*!\brief	remove entry from FATP hash chains
475  */
476 static inline void
477 vtw_unhash(vtw_ctl_t *ctl, vtw_t *vtw)
478 {
479 	fatp_ctl_t	*fat	= ctl->fat;
480 	fatp_t		*fp;
481 	uint32_t	key = vtw->key;
482 	uint32_t	tag, slot, idx;
483 	vtw_v4_t	*v4 = (void*)vtw;
484 	vtw_v6_t	*v6 = (void*)vtw;
485 
486 	if (!vtw->hashed) {
487 		KASSERT(0 && "unhashed");
488 		return;
489 	}
490 
491 	if (fat->vtw->is_v4) {
492 		tag = v4_tag(v4->faddr, v4->fport, v4->laddr, v4->lport);
493 	} else if (fat->vtw->is_v6) {
494 		tag = v6_tag(&v6->faddr, v6->fport, &v6->laddr, v6->lport);
495 	} else {
496 		tag = 0;
497 		KASSERT(0 && "not reached");
498 	}
499 
500 	/* Remove from fat->hash[]
501 	 */
502 	slot = fatp_slot_from_key(fat, key);
503 	fp   = fatp_from_key(fat, key);
504 	idx  = vtw_index(ctl, vtw);
505 
506 	db_trace(KTR_VTW
507 		 , (fp, "fat: del inuse %5.5x slot %x idx %x key %x tag %x"
508 		    , fp->inuse, slot, idx, key, tag));
509 
510 	KASSERT(fp->inuse & (1 << slot));
511 	KASSERT(fp->tag[slot] == (tag ^ idx_encode(ctl, idx)
512 				  ^ fatp_xtra[slot]));
513 
514 	if ((fp->inuse & (1 << slot))
515 	    && fp->tag[slot] == (tag ^ idx_encode(ctl, idx)
516 				 ^ fatp_xtra[slot])) {
517 		fp->inuse ^= 1 << slot;
518 		fp->tag[slot] = 0;
519 
520 		/* When we delete entries, we do not compact.  This is
521 		 * due to temporality.  We add entries, and they
522 		 * (eventually) expire. Older entries will be further
523 		 * down the chain.
524 		 */
525 		if (!fp->inuse) {
526 			uint32_t hi = tag & fat->mask;
527 			fatp_t	*fq = 0;
528 			fatp_t	*fr = fat->hash[hi];
529 
530 			while (fr && fr != fp) {
531 				fr = fatp_next(fat, fq = fr);
532 			}
533 
534 			if (fr == fp) {
535 				if (fq) {
536 					fq->nxt = fp->nxt;
537 					fp->nxt = 0;
538 					fatp_free(fat, fp);
539 				} else {
540 					KASSERT(fat->hash[hi] == fp);
541 
542 					if (fp->nxt) {
543 						fat->hash[hi]
544 							= fatp_next(fat, fp);
545 						fp->nxt = 0;
546 						fatp_free(fat, fp);
547 					} else {
548 						/* retain for next use.
549 						 */
550 						;
551 					}
552 				}
553 			} else {
554 				fr = fat->hash[hi];
555 
556 				do {
557 					db_trace(KTR_VTW
558 						 , (fr
559 						    , "fat:*del inuse %5.5x"
560 						    " nxt %x"
561 						    , fr->inuse, fr->nxt));
562 
563 					fr = fatp_next(fat, fq = fr);
564 				} while (fr && fr != fp);
565 
566 				KASSERT(0 && "oops");
567 			}
568 		}
569 		vtw->key ^= ~0;
570 	}
571 
572 	if (fat->vtw->is_v4) {
573 		tag = v4_port_tag(v4->lport);
574 	} else if (fat->vtw->is_v6) {
575 		tag = v6_port_tag(v6->lport);
576 	}
577 
578 	/* Remove from fat->port[]
579 	 */
580 	key  = vtw->port_key;
581 	slot = fatp_slot_from_key(fat, key);
582 	fp   = fatp_from_key(fat, key);
583 	idx  = vtw_index(ctl, vtw);
584 
585 	db_trace(KTR_VTW
586 		 , (fp, "fatport: del inuse %5.5x"
587 		    " slot %x idx %x key %x tag %x"
588 		    , fp->inuse, slot, idx, key, tag));
589 
590 	KASSERT(fp->inuse & (1 << slot));
591 	KASSERT(fp->tag[slot] == (tag ^ idx_encode(ctl, idx)
592 				  ^ fatp_xtra[slot]));
593 
594 	if ((fp->inuse & (1 << slot))
595 	    && fp->tag[slot] == (tag ^ idx_encode(ctl, idx)
596 				 ^ fatp_xtra[slot])) {
597 		fp->inuse ^= 1 << slot;
598 		fp->tag[slot] = 0;
599 
600 		if (!fp->inuse) {
601 			uint32_t hi = tag & fat->mask;
602 			fatp_t	*fq = 0;
603 			fatp_t	*fr = fat->port[hi];
604 
605 			while (fr && fr != fp) {
606 				fr = fatp_next(fat, fq = fr);
607 			}
608 
609 			if (fr == fp) {
610 				if (fq) {
611 					fq->nxt = fp->nxt;
612 					fp->nxt = 0;
613 					fatp_free(fat, fp);
614 				} else {
615 					KASSERT(fat->port[hi] == fp);
616 
617 					if (fp->nxt) {
618 						fat->port[hi]
619 							= fatp_next(fat, fp);
620 						fp->nxt = 0;
621 						fatp_free(fat, fp);
622 					} else {
623 						/* retain for next use.
624 						 */
625 						;
626 					}
627 				}
628 			}
629 		}
630 		vtw->port_key ^= ~0;
631 	}
632 
633 	vtw->hashed = 0;
634 }
635 
636 /*!\brief	remove entry from hash, possibly free.
637  */
638 void
639 vtw_del(vtw_ctl_t *ctl, vtw_t *vtw)
640 {
641 	KASSERT(mutex_owned(softnet_lock));
642 
643 	if (vtw->hashed) {
644 		++vtw_stats.del;
645 		vtw_unhash(ctl, vtw);
646 	}
647 
648 	/* We only delete the oldest entry.
649 	 */
650 	if (vtw != ctl->oldest.v)
651 		return;
652 
653 	--ctl->nalloc;
654 	++ctl->nfree;
655 
656 	vtw->expire.tv_sec  = 0;
657 	vtw->expire.tv_usec = ~0;
658 
659 	if (!ctl->nalloc)
660 		ctl->oldest.v = 0;
661 
662 	ctl->oldest.v = vtw_next(ctl, vtw);
663 }
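
/* The arena behaves as a circular FIFO: vtw_alloc() hands out entries
 * in address order (with wrap), entries expire in the same order, and
 * ctl->oldest chases ctl->alloc around the ring.  Deleting an entry
 * other than the oldest merely unhashes it; its slot (and the
 * nalloc/nfree accounting) is reclaimed when the oldest pointer
 * sweeps past it.
 */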
664 
665 /*!\brief	insert vestigial timewait in hash chain
666  */
667 static void
668 vtw_inshash_v4(vtw_ctl_t *ctl, vtw_t *vtw)
669 {
670 	uint32_t	idx	= vtw_index(ctl, vtw);
671 	uint32_t	tag;
672 	vtw_v4_t	*v4 = (void*)vtw;
673 
674 	KASSERT(mutex_owned(softnet_lock));
675 	KASSERT(!vtw->hashed);
676 	KASSERT(ctl->clidx == vtw->msl_class);
677 
678 	++vtw_stats.ins;
679 
680 	tag = v4_tag(v4->faddr, v4->fport,
681 		     v4->laddr, v4->lport);
682 
683 	vtw->key = fatp_vtw_inshash(ctl->fat, idx, tag, 0, vtw);
684 
685 	db_trace(KTR_VTW, (ctl
686 			   , "vtw: ins %8.8x:%4.4x %8.8x:%4.4x"
687 			   " tag %8.8x key %8.8x"
688 			   , v4->faddr, v4->fport
689 			   , v4->laddr, v4->lport
690 			   , tag
691 			   , vtw->key));
692 
693 	tag = v4_port_tag(v4->lport);
694 	vtw->port_key = fatp_vtw_inshash(ctl->fat, idx, tag, 1, vtw);
695 
696 	db_trace(KTR_VTW, (ctl, "vtw: ins %P - %4.4x tag %8.8x key %8.8x"
697 			   , v4->lport, v4->lport
698 			   , tag
699 			   , vtw->key));
700 
701 	vtw->hashed = 1;
702 }
703 
704 /*!\brief	insert vestigial timewait in hash chain
705  */
706 static void
707 vtw_inshash_v6(vtw_ctl_t *ctl, vtw_t *vtw)
708 {
709 	uint32_t	idx	= vtw_index(ctl, vtw);
710 	uint32_t	tag;
711 	vtw_v6_t	*v6	= (void*)vtw;
712 
713 	KASSERT(mutex_owned(softnet_lock));
714 	KASSERT(!vtw->hashed);
715 	KASSERT(ctl->clidx == vtw->msl_class);
716 
717 	++vtw_stats.ins;
718 
719 	tag = v6_tag(&v6->faddr, v6->fport,
720 		     &v6->laddr, v6->lport);
721 
722 	vtw->key = fatp_vtw_inshash(ctl->fat, idx, tag, 0, vtw);
723 
724 	tag = v6_port_tag(v6->lport);
725 	vtw->port_key = fatp_vtw_inshash(ctl->fat, idx, tag, 1, vtw);
726 
727 	db_trace(KTR_VTW, (ctl, "vtw: ins %P - %4.4x tag %8.8x key %8.8x"
728 			   , v6->lport, v6->lport
729 			   , tag
730 			   , vtw->key));
731 
732 	vtw->hashed = 1;
733 }
734 
735 static vtw_t *
736 vtw_lookup_hash_v4(vtw_ctl_t *ctl, uint32_t faddr, uint16_t fport
737 				 , uint32_t laddr, uint16_t lport
738 				 , int which)
739 {
740 	vtw_v4_t	*v4;
741 	vtw_t		*vtw;
742 	uint32_t	tag;
743 	fatp_t		*fp;
744 	int		i;
745 	uint32_t	fatps = 0, probes = 0, losings = 0;
746 
747 	if (!ctl || !ctl->fat)
748 		return 0;
749 
750 	++vtw_stats.look[which];
751 
752 	if (which) {
753 		tag = v4_port_tag(lport);
754 		fp  = ctl->fat->port[tag & ctl->fat->mask];
755 	} else {
756 		tag = v4_tag(faddr, fport, laddr, lport);
757 		fp  = ctl->fat->hash[tag & ctl->fat->mask];
758 	}
759 
760 	while (fp && fp->inuse) {
761 		uint32_t	inuse = fp->inuse;
762 
763 		++fatps;
764 
765 		for (i = 0; inuse && i < fatp_ntags(); ++i) {
766 			uint32_t	idx;
767 
768 			if (!(inuse & (1 << i)))
769 				continue;
770 
771 			inuse ^= 1 << i;
772 
773 			++probes;
774 			++vtw_stats.probe[which];
775 
776 			idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
777 			vtw = vtw_from_index(ctl, idx);
778 
779 			if (!vtw) {
780 				/* Hopefully fast path.
781 				 */
782 				db_trace(KTR_VTW
783 					 , (fp, "vtw: fast %A:%P %A:%P"
784 					    " idx %x tag %x"
785 					    , faddr, fport
786 					    , laddr, lport
787 					    , idx, tag));
788 				continue;
789 			}
790 
791 			v4 = (void*)vtw;
792 
793 			/* The de-referencing of vtw is what we want to avoid.
794 			 * Losing.
795 			 */
796 			if (vtw_alive(vtw)
797 			    && ((which ? vtw->port_key : vtw->key)
798 				== fatp_key(ctl->fat, fp, i))
799 			    && (which
800 				|| (v4->faddr == faddr && v4->laddr == laddr
801 				    && v4->fport == fport))
802 			    && v4->lport == lport) {
803 				++vtw_stats.hit[which];
804 
805 				db_trace(KTR_VTW
806 					 , (fp, "vtw: hit %8.8x:%4.4x"
807 					    " %8.8x:%4.4x idx %x key %x"
808 					    , faddr, fport
809 					    , laddr, lport
810 					    , idx_decode(ctl, idx), vtw->key));
811 
812 				KASSERT(vtw->hashed);
813 
814 				goto out;
815 			}
816 			++vtw_stats.losing[which];
817 			++losings;
818 
819 			if (vtw_alive(vtw)) {
820 				db_trace(KTR_VTW
821 					 , (fp, "vtw:!mis %8.8x:%4.4x"
822 					    " %8.8x:%4.4x key %x tag %x"
823 					    , faddr, fport
824 					    , laddr, lport
825 					    , fatp_key(ctl->fat, fp, i)
826 					    , v4_tag(faddr, fport
827 						     , laddr, lport)));
828 				db_trace(KTR_VTW
829 					 , (vtw, "vtw:!mis %8.8x:%4.4x"
830 					    " %8.8x:%4.4x key %x tag %x"
831 					    , v4->faddr, v4->fport
832 					    , v4->laddr, v4->lport
833 					    , vtw->key
834 					    , v4_tag(v4->faddr, v4->fport
835 						     , v4->laddr, v4->lport)));
836 
837 				if (vtw->key == fatp_key(ctl->fat, fp, i)) {
838 					db_trace(KTR_VTW
839 						 , (vtw, "vtw:!mis %8.8x:%4.4x"
840 						    " %8.8x:%4.4x key %x"
841 						    " which %x"
842 						    , v4->faddr, v4->fport
843 						    , v4->laddr, v4->lport
844 						    , vtw->key
845 						    , which));
846 
847 				} else {
848 					db_trace(KTR_VTW
849 						 , (vtw
850 						    , "vtw:!mis"
851 						    " key %8.8x != %8.8x"
852 						    " idx %x i %x which %x"
853 						    , vtw->key
854 						    , fatp_key(ctl->fat, fp, i)
855 						    , idx_decode(ctl, idx)
856 						    , i
857 						    , which));
858 				}
859 			} else {
860 				db_trace(KTR_VTW
861 					 , (fp
862 					    , "vtw:!mis free entry"
863 					    " idx %x vtw %p which %x"
864 					    , idx_decode(ctl, idx)
865 					    , vtw, which));
866 			}
867 		}
868 
869 		if (fp->nxt) {
870 			fp = fatp_next(ctl->fat, fp);
871 		} else {
872 			break;
873 		}
874 	}
875 	++vtw_stats.miss[which];
876 	vtw = 0;
877 out:
878 	if (fatps > vtw_stats.max_chain[which])
879 		vtw_stats.max_chain[which] = fatps;
880 	if (probes > vtw_stats.max_probe[which])
881 		vtw_stats.max_probe[which] = probes;
882 	if (losings > vtw_stats.max_loss[which])
883 		vtw_stats.max_loss[which] = losings;
884 
885 	return vtw;
886 }
887 
888 static vtw_t *
889 vtw_lookup_hash_v6(vtw_ctl_t *ctl, const struct in6_addr *faddr, uint16_t fport
890 				 , const struct in6_addr *laddr, uint16_t lport
891 				 , int which)
892 {
893 	vtw_v6_t	*v6;
894 	vtw_t		*vtw;
895 	uint32_t	tag;
896 	fatp_t		*fp;
897 	int		i;
898 	uint32_t	fatps = 0, probes = 0, losings = 0;
899 
900 	++vtw_stats.look[which];
901 
902 	if (!ctl || !ctl->fat)
903 		return 0;
904 
905 	if (which) {
906 		tag = v6_port_tag(lport);
907 		fp  = ctl->fat->port[tag & ctl->fat->mask];
908 	} else {
909 		tag = v6_tag(faddr, fport, laddr, lport);
910 		fp  = ctl->fat->hash[tag & ctl->fat->mask];
911 	}
912 
913 	while (fp && fp->inuse) {
914 		uint32_t	inuse = fp->inuse;
915 
916 		++fatps;
917 
918 		for (i = 0; inuse && i < fatp_ntags(); ++i) {
919 			uint32_t	idx;
920 
921 			if (!(inuse & (1 << i)))
922 				continue;
923 
924 			inuse ^= 1 << i;
925 
926 			++probes;
927 			++vtw_stats.probe[which];
928 
929 			idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
930 			vtw = vtw_from_index(ctl, idx);
931 
932 			db_trace(KTR_VTW
933 				 , (fp, "probe: %2d %6A:%4.4x %6A:%4.4x idx %x"
934 				    , i
935 				    , db_store(faddr, sizeof (*faddr)), fport
936 				    , db_store(laddr, sizeof (*laddr)), lport
937 				    , idx_decode(ctl, idx)));
938 
939 			if (!vtw) {
940 				/* Hopefully fast path.
941 				 */
942 				continue;
943 			}
944 
945 			v6 = (void*)vtw;
946 
947 			if (vtw_alive(vtw)
948 			    && ((which ? vtw->port_key : vtw->key)
949 				== fatp_key(ctl->fat, fp, i))
950 			    && v6->lport == lport
951 			    && (which
952 				|| (v6->fport == fport
953 				    && !bcmp(&v6->faddr, faddr, sizeof (*faddr))
954 				    && !bcmp(&v6->laddr, laddr
955 					     , sizeof (*laddr))))) {
956 				++vtw_stats.hit[which];
957 
958 				KASSERT(vtw->hashed);
959 				goto out;
960 			} else {
961 				++vtw_stats.losing[which];
962 				++losings;
963 			}
964 		}
965 
966 		if (fp->nxt) {
967 			fp = fatp_next(ctl->fat, fp);
968 		} else {
969 			break;
970 		}
971 	}
972 	++vtw_stats.miss[which];
973 	vtw = 0;
974 out:
975 	if (fatps > vtw_stats.max_chain[which])
976 		vtw_stats.max_chain[which] = fatps;
977 	if (probes > vtw_stats.max_probe[which])
978 		vtw_stats.max_probe[which] = probes;
979 	if (losings > vtw_stats.max_loss[which])
980 		vtw_stats.max_loss[which] = losings;
981 
982 	return vtw;
983 }
984 
985 /*!\brief port iterator
986  */
987 static vtw_t *
988 vtw_next_port_v4(struct tcp_ports_iterator *it)
989 {
990 	vtw_ctl_t	*ctl = it->ctl;
991 	vtw_v4_t	*v4;
992 	vtw_t		*vtw;
993 	uint32_t	tag;
994 	uint16_t	lport = it->port;
995 	fatp_t		*fp;
996 	int		i;
997 	uint32_t	fatps = 0, probes = 0, losings = 0;
998 
999 	tag = v4_port_tag(lport);
1000 	if (!it->fp) {
1001 		it->fp = ctl->fat->port[tag & ctl->fat->mask];
1002 		it->slot_idx = 0;
1003 	}
1004 	fp  = it->fp;
1005 
1006 	while (fp) {
1007 		uint32_t	inuse = fp->inuse;
1008 
1009 		++fatps;
1010 
1011 		for (i = it->slot_idx; inuse && i < fatp_ntags(); ++i) {
1012 			uint32_t	idx;
1013 
1014 			if (!(inuse & (1 << i)))
1015 				continue;
1016 
1017 			inuse &= ~0 << i;
1018 
1019 			if (i < it->slot_idx)
1020 				continue;
1021 
1022 			++vtw_stats.probe[1];
1023 			++probes;
1024 
1025 			idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
1026 			vtw = vtw_from_index(ctl, idx);
1027 
1028 			if (!vtw) {
1029 				/* Hopefully fast path.
1030 				 */
1031 				continue;
1032 			}
1033 
1034 			v4 = (void*)vtw;
1035 
1036 			if (vtw_alive(vtw)
1037 			    && vtw->port_key == fatp_key(ctl->fat, fp, i)
1038 			    && v4->lport == lport) {
1039 				++vtw_stats.hit[1];
1040 
1041 				it->slot_idx = i + 1;
1042 
1043 				goto out;
1044 			} else if (vtw_alive(vtw)) {
1045 				++vtw_stats.losing[1];
1046 				++losings;
1047 
1048 				db_trace(KTR_VTW
1049 					 , (vtw, "vtw:!mis"
1050 					    " port %8.8x:%4.4x %8.8x:%4.4x"
1051 					    " key %x port %x"
1052 					    , v4->faddr, v4->fport
1053 					    , v4->laddr, v4->lport
1054 					    , vtw->key
1055 					    , lport));
1056 			} else {
1057 				/* Really losing here.  We are coming
1058 				 * up with references to free entries.
1059 				 * Might find it better to use
1060 				 * traditional hashing, or need another
1061 				 * piece of ad-hockery.  The other
1062 				 * ad-hockery would be to pull more into
1063 				 * the cache line to reject the false
1064 				 * hits.
1065 				 */
1066 				++vtw_stats.losing[1];
1067 				++losings;
1068 				db_trace(KTR_VTW
1069 					 , (fp, "vtw:!mis port %x"
1070 					    " - free entry idx %x vtw %p"
1071 					    , lport
1072 					    , idx_decode(ctl, idx)
1073 					    , vtw));
1074 			}
1075 		}
1076 
1077 		if (fp->nxt) {
1078 			it->fp = fp = fatp_next(ctl->fat, fp);
1079 			it->slot_idx = 0;
1080 		} else {
1081 			it->fp = 0;
1082 			break;
1083 		}
1084 	}
1085 	++vtw_stats.miss[1];
1086 
1087 	vtw = 0;
1088 out:
1089 	if (fatps > vtw_stats.max_chain[1])
1090 		vtw_stats.max_chain[1] = fatps;
1091 	if (probes > vtw_stats.max_probe[1])
1092 		vtw_stats.max_probe[1] = probes;
1093 	if (losings > vtw_stats.max_loss[1])
1094 		vtw_stats.max_loss[1] = losings;
1095 
1096 	return vtw;
1097 }
1098 
1099 /*!\brief port iterator
1100  */
1101 static vtw_t *
1102 vtw_next_port_v6(struct tcp_ports_iterator *it)
1103 {
1104 	vtw_ctl_t	*ctl = it->ctl;
1105 	vtw_v6_t	*v6;
1106 	vtw_t		*vtw;
1107 	uint32_t	tag;
1108 	uint16_t	lport = it->port;
1109 	fatp_t		*fp;
1110 	int		i;
1111 	uint32_t	fatps = 0, probes = 0, losings = 0;
1112 
1113 	tag = v6_port_tag(lport);
1114 	if (!it->fp) {
1115 		it->fp = ctl->fat->port[tag & ctl->fat->mask];
1116 		it->slot_idx = 0;
1117 	}
1118 	fp  = it->fp;
1119 
1120 	while (fp) {
1121 		uint32_t	inuse = fp->inuse;
1122 
1123 		++fatps;
1124 
1125 		for (i = it->slot_idx; inuse && i < fatp_ntags(); ++i) {
1126 			uint32_t	idx;
1127 
1128 			if (!(inuse & (1 << i)))
1129 				continue;
1130 
1131 			inuse &= ~0 << i;
1132 
1133 			if (i < it->slot_idx)
1134 				continue;
1135 
1136 			++vtw_stats.probe[1];
1137 			++probes;
1138 
1139 			idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
1140 			vtw = vtw_from_index(ctl, idx);
1141 
1142 			if (!vtw) {
1143 				/* Hopefully fast path.
1144 				 */
1145 				continue;
1146 			}
1147 
1148 			v6 = (void*)vtw;
1149 
1150 			db_trace(KTR_VTW
1151 				 , (vtw, "vtw: i %x idx %x fp->tag %x"
1152 				    " tag %x xtra %x"
1153 				    , i, idx_decode(ctl, idx)
1154 				    , fp->tag[i], tag, fatp_xtra[i]));
1155 
1156 			if (vtw_alive(vtw)
1157 			    && vtw->port_key == fatp_key(ctl->fat, fp, i)
1158 			    && v6->lport == lport) {
1159 				++vtw_stats.hit[1];
1160 
1161 				db_trace(KTR_VTW
1162 					 , (fp, "vtw: nxt port %P - %4.4x"
1163 					    " idx %x key %x"
1164 					    , lport, lport
1165 					    , idx_decode(ctl, idx), vtw->key));
1166 
1167 				it->slot_idx = i + 1;
1168 				goto out;
1169 			} else if (vtw_alive(vtw)) {
1170 				++vtw_stats.losing[1];
1171 
1172 				db_trace(KTR_VTW
1173 					 , (vtw, "vtw:!mis port %6A:%4.4x"
1174 					    " %6A:%4.4x key %x port %x"
1175 					    , db_store(&v6->faddr
1176 						       , sizeof (v6->faddr))
1177 					    , v6->fport
1178 					    , db_store(&v6->laddr
1179 						       , sizeof (v6->faddr))
1180 					    , v6->lport
1181 					    , vtw->key
1182 					    , lport));
1183 			} else {
1184 				/* Really losing here.  We are coming
1185 				 * up with references to free entries.
1186 				 * Might find it better to use
1187 				 * traditional hashing, or need another
1188 				 * piece of ad-hockery.  The other
1189 				 * ad-hockery would be to pull more into
1190 				 * the cache line to reject the false
1191 				 * hits.
1192 				 */
1193 				++vtw_stats.losing[1];
1194 				++losings;
1195 
1196 				db_trace(KTR_VTW
1197 					 , (fp
1198 					    , "vtw:!mis port %x"
1199 					    " - free entry idx %x vtw %p"
1200 					    , lport, idx_decode(ctl, idx)
1201 					    , vtw));
1202 			}
1203 		}
1204 
1205 		if (fp->nxt) {
1206 			it->fp = fp = fatp_next(ctl->fat, fp);
1207 			it->slot_idx = 0;
1208 		} else {
1209 			it->fp = 0;
1210 			break;
1211 		}
1212 	}
1213 	++vtw_stats.miss[1];
1214 
1215 	vtw = 0;
1216 out:
1217 	if (fatps > vtw_stats.max_chain[1])
1218 		vtw_stats.max_chain[1] = fatps;
1219 	if (probes > vtw_stats.max_probe[1])
1220 		vtw_stats.max_probe[1] = probes;
1221 	if (losings > vtw_stats.max_loss[1])
1222 		vtw_stats.max_loss[1] = losings;
1223 
1224 	return vtw;
1225 }
1226 
1227 /*!\brief initialise the VTW allocation arena
1228  *
1229  * There are 1+3 allocation classes:
1230  *	0	classless
1231  *	{1,2,3}	MSL-class based allocation
1232  *
1233  * The allocation arenas are all initialised.  Classless gets all the
1234  * space.  MSL-class based divides the arena, so that allocation
1235  * within a class can proceed without having to consider entries
1236  * (aka: cache lines) from different classes.
1237  *
1238  * Usually, we are completely classless or class-based, but there can be
1239  * transition periods, corresponding to dynamic adjustments in the config
1240  * by the operator.
1241  */
1242 static void
1243 vtw_init(fatp_ctl_t *fat, vtw_ctl_t *ctl, uint32_t n)
1244 {
1245 	int i;
1246 	int sz = (ctl->is_v4 ? sizeof (vtw_v4_t) : sizeof (vtw_v6_t));
1247 
1248 	ctl->base.v4 = kmem_alloc(n * sz, KM_SLEEP);
1249 	if (ctl->base.v4) {
1250 		vtw_t	*base;
1251 		int	class_n;
1252 
1253 		bzero(ctl->base.v4, n * sz);
1254 
1255 		if (ctl->is_v4) {
1256 			ctl->lim.v4    = ctl->base.v4 + n - 1;
1257 			ctl->alloc.v4  = ctl->base.v4;
1258 		} else {
1259 			ctl->lim.v6    = ctl->base.v6 + n - 1;
1260 			ctl->alloc.v6  = ctl->base.v6;
1261 		}
1262 
1263 		ctl->nfree  = n;
1264 		ctl->ctl    = ctl;
1265 
1266 		ctl->idx_bits = 32;
1267 		for (ctl->idx_mask = ~0; (ctl->idx_mask & (n-1)) == n-1; ) {
1268 			ctl->idx_mask >>= 1;
1269 			ctl->idx_bits  -= 1;
1270 		}
1271 
1272 		ctl->idx_mask <<= 1;
1273 		ctl->idx_mask  |= 1;
1274 		ctl->idx_bits  += 1;
1275 
1276 		ctl->fat = fat;
1277 		fat->vtw = ctl;
1278 
1279 		/* Divide the resources equally amongst the classes.
1280 		 * This is not optimal, as the different classes
1281 		 * arrive and leave at different rates, but it is
1282 		 * the best I can do for now.
1283 		 */
1284 		class_n = n / (VTW_NCLASS-1);
1285 		base    = ctl->base.v;
1286 
1287 		for (i = 1; i < VTW_NCLASS; ++i) {
1288 			int j;
1289 
1290 			ctl[i] = ctl[0];
1291 			ctl[i].clidx = i;
1292 
1293 			ctl[i].base.v = base;
1294 			ctl[i].alloc  = ctl[i].base;
1295 
1296 			for (j = 0; j < class_n - 1; ++j) {
1297 				if (tcp_msl_enable)
1298 					base->msl_class = i;
1299 				base = vtw_next(ctl, base);
1300 			}
1301 
1302 			ctl[i].lim.v = base;
1303 			base = vtw_next(ctl, base);
1304 			ctl[i].nfree = class_n;
1305 		}
1306 	}
1307 
1308 	vtw_debug_init();
1309 }
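
/* Resulting layout, for illustration with n == 12 and the 1+3 classes
 * (VTW_NCLASS == 4): class 0 spans the whole arena (entries 0..11),
 * while classes 1..3 get disjoint thirds (0..3, 4..7, 8..11), each
 * with its own base/lim/alloc cursors into the shared storage.
 */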
1310 
1311 /*!\brief	map class to TCP MSL
1312  */
1313 static inline uint32_t
1314 class_to_msl(int class)
1315 {
1316 	switch (class) {
1317 	case 0:
1318 	case 1:
1319 		return tcp_msl_remote ? tcp_msl_remote : (TCPTV_MSL >> 0);
1320 	case 2:
1321 		return tcp_msl_local ? tcp_msl_local : (TCPTV_MSL >> 1);
1322 	default:
1323 		return tcp_msl_loop ? tcp_msl_loop : (TCPTV_MSL >> 2);
1324 	}
1325 }
1326 
1327 /*!\brief	map TCP MSL to class
1328  */
1329 static inline uint32_t
1330 msl_to_class(int msl)
1331 {
1332 	if (tcp_msl_enable) {
1333 		if (msl <= (tcp_msl_loop ? tcp_msl_loop : (TCPTV_MSL >> 2)))
1334 			return 1+2;
1335 		if (msl <= (tcp_msl_local ? tcp_msl_local : (TCPTV_MSL >> 1)))
1336 			return 1+1;
1337 		return 1;
1338 	}
1339 	return 0;
1340 }
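
/* Round trip of the two maps, with the usual TCPTV_MSL of 30 seconds
 * (in PR_SLOWHZ ticks) and no sysctl overrides: class 1 gets the full
 * MSL (remote peers), class 2 MSL/2 (local network), class 3 MSL/4
 * (loopback).  msl_to_class() picks the tightest class whose MSL
 * covers the requested value, so e.g.
 *
 *	msl_to_class(class_to_msl(2)) == 2
 *
 * while tcp_msl_enable is set; with it clear, everything is class 0.
 */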
1341 
1342 /*!\brief allocate a vtw entry
1343  */
1344 static inline vtw_t *
1345 vtw_alloc(vtw_ctl_t *ctl)
1346 {
1347 	vtw_t	*vtw	= 0;
1348 	int	stuck	= 0;
1349 	int	avail	= ctl ? (ctl->nalloc + ctl->nfree) : 0;
1350 	int	msl;
1351 
1352 	KASSERT(mutex_owned(softnet_lock));
1353 
1354 	/* If no resources, we will not get far.
1355 	 */
1356 	if (!ctl || !ctl->base.v4 || avail <= 0)
1357 		return 0;
1358 
1359 	/* Obtain a free one.
1360 	 */
1361 	while (!ctl->nfree) {
1362 		vtw_age(ctl, 0);
1363 
1364 		if (++stuck > avail) {
1365 			/* When in transition between
1366 			 * schemes (classless, classed) we
1367 			 * can be stuck having to await the
1368 			 * expiration of cross-allocated entries.
1369 			 *
1370 			 * Returning zero means we will fall back to the
1371 			 * traditional TIME_WAIT handling, except in the
1372 			 * case of a re-sched, in which case we cannot
1373 			 * perform the re-sched, but will retain the extant
1374 			 * entry.
1375 			 */
1376 			db_trace(KTR_VTW
1377 				 , (ctl, "vtw:!none free in class %x %x/%x"
1378 				    , ctl->clidx
1379 				    , ctl->nalloc, ctl->nfree));
1380 
1381 			return 0;
1382 		}
1383 	}
1384 
1385 	vtw = ctl->alloc.v;
1386 
1387 	if (vtw->msl_class != ctl->clidx) {
1388 		/* Usurping rules:
1389 		 * 	0 -> {1,2,3} or {1,2,3} -> 0
1390 		 */
1391 		KASSERT(!vtw->msl_class || !ctl->clidx);
1392 
1393 		if (vtw->hashed || vtw->expire.tv_sec) {
1394 		    /* As this is owned by some other class,
1395 		     * we must wait for it to expire.
1396 		     * This will only happen on class/classless
1397 		     * transitions, which are guaranteed to progress
1398 		     * to completion in small finite time, barring bugs.
1399 		     */
1400 		    db_trace(KTR_VTW
1401 			     , (ctl, "vtw:!%p class %x!=%x %x:%x%s"
1402 				, vtw, vtw->msl_class, ctl->clidx
1403 				, vtw->expire.tv_sec
1404 				, vtw->expire.tv_usec
1405 				, vtw->hashed ? " hashed" : ""));
1406 
1407 		    return 0;
1408 		}
1409 
1410 		db_trace(KTR_VTW
1411 			 , (ctl, "vtw:!%p usurped from %x to %x"
1412 			    , vtw, vtw->msl_class, ctl->clidx));
1413 
1414 		vtw->msl_class = ctl->clidx;
1415 	}
1416 
1417 	if (vtw_alive(vtw)) {
1418 		KASSERT(0 && "next free not free");
1419 		return 0;
1420 	}
1421 
1422 	/* Advance allocation pointer.
1423 	 */
1424 	ctl->alloc.v = vtw_next(ctl, vtw);
1425 
1426 	--ctl->nfree;
1427 	++ctl->nalloc;
1428 
1429 	msl = (2 * class_to_msl(ctl->clidx) * 1000) / PR_SLOWHZ;	// msec
1430 
1431 	/* mark expiration
1432 	 */
1433 	getmicrouptime(&vtw->expire);
1434 
1435 	/* Move expiration into the future.
1436 	 */
1437 	vtw->expire.tv_sec  += msl / 1000;
1438 	vtw->expire.tv_usec += 1000 * (msl % 1000);
1439 
1440 	while (vtw->expire.tv_usec >= 1000*1000) {
1441 		vtw->expire.tv_usec -= 1000*1000;
1442 		vtw->expire.tv_sec  += 1;
1443 	}
1444 
1445 	if (!ctl->oldest.v)
1446 		ctl->oldest.v = vtw;
1447 
1448 	return vtw;
1449 }
1450 
1451 /*!\brief expiration
1452  */
1453 static int
1454 vtw_age(vtw_ctl_t *ctl, struct timeval *_when)
1455 {
1456 	vtw_t	*vtw;
1457 	struct timeval then, *when = _when;
1458 	int	maxtries = 0;
1459 
1460 	if (!ctl->oldest.v) {
1461 		KASSERT(!ctl->nalloc);
1462 		return 0;
1463 	}
1464 
1465 	for (vtw = ctl->oldest.v; vtw && ctl->nalloc; ) {
1466 		if (++maxtries > ctl->nalloc)
1467 			break;
1468 
1469 		if (vtw->msl_class != ctl->clidx) {
1470 			db_trace(KTR_VTW
1471 				 , (vtw, "vtw:!age class mismatch %x != %x"
1472 				    , vtw->msl_class, ctl->clidx));
1473 			/* XXXX
1474 			 * See if the appropriate action is to skip to the next.
1475 			 * XXXX
1476 			 */
1477 			ctl->oldest.v = vtw = vtw_next(ctl, vtw);
1478 			continue;
1479 		}
1480 		if (!when) {
1481 			/* Latch oldest timeval if none specified.
1482 			 */
1483 			then = vtw->expire;
1484 			when = &then;
1485 		}
1486 
1487 		if (!timercmp(&vtw->expire, when, <=))
1488 			break;
1489 
1490 		db_trace(KTR_VTW
1491 			 , (vtw, "vtw: expire %x %8.8x:%8.8x %x/%x"
1492 			    , ctl->clidx
1493 			    , vtw->expire.tv_sec
1494 			    , vtw->expire.tv_usec
1495 			    , ctl->nalloc
1496 			    , ctl->nfree));
1497 
1498 		if (!_when)
1499 			++vtw_stats.kill;
1500 
1501 		vtw_del(ctl, vtw);
1502 		vtw = ctl->oldest.v;
1503 	}
1504 
1505 	return ctl->nalloc;	// # remaining allocated
1506 }
1507 
1508 static callout_t vtw_cs;
1509 
1510 /*!\brief notice the passage of time.
1511  * It seems to be getting faster.  What happened to the year?
1512  */
1513 static void
1514 vtw_tick(void *arg)
1515 {
1516 	struct timeval now;
1517 	int i, cnt = 0;
1518 
1519 	getmicrouptime(&now);
1520 
1521 	db_trace(KTR_VTW, (arg, "vtk: tick - now %8.8x:%8.8x"
1522 			   , now.tv_sec, now.tv_usec));
1523 
1524 	mutex_enter(softnet_lock);
1525 
1526 	for (i = 0; i < VTW_NCLASS; ++i) {
1527 		cnt += vtw_age(&vtw_tcpv4[i], &now);
1528 		cnt += vtw_age(&vtw_tcpv6[i], &now);
1529 	}
1530 
1531 	/* Keep ticks coming while we need them.
1532 	 */
1533 	if (cnt)
1534 		callout_schedule(&vtw_cs, hz / 5);
1535 	else {
1536 		tcp_vtw_was_enabled = 0;
1537 		tcbtable.vestige    = 0;
1538 	}
1539 	mutex_exit(softnet_lock);
1540 }
1541 
1542 /* in_pcblookup_ports assist for handling vestigial entries.
1543  */
1544 static void *
1545 tcp_init_ports_v4(struct in_addr addr, u_int port, int wild)
1546 {
1547 	struct tcp_ports_iterator *it = &tcp_ports_iterator_v4;
1548 
1549 	bzero(it, sizeof (*it));
1550 
1551 	/* Note: the reference to vtw_tcpv4[0] is fine.
1552 	 * We do not need per-class iteration.  We just
1553 	 * need to get to the fat, and there is one
1554 	 * shared fat.
1555 	 */
1556 	if (vtw_tcpv4[0].fat) {
1557 		it->addr.v4 = addr;
1558 		it->port = port;
1559 		it->wild = !!wild;
1560 		it->ctl  = &vtw_tcpv4[0];
1561 
1562 		++vtw_stats.look[1];
1563 	}
1564 
1565 	return it;
1566 }
1567 
1568 /*!\brief export an IPv4 vtw.
1569  */
1570 static int
1571 vtw_export_v4(vtw_ctl_t *ctl, vtw_t *vtw, vestigial_inpcb_t *res)
1572 {
1573 	vtw_v4_t	*v4 = (void*)vtw;
1574 
1575 	bzero(res, sizeof (*res));
1576 
1577 	if (ctl && vtw) {
1578 		if (!ctl->clidx && vtw->msl_class)
1579 			ctl += vtw->msl_class;
1580 		else
1581 			KASSERT(ctl->clidx == vtw->msl_class);
1582 
1583 		res->valid = 1;
1584 		res->v4    = 1;
1585 
1586 		res->faddr.v4.s_addr = v4->faddr;
1587 		res->laddr.v4.s_addr = v4->laddr;
1588 		res->fport	= v4->fport;
1589 		res->lport	= v4->lport;
1590 		res->vtw	= vtw;		// netlock held over call(s)
1591 		res->ctl	= ctl;
1592 		res->reuse_addr = vtw->reuse_addr;
1593 		res->reuse_port = vtw->reuse_port;
1594 		res->snd_nxt    = vtw->snd_nxt;
1595 		res->rcv_nxt	= vtw->rcv_nxt;
1596 		res->rcv_wnd	= vtw->rcv_wnd;
1597 		res->uid	= vtw->uid;
1598 	}
1599 
1600 	return res->valid;
1601 }
1602 
1603 /*!\brief return next port in the port iterator.  yowza.
1604  */
1605 static int
1606 tcp_next_port_v4(void *arg, struct vestigial_inpcb *res)
1607 {
1608 	struct tcp_ports_iterator *it = arg;
1609 	vtw_t		*vtw = 0;
1610 
1611 	if (it->ctl)
1612 		vtw = vtw_next_port_v4(it);
1613 
1614 	if (!vtw)
1615 		it->ctl = 0;
1616 
1617 	return vtw_export_v4(it->ctl, vtw, res);
1618 }
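
/* Typical iteration, mirroring the VTW_DEBUG self-checks in vtw_add()
 * below:
 *
 *	struct tcp_ports_iterator *it;
 *	struct vestigial_inpcb res;
 *
 *	it = tcp_init_ports_v4(laddr, lport, 0);
 *	while (tcp_next_port_v4(it, &res)) {
 *		// res describes one vestigial TIME_WAIT on lport
 *	}
 *
 * Each live entry on the port is yielded once; it->ctl is cleared on
 * exhaustion, so further calls simply return 0.
 */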
1619 
1620 static int
1621 tcp_lookup_v4(struct in_addr faddr, uint16_t fport,
1622               struct in_addr laddr, uint16_t lport,
1623 	      struct vestigial_inpcb *res)
1624 {
1625 	vtw_t		*vtw;
1626 	vtw_ctl_t	*ctl;
1627 
1628 
1629 	db_trace(KTR_VTW
1630 		 , (res, "vtw: lookup %A:%P %A:%P"
1631 		    , faddr, fport
1632 		    , laddr, lport));
1633 
1634 	vtw = vtw_lookup_hash_v4((ctl = &vtw_tcpv4[0])
1635 				 , faddr.s_addr, fport
1636 				 , laddr.s_addr, lport, 0);
1637 
1638 	return vtw_export_v4(ctl, vtw, res);
1639 }
1640 
1641 /* in_pcblookup_ports assist for handling vestigial entries.
1642  */
1643 static void *
1644 tcp_init_ports_v6(const struct in6_addr *addr, u_int port, int wild)
1645 {
1646 	struct tcp_ports_iterator *it = &tcp_ports_iterator_v6;
1647 
1648 	bzero(it, sizeof (*it));
1649 
1650 	/* Note: the reference to vtw_tcpv6[0] is fine.
1651 	 * We do not need per-class iteration.  We just
1652 	 * need to get to the fat, and there is one
1653 	 * shared fat.
1654 	 */
1655 	if (vtw_tcpv6[0].fat) {
1656 		it->addr.v6 = *addr;
1657 		it->port = port;
1658 		it->wild = !!wild;
1659 		it->ctl  = &vtw_tcpv6[0];
1660 
1661 		++vtw_stats.look[1];
1662 	}
1663 
1664 	return it;
1665 }
1666 
1667 /*!\brief export an IPv6 vtw.
1668  */
1669 static int
1670 vtw_export_v6(vtw_ctl_t *ctl, vtw_t *vtw, vestigial_inpcb_t *res)
1671 {
1672 	vtw_v6_t	*v6 = (void*)vtw;
1673 
1674 	bzero(res, sizeof (*res));
1675 
1676 	if (ctl && vtw) {
1677 		if (!ctl->clidx && vtw->msl_class)
1678 			ctl += vtw->msl_class;
1679 		else
1680 			KASSERT(ctl->clidx == vtw->msl_class);
1681 
1682 		res->valid = 1;
1683 		res->v4    = 0;
1684 
1685 		res->faddr.v6	= v6->faddr;
1686 		res->laddr.v6	= v6->laddr;
1687 		res->fport	= v6->fport;
1688 		res->lport	= v6->lport;
1689 		res->vtw	= vtw;		// netlock held over call(s)
1690 		res->ctl	= ctl;
1691 
1692 		res->v6only	= vtw->v6only;
1693 		res->reuse_addr = vtw->reuse_addr;
1694 		res->reuse_port = vtw->reuse_port;
1695 
1696 		res->snd_nxt    = vtw->snd_nxt;
1697 		res->rcv_nxt	= vtw->rcv_nxt;
1698 		res->rcv_wnd	= vtw->rcv_wnd;
1699 		res->uid	= vtw->uid;
1700 	}
1701 
1702 	return res->valid;
1703 }
1704 
1705 static int
1706 tcp_next_port_v6(void *arg, struct vestigial_inpcb *res)
1707 {
1708 	struct tcp_ports_iterator *it = arg;
1709 	vtw_t		*vtw = 0;
1710 
1711 	if (it->ctl)
1712 		vtw = vtw_next_port_v6(it);
1713 
1714 	if (!vtw)
1715 		it->ctl = 0;
1716 
1717 	return vtw_export_v6(it->ctl, vtw, res);
1718 }
1719 
1720 static int
1721 tcp_lookup_v6(const struct in6_addr *faddr, uint16_t fport,
1722               const struct in6_addr *laddr, uint16_t lport,
1723 	      struct vestigial_inpcb *res)
1724 {
1725 	vtw_ctl_t	*ctl;
1726 	vtw_t		*vtw;
1727 
1728 	db_trace(KTR_VTW
1729 		 , (res, "vtw: lookup %6A:%P %6A:%P"
1730 		    , db_store(faddr, sizeof (*faddr)), fport
1731 		    , db_store(laddr, sizeof (*laddr)), lport));
1732 
1733 	vtw = vtw_lookup_hash_v6((ctl = &vtw_tcpv6[0])
1734 				 , faddr, fport
1735 				 , laddr, lport, 0);
1736 
1737 	return vtw_export_v6(ctl, vtw, res);
1738 }
1739 
1740 static vestigial_hooks_t tcp_hooks = {
1741 	.init_ports4	= tcp_init_ports_v4,
1742 	.next_port4	= tcp_next_port_v4,
1743 	.lookup4	= tcp_lookup_v4,
1744 	.init_ports6	= tcp_init_ports_v6,
1745 	.next_port6	= tcp_next_port_v6,
1746 	.lookup6	= tcp_lookup_v6,
1747 };
1748 
1749 static bool
1750 vtw_select(int af, fatp_ctl_t **fatp, vtw_ctl_t **ctlp)
1751 {
1752 	fatp_ctl_t	*fat;
1753 	vtw_ctl_t	*ctl;
1754 
1755 	switch (af) {
1756 	case AF_INET:
1757 		fat = &fat_tcpv4;
1758 		ctl = &vtw_tcpv4[0];
1759 		break;
1760 	case AF_INET6:
1761 		fat = &fat_tcpv6;
1762 		ctl = &vtw_tcpv6[0];
1763 		break;
1764 	default:
1765 		return false;
1766 	}
1767 	if (fatp != NULL)
1768 		*fatp = fat;
1769 	if (ctlp != NULL)
1770 		*ctlp = ctl;
1771 	return true;
1772 }
1773 
1774 /*!\brief	initialize controlling instance
1775  */
1776 static int
1777 vtw_control_init(int af)
1778 {
1779 	fatp_ctl_t	*fat;
1780 	vtw_ctl_t	*ctl;
1781 
1782 	if (!vtw_select(af, &fat, &ctl))
1783 		return EAFNOSUPPORT;
1784 
1785 	if (!fat->base) {
1786 		uint32_t	n, m;
1787 
1788 		KASSERT(powerof2(tcp_vtw_entries));
1789 
1790 		/* Allocate 10% more capacity in the fat pointers.
1791 		 * We should only need ~#hash additional based on
1792 		 * how they age, but TIME_WAIT assassination could cause
1793 		 * sparse fat pointer utilisation.
1794 		 */
1795 		m = 512;
1796 		n = 2*m + (11 * (tcp_vtw_entries / fatp_ntags())) / 10;
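		/* E.g. with tcp_vtw_entries == 16384 and 64-byte cache
		 * lines (fatp_ntags() == 15): n == 1024 + 1201 == 2225
		 * fat pointers against m == 512 buckets in each of the
		 * two hashes.
		 */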
1797 
1798 		fatp_init(fat, n, m);
1799 
1800 		if (!fat->base)
1801 			return ENOMEM;
1802 	}
1803 
1804 	if (!ctl->base.v) {
1805 
1806 		vtw_init(fat, ctl, tcp_vtw_entries);
1807 		if (!ctl->base.v)
1808 			return ENOMEM;
1809 	}
1810 
1811 	return 0;
1812 }
1813 
1814 /*!\brief	select controlling instance
1815  */
1816 static vtw_ctl_t *
1817 vtw_control(int af, uint32_t msl)
1818 {
1819 	fatp_ctl_t	*fat;
1820 	vtw_ctl_t	*ctl;
1821 	int		class	= msl_to_class(msl);
1822 
1823 	if (!vtw_select(af, &fat, &ctl))
1824 		return NULL;
1825 
1826 	if (!fat->base || !ctl->base.v)
1827 		return NULL;
1828 
1829 	return ctl + class;
1830 }
1831 
1832 /*!\brief	add TCP pcb to vestigial timewait
1833  */
1834 int
1835 vtw_add(int af, struct tcpcb *tp)
1836 {
1837 	int		enable;
1838 	vtw_ctl_t	*ctl;
1839 	vtw_t		*vtw;
1840 
1841 	KASSERT(mutex_owned(softnet_lock));
1842 
1843 	ctl = vtw_control(af, tp->t_msl);
1844 	if (!ctl)
1845 		return 0;
1846 
1847 	enable = (af == AF_INET) ? tcp4_vtw_enable : tcp6_vtw_enable;
1848 
1849 	vtw = vtw_alloc(ctl);
1850 
1851 	if (vtw) {
1852 		vtw->snd_nxt = tp->snd_nxt;
1853 		vtw->rcv_nxt = tp->rcv_nxt;
1854 
1855 		switch (af) {
1856 		case AF_INET: {
1857 			struct inpcb	*inp = tp->t_inpcb;
1858 			vtw_v4_t	*v4  = (void*)vtw;
1859 
1860 			v4->faddr = inp->inp_faddr.s_addr;
1861 			v4->laddr = inp->inp_laddr.s_addr;
1862 			v4->fport = inp->inp_fport;
1863 			v4->lport = inp->inp_lport;
1864 
1865 			vtw->reuse_port = !!(inp->inp_socket->so_options
1866 					     & SO_REUSEPORT);
1867 			vtw->reuse_addr = !!(inp->inp_socket->so_options
1868 					     & SO_REUSEADDR);
1869 			vtw->v6only	= 0;
1870 			vtw->uid	= inp->inp_socket->so_uidinfo->ui_uid;
1871 
1872 			vtw_inshash_v4(ctl, vtw);
1873 
1874 
1875 #ifdef VTW_DEBUG
1876 			/* Immediate lookup (connected and port) to
1877 			 * ensure at least that works!
1878 			 */
1879 			if (enable & 4) {
1880 				KASSERT(vtw_lookup_hash_v4
1881 					(ctl
1882 					 , inp->inp_faddr.s_addr, inp->inp_fport
1883 					 , inp->inp_laddr.s_addr, inp->inp_lport
1884 					 , 0)
1885 					== vtw);
1886 				KASSERT(vtw_lookup_hash_v4
1887 					(ctl
1888 					 , inp->inp_faddr.s_addr, inp->inp_fport
1889 					 , inp->inp_laddr.s_addr, inp->inp_lport
1890 					 , 1));
1891 			}
1892 			/* Immediate port iterator functionality check: not wild
1893 			 */
1894 			if (enable & 8) {
1895 				struct tcp_ports_iterator *it;
1896 				struct vestigial_inpcb res;
1897 				int cnt = 0;
1898 
1899 				it = tcp_init_ports_v4(inp->inp_laddr
1900 						       , inp->inp_lport, 0);
1901 
1902 				while (tcp_next_port_v4(it, &res)) {
1903 					++cnt;
1904 				}
1905 				KASSERT(cnt);
1906 			}
1907 			/* Immediate port iterator functionality check: wild
1908 			 */
1909 			if (enable & 16) {
1910 				struct tcp_ports_iterator *it;
1911 				struct vestigial_inpcb res;
1912 				struct in_addr any;
1913 				int cnt = 0;
1914 
1915 				any.s_addr = htonl(INADDR_ANY);
1916 
1917 				it = tcp_init_ports_v4(any, inp->inp_lport, 1);
1918 
1919 				while (tcp_next_port_v4(it, &res)) {
1920 					++cnt;
1921 				}
1922 				KASSERT(cnt);
1923 			}
1924 #endif /* VTW_DEBUG */
1925 			break;
1926 		}
1927 
1928 		case AF_INET6: {
1929 			struct in6pcb	*inp = tp->t_in6pcb;
1930 			vtw_v6_t	*v6  = (void*)vtw;
1931 
1932 			v6->faddr = inp->in6p_faddr;
1933 			v6->laddr = inp->in6p_laddr;
1934 			v6->fport = inp->in6p_fport;
1935 			v6->lport = inp->in6p_lport;
1936 
1937 			vtw->reuse_port = !!(inp->in6p_socket->so_options
1938 					     & SO_REUSEPORT);
1939 			vtw->reuse_addr = !!(inp->in6p_socket->so_options
1940 					     & SO_REUSEADDR);
1941 			vtw->v6only	= !!(inp->in6p_flags
1942 					     & IN6P_IPV6_V6ONLY);
1943 			vtw->uid	= inp->in6p_socket->so_uidinfo->ui_uid;
1944 
1945 			vtw_inshash_v6(ctl, vtw);
1946 #ifdef VTW_DEBUG
1947 			/* Immediate lookup (connected and port) to
1948 			 * ensure at least that works!
1949 			 */
1950 			if (enable & 4) {
1951 				KASSERT(vtw_lookup_hash_v6(ctl
1952 					 , &inp->in6p_faddr, inp->in6p_fport
1953 					 , &inp->in6p_laddr, inp->in6p_lport
1954 					 , 0)
1955 					== vtw);
1956 				KASSERT(vtw_lookup_hash_v6
1957 					(ctl
1958 					 , &inp->in6p_faddr, inp->in6p_fport
1959 					 , &inp->in6p_laddr, inp->in6p_lport
1960 					 , 1));
1961 			}
1962 			/* Immediate port iterator functionality check: not wild
1963 			 */
1964 			if (enable & 8) {
1965 				struct tcp_ports_iterator *it;
1966 				struct vestigial_inpcb res;
1967 				int cnt = 0;
1968 
1969 				it = tcp_init_ports_v6(&inp->in6p_laddr
1970 						       , inp->in6p_lport, 0);
1971 
1972 				while (tcp_next_port_v6(it, &res)) {
1973 					++cnt;
1974 				}
1975 				KASSERT(cnt);
1976 			}
1977 			/* Immediate port iterator functionality check: wild
1978 			 */
1979 			if (enable & 16) {
1980 				struct tcp_ports_iterator *it;
1981 				struct vestigial_inpcb res;
1982 				static struct in6_addr any = IN6ADDR_ANY_INIT;
1983 				int cnt = 0;
1984 
1985 				it = tcp_init_ports_v6(&any
1986 						       , inp->in6p_lport, 1);
1987 
1988 				while (tcp_next_port_v6(it, &res)) {
1989 					++cnt;
1990 				}
1991 				KASSERT(cnt);
1992 			}
1993 #endif /* VTW_DEBUG */
1994 			break;
1995 		}
1996 		}
1997 
1998 		tcp_canceltimers(tp);
1999 		tp = tcp_close(tp);
2000 		KASSERT(!tp);
2001 
2002 		return 1;
2003 	}
2004 
2005 	return 0;
2006 }
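
/* A sketch of the intended use, when a connection enters TIME_WAIT
 * (the surrounding logic here is illustrative, not a prescribed API):
 *
 *	if (vtw_add(af, tp)) {
 *		// tp has been closed; the vestigial entry stands in
 *		// for the PCB until 2*MSL elapses.
 *	} else {
 *		// fall back to a traditional TIME_WAIT tcpcb.
 *	}
 *
 * On success vtw_add() cancels the timers and tcp_close()s the tcb
 * itself, so the caller must not touch tp afterwards.
 */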
2007 
2008 /*!\brief	restart timer for vestigial time-wait entry
2009  */
2010 static void
2011 vtw_restart_v4(vestigial_inpcb_t *vp)
2012 {
2013 	vtw_v4_t	copy = *(vtw_v4_t*)vp->vtw;
2014 	vtw_t		*vtw;
2015 	vtw_t		*cp  = &copy.common;
2016 	vtw_ctl_t	*ctl;
2017 
2018 	KASSERT(mutex_owned(softnet_lock));
2019 
2020 	db_trace(KTR_VTW
2021 		 , (vp->vtw, "vtw: restart %A:%P %A:%P"
2022 		    , vp->faddr.v4.s_addr, vp->fport
2023 		    , vp->laddr.v4.s_addr, vp->lport));
2024 
2025 	/* Class might have changed, so have a squiz.
2026 	 */
2027 	ctl = vtw_control(AF_INET, class_to_msl(cp->msl_class));
2028 	vtw = vtw_alloc(ctl);
2029 
2030 	if (vtw) {
2031 		vtw_v4_t	*v4  = (void*)vtw;
2032 
2033 		/* Safe now to unhash the old entry
2034 		 */
2035 		vtw_del(vp->ctl, vp->vtw);
2036 
2037 		vtw->snd_nxt = cp->snd_nxt;
2038 		vtw->rcv_nxt = cp->rcv_nxt;
2039 
2040 		v4->faddr = copy.faddr;
2041 		v4->laddr = copy.laddr;
2042 		v4->fport = copy.fport;
2043 		v4->lport = copy.lport;
2044 
2045 		vtw->reuse_port = cp->reuse_port;
2046 		vtw->reuse_addr = cp->reuse_addr;
2047 		vtw->v6only	= 0;
2048 		vtw->uid	= cp->uid;
2049 
2050 		vtw_inshash_v4(ctl, vtw);
2051 	}
2052 
2053 	vp->valid = 0;
2054 }
2055 
2056 /*!\brief	restart timer for vestigial time-wait entry
2057  */
2058 static void
2059 vtw_restart_v6(vestigial_inpcb_t *vp)
2060 {
2061 	vtw_v6_t	copy = *(vtw_v6_t*)vp->vtw;
2062 	vtw_t		*vtw;
2063 	vtw_t		*cp  = &copy.common;
2064 	vtw_ctl_t	*ctl;
2065 
2066 	KASSERT(mutex_owned(softnet_lock));
2067 
2068 	db_trace(KTR_VTW
2069 		 , (vp->vtw, "vtw: restart %6A:%P %6A:%P"
2070 		    , db_store(&vp->faddr.v6, sizeof (vp->faddr.v6))
2071 		    , vp->fport
2072 		    , db_store(&vp->laddr.v6, sizeof (vp->laddr.v6))
2073 		    , vp->lport));
2074 
2075 	/* Class might have changed, so have a squiz.
2076 	 */
2077 	ctl = vtw_control(AF_INET6, class_to_msl(cp->msl_class));
2078 	vtw = vtw_alloc(ctl);
2079 
2080 	if (vtw) {
2081 		vtw_v6_t	*v6  = (void*)vtw;
2082 
2083 		/* Safe now to unhash the old entry
2084 		 */
2085 		vtw_del(vp->ctl, vp->vtw);
2086 
2087 		vtw->snd_nxt = cp->snd_nxt;
2088 		vtw->rcv_nxt = cp->rcv_nxt;
2089 
2090 		v6->faddr = copy.faddr;
2091 		v6->laddr = copy.laddr;
2092 		v6->fport = copy.fport;
2093 		v6->lport = copy.lport;
2094 
2095 		vtw->reuse_port = cp->reuse_port;
2096 		vtw->reuse_addr = cp->reuse_addr;
2097 		vtw->v6only	= cp->v6only;
2098 		vtw->uid	= cp->uid;
2099 
2100 		vtw_inshash_v6(ctl, vtw);
2101 	}
2102 
2103 	vp->valid = 0;
2104 }
2105 
2106 /*!\brief	restart timer for vestigial time-wait entry
2107  */
2108 void
2109 vtw_restart(vestigial_inpcb_t *vp)
2110 {
2111 	if (!vp || !vp->valid)
2112 		return;
2113 
2114 	if (vp->v4)
2115 		vtw_restart_v4(vp);
2116 	else
2117 		vtw_restart_v6(vp);
2118 }
2119 
2120 int
2121 vtw_earlyinit(void)
2122 {
2123 	int rc;
2124 
2125 	if (!tcp_vtw_was_enabled) {
2126 		int i;
2127 
2128 		/* This guarantees us timer ticks until we no longer need them.
2129 		 */
2130 		tcp_vtw_was_enabled = 1;
2131 
2132 		callout_init(&vtw_cs, 0);
2133 		callout_setfunc(&vtw_cs, vtw_tick, 0);
2134 		callout_schedule(&vtw_cs, hz / 5);
2135 
2136 		for (i = 0; i < VTW_NCLASS; ++i) {
2137 			vtw_tcpv4[i].is_v4 = 1;
2138 			vtw_tcpv6[i].is_v6 = 1;
2139 		}
2140 
2141 		tcbtable.vestige = &tcp_hooks;
2142 	}
2143 
2144 	if ((rc = vtw_control_init(AF_INET)) != 0 ||
2145 	    (rc = vtw_control_init(AF_INET6)) != 0)
2146 		return rc;
2147 
2148 	return 0;
2149 }
2150 
2151 #ifdef VTW_DEBUG
2152 #include <sys/syscallargs.h>
2153 #include <sys/sysctl.h>
2154 
2155 /*!\brief	add lalp, fafp entries for debug
2156  */
2157 int
2158 vtw_debug_add(int af, sin_either_t *la, sin_either_t *fa, int msl, int class)
2159 {
2160 	vtw_ctl_t	*ctl;
2161 	vtw_t		*vtw;
2162 
2163 	ctl = vtw_control(af, msl ? msl : class_to_msl(class));
2164 	if (!ctl)
2165 		return 0;
2166 
2167 	vtw = vtw_alloc(ctl);
2168 
2169 	if (vtw) {
2170 		vtw->snd_nxt = 0;
2171 		vtw->rcv_nxt = 0;
2172 
2173 		switch (af) {
2174 		case AF_INET: {
2175 			vtw_v4_t	*v4  = (void*)vtw;
2176 
2177 			v4->faddr = fa->sin_addr.v4.s_addr;
2178 			v4->laddr = la->sin_addr.v4.s_addr;
2179 			v4->fport = fa->sin_port;
2180 			v4->lport = la->sin_port;
2181 
2182 			vtw->reuse_port = 1;
2183 			vtw->reuse_addr = 1;
2184 			vtw->v6only	= 0;
2185 			vtw->uid	= 0;
2186 
2187 			vtw_inshash_v4(ctl, vtw);
2188 			break;
2189 		}
2190 
2191 		case AF_INET6: {
2192 			vtw_v6_t	*v6  = (void*)vtw;
2193 
2194 			v6->faddr = fa->sin_addr.v6;
2195 			v6->laddr = la->sin_addr.v6;
2196 
2197 			v6->fport = fa->sin_port;
2198 			v6->lport = la->sin_port;
2199 
2200 			vtw->reuse_port = 1;
2201 			vtw->reuse_addr = 1;
2202 			vtw->v6only	= 0;
2203 			vtw->uid	= 0;
2204 
2205 			vtw_inshash_v6(ctl, vtw);
2206 			break;
2207 		}
2208 
2209 		default:
2210 			break;
2211 		}
2212 
2213 		return 1;
2214 	}
2215 
2216 	return 0;
2217 }
2218 
2219 static int vtw_syscall = 0;
2220 
2221 static int
2222 vtw_debug_process(vtw_sysargs_t *ap)
2223 {
2224 	struct vestigial_inpcb vestige;
2225 	int	rc = 0;
2226 
2227 	mutex_enter(softnet_lock);
2228 
2229 	switch (ap->op) {
2230 	case 0:		// insert
2231 		vtw_debug_add(ap->la.sin_family
2232 			      , &ap->la
2233 			      , &ap->fa
2234 			      , TCPTV_MSL
2235 			      , 0);
2236 		break;
2237 
2238 	case 1:		// lookup
2239 	case 2:		// restart
2240 		switch (ap->la.sin_family) {
2241 		case AF_INET:
2242 			if (tcp_lookup_v4(ap->fa.sin_addr.v4, ap->fa.sin_port,
2243 					  ap->la.sin_addr.v4, ap->la.sin_port,
2244 					  &vestige)) {
2245 				if (ap->op == 2) {
2246 					vtw_restart(&vestige);
2247 				}
2248 				rc = 0;
2249 			} else
2250 				rc = ESRCH;
2251 			break;
2252 
2253 		case AF_INET6:
2254 			if (tcp_lookup_v6(&ap->fa.sin_addr.v6, ap->fa.sin_port,
2255 					  &ap->la.sin_addr.v6, ap->la.sin_port,
2256 					  &vestige)) {
2257 				if (ap->op == 2) {
2258 					vtw_restart(&vestige);
2259 				}
2260 				rc = 0;
2261 			} else
2262 				rc = ESRCH;
2263 			break;
2264 		default:
2265 			rc = EINVAL;
2266 		}
2267 		break;
2268 
2269 	default:
2270 		rc = EINVAL;
2271 	}
2272 
2273 	mutex_exit(softnet_lock);
2274 	return rc;
2275 }
2276 
2277 struct sys_vtw_args {
2278 	syscallarg(const vtw_sysargs_t *) req;
2279 	syscallarg(size_t) len;
2280 };
2281 
2282 static int
2283 vtw_sys(struct lwp *l, const void *_, register_t *retval)
2284 {
2285 	const struct sys_vtw_args *uap = _;
2286 	void	*buf;
2287 	int	rc;
2288 	size_t	len	= SCARG(uap, len);
2289 
2290 	if (len != sizeof (vtw_sysargs_t))
2291 		return EINVAL;
2292 
2293 	buf = kmem_alloc(len, KM_SLEEP);
2294 	if (!buf)
2295 		return ENOMEM;
2296 
2297 	rc = copyin(SCARG(uap, req), buf, len);
2298 	if (!rc) {
2299 		rc = vtw_debug_process(buf);
2300 	}
2301 	kmem_free(buf, len);
2302 
2303 	return rc;
2304 }
2305 
2306 static void
2307 vtw_sanity_check(void)
2308 {
2309 	vtw_ctl_t	*ctl;
2310 	vtw_t		*vtw;
2311 	int		i;
2312 	int		n;
2313 
2314 	for (i = 0; i < VTW_NCLASS; ++i) {
2315 		ctl = &vtw_tcpv4[i];
2316 
2317 		if (!ctl->base.v || ctl->nalloc)
2318 			continue;
2319 
2320 		for (n = 0, vtw = ctl->base.v; ; ) {
2321 			++n;
2322 			vtw = vtw_next(ctl, vtw);
2323 			if (vtw == ctl->base.v)
2324 				break;
2325 		}
2326 		db_trace(KTR_VTW
2327 			 , (ctl, "sanity: class %x n %x nfree %x"
2328 			    , i, n, ctl->nfree));
2329 
2330 		KASSERT(n == ctl->nfree);
2331 	}
2332 
2333 	for (i = 0; i < VTW_NCLASS; ++i) {
2334 		ctl = &vtw_tcpv6[i];
2335 
2336 		if (!ctl->base.v || ctl->nalloc)
2337 			continue;
2338 
2339 		for (n = 0, vtw = ctl->base.v; ; ) {
2340 			++n;
2341 			vtw = vtw_next(ctl, vtw);
2342 			if (vtw == ctl->base.v)
2343 				break;
2344 		}
2345 		db_trace(KTR_VTW
2346 			 , (ctl, "sanity: class %x n %x nfree %x"
2347 			    , i, n, ctl->nfree));
2348 		KASSERT(n == ctl->nfree);
2349 	}
2350 }
2351 
2352 /*!\brief	Initialise debug support.
2353  */
2354 static void
2355 vtw_debug_init(void)
2356 {
2357 	int	i;
2358 
2359 	vtw_sanity_check();
2360 
2361 	if (vtw_syscall)
2362 		return;
2363 
2364 	for (i = 511; i; --i) {
2365 		if (sysent[i].sy_call == sys_nosys) {
2366 			sysent[i].sy_call    = vtw_sys;
2367 			sysent[i].sy_narg    = 2;
2368 			sysent[i].sy_argsize = sizeof (struct sys_vtw_args);
2369 			sysent[i].sy_flags   = 0;
2370 
2371 			vtw_syscall = i;
2372 			break;
2373 		}
2374 	}
2375 	if (i) {
2376 		const struct sysctlnode *node;
2377 		uint32_t	flags;
2378 
2379 		flags = sysctl_root.sysctl_flags;
2380 
2381 		sysctl_root.sysctl_flags |= CTLFLAG_READWRITE;
2382 		sysctl_root.sysctl_flags &= ~CTLFLAG_PERMANENT;
2383 
2384 		sysctl_createv(0, 0, 0, &node,
2385 			       CTLFLAG_PERMANENT, CTLTYPE_NODE,
2386 			       "koff",
2387 			       SYSCTL_DESCR("Kernel Obscure Feature Finder"),
2388 			       0, 0, 0, 0, CTL_CREATE, CTL_EOL);
2389 
2390 		if (!node) {
2391 			sysctl_createv(0, 0, 0, &node,
2392 				       CTLFLAG_PERMANENT, CTLTYPE_NODE,
2393 				       "koffka",
2394 				       SYSCTL_DESCR("The Real(tm) Kernel"
2395 						    " Obscure Feature Finder"),
2396 				       0, 0, 0, 0, CTL_CREATE, CTL_EOL);
2397 		}
2398 		if (node) {
2399 			sysctl_createv(0, 0, 0, 0,
2400 				       CTLFLAG_PERMANENT|CTLFLAG_READONLY,
2401 				       CTLTYPE_INT, "vtw_debug_syscall",
2402 				       SYSCTL_DESCR("vtw debug"
2403 						    " system call number"),
2404 				       0, 0, &vtw_syscall, 0, node->sysctl_num,
2405 				       CTL_CREATE, CTL_EOL);
2406 		}
2407 		sysctl_root.sysctl_flags = flags;
2408 	}
2409 }
2410 #else /* !VTW_DEBUG */
2411 static void
2412 vtw_debug_init(void)
2413 {
2414 	return;
2415 }
2416 #endif /* !VTW_DEBUG */
2417