1 /*
2  * Copyright (c) 2011 The NetBSD Foundation, Inc.
3  * All rights reserved.
4  *
5  * This code is derived from software contributed to The NetBSD Foundation
6  * by Coyote Point Systems, Inc.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
18  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
19  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
20  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
21  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27  * POSSIBILITY OF SUCH DAMAGE.
28  */
29 #include <sys/cdefs.h>
30 
31 #include "opt_ddb.h"
32 #include "opt_inet.h"
33 #include "opt_ipsec.h"
34 #include "opt_inet_csum.h"
35 #include "opt_tcp_debug.h"
36 
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/malloc.h>
40 #include <sys/kmem.h>
41 #include <sys/mbuf.h>
42 #include <sys/protosw.h>
43 #include <sys/socket.h>
44 #include <sys/socketvar.h>
45 #include <sys/errno.h>
46 #include <sys/syslog.h>
47 #include <sys/pool.h>
48 #include <sys/domain.h>
49 #include <sys/kernel.h>
50 #include <net/if.h>
51 #include <net/route.h>
52 #include <net/if_types.h>
53 
54 #include <netinet/in.h>
55 #include <netinet/in_systm.h>
56 #include <netinet/ip.h>
57 #include <netinet/in_pcb.h>
58 #include <netinet/in_var.h>
59 #include <netinet/ip_var.h>
60 #include <netinet/in_offload.h>
61 #include <netinet/ip6.h>
62 #include <netinet6/ip6_var.h>
63 #include <netinet6/in6_pcb.h>
65 #include <netinet6/in6_var.h>
66 #include <netinet/icmp6.h>
67 #include <netinet6/nd6.h>
68 
69 #include <netinet/tcp.h>
70 #include <netinet/tcp_fsm.h>
71 #include <netinet/tcp_seq.h>
72 #include <netinet/tcp_timer.h>
73 #include <netinet/tcp_var.h>
74 #include <netinet/tcp_private.h>
75 #include <netinet/tcpip.h>
76 
77 #include <machine/stdarg.h>
78 #include <netinet/tcp_vtw.h>
79 
80 __KERNEL_RCSID(0, "$NetBSD: tcp_vtw.c,v 1.7 2011/06/06 19:15:43 dyoung Exp $");
81 
82 #define db_trace(__a, __b)	do { } while (/*CONSTCOND*/0)
83 
84 static void vtw_debug_init(void);
85 
86 fatp_ctl_t fat_tcpv4;
87 fatp_ctl_t fat_tcpv6;
88 vtw_ctl_t  vtw_tcpv4[VTW_NCLASS];
89 vtw_ctl_t  vtw_tcpv6[VTW_NCLASS];
90 vtw_stats_t vtw_stats;
91 
92 /* We provide state for the lookup_ports iterator.
93  * As we are currently netlock-protected, there is only one.
94  * If we were finer-grained, we would have one per CPU.
95  * I do not want to be in the business of alloc/free.
96  * The best alternative would be to allocate on the caller's
97  * stack, but that would require them to know the struct,
98  * or at least its size.
99  * See how she goes.
100  */
101 struct tcp_ports_iterator {
102 	union {
103 		struct in_addr	v4;
104 		struct in6_addr	v6;
105 	}		addr;
106 	u_int		port;
107 
108 	uint32_t	wild	: 1;
109 
110 	vtw_ctl_t	*ctl;
111 	fatp_t		*fp;
112 
113 	uint16_t	slot_idx;
114 	uint16_t	ctl_idx;
115 };
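
/* Illustrative use of the iterator protocol (a sketch only; the real
 * callers live in the in_pcb port-lookup path, and laddr/lport stand
 * for the caller's own values):
 *
 *	struct vestigial_inpcb res;
 *	void *it = tcp_init_ports_v4(laddr, lport, 0);
 *
 *	while (tcp_next_port_v4(it, &res)) {
 *		... examine res.lport, res.uid, res.reuse_* ...
 *	}
 *
 * The iterator is exhausted when tcp_next_port_v4() returns 0, at
 * which point its ctl pointer has been cleared.
 */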
116 
117 static struct tcp_ports_iterator tcp_ports_iterator_v4;
118 static struct tcp_ports_iterator tcp_ports_iterator_v6;
119 
120 static int vtw_age(vtw_ctl_t *, struct timeval *);
121 
122 /*!\brief allocate a fat pointer from a collection.
123  */
124 static fatp_t *
125 fatp_alloc(fatp_ctl_t *fat)
126 {
127 	fatp_t	*fp	= 0;
128 
129 	if (fat->nfree) {
130 		fp = fat->free;
131 		if (fp) {
132 			fat->free = fatp_next(fat, fp);
133 			--fat->nfree;
134 			++fat->nalloc;
135 			fp->nxt = 0;
136 
137 			KASSERT(!fp->inuse);
138 		}
139 	}
140 
141 	return fp;
142 }
143 
144 /*!\brief free a fat pointer.
145  */
146 static void
147 fatp_free(fatp_ctl_t *fat, fatp_t *fp)
148 {
149 	if (fp) {
150 		KASSERT(!fp->inuse);
151 		KASSERT(!fp->nxt);
152 
153 		fp->nxt = fatp_index(fat, fat->free);
154 		fat->free = fp;
155 
156 		++fat->nfree;
157 		--fat->nalloc;
158 	}
159 }
160 
161 /*!\brief initialise a collection of fat pointers.
162  *
163  *\param n	total # fat pointers to allocate
164  *\param m	# hash buckets
165  *
166  * We allocate 2x as much, as we have two hashes: full and lport only.
167  */
168 static void
169 fatp_init(fatp_ctl_t *fat, uint32_t n, uint32_t m,
170     fatp_t *fat_base, fatp_t **fat_hash)
171 {
172 	fatp_t	*fp;
173 
174 	KASSERT(n <= FATP_MAX / 2);
175 
176 	fat->hash = fat_hash;
177 	fat->base = fat_base;
178 
179 	fat->port = &fat->hash[m];
180 
181 	fat->mask   = m - 1;	// m must be a power of 2
182 	fat->lim    = fat->base + 2*n - 1;
183 	fat->nfree  = 0;
184 	fat->nalloc = 2*n;
185 
186 	/* Initialise the free list.
187 	 */
188 	for (fp = fat->lim; fp >= fat->base; --fp) {
189 		fatp_free(fat, fp);
190 	}
191 }
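
/* Note the backwards walk above: freeing from lim down to base leaves
 * fat->base at the head of the free list, so the first fatp_alloc()
 * hands out the lowest-indexed fat pointer.
 */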
192 
193 /*
194  * The `xtra' is XORed into the tag stored.
195  */
196 static uint32_t fatp_xtra[] = {
197 	0x11111111,0x22222222,0x33333333,0x44444444,
198 	0x55555555,0x66666666,0x77777777,0x88888888,
199 	0x12121212,0x21212121,0x34343434,0x43434343,
200 	0x56565656,0x65656565,0x78787878,0x87878787,
201 	0x11221122,0x22112211,0x33443344,0x44334433,
202 	0x55665566,0x66556655,0x77887788,0x88778877,
203 	0x11112222,0x22221111,0x33334444,0x44443333,
204 	0x55556666,0x66665555,0x77778888,0x88887777,
205 };
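
/* The stored tag is recoverable only with the right slot and search
 * tag in hand.  The XOR round trip, in miniature (illustrative only):
 *
 *	stored = tag ^ idx_encode(ctl, idx) ^ fatp_xtra[slot];
 *	...
 *	bits   = stored ^ tag ^ fatp_xtra[slot];
 *	idx    = idx_decode(ctl, bits);	// == idx, or ~0 on a false hit
 */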
206 
207 /*!\brief turn a {fatp_t*,slot} into an integral key.
208  *
209  * The key can be used to obtain the fatp_t, and the slot,
210  * as it directly encodes them.
211  */
212 static inline uint32_t
213 fatp_key(fatp_ctl_t *fat, fatp_t *fp, uint32_t slot)
214 {
215 	CTASSERT(CACHE_LINE_SIZE == 32 ||
216 	         CACHE_LINE_SIZE == 64 ||
217 		 CACHE_LINE_SIZE == 128);
218 
219 	switch (fatp_ntags()) {
220 	case 7:
221 		return (fatp_index(fat, fp) << 3) | slot;
222 	case 15:
223 		return (fatp_index(fat, fp) << 4) | slot;
224 	case 31:
225 		return (fatp_index(fat, fp) << 5) | slot;
226 	default:
227 		KASSERT(0 && "no support, for no good reason");
228 		return ~0;
229 	}
230 }
231 
232 static inline uint32_t
233 fatp_slot_from_key(fatp_ctl_t *fat, uint32_t key)
234 {
235 	CTASSERT(CACHE_LINE_SIZE == 32 ||
236 	         CACHE_LINE_SIZE == 64 ||
237 		 CACHE_LINE_SIZE == 128);
238 
239 	switch (fatp_ntags()) {
240 	case 7:
241 		return key & 7;
242 	case 15:
243 		return key & 15;
244 	case 31:
245 		return key & 31;
246 	default:
247 		KASSERT(0 && "no support, for no good reason");
248 		return ~0;
249 	}
250 }
251 
252 static inline fatp_t *
253 fatp_from_key(fatp_ctl_t *fat, uint32_t key)
254 {
255 	CTASSERT(CACHE_LINE_SIZE == 32 ||
256 	         CACHE_LINE_SIZE == 64 ||
257 		 CACHE_LINE_SIZE == 128);
258 
259 	switch (fatp_ntags()) {
260 	case 7:
261 		key >>= 3;
262 		break;
263 	case 15:
264 		key >>= 4;
265 		break;
266 	case 31:
267 		key >>= 5;
268 		break;
269 	default:
270 		KASSERT(0 && "no support, for no good reason");
271 		return 0;
272 	}
273 
274 	return key ? fat->base + key - 1 : 0;
275 }
276 
277 static inline uint32_t
278 idx_encode(vtw_ctl_t *ctl, uint32_t idx)
279 {
280 	return (idx << ctl->idx_bits) | idx;
281 }
282 
283 static inline uint32_t
284 idx_decode(vtw_ctl_t *ctl, uint32_t bits)
285 {
286 	uint32_t	idx	= bits & ctl->idx_mask;
287 
288 	if (idx_encode(ctl, idx) == bits)
289 		return idx;
290 	else
291 		return ~0;
292 }
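
/* Worked example (illustrative): with idx_bits == 12 (a 4096-entry
 * arena), idx_encode(ctl, 5) yields (5 << 12) | 5 == 0x5005.  A lookup
 * that XORed with the wrong tag almost certainly breaks the duplicated
 * copies, so idx_decode() returns ~0 instead of a bogus index.
 */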
293 
294 /*!\brief	insert index into fatp hash
295  *
296  *\param	idx	-	index of element being placed in hash chain
297  *\param	tag	-	32-bit tag identifier
298  *
299  *\returns
300  *	value which can be used to locate entry.
301  *
302  *\note
303  *	we rely on the fact that there are unused high bits in the index
304  *	for verification purposes on lookup.
305  */
306 
307 static inline uint32_t
308 fatp_vtw_inshash(fatp_ctl_t *fat, uint32_t idx, uint32_t tag, int which,
309     void *dbg)
310 {
311 	fatp_t	*fp;
312 	fatp_t	**hash = (which ? fat->port : fat->hash);
313 	int	i;
314 
315 	fp = hash[tag & fat->mask];
316 
317 	while (!fp || fatp_full(fp)) {
318 		fatp_t	*fq;
319 
320 		/* All entries are inuse at the top level.
321 		 * We allocate a spare, and push the top level
322 		 * down one.  All entries in the fp we push down
323 		 * (think of a tapeworm here) will be expelled sooner than
324 		 * any entries added subsequently to this hash bucket.
325 		 * This is a property of the time waits we are exploiting.
326 		 */
327 
328 		fq = fatp_alloc(fat);
329 		if (!fq) {
330 			vtw_age(fat->vtw, 0);
331 			fp = hash[tag & fat->mask];
332 			continue;
333 		}
334 
335 		fq->inuse = 0;
336 		fq->nxt   = fatp_index(fat, fp);
337 
338 		hash[tag & fat->mask] = fq;
339 
340 		fp = fq;
341 	}
342 
343 	KASSERT(!fatp_full(fp));
344 
345 	/* Fill highest index first.  Lookup is lowest first.
346 	 */
347 	for (i = fatp_ntags(); --i >= 0; ) {
348 		if (!((1 << i) & fp->inuse)) {
349 			break;
350 		}
351 	}
352 
353 	fp->inuse |= 1 << i;
354 	fp->tag[i] = tag ^ idx_encode(fat->vtw, idx) ^ fatp_xtra[i];
355 
356 	db_trace(KTR_VTW
357 		 , (fp, "fat: inuse %5.5x tag[%x] %8.8x"
358 		    , fp->inuse
359 		    , i, fp->tag[i]));
360 
361 	return fatp_key(fat, fp, i);
362 }
363 
364 static inline int
365 vtw_alive(const vtw_t *vtw)
366 {
367 	return vtw->hashed && vtw->expire.tv_sec;
368 }
369 
370 static inline uint32_t
371 vtw_index_v4(vtw_ctl_t *ctl, vtw_v4_t *v4)
372 {
373 	if (ctl->base.v4 <= v4 && v4 <= ctl->lim.v4)
374 		return v4 - ctl->base.v4;
375 
376 	KASSERT(0 && "vtw out of bounds");
377 
378 	return ~0;
379 }
380 
381 static inline uint32_t
382 vtw_index_v6(vtw_ctl_t *ctl, vtw_v6_t *v6)
383 {
384 	if (ctl->base.v6 <= v6 && v6 <= ctl->lim.v6)
385 		return v6 - ctl->base.v6;
386 
387 	KASSERT(0 && "vtw out of bounds");
388 
389 	return ~0;
390 }
391 
392 static inline uint32_t
393 vtw_index(vtw_ctl_t *ctl, vtw_t *vtw)
394 {
395 	if (ctl->clidx)
396 		ctl = ctl->ctl;
397 
398 	if (ctl->is_v4)
399 		return vtw_index_v4(ctl, (vtw_v4_t *)vtw);
400 
401 	if (ctl->is_v6)
402 		return vtw_index_v6(ctl, (vtw_v6_t *)vtw);
403 
404 	KASSERT(0 && "neither 4 nor 6.  most curious.");
405 
406 	return ~0;
407 }
408 
409 static inline vtw_t *
410 vtw_from_index(vtw_ctl_t *ctl, uint32_t idx)
411 {
412 	if (ctl->clidx)
413 		ctl = ctl->ctl;
414 
415 	/* See if the index looks like it might be a valid index.
416 	 * Bits set outside the valid index bits are a giveaway.
417 	 */
418 	idx = idx_decode(ctl, idx);
419 
420 	if (idx == ~0) {
421 		return 0;
422 	} else if (ctl->is_v4) {
423 		vtw_v4_t	*vtw = ctl->base.v4 + idx;
424 
425 		return (ctl->base.v4 <= vtw && vtw <= ctl->lim.v4)
426 			? &vtw->common : 0;
427 	} else if (ctl->is_v6) {
428 		vtw_v6_t	*vtw = ctl->base.v6 + idx;
429 
430 		return (ctl->base.v6 <= vtw && vtw <= ctl->lim.v6)
431 			? &vtw->common : 0;
432 	} else {
433 		KASSERT(0 && "badness");
434 		return 0;
435 	}
436 }
437 
438 /*!\brief return the next vtw after this one.
439  *
440  * Due to the differing sizes of the entries in differing
441  * arenas, we have to ensure we ++ the correct pointer type.
442  *
443  * Also handles wrap.
444  */
445 static inline vtw_t *
446 vtw_next(vtw_ctl_t *ctl, vtw_t *vtw)
447 {
448 	if (ctl->is_v4) {
449 		vtw_v4_t	*v4 = (void*)vtw;
450 
451 		vtw = &(++v4)->common;
452 	} else {
453 		vtw_v6_t	*v6 = (void*)vtw;
454 
455 		vtw = &(++v6)->common;
456 	}
457 
458 	if (vtw > ctl->lim.v)
459 		vtw = ctl->base.v;
460 
461 	return vtw;
462 }
463 
464 /*!\brief	remove entry from FATP hash chains
465  */
466 static inline void
467 vtw_unhash(vtw_ctl_t *ctl, vtw_t *vtw)
468 {
469 	fatp_ctl_t	*fat	= ctl->fat;
470 	fatp_t		*fp;
471 	uint32_t	key = vtw->key;
472 	uint32_t	tag, slot, idx;
473 	vtw_v4_t	*v4 = (void*)vtw;
474 	vtw_v6_t	*v6 = (void*)vtw;
475 
476 	if (!vtw->hashed) {
477 		KASSERT(0 && "unhashed");
478 		return;
479 	}
480 
481 	if (fat->vtw->is_v4) {
482 		tag = v4_tag(v4->faddr, v4->fport, v4->laddr, v4->lport);
483 	} else if (fat->vtw->is_v6) {
484 		tag = v6_tag(&v6->faddr, v6->fport, &v6->laddr, v6->lport);
485 	} else {
486 		tag = 0;
487 		KASSERT(0 && "not reached");
488 	}
489 
490 	/* Remove from fat->hash[]
491 	 */
492 	slot = fatp_slot_from_key(fat, key);
493 	fp   = fatp_from_key(fat, key);
494 	idx  = vtw_index(ctl, vtw);
495 
496 	db_trace(KTR_VTW
497 		 , (fp, "fat: del inuse %5.5x slot %x idx %x key %x tag %x"
498 		    , fp->inuse, slot, idx, key, tag));
499 
500 	KASSERT(fp->inuse & (1 << slot));
501 	KASSERT(fp->tag[slot] == (tag ^ idx_encode(ctl, idx)
502 				  ^ fatp_xtra[slot]));
503 
504 	if ((fp->inuse & (1 << slot))
505 	    && fp->tag[slot] == (tag ^ idx_encode(ctl, idx)
506 				 ^ fatp_xtra[slot])) {
507 		fp->inuse ^= 1 << slot;
508 		fp->tag[slot] = 0;
509 
510 		/* When we delete entries, we do not compact.  This is
511 		 * due to temporality.  We add entries, and they
512 		 * (eventually) expire. Older entries will be further
513 		 * down the chain.
514 		 */
515 		if (!fp->inuse) {
516 			uint32_t hi = tag & fat->mask;
517 			fatp_t	*fq = 0;
518 			fatp_t	*fr = fat->hash[hi];
519 
520 			while (fr && fr != fp) {
521 				fr = fatp_next(fat, fq = fr);
522 			}
523 
524 			if (fr == fp) {
525 				if (fq) {
526 					fq->nxt = fp->nxt;
527 					fp->nxt = 0;
528 					fatp_free(fat, fp);
529 				} else {
530 					KASSERT(fat->hash[hi] == fp);
531 
532 					if (fp->nxt) {
533 						fat->hash[hi]
534 							= fatp_next(fat, fp);
535 						fp->nxt = 0;
536 						fatp_free(fat, fp);
537 					} else {
538 						/* retain for next use.
539 						 */
540 						;
541 					}
542 				}
543 			} else {
544 				fr = fat->hash[hi];
545 
546 				do {
547 					db_trace(KTR_VTW
548 						 , (fr
549 						    , "fat:*del inuse %5.5x"
550 						    " nxt %x"
551 						    , fr->inuse, fr->nxt));
552 
553 					fr = fatp_next(fat, fq = fr);
554 				} while (fr && fr != fp);
555 
556 				KASSERT(0 && "oops");
557 			}
558 		}
559 		vtw->key ^= ~0;
560 	}
561 
562 	if (fat->vtw->is_v4) {
563 		tag = v4_port_tag(v4->lport);
564 	} else if (fat->vtw->is_v6) {
565 		tag = v6_port_tag(v6->lport);
566 	}
567 
568 	/* Remove from fat->port[]
569 	 */
570 	key  = vtw->port_key;
571 	slot = fatp_slot_from_key(fat, key);
572 	fp   = fatp_from_key(fat, key);
573 	idx  = vtw_index(ctl, vtw);
574 
575 	db_trace(KTR_VTW
576 		 , (fp, "fatport: del inuse %5.5x"
577 		    " slot %x idx %x key %x tag %x"
578 		    , fp->inuse, slot, idx, key, tag));
579 
580 	KASSERT(fp->inuse & (1 << slot));
581 	KASSERT(fp->tag[slot] == (tag ^ idx_encode(ctl, idx)
582 				  ^ fatp_xtra[slot]));
583 
584 	if ((fp->inuse & (1 << slot))
585 	    && fp->tag[slot] == (tag ^ idx_encode(ctl, idx)
586 				 ^ fatp_xtra[slot])) {
587 		fp->inuse ^= 1 << slot;
588 		fp->tag[slot] = 0;
589 
590 		if (!fp->inuse) {
591 			uint32_t hi = tag & fat->mask;
592 			fatp_t	*fq = 0;
593 			fatp_t	*fr = fat->port[hi];
594 
595 			while (fr && fr != fp) {
596 				fr = fatp_next(fat, fq = fr);
597 			}
598 
599 			if (fr == fp) {
600 				if (fq) {
601 					fq->nxt = fp->nxt;
602 					fp->nxt = 0;
603 					fatp_free(fat, fp);
604 				} else {
605 					KASSERT(fat->port[hi] == fp);
606 
607 					if (fp->nxt) {
608 						fat->port[hi]
609 							= fatp_next(fat, fp);
610 						fp->nxt = 0;
611 						fatp_free(fat, fp);
612 					} else {
613 						/* retain for next use.
614 						 */
615 						;
616 					}
617 				}
618 			}
619 		}
620 		vtw->port_key ^= ~0;
621 	}
622 
623 	vtw->hashed = 0;
624 }
625 
626 /*!\brief	remove entry from hash, possibly free.
627  */
628 void
629 vtw_del(vtw_ctl_t *ctl, vtw_t *vtw)
630 {
631 	KASSERT(mutex_owned(softnet_lock));
632 
633 	if (vtw->hashed) {
634 		++vtw_stats.del;
635 		vtw_unhash(ctl, vtw);
636 	}
637 
638 	/* We only delete the oldest entry.
639 	 */
640 	if (vtw != ctl->oldest.v)
641 		return;
642 
643 	--ctl->nalloc;
644 	++ctl->nfree;
645 
646 	vtw->expire.tv_sec  = 0;
647 	vtw->expire.tv_usec = ~0;
648 
649 	if (!ctl->nalloc)
650 		ctl->oldest.v = 0;
651 
652 	ctl->oldest.v = vtw_next(ctl, vtw);
653 }
654 
655 /*!\brief	insert vestigial timewait in hash chain
656  */
657 static void
658 vtw_inshash_v4(vtw_ctl_t *ctl, vtw_t *vtw)
659 {
660 	uint32_t	idx	= vtw_index(ctl, vtw);
661 	uint32_t	tag;
662 	vtw_v4_t	*v4 = (void*)vtw;
663 
664 	KASSERT(mutex_owned(softnet_lock));
665 	KASSERT(!vtw->hashed);
666 	KASSERT(ctl->clidx == vtw->msl_class);
667 
668 	++vtw_stats.ins;
669 
670 	tag = v4_tag(v4->faddr, v4->fport,
671 		     v4->laddr, v4->lport);
672 
673 	vtw->key = fatp_vtw_inshash(ctl->fat, idx, tag, 0, vtw);
674 
675 	db_trace(KTR_VTW, (ctl
676 			   , "vtw: ins %8.8x:%4.4x %8.8x:%4.4x"
677 			   " tag %8.8x key %8.8x"
678 			   , v4->faddr, v4->fport
679 			   , v4->laddr, v4->lport
680 			   , tag
681 			   , vtw->key));
682 
683 	tag = v4_port_tag(v4->lport);
684 	vtw->port_key = fatp_vtw_inshash(ctl->fat, idx, tag, 1, vtw);
685 
686 	db_trace(KTR_VTW, (ctl, "vtw: ins %P - %4.4x tag %8.8x key %8.8x"
687 			   , v4->lport, v4->lport
688 			   , tag
689 			   , vtw->key));
690 
691 	vtw->hashed = 1;
692 }
693 
694 /*!\brief	insert vestigial timewait in hash chain
695  */
696 static void
697 vtw_inshash_v6(vtw_ctl_t *ctl, vtw_t *vtw)
698 {
699 	uint32_t	idx	= vtw_index(ctl, vtw);
700 	uint32_t	tag;
701 	vtw_v6_t	*v6	= (void*)vtw;
702 
703 	KASSERT(mutex_owned(softnet_lock));
704 	KASSERT(!vtw->hashed);
705 	KASSERT(ctl->clidx == vtw->msl_class);
706 
707 	++vtw_stats.ins;
708 
709 	tag = v6_tag(&v6->faddr, v6->fport,
710 		     &v6->laddr, v6->lport);
711 
712 	vtw->key = fatp_vtw_inshash(ctl->fat, idx, tag, 0, vtw);
713 
714 	tag = v6_port_tag(v6->lport);
715 	vtw->port_key = fatp_vtw_inshash(ctl->fat, idx, tag, 1, vtw);
716 
717 	db_trace(KTR_VTW, (ctl, "vtw: ins %P - %4.4x tag %8.8x key %8.8x"
718 			   , v6->lport, v6->lport
719 			   , tag
720 			   , vtw->key));
721 
722 	vtw->hashed = 1;
723 }
724 
725 static vtw_t *
726 vtw_lookup_hash_v4(vtw_ctl_t *ctl, uint32_t faddr, uint16_t fport
727 				 , uint32_t laddr, uint16_t lport
728 				 , int which)
729 {
730 	vtw_v4_t	*v4;
731 	vtw_t		*vtw;
732 	uint32_t	tag;
733 	fatp_t		*fp;
734 	int		i;
735 	uint32_t	fatps = 0, probes = 0, losings = 0;
736 
737 	if (!ctl || !ctl->fat)
738 		return 0;
739 
740 	++vtw_stats.look[which];
741 
742 	if (which) {
743 		tag = v4_port_tag(lport);
744 		fp  = ctl->fat->port[tag & ctl->fat->mask];
745 	} else {
746 		tag = v4_tag(faddr, fport, laddr, lport);
747 		fp  = ctl->fat->hash[tag & ctl->fat->mask];
748 	}
749 
750 	while (fp && fp->inuse) {
751 		uint32_t	inuse = fp->inuse;
752 
753 		++fatps;
754 
755 		for (i = 0; inuse && i < fatp_ntags(); ++i) {
756 			uint32_t	idx;
757 
758 			if (!(inuse & (1 << i)))
759 				continue;
760 
761 			inuse ^= 1 << i;
762 
763 			++probes;
764 			++vtw_stats.probe[which];
765 
766 			idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
767 			vtw = vtw_from_index(ctl, idx);
768 
769 			if (!vtw) {
770 				/* Hopefully fast path.
771 				 */
772 				db_trace(KTR_VTW
773 					 , (fp, "vtw: fast %A:%P %A:%P"
774 					    " idx %x tag %x"
775 					    , faddr, fport
776 					    , laddr, lport
777 					    , idx, tag));
778 				continue;
779 			}
780 
781 			v4 = (void*)vtw;
782 
783 			/* The de-referencing of vtw is what we want to avoid.
784 			 * Losing.
785 			 */
786 			if (vtw_alive(vtw)
787 			    && ((which ? vtw->port_key : vtw->key)
788 				== fatp_key(ctl->fat, fp, i))
789 			    && (which
790 				|| (v4->faddr == faddr && v4->laddr == laddr
791 				    && v4->fport == fport))
792 			    && v4->lport == lport) {
793 				++vtw_stats.hit[which];
794 
795 				db_trace(KTR_VTW
796 					 , (fp, "vtw: hit %8.8x:%4.4x"
797 					    " %8.8x:%4.4x idx %x key %x"
798 					    , faddr, fport
799 					    , laddr, lport
800 					    , idx_decode(ctl, idx), vtw->key));
801 
802 				KASSERT(vtw->hashed);
803 
804 				goto out;
805 			}
806 			++vtw_stats.losing[which];
807 			++losings;
808 
809 			if (vtw_alive(vtw)) {
810 				db_trace(KTR_VTW
811 					 , (fp, "vtw:!mis %8.8x:%4.4x"
812 					    " %8.8x:%4.4x key %x tag %x"
813 					    , faddr, fport
814 					    , laddr, lport
815 					    , fatp_key(ctl->fat, fp, i)
816 					    , v4_tag(faddr, fport
817 						     , laddr, lport)));
818 				db_trace(KTR_VTW
819 					 , (vtw, "vtw:!mis %8.8x:%4.4x"
820 					    " %8.8x:%4.4x key %x tag %x"
821 					    , v4->faddr, v4->fport
822 					    , v4->laddr, v4->lport
823 					    , vtw->key
824 					    , v4_tag(v4->faddr, v4->fport
825 						     , v4->laddr, v4->lport)));
826 
827 				if (vtw->key == fatp_key(ctl->fat, fp, i)) {
828 					db_trace(KTR_VTW
829 						 , (vtw, "vtw:!mis %8.8x:%4.4x"
830 						    " %8.8x:%4.4x key %x"
831 						    " which %x"
832 						    , v4->faddr, v4->fport
833 						    , v4->laddr, v4->lport
834 						    , vtw->key
835 						    , which));
836 
837 				} else {
838 					db_trace(KTR_VTW
839 						 , (vtw
840 						    , "vtw:!mis"
841 						    " key %8.8x != %8.8x"
842 						    " idx %x i %x which %x"
843 						    , vtw->key
844 						    , fatp_key(ctl->fat, fp, i)
845 						    , idx_decode(ctl, idx)
846 						    , i
847 						    , which));
848 				}
849 			} else {
850 				db_trace(KTR_VTW
851 					 , (fp
852 					    , "vtw:!mis free entry"
853 					    " idx %x vtw %p which %x"
854 					    , idx_decode(ctl, idx)
855 					    , vtw, which));
856 			}
857 		}
858 
859 		if (fp->nxt) {
860 			fp = fatp_next(ctl->fat, fp);
861 		} else {
862 			break;
863 		}
864 	}
865 	++vtw_stats.miss[which];
866 	vtw = 0;
867 out:
868 	if (fatps > vtw_stats.max_chain[which])
869 		vtw_stats.max_chain[which] = fatps;
870 	if (probes > vtw_stats.max_probe[which])
871 		vtw_stats.max_probe[which] = probes;
872 	if (losings > vtw_stats.max_loss[which])
873 		vtw_stats.max_loss[which] = losings;
874 
875 	return vtw;
876 }
877 
878 static vtw_t *
879 vtw_lookup_hash_v6(vtw_ctl_t *ctl, const struct in6_addr *faddr, uint16_t fport
880 				 , const struct in6_addr *laddr, uint16_t lport
881 				 , int which)
882 {
883 	vtw_v6_t	*v6;
884 	vtw_t		*vtw;
885 	uint32_t	tag;
886 	fatp_t		*fp;
887 	int		i;
888 	uint32_t	fatps = 0, probes = 0, losings = 0;
889 
890 	++vtw_stats.look[which];
891 
892 	if (!ctl || !ctl->fat)
893 		return 0;
894 
895 	if (which) {
896 		tag = v6_port_tag(lport);
897 		fp  = ctl->fat->port[tag & ctl->fat->mask];
898 	} else {
899 		tag = v6_tag(faddr, fport, laddr, lport);
900 		fp  = ctl->fat->hash[tag & ctl->fat->mask];
901 	}
902 
903 	while (fp && fp->inuse) {
904 		uint32_t	inuse = fp->inuse;
905 
906 		++fatps;
907 
908 		for (i = 0; inuse && i < fatp_ntags(); ++i) {
909 			uint32_t	idx;
910 
911 			if (!(inuse & (1 << i)))
912 				continue;
913 
914 			inuse ^= 1 << i;
915 
916 			++probes;
917 			++vtw_stats.probe[which];
918 
919 			idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
920 			vtw = vtw_from_index(ctl, idx);
921 
922 			db_trace(KTR_VTW
923 				 , (fp, "probe: %2d %6A:%4.4x %6A:%4.4x idx %x"
924 				    , i
925 				    , db_store(faddr, sizeof (*faddr)), fport
926 				    , db_store(laddr, sizeof (*laddr)), lport
927 				    , idx_decode(ctl, idx)));
928 
929 			if (!vtw) {
930 				/* Hopefully fast path.
931 				 */
932 				continue;
933 			}
934 
935 			v6 = (void*)vtw;
936 
937 			if (vtw_alive(vtw)
938 			    && ((which ? vtw->port_key : vtw->key)
939 				== fatp_key(ctl->fat, fp, i))
940 			    && v6->lport == lport
941 			    && (which
942 				|| (v6->fport == fport
943 				    && !bcmp(&v6->faddr, faddr, sizeof (*faddr))
944 				    && !bcmp(&v6->laddr, laddr
945 					     , sizeof (*laddr))))) {
946 				++vtw_stats.hit[which];
947 
948 				KASSERT(vtw->hashed);
949 				goto out;
950 			} else {
951 				++vtw_stats.losing[which];
952 				++losings;
953 			}
954 		}
955 
956 		if (fp->nxt) {
957 			fp = fatp_next(ctl->fat, fp);
958 		} else {
959 			break;
960 		}
961 	}
962 	++vtw_stats.miss[which];
963 	vtw = 0;
964 out:
965 	if (fatps > vtw_stats.max_chain[which])
966 		vtw_stats.max_chain[which] = fatps;
967 	if (probes > vtw_stats.max_probe[which])
968 		vtw_stats.max_probe[which] = probes;
969 	if (losings > vtw_stats.max_loss[which])
970 		vtw_stats.max_loss[which] = losings;
971 
972 	return vtw;
973 }
974 
975 /*!\brief port iterator
976  */
977 static vtw_t *
978 vtw_next_port_v4(struct tcp_ports_iterator *it)
979 {
980 	vtw_ctl_t	*ctl = it->ctl;
981 	vtw_v4_t	*v4;
982 	vtw_t		*vtw;
983 	uint32_t	tag;
984 	uint16_t	lport = it->port;
985 	fatp_t		*fp;
986 	int		i;
987 	uint32_t	fatps = 0, probes = 0, losings = 0;
988 
989 	tag = v4_port_tag(lport);
990 	if (!it->fp) {
991 		it->fp = ctl->fat->port[tag & ctl->fat->mask];
992 		it->slot_idx = 0;
993 	}
994 	fp  = it->fp;
995 
996 	while (fp) {
997 		uint32_t	inuse = fp->inuse;
998 
999 		++fatps;
1000 
1001 		for (i = it->slot_idx; inuse && i < fatp_ntags(); ++i) {
1002 			uint32_t	idx;
1003 
1004 			if (!(inuse & (1 << i)))
1005 				continue;
1006 
1007 			inuse &= ~0 << i;
1008 
1009 			if (i < it->slot_idx)
1010 				continue;
1011 
1012 			++vtw_stats.probe[1];
1013 			++probes;
1014 
1015 			idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
1016 			vtw = vtw_from_index(ctl, idx);
1017 
1018 			if (!vtw) {
1019 				/* Hopefully fast path.
1020 				 */
1021 				continue;
1022 			}
1023 
1024 			v4 = (void*)vtw;
1025 
1026 			if (vtw_alive(vtw)
1027 			    && vtw->port_key == fatp_key(ctl->fat, fp, i)
1028 			    && v4->lport == lport) {
1029 				++vtw_stats.hit[1];
1030 
1031 				it->slot_idx = i + 1;
1032 
1033 				goto out;
1034 			} else if (vtw_alive(vtw)) {
1035 				++vtw_stats.losing[1];
1036 				++losings;
1037 
1038 				db_trace(KTR_VTW
1039 					 , (vtw, "vtw:!mis"
1040 					    " port %8.8x:%4.4x %8.8x:%4.4x"
1041 					    " key %x port %x"
1042 					    , v4->faddr, v4->fport
1043 					    , v4->laddr, v4->lport
1044 					    , vtw->key
1045 					    , lport));
1046 			} else {
1047 				/* Really losing here.  We are coming
1048 				 * up with references to free entries.
1049 				 * Might find it better to use the
1050 				 * traditional lookup, or need more
1051 				 * ad-hockery.  The other ad-hockery
1052 				 * would be to pull more into the
1053 				 * cache line to reject the false
1054 				 * hits.
1055 				 */
1056 				++vtw_stats.losing[1];
1057 				++losings;
1058 				db_trace(KTR_VTW
1059 					 , (fp, "vtw:!mis port %x"
1060 					    " - free entry idx %x vtw %p"
1061 					    , lport
1062 					    , idx_decode(ctl, idx)
1063 					    , vtw));
1064 			}
1065 		}
1066 
1067 		if (fp->nxt) {
1068 			it->fp = fp = fatp_next(ctl->fat, fp);
1069 			it->slot_idx = 0;
1070 		} else {
1071 			it->fp = 0;
1072 			break;
1073 		}
1074 	}
1075 	++vtw_stats.miss[1];
1076 
1077 	vtw = 0;
1078 out:
1079 	if (fatps > vtw_stats.max_chain[1])
1080 		vtw_stats.max_chain[1] = fatps;
1081 	if (probes > vtw_stats.max_probe[1])
1082 		vtw_stats.max_probe[1] = probes;
1083 	if (losings > vtw_stats.max_loss[1])
1084 		vtw_stats.max_loss[1] = losings;
1085 
1086 	return vtw;
1087 }
1088 
1089 /*!\brief port iterator
1090  */
1091 static vtw_t *
1092 vtw_next_port_v6(struct tcp_ports_iterator *it)
1093 {
1094 	vtw_ctl_t	*ctl = it->ctl;
1095 	vtw_v6_t	*v6;
1096 	vtw_t		*vtw;
1097 	uint32_t	tag;
1098 	uint16_t	lport = it->port;
1099 	fatp_t		*fp;
1100 	int		i;
1101 	uint32_t	fatps = 0, probes = 0, losings = 0;
1102 
1103 	tag = v6_port_tag(lport);
1104 	if (!it->fp) {
1105 		it->fp = ctl->fat->port[tag & ctl->fat->mask];
1106 		it->slot_idx = 0;
1107 	}
1108 	fp  = it->fp;
1109 
1110 	while (fp) {
1111 		uint32_t	inuse = fp->inuse;
1112 
1113 		++fatps;
1114 
1115 		for (i = it->slot_idx; inuse && i < fatp_ntags(); ++i) {
1116 			uint32_t	idx;
1117 
1118 			if (!(inuse & (1 << i)))
1119 				continue;
1120 
1121 			inuse &= ~0 << i;
1122 
1123 			if (i < it->slot_idx)
1124 				continue;
1125 
1126 			++vtw_stats.probe[1];
1127 			++probes;
1128 
1129 			idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
1130 			vtw = vtw_from_index(ctl, idx);
1131 
1132 			if (!vtw) {
1133 				/* Hopefully fast path.
1134 				 */
1135 				continue;
1136 			}
1137 
1138 			v6 = (void*)vtw;
1139 
1140 			db_trace(KTR_VTW
1141 				 , (vtw, "vtw: i %x idx %x fp->tag %x"
1142 				    " tag %x xtra %x"
1143 				    , i, idx_decode(ctl, idx)
1144 				    , fp->tag[i], tag, fatp_xtra[i]));
1145 
1146 			if (vtw_alive(vtw)
1147 			    && vtw->port_key == fatp_key(ctl->fat, fp, i)
1148 			    && v6->lport == lport) {
1149 				++vtw_stats.hit[1];
1150 
1151 				db_trace(KTR_VTW
1152 					 , (fp, "vtw: nxt port %P - %4.4x"
1153 					    " idx %x key %x"
1154 					    , lport, lport
1155 					    , idx_decode(ctl, idx), vtw->key));
1156 
1157 				it->slot_idx = i + 1;
1158 				goto out;
1159 			} else if (vtw_alive(vtw)) {
1160 				++vtw_stats.losing[1];
1161 
1162 				db_trace(KTR_VTW
1163 					 , (vtw, "vtw:!mis port %6A:%4.4x"
1164 					    " %6A:%4.4x key %x port %x"
1165 					    , db_store(&v6->faddr
1166 						       , sizeof (v6->faddr))
1167 					    , v6->fport
1168 					    , db_store(&v6->laddr
1169 						       , sizeof (v6->faddr))
1170 					    , v6->lport
1171 					    , vtw->key
1172 					    , lport));
1173 			} else {
1174 				/* Really losing here.  We are coming
1175 				 * up with references to free entries.
1176 				 * Might find it better to use the
1177 				 * traditional lookup, or need more
1178 				 * ad-hockery.  The other ad-hockery
1179 				 * would be to pull more into the
1180 				 * cache line to reject the false
1181 				 * hits.
1182 				 */
1183 				++vtw_stats.losing[1];
1184 				++losings;
1185 
1186 				db_trace(KTR_VTW
1187 					 , (fp
1188 					    , "vtw:!mis port %x"
1189 					    " - free entry idx %x vtw %p"
1190 					    , lport, idx_decode(ctl, idx)
1191 					    , vtw));
1192 			}
1193 		}
1194 
1195 		if (fp->nxt) {
1196 			it->fp = fp = fatp_next(ctl->fat, fp);
1197 			it->slot_idx = 0;
1198 		} else {
1199 			it->fp = 0;
1200 			break;
1201 		}
1202 	}
1203 	++vtw_stats.miss[1];
1204 
1205 	vtw = 0;
1206 out:
1207 	if (fatps > vtw_stats.max_chain[1])
1208 		vtw_stats.max_chain[1] = fatps;
1209 	if (probes > vtw_stats.max_probe[1])
1210 		vtw_stats.max_probe[1] = probes;
1211 	if (losings > vtw_stats.max_loss[1])
1212 		vtw_stats.max_loss[1] = losings;
1213 
1214 	return vtw;
1215 }
1216 
1217 /*!\brief initialise the VTW allocation arena
1218  *
1219  * There are 1+3 allocation classes:
1220  *	0	classless
1221  *	{1,2,3}	MSL-class based allocation
1222  *
1223  * The allocation arenas are all initialised.  Classless gets all the
1224  * space.  MSL-class based divides the arena, so that allocation
1225  * within a class can proceed without having to consider entries
1226  * (aka: cache lines) from different classes.
1227  *
1228  * Usually, we are completely classless or class-based, but there can be
1229  * transition periods, corresponding to dynamic adjustments in the config
1230  * by the operator.
1231  */
1232 static void
1233 vtw_init(fatp_ctl_t *fat, vtw_ctl_t *ctl, const uint32_t n, vtw_t *ctl_base_v)
1234 {
1235 	int class_n, i;
1236 	vtw_t	*base;
1237 
1238 	ctl->base.v = ctl_base_v;
1239 
1240 	if (ctl->is_v4) {
1241 		ctl->lim.v4    = ctl->base.v4 + n - 1;
1242 		ctl->alloc.v4  = ctl->base.v4;
1243 	} else {
1244 		ctl->lim.v6    = ctl->base.v6 + n - 1;
1245 		ctl->alloc.v6  = ctl->base.v6;
1246 	}
1247 
1248 	ctl->nfree  = n;
1249 	ctl->ctl    = ctl;
1250 
1251 	ctl->idx_bits = 32;
1252 	for (ctl->idx_mask = ~0; (ctl->idx_mask & (n-1)) == n-1; ) {
1253 		ctl->idx_mask >>= 1;
1254 		ctl->idx_bits  -= 1;
1255 	}
1256 
1257 	ctl->idx_mask <<= 1;
1258 	ctl->idx_mask  |= 1;
1259 	ctl->idx_bits  += 1;
1260 
1261 	ctl->fat = fat;
1262 	fat->vtw = ctl;
1263 
1264 	/* Divide the resources equally amongst the classes.
1265 	 * This is not optimal, as the different classes
1266 	 * arrive and leave at different rates, but it is
1267 	 * the best I can do for now.
1268 	 */
1269 	class_n = n / (VTW_NCLASS-1);
1270 	base    = ctl->base.v;
1271 
1272 	for (i = 1; i < VTW_NCLASS; ++i) {
1273 		int j;
1274 
1275 		ctl[i] = ctl[0];
1276 		ctl[i].clidx = i;
1277 
1278 		ctl[i].base.v = base;
1279 		ctl[i].alloc  = ctl[i].base;
1280 
1281 		for (j = 0; j < class_n - 1; ++j) {
1282 			if (tcp_msl_enable)
1283 				base->msl_class = i;
1284 			base = vtw_next(ctl, base);
1285 		}
1286 
1287 		ctl[i].lim.v = base;
1288 		base = vtw_next(ctl, base);
1289 		ctl[i].nfree = class_n;
1290 	}
1291 
1292 	vtw_debug_init();
1293 }
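
/* Partitioning example (illustrative): with n == 4096 and
 * VTW_NCLASS == 4, each of the three MSL classes is carved a
 * contiguous class_n == 1365 entries out of the single arena, while
 * class 0 (classless) retains a view over the whole of it.
 */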
1294 
1295 /*!\brief	map class to TCP MSL
1296  */
1297 static inline uint32_t
1298 class_to_msl(int class)
1299 {
1300 	switch (class) {
1301 	case 0:
1302 	case 1:
1303 		return tcp_msl_remote ? tcp_msl_remote : (TCPTV_MSL >> 0);
1304 	case 2:
1305 		return tcp_msl_local ? tcp_msl_local : (TCPTV_MSL >> 1);
1306 	default:
1307 		return tcp_msl_loop ? tcp_msl_loop : (TCPTV_MSL >> 2);
1308 	}
1309 }
1310 
1311 /*!\brief	map TCP MSL to class
1312  */
1313 static inline uint32_t
1314 msl_to_class(int msl)
1315 {
1316 	if (tcp_msl_enable) {
1317 		if (msl <= (tcp_msl_loop ? tcp_msl_loop : (TCPTV_MSL >> 2)))
1318 			return 1+2;
1319 		if (msl <= (tcp_msl_local ? tcp_msl_local : (TCPTV_MSL >> 1)))
1320 			return 1+1;
1321 		return 1;
1322 	}
1323 	return 0;
1324 }
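
/* With the sysctl overrides left at zero, the defaults give three
 * bands (illustrative, in PR_SLOWHZ ticks; TCPTV_MSL is 30 seconds):
 *
 *	class 1 (remote):	TCPTV_MSL	30s MSL
 *	class 2 (local):	TCPTV_MSL >> 1	15s MSL
 *	class 3 (loopback):	TCPTV_MSL >> 2	7.5s MSL
 *
 * msl_to_class() inverts class_to_msl(): it returns the tightest
 * class whose MSL bound admits the requested msl, or 0 when classing
 * is disabled.
 */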
1325 
1326 /*!\brief allocate a vtw entry
1327  */
1328 static inline vtw_t *
1329 vtw_alloc(vtw_ctl_t *ctl)
1330 {
1331 	vtw_t	*vtw	= 0;
1332 	int	stuck	= 0;
1333 	int	avail	= ctl ? (ctl->nalloc + ctl->nfree) : 0;
1334 	int	msl;
1335 
1336 	KASSERT(mutex_owned(softnet_lock));
1337 
1338 	/* If no resources, we will not get far.
1339 	 */
1340 	if (!ctl || !ctl->base.v4 || avail <= 0)
1341 		return 0;
1342 
1343 	/* Obtain a free one.
1344 	 */
1345 	while (!ctl->nfree) {
1346 		vtw_age(ctl, 0);
1347 
1348 		if (++stuck > avail) {
1349 			/* When in transition between
1350 			 * schemes (classless, classed) we
1351 			 * can be stuck having to await the
1352 			 * expiration of cross-allocated entries.
1353 			 *
1354 			 * Returning zero means we will fall back to the
1355 			 * traditional TIME_WAIT handling, except in the
1356 			 * case of a re-shed, in which case we cannot
1357 			 * perform the re-shed, but will retain the extant
1358 			 * entry.
1359 			 */
1360 			db_trace(KTR_VTW
1361 				 , (ctl, "vtw:!none free in class %x %x/%x"
1362 				    , ctl->clidx
1363 				    , ctl->nalloc, ctl->nfree));
1364 
1365 			return 0;
1366 		}
1367 	}
1368 
1369 	vtw = ctl->alloc.v;
1370 
1371 	if (vtw->msl_class != ctl->clidx) {
1372 		/* Usurping rules:
1373 		 * 	0 -> {1,2,3} or {1,2,3} -> 0
1374 		 */
1375 		KASSERT(!vtw->msl_class || !ctl->clidx);
1376 
1377 		if (vtw->hashed || vtw->expire.tv_sec) {
1378 		    /* As this is owned by some other class,
1379 		     * we must wait for it to expire.
1380 		     * This will only happen on class/classless
1381 		     * transitions, which are guaranteed to progress
1382 		     * to completion in small finite time, barring bugs.
1383 		     */
1384 		    db_trace(KTR_VTW
1385 			     , (ctl, "vtw:!%p class %x!=%x %x:%x%s"
1386 				, vtw, vtw->msl_class, ctl->clidx
1387 				, vtw->expire.tv_sec
1388 				, vtw->expire.tv_usec
1389 				, vtw->hashed ? " hashed" : ""));
1390 
1391 		    return 0;
1392 		}
1393 
1394 		db_trace(KTR_VTW
1395 			 , (ctl, "vtw:!%p usurped from %x to %x"
1396 			    , vtw, vtw->msl_class, ctl->clidx));
1397 
1398 		vtw->msl_class = ctl->clidx;
1399 	}
1400 
1401 	if (vtw_alive(vtw)) {
1402 		KASSERT(0 && "next free not free");
1403 		return 0;
1404 	}
1405 
1406 	/* Advance allocation pointer.
1407 	 */
1408 	ctl->alloc.v = vtw_next(ctl, vtw);
1409 
1410 	--ctl->nfree;
1411 	++ctl->nalloc;
1412 
1413 	msl = (2 * class_to_msl(ctl->clidx) * 1000) / PR_SLOWHZ;	// msec
1414 
1415 	/* mark expiration
1416 	 */
1417 	getmicrouptime(&vtw->expire);
1418 
1419 	/* Move expiration into the future.
1420 	 */
1421 	vtw->expire.tv_sec  += msl / 1000;
1422 	vtw->expire.tv_usec += 1000 * (msl % 1000);
1423 
1424 	while (vtw->expire.tv_usec >= 1000*1000) {
1425 		vtw->expire.tv_usec -= 1000*1000;
1426 		vtw->expire.tv_sec  += 1;
1427 	}
1428 
1429 	if (!ctl->oldest.v)
1430 		ctl->oldest.v = vtw;
1431 
1432 	return vtw;
1433 }
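
/* Expiry arithmetic, worked through (illustrative): for class 1 with
 * the default TCPTV_MSL of 30 seconds, msl computes to 2*30*1000 ==
 * 60000 ms, so the entry expires 60 seconds after getmicrouptime().
 */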
1434 
1435 /*!\brief expiration
1436  */
1437 static int
1438 vtw_age(vtw_ctl_t *ctl, struct timeval *_when)
1439 {
1440 	vtw_t	*vtw;
1441 	struct timeval then, *when = _when;
1442 	int	maxtries = 0;
1443 
1444 	if (!ctl->oldest.v) {
1445 		KASSERT(!ctl->nalloc);
1446 		return 0;
1447 	}
1448 
1449 	for (vtw = ctl->oldest.v; vtw && ctl->nalloc; ) {
1450 		if (++maxtries > ctl->nalloc)
1451 			break;
1452 
1453 		if (vtw->msl_class != ctl->clidx) {
1454 			db_trace(KTR_VTW
1455 				 , (vtw, "vtw:!age class mismatch %x != %x"
1456 				    , vtw->msl_class, ctl->clidx));
1457 			/* XXXX
1458 			 * See if the appropriate action is to skip to the next.
1459 			 * XXXX
1460 			 */
1461 			ctl->oldest.v = vtw = vtw_next(ctl, vtw);
1462 			continue;
1463 		}
1464 		if (!when) {
1465 			/* Latch oldest timeval if none specified.
1466 			 */
1467 			then = vtw->expire;
1468 			when = &then;
1469 		}
1470 
1471 		if (!timercmp(&vtw->expire, when, <=))
1472 			break;
1473 
1474 		db_trace(KTR_VTW
1475 			 , (vtw, "vtw: expire %x %8.8x:%8.8x %x/%x"
1476 			    , ctl->clidx
1477 			    , vtw->expire.tv_sec
1478 			    , vtw->expire.tv_usec
1479 			    , ctl->nalloc
1480 			    , ctl->nfree));
1481 
1482 		if (!_when)
1483 			++vtw_stats.kill;
1484 
1485 		vtw_del(ctl, vtw);
1486 		vtw = ctl->oldest.v;
1487 	}
1488 
1489 	return ctl->nalloc;	// # remaining allocated
1490 }
1491 
1492 static callout_t vtw_cs;
1493 
1494 /*!\brief notice the passage of time.
1495  * It seems to be getting faster.  What happened to the year?
1496  */
1497 static void
1498 vtw_tick(void *arg)
1499 {
1500 	struct timeval now;
1501 	int i, cnt = 0;
1502 
1503 	getmicrouptime(&now);
1504 
1505 	db_trace(KTR_VTW, (arg, "vtk: tick - now %8.8x:%8.8x"
1506 			   , now.tv_sec, now.tv_usec));
1507 
1508 	mutex_enter(softnet_lock);
1509 
1510 	for (i = 0; i < VTW_NCLASS; ++i) {
1511 		cnt += vtw_age(&vtw_tcpv4[i], &now);
1512 		cnt += vtw_age(&vtw_tcpv6[i], &now);
1513 	}
1514 
1515 	/* Keep ticks coming while we need them.
1516 	 */
1517 	if (cnt)
1518 		callout_schedule(&vtw_cs, hz / 5);
1519 	else {
1520 		tcp_vtw_was_enabled = 0;
1521 		tcbtable.vestige    = 0;
1522 	}
1523 	mutex_exit(softnet_lock);
1524 }
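
/* The callout is self-disarming: vtw_control() arms it on first use
 * and vtw_tick() re-arms itself only while entries remain, so an idle
 * system pays nothing.  Life cycle, sketched:
 *
 *	vtw_control()	-> callout_schedule(&vtw_cs, hz / 5)
 *	vtw_tick()	-> vtw_age() each class; re-arm if work remains,
 *			   else detach tcbtable.vestige until re-enabled
 */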
1525 
1526 /* in_pcblookup_ports assist for handling vestigial entries.
1527  */
1528 static void *
1529 tcp_init_ports_v4(struct in_addr addr, u_int port, int wild)
1530 {
1531 	struct tcp_ports_iterator *it = &tcp_ports_iterator_v4;
1532 
1533 	bzero(it, sizeof (*it));
1534 
1535 	/* Note: the reference to vtw_tcpv4[0] is fine.
1536 	 * We do not need per-class iteration.  We just
1537 	 * need to get to the fat, and there is one
1538 	 * shared fat.
1539 	 */
1540 	if (vtw_tcpv4[0].fat) {
1541 		it->addr.v4 = addr;
1542 		it->port = port;
1543 		it->wild = !!wild;
1544 		it->ctl  = &vtw_tcpv4[0];
1545 
1546 		++vtw_stats.look[1];
1547 	}
1548 
1549 	return it;
1550 }
1551 
1552 /*!\brief export an IPv4 vtw.
1553  */
1554 static int
1555 vtw_export_v4(vtw_ctl_t *ctl, vtw_t *vtw, vestigial_inpcb_t *res)
1556 {
1557 	vtw_v4_t	*v4 = (void*)vtw;
1558 
1559 	bzero(res, sizeof (*res));
1560 
1561 	if (ctl && vtw) {
1562 		if (!ctl->clidx && vtw->msl_class)
1563 			ctl += vtw->msl_class;
1564 		else
1565 			KASSERT(ctl->clidx == vtw->msl_class);
1566 
1567 		res->valid = 1;
1568 		res->v4    = 1;
1569 
1570 		res->faddr.v4.s_addr = v4->faddr;
1571 		res->laddr.v4.s_addr = v4->laddr;
1572 		res->fport	= v4->fport;
1573 		res->lport	= v4->lport;
1574 		res->vtw	= vtw;		// netlock held over call(s)
1575 		res->ctl	= ctl;
1576 		res->reuse_addr = vtw->reuse_addr;
1577 		res->reuse_port = vtw->reuse_port;
1578 		res->snd_nxt    = vtw->snd_nxt;
1579 		res->rcv_nxt	= vtw->rcv_nxt;
1580 		res->rcv_wnd	= vtw->rcv_wnd;
1581 		res->uid	= vtw->uid;
1582 	}
1583 
1584 	return res->valid;
1585 }
1586 
1587 /*!\brief return next port in the port iterator.  yowza.
1588  */
1589 static int
1590 tcp_next_port_v4(void *arg, struct vestigial_inpcb *res)
1591 {
1592 	struct tcp_ports_iterator *it = arg;
1593 	vtw_t		*vtw = 0;
1594 
1595 	if (it->ctl)
1596 		vtw = vtw_next_port_v4(it);
1597 
1598 	if (!vtw)
1599 		it->ctl = 0;
1600 
1601 	return vtw_export_v4(it->ctl, vtw, res);
1602 }
1603 
1604 static int
1605 tcp_lookup_v4(struct in_addr faddr, uint16_t fport,
1606               struct in_addr laddr, uint16_t lport,
1607 	      struct vestigial_inpcb *res)
1608 {
1609 	vtw_t		*vtw;
1610 	vtw_ctl_t	*ctl;
1611 
1612 
1613 	db_trace(KTR_VTW
1614 		 , (res, "vtw: lookup %A:%P %A:%P"
1615 		    , faddr, fport
1616 		    , laddr, lport));
1617 
1618 	vtw = vtw_lookup_hash_v4((ctl = &vtw_tcpv4[0])
1619 				 , faddr.s_addr, fport
1620 				 , laddr.s_addr, lport, 0);
1621 
1622 	return vtw_export_v4(ctl, vtw, res);
1623 }
1624 
1625 /* in_pcblookup_ports assist for handling vestigial entries.
1626  */
1627 static void *
1628 tcp_init_ports_v6(const struct in6_addr *addr, u_int port, int wild)
1629 {
1630 	struct tcp_ports_iterator *it = &tcp_ports_iterator_v6;
1631 
1632 	bzero(it, sizeof (*it));
1633 
1634 	/* Note: the reference to vtw_tcpv6[0] is fine.
1635 	 * We do not need per-class iteration.  We just
1636 	 * need to get to the fat, and there is one
1637 	 * shared fat.
1638 	 */
1639 	if (vtw_tcpv6[0].fat) {
1640 		it->addr.v6 = *addr;
1641 		it->port = port;
1642 		it->wild = !!wild;
1643 		it->ctl  = &vtw_tcpv6[0];
1644 
1645 		++vtw_stats.look[1];
1646 	}
1647 
1648 	return it;
1649 }
1650 
1651 /*!\brief export an IPv6 vtw.
1652  */
1653 static int
1654 vtw_export_v6(vtw_ctl_t *ctl, vtw_t *vtw, vestigial_inpcb_t *res)
1655 {
1656 	vtw_v6_t	*v6 = (void*)vtw;
1657 
1658 	bzero(res, sizeof (*res));
1659 
1660 	if (ctl && vtw) {
1661 		if (!ctl->clidx && vtw->msl_class)
1662 			ctl += vtw->msl_class;
1663 		else
1664 			KASSERT(ctl->clidx == vtw->msl_class);
1665 
1666 		res->valid = 1;
1667 		res->v4    = 0;
1668 
1669 		res->faddr.v6	= v6->faddr;
1670 		res->laddr.v6	= v6->laddr;
1671 		res->fport	= v6->fport;
1672 		res->lport	= v6->lport;
1673 		res->vtw	= vtw;		// netlock held over call(s)
1674 		res->ctl	= ctl;
1675 
1676 		res->v6only	= vtw->v6only;
1677 		res->reuse_addr = vtw->reuse_addr;
1678 		res->reuse_port = vtw->reuse_port;
1679 
1680 		res->snd_nxt    = vtw->snd_nxt;
1681 		res->rcv_nxt	= vtw->rcv_nxt;
1682 		res->rcv_wnd	= vtw->rcv_wnd;
1683 		res->uid	= vtw->uid;
1684 	}
1685 
1686 	return res->valid;
1687 }
1688 
1689 static int
1690 tcp_next_port_v6(void *arg, struct vestigial_inpcb *res)
1691 {
1692 	struct tcp_ports_iterator *it = arg;
1693 	vtw_t		*vtw = 0;
1694 
1695 	if (it->ctl)
1696 		vtw = vtw_next_port_v6(it);
1697 
1698 	if (!vtw)
1699 		it->ctl = 0;
1700 
1701 	return vtw_export_v6(it->ctl, vtw, res);
1702 }
1703 
1704 static int
1705 tcp_lookup_v6(const struct in6_addr *faddr, uint16_t fport,
1706               const struct in6_addr *laddr, uint16_t lport,
1707 	      struct vestigial_inpcb *res)
1708 {
1709 	vtw_ctl_t	*ctl;
1710 	vtw_t		*vtw;
1711 
1712 	db_trace(KTR_VTW
1713 		 , (res, "vtw: lookup %6A:%P %6A:%P"
1714 		    , db_store(faddr, sizeof (*faddr)), fport
1715 		    , db_store(laddr, sizeof (*laddr)), lport));
1716 
1717 	vtw = vtw_lookup_hash_v6((ctl = &vtw_tcpv6[0])
1718 				 , faddr, fport
1719 				 , laddr, lport, 0);
1720 
1721 	return vtw_export_v6(ctl, vtw, res);
1722 }
1723 
1724 static vestigial_hooks_t tcp_hooks = {
1725 	.init_ports4	= tcp_init_ports_v4,
1726 	.next_port4	= tcp_next_port_v4,
1727 	.lookup4	= tcp_lookup_v4,
1728 	.init_ports6	= tcp_init_ports_v6,
1729 	.next_port6	= tcp_next_port_v6,
1730 	.lookup6	= tcp_lookup_v6,
1731 };
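
/* A consumer probes these hooks roughly as follows (a sketch, not the
 * literal in_pcb code):
 *
 *	struct vestigial_inpcb vp;
 *
 *	if (tcbtable.vestige &&
 *	    (*tcbtable.vestige->lookup4)(faddr, fport, laddr, lport, &vp)
 *	    && vp.valid) {
 *		... treat vp as a vestigial TIME_WAIT connection ...
 *	}
 */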
1732 
1733 static bool
1734 vtw_select(int af, fatp_ctl_t **fatp, vtw_ctl_t **ctlp)
1735 {
1736 	fatp_ctl_t	*fat;
1737 	vtw_ctl_t	*ctl;
1738 
1739 	switch (af) {
1740 	case AF_INET:
1741 		fat = &fat_tcpv4;
1742 		ctl = &vtw_tcpv4[0];
1743 		break;
1744 	case AF_INET6:
1745 		fat = &fat_tcpv6;
1746 		ctl = &vtw_tcpv6[0];
1747 		break;
1748 	default:
1749 		return false;
1750 	}
1751 	if (fatp != NULL)
1752 		*fatp = fat;
1753 	if (ctlp != NULL)
1754 		*ctlp = ctl;
1755 	return true;
1756 }
1757 
1758 /*!\brief	initialize controlling instance
1759  */
1760 static int
1761 vtw_control_init(int af)
1762 {
1763 	fatp_ctl_t	*fat;
1764 	vtw_ctl_t	*ctl;
1765 	fatp_t		*fat_base;
1766 	fatp_t		**fat_hash;
1767 	vtw_t		*ctl_base_v;
1768 	uint32_t	n, m;
1769 	size_t sz;
1770 
1771 	KASSERT(powerof2(tcp_vtw_entries));
1772 
1773 	if (!vtw_select(af, &fat, &ctl))
1774 		return EAFNOSUPPORT;
1775 
1776 	if (fat->hash != NULL) {
1777 		KASSERT(fat->base != NULL && ctl->base.v != NULL);
1778 		return 0;
1779 	}
1780 
1781 	/* Allocate 10% more capacity in the fat pointers.
1782 	 * We should only need ~#hash additional based on
1783 	 * how they age, but TIME_WAIT assassination could cause
1784 	 * sparse fat pointer utilisation.
1785 	 */
1786 	m = 512;
1787 	n = 2*m + (11 * (tcp_vtw_entries / fatp_ntags())) / 10;
1788 	sz = (ctl->is_v4 ? sizeof(vtw_v4_t) : sizeof(vtw_v6_t));
1789 
1790 	fat_hash = kmem_zalloc(2*m * sizeof(fatp_t *), KM_NOSLEEP);
1791 
1792 	if (fat_hash == NULL) {
1793 		printf("%s: could not allocate %zu bytes for "
1794 		    "hash anchors", __func__, 2*m * sizeof(fatp_t *));
1795 		return ENOMEM;
1796 	}
1797 
1798 	fat_base = kmem_zalloc(2*n * sizeof(fatp_t), KM_NOSLEEP);
1799 
1800 	if (fat_base == NULL) {
1801 		kmem_free(fat_hash, 2*m * sizeof (fatp_t *));
1802 		printf("%s: could not allocate %zu bytes for "
1803 		    "fatp_t array", __func__, 2*n * sizeof(fatp_t));
1804 		return ENOMEM;
1805 	}
1806 
1807 	ctl_base_v = kmem_zalloc(tcp_vtw_entries * sz, KM_NOSLEEP);
1808 
1809 	if (ctl_base_v == NULL) {
1810 		kmem_free(fat_hash, 2*m * sizeof (fatp_t *));
1811 		kmem_free(fat_base, 2*n * sizeof(fatp_t));
1812 		printf("%s: could not allocate %zu bytes for "
1813 		    "vtw_t array", __func__, tcp_vtw_entries * sz);
1814 		return ENOMEM;
1815 	}
1816 
1817 	fatp_init(fat, n, m, fat_base, fat_hash);
1818 
1819 	vtw_init(fat, ctl, tcp_vtw_entries, ctl_base_v);
1820 
1821 	return 0;
1822 }
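
/* Sizing example (illustrative): with tcp_vtw_entries == 4096 and 15
 * tags per fat pointer (64-byte cache lines), n == 2*512 +
 * (11 * (4096 / 15)) / 10 == 1324 fat pointers, doubled again at
 * allocation time to cover both the full and the port hashes.
 */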
1823 
1824 /*!\brief	select controlling instance
1825  */
1826 static vtw_ctl_t *
1827 vtw_control(int af, uint32_t msl)
1828 {
1829 	fatp_ctl_t	*fat;
1830 	vtw_ctl_t	*ctl;
1831 	int		class	= msl_to_class(msl);
1832 
1833 	if (!vtw_select(af, &fat, &ctl))
1834 		return NULL;
1835 
1836 	if (!fat->base || !ctl->base.v)
1837 		return NULL;
1838 
1839 	if (!tcp_vtw_was_enabled) {
1840 		/* This guarantees timer ticks until we no longer need them.
1841 		 */
1842 		tcp_vtw_was_enabled = 1;
1843 
1844 		callout_schedule(&vtw_cs, hz / 5);
1845 
1846 		tcbtable.vestige = &tcp_hooks;
1847 	}
1848 
1849 	return ctl + class;
1850 }
1851 
1852 /*!\brief	add TCP pcb to vestigial timewait
1853  */
1854 int
1855 vtw_add(int af, struct tcpcb *tp)
1856 {
1857 	int		enable;
1858 	vtw_ctl_t	*ctl;
1859 	vtw_t		*vtw;
1860 
1861 	KASSERT(mutex_owned(softnet_lock));
1862 
1863 	ctl = vtw_control(af, tp->t_msl);
1864 	if (!ctl)
1865 		return 0;
1866 
1867 	enable = (af == AF_INET) ? tcp4_vtw_enable : tcp6_vtw_enable;
1868 
1869 	vtw = vtw_alloc(ctl);
1870 
1871 	if (vtw) {
1872 		vtw->snd_nxt = tp->snd_nxt;
1873 		vtw->rcv_nxt = tp->rcv_nxt;
1874 
1875 		switch (af) {
1876 		case AF_INET: {
1877 			struct inpcb	*inp = tp->t_inpcb;
1878 			vtw_v4_t	*v4  = (void*)vtw;
1879 
1880 			v4->faddr = inp->inp_faddr.s_addr;
1881 			v4->laddr = inp->inp_laddr.s_addr;
1882 			v4->fport = inp->inp_fport;
1883 			v4->lport = inp->inp_lport;
1884 
1885 			vtw->reuse_port = !!(inp->inp_socket->so_options
1886 					     & SO_REUSEPORT);
1887 			vtw->reuse_addr = !!(inp->inp_socket->so_options
1888 					     & SO_REUSEADDR);
1889 			vtw->v6only	= 0;
1890 			vtw->uid	= inp->inp_socket->so_uidinfo->ui_uid;
1891 
1892 			vtw_inshash_v4(ctl, vtw);
1893 
1894 
1895 #ifdef VTW_DEBUG
1896 			/* Immediate lookup (connected and port) to
1897 			 * ensure at least that works!
1898 			 */
1899 			if (enable & 4) {
1900 				KASSERT(vtw_lookup_hash_v4
1901 					(ctl
1902 					 , inp->inp_faddr.s_addr, inp->inp_fport
1903 					 , inp->inp_laddr.s_addr, inp->inp_lport
1904 					 , 0)
1905 					== vtw);
1906 				KASSERT(vtw_lookup_hash_v4
1907 					(ctl
1908 					 , inp->inp_faddr.s_addr, inp->inp_fport
1909 					 , inp->inp_laddr.s_addr, inp->inp_lport
1910 					 , 1));
1911 			}
1912 			/* Immediate port iterator functionality check: not wild
1913 			 */
1914 			if (enable & 8) {
1915 				struct tcp_ports_iterator *it;
1916 				struct vestigial_inpcb res;
1917 				int cnt = 0;
1918 
1919 				it = tcp_init_ports_v4(inp->inp_laddr
1920 						       , inp->inp_lport, 0);
1921 
1922 				while (tcp_next_port_v4(it, &res)) {
1923 					++cnt;
1924 				}
1925 				KASSERT(cnt);
1926 			}
1927 			/* Immediate port iterator functionality check: wild
1928 			 */
1929 			if (enable & 16) {
1930 				struct tcp_ports_iterator *it;
1931 				struct vestigial_inpcb res;
1932 				struct in_addr any;
1933 				int cnt = 0;
1934 
1935 				any.s_addr = htonl(INADDR_ANY);
1936 
1937 				it = tcp_init_ports_v4(any, inp->inp_lport, 1);
1938 
1939 				while (tcp_next_port_v4(it, &res)) {
1940 					++cnt;
1941 				}
1942 				KASSERT(cnt);
1943 			}
1944 #endif /* VTW_DEBUG */
1945 			break;
1946 		}
1947 
1948 		case AF_INET6: {
1949 			struct in6pcb	*inp = tp->t_in6pcb;
1950 			vtw_v6_t	*v6  = (void*)vtw;
1951 
1952 			v6->faddr = inp->in6p_faddr;
1953 			v6->laddr = inp->in6p_laddr;
1954 			v6->fport = inp->in6p_fport;
1955 			v6->lport = inp->in6p_lport;
1956 
1957 			vtw->reuse_port = !!(inp->in6p_socket->so_options
1958 					     & SO_REUSEPORT);
1959 			vtw->reuse_addr = !!(inp->in6p_socket->so_options
1960 					     & SO_REUSEADDR);
1961 			vtw->v6only	= !!(inp->in6p_flags
1962 					     & IN6P_IPV6_V6ONLY);
1963 			vtw->uid	= inp->in6p_socket->so_uidinfo->ui_uid;
1964 
1965 			vtw_inshash_v6(ctl, vtw);
1966 #ifdef VTW_DEBUG
1967 			/* Immediate lookup (connected and port) to
1968 			 * ensure at least that works!
1969 			 */
1970 			if (enable & 4) {
1971 				KASSERT(vtw_lookup_hash_v6(ctl
1972 					 , &inp->in6p_faddr, inp->in6p_fport
1973 					 , &inp->in6p_laddr, inp->in6p_lport
1974 					 , 0)
1975 					== vtw);
1976 				KASSERT(vtw_lookup_hash_v6
1977 					(ctl
1978 					 , &inp->in6p_faddr, inp->in6p_fport
1979 					 , &inp->in6p_laddr, inp->in6p_lport
1980 					 , 1));
1981 			}
1982 			/* Immediate port iterator functionality check: not wild
1983 			 */
1984 			if (enable & 8) {
1985 				struct tcp_ports_iterator *it;
1986 				struct vestigial_inpcb res;
1987 				int cnt = 0;
1988 
1989 				it = tcp_init_ports_v6(&inp->in6p_laddr
1990 						       , inp->in6p_lport, 0);
1991 
1992 				while (tcp_next_port_v6(it, &res)) {
1993 					++cnt;
1994 				}
1995 				KASSERT(cnt);
1996 			}
1997 			/* Immediate port iterator functionality check: wild
1998 			 */
1999 			if (enable & 16) {
2000 				struct tcp_ports_iterator *it;
2001 				struct vestigial_inpcb res;
2002 				static struct in6_addr any = IN6ADDR_ANY_INIT;
2003 				int cnt = 0;
2004 
2005 				it = tcp_init_ports_v6(&any
2006 						       , inp->in6p_lport, 1);
2007 
2008 				while (tcp_next_port_v6(it, &res)) {
2009 					++cnt;
2010 				}
2011 				KASSERT(cnt);
2012 			}
2013 #endif /* VTW_DEBUG */
2014 			break;
2015 		}
2016 		}
2017 
2018 		tcp_canceltimers(tp);
2019 		tp = tcp_close(tp);
2020 		KASSERT(!tp);
2021 
2022 		return 1;
2023 	}
2024 
2025 	return 0;
2026 }
2027 
2028 /*!\brief	restart timer for vestigial time-wait entry
2029  */
2030 static void
2031 vtw_restart_v4(vestigial_inpcb_t *vp)
2032 {
2033 	vtw_v4_t	copy = *(vtw_v4_t*)vp->vtw;
2034 	vtw_t		*vtw;
2035 	vtw_t		*cp  = &copy.common;
2036 	vtw_ctl_t	*ctl;
2037 
2038 	KASSERT(mutex_owned(softnet_lock));
2039 
2040 	db_trace(KTR_VTW
2041 		 , (vp->vtw, "vtw: restart %A:%P %A:%P"
2042 		    , vp->faddr.v4.s_addr, vp->fport
2043 		    , vp->laddr.v4.s_addr, vp->lport));
2044 
2045 	/* Class might have changed, so have a squiz.
2046 	 */
2047 	ctl = vtw_control(AF_INET, class_to_msl(cp->msl_class));
2048 	vtw = vtw_alloc(ctl);
2049 
2050 	if (vtw) {
2051 		vtw_v4_t	*v4  = (void*)vtw;
2052 
2053 		/* Safe now to unhash the old entry
2054 		 */
2055 		vtw_del(vp->ctl, vp->vtw);
2056 
2057 		vtw->snd_nxt = cp->snd_nxt;
2058 		vtw->rcv_nxt = cp->rcv_nxt;
2059 
2060 		v4->faddr = copy.faddr;
2061 		v4->laddr = copy.laddr;
2062 		v4->fport = copy.fport;
2063 		v4->lport = copy.lport;
2064 
2065 		vtw->reuse_port = cp->reuse_port;
2066 		vtw->reuse_addr = cp->reuse_addr;
2067 		vtw->v6only	= 0;
2068 		vtw->uid	= cp->uid;
2069 
2070 		vtw_inshash_v4(ctl, vtw);
2071 	}
2072 
2073 	vp->valid = 0;
2074 }
2075 
2076 /*!\brief	restart timer for vestigial time-wait entry
2077  */
2078 static void
2079 vtw_restart_v6(vestigial_inpcb_t *vp)
2080 {
2081 	vtw_v6_t	copy = *(vtw_v6_t*)vp->vtw;
2082 	vtw_t		*vtw;
2083 	vtw_t		*cp  = &copy.common;
2084 	vtw_ctl_t	*ctl;
2085 
2086 	KASSERT(mutex_owned(softnet_lock));
2087 
2088 	db_trace(KTR_VTW
2089 		 , (vp->vtw, "vtw: restart %6A:%P %6A:%P"
2090 		    , db_store(&vp->faddr.v6, sizeof (vp->faddr.v6))
2091 		    , vp->fport
2092 		    , db_store(&vp->laddr.v6, sizeof (vp->laddr.v6))
2093 		    , vp->lport));
2094 
2095 	/* Class might have changed, so have a squiz.
2096 	 */
2097 	ctl = vtw_control(AF_INET6, class_to_msl(cp->msl_class));
2098 	vtw = vtw_alloc(ctl);
2099 
2100 	if (vtw) {
2101 		vtw_v6_t	*v6  = (void*)vtw;
2102 
2103 		/* Safe now to unhash the old entry
2104 		 */
2105 		vtw_del(vp->ctl, vp->vtw);
2106 
2107 		vtw->snd_nxt = cp->snd_nxt;
2108 		vtw->rcv_nxt = cp->rcv_nxt;
2109 
2110 		v6->faddr = copy.faddr;
2111 		v6->laddr = copy.laddr;
2112 		v6->fport = copy.fport;
2113 		v6->lport = copy.lport;
2114 
2115 		vtw->reuse_port = cp->reuse_port;
2116 		vtw->reuse_addr = cp->reuse_addr;
2117 		vtw->v6only	= cp->v6only;
2118 		vtw->uid	= cp->uid;
2119 
2120 		vtw_inshash_v6(ctl, vtw);
2121 	}
2122 
2123 	vp->valid = 0;
2124 }
2125 
2126 /*!\brief	restart timer for vestigial time-wait entry
2127  */
2128 void
2129 vtw_restart(vestigial_inpcb_t *vp)
2130 {
2131 	if (!vp || !vp->valid)
2132 		return;
2133 
2134 	if (vp->v4)
2135 		vtw_restart_v4(vp);
2136 	else
2137 		vtw_restart_v6(vp);
2138 }
2139 
2140 int
2141 sysctl_tcp_vtw_enable(SYSCTLFN_ARGS)
2142 {
2143 	int en, rc;
2144 	struct sysctlnode node;
2145 
2146 	node = *rnode;
2147 	en = *(int *)rnode->sysctl_data;
2148 	node.sysctl_data = &en;
2149 
2150 	rc = sysctl_lookup(SYSCTLFN_CALL(&node));
2151 	if (rc != 0 || newp == NULL)
2152 		return rc;
2153 
2154 	if (rnode->sysctl_data != &tcp4_vtw_enable &&
2155 	    rnode->sysctl_data != &tcp6_vtw_enable)
2156 		rc = ENOENT;
2157 	else if ((en & 1) == 0)
2158 		rc = 0;
2159 	else if (rnode->sysctl_data == &tcp4_vtw_enable)
2160 		rc = vtw_control_init(AF_INET);
2161 	else /* rnode->sysctl_data == &tcp6_vtw_enable */
2162 		rc = vtw_control_init(AF_INET6);
2163 
2164 	if (rc == 0)
2165 		*(int *)rnode->sysctl_data = en;
2166 
2167 	return rc;
2168 }
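
/* From userland, enabling looks like this (assuming the standard
 * NetBSD sysctl node names; see tcp(4)):
 *
 *	sysctl -w net.inet.tcp.vtw_enable=1
 *	sysctl -w net.inet6.tcp6.vtw_enable=1
 */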
2169 
2170 int
2171 vtw_earlyinit(void)
2172 {
2173 	int i, rc;
2174 
2175 	callout_init(&vtw_cs, 0);
2176 	callout_setfunc(&vtw_cs, vtw_tick, 0);
2177 
2178 	for (i = 0; i < VTW_NCLASS; ++i) {
2179 		vtw_tcpv4[i].is_v4 = 1;
2180 		vtw_tcpv6[i].is_v6 = 1;
2181 	}
2182 
2183 	if ((tcp4_vtw_enable & 1) != 0 &&
2184 	    (rc = vtw_control_init(AF_INET)) != 0)
2185 		return rc;
2186 
2187 	if ((tcp6_vtw_enable & 1) != 0 &&
2188 	    (rc = vtw_control_init(AF_INET6)) != 0)
2189 		return rc;
2190 
2191 	return 0;
2192 }
2193 
2194 #ifdef VTW_DEBUG
2195 #include <sys/syscallargs.h>
2196 #include <sys/sysctl.h>
2197 
2198 /*!\brief	add {laddr,lport} and {faddr,fport} entries for debug
2199  */
2200 int
2201 vtw_debug_add(int af, sin_either_t *la, sin_either_t *fa, int msl, int class)
2202 {
2203 	vtw_ctl_t	*ctl;
2204 	vtw_t		*vtw;
2205 
2206 	ctl = vtw_control(af, msl ? msl : class_to_msl(class));
2207 	if (!ctl)
2208 		return 0;
2209 
2210 	vtw = vtw_alloc(ctl);
2211 
2212 	if (vtw) {
2213 		vtw->snd_nxt = 0;
2214 		vtw->rcv_nxt = 0;
2215 
2216 		switch (af) {
2217 		case AF_INET: {
2218 			vtw_v4_t	*v4  = (void*)vtw;
2219 
2220 			v4->faddr = fa->sin_addr.v4.s_addr;
2221 			v4->laddr = la->sin_addr.v4.s_addr;
2222 			v4->fport = fa->sin_port;
2223 			v4->lport = la->sin_port;
2224 
2225 			vtw->reuse_port = 1;
2226 			vtw->reuse_addr = 1;
2227 			vtw->v6only	= 0;
2228 			vtw->uid	= 0;
2229 
2230 			vtw_inshash_v4(ctl, vtw);
2231 			break;
2232 		}
2233 
2234 		case AF_INET6: {
2235 			vtw_v6_t	*v6  = (void*)vtw;
2236 
2237 			v6->faddr = fa->sin_addr.v6;
2238 			v6->laddr = la->sin_addr.v6;
2239 
2240 			v6->fport = fa->sin_port;
2241 			v6->lport = la->sin_port;
2242 
2243 			vtw->reuse_port = 1;
2244 			vtw->reuse_addr = 1;
2245 			vtw->v6only	= 0;
2246 			vtw->uid	= 0;
2247 
2248 			vtw_inshash_v6(ctl, vtw);
2249 			break;
2250 		}
2251 
2252 		default:
2253 			break;
2254 		}
2255 
2256 		return 1;
2257 	}
2258 
2259 	return 0;
2260 }
2261 
2262 static int vtw_syscall = 0;
2263 
2264 static int
2265 vtw_debug_process(vtw_sysargs_t *ap)
2266 {
2267 	struct vestigial_inpcb vestige;
2268 	int	rc = 0;
2269 
2270 	mutex_enter(softnet_lock);
2271 
2272 	switch (ap->op) {
2273 	case 0:		// insert
2274 		vtw_debug_add(ap->la.sin_family
2275 			      , &ap->la
2276 			      , &ap->fa
2277 			      , TCPTV_MSL
2278 			      , 0);
2279 		break;
2280 
2281 	case 1:		// lookup
2282 	case 2:		// restart
2283 		switch (ap->la.sin_family) {
2284 		case AF_INET:
2285 			if (tcp_lookup_v4(ap->fa.sin_addr.v4, ap->fa.sin_port,
2286 					  ap->la.sin_addr.v4, ap->la.sin_port,
2287 					  &vestige)) {
2288 				if (ap->op == 2) {
2289 					vtw_restart(&vestige);
2290 				}
2291 				rc = 0;
2292 			} else
2293 				rc = ESRCH;
2294 			break;
2295 
2296 		case AF_INET6:
2297 			if (tcp_lookup_v6(&ap->fa.sin_addr.v6, ap->fa.sin_port,
2298 					  &ap->la.sin_addr.v6, ap->la.sin_port,
2299 					  &vestige)) {
2300 				if (ap->op == 2) {
2301 					vtw_restart(&vestige);
2302 				}
2303 				rc = 0;
2304 			} else
2305 				rc = ESRCH;
2306 			break;
2307 		default:
2308 			rc = EINVAL;
2309 		}
2310 		break;
2311 
2312 	default:
2313 		rc = EINVAL;
2314 	}
2315 
2316 	mutex_exit(softnet_lock);
2317 	return rc;
2318 }
2319 
2320 struct sys_vtw_args {
2321 	syscallarg(const vtw_sysargs_t *) req;
2322 	syscallarg(size_t) len;
2323 };
2324 
2325 static int
2326 vtw_sys(struct lwp *l, const void *_, register_t *retval)
2327 {
2328 	const struct sys_vtw_args *uap = _;
2329 	void	*buf;
2330 	int	rc;
2331 	size_t	len	= SCARG(uap, len);
2332 
2333 	if (len != sizeof (vtw_sysargs_t))
2334 		return EINVAL;
2335 
2336 	buf = kmem_alloc(len, KM_SLEEP);
2337 	if (!buf)
2338 		return ENOMEM;
2339 
2340 	rc = copyin(SCARG(uap, req), buf, len);
2341 	if (!rc) {
2342 		rc = vtw_debug_process(buf);
2343 	}
2344 	kmem_free(buf, len);
2345 
2346 	return rc;
2347 }
2348 
2349 static void
2350 vtw_sanity_check(void)
2351 {
2352 	vtw_ctl_t	*ctl;
2353 	vtw_t		*vtw;
2354 	int		i;
2355 	int		n;
2356 
2357 	for (i = 0; i < VTW_NCLASS; ++i) {
2358 		ctl = &vtw_tcpv4[i];
2359 
2360 		if (!ctl->base.v || ctl->nalloc)
2361 			continue;
2362 
2363 		for (n = 0, vtw = ctl->base.v; ; ) {
2364 			++n;
2365 			vtw = vtw_next(ctl, vtw);
2366 			if (vtw == ctl->base.v)
2367 				break;
2368 		}
2369 		db_trace(KTR_VTW
2370 			 , (ctl, "sanity: class %x n %x nfree %x"
2371 			    , i, n, ctl->nfree));
2372 
2373 		KASSERT(n == ctl->nfree);
2374 	}
2375 
2376 	for (i = 0; i < VTW_NCLASS; ++i) {
2377 		ctl = &vtw_tcpv6[i];
2378 
2379 		if (!ctl->base.v || ctl->nalloc)
2380 			continue;
2381 
2382 		for (n = 0, vtw = ctl->base.v; ; ) {
2383 			++n;
2384 			vtw = vtw_next(ctl, vtw);
2385 			if (vtw == ctl->base.v)
2386 				break;
2387 		}
2388 		db_trace(KTR_VTW
2389 			 , (ctl, "sanity: class %x n %x nfree %x"
2390 			    , i, n, ctl->nfree));
2391 		KASSERT(n == ctl->nfree);
2392 	}
2393 }
2394 
2395 /*!\brief	Initialise debug support.
2396  */
2397 static void
2398 vtw_debug_init(void)
2399 {
2400 	int	i;
2401 
2402 	vtw_sanity_check();
2403 
2404 	if (vtw_syscall)
2405 		return;
2406 
2407 	for (i = 511; i; --i) {
2408 		if (sysent[i].sy_call == sys_nosys) {
2409 			sysent[i].sy_call    = vtw_sys;
2410 			sysent[i].sy_narg    = 2;
2411 			sysent[i].sy_argsize = sizeof (struct sys_vtw_args);
2412 			sysent[i].sy_flags   = 0;
2413 
2414 			vtw_syscall = i;
2415 			break;
2416 		}
2417 	}
2418 	if (i) {
2419 		const struct sysctlnode *node;
2420 		uint32_t	flags;
2421 
2422 		flags = sysctl_root.sysctl_flags;
2423 
2424 		sysctl_root.sysctl_flags |= CTLFLAG_READWRITE;
2425 		sysctl_root.sysctl_flags &= ~CTLFLAG_PERMANENT;
2426 
2427 		sysctl_createv(0, 0, 0, &node,
2428 			       CTLFLAG_PERMANENT, CTLTYPE_NODE,
2429 			       "koff",
2430 			       SYSCTL_DESCR("Kernel Obscure Feature Finder"),
2431 			       0, 0, 0, 0, CTL_CREATE, CTL_EOL);
2432 
2433 		if (!node) {
2434 			sysctl_createv(0, 0, 0, &node,
2435 				       CTLFLAG_PERMANENT, CTLTYPE_NODE,
2436 				       "koffka",
2437 				       SYSCTL_DESCR("The Real(tm) Kernel"
2438 						    " Obscure Feature Finder"),
2439 				       0, 0, 0, 0, CTL_CREATE, CTL_EOL);
2440 		}
2441 		if (node) {
2442 			sysctl_createv(0, 0, 0, 0,
2443 				       CTLFLAG_PERMANENT|CTLFLAG_READONLY,
2444 				       CTLTYPE_INT, "vtw_debug_syscall",
2445 				       SYSCTL_DESCR("vtw debug"
2446 						    " system call number"),
2447 				       0, 0, &vtw_syscall, 0, node->sysctl_num,
2448 				       CTL_CREATE, CTL_EOL);
2449 		}
2450 		sysctl_root.sysctl_flags = flags;
2451 	}
2452 }
2453 #else /* !VTW_DEBUG */
2454 static void
2455 vtw_debug_init(void)
2456 {
2457 	return;
2458 }
2459 #endif /* !VTW_DEBUG */
2460