/*	$NetBSD: tcp_vtw.h,v 1.9 2018/04/19 21:21:44 christos Exp $	*/
/*
 * Copyright (c) 2011 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Coyote Point Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Vestigial time-wait.
 *
 * This implementation uses cache-efficient techniques, which will
 * appear somewhat peculiar.  The main philosophy is to maximise the
 * amount of information available within a cache line, since cache
 * misses are expensive.  So we employ ad-hoc techniques to pull a
 * series of linked-list follows into a single cache line: one cache
 * line, multiple linked-list equivalents.
 *
 * One such ad-hoc technique is fat pointers.  Additional degrees of
 * ad-hocness result from having to hand-tune it for pointer size and
 * for cache line size.
 *
 * The 'fat pointer' approach aggregates, for x86_32, 15 linked-list
 * data structures into one cache line.  The additional 32 bits in the
 * cache line are used for linking fat pointers, and for
 * allocation/bookkeeping.
 *
 * The 15 32-bit tags encode the pointers to the linked-list elements,
 * and also encode the result of a search comparison.
 *
 * First, some more assumptions/restrictions.
 *
 * All the fat pointers come from a contiguous allocation arena.  Thus,
 * we can refer to them by offset from a base, not as full pointers.
 *
 * All the linked-list data elements also come from a contiguous
 * allocation arena, again so that we can refer to them by offset from
 * a base.
 *
 * In order to add a data element to a fat pointer, a key value is
 * computed, based on unique data within the data element.  It is the
 * linear searching of the linked lists of these elements, based on
 * these unique data, that is being optimised here.
 *
 * Let's call the function that computes the key k(e), where e is the
 * data element.  In this example, k(e) returns 32 bits.
 *
 * Consider a set E (say of order 15) of data elements.  Let K be
 * the set of the k(e) for e in E.
 *
 * Let O be the set of offsets from the base of the data elements in E.
 *
 * For each x in K and the matching o in O (the key and offset of the
 * same element), let t be x ^ o.  These, more or less, are the tags.
 *
 * In order to search all the data elements in E, we compute the
 * search key and, one at a time, XOR the key into the tags.  If any
 * result is a valid data element index, we have a possible match.  If
 * not, there is no match.
 *
 * The no-match cases mean we do not have to dereference the pointer
 * to the data element in question: we avoid the cache-miss penalty
 * and reduce cache load.  Only in the case of a valid-looking data
 * element index do we have to look closer.  (A code sketch of this
 * tag scheme follows this comment.)
 *
 * Thus, in the absence of false positives, 15 data elements can be
 * searched with one cache line fill, as opposed to 15 cache line
 * fills for the usual implementation.
 *
 * The vestigial time-waits (vtw_t), the data elements in the above,
 * are searched by faddr, fport, laddr, lport.  The key is a function
 * of these values.
 *
 * We hash these keys into traditional hash chains to reduce the
 * search time, and use fat pointers to reduce the cache impact of
 * searching.
 *
 * The vtw_t are, per requirement, in a contiguous chunk.  Allocation
 * is done with a clock hand, and all vtw_t within one allocation
 * domain have the same lifetime, so they will always be sorted by
 * age.
 *
 * A vtw_t is allocated, timestamped, and given a fixed future
 * expiration.  It is added to a hash bucket implemented with fat
 * pointers, which means that a cache line is allocated in the hash
 * bucket, placed at the head (more recent in time), and the vtw_t is
 * added to it.  As more entries are added, the fat-pointer cache line
 * fills, requiring additional cache lines for fat pointers to be
 * allocated.  These are added at the head, and the aged entries hang
 * down, tapeworm-like.  As the vtw_t entries expire, the
 * corresponding slots in the fat pointer are reclaimed, and
 * eventually the cache line empties completely and is recycled, if
 * not at the head of the chain.
 *
 * At times, a time-wait timer is restarted.  This corresponds to
 * deleting the current entry and re-adding it.
 *
 * Most of the time, though, entries are just placed here to die.
 */
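
/*
 * Illustrative sketch of the tag scheme described above; it is not
 * part of this header's API and is never compiled.  The element type,
 * the key function and all names below are hypothetical; only the
 * t = k(e) ^ o construction and the XOR probe mirror the text.
 */
#if 0	/* example only */
struct elem {
	uint32_t unique_data;			/* what k(e) is computed from */
};

static uint32_t
k(const struct elem *e)				/* hypothetical key function */
{
	return e->unique_data * 2654435761U;	/* any decent 32-bit hash */
}

/* Build the tags for elements at offsets o[0..n-1] within base[]. */
static void
make_tags(const struct elem *base, const uint32_t *o, uint32_t *tag, int n)
{
	int i;

	for (i = 0; i < n; i++)
		tag[i] = k(&base[o[i]]) ^ o[i];	/* t = k(e) ^ o */
}

/*
 * Probe: XOR the search key into each tag.  A result that cannot be a
 * valid offset rules that slot out without dereferencing anything; a
 * plausible offset still has to be confirmed by looking at the element.
 */
static const struct elem *
probe(const struct elem *base, const uint32_t *tag, int n,
    uint32_t key, uint32_t idx_mask)
{
	int i;

	for (i = 0; i < n; i++) {
		uint32_t o = key ^ tag[i];

		if ((o & ~idx_mask) != 0)
			continue;	/* not a valid offset: no cache miss taken */
		if (k(&base[o]) == key)
			return &base[o];	/* key matches; caller verifies the element */
	}
	return NULL;
}
#endif	/* example only */
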
#ifndef _NETINET_TCP_VTW_H
#define _NETINET_TCP_VTW_H

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <net/if.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet6/in6.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/in6_var.h>
#include <netinet/icmp6.h>

#define	VTW_NCLASS	(1+3)		/* # different classes */

/*
 * Fat pointers, machine-independent (MI) definitions.
 */
struct fatp_mi;

typedef uint32_t fatp_word_t;

typedef struct fatp_mi	fatp_t;

/* Supported cacheline sizes: 32, 64, and 128 bytes.  See fatp_key(),
 * fatp_slot_from_key(), fatp_xtra[].
 */
#define	FATP_NTAGS	(CACHE_LINE_SIZE / sizeof(fatp_word_t) - 1)
#define	FATP_NXT_WIDTH	(sizeof(fatp_word_t) * NBBY - FATP_NTAGS)

#define	FATP_MAX	(1 << FATP_NXT_WIDTH)

/* Worked example: ILP32 with 64-byte cacheline (32-bit x86):
 * 15 tags per cacheline.  At most 2^17 fat pointers per fatp_ctl_t.
 * The comments on the fatp_mi members, below, correspond to the worked
 * example.
 */
struct fatp_mi {
	fatp_word_t	inuse	: FATP_NTAGS;	/* (1+15)*4 == CL_SIZE */
	fatp_word_t	nxt	: FATP_NXT_WIDTH;/* at most 2^17 fat pointers */
	fatp_word_t	tag[FATP_NTAGS];	/* 15 tags per CL */
};
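
/*
 * Sanity checks for the geometry above (illustrative only, hence not
 * compiled): the two bitfields fill exactly one fatp_word_t, so a
 * fatp_t occupies exactly one cache line.  For the ILP32/64-byte
 * worked example that is 15 tags and a 17-bit 'nxt', i.e.
 * FATP_MAX == 2^17.
 */
#if 0	/* example only */
__CTASSERT(FATP_NTAGS + FATP_NXT_WIDTH == sizeof(fatp_word_t) * NBBY);
__CTASSERT(sizeof(fatp_t) == CACHE_LINE_SIZE);
#endif	/* example only */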

static __inline int
fatp_ntags(void)
{
	return FATP_NTAGS;
}

static __inline int
fatp_full(fatp_t *fp)
{
	fatp_t full;

	full.inuse = (1U << FATP_NTAGS) - 1U;

	return (fp->inuse == full.inuse);
}
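
/*
 * A fat pointer can take another tag whenever fatp_full() is false.
 * One plausible way to pick a slot is to scan the inuse bitmap for a
 * clear bit; this is only a sketch (fatp_free_slot() is hypothetical,
 * and the real insertion code in tcp_vtw.c may differ).
 */
#if 0	/* example only */
static __inline int
fatp_free_slot(const fatp_t *fp)
{
	unsigned slot;

	for (slot = 0; slot < FATP_NTAGS; slot++) {
		if ((fp->inuse & (1U << slot)) == 0)
			return (int)slot;	/* lowest free tag slot */
	}
	return -1;			/* full: chain another fatp_t */
}
#endif	/* example only */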

struct vtw_common;
struct vtw_v4;
struct vtw_v6;
struct vtw_ctl;

/*!\brief common to all vtw
 */
typedef struct vtw_common {
	struct timeval	expire;		/* date of birth + MSL */
	uint32_t	key;		/* hash key: full hash */
	uint32_t	port_key;	/* hash key: local port hash */
	uint32_t	rcv_nxt;
	uint32_t	rcv_wnd;
	uint32_t	snd_nxt;
	uint32_t	snd_scale	: 8;	/* window scaling for send win */
	uint32_t	msl_class	: 2;	/* TCP MSL class {0,1,2,3} */
	uint32_t	reuse_port	: 1;
	uint32_t	reuse_addr	: 1;
	uint32_t	v6only		: 1;
	uint32_t	hashed		: 1;	/* reachable via FATP */
	uint32_t	uid;
} vtw_t;
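
/*
 * Since 'expire' already holds birth time plus the MSL, expiry is a
 * plain timeval comparison.  A sketch, assuming 'now' comes from the
 * same clock that stamped the entry (vtw_expired() is hypothetical):
 */
#if 0	/* example only */
static __inline bool
vtw_expired(const vtw_t *vtw, const struct timeval *now)
{
	/* Dead once now >= expire; timercmp() only supports <, >, ==. */
	return !timercmp(now, &vtw->expire, <);
}
#endif	/* example only */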

/*!\brief vestigial timewait for IPv4
 */
typedef struct vtw_v4 {
	vtw_t		common;		/* must be first */
	uint16_t	lport;
	uint16_t	fport;
	uint32_t	laddr;
	uint32_t	faddr;
} vtw_v4_t;

/*!\brief vestigial timewait for IPv6
 */
typedef struct vtw_v6 {
	vtw_t		common;		/* must be first */
	uint16_t	lport;
	uint16_t	fport;
	struct in6_addr	laddr;
	struct in6_addr	faddr;
} vtw_v6_t;

struct fatp_ctl;
typedef struct vtw_ctl		vtw_ctl_t;
typedef struct fatp_ctl		fatp_ctl_t;

/*
 * The vestigial time-waits are kept in a contiguous chunk.
 * Allocation and free pointers run as clock hands through this array.
 */
struct vtw_ctl {
	fatp_ctl_t	*fat;		/* collection of fatp to use	*/
	vtw_ctl_t	*ctl;		/*!< controller's controller	*/
	union {
		vtw_t		*v;	/* common			*/
		struct vtw_v4	*v4;	/* IPv4 resources		*/
		struct vtw_v6	*v6;	/* IPv6 resources		*/
	}		base,		/* base of vtw_t array		*/
		/**/	lim,		/* extent of vtw_t array	*/
		/**/	alloc,		/* allocation pointer		*/
		/**/	oldest;		/* pointer to oldest		*/
	uint32_t	nfree;		/* # free			*/
	uint32_t	nalloc;		/* # allocated			*/
	uint32_t	idx_mask;	/* mask capturing all index bits*/
	uint32_t	is_v4	: 1;
	uint32_t	is_v6	: 1;
	uint32_t	idx_bits: 6;
	uint32_t	clidx	: 3;	/*!< class index */
};
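
/*
 * Allocation simply advances a clock hand through the contiguous
 * arena.  A sketch for the IPv4 case, assuming 'lim' points one past
 * the last element and ignoring class and expiry bookkeeping
 * (vtw_clock_alloc() is hypothetical; the real allocator lives in
 * tcp_vtw.c):
 */
#if 0	/* example only */
static __inline vtw_t *
vtw_clock_alloc(vtw_ctl_t *ctl)
{
	struct vtw_v4 *v;

	if (ctl->nfree == 0)
		return NULL;		/* arena full: reclaim oldest first */

	v = ctl->alloc.v4;
	if (++ctl->alloc.v4 >= ctl->lim.v4)
		ctl->alloc.v4 = ctl->base.v4;	/* wrap the clock hand */

	ctl->nfree--;
	ctl->nalloc++;
	return &v->common;
}
#endif	/* example only */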

/*!\brief Collections of fat pointers.
 */
struct fatp_ctl {
	vtw_ctl_t	*vtw;		/* associated VTWs		*/
	fatp_t		*base;		/* base of fatp_t array		*/
	fatp_t		*lim;		/* extent of fatp_t array	*/
	fatp_t		*free;		/* free list			*/
	uint32_t	mask;		/* hash mask			*/
	uint32_t	nfree;		/* # free			*/
	uint32_t	nalloc;		/* # allocated			*/
	fatp_t		**hash;		/* hash anchors			*/
	fatp_t		**port;		/* port hash anchors		*/
};

/*!\brief stats
 */
struct vtw_stats {
	uint64_t	ins;		/*!< inserts */
	uint64_t	del;		/*!< deletes */
	uint64_t	kill;		/*!< assassinations */
	uint64_t	look[2];	/*!< lookups: full hash, port hash */
	uint64_t	hit[2];		/*!< lookups that hit */
	uint64_t	miss[2];	/*!< lookups that miss */
	uint64_t	probe[2];	/*!< hits + misses */
	uint64_t	losing[2];	/*!< misses requiring a dereference */
	uint64_t	max_chain[2];	/*!< max fatp chain traversed */
	uint64_t	max_probe[2];	/*!< max probes in any one chain */
	uint64_t	max_loss[2];	/*!< max losing probes in any one
					 *   chain */
};

typedef struct vtw_stats	vtw_stats_t;

/*!\brief	follow fatp next 'pointer'
 */
static __inline fatp_t *
fatp_next(fatp_ctl_t *fat, fatp_t *fp)
{
	return fp->nxt ? fat->base + fp->nxt - 1 : NULL;
}

/*!\brief determine a collection-relative fat pointer index.
 */
static __inline uint32_t
fatp_index(fatp_ctl_t *fat, fatp_t *fp)
{
	return fp ? 1 + (fp - fat->base) : 0;
}
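
/*
 * Both 'nxt' and fatp_index() are 1-based so that 0 can mean "none".
 * Putting fatp_next() and the tag scheme together, a lookup walks the
 * fat-pointer chain of a hash bucket, XORs the search key into each
 * in-use tag, and dereferences a vtw_t only when the result looks like
 * a valid arena index.  This is a sketch of the idea only: the index
 * recovery and the final check are simplified, and the real lookup in
 * tcp_vtw.c also gathers vtw_stats.  vtw_from_index() is a
 * hypothetical helper mapping an arena index to its vtw_t.
 */
#if 0	/* example only */
static __inline vtw_t *
vtw_bucket_probe(fatp_ctl_t *fat, vtw_ctl_t *ctl, uint32_t key)
{
	fatp_t *fp;
	unsigned slot;

	for (fp = fat->hash[key & fat->mask]; fp != NULL;
	    fp = fatp_next(fat, fp)) {
		for (slot = 0; slot < FATP_NTAGS; slot++) {
			uint32_t idx;
			vtw_t *vtw;

			if ((fp->inuse & (1U << slot)) == 0)
				continue;	/* empty slot */

			idx = fp->tag[slot] ^ key;	/* recover the index */
			if (idx & ~ctl->idx_mask)
				continue;	/* cannot be valid: no dereference */

			/* Plausible index: only now touch the vtw_t. */
			vtw = vtw_from_index(ctl, idx);	/* hypothetical */
			if (vtw != NULL && vtw->key == key)
				return vtw;	/* caller verifies the 4-tuple */
		}
	}
	return NULL;
}
#endif	/* example only */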

static __inline uint32_t
v4_tag(uint32_t faddr, uint32_t fport, uint32_t laddr, uint32_t lport)
{
	return (ntohl(faddr)   + ntohs(fport)
		+ ntohl(laddr) + ntohs(lport));
}

static __inline uint32_t
v6_tag(const struct in6_addr *faddr, uint16_t fport,
       const struct in6_addr *laddr, uint16_t lport)
{
#ifdef IN6_HASH
	return IN6_HASH(faddr, fport, laddr, lport);
#else
	return 0;
#endif
}

static __inline uint32_t
v4_port_tag(uint16_t lport)
{
	uint32_t tag = lport ^ (lport << 11);

	tag ^= tag << 3;
	tag += tag >> 5;
	tag ^= tag << 4;
	tag += tag >> 17;
	tag ^= tag << 25;
	tag += tag >> 6;

	return tag;
}

static __inline uint32_t
v6_port_tag(uint16_t lport)
{
	return v4_port_tag(lport);
}

struct tcpcb;
struct tcphdr;

int vtw_add(int, struct tcpcb *);
void vtw_del(vtw_ctl_t *, vtw_t *);
int vtw_lookup_v4(const struct ip *ip, const struct tcphdr *th,
		  uint32_t faddr, uint16_t fport,
		  uint32_t laddr, uint16_t lport);
struct ip6_hdr;
struct in6_addr;

int vtw_lookup_v6(const struct ip6_hdr *ip, const struct tcphdr *th,
		  const struct in6_addr *faddr, uint16_t fport,
		  const struct in6_addr *laddr, uint16_t lport);

typedef struct vestigial_inpcb {
	union {
		struct in_addr	v4;
		struct in6_addr	v6;
	} faddr, laddr;
	uint16_t		fport, lport;
	uint32_t		valid		: 1;
	uint32_t		v4		: 1;
	uint32_t		reuse_addr	: 1;
	uint32_t		reuse_port	: 1;
	uint32_t		v6only		: 1;
	uint32_t		more_tbd	: 1;
	uint32_t		uid;
	uint32_t		rcv_nxt;
	uint32_t		rcv_wnd;
	uint32_t		snd_nxt;
	struct vtw_common	*vtw;
	struct vtw_ctl		*ctl;
} vestigial_inpcb_t;

#ifdef _KERNEL
void vtw_restart(vestigial_inpcb_t *);
int vtw_earlyinit(void);
int sysctl_tcp_vtw_enable(SYSCTLFN_PROTO);
#endif /* _KERNEL */

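/*
 * A vestigial_inpcb_t is the snapshot a lookup hands back in place of
 * a full PCB.  A hypothetical consumer (a sketch, not the actual
 * tcp_input() logic, which is considerably more careful) would check
 * 'valid', use the recorded sequence state, and restart the 2MSL wait
 * for an acceptable segment.  SEQ_GEQ()/SEQ_LT() are the sequence
 * comparison macros from <netinet/tcp_seq.h>.
 */
#if 0	/* example only */
static void
example_consume(vestigial_inpcb_t *vp, uint32_t seg_seq)
{
	if (!vp->valid)
		return;			/* no vestigial time-wait matched */

	/* An in-window segment restarts the 2MSL timer. */
	if (SEQ_GEQ(seg_seq, vp->rcv_nxt) &&
	    SEQ_LT(seg_seq, vp->rcv_nxt + vp->rcv_wnd))
		vtw_restart(vp);
}
#endif	/* example only */
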
#ifdef VTW_DEBUG
typedef struct sin_either {
	uint8_t		sin_len;
	uint8_t		sin_family;
	uint16_t	sin_port;
	union {
		struct in_addr	v4;
		struct in6_addr	v6;
	}		sin_addr;
} sin_either_t;

int vtw_debug_add(int af, sin_either_t *, sin_either_t *, int, int);

typedef struct vtw_sysargs {
	uint32_t	op;
	sin_either_t	fa;
	sin_either_t	la;
} vtw_sysargs_t;

#endif /* VTW_DEBUG */

#endif /* _NETINET_TCP_VTW_H */