1 /* $NetBSD: tcp_vtw.h,v 1.11 2024/10/07 23:17:00 jakllsch Exp $ */ 2 /* 3 * Copyright (c) 2011 The NetBSD Foundation, Inc. 4 * All rights reserved. 5 * 6 * This code is derived from software contributed to The NetBSD Foundation 7 * by Coyote Point Systems, Inc. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 19 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 20 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 21 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 22 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 23 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 24 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 25 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 26 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 27 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 28 * POSSIBILITY OF SUCH DAMAGE. 29 */ 30 31 /* 32 * Vestigial time-wait. 33 * 34 * This implementation uses cache-efficient techniques, which will 35 * appear somewhat peculiar. The main philosophy is to optimise the 36 * amount of information available within a cache line. Cache miss is 37 * expensive. So we employ ad-hoc techniques to pull a series of 38 * linked-list follows into a cache line. One cache line, multiple 39 * linked-list equivalents. 40 * 41 * One such ad-hoc technique is fat pointers. Additional degrees of 42 * ad-hoqueness result from having to hand tune it for pointer size 43 * and for cache line size. 44 * 45 * The 'fat pointer' approach aggregates, for x86_32, 15 linked-list 46 * data structures into one cache line. The additional 32 bits in the 47 * cache line are used for linking fat pointers, and for 48 * allocation/bookkeeping. 49 * 50 * The 15 32-bit tags encode the pointers to the linked list elements, 51 * and also encode the results of a search comparison. 52 * 53 * First, some more assumptions/restrictions. 54 * 55 * All the fat pointers are from a contiguous allocation arena. Thus, 56 * we can refer to them by offset from a base, not as full pointers. 57 * 58 * All the linked list data elements are also from a contiguous 59 * allocation arena, again so that we can refer to them as offset from 60 * a base. 61 * 62 * In order to add a data element to a fat pointer, a key value is 63 * computed, based on unique data within the data element. It is the 64 * linear searching of the linked lists of these elements based on 65 * these unique data that are being optimised here. 66 * 67 * Lets call the function that computes the key k(e), where e is the 68 * data element. In this example, k(e) returns 32-bits. 69 * 70 * Consider a set E (say of order 15) of data elements. Let K be 71 * the set of the k(e) for e in E. 72 * 73 * Let O be the set of the offsets from the base of the data elements in E. 74 * 75 * For each x in K, for each matching o in O, let t be x ^ o. These 76 * are the tags. (More or less). 77 * 78 * In order to search all the data elements in E, we compute the 79 * search key, and one at a time, XOR the key into the tags. If any 80 * result is a valid data element index, we have a possible match. If 81 * not, there is no match. 82 * 83 * The no-match cases mean we do not have to de-reference the pointer 84 * to the data element in question. We save cache miss penalty and 85 * cache load decreases. Only in the case of a valid looking data 86 * element index, do we have to look closer. 87 * 88 * Thus, in the absence of false positives, 15 data elements can be 89 * searched with one cache line fill, as opposed to 15 cache line 90 * fills for the usual implementation. 91 * 92 * The vestigial time waits (vtw_t), the data elements in the above, are 93 * searched by faddr, fport, laddr, lport. The key is a function of 94 * these values. 95 * 96 * We hash these keys into the traditional hash chains to reduce the 97 * search time, and use fat pointers to reduce the cache impacts of 98 * searching. 99 * 100 * The vtw_t are, per requirement, in a contiguous chunk. Allocation 101 * is done with a clock hand, and all vtw_t within one allocation 102 * domain have the same lifetime, so they will always be sorted by 103 * age. 104 * 105 * A vtw_t will be allocated, timestamped, and have a fixed future 106 * expiration. It will be added to a hash bucket implemented with fat 107 * pointers, which means that a cache line will be allocated in the 108 * hash bucket, placed at the head (more recent in time) and the vtw_t 109 * will be added to this. As more entries are added, the fat pointer 110 * cache line will fill, requiring additional cache lines for fat 111 * pointers to be allocated. These will be added at the head, and the 112 * aged entries will hang down, tapeworm like. As the vtw_t entries 113 * expire, the corresponding slot in the fat pointer will be 114 * reclaimed, and eventually the cache line will completely empty and 115 * be re-cycled, if not at the head of the chain. 116 * 117 * At times, a time-wait timer is restarted. This corresponds to 118 * deleting the current entry and re-adding it. 119 * 120 * Most of the time, they are just placed here to die. 121 */ 122 #ifndef _NETINET_TCP_VTW_H 123 #define _NETINET_TCP_VTW_H 124 125 #include <sys/types.h> 126 #include <sys/socket.h> 127 #include <sys/sysctl.h> 128 #include <net/if.h> 129 #include <netinet/in.h> 130 #include <netinet/in_systm.h> 131 #include <netinet/ip.h> 132 #include <netinet/in_pcb.h> 133 #include <netinet/in_var.h> 134 #include <netinet/ip_var.h> 135 #include <netinet/in.h> 136 #include <netinet/tcp.h> 137 #include <netinet/tcp_timer.h> 138 #include <netinet/tcp_var.h> 139 #include <netinet6/in6.h> 140 #include <netinet/ip6.h> 141 #include <netinet6/ip6_var.h> 142 #include <netinet6/in6_pcb.h> 143 #include <netinet6/ip6_var.h> 144 #include <netinet6/in6_var.h> 145 #include <netinet/icmp6.h> 146 147 #define VTW_NCLASS (1+3) /* # different classes */ 148 149 /* 150 * fat pointers, MI. 151 */ 152 struct fatp_mi; 153 154 #if CACHE_LINE_SIZE >= 128 155 typedef uint64_t fatp_word_t; 156 #else 157 typedef uint32_t fatp_word_t; 158 #endif 159 160 typedef struct fatp_mi fatp_t; 161 162 /* Supported cacheline sizes: 32 64 128 bytes. See fatp_key(), 163 * fatp_slot_from_key(), fatp_xtra[]. 164 */ 165 #define FATP_NTAGS (CACHE_LINE_SIZE / sizeof(fatp_word_t) - 1) 166 #define FATP_NXT_WIDTH (sizeof(fatp_word_t) * NBBY - FATP_NTAGS) 167 168 #define FATP_MAX (1 << (FATP_NXT_WIDTH < 31 ? FATP_NXT_WIDTH : 31)) 169 170 /* Worked example: ULP32 with 64-byte cacheline (32-bit x86): 171 * 15 tags per cacheline. At most 2^17 fat pointers per fatp_ctl_t. 172 * The comments on the fatp_mi members, below, correspond to the worked 173 * example. 174 */ 175 struct fatp_mi { 176 fatp_word_t inuse : FATP_NTAGS; /* (1+15)*4 == CL_SIZE */ 177 fatp_word_t nxt : FATP_NXT_WIDTH;/* at most 2^17 fat pointers */ 178 fatp_word_t tag[FATP_NTAGS]; /* 15 tags per CL */ 179 }; 180 181 static __inline int 182 fatp_ntags(void) 183 { 184 return FATP_NTAGS; 185 } 186 187 static __inline int 188 fatp_full(fatp_t *fp) 189 { 190 fatp_t full; 191 192 full.inuse = (1U << FATP_NTAGS) - 1U; 193 194 return (fp->inuse == full.inuse); 195 } 196 197 struct vtw_common; 198 struct vtw_v4; 199 struct vtw_v6; 200 struct vtw_ctl; 201 202 /*!\brief common to all vtw 203 */ 204 typedef struct vtw_common { 205 struct timeval expire; /* date of birth+msl */ 206 uint32_t key; /* hash key: full hash */ 207 uint32_t port_key; /* hash key: local port hash */ 208 uint32_t rcv_nxt; 209 uint32_t rcv_wnd; 210 uint32_t snd_nxt; 211 uint32_t snd_scale : 8; /* window scaling for send win */ 212 uint32_t msl_class : 2; /* TCP MSL class {0,1,2,3} */ 213 uint32_t reuse_port : 1; 214 uint32_t reuse_addr : 1; 215 uint32_t v6only : 1; 216 uint32_t hashed : 1; /* reachable via FATP */ 217 uint32_t uid; 218 } vtw_t; 219 220 /*!\brief vestigial timewait for IPv4 221 */ 222 typedef struct vtw_v4 { 223 vtw_t common; /* must be first */ 224 uint16_t lport; 225 uint16_t fport; 226 uint32_t laddr; 227 uint32_t faddr; 228 } vtw_v4_t; 229 230 /*!\brief vestigial timewait for IPv6 231 */ 232 typedef struct vtw_v6 { 233 vtw_t common; /* must be first */ 234 uint16_t lport; 235 uint16_t fport; 236 struct in6_addr laddr; 237 struct in6_addr faddr; 238 } vtw_v6_t; 239 240 struct fatp_ctl; 241 typedef struct vtw_ctl vtw_ctl_t; 242 typedef struct fatp_ctl fatp_ctl_t; 243 244 /* 245 * The vestigial time waits are kept in a contiguous chunk. 246 * Allocation and free pointers run as clock hands thru this array. 247 */ 248 struct vtw_ctl { 249 fatp_ctl_t *fat; /* collection of fatp to use */ 250 vtw_ctl_t *ctl; /* <! controller's controller */ 251 union { 252 vtw_t *v; /* common */ 253 struct vtw_v4 *v4; /* IPv4 resources */ 254 struct vtw_v6 *v6; /* IPv6 resources */ 255 } base, /* base of vtw_t array */ 256 /**/ lim, /* extent of vtw_t array */ 257 /**/ alloc, /* allocation pointer */ 258 /**/ oldest; /* ^ to oldest */ 259 uint32_t nfree; /* # free */ 260 uint32_t nalloc; /* # allocated */ 261 uint32_t idx_mask; /* mask capturing all index bits*/ 262 uint32_t is_v4 : 1; 263 uint32_t is_v6 : 1; 264 uint32_t idx_bits: 6; 265 uint32_t clidx : 3; /* <! class index */ 266 }; 267 268 /*!\brief Collections of fat pointers. 269 */ 270 struct fatp_ctl { 271 vtw_ctl_t *vtw; /* associated VTWs */ 272 fatp_t *base; /* base of fatp_t array */ 273 fatp_t *lim; /* extent of fatp_t array */ 274 fatp_t *free; /* free list */ 275 uint32_t mask; /* hash mask */ 276 uint32_t nfree; /* # free */ 277 uint32_t nalloc; /* # allocated */ 278 fatp_t **hash; /* hash anchors */ 279 fatp_t **port; /* port hash anchors */ 280 }; 281 282 /*!\brief stats 283 */ 284 struct vtw_stats { 285 uint64_t ins; /* <! inserts */ 286 uint64_t del; /* <! deleted */ 287 uint64_t kill; /* <! assassination */ 288 uint64_t look[2]; /* <! lookup: full hash, port hash */ 289 uint64_t hit[2]; /* <! lookups that hit */ 290 uint64_t miss[2]; /* <! lookups that miss */ 291 uint64_t probe[2]; /* <! hits+miss */ 292 uint64_t losing[2]; /* <! misses requiring dereference */ 293 uint64_t max_chain[2]; /* <! max fatp chain traversed */ 294 uint64_t max_probe[2]; /* <! max probes in any one chain */ 295 uint64_t max_loss[2]; /* <! max losing probes in any one 296 * chain 297 */ 298 }; 299 300 typedef struct vtw_stats vtw_stats_t; 301 302 /*!\brief follow fatp next 'pointer' 303 */ 304 static __inline fatp_t * 305 fatp_next(fatp_ctl_t *fat, fatp_t *fp) 306 { 307 return fp->nxt ? fat->base + fp->nxt-1 : 0; 308 } 309 310 /*!\brief determine a collection-relative fat pointer index. 311 */ 312 static __inline uint32_t 313 fatp_index(fatp_ctl_t *fat, fatp_t *fp) 314 { 315 return fp ? 1 + (fp - fat->base) : 0; 316 } 317 318 319 static __inline uint32_t 320 v4_tag(uint32_t faddr, uint32_t fport, uint32_t laddr, uint32_t lport) 321 { 322 return (ntohl(faddr) + ntohs(fport) 323 + ntohl(laddr) + ntohs(lport)); 324 } 325 326 static __inline uint32_t 327 v6_tag(const struct in6_addr *faddr, uint16_t fport, 328 const struct in6_addr *laddr, uint16_t lport) 329 { 330 #ifdef IN6_HASH 331 return IN6_HASH(faddr, fport, laddr, lport); 332 #else 333 return 0; 334 #endif 335 } 336 337 static __inline uint32_t 338 v4_port_tag(uint16_t lport) 339 { 340 uint32_t tag = lport ^ (lport << 11); 341 342 tag ^= tag << 3; 343 tag += tag >> 5; 344 tag ^= tag << 4; 345 tag += tag >> 17; 346 tag ^= tag << 25; 347 tag += tag >> 6; 348 349 return tag; 350 } 351 352 static __inline uint32_t 353 v6_port_tag(uint16_t lport) 354 { 355 return v4_port_tag(lport); 356 } 357 358 struct tcpcb; 359 struct tcphdr; 360 361 int vtw_add(int, struct tcpcb *); 362 void vtw_del(vtw_ctl_t *, vtw_t *); 363 int vtw_lookup_v4(const struct ip *ip, const struct tcphdr *th, 364 uint32_t faddr, uint16_t fport, 365 uint32_t laddr, uint16_t lport); 366 struct ip6_hdr; 367 struct in6_addr; 368 369 int vtw_lookup_v6(const struct ip6_hdr *ip, const struct tcphdr *th, 370 const struct in6_addr *faddr, uint16_t fport, 371 const struct in6_addr *laddr, uint16_t lport); 372 373 typedef struct vestigial_inpcb { 374 union { 375 struct in_addr v4; 376 struct in6_addr v6; 377 } faddr, laddr; 378 uint16_t fport, lport; 379 uint32_t valid : 1; 380 uint32_t v4 : 1; 381 uint32_t reuse_addr : 1; 382 uint32_t reuse_port : 1; 383 uint32_t v6only : 1; 384 uint32_t more_tbd : 1; 385 uint32_t uid; 386 uint32_t rcv_nxt; 387 uint32_t rcv_wnd; 388 uint32_t snd_nxt; 389 struct vtw_common *vtw; 390 struct vtw_ctl *ctl; 391 } vestigial_inpcb_t; 392 393 #ifdef _KERNEL 394 void vtw_restart(vestigial_inpcb_t*); 395 int vtw_earlyinit(void); 396 int sysctl_tcp_vtw_enable(SYSCTLFN_PROTO); 397 #endif /* _KERNEL */ 398 399 #ifdef VTW_DEBUG 400 typedef struct sin_either { 401 uint8_t sin_len; 402 uint8_t sin_family; 403 uint16_t sin_port; 404 union { 405 struct in_addr v4; 406 struct in6_addr v6; 407 } sin_addr; 408 } sin_either_t; 409 410 int vtw_debug_add(int af, sin_either_t *, sin_either_t *, int, int); 411 412 typedef struct vtw_sysargs { 413 uint32_t op; 414 sin_either_t fa; 415 sin_either_t la; 416 } vtw_sysargs_t; 417 418 #endif /* VTW_DEBUG */ 419 420 #endif /* _NETINET_TCP_VTW_H */ 421