1 /*- 2 * Copyright (c) 2019 Mindaugas Rasiukevicius <rmind at noxt eu> 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 */ 26 27 /* 28 * NPF port map mechanism. 29 * 30 * The port map is a bitmap used to track TCP/UDP ports used for 31 * translation. Port maps are per IP addresses, therefore multiple 32 * NAT policies operating on the same IP address will share the 33 * same port map. 34 */ 35 36 #ifdef _KERNEL 37 #include <sys/cdefs.h> 38 __KERNEL_RCSID(0, "$NetBSD: npf_portmap.c,v 1.4 2019/08/11 20:26:34 rmind Exp $"); 39 40 #include <sys/param.h> 41 #include <sys/types.h> 42 43 #include <sys/atomic.h> 44 #include <sys/bitops.h> 45 #include <sys/kmem.h> 46 #include <sys/mutex.h> 47 #include <sys/cprng.h> 48 #include <sys/thmap.h> 49 #endif 50 51 #include "npf_impl.h" 52 53 /* 54 * Port map uses two-level bitmaps with compression to efficiently 55 * represent the maximum of 65536 (2^16) values. 56 * 57 * Level 0: 64 chunks each representing 1048 bits in two modes: 58 * 59 * a) If PORTMAP_L1_TAG, then up to 5 values are packed in the 60 * 64-bit integer using 12 bits for each value, starting from the 61 * most significant bits. The four 4 least significant bits are 62 * unused or reserved for pointer tagging. 63 * 64 * b) If there are more than 5 values, then PORTMAP_L1_TAG is set 65 * and the value serves as a pointer to the second level bitmap. 66 * 67 * Level 1: 16 chunks each representing 64 bits in plain uint64_t. 68 */ 69 70 #define PORTMAP_MAX_BITS (65536U) 71 #define PORTMAP_MASK (PORTMAP_MAX_BITS - 1) 72 73 #define PORTMAP_L0_SHIFT (10) // or 11 74 #define PORTMAP_L0_MASK ((1U << PORTMAP_L0_SHIFT) - 1) 75 #define PORTMAP_L0_WORDS (PORTMAP_MAX_BITS >> PORTMAP_L0_SHIFT) 76 77 #define PORTMAP_L1_SHIFT (6) 78 #define PORTMAP_L1_MASK ((1U << PORTMAP_L1_SHIFT) - 1) 79 #define PORTMAP_L1_WORDS \ 80 ((PORTMAP_MAX_BITS / PORTMAP_L0_WORDS) >> PORTMAP_L1_SHIFT) 81 82 #define PORTMAP_L1_TAG (UINT64_C(1)) // use level 1 83 #define PORTMAP_L1_GET(p) ((void *)((uintptr_t)(p) & ~(uintptr_t)3)) 84 85 CTASSERT(sizeof(uint64_t) >= sizeof(uintptr_t)); 86 87 typedef struct { 88 volatile uint64_t bits1[PORTMAP_L1_WORDS]; 89 } bitmap_l1_t; 90 91 typedef struct bitmap { 92 npf_addr_t addr; 93 volatile uint64_t bits0[PORTMAP_L0_WORDS]; 94 LIST_ENTRY(bitmap) entry; 95 unsigned addr_len; 96 } bitmap_t; 97 98 #define NPF_PORTMAP_MINPORT 1024 99 #define NPF_PORTMAP_MAXPORT 65535 100 101 struct npf_portmap { 102 thmap_t * addr_map; 103 LIST_HEAD(, bitmap) bitmap_list; 104 kmutex_t list_lock; 105 int min_port; 106 int max_port; 107 }; 108 109 static kmutex_t portmap_lock; 110 111 void 112 npf_portmap_init(npf_t *npf) 113 { 114 npf_portmap_t *pm = npf_portmap_create( 115 NPF_PORTMAP_MINPORT, NPF_PORTMAP_MAXPORT); 116 npf_param_t param_map[] = { 117 { 118 "portmap.min_port", 119 &pm->min_port, 120 .default_val = NPF_PORTMAP_MINPORT, 121 .min = 1024, .max = 65535 122 }, 123 { 124 "portmap.max_port", 125 &pm->max_port, 126 .default_val = NPF_PORTMAP_MAXPORT, 127 .min = 1024, .max = 65535 128 } 129 }; 130 npf_param_register(npf, param_map, __arraycount(param_map)); 131 mutex_init(&portmap_lock, MUTEX_DEFAULT, IPL_SOFTNET); 132 npf->portmap = pm; 133 } 134 135 void 136 npf_portmap_fini(npf_t *npf) 137 { 138 npf_portmap_destroy(npf->portmap); 139 mutex_destroy(&portmap_lock); 140 npf->portmap = NULL; // diagnostic 141 } 142 143 npf_portmap_t * 144 npf_portmap_create(int min_port, int max_port) 145 { 146 npf_portmap_t *pm; 147 148 pm = kmem_zalloc(sizeof(npf_portmap_t), KM_SLEEP); 149 mutex_init(&pm->list_lock, MUTEX_DEFAULT, IPL_SOFTNET); 150 pm->addr_map = thmap_create(0, NULL, THMAP_NOCOPY); 151 pm->min_port = min_port; 152 pm->max_port = max_port; 153 return pm; 154 } 155 156 void 157 npf_portmap_destroy(npf_portmap_t *pm) 158 { 159 npf_portmap_flush(pm); 160 KASSERT(LIST_EMPTY(&pm->bitmap_list)); 161 162 thmap_destroy(pm->addr_map); 163 mutex_destroy(&pm->list_lock); 164 kmem_free(pm, sizeof(npf_portmap_t)); 165 } 166 167 ///////////////////////////////////////////////////////////////////////// 168 169 #if defined(_LP64) 170 #define __npf_atomic_cas_64 atomic_cas_64 171 #else 172 static uint64_t 173 __npf_atomic_cas_64(volatile uint64_t *ptr, uint64_t old, uint64_t new) 174 { 175 uint64_t prev; 176 177 mutex_enter(&portmap_lock); 178 prev = *ptr; 179 if (prev == old) { 180 *ptr = new; 181 } 182 mutex_exit(&portmap_lock); 183 184 return prev; 185 } 186 #endif 187 188 /* 189 * bitmap_word_isset: test whether the bit value is in the packed array. 190 * 191 * => Return true if any value equals the bit number value. 192 * 193 * Packed array: 60 MSB bits, 5 values, 12 bits each. 194 * 195 * Reference: "Bit Twiddling Hacks" by S.E. Anderson, Stanford. 196 * Based on the hasvalue() and haszero() ideas. Since values are 197 * represented by upper 60 bits, we shift right by 4. 198 */ 199 static bool 200 bitmap_word_isset(uint64_t x, unsigned bit) 201 { 202 uint64_t m, r; 203 204 bit++; 205 KASSERT((x & PORTMAP_L1_TAG) == 0); 206 KASSERT(bit <= (PORTMAP_L0_MASK + 1)); 207 208 m = (x >> 4) ^ (UINT64_C(0x1001001001001) * bit); 209 r = (m - UINT64_C(0x1001001001001)) & (~m & UINT64_C(0x800800800800800)); 210 return r != 0; 211 } 212 213 /* 214 * bitmap_word_cax: compare-and-xor on packed array elements. 215 */ 216 static uint64_t 217 bitmap_word_cax(uint64_t x, int exp, int bit) 218 { 219 unsigned e = exp + 1; 220 221 /* 222 * We need to distinguish "no value" from zero. Just add one, 223 * since we use 12 bits to represent 11 bit values. 224 */ 225 bit++; 226 KASSERT((unsigned)bit <= (PORTMAP_L0_MASK + 1)); 227 KASSERT((x & PORTMAP_L1_TAG) == 0); 228 229 if (((x >> 52) & 0xfff) == e) 230 return x ^ ((uint64_t)bit << 52); 231 if (((x >> 40) & 0xfff) == e) 232 return x ^ ((uint64_t)bit << 40); 233 if (((x >> 28) & 0xfff) == e) 234 return x ^ ((uint64_t)bit << 28); 235 if (((x >> 16) & 0xfff) == e) 236 return x ^ ((uint64_t)bit << 16); 237 if (((x >> 4) & 0xfff) == e) 238 return x ^ ((uint64_t)bit << 4); 239 return 0; 240 } 241 242 static unsigned 243 bitmap_word_unpack(uint64_t x, unsigned bitvals[static 5]) 244 { 245 unsigned n = 0; 246 uint64_t v; 247 248 KASSERT((x & PORTMAP_L1_TAG) == 0); 249 250 if ((v = ((x >> 52)) & 0xfff) != 0) 251 bitvals[n++] = v - 1; 252 if ((v = ((x >> 40)) & 0xfff) != 0) 253 bitvals[n++] = v - 1; 254 if ((v = ((x >> 28)) & 0xfff) != 0) 255 bitvals[n++] = v - 1; 256 if ((v = ((x >> 16)) & 0xfff) != 0) 257 bitvals[n++] = v - 1; 258 if ((v = ((x >> 4)) & 0xfff) != 0) 259 bitvals[n++] = v - 1; 260 return n; 261 } 262 263 #if 0 264 static bool 265 bitmap_isset(const bitmap_t *bm, unsigned bit) 266 { 267 unsigned i, chunk_bit; 268 uint64_t bval, b; 269 bitmap_l1_t *bm1; 270 271 KASSERT(bit < PORTMAP_MAX_BITS); 272 i = bit >> PORTMAP_L0_SHIFT; 273 bval = bm->bits0[i]; 274 275 /* 276 * Empty check. Note: we can test the whole word against zero, 277 * since zero bit values in the packed array result in bits set. 278 */ 279 if (bval == 0) 280 return false; 281 282 /* Level 0 check. */ 283 chunk_bit = bit & PORTMAP_L0_MASK; 284 if ((bval & PORTMAP_L1_TAG) == 0) 285 return bitmap_word_isset(bval, chunk_bit); 286 287 /* Level 1 check. */ 288 bm1 = PORTMAP_L1_GET(bval); 289 KASSERT(bm1 != NULL); 290 i = chunk_bit >> PORTMAP_L1_SHIFT; 291 b = UINT64_C(1) << (chunk_bit & PORTMAP_L1_MASK); 292 return (bm1->bits1[i] & b) != 0; 293 } 294 #endif 295 296 static bool 297 bitmap_set(bitmap_t *bm, unsigned bit) 298 { 299 unsigned i, chunk_bit; 300 uint64_t bval, b, oval, nval; 301 bitmap_l1_t *bm1; 302 again: 303 KASSERT(bit < PORTMAP_MAX_BITS); 304 i = bit >> PORTMAP_L0_SHIFT; 305 chunk_bit = bit & PORTMAP_L0_MASK; 306 bval = bm->bits0[i]; // atomic fetch 307 308 if ((bval & PORTMAP_L1_TAG) == 0) { 309 unsigned n = 0, bitvals[5]; 310 uint64_t bm1p; 311 312 if (bitmap_word_isset(bval, chunk_bit)) { 313 return false; 314 } 315 316 /* 317 * Look for a zero-slot and put a value there. 318 */ 319 if ((nval = bitmap_word_cax(bval, -1, chunk_bit)) != 0) { 320 KASSERT((nval & PORTMAP_L1_TAG) == 0); 321 if (__npf_atomic_cas_64(&bm->bits0[i], bval, nval) != bval) { 322 goto again; 323 } 324 return true; 325 } 326 327 /* 328 * Full: allocate L1 block and copy over the current 329 * values into the level. 330 */ 331 bm1 = kmem_intr_zalloc(sizeof(bitmap_l1_t), KM_NOSLEEP); 332 if (bm1 == NULL) { 333 return false; // error 334 } 335 n = bitmap_word_unpack(bval, bitvals); 336 while (n--) { 337 const unsigned v = bitvals[n]; 338 const unsigned off = v >> PORTMAP_L1_SHIFT; 339 340 KASSERT(v <= PORTMAP_L0_MASK); 341 KASSERT(off < (sizeof(uint64_t) * CHAR_BIT)); 342 bm1->bits1[off] |= UINT64_C(1) << (v & PORTMAP_L1_MASK); 343 } 344 345 /* 346 * Attempt to set the L1 structure. Note: there is no 347 * ABA problem since the we compare the actual values. 348 * Note: CAS serves as a memory barrier. 349 */ 350 bm1p = (uintptr_t)bm1; 351 KASSERT((bm1p & PORTMAP_L1_TAG) == 0); 352 bm1p |= PORTMAP_L1_TAG; 353 if (__npf_atomic_cas_64(&bm->bits0[i], bval, bm1p) != bval) { 354 kmem_intr_free(bm1, sizeof(bitmap_l1_t)); 355 goto again; 356 } 357 bval = bm1p; 358 } 359 360 bm1 = PORTMAP_L1_GET(bval); 361 KASSERT(bm1 != NULL); 362 i = chunk_bit >> PORTMAP_L1_SHIFT; 363 b = UINT64_C(1) << (chunk_bit & PORTMAP_L1_MASK); 364 365 oval = bm1->bits1[i]; // atomic fetch 366 if (oval & b) { 367 return false; 368 } 369 nval = oval | b; 370 if (__npf_atomic_cas_64(&bm1->bits1[i], oval, nval) != oval) { 371 goto again; 372 } 373 return true; 374 } 375 376 static bool 377 bitmap_clr(bitmap_t *bm, unsigned bit) 378 { 379 unsigned i, chunk_bit; 380 uint64_t bval, b, oval, nval; 381 bitmap_l1_t *bm1; 382 again: 383 KASSERT(bit < PORTMAP_MAX_BITS); 384 i = bit >> PORTMAP_L0_SHIFT; 385 chunk_bit = bit & PORTMAP_L0_MASK; 386 bval = bm->bits0[i]; 387 388 if ((bval & PORTMAP_L1_TAG) == 0) { 389 if (!bitmap_word_isset(bval, chunk_bit)) { 390 return false; 391 } 392 nval = bitmap_word_cax(bval, chunk_bit, chunk_bit); 393 KASSERT((nval & PORTMAP_L1_TAG) == 0); 394 if (__npf_atomic_cas_64(&bm->bits0[i], bval, nval) != bval) { 395 goto again; 396 } 397 return true; 398 } 399 400 bm1 = PORTMAP_L1_GET(bval); 401 KASSERT(bm1 != NULL); 402 i = chunk_bit >> PORTMAP_L1_SHIFT; 403 b = UINT64_C(1) << (chunk_bit & PORTMAP_L1_MASK); 404 405 oval = bm1->bits1[i]; // atomic fetch 406 if ((oval & b) == 0) { 407 return false; 408 } 409 nval = oval & ~b; 410 if (__npf_atomic_cas_64(&bm1->bits1[i], oval, nval) != oval) { 411 goto again; 412 } 413 return true; 414 } 415 416 ///////////////////////////////////////////////////////////////////////// 417 418 static bitmap_t * 419 npf_portmap_autoget(npf_portmap_t *pm, unsigned alen, const npf_addr_t *addr) 420 { 421 bitmap_t *bm; 422 423 KASSERT(pm && pm->addr_map); 424 KASSERT(alen && alen <= sizeof(npf_addr_t)); 425 426 /* Lookup the port map for this address. */ 427 bm = thmap_get(pm->addr_map, addr, alen); 428 if (bm == NULL) { 429 void *ret; 430 431 /* 432 * Allocate a new port map for this address and 433 * attempt to insert it. 434 */ 435 bm = kmem_intr_zalloc(sizeof(bitmap_t), KM_NOSLEEP); 436 if (bm == NULL) { 437 return NULL; 438 } 439 memcpy(&bm->addr, addr, alen); 440 bm->addr_len = alen; 441 442 int s = splsoftnet(); 443 ret = thmap_put(pm->addr_map, &bm->addr, alen, bm); 444 splx(s); 445 446 if (ret == bm) { 447 /* Success: insert the bitmap into the list. */ 448 mutex_enter(&pm->list_lock); 449 LIST_INSERT_HEAD(&pm->bitmap_list, bm, entry); 450 mutex_exit(&pm->list_lock); 451 } else { 452 /* Race: use an existing bitmap. */ 453 kmem_free(bm, sizeof(bitmap_t)); 454 bm = ret; 455 } 456 } 457 return bm; 458 } 459 460 461 /* 462 * npf_portmap_flush: free all bitmaps and remove all addresses. 463 * 464 * => Concurrent calls to this routine are not allowed; therefore no 465 * need to acquire locks. 466 */ 467 void 468 npf_portmap_flush(npf_portmap_t *pm) 469 { 470 bitmap_t *bm; 471 472 while ((bm = LIST_FIRST(&pm->bitmap_list)) != NULL) { 473 for (unsigned i = 0; i < PORTMAP_L0_WORDS; i++) { 474 uintptr_t bm1 = bm->bits0[i]; 475 476 if (bm1 & PORTMAP_L1_TAG) { 477 bitmap_l1_t *bm1p = PORTMAP_L1_GET(bm1); 478 kmem_intr_free(bm1p, sizeof(bitmap_l1_t)); 479 } 480 bm->bits0[i] = UINT64_C(0); 481 } 482 LIST_REMOVE(bm, entry); 483 thmap_del(pm->addr_map, &bm->addr, bm->addr_len); 484 kmem_intr_free(bm, sizeof(bitmap_t)); 485 } 486 /* Note: the caller ensures there are no active references. */ 487 thmap_gc(pm->addr_map, thmap_stage_gc(pm->addr_map)); 488 } 489 490 /* 491 * npf_portmap_get: allocate and return a port from the given portmap. 492 * 493 * => Returns the port value in network byte-order. 494 * => Zero indicates a failure. 495 */ 496 in_port_t 497 npf_portmap_get(npf_portmap_t *pm, int alen, const npf_addr_t *addr) 498 { 499 const unsigned port_delta = pm->max_port - pm->min_port; 500 unsigned bit, target; 501 bitmap_t *bm; 502 503 bm = npf_portmap_autoget(pm, alen, addr); 504 if (bm == NULL) { 505 /* No memory. */ 506 return 0; 507 } 508 509 /* Randomly select a port. */ 510 target = pm->min_port + (cprng_fast32() % port_delta); 511 bit = target; 512 next: 513 if (bitmap_set(bm, bit)) { 514 /* Success. */ 515 return htons(bit); 516 } 517 bit = pm->min_port + ((bit + 1) % port_delta); 518 if (target != bit) { 519 /* Next.. */ 520 goto next; 521 } 522 /* No space. */ 523 return 0; 524 } 525 526 /* 527 * npf_portmap_take: allocate a specific port in the portmap. 528 */ 529 bool 530 npf_portmap_take(npf_portmap_t *pm, int alen, 531 const npf_addr_t *addr, in_port_t port) 532 { 533 bitmap_t *bm = npf_portmap_autoget(pm, alen, addr); 534 535 port = ntohs(port); 536 if (!bm || port < pm->min_port || port > pm->max_port) { 537 /* Out of memory / invalid port. */ 538 return false; 539 } 540 return bitmap_set(bm, port); 541 } 542 543 /* 544 * npf_portmap_put: release the port, making it available in the portmap. 545 * 546 * => The port value should be in network byte-order. 547 */ 548 void 549 npf_portmap_put(npf_portmap_t *pm, int alen, 550 const npf_addr_t *addr, in_port_t port) 551 { 552 bitmap_t *bm; 553 554 bm = npf_portmap_autoget(pm, alen, addr); 555 if (bm) { 556 port = ntohs(port); 557 bitmap_clr(bm, port); 558 } 559 } 560