/*-
 * Copyright (c) 2010-2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This material is based upon work partially supported by The
 * NetBSD Foundation under a contract with Mindaugas Rasiukevicius.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * NPF connection storage.
 *
 * Lock-free connection lookups are protected by EBR with an atomic
 * reference acquisition before exiting the critical path. The caller
 * is responsible for re-checking the connection state.
 *
 * Warning (not applicable for the userspace npfkern):
 *
 *	thmap is a partially lock-free data structure that uses its own
 *	spin-locks on the writer side (insert/delete operations).
 *
 *	The relevant interrupt priority level (IPL) must be set and the
 *	kernel preemption disabled across the critical paths to prevent
 *	deadlocks and priority inversion problems. These are essentially
 *	the same guarantees as a spinning mutex(9) would provide.
 *
 *	This is achieved with the SPL routines splsoftnet() and splx()
 *	around the thmap_del() and thmap_put() calls. Note: we assume
 *	that the network stack invokes NPF at IPL_SOFTNET or lower, but
 *	not higher.
 */

#ifdef _KERNEL
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: npf_conndb.c,v 1.9 2020/05/30 14:16:56 rmind Exp $");

#include <sys/param.h>
#include <sys/types.h>

#include <sys/atomic.h>
#include <sys/kmem.h>
#include <sys/thmap.h>
#endif

#define __NPF_CONN_PRIVATE
#include "npf_conn.h"
#include "npf_impl.h"

struct npf_conndb {
	thmap_t *		cd_map;

	/*
	 * There are three lists for connections: new, all and G/C.
	 *
	 * New connections are atomically inserted into the "new-list".
	 * The G/C worker will move them to the doubly-linked list of all
	 * active connections.
	 */
	npf_conn_t *		cd_new;
	LIST_HEAD(, npf_conn)	cd_list;
	LIST_HEAD(, npf_conn)	cd_gclist;

	/* The last inspected connection (for circular iteration). */
	npf_conn_t *		cd_marker;
};

typedef struct {
	int			step;
	int			interval_min;
	int			interval_max;
} npf_conndb_params_t;

/*
 * Pointer tag for connection keys which represent the "forwards" entry.
 */
#define	CONNDB_FORW_BIT		((uintptr_t)0x1)
#define	CONNDB_ISFORW_P(p)	(((uintptr_t)(p) & CONNDB_FORW_BIT) != 0)
#define	CONNDB_GET_PTR(p)	((void *)((uintptr_t)(p) & ~CONNDB_FORW_BIT))
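
/*
 * Illustration of the tagging scheme (hypothetical pointer values;
 * assumes kmem(9) returns at least word-aligned memory, so the lowest
 * pointer bit is always zero and is free to carry the tag):
 *
 *	con                   = 0x...b40
 *	forwards entry        = 0x...b41	(CONNDB_FORW_BIT set)
 *	backwards entry       = 0x...b40	(untagged)
 *	CONNDB_GET_PTR(entry) = 0x...b40	(either way)
 */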

void
npf_conndb_sysinit(npf_t *npf)
{
	npf_conndb_params_t *params = npf_param_allocgroup(npf,
	    NPF_PARAMS_CONNDB, sizeof(npf_conndb_params_t));
	npf_param_t param_map[] = {
		{
			"gc.step",
			&params->step,
			.default_val = 256,
			.min = 1, .max = INT_MAX
		},
		{
			"gc.interval_min",
			&params->interval_min,
			.default_val = 50, // ms
			.min = 10, .max = 10000
		},
		{
			"gc.interval_max",
			&params->interval_max,
			.default_val = 5000, // ms
			.min = 10, .max = 10000
		},
	};
	npf_param_register(npf, param_map, __arraycount(param_map));
}

void
npf_conndb_sysfini(npf_t *npf)
{
	const size_t len = sizeof(npf_conndb_params_t);
	npf_param_freegroup(npf, NPF_PARAMS_CONNDB, len);
}

npf_conndb_t *
npf_conndb_create(void)
{
	npf_conndb_t *cd;

	cd = kmem_zalloc(sizeof(npf_conndb_t), KM_SLEEP);
	cd->cd_map = thmap_create(0, NULL, THMAP_NOCOPY);
	KASSERT(cd->cd_map != NULL);

	LIST_INIT(&cd->cd_list);
	LIST_INIT(&cd->cd_gclist);
	return cd;
}

void
npf_conndb_destroy(npf_conndb_t *cd)
{
	KASSERT(cd->cd_new == NULL);
	KASSERT(cd->cd_marker == NULL);
	KASSERT(LIST_EMPTY(&cd->cd_list));
	KASSERT(LIST_EMPTY(&cd->cd_gclist));

	thmap_destroy(cd->cd_map);
	kmem_free(cd, sizeof(npf_conndb_t));
}

/*
 * npf_conndb_lookup: find a connection given the key.
 */
npf_conn_t *
npf_conndb_lookup(npf_t *npf, const npf_connkey_t *ck, npf_flow_t *flow)
{
	npf_conndb_t *cd = atomic_load_relaxed(&npf->conn_db);
	const unsigned keylen = NPF_CONNKEY_LEN(ck);
	npf_conn_t *con;
	void *val;

	/*
	 * Look up the connection key in the key-value map.
	 */
	int s = npf_config_read_enter(npf);
	val = thmap_get(cd->cd_map, ck->ck_key, keylen);
	if (!val) {
		npf_config_read_exit(npf, s);
		return NULL;
	}

	/*
	 * Determine whether this is the "forwards" or "backwards" key
	 * and clear the pointer tag.
	 */
	*flow = CONNDB_ISFORW_P(val) ? NPF_FLOW_FORW : NPF_FLOW_BACK;
	con = CONNDB_GET_PTR(val);
	KASSERT(con != NULL);

	/*
	 * Acquire a reference and return the connection.
	 */
	atomic_inc_uint(&con->c_refcnt);
	npf_config_read_exit(npf, s);
	return con;
}

/*
 * npf_conndb_insert: insert the key representing the connection.
 *
 * => Returns true on success and false on failure.
 */
bool
npf_conndb_insert(npf_conndb_t *cd, const npf_connkey_t *ck,
    npf_conn_t *con, npf_flow_t flow)
{
	const unsigned keylen = NPF_CONNKEY_LEN(ck);
	const uintptr_t tag = (CONNDB_FORW_BIT * !flow);
	void *val;
	bool ok;

	/*
	 * Tag the connection pointer if this is the "forwards" key
	 * (note: this relies on NPF_FLOW_FORW being zero).
	 */
	KASSERT(!CONNDB_ISFORW_P(con));
	val = (void *)((uintptr_t)(void *)con | tag);

	int s = splsoftnet();
	ok = thmap_put(cd->cd_map, ck->ck_key, keylen, val) == val;
	splx(s);

	return ok;
}
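
/*
 * Usage sketch (illustrative; the actual caller is npf_conn_establish()
 * in npf_conn.c and may differ in detail): a connection is registered
 * under both of its keys, with the flow direction selecting the pointer
 * tag.  The names fw_key and bk_key are placeholders:
 *
 *	if (!npf_conndb_insert(cd, fw_key, con, NPF_FLOW_FORW))
 *		goto err;
 *	if (!npf_conndb_insert(cd, bk_key, con, NPF_FLOW_BACK)) {
 *		npf_conndb_remove(cd, fw_key);
 *		goto err;
 *	}
 */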

/*
 * npf_conndb_remove: find and delete the connection key, returning the
 * connection it represents.
 */
npf_conn_t *
npf_conndb_remove(npf_conndb_t *cd, npf_connkey_t *ck)
{
	const unsigned keylen = NPF_CONNKEY_LEN(ck);
	void *val;

	int s = splsoftnet();
	val = thmap_del(cd->cd_map, ck->ck_key, keylen);
	splx(s);

	return CONNDB_GET_PTR(val);
}

/*
 * npf_conndb_enqueue: atomically insert the connection into the
 * singly-linked list of the "new" connections.
 */
void
npf_conndb_enqueue(npf_conndb_t *cd, npf_conn_t *con)
{
	npf_conn_t *head;

	do {
		head = atomic_load_relaxed(&cd->cd_new);
		atomic_store_relaxed(&con->c_next, head);
	} while (atomic_cas_ptr(&cd->cd_new, head, con) != head);
}

/*
 * npf_conndb_update: migrate all new connections to the list of all
 * connections; this must also be performed on npf_conndb_getlist()
 * to provide a complete list of connections.
 */
static void
npf_conndb_update(npf_conndb_t *cd)
{
	npf_conn_t *con;

	con = atomic_swap_ptr(&cd->cd_new, NULL);
	while (con) {
		/* Load c_next first: it shares a union with c_entry. */
		npf_conn_t *next = atomic_load_relaxed(&con->c_next);
		LIST_INSERT_HEAD(&cd->cd_list, con, c_entry);
		con = next;
	}
}

/*
 * npf_conndb_getlist: return the list of all connections.
 */
npf_conn_t *
npf_conndb_getlist(npf_conndb_t *cd)
{
	npf_conndb_update(cd);
	return LIST_FIRST(&cd->cd_list);
}

/*
 * npf_conndb_getnext: return the next connection, implementing
 * the circular iteration.
 */
npf_conn_t *
npf_conndb_getnext(npf_conndb_t *cd, npf_conn_t *con)
{
	/* Next, wrapping around to the beginning of the list. */
	if (con == NULL || (con = LIST_NEXT(con, c_entry)) == NULL) {
		con = LIST_FIRST(&cd->cd_list);
	}
	return con;
}

/*
 * npf_conndb_gc_incr: incremental G/C of the expired connections.
 */
static unsigned
npf_conndb_gc_incr(npf_t *npf, npf_conndb_t *cd, const time_t now)
{
	const npf_conndb_params_t *params = npf->params[NPF_PARAMS_CONNDB];
	unsigned target = params->step;
	unsigned gc_conns = 0;
	npf_conn_t *con;

	KASSERT(mutex_owned(&npf->conn_lock));

	/*
	 * Start from the "last" (marker) connection; we must
	 * initialise the marker if it is not set yet.
	 */
	if ((con = cd->cd_marker) == NULL) {
		con = npf_conndb_getnext(cd, NULL);
		cd->cd_marker = con;
	}

	/*
	 * Scan the connections:
	 * - Limit the scan to the G/C step size.
	 * - Stop if we have scanned all of them.
	 * - Update the marker connection.
	 */
	while (con && target--) {
		npf_conn_t *next = npf_conndb_getnext(cd, con);

		/*
		 * Can we G/C this connection?
		 */
		if (npf_conn_expired(npf, con, now)) {
			/* Yes: move it to the G/C list. */
			LIST_REMOVE(con, c_entry);
			LIST_INSERT_HEAD(&cd->cd_gclist, con, c_entry);
			npf_conn_remove(cd, con);
			gc_conns++;

			/* This connection cannot be a new marker anymore. */
			if (con == next) {
				next = NULL;
			}
			if (con == cd->cd_marker) {
				cd->cd_marker = next;
				con = next;
				continue;
			}
		}
		con = next;

		/*
		 * Circular iteration: if we have returned to the
		 * marker connection, then stop.
		 */
		if (con == cd->cd_marker) {
			break;
		}
	}
	cd->cd_marker = con;
	return gc_conns;
}
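
/*
 * Worked example (using the default parameter values registered in
 * npf_conndb_sysinit()): with gc.step = 256 and, say, 1000 active
 * connections, a single call inspects at most 256 connections starting
 * at the marker, so roughly four calls cover the whole list before the
 * circular iteration wraps around to the marker again.
 */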

/*
 * gc_freq_tune: G/C frequency self-tuning.
 *
 * If there is something to G/C, then exponentially increase the wake-up
 * frequency; otherwise, reduce the frequency.  Enforce the lower and
 * upper bounds.
 *
 * => Returns the number of milliseconds until the next G/C.
 */
static unsigned
gc_freq_tune(const npf_t *npf, const npf_conndb_t *cd, const unsigned n)
{
	const npf_conndb_params_t *params = npf->params[NPF_PARAMS_CONNDB];
	int wtime = npf->worker_wait_time;

	wtime = n ? (wtime >> 1) : (wtime << 1);
	return MAX(MIN(wtime, params->interval_max), params->interval_min);
}

/*
 * npf_conndb_gc: garbage collect the expired connections.
 *
 * => Must run in a single-threaded manner.
 * => If 'flush' is true, then destroy all connections.
 * => If 'sync' is true, then perform passive serialisation.
 */
void
npf_conndb_gc(npf_t *npf, npf_conndb_t *cd, bool flush, bool sync)
{
	struct timespec tsnow;
	unsigned gc_conns = 0;
	npf_conn_t *con;
	void *gcref;

	getnanouptime(&tsnow);

	/* First, migrate all new connections. */
	mutex_enter(&npf->conn_lock);
	npf_conndb_update(cd);
	if (flush) {
		/* Just unlink and move all connections to the G/C list. */
		while ((con = LIST_FIRST(&cd->cd_list)) != NULL) {
			LIST_REMOVE(con, c_entry);
			LIST_INSERT_HEAD(&cd->cd_gclist, con, c_entry);
			npf_conn_remove(cd, con);
		}
		cd->cd_marker = NULL;
	} else {
		/* Incremental G/C of the expired connections. */
		gc_conns = npf_conndb_gc_incr(npf, cd, tsnow.tv_sec);
	}
	mutex_exit(&npf->conn_lock);

	/*
	 * Ensure it is safe to destroy the connections.
	 * Note: the conn_lock must be dropped first (see the lock order).
	 */
	gcref = thmap_stage_gc(cd->cd_map);
	if (sync && (gcref || !LIST_EMPTY(&cd->cd_gclist))) {
		npf_config_enter(npf);
		npf_config_sync(npf);
		npf_config_exit(npf);
	}
	thmap_gc(cd->cd_map, gcref);

	/* Self-tune the G/C frequency. */
	npf->worker_wait_time = gc_freq_tune(npf, cd, gc_conns);

	if (LIST_EMPTY(&cd->cd_gclist)) {
		return;
	}

	/*
	 * Garbage collect all expired connections.
	 * May need to wait for the references to drain.
	 */
	while ((con = LIST_FIRST(&cd->cd_gclist)) != NULL) {
		/*
		 * Destroy only if removed and there are no references.
		 * Otherwise, just do it next time, unless we are
		 * destroying all connections.
		 */
		const unsigned refcnt = atomic_load_relaxed(&con->c_refcnt);

		if (__predict_false(refcnt)) {
			if (flush) {
				kpause("npfcongc", false, 1, NULL);
				continue;
			}
			break; // exit the loop
		}
		LIST_REMOVE(con, c_entry);
		npf_conn_destroy(npf, con);
	}
}
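
/*
 * Expected G/C driver (a sketch; the actual worker entry point lives
 * in npf_conn.c and may differ in detail): a periodic worker performs
 * an incremental pass and then sleeps for the self-tuned interval that
 * gc_freq_tune() stored in npf->worker_wait_time:
 *
 *	npf_conndb_gc(npf, atomic_load_relaxed(&npf->conn_db),
 *	    false, true);
 *	// sleep for npf->worker_wait_time milliseconds, then repeat
 */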