xref: /netbsd-src/sys/net/npf/npf_conndb.c (revision b899bfd96fd2cbaf2befc9ce4aaed9b9c230837a)
1 /*-
2  * Copyright (c) 2010-2020 The NetBSD Foundation, Inc.
3  * All rights reserved.
4  *
5  * This material is based upon work partially supported by The
6  * NetBSD Foundation under a contract with Mindaugas Rasiukevicius.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
18  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
19  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
20  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
21  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27  * POSSIBILITY OF SUCH DAMAGE.
28  */
29 
30 /*
31  * NPF connection storage.
32  *
33  * Lock-free connection lookups are protected by EBR with an atomic
34  * reference acquisition before exiting the critical path.  The caller
35  * is responsible for re-checking the connection state.
36  *
37  * Warning (not applicable for the userspace npfkern):
38  *
39  *	thmap is partially lock-free data structure that uses its own
40  *	spin-locks on the writer side (insert/delete operations).
41  *
42  *	The relevant interrupt priority level (IPL) must be set and the
43  *	kernel preemption disabled across the critical paths to prevent
44  *	deadlocks and priority inversion problems.  These are essentially
45  *	the same guarantees as a spinning mutex(9) would provide.
46  *
47  *	This is achieved with SPL routines splsoftnet() and splx() around
48  *	the thmap_del() and thmap_put() calls.  Note: we assume that the
49  *	network stack invokes NPF at IPL_SOFTNET or lower, but not higher.
50  */
51 
52 #ifdef _KERNEL
53 #include <sys/cdefs.h>
54 __KERNEL_RCSID(0, "$NetBSD: npf_conndb.c,v 1.9 2020/05/30 14:16:56 rmind Exp $");
55 
56 #include <sys/param.h>
57 #include <sys/types.h>
58 
59 #include <sys/atomic.h>
60 #include <sys/kmem.h>
61 #include <sys/thmap.h>
62 #endif
63 
64 #define __NPF_CONN_PRIVATE
65 #include "npf_conn.h"
66 #include "npf_impl.h"
67 
68 struct npf_conndb {
69 	thmap_t *		cd_map;
70 
71 	/*
72 	 * There are three lists for connections: new, all and G/C.
73 	 *
74 	 * New connections are atomically inserted into the "new-list".
75 	 * The G/C worker will move them to the doubly-linked list of all
76 	 * active connections.
77 	 */
78 	npf_conn_t *		cd_new;
79 	LIST_HEAD(, npf_conn)	cd_list;
80 	LIST_HEAD(, npf_conn)	cd_gclist;
81 
82 	/* The last inspected connection (for circular iteration). */
83 	npf_conn_t *		cd_marker;
84 };
85 
86 typedef struct {
87 	int		step;
88 	int		interval_min;
89 	int		interval_max;
90 } npf_conndb_params_t;
91 
92 /*
93  * Pointer tag for connection keys which represent the "forwards" entry.
94  */
95 #define	CONNDB_FORW_BIT		((uintptr_t)0x1)
96 #define	CONNDB_ISFORW_P(p)	(((uintptr_t)(p) & CONNDB_FORW_BIT) != 0)
97 #define	CONNDB_GET_PTR(p)	((void *)((uintptr_t)(p) & ~CONNDB_FORW_BIT))
98 
99 void
npf_conndb_sysinit(npf_t * npf)100 npf_conndb_sysinit(npf_t *npf)
101 {
102 	npf_conndb_params_t *params = npf_param_allocgroup(npf,
103 	    NPF_PARAMS_CONNDB, sizeof(npf_conndb_params_t));
104 	npf_param_t param_map[] = {
105 		{
106 			"gc.step",
107 			&params->step,
108 			.default_val = 256,
109 			.min = 1, .max = INT_MAX
110 		},
111 		{
112 			"gc.interval_min",
113 			&params->interval_min,
114 			.default_val = 50, // ms
115 			.min = 10, .max = 10000
116 		},
117 		{
118 			"gc.interval_max",
119 			&params->interval_max,
120 			.default_val = 5000, // ms
121 			.min = 10, .max = 10000
122 		},
123 	};
124 	npf_param_register(npf, param_map, __arraycount(param_map));
125 }
126 
127 void
npf_conndb_sysfini(npf_t * npf)128 npf_conndb_sysfini(npf_t *npf)
129 {
130 	const size_t len = sizeof(npf_conndb_params_t);
131 	npf_param_freegroup(npf, NPF_PARAMS_CONNDB, len);
132 }
133 
134 npf_conndb_t *
npf_conndb_create(void)135 npf_conndb_create(void)
136 {
137 	npf_conndb_t *cd;
138 
139 	cd = kmem_zalloc(sizeof(npf_conndb_t), KM_SLEEP);
140 	cd->cd_map = thmap_create(0, NULL, THMAP_NOCOPY);
141 	KASSERT(cd->cd_map != NULL);
142 
143 	LIST_INIT(&cd->cd_list);
144 	LIST_INIT(&cd->cd_gclist);
145 	return cd;
146 }
147 
148 void
npf_conndb_destroy(npf_conndb_t * cd)149 npf_conndb_destroy(npf_conndb_t *cd)
150 {
151 	KASSERT(cd->cd_new == NULL);
152 	KASSERT(cd->cd_marker == NULL);
153 	KASSERT(LIST_EMPTY(&cd->cd_list));
154 	KASSERT(LIST_EMPTY(&cd->cd_gclist));
155 
156 	thmap_destroy(cd->cd_map);
157 	kmem_free(cd, sizeof(npf_conndb_t));
158 }
159 
160 /*
161  * npf_conndb_lookup: find a connection given the key.
162  */
163 npf_conn_t *
npf_conndb_lookup(npf_t * npf,const npf_connkey_t * ck,npf_flow_t * flow)164 npf_conndb_lookup(npf_t *npf, const npf_connkey_t *ck, npf_flow_t *flow)
165 {
166 	npf_conndb_t *cd = atomic_load_relaxed(&npf->conn_db);
167 	const unsigned keylen = NPF_CONNKEY_LEN(ck);
168 	npf_conn_t *con;
169 	void *val;
170 
171 	/*
172 	 * Lookup the connection key in the key-value map.
173 	 */
174 	int s = npf_config_read_enter(npf);
175 	val = thmap_get(cd->cd_map, ck->ck_key, keylen);
176 	if (!val) {
177 		npf_config_read_exit(npf, s);
178 		return NULL;
179 	}
180 
181 	/*
182 	 * Determine whether this is the "forwards" or "backwards" key
183 	 * and clear the pointer tag.
184 	 */
185 	*flow = CONNDB_ISFORW_P(val) ? NPF_FLOW_FORW : NPF_FLOW_BACK;
186 	con = CONNDB_GET_PTR(val);
187 	KASSERT(con != NULL);
188 
189 	/*
190 	 * Acquire a reference and return the connection.
191 	 */
192 	atomic_inc_uint(&con->c_refcnt);
193 	npf_config_read_exit(npf, s);
194 	return con;
195 }
196 
197 /*
198  * npf_conndb_insert: insert the key representing the connection.
199  *
200  * => Returns true on success and false on failure.
201  */
202 bool
npf_conndb_insert(npf_conndb_t * cd,const npf_connkey_t * ck,npf_conn_t * con,npf_flow_t flow)203 npf_conndb_insert(npf_conndb_t *cd, const npf_connkey_t *ck,
204     npf_conn_t *con, npf_flow_t flow)
205 {
206 	const unsigned keylen = NPF_CONNKEY_LEN(ck);
207 	const uintptr_t tag = (CONNDB_FORW_BIT * !flow);
208 	void *val;
209 	bool ok;
210 
211 	/*
212 	 * Tag the connection pointer if this is the "forwards" key.
213 	 */
214 	KASSERT(!CONNDB_ISFORW_P(con));
215 	val = (void *)((uintptr_t)(void *)con | tag);
216 
217 	int s = splsoftnet();
218 	ok = thmap_put(cd->cd_map, ck->ck_key, keylen, val) == val;
219 	splx(s);
220 
221 	return ok;
222 }
223 
224 /*
225  * npf_conndb_remove: find and delete connection key, returning the
226  * connection it represents.
227  */
228 npf_conn_t *
npf_conndb_remove(npf_conndb_t * cd,npf_connkey_t * ck)229 npf_conndb_remove(npf_conndb_t *cd, npf_connkey_t *ck)
230 {
231 	const unsigned keylen = NPF_CONNKEY_LEN(ck);
232 	void *val;
233 
234 	int s = splsoftnet();
235 	val = thmap_del(cd->cd_map, ck->ck_key, keylen);
236 	splx(s);
237 
238 	return CONNDB_GET_PTR(val);
239 }
240 
241 /*
242  * npf_conndb_enqueue: atomically insert the connection into the
243  * singly-linked list of the "new" connections.
244  */
245 void
npf_conndb_enqueue(npf_conndb_t * cd,npf_conn_t * con)246 npf_conndb_enqueue(npf_conndb_t *cd, npf_conn_t *con)
247 {
248 	npf_conn_t *head;
249 
250 	do {
251 		head = atomic_load_relaxed(&cd->cd_new);
252 		atomic_store_relaxed(&con->c_next, head);
253 	} while (atomic_cas_ptr(&cd->cd_new, head, con) != head);
254 }
255 
256 /*
257  * npf_conndb_update: migrate all new connections to the list of all
258  * connections; this must also be performed on npf_conndb_getlist()
259  * to provide a complete list of connections.
260  */
261 static void
npf_conndb_update(npf_conndb_t * cd)262 npf_conndb_update(npf_conndb_t *cd)
263 {
264 	npf_conn_t *con;
265 
266 	con = atomic_swap_ptr(&cd->cd_new, NULL);
267 	while (con) {
268 		npf_conn_t *next = atomic_load_relaxed(&con->c_next); // union
269 		LIST_INSERT_HEAD(&cd->cd_list, con, c_entry);
270 		con = next;
271 	}
272 }
273 
274 /*
275  * npf_conndb_getlist: return the list of all connections.
276  */
277 npf_conn_t *
npf_conndb_getlist(npf_conndb_t * cd)278 npf_conndb_getlist(npf_conndb_t *cd)
279 {
280 	npf_conndb_update(cd);
281 	return LIST_FIRST(&cd->cd_list);
282 }
283 
284 /*
285  * npf_conndb_getnext: return the next connection, implementing
286  * the circular iteration.
287  */
288 npf_conn_t *
npf_conndb_getnext(npf_conndb_t * cd,npf_conn_t * con)289 npf_conndb_getnext(npf_conndb_t *cd, npf_conn_t *con)
290 {
291 	/* Next.. */
292 	if (con == NULL || (con = LIST_NEXT(con, c_entry)) == NULL) {
293 		con = LIST_FIRST(&cd->cd_list);
294 	}
295 	return con;
296 }
297 
298 /*
299  * npf_conndb_gc_incr: incremental G/C of the expired connections.
300  */
301 static unsigned
npf_conndb_gc_incr(npf_t * npf,npf_conndb_t * cd,const time_t now)302 npf_conndb_gc_incr(npf_t *npf, npf_conndb_t *cd, const time_t now)
303 {
304 	const npf_conndb_params_t *params = npf->params[NPF_PARAMS_CONNDB];
305 	unsigned target = params->step;
306 	unsigned gc_conns = 0;
307 	npf_conn_t *con;
308 
309 	KASSERT(mutex_owned(&npf->conn_lock));
310 
311 	/*
312 	 * Second, start from the "last" (marker) connection.
313 	 * We must initialise the marker if it is not set yet.
314 	 */
315 	if ((con = cd->cd_marker) == NULL) {
316 		con = npf_conndb_getnext(cd, NULL);
317 		cd->cd_marker = con;
318 	}
319 
320 	/*
321 	 * Scan the connections:
322 	 * - Limit the scan to the G/C step size.
323 	 * - Stop if we scanned all of them.
324 	 * - Update the marker connection.
325 	 */
326 	while (con && target--) {
327 		npf_conn_t *next = npf_conndb_getnext(cd, con);
328 
329 		/*
330 		 * Can we G/C this connection?
331 		 */
332 		if (npf_conn_expired(npf, con, now)) {
333 			/* Yes: move to the G/C list. */
334 			LIST_REMOVE(con, c_entry);
335 			LIST_INSERT_HEAD(&cd->cd_gclist, con, c_entry);
336 			npf_conn_remove(cd, con);
337 			gc_conns++;
338 
339 			/* This connection cannot be a new marker anymore. */
340 			if (con == next) {
341 				next = NULL;
342 			}
343 			if (con == cd->cd_marker) {
344 				cd->cd_marker = next;
345 				con = next;
346 				continue;
347 			}
348 		}
349 		con = next;
350 
351 		/*
352 		 * Circular iteration: if we returned back to the
353 		 * marker connection, then stop.
354 		 */
355 		if (con == cd->cd_marker) {
356 			break;
357 		}
358 	}
359 	cd->cd_marker = con;
360 	return gc_conns;
361 }
362 
363 /*
364  * gc_freq_tune: G/C frequency self-tuning.
365  *
366  * If there is something to G/C, then exponentially increase the wake
367  * up frequency.  Otherwise, reduce the frequency.  Enforce the lower
368  * and upper bounds.
369  *
370  * => Returns the number milliseconds until next G/C.
371  */
372 static unsigned
gc_freq_tune(const npf_t * npf,const npf_conndb_t * cd,const unsigned n)373 gc_freq_tune(const npf_t *npf, const npf_conndb_t *cd, const unsigned n)
374 {
375 	const npf_conndb_params_t *params = npf->params[NPF_PARAMS_CONNDB];
376 	int wtime = npf->worker_wait_time;
377 	wtime = n ? (wtime >> 1) : (wtime << 1);
378 	return MAX(MIN(wtime, params->interval_max), params->interval_min);
379 }
380 
381 /*
382  * npf_conndb_gc: garbage collect the expired connections.
383  *
384  * => Must run in a single-threaded manner.
385  * => If 'flush' is true, then destroy all connections.
386  * => If 'sync' is true, then perform passive serialisation.
387  */
388 void
npf_conndb_gc(npf_t * npf,npf_conndb_t * cd,bool flush,bool sync)389 npf_conndb_gc(npf_t *npf, npf_conndb_t *cd, bool flush, bool sync)
390 {
391 	struct timespec tsnow;
392 	unsigned gc_conns = 0;
393 	npf_conn_t *con;
394 	void *gcref;
395 
396 	getnanouptime(&tsnow);
397 
398 	/* First, migrate all new connections. */
399 	mutex_enter(&npf->conn_lock);
400 	npf_conndb_update(cd);
401 	if (flush) {
402 		/* Just unlink and move all connections to the G/C list. */
403 		while ((con = LIST_FIRST(&cd->cd_list)) != NULL) {
404 			LIST_REMOVE(con, c_entry);
405 			LIST_INSERT_HEAD(&cd->cd_gclist, con, c_entry);
406 			npf_conn_remove(cd, con);
407 		}
408 		cd->cd_marker = NULL;
409 	} else {
410 		/* Incremental G/C of the expired connections. */
411 		gc_conns = npf_conndb_gc_incr(npf, cd, tsnow.tv_sec);
412 	}
413 	mutex_exit(&npf->conn_lock);
414 
415 	/*
416 	 * Ensure it is safe to destroy the connections.
417 	 * Note: drop the conn_lock (see the lock order).
418 	 */
419 	gcref = thmap_stage_gc(cd->cd_map);
420 	if (sync && (gcref || !LIST_EMPTY(&cd->cd_gclist))) {
421 		npf_config_enter(npf);
422 		npf_config_sync(npf);
423 		npf_config_exit(npf);
424 	}
425 	thmap_gc(cd->cd_map, gcref);
426 
427 	/* Self-tune the G/C frequency. */
428 	npf->worker_wait_time = gc_freq_tune(npf, cd, gc_conns);
429 
430 	if (LIST_EMPTY(&cd->cd_gclist)) {
431 		return;
432 	}
433 
434 	/*
435 	 * Garbage collect all expired connections.
436 	 * May need to wait for the references to drain.
437 	 */
438 	while ((con = LIST_FIRST(&cd->cd_gclist)) != NULL) {
439 		/*
440 		 * Destroy only if removed and no references.  Otherwise,
441 		 * just do it next time, unless we are destroying all.
442 		 */
443 		const unsigned refcnt = atomic_load_relaxed(&con->c_refcnt);
444 
445 		if (__predict_false(refcnt)) {
446 			if (flush) {
447 				kpause("npfcongc", false, 1, NULL);
448 				continue;
449 			}
450 			break; // exit the loop
451 		}
452 		LIST_REMOVE(con, c_entry);
453 		npf_conn_destroy(npf, con);
454 	}
455 }
456