xref: /netbsd-src/external/mpl/dhcp/bind/dist/lib/isc/netmgr/netmgr.c (revision 4afad4b7fa6d4a0d3dedf41d1587a7250710ae54)
1 /*	$NetBSD: netmgr.c,v 1.1 2024/02/18 20:57:55 christos Exp $	*/
2 
3 /*
4  * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
5  *
6  * SPDX-License-Identifier: MPL-2.0
7  *
8  * This Source Code Form is subject to the terms of the Mozilla Public
9  * License, v. 2.0. If a copy of the MPL was not distributed with this
10  * file, you can obtain one at https://mozilla.org/MPL/2.0/.
11  *
12  * See the COPYRIGHT file distributed with this work for additional
13  * information regarding copyright ownership.
14  */
15 
16 #include <inttypes.h>
17 #include <unistd.h>
18 #include <uv.h>
19 #ifdef HAVE_LIBCTRACE
20 #include <execinfo.h>
21 #endif /* ifdef HAVE_LIBCTRACE */
22 
23 #include <isc/atomic.h>
24 #include <isc/barrier.h>
25 #include <isc/buffer.h>
26 #include <isc/condition.h>
27 #include <isc/errno.h>
28 #include <isc/list.h>
29 #include <isc/log.h>
30 #include <isc/magic.h>
31 #include <isc/mem.h>
32 #include <isc/netmgr.h>
33 #include <isc/print.h>
34 #include <isc/quota.h>
35 #include <isc/random.h>
36 #include <isc/refcount.h>
37 #include <isc/region.h>
38 #include <isc/result.h>
39 #include <isc/sockaddr.h>
40 #include <isc/stats.h>
41 #include <isc/strerr.h>
42 #include <isc/task.h>
43 #include <isc/thread.h>
44 #include <isc/util.h>
45 
46 #include "netmgr-int.h"
47 #include "netmgr_p.h"
48 #include "openssl_shim.h"
49 #include "trampoline_p.h"
50 #include "uv-compat.h"
51 
52 /*%
53  * How many isc_nmhandles and isc_nm_uvreqs will be cached
54  * for reuse in each socket.
55  */
56 #define ISC_NM_HANDLES_STACK_SIZE 600
57 #define ISC_NM_REQS_STACK_SIZE	  600
58 
59 /*%
60  * Shortcut index arrays to get access to statistics counters.
61  */
62 
63 static const isc_statscounter_t udp4statsindex[] = {
64 	isc_sockstatscounter_udp4open,
65 	isc_sockstatscounter_udp4openfail,
66 	isc_sockstatscounter_udp4close,
67 	isc_sockstatscounter_udp4bindfail,
68 	isc_sockstatscounter_udp4connectfail,
69 	isc_sockstatscounter_udp4connect,
70 	-1,
71 	-1,
72 	isc_sockstatscounter_udp4sendfail,
73 	isc_sockstatscounter_udp4recvfail,
74 	isc_sockstatscounter_udp4active
75 };
76 
77 static const isc_statscounter_t udp6statsindex[] = {
78 	isc_sockstatscounter_udp6open,
79 	isc_sockstatscounter_udp6openfail,
80 	isc_sockstatscounter_udp6close,
81 	isc_sockstatscounter_udp6bindfail,
82 	isc_sockstatscounter_udp6connectfail,
83 	isc_sockstatscounter_udp6connect,
84 	-1,
85 	-1,
86 	isc_sockstatscounter_udp6sendfail,
87 	isc_sockstatscounter_udp6recvfail,
88 	isc_sockstatscounter_udp6active
89 };
90 
91 static const isc_statscounter_t tcp4statsindex[] = {
92 	isc_sockstatscounter_tcp4open,	      isc_sockstatscounter_tcp4openfail,
93 	isc_sockstatscounter_tcp4close,	      isc_sockstatscounter_tcp4bindfail,
94 	isc_sockstatscounter_tcp4connectfail, isc_sockstatscounter_tcp4connect,
95 	isc_sockstatscounter_tcp4acceptfail,  isc_sockstatscounter_tcp4accept,
96 	isc_sockstatscounter_tcp4sendfail,    isc_sockstatscounter_tcp4recvfail,
97 	isc_sockstatscounter_tcp4active
98 };
99 
100 static const isc_statscounter_t tcp6statsindex[] = {
101 	isc_sockstatscounter_tcp6open,	      isc_sockstatscounter_tcp6openfail,
102 	isc_sockstatscounter_tcp6close,	      isc_sockstatscounter_tcp6bindfail,
103 	isc_sockstatscounter_tcp6connectfail, isc_sockstatscounter_tcp6connect,
104 	isc_sockstatscounter_tcp6acceptfail,  isc_sockstatscounter_tcp6accept,
105 	isc_sockstatscounter_tcp6sendfail,    isc_sockstatscounter_tcp6recvfail,
106 	isc_sockstatscounter_tcp6active
107 };
108 
109 #if 0
110 /* XXX: not currently used */
111 static const isc_statscounter_t unixstatsindex[] = {
112 	isc_sockstatscounter_unixopen,
113 	isc_sockstatscounter_unixopenfail,
114 	isc_sockstatscounter_unixclose,
115 	isc_sockstatscounter_unixbindfail,
116 	isc_sockstatscounter_unixconnectfail,
117 	isc_sockstatscounter_unixconnect,
118 	isc_sockstatscounter_unixacceptfail,
119 	isc_sockstatscounter_unixaccept,
120 	isc_sockstatscounter_unixsendfail,
121 	isc_sockstatscounter_unixrecvfail,
122 	isc_sockstatscounter_unixactive
123 };
124 #endif /* if 0 */
125 
126 /*
127  * libuv is not thread safe, but has mechanisms to pass messages
128  * between threads. Each socket is owned by a thread. For UDP
129  * sockets we have a set of sockets for each interface and we can
130  * choose a sibling and send the message directly. For TCP, or if
131  * we're calling from a non-networking thread, we need to pass the
132  * request using async_cb.
133  */
134 
135 #if defined(HAVE_THREAD_LOCAL)
136 #include <threads.h>
137 static thread_local int isc__nm_tid_v = ISC_NETMGR_TID_UNKNOWN;
138 #elif defined(HAVE___THREAD)
139 static __thread int isc__nm_tid_v = ISC_NETMGR_TID_UNKNOWN;
140 #elif HAVE___DECLSPEC_THREAD
141 __declspec(thread) int isc__nm_tid_v = ISC_NETMGR_TID_UNKNOWN;
142 #endif /* if defined(HAVE_THREAD_LOCAL) */
143 
144 static void
145 nmsocket_maybe_destroy(isc_nmsocket_t *sock FLARG);
146 static void
147 nmhandle_free(isc_nmsocket_t *sock, isc_nmhandle_t *handle);
148 static isc_threadresult_t
149 nm_thread(isc_threadarg_t worker0);
150 static void
151 async_cb(uv_async_t *handle);
152 
153 static bool
154 process_netievent(isc__networker_t *worker, isc__netievent_t *ievent);
155 static isc_result_t
156 process_queue(isc__networker_t *worker, netievent_type_t type);
157 static void
158 wait_for_priority_queue(isc__networker_t *worker);
159 static void
160 drain_queue(isc__networker_t *worker, netievent_type_t type);
161 
162 static void
163 isc__nm_async_stop(isc__networker_t *worker, isc__netievent_t *ev0);
164 static void
165 isc__nm_async_pause(isc__networker_t *worker, isc__netievent_t *ev0);
166 static void
167 isc__nm_async_resume(isc__networker_t *worker, isc__netievent_t *ev0);
168 static void
169 isc__nm_async_detach(isc__networker_t *worker, isc__netievent_t *ev0);
170 static void
171 isc__nm_async_close(isc__networker_t *worker, isc__netievent_t *ev0);
172 
173 static void
174 isc__nm_threadpool_initialize(uint32_t workers);
175 static void
176 isc__nm_work_cb(uv_work_t *req);
177 static void
178 isc__nm_after_work_cb(uv_work_t *req, int status);
179 
180 void
181 isc__nmsocket_reset(isc_nmsocket_t *sock);
182 
183 /*%<
184  * Issue a 'handle closed' callback on the socket.
185  */
186 
187 static void
188 nmhandle_detach_cb(isc_nmhandle_t **handlep FLARG);
189 
190 int
191 isc_nm_tid(void) {
192 	return (isc__nm_tid_v);
193 }
194 
195 bool
196 isc__nm_in_netthread(void) {
197 	return (isc__nm_tid_v >= 0);
198 }
199 
200 #ifdef WIN32
201 static void
202 isc__nm_winsock_initialize(void) {
203 	WORD wVersionRequested = MAKEWORD(2, 2);
204 	WSADATA wsaData;
205 	int result;
206 
207 	result = WSAStartup(wVersionRequested, &wsaData);
208 	if (result != 0) {
209 		char strbuf[ISC_STRERRORSIZE];
210 		strerror_r(result, strbuf, sizeof(strbuf));
211 		UNEXPECTED_ERROR(__FILE__, __LINE__,
212 				 "WSAStartup() failed with error code %d: %s",
213 				 result, strbuf);
214 	}
215 
216 	/*
217 	 * Confirm that the WinSock DLL supports version 2.2.  Note that if the
218 	 * DLL supports versions greater than 2.2 in addition to 2.2, it will
219 	 * still return 2.2 in wVersion since that is the version we requested.
220 	 */
221 	if (LOBYTE(wsaData.wVersion) != 2 || HIBYTE(wsaData.wVersion) != 2) {
222 		UNEXPECTED_ERROR(__FILE__, __LINE__,
223 				 "Unusable WinSock DLL version: %u.%u",
224 				 LOBYTE(wsaData.wVersion),
225 				 HIBYTE(wsaData.wVersion));
226 	}
227 }
228 
229 static void
230 isc__nm_winsock_destroy(void) {
231 	WSACleanup();
232 }
233 #endif /* WIN32 */
234 
235 static void
236 isc__nm_threadpool_initialize(uint32_t nworkers) {
237 	char buf[11];
238 	int r = uv_os_getenv("UV_THREADPOOL_SIZE", buf,
239 			     &(size_t){ sizeof(buf) });
240 	if (r == UV_ENOENT) {
241 		snprintf(buf, sizeof(buf), "%" PRIu32, nworkers);
242 		uv_os_setenv("UV_THREADPOOL_SIZE", buf);
243 	}
244 }
245 
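/*
 * Pin the range of acceptable runtime libuv versions to match the
 * UDP recvmmsg/mmsg-free features detected at compile time;
 * isc__netmgr_create() fails fatally if uv_version() falls outside
 * this range.
 */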
246 #if HAVE_DECL_UV_UDP_MMSG_FREE
247 #define MINIMAL_UV_VERSION UV_VERSION(1, 40, 0)
248 #elif HAVE_DECL_UV_UDP_RECVMMSG
249 #define MAXIMAL_UV_VERSION UV_VERSION(1, 39, 99)
250 #define MINIMAL_UV_VERSION UV_VERSION(1, 37, 0)
251 #elif _WIN32
252 #define MINIMAL_UV_VERSION UV_VERSION(1, 0, 0)
253 #else
254 #define MAXIMAL_UV_VERSION UV_VERSION(1, 34, 99)
255 #define MINIMAL_UV_VERSION UV_VERSION(1, 0, 0)
256 #endif
257 
258 void
259 isc__netmgr_create(isc_mem_t *mctx, uint32_t nworkers, isc_nm_t **netmgrp) {
260 	isc_nm_t *mgr = NULL;
261 	char name[32];
262 
263 	REQUIRE(nworkers > 0);
264 
265 #ifdef MAXIMAL_UV_VERSION
266 	if (uv_version() > MAXIMAL_UV_VERSION) {
267 		isc_error_fatal(__FILE__, __LINE__,
268 				"libuv version too new: running with libuv %s "
269 				"when compiled with libuv %s will lead to "
270 				"libuv failures",
271 				uv_version_string(), UV_VERSION_STRING);
272 	}
273 #endif /* MAXIMAL_UV_VERSION */
274 
275 	if (uv_version() < MINIMAL_UV_VERSION) {
276 		isc_error_fatal(__FILE__, __LINE__,
277 				"libuv version too old: running with libuv %s "
278 				"when compiled with libuv %s will lead to "
279 				"libuv failures",
280 				uv_version_string(), UV_VERSION_STRING);
281 	}
282 
283 #ifdef WIN32
284 	isc__nm_winsock_initialize();
285 #endif /* WIN32 */
286 
287 	isc__nm_threadpool_initialize(nworkers);
288 
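	/*
	 * Note: twice as many worker loops as requested are created; the
	 * first 'nworkers' of them serve as listeners and the remainder
	 * handle slow (task) work, as selected in isc_nm_task_enqueue().
	 */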
289 	mgr = isc_mem_get(mctx, sizeof(*mgr));
290 	*mgr = (isc_nm_t){
291 		.nworkers = nworkers * 2,
292 		.nlisteners = nworkers,
293 	};
294 
295 	isc_mem_attach(mctx, &mgr->mctx);
296 	isc_mutex_init(&mgr->lock);
297 	isc_condition_init(&mgr->wkstatecond);
298 	isc_condition_init(&mgr->wkpausecond);
299 	isc_refcount_init(&mgr->references, 1);
300 	atomic_init(&mgr->maxudp, 0);
301 	atomic_init(&mgr->interlocked, ISC_NETMGR_NON_INTERLOCKED);
302 	atomic_init(&mgr->workers_paused, 0);
303 	atomic_init(&mgr->paused, false);
304 	atomic_init(&mgr->closing, false);
305 #if HAVE_SO_REUSEPORT_LB
306 	mgr->load_balance_sockets = true;
307 #else
308 	mgr->load_balance_sockets = false;
309 #endif
310 
311 #ifdef NETMGR_TRACE
312 	ISC_LIST_INIT(mgr->active_sockets);
313 #endif
314 
315 	/*
316 	 * Default TCP timeout values.
317 	 * May be updated by isc_nm_tcptimeouts().
318 	 */
319 	atomic_init(&mgr->init, 30000);
320 	atomic_init(&mgr->idle, 30000);
321 	atomic_init(&mgr->keepalive, 30000);
322 	atomic_init(&mgr->advertised, 30000);
323 
324 	isc_barrier_init(&mgr->pausing, mgr->nworkers);
325 	isc_barrier_init(&mgr->resuming, mgr->nworkers);
326 
327 	mgr->workers = isc_mem_get(mctx,
328 				   mgr->nworkers * sizeof(isc__networker_t));
329 	for (int i = 0; i < mgr->nworkers; i++) {
330 		isc__networker_t *worker = &mgr->workers[i];
331 		int r;
332 
333 		*worker = (isc__networker_t){
334 			.mgr = mgr,
335 			.id = i,
336 		};
337 
338 		r = uv_loop_init(&worker->loop);
339 		UV_RUNTIME_CHECK(uv_loop_init, r);
340 
341 		worker->loop.data = &mgr->workers[i];
342 
343 		r = uv_async_init(&worker->loop, &worker->async, async_cb);
344 		UV_RUNTIME_CHECK(uv_async_init, r);
345 
346 		for (size_t type = 0; type < NETIEVENT_MAX; type++) {
347 			isc_mutex_init(&worker->ievents[type].lock);
348 			isc_condition_init(&worker->ievents[type].cond);
349 			ISC_LIST_INIT(worker->ievents[type].list);
350 		}
351 
352 		worker->recvbuf = isc_mem_get(mctx, ISC_NETMGR_RECVBUF_SIZE);
353 		worker->sendbuf = isc_mem_get(mctx, ISC_NETMGR_SENDBUF_SIZE);
354 
355 		/*
356 		 * We need to do this here and not in nm_thread to avoid a
357 		 * race - we could exit isc__netmgr_create(), launch nm_destroy(),
358 		 * and nm_thread would still not be up.
359 		 */
360 		mgr->workers_running++;
361 		isc_thread_create(nm_thread, &mgr->workers[i], &worker->thread);
362 
363 		snprintf(name, sizeof(name), "net-%d", i);
364 		isc_thread_setname(worker->thread, name);
365 	}
366 
367 	mgr->magic = NM_MAGIC;
368 	*netmgrp = mgr;
369 }
370 
371 /*
372  * Free the resources of the network manager.
373  */
374 static void
375 nm_destroy(isc_nm_t **mgr0) {
376 	REQUIRE(VALID_NM(*mgr0));
377 	REQUIRE(!isc__nm_in_netthread());
378 
379 	isc_nm_t *mgr = *mgr0;
380 	*mgr0 = NULL;
381 
382 	isc_refcount_destroy(&mgr->references);
383 
384 	mgr->magic = 0;
385 
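	/*
	 * Send a stop netievent to every worker loop, then wait for all of
	 * the worker threads to finish before tearing down their resources.
	 */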
386 	for (int i = 0; i < mgr->nworkers; i++) {
387 		isc__networker_t *worker = &mgr->workers[i];
388 		isc__netievent_t *event = isc__nm_get_netievent_stop(mgr);
389 		isc__nm_enqueue_ievent(worker, event);
390 	}
391 
392 	LOCK(&mgr->lock);
393 	while (mgr->workers_running > 0) {
394 		WAIT(&mgr->wkstatecond, &mgr->lock);
395 	}
396 	UNLOCK(&mgr->lock);
397 
398 	for (int i = 0; i < mgr->nworkers; i++) {
399 		isc__networker_t *worker = &mgr->workers[i];
400 		int r;
401 
402 		r = uv_loop_close(&worker->loop);
403 		UV_RUNTIME_CHECK(uv_loop_close, r);
404 
405 		for (size_t type = 0; type < NETIEVENT_MAX; type++) {
406 			INSIST(ISC_LIST_EMPTY(worker->ievents[type].list));
407 			isc_condition_destroy(&worker->ievents[type].cond);
408 			isc_mutex_destroy(&worker->ievents[type].lock);
409 		}
410 
411 		isc_mem_put(mgr->mctx, worker->sendbuf,
412 			    ISC_NETMGR_SENDBUF_SIZE);
413 		isc_mem_put(mgr->mctx, worker->recvbuf,
414 			    ISC_NETMGR_RECVBUF_SIZE);
415 		isc_thread_join(worker->thread, NULL);
416 	}
417 
418 	if (mgr->stats != NULL) {
419 		isc_stats_detach(&mgr->stats);
420 	}
421 
422 	isc_barrier_destroy(&mgr->resuming);
423 	isc_barrier_destroy(&mgr->pausing);
424 
425 	isc_condition_destroy(&mgr->wkstatecond);
426 	isc_condition_destroy(&mgr->wkpausecond);
427 	isc_mutex_destroy(&mgr->lock);
428 
429 	isc_mem_put(mgr->mctx, mgr->workers,
430 		    mgr->nworkers * sizeof(isc__networker_t));
431 	isc_mem_putanddetach(&mgr->mctx, mgr, sizeof(*mgr));
432 
433 #ifdef WIN32
434 	isc__nm_winsock_destroy();
435 #endif /* WIN32 */
436 }
437 
438 static void
439 enqueue_pause(isc__networker_t *worker) {
440 	isc__netievent_pause_t *event =
441 		isc__nm_get_netievent_pause(worker->mgr);
442 	isc__nm_enqueue_ievent(worker, (isc__netievent_t *)event);
443 }
444 
445 static void
446 isc__nm_async_pause(isc__networker_t *worker, isc__netievent_t *ev0) {
447 	UNUSED(ev0);
448 	REQUIRE(worker->paused == false);
449 
450 	worker->paused = true;
451 	uv_stop(&worker->loop);
452 }
453 
454 void
455 isc_nm_pause(isc_nm_t *mgr) {
456 	REQUIRE(VALID_NM(mgr));
457 	REQUIRE(!atomic_load(&mgr->paused));
458 
459 	isc__nm_acquire_interlocked_force(mgr);
460 
461 	if (isc__nm_in_netthread()) {
462 		REQUIRE(isc_nm_tid() == 0);
463 	}
464 
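	/*
	 * Each worker either pauses itself directly (when called from its
	 * own netthread) or is sent a pause netievent; below we wait on
	 * wkstatecond until workers_paused reaches workers_running.
	 */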
465 	for (int i = 0; i < mgr->nworkers; i++) {
466 		isc__networker_t *worker = &mgr->workers[i];
467 		if (i == isc_nm_tid()) {
468 			isc__nm_async_pause(worker, NULL);
469 		} else {
470 			enqueue_pause(worker);
471 		}
472 	}
473 
474 	if (isc__nm_in_netthread()) {
475 		atomic_fetch_add(&mgr->workers_paused, 1);
476 		isc_barrier_wait(&mgr->pausing);
477 	}
478 
479 	LOCK(&mgr->lock);
480 	while (atomic_load(&mgr->workers_paused) != mgr->workers_running) {
481 		WAIT(&mgr->wkstatecond, &mgr->lock);
482 	}
483 	UNLOCK(&mgr->lock);
484 
485 	REQUIRE(atomic_compare_exchange_strong(&mgr->paused, &(bool){ false },
486 					       true));
487 }
488 
489 static void
490 enqueue_resume(isc__networker_t *worker) {
491 	isc__netievent_resume_t *event =
492 		isc__nm_get_netievent_resume(worker->mgr);
493 	isc__nm_enqueue_ievent(worker, (isc__netievent_t *)event);
494 }
495 
496 static void
497 isc__nm_async_resume(isc__networker_t *worker, isc__netievent_t *ev0) {
498 	UNUSED(ev0);
499 	REQUIRE(worker->paused == true);
500 
501 	worker->paused = false;
502 }
503 
504 void
505 isc_nm_resume(isc_nm_t *mgr) {
506 	REQUIRE(VALID_NM(mgr));
507 	REQUIRE(atomic_load(&mgr->paused));
508 
509 	if (isc__nm_in_netthread()) {
510 		REQUIRE(isc_nm_tid() == 0);
511 		drain_queue(&mgr->workers[isc_nm_tid()], NETIEVENT_PRIORITY);
512 	}
513 
514 	for (int i = 0; i < mgr->nworkers; i++) {
515 		isc__networker_t *worker = &mgr->workers[i];
516 		if (i == isc_nm_tid()) {
517 			isc__nm_async_resume(worker, NULL);
518 		} else {
519 			enqueue_resume(worker);
520 		}
521 	}
522 
523 	if (isc__nm_in_netthread()) {
524 		drain_queue(&mgr->workers[isc_nm_tid()], NETIEVENT_PRIVILEGED);
525 
526 		atomic_fetch_sub(&mgr->workers_paused, 1);
527 		isc_barrier_wait(&mgr->resuming);
528 	}
529 
530 	LOCK(&mgr->lock);
531 	while (atomic_load(&mgr->workers_paused) != 0) {
532 		WAIT(&mgr->wkstatecond, &mgr->lock);
533 	}
534 	UNLOCK(&mgr->lock);
535 
536 	REQUIRE(atomic_compare_exchange_strong(&mgr->paused, &(bool){ true },
537 					       false));
538 
539 	isc__nm_drop_interlocked(mgr);
540 }
541 
542 void
543 isc_nm_attach(isc_nm_t *mgr, isc_nm_t **dst) {
544 	REQUIRE(VALID_NM(mgr));
545 	REQUIRE(dst != NULL && *dst == NULL);
546 
547 	isc_refcount_increment(&mgr->references);
548 
549 	*dst = mgr;
550 }
551 
552 void
553 isc_nm_detach(isc_nm_t **mgr0) {
554 	isc_nm_t *mgr = NULL;
555 
556 	REQUIRE(mgr0 != NULL);
557 	REQUIRE(VALID_NM(*mgr0));
558 
559 	mgr = *mgr0;
560 	*mgr0 = NULL;
561 
562 	if (isc_refcount_decrement(&mgr->references) == 1) {
563 		nm_destroy(&mgr);
564 	}
565 }
566 
567 void
568 isc__netmgr_shutdown(isc_nm_t *mgr) {
569 	REQUIRE(VALID_NM(mgr));
570 
571 	atomic_store(&mgr->closing, true);
572 	for (int i = 0; i < mgr->nworkers; i++) {
573 		isc__netievent_t *event = NULL;
574 		event = isc__nm_get_netievent_shutdown(mgr);
575 		isc__nm_enqueue_ievent(&mgr->workers[i], event);
576 	}
577 }
578 
579 void
580 isc__netmgr_destroy(isc_nm_t **netmgrp) {
581 	isc_nm_t *mgr = NULL;
582 	int counter = 0;
583 
584 	REQUIRE(VALID_NM(*netmgrp));
585 
586 	mgr = *netmgrp;
587 
588 	/*
589 	 * Close active connections.
590 	 */
591 	isc__netmgr_shutdown(mgr);
592 
593 	/*
594 	 * Wait for the manager to be dereferenced elsewhere.
595 	 */
596 	while (isc_refcount_current(&mgr->references) > 1 && counter++ < 1000) {
597 		uv_sleep(10);
598 	}
599 
600 #ifdef NETMGR_TRACE
601 	if (isc_refcount_current(&mgr->references) > 1) {
602 		isc__nm_dump_active(mgr);
603 		UNREACHABLE();
604 	}
605 #endif
606 
607 	/*
608 	 * Now just patiently wait for the remaining references to go away.
609 	 */
610 	while (isc_refcount_current(&mgr->references) > 1) {
611 		uv_sleep(10);
612 	}
613 
614 	/*
615 	 * Detach final reference.
616 	 */
617 	isc_nm_detach(netmgrp);
618 }
619 
620 void
621 isc_nm_maxudp(isc_nm_t *mgr, uint32_t maxudp) {
622 	REQUIRE(VALID_NM(mgr));
623 
624 	atomic_store(&mgr->maxudp, maxudp);
625 }
626 
627 void
628 isc_nmhandle_setwritetimeout(isc_nmhandle_t *handle, uint64_t write_timeout) {
629 	REQUIRE(VALID_NMHANDLE(handle));
630 	REQUIRE(VALID_NMSOCK(handle->sock));
631 
632 	handle->sock->write_timeout = write_timeout;
633 }
634 
635 void
636 isc_nm_settimeouts(isc_nm_t *mgr, uint32_t init, uint32_t idle,
637 		   uint32_t keepalive, uint32_t advertised) {
638 	REQUIRE(VALID_NM(mgr));
639 
640 	atomic_store(&mgr->init, init);
641 	atomic_store(&mgr->idle, idle);
642 	atomic_store(&mgr->keepalive, keepalive);
643 	atomic_store(&mgr->advertised, advertised);
644 }
645 
646 bool
647 isc_nm_getloadbalancesockets(isc_nm_t *mgr) {
648 	REQUIRE(VALID_NM(mgr));
649 
650 	return (mgr->load_balance_sockets);
651 }
652 
653 void
654 isc_nm_setloadbalancesockets(isc_nm_t *mgr, bool enabled) {
655 	REQUIRE(VALID_NM(mgr));
656 
657 #if HAVE_SO_REUSEPORT_LB
658 	mgr->load_balance_sockets = enabled;
659 #else
660 	UNUSED(enabled);
661 #endif
662 }
663 
664 void
665 isc_nm_gettimeouts(isc_nm_t *mgr, uint32_t *initial, uint32_t *idle,
666 		   uint32_t *keepalive, uint32_t *advertised) {
667 	REQUIRE(VALID_NM(mgr));
668 
669 	if (initial != NULL) {
670 		*initial = atomic_load(&mgr->init);
671 	}
672 
673 	if (idle != NULL) {
674 		*idle = atomic_load(&mgr->idle);
675 	}
676 
677 	if (keepalive != NULL) {
678 		*keepalive = atomic_load(&mgr->keepalive);
679 	}
680 
681 	if (advertised != NULL) {
682 		*advertised = atomic_load(&mgr->advertised);
683 	}
684 }
685 
686 /*
687  * nm_thread is a single worker thread, that runs uv_run event loop
688  * until asked to stop.
689  *
690  * There are four queues for asynchronous events:
691  *
692  * 1. priority queue - netievents on the priority queue are run even when
693  *    the taskmgr enters exclusive mode and the netmgr is paused.  This
694  *    is needed to properly start listening on the interfaces, free
695  *    resources on shutdown, or resume from a pause.
696  *
697  * 2. privileged task queue - only privileged tasks are queued here and
698  *    this is the first queue that gets processed when network manager
699  *    is unpaused using isc_nm_resume().  All netmgr workers need to
700  *    clean the privileged task queue before they all proceed to normal
701  *    operation.  Both task queues are processed when the workers are
702  *    shutting down.
703  *
704  * 3. task queue - only (traditional) tasks are scheduled here, and this
705  *    queue and the privileged task queue are both processed when the
706  *    netmgr workers are finishing.  This is needed to process the task
707  *    shutdown events.
708  *
709  * 4. normal queue - this is the queue with netmgr events, e.g. reading,
710  *    sending, callbacks, etc.
711  */
712 
713 static isc_threadresult_t
714 nm_thread(isc_threadarg_t worker0) {
715 	isc__networker_t *worker = (isc__networker_t *)worker0;
716 	isc_nm_t *mgr = worker->mgr;
717 
718 	isc__nm_tid_v = worker->id;
719 
720 	while (true) {
721 		/*
722 		 * uv_run() runs async_cb() in a loop, which processes
723 		 * all four event queues until a "pause" or "stop" event
724 		 * is encountered. On pause, we process only priority and
725 		 * privileged events until resuming.
726 		 */
727 		int r = uv_run(&worker->loop, UV_RUN_DEFAULT);
728 		INSIST(r > 0 || worker->finished);
729 
730 		if (worker->paused) {
731 			INSIST(atomic_load(&mgr->interlocked) != isc_nm_tid());
732 
733 			atomic_fetch_add(&mgr->workers_paused, 1);
734 			if (isc_barrier_wait(&mgr->pausing) != 0) {
735 				LOCK(&mgr->lock);
736 				SIGNAL(&mgr->wkstatecond);
737 				UNLOCK(&mgr->lock);
738 			}
739 
740 			while (worker->paused) {
741 				wait_for_priority_queue(worker);
742 			}
743 
744 			/*
745 			 * All workers must drain the privileged event
746 			 * queue before we resume from pause.
747 			 */
748 			drain_queue(worker, NETIEVENT_PRIVILEGED);
749 
750 			atomic_fetch_sub(&mgr->workers_paused, 1);
751 			if (isc_barrier_wait(&mgr->resuming) != 0) {
752 				LOCK(&mgr->lock);
753 				SIGNAL(&mgr->wkstatecond);
754 				UNLOCK(&mgr->lock);
755 			}
756 		}
757 
758 		if (r == 0) {
759 			INSIST(worker->finished);
760 			break;
761 		}
762 
763 		INSIST(!worker->finished);
764 	}
765 
766 	/*
767 	 * We are shutting down.  Drain the queues.
768 	 */
769 	drain_queue(worker, NETIEVENT_PRIVILEGED);
770 	drain_queue(worker, NETIEVENT_TASK);
771 
772 	for (size_t type = 0; type < NETIEVENT_MAX; type++) {
773 		LOCK(&worker->ievents[type].lock);
774 		INSIST(ISC_LIST_EMPTY(worker->ievents[type].list));
775 		UNLOCK(&worker->ievents[type].lock);
776 	}
777 
778 	LOCK(&mgr->lock);
779 	mgr->workers_running--;
780 	SIGNAL(&mgr->wkstatecond);
781 	UNLOCK(&mgr->lock);
782 
783 	return ((isc_threadresult_t)0);
784 }
785 
786 static bool
787 process_all_queues(isc__networker_t *worker) {
788 	bool reschedule = false;
789 	/*
790 	 * The queue processing functions return ISC_R_SUSPEND when the
791 	 * system is pausing or stopping and we don't want to process
792 	 * the other queues in that case, but we need the async event
793 	 * to be rescheduled in the next uv_run().
794 	 */
795 	for (size_t type = 0; type < NETIEVENT_MAX; type++) {
796 		isc_result_t result = process_queue(worker, type);
797 		switch (result) {
798 		case ISC_R_SUSPEND:
799 			reschedule = true;
800 			break;
801 		case ISC_R_EMPTY:
802 			/* empty queue */
803 			break;
804 		case ISC_R_SUCCESS:
805 			reschedule = true;
806 			break;
807 		default:
808 			UNREACHABLE();
809 		}
810 	}
811 
812 	return (reschedule);
813 }
814 
815 /*
816  * async_cb() is a universal callback for 'async' events sent to the event
817  * loop.  It's the only way to safely pass data to the libuv event loop. We
818  * use a single async event and a set of locked queues of 'isc__netievent_t'
819  * structures passed from other threads.
820  */
821 static void
822 async_cb(uv_async_t *handle) {
823 	isc__networker_t *worker = (isc__networker_t *)handle->loop->data;
824 
825 	if (process_all_queues(worker)) {
826 		/*
827 		 * If we didn't process all the events, we need to enqueue
828 		 * async_cb to be run in the next iteration of the uv_loop
829 		 */
830 		uv_async_send(handle);
831 	}
832 }
833 
834 static void
835 isc__nm_async_stop(isc__networker_t *worker, isc__netievent_t *ev0) {
836 	UNUSED(ev0);
837 	worker->finished = true;
838 	/* Close the async handler */
839 	uv_close((uv_handle_t *)&worker->async, NULL);
840 }
841 
842 void
843 isc_nm_task_enqueue(isc_nm_t *nm, isc_task_t *task, int threadid) {
844 	isc__netievent_t *event = NULL;
845 	int tid;
846 	isc__networker_t *worker = NULL;
847 
848 	if (threadid == -1) {
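	/*
	 * Select a worker loop: -1 picks a random listener thread,
	 * ISC_NM_TASK_SLOW-encoded ids map onto the extra "slow" workers
	 * beyond nlisteners, and any other id is taken modulo nlisteners.
	 */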
849 		tid = (int)isc_random_uniform(nm->nlisteners);
850 	} else if (threadid == ISC_NM_TASK_SLOW_OFFSET) {
851 		tid = nm->nlisteners +
852 		      (int)isc_random_uniform(nm->nworkers - nm->nlisteners);
853 	} else if (threadid < ISC_NM_TASK_SLOW_OFFSET) {
854 		tid = nm->nlisteners + (ISC_NM_TASK_SLOW(threadid) %
855 					(nm->nworkers - nm->nlisteners));
856 	} else {
857 		tid = threadid % nm->nlisteners;
858 	}
859 
860 	worker = &nm->workers[tid];
861 
862 	if (isc_task_privileged(task)) {
863 		event = (isc__netievent_t *)
864 			isc__nm_get_netievent_privilegedtask(nm, task);
865 	} else {
866 		event = (isc__netievent_t *)isc__nm_get_netievent_task(nm,
867 								       task);
868 	}
869 
870 	isc__nm_enqueue_ievent(worker, event);
871 }
872 
873 #define isc__nm_async_privilegedtask(worker, ev0) \
874 	isc__nm_async_task(worker, ev0)
875 
876 static void
877 isc__nm_async_task(isc__networker_t *worker, isc__netievent_t *ev0) {
878 	isc__netievent_task_t *ievent = (isc__netievent_task_t *)ev0;
879 	isc_result_t result;
880 
881 	UNUSED(worker);
882 
883 	result = isc_task_run(ievent->task);
884 
885 	switch (result) {
886 	case ISC_R_QUOTA:
887 		isc_task_ready(ievent->task);
888 		return;
889 	case ISC_R_SUCCESS:
890 		return;
891 	default:
892 		UNREACHABLE();
893 	}
894 }
895 
896 static void
897 wait_for_priority_queue(isc__networker_t *worker) {
898 	isc_condition_t *cond = &worker->ievents[NETIEVENT_PRIORITY].cond;
899 	isc_mutex_t *lock = &worker->ievents[NETIEVENT_PRIORITY].lock;
900 	isc__netievent_list_t *list =
901 		&(worker->ievents[NETIEVENT_PRIORITY].list);
902 
903 	LOCK(lock);
904 	while (ISC_LIST_EMPTY(*list)) {
905 		WAIT(cond, lock);
906 	}
907 	UNLOCK(lock);
908 
909 	drain_queue(worker, NETIEVENT_PRIORITY);
910 }
911 
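/*
 * Process the given queue repeatedly until it is observed to be empty
 * under its lock, so that events enqueued while draining are handled too.
 */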
912 static void
913 drain_queue(isc__networker_t *worker, netievent_type_t type) {
914 	bool empty = false;
915 	while (!empty) {
916 		if (process_queue(worker, type) == ISC_R_EMPTY) {
917 			LOCK(&worker->ievents[type].lock);
918 			empty = ISC_LIST_EMPTY(worker->ievents[type].list);
919 			UNLOCK(&worker->ievents[type].lock);
920 		}
921 	}
922 }
923 
924 /*
925  * The two macros here generate the individual cases for the process_netievent()
926  * function.  The NETIEVENT_CASE(type) macro is the common case, and
927  * NETIEVENT_CASE_NOMORE(type) is a macro that causes the loop in
928  * process_queue() to stop, i.e. it's only used for the netievent that
929  * stops/pauses processing the enqueued netievents.
930  */
931 #define NETIEVENT_CASE(type)                                               \
932 	case netievent_##type: {                                           \
933 		isc__nm_async_##type(worker, ievent);                      \
934 		isc__nm_put_netievent_##type(                              \
935 			worker->mgr, (isc__netievent_##type##_t *)ievent); \
936 		return (true);                                             \
937 	}
938 
939 #define NETIEVENT_CASE_NOMORE(type)                                \
940 	case netievent_##type: {                                   \
941 		isc__nm_async_##type(worker, ievent);              \
942 		isc__nm_put_netievent_##type(worker->mgr, ievent); \
943 		return (false);                                    \
944 	}
945 
946 static bool
947 process_netievent(isc__networker_t *worker, isc__netievent_t *ievent) {
948 	REQUIRE(worker->id == isc_nm_tid());
949 
950 	switch (ievent->type) {
951 		/* Don't process more ievents when we are stopping */
952 		NETIEVENT_CASE_NOMORE(stop);
953 
954 		NETIEVENT_CASE(privilegedtask);
955 		NETIEVENT_CASE(task);
956 
957 		NETIEVENT_CASE(udpconnect);
958 		NETIEVENT_CASE(udplisten);
959 		NETIEVENT_CASE(udpstop);
960 		NETIEVENT_CASE(udpsend);
961 		NETIEVENT_CASE(udpread);
962 		NETIEVENT_CASE(udpcancel);
963 		NETIEVENT_CASE(udpclose);
964 
965 		NETIEVENT_CASE(tcpaccept);
966 		NETIEVENT_CASE(tcpconnect);
967 		NETIEVENT_CASE(tcplisten);
968 		NETIEVENT_CASE(tcpstartread);
969 		NETIEVENT_CASE(tcppauseread);
970 		NETIEVENT_CASE(tcpsend);
971 		NETIEVENT_CASE(tcpstop);
972 		NETIEVENT_CASE(tcpcancel);
973 		NETIEVENT_CASE(tcpclose);
974 
975 		NETIEVENT_CASE(tcpdnsaccept);
976 		NETIEVENT_CASE(tcpdnslisten);
977 		NETIEVENT_CASE(tcpdnsconnect);
978 		NETIEVENT_CASE(tcpdnssend);
979 		NETIEVENT_CASE(tcpdnscancel);
980 		NETIEVENT_CASE(tcpdnsclose);
981 		NETIEVENT_CASE(tcpdnsread);
982 		NETIEVENT_CASE(tcpdnsstop);
983 
984 		NETIEVENT_CASE(connectcb);
985 		NETIEVENT_CASE(readcb);
986 		NETIEVENT_CASE(sendcb);
987 
988 		NETIEVENT_CASE(close);
989 		NETIEVENT_CASE(detach);
990 
991 		NETIEVENT_CASE(shutdown);
992 		NETIEVENT_CASE(resume);
993 		NETIEVENT_CASE_NOMORE(pause);
994 	default:
995 		UNREACHABLE();
996 	}
997 	return (true);
998 }
999 
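/*
 * Move the whole list for 'type' out from under its lock and process the
 * events one by one.  Returns ISC_R_EMPTY if nothing was queued,
 * ISC_R_SUSPEND if a stop/pause event interrupted processing (any leftover
 * events are pushed back onto the queue), and ISC_R_SUCCESS otherwise.
 */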
1000 static isc_result_t
1001 process_queue(isc__networker_t *worker, netievent_type_t type) {
1002 	isc__netievent_t *ievent = NULL;
1003 	isc__netievent_list_t list;
1004 
1005 	ISC_LIST_INIT(list);
1006 
1007 	LOCK(&worker->ievents[type].lock);
1008 	ISC_LIST_MOVE(list, worker->ievents[type].list);
1009 	UNLOCK(&worker->ievents[type].lock);
1010 
1011 	ievent = ISC_LIST_HEAD(list);
1012 	if (ievent == NULL) {
1013 		/* There's nothing scheduled */
1014 		return (ISC_R_EMPTY);
1015 	}
1016 
1017 	while (ievent != NULL) {
1018 		isc__netievent_t *next = ISC_LIST_NEXT(ievent, link);
1019 		ISC_LIST_DEQUEUE(list, ievent, link);
1020 
1021 		if (!process_netievent(worker, ievent)) {
1022 			/* The netievent told us to stop */
1023 			if (!ISC_LIST_EMPTY(list)) {
1024 				/*
1025 				 * Reschedule the rest of the unprocessed
1026 				 * events.
1027 				 */
1028 				LOCK(&worker->ievents[type].lock);
1029 				ISC_LIST_PREPENDLIST(worker->ievents[type].list,
1030 						     list, link);
1031 				UNLOCK(&worker->ievents[type].lock);
1032 			}
1033 			return (ISC_R_SUSPEND);
1034 		}
1035 
1036 		ievent = next;
1037 	}
1038 
1039 	/* We processed at least one */
1040 	return (ISC_R_SUCCESS);
1041 }
1042 
1043 void *
1044 isc__nm_get_netievent(isc_nm_t *mgr, isc__netievent_type type) {
1045 	isc__netievent_storage_t *event = isc_mem_get(mgr->mctx,
1046 						      sizeof(*event));
1047 
1048 	*event = (isc__netievent_storage_t){ .ni.type = type };
1049 	ISC_LINK_INIT(&(event->ni), link);
1050 	return (event);
1051 }
1052 
1053 void
1054 isc__nm_put_netievent(isc_nm_t *mgr, void *ievent) {
1055 	isc_mem_put(mgr->mctx, ievent, sizeof(isc__netievent_storage_t));
1056 }
1057 
1058 NETIEVENT_SOCKET_DEF(tcpclose);
1059 NETIEVENT_SOCKET_DEF(tcplisten);
1060 NETIEVENT_SOCKET_DEF(tcppauseread);
1061 NETIEVENT_SOCKET_DEF(tcpstartread);
1062 NETIEVENT_SOCKET_DEF(tcpstop);
1063 NETIEVENT_SOCKET_DEF(udpclose);
1064 NETIEVENT_SOCKET_DEF(udplisten);
1065 NETIEVENT_SOCKET_DEF(udpread);
1066 NETIEVENT_SOCKET_DEF(udpsend);
1067 NETIEVENT_SOCKET_DEF(udpstop);
1068 
1069 NETIEVENT_SOCKET_DEF(tcpdnsclose);
1070 NETIEVENT_SOCKET_DEF(tcpdnsread);
1071 NETIEVENT_SOCKET_DEF(tcpdnsstop);
1072 NETIEVENT_SOCKET_DEF(tcpdnslisten);
1073 NETIEVENT_SOCKET_REQ_DEF(tcpdnsconnect);
1074 NETIEVENT_SOCKET_REQ_DEF(tcpdnssend);
1075 NETIEVENT_SOCKET_HANDLE_DEF(tcpdnscancel);
1076 NETIEVENT_SOCKET_QUOTA_DEF(tcpdnsaccept);
1077 
1078 NETIEVENT_SOCKET_REQ_DEF(tcpconnect);
1079 NETIEVENT_SOCKET_REQ_DEF(tcpsend);
1080 NETIEVENT_SOCKET_REQ_DEF(udpconnect);
1081 NETIEVENT_SOCKET_REQ_RESULT_DEF(connectcb);
1082 NETIEVENT_SOCKET_REQ_RESULT_DEF(readcb);
1083 NETIEVENT_SOCKET_REQ_RESULT_DEF(sendcb);
1084 
1085 NETIEVENT_SOCKET_DEF(detach);
1086 NETIEVENT_SOCKET_HANDLE_DEF(tcpcancel);
1087 NETIEVENT_SOCKET_HANDLE_DEF(udpcancel);
1088 
1089 NETIEVENT_SOCKET_QUOTA_DEF(tcpaccept);
1090 
1091 NETIEVENT_SOCKET_DEF(close);
1092 NETIEVENT_DEF(pause);
1093 NETIEVENT_DEF(resume);
1094 NETIEVENT_DEF(shutdown);
1095 NETIEVENT_DEF(stop);
1096 
1097 NETIEVENT_TASK_DEF(task);
1098 NETIEVENT_TASK_DEF(privilegedtask);
1099 
1100 void
1101 isc__nm_maybe_enqueue_ievent(isc__networker_t *worker,
1102 			     isc__netievent_t *event) {
1103 	/*
1104 	 * If we are already in the matching nmthread, process the ievent
1105 	 * directly.
1106 	 */
1107 	if (worker->id == isc_nm_tid()) {
1108 		process_netievent(worker, event);
1109 		return;
1110 	}
1111 
1112 	isc__nm_enqueue_ievent(worker, event);
1113 }
1114 
1115 void
1116 isc__nm_enqueue_ievent(isc__networker_t *worker, isc__netievent_t *event) {
1117 	netievent_type_t type;
1118 
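	/*
	 * Netievent types numbered above netievent_prio are priority events
	 * (processed even while the netmgr is paused); privileged tasks and
	 * tasks have their own queues, and everything else is enqueued on
	 * the normal queue.
	 */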
1119 	if (event->type > netievent_prio) {
1120 		type = NETIEVENT_PRIORITY;
1121 	} else {
1122 		switch (event->type) {
1123 		case netievent_prio:
1124 			UNREACHABLE();
1125 			break;
1126 		case netievent_privilegedtask:
1127 			type = NETIEVENT_PRIVILEGED;
1128 			break;
1129 		case netievent_task:
1130 			type = NETIEVENT_TASK;
1131 			break;
1132 		default:
1133 			type = NETIEVENT_NORMAL;
1134 			break;
1135 		}
1136 	}
1137 
1138 	/*
1139 	 * We need to make sure this signal will be delivered and
1140 	 * the queue will be processed.
1141 	 */
1142 	LOCK(&worker->ievents[type].lock);
1143 	ISC_LIST_ENQUEUE(worker->ievents[type].list, event, link);
1144 	if (type == NETIEVENT_PRIORITY) {
1145 		SIGNAL(&worker->ievents[type].cond);
1146 	}
1147 	UNLOCK(&worker->ievents[type].lock);
1148 
1149 	uv_async_send(&worker->async);
1150 }
1151 
1152 bool
1153 isc__nmsocket_active(isc_nmsocket_t *sock) {
1154 	REQUIRE(VALID_NMSOCK(sock));
1155 	if (sock->parent != NULL) {
1156 		return (atomic_load(&sock->parent->active));
1157 	}
1158 
1159 	return (atomic_load(&sock->active));
1160 }
1161 
1162 bool
1163 isc__nmsocket_deactivate(isc_nmsocket_t *sock) {
1164 	REQUIRE(VALID_NMSOCK(sock));
1165 
1166 	if (sock->parent != NULL) {
1167 		return (atomic_compare_exchange_strong(&sock->parent->active,
1168 						       &(bool){ true }, false));
1169 	}
1170 
1171 	return (atomic_compare_exchange_strong(&sock->active, &(bool){ true },
1172 					       false));
1173 }
1174 
1175 void
1176 isc___nmsocket_attach(isc_nmsocket_t *sock, isc_nmsocket_t **target FLARG) {
1177 	REQUIRE(VALID_NMSOCK(sock));
1178 	REQUIRE(target != NULL && *target == NULL);
1179 
1180 	isc_nmsocket_t *rsock = NULL;
1181 
1182 	if (sock->parent != NULL) {
1183 		rsock = sock->parent;
1184 		INSIST(rsock->parent == NULL); /* sanity check */
1185 	} else {
1186 		rsock = sock;
1187 	}
1188 
1189 	NETMGR_TRACE_LOG("isc__nmsocket_attach():%p->references = %" PRIuFAST32
1190 			 "\n",
1191 			 rsock, isc_refcount_current(&rsock->references) + 1);
1192 
1193 	isc_refcount_increment0(&rsock->references);
1194 
1195 	*target = sock;
1196 }
1197 
1198 /*
1199  * Free all resources inside a socket (including its children if any).
1200  */
1201 static void
1202 nmsocket_cleanup(isc_nmsocket_t *sock, bool dofree FLARG) {
1203 	isc_nmhandle_t *handle = NULL;
1204 	isc__nm_uvreq_t *uvreq = NULL;
1205 
1206 	REQUIRE(VALID_NMSOCK(sock));
1207 	REQUIRE(!isc__nmsocket_active(sock));
1208 
1209 	NETMGR_TRACE_LOG("nmsocket_cleanup():%p->references = %" PRIuFAST32
1210 			 "\n",
1211 			 sock, isc_refcount_current(&sock->references));
1212 
1213 	atomic_store(&sock->destroying, true);
1214 
1215 	if (sock->parent == NULL && sock->children != NULL) {
1216 		/*
1217 		 * We shouldn't be here unless there are no active handles,
1218 		 * so we can clean up and free the children.
1219 		 */
1220 		for (size_t i = 0; i < sock->nchildren; i++) {
1221 			if (!atomic_load(&sock->children[i].destroying)) {
1222 				nmsocket_cleanup(&sock->children[i],
1223 						 false FLARG_PASS);
1224 			}
1225 		}
1226 
1227 		/*
1228 		 * This was a parent socket: destroy the listening
1229 		 * barriers that synchronized the children.
1230 		 */
1231 		isc_barrier_destroy(&sock->startlistening);
1232 		isc_barrier_destroy(&sock->stoplistening);
1233 
1234 		/*
1235 		 * Now free them.
1236 		 */
1237 		isc_mem_put(sock->mgr->mctx, sock->children,
1238 			    sock->nchildren * sizeof(*sock));
1239 		sock->children = NULL;
1240 		sock->nchildren = 0;
1241 	}
1242 	if (sock->statsindex != NULL) {
1243 		isc__nm_decstats(sock->mgr, sock->statsindex[STATID_ACTIVE]);
1244 	}
1245 
1246 	sock->statichandle = NULL;
1247 
1248 	if (sock->outerhandle != NULL) {
1249 		isc__nmhandle_detach(&sock->outerhandle FLARG_PASS);
1250 	}
1251 
1252 	if (sock->outer != NULL) {
1253 		isc___nmsocket_detach(&sock->outer FLARG_PASS);
1254 	}
1255 
1256 	while ((handle = isc_astack_pop(sock->inactivehandles)) != NULL) {
1257 		nmhandle_free(sock, handle);
1258 	}
1259 
1260 	if (sock->buf != NULL) {
1261 		isc_mem_free(sock->mgr->mctx, sock->buf);
1262 	}
1263 
1264 	if (sock->quota != NULL) {
1265 		isc_quota_detach(&sock->quota);
1266 	}
1267 
1268 	sock->pquota = NULL;
1269 
1270 	isc_astack_destroy(sock->inactivehandles);
1271 
1272 	while ((uvreq = isc_astack_pop(sock->inactivereqs)) != NULL) {
1273 		isc_mem_put(sock->mgr->mctx, uvreq, sizeof(*uvreq));
1274 	}
1275 
1276 	isc_astack_destroy(sock->inactivereqs);
1277 	sock->magic = 0;
1278 
1279 	isc_condition_destroy(&sock->scond);
1280 	isc_condition_destroy(&sock->cond);
1281 	isc_mutex_destroy(&sock->lock);
1282 #ifdef NETMGR_TRACE
1283 	LOCK(&sock->mgr->lock);
1284 	ISC_LIST_UNLINK(sock->mgr->active_sockets, sock, active_link);
1285 	UNLOCK(&sock->mgr->lock);
1286 #endif
1287 	if (dofree) {
1288 		isc_nm_t *mgr = sock->mgr;
1289 		isc_mem_put(mgr->mctx, sock, sizeof(*sock));
1290 		isc_nm_detach(&mgr);
1291 	} else {
1292 		isc_nm_detach(&sock->mgr);
1293 	}
1294 }
1295 
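/*
 * Destroy the socket via nmsocket_cleanup() once it is closed, inactive,
 * not already being destroyed, unreferenced, and neither it nor its
 * children have active handles (a set statichandle also allows destroy).
 */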
1296 static void
1297 nmsocket_maybe_destroy(isc_nmsocket_t *sock FLARG) {
1298 	int active_handles;
1299 	bool destroy = false;
1300 
1301 	NETMGR_TRACE_LOG("%s():%p->references = %" PRIuFAST32 "\n", __func__,
1302 			 sock, isc_refcount_current(&sock->references));
1303 
1304 	if (sock->parent != NULL) {
1305 		/*
1306 		 * This is a child socket and cannot be destroyed except
1307 		 * as a side effect of destroying the parent, so let's go
1308 		 * see if the parent is ready to be destroyed.
1309 		 */
1310 		nmsocket_maybe_destroy(sock->parent FLARG_PASS);
1311 		return;
1312 	}
1313 
1314 	/*
1315 	 * This is a parent socket (or a standalone). See whether the
1316 	 * children have active handles before deciding whether to
1317 	 * accept destruction.
1318 	 */
1319 	LOCK(&sock->lock);
1320 	if (atomic_load(&sock->active) || atomic_load(&sock->destroying) ||
1321 	    !atomic_load(&sock->closed) || atomic_load(&sock->references) != 0)
1322 	{
1323 		UNLOCK(&sock->lock);
1324 		return;
1325 	}
1326 
1327 	active_handles = atomic_load(&sock->ah);
1328 	if (sock->children != NULL) {
1329 		for (size_t i = 0; i < sock->nchildren; i++) {
1330 			LOCK(&sock->children[i].lock);
1331 			active_handles += atomic_load(&sock->children[i].ah);
1332 			UNLOCK(&sock->children[i].lock);
1333 		}
1334 	}
1335 
1336 	if (active_handles == 0 || sock->statichandle != NULL) {
1337 		destroy = true;
1338 	}
1339 
1340 	NETMGR_TRACE_LOG("%s:%p->active_handles = %d, .statichandle = %p\n",
1341 			 __func__, sock, active_handles, sock->statichandle);
1342 
1343 	if (destroy) {
1344 		atomic_store(&sock->destroying, true);
1345 		UNLOCK(&sock->lock);
1346 		nmsocket_cleanup(sock, true FLARG_PASS);
1347 	} else {
1348 		UNLOCK(&sock->lock);
1349 	}
1350 }
1351 
1352 void
1353 isc___nmsocket_prep_destroy(isc_nmsocket_t *sock FLARG) {
1354 	REQUIRE(sock->parent == NULL);
1355 
1356 	NETMGR_TRACE_LOG("isc___nmsocket_prep_destroy():%p->references = "
1357 			 "%" PRIuFAST32 "\n",
1358 			 sock, isc_refcount_current(&sock->references));
1359 
1360 	/*
1361 	 * The final external reference to the socket is gone. We can try
1362 	 * destroying the socket, but we have to wait for all the inflight
1363 	 * handles to finish first.
1364 	 */
1365 	atomic_store(&sock->active, false);
1366 
1367 	/*
1368 	 * If the socket has children, they'll need to be marked inactive
1369 	 * so they can be cleaned up too.
1370 	 */
1371 	if (sock->children != NULL) {
1372 		for (size_t i = 0; i < sock->nchildren; i++) {
1373 			atomic_store(&sock->children[i].active, false);
1374 		}
1375 	}
1376 
1377 	/*
1378 	 * If we're here then we already stopped listening; otherwise
1379 	 * we'd have a hanging reference from the listening process.
1380 	 *
1381 	 * If it's a regular socket we may need to close it.
1382 	 */
1383 	if (!atomic_load(&sock->closed)) {
1384 		switch (sock->type) {
1385 		case isc_nm_udpsocket:
1386 			isc__nm_udp_close(sock);
1387 			return;
1388 		case isc_nm_tcpsocket:
1389 			isc__nm_tcp_close(sock);
1390 			return;
1391 		case isc_nm_tcpdnssocket:
1392 			isc__nm_tcpdns_close(sock);
1393 			return;
1394 		default:
1395 			break;
1396 		}
1397 	}
1398 
1399 	nmsocket_maybe_destroy(sock FLARG_PASS);
1400 }
1401 
1402 void
1403 isc___nmsocket_detach(isc_nmsocket_t **sockp FLARG) {
1404 	REQUIRE(sockp != NULL && *sockp != NULL);
1405 	REQUIRE(VALID_NMSOCK(*sockp));
1406 
1407 	isc_nmsocket_t *sock = *sockp, *rsock = NULL;
1408 	*sockp = NULL;
1409 
1410 	/*
1411 	 * If the socket is a part of a set (a child socket) we are
1412 	 * counting references for the whole set at the parent.
1413 	 */
1414 	if (sock->parent != NULL) {
1415 		rsock = sock->parent;
1416 		INSIST(rsock->parent == NULL); /* Sanity check */
1417 	} else {
1418 		rsock = sock;
1419 	}
1420 
1421 	NETMGR_TRACE_LOG("isc__nmsocket_detach():%p->references = %" PRIuFAST32
1422 			 "\n",
1423 			 rsock, isc_refcount_current(&rsock->references) - 1);
1424 
1425 	if (isc_refcount_decrement(&rsock->references) == 1) {
1426 		isc___nmsocket_prep_destroy(rsock FLARG_PASS);
1427 	}
1428 }
1429 
1430 void
1431 isc_nmsocket_close(isc_nmsocket_t **sockp) {
1432 	REQUIRE(sockp != NULL);
1433 	REQUIRE(VALID_NMSOCK(*sockp));
1434 	REQUIRE((*sockp)->type == isc_nm_udplistener ||
1435 		(*sockp)->type == isc_nm_tcplistener ||
1436 		(*sockp)->type == isc_nm_tcpdnslistener);
1437 
1438 	isc__nmsocket_detach(sockp);
1439 }
1440 
1441 void
1442 isc___nmsocket_init(isc_nmsocket_t *sock, isc_nm_t *mgr, isc_nmsocket_type type,
1443 		    isc_sockaddr_t *iface FLARG) {
1444 	uint16_t family;
1445 
1446 	REQUIRE(sock != NULL);
1447 	REQUIRE(mgr != NULL);
1448 	REQUIRE(iface != NULL);
1449 
1450 	family = iface->type.sa.sa_family;
1451 
1452 	*sock = (isc_nmsocket_t){ .type = type,
1453 				  .iface = *iface,
1454 				  .fd = -1,
1455 				  .inactivehandles = isc_astack_new(
1456 					  mgr->mctx, ISC_NM_HANDLES_STACK_SIZE),
1457 				  .inactivereqs = isc_astack_new(
1458 					  mgr->mctx, ISC_NM_REQS_STACK_SIZE) };
1459 
1460 #if NETMGR_TRACE
1461 	sock->backtrace_size = backtrace(sock->backtrace, TRACE_SIZE);
1462 	ISC_LINK_INIT(sock, active_link);
1463 	ISC_LIST_INIT(sock->active_handles);
1464 	LOCK(&mgr->lock);
1465 	ISC_LIST_APPEND(mgr->active_sockets, sock, active_link);
1466 	UNLOCK(&mgr->lock);
1467 #endif
1468 
1469 	isc_nm_attach(mgr, &sock->mgr);
1470 	sock->uv_handle.handle.data = sock;
1471 
1472 	ISC_LINK_INIT(&sock->quotacb, link);
1473 
1474 	switch (type) {
1475 	case isc_nm_udpsocket:
1476 	case isc_nm_udplistener:
1477 		if (family == AF_INET) {
1478 			sock->statsindex = udp4statsindex;
1479 		} else {
1480 			sock->statsindex = udp6statsindex;
1481 		}
1482 		isc__nm_incstats(sock->mgr, sock->statsindex[STATID_ACTIVE]);
1483 		break;
1484 	case isc_nm_tcpsocket:
1485 	case isc_nm_tcplistener:
1486 	case isc_nm_tcpdnssocket:
1487 	case isc_nm_tcpdnslistener:
1488 		if (family == AF_INET) {
1489 			sock->statsindex = tcp4statsindex;
1490 		} else {
1491 			sock->statsindex = tcp6statsindex;
1492 		}
1493 		isc__nm_incstats(sock->mgr, sock->statsindex[STATID_ACTIVE]);
1494 		break;
1495 	default:
1496 		break;
1497 	}
1498 
1499 	isc_mutex_init(&sock->lock);
1500 	isc_condition_init(&sock->cond);
1501 	isc_condition_init(&sock->scond);
1502 	isc_refcount_init(&sock->references, 1);
1503 
1504 	NETMGR_TRACE_LOG("isc__nmsocket_init():%p->references = %" PRIuFAST32
1505 			 "\n",
1506 			 sock, isc_refcount_current(&sock->references));
1507 
1508 	atomic_init(&sock->active, true);
1509 	atomic_init(&sock->sequential, false);
1510 	atomic_init(&sock->readpaused, false);
1511 	atomic_init(&sock->closing, false);
1512 	atomic_init(&sock->listening, 0);
1513 	atomic_init(&sock->closed, 0);
1514 	atomic_init(&sock->destroying, 0);
1515 	atomic_init(&sock->ah, 0);
1516 	atomic_init(&sock->client, 0);
1517 	atomic_init(&sock->connecting, false);
1518 	atomic_init(&sock->keepalive, false);
1519 	atomic_init(&sock->connected, false);
1520 	atomic_init(&sock->timedout, false);
1521 
1522 	atomic_init(&sock->active_child_connections, 0);
1523 
1524 	sock->magic = NMSOCK_MAGIC;
1525 }
1526 
1527 void
1528 isc__nmsocket_clearcb(isc_nmsocket_t *sock) {
1529 	REQUIRE(VALID_NMSOCK(sock));
1530 	REQUIRE(!isc__nm_in_netthread() || sock->tid == isc_nm_tid());
1531 
1532 	sock->recv_cb = NULL;
1533 	sock->recv_cbarg = NULL;
1534 	sock->accept_cb = NULL;
1535 	sock->accept_cbarg = NULL;
1536 	sock->connect_cb = NULL;
1537 	sock->connect_cbarg = NULL;
1538 }
1539 
1540 void
1541 isc__nm_free_uvbuf(isc_nmsocket_t *sock, const uv_buf_t *buf) {
1542 	isc__networker_t *worker = NULL;
1543 
1544 	REQUIRE(VALID_NMSOCK(sock));
1545 
1546 	worker = &sock->mgr->workers[sock->tid];
1547 	REQUIRE(buf->base == worker->recvbuf);
1548 
1549 	worker->recvbuf_inuse = false;
1550 }
1551 
1552 static isc_nmhandle_t *
1553 alloc_handle(isc_nmsocket_t *sock) {
1554 	isc_nmhandle_t *handle =
1555 		isc_mem_get(sock->mgr->mctx,
1556 			    sizeof(isc_nmhandle_t) + sock->extrahandlesize);
1557 
1558 	*handle = (isc_nmhandle_t){ .magic = NMHANDLE_MAGIC };
1559 #ifdef NETMGR_TRACE
1560 	ISC_LINK_INIT(handle, active_link);
1561 #endif
1562 	isc_refcount_init(&handle->references, 1);
1563 
1564 	return (handle);
1565 }
1566 
1567 isc_nmhandle_t *
1568 isc___nmhandle_get(isc_nmsocket_t *sock, isc_sockaddr_t *peer,
1569 		   isc_sockaddr_t *local FLARG) {
1570 	isc_nmhandle_t *handle = NULL;
1571 
1572 	REQUIRE(VALID_NMSOCK(sock));
1573 
1574 	handle = isc_astack_pop(sock->inactivehandles);
1575 
1576 	if (handle == NULL) {
1577 		handle = alloc_handle(sock);
1578 	} else {
1579 		isc_refcount_init(&handle->references, 1);
1580 		INSIST(VALID_NMHANDLE(handle));
1581 	}
1582 
1583 	NETMGR_TRACE_LOG(
1584 		"isc__nmhandle_get():handle %p->references = %" PRIuFAST32 "\n",
1585 		handle, isc_refcount_current(&handle->references));
1586 
1587 	isc___nmsocket_attach(sock, &handle->sock FLARG_PASS);
1588 
1589 #if NETMGR_TRACE
1590 	handle->backtrace_size = backtrace(handle->backtrace, TRACE_SIZE);
1591 #endif
1592 
1593 	if (peer != NULL) {
1594 		handle->peer = *peer;
1595 	} else {
1596 		handle->peer = sock->peer;
1597 	}
1598 
1599 	if (local != NULL) {
1600 		handle->local = *local;
1601 	} else {
1602 		handle->local = sock->iface;
1603 	}
1604 
1605 	(void)atomic_fetch_add(&sock->ah, 1);
1606 
1607 #ifdef NETMGR_TRACE
1608 	LOCK(&sock->lock);
1609 	ISC_LIST_APPEND(sock->active_handles, handle, active_link);
1610 	UNLOCK(&sock->lock);
1611 #endif
1612 
1613 	switch (sock->type) {
1614 	case isc_nm_udpsocket:
1615 	case isc_nm_tcpdnssocket:
1616 		if (!atomic_load(&sock->client)) {
1617 			break;
1618 		}
1619 		FALLTHROUGH;
1620 	case isc_nm_tcpsocket:
1621 		INSIST(sock->statichandle == NULL);
1622 
1623 		/*
1624 		 * statichandle must be assigned, not attached;
1625 		 * otherwise, if a handle was detached elsewhere
1626 		 * it could never reach 0 references, and the
1627 		 * handle and socket would never be freed.
1628 		 */
1629 		sock->statichandle = handle;
1630 		break;
1631 	default:
1632 		break;
1633 	}
1634 
1635 	return (handle);
1636 }
1637 
1638 void
1639 isc__nmhandle_attach(isc_nmhandle_t *handle, isc_nmhandle_t **handlep FLARG) {
1640 	REQUIRE(VALID_NMHANDLE(handle));
1641 	REQUIRE(handlep != NULL && *handlep == NULL);
1642 
1643 	NETMGR_TRACE_LOG("isc__nmhandle_attach():handle %p->references = "
1644 			 "%" PRIuFAST32 "\n",
1645 			 handle, isc_refcount_current(&handle->references) + 1);
1646 
1647 	isc_refcount_increment(&handle->references);
1648 	*handlep = handle;
1649 }
1650 
1651 bool
1652 isc_nmhandle_is_stream(isc_nmhandle_t *handle) {
1653 	REQUIRE(VALID_NMHANDLE(handle));
1654 
1655 	return (handle->sock->type == isc_nm_tcpsocket ||
1656 		handle->sock->type == isc_nm_tcpdnssocket);
1657 }
1658 
1659 static void
1660 nmhandle_free(isc_nmsocket_t *sock, isc_nmhandle_t *handle) {
1661 	size_t extra = sock->extrahandlesize;
1662 
1663 	isc_refcount_destroy(&handle->references);
1664 
1665 	if (handle->dofree != NULL) {
1666 		handle->dofree(handle->opaque);
1667 	}
1668 
1669 	*handle = (isc_nmhandle_t){ .magic = 0 };
1670 
1671 	isc_mem_put(sock->mgr->mctx, handle, sizeof(isc_nmhandle_t) + extra);
1672 }
1673 
1674 static void
1675 nmhandle_deactivate(isc_nmsocket_t *sock, isc_nmhandle_t *handle) {
1676 	bool reuse = false;
1677 
1678 	/*
1679 	 * We do all of this under lock to avoid races with socket
1680 	 * destruction.  We have to do this now, because at this point the
1681 	 * socket is either unused or still attached to event->sock.
1682 	 */
1683 	LOCK(&sock->lock);
1684 
1685 #ifdef NETMGR_TRACE
1686 	ISC_LIST_UNLINK(sock->active_handles, handle, active_link);
1687 #endif
1688 
1689 	INSIST(atomic_fetch_sub(&sock->ah, 1) > 0);
1690 
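	/*
	 * If the socket is still active, try to return the handle to the
	 * inactive-handles stack for later reuse; otherwise (or in
	 * sanitizer builds, where the reuse stack is bypassed) free it.
	 */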
1691 #if !__SANITIZE_ADDRESS__ && !__SANITIZE_THREAD__
1692 	if (atomic_load(&sock->active)) {
1693 		reuse = isc_astack_trypush(sock->inactivehandles, handle);
1694 	}
1695 #endif /* !__SANITIZE_ADDRESS__ && !__SANITIZE_THREAD__ */
1696 	if (!reuse) {
1697 		nmhandle_free(sock, handle);
1698 	}
1699 	UNLOCK(&sock->lock);
1700 }
1701 
1702 void
1703 isc__nmhandle_detach(isc_nmhandle_t **handlep FLARG) {
1704 	isc_nmsocket_t *sock = NULL;
1705 	isc_nmhandle_t *handle = NULL;
1706 
1707 	REQUIRE(handlep != NULL);
1708 	REQUIRE(VALID_NMHANDLE(*handlep));
1709 
1710 	handle = *handlep;
1711 	*handlep = NULL;
1712 
1713 	/*
1714 	 * If the closehandle_cb is set, it needs to run asynchronously to
1715 	 * ensure correct ordering of the isc__nm_process_sock_buffer().
1716 	 */
1717 	sock = handle->sock;
1718 	if (sock->tid == isc_nm_tid() && sock->closehandle_cb == NULL) {
1719 		nmhandle_detach_cb(&handle FLARG_PASS);
1720 	} else {
1721 		isc__netievent_detach_t *event =
1722 			isc__nm_get_netievent_detach(sock->mgr, sock);
1723 		/*
1724 		 * We are using an implicit "attach": the last reference
1725 		 * needs to be destroyed explicitly in the async callback.
1726 		 */
1727 		event->handle = handle;
1728 		FLARG_IEVENT_PASS(event);
1729 		isc__nm_enqueue_ievent(&sock->mgr->workers[sock->tid],
1730 				       (isc__netievent_t *)event);
1731 	}
1732 }
1733 
1734 void
1735 isc__nmsocket_shutdown(isc_nmsocket_t *sock);
1736 
1737 static void
1738 nmhandle_detach_cb(isc_nmhandle_t **handlep FLARG) {
1739 	isc_nmsocket_t *sock = NULL;
1740 	isc_nmhandle_t *handle = NULL;
1741 
1742 	REQUIRE(handlep != NULL);
1743 	REQUIRE(VALID_NMHANDLE(*handlep));
1744 
1745 	handle = *handlep;
1746 	*handlep = NULL;
1747 
1748 	NETMGR_TRACE_LOG("isc__nmhandle_detach():%p->references = %" PRIuFAST32
1749 			 "\n",
1750 			 handle, isc_refcount_current(&handle->references) - 1);
1751 
1752 	if (isc_refcount_decrement(&handle->references) > 1) {
1753 		return;
1754 	}
1755 
1756 	/* We need an acquire memory barrier here */
1757 	(void)isc_refcount_current(&handle->references);
1758 
1759 	sock = handle->sock;
1760 	handle->sock = NULL;
1761 
1762 	if (handle->doreset != NULL) {
1763 		handle->doreset(handle->opaque);
1764 	}
1765 
1766 	nmhandle_deactivate(sock, handle);
1767 
1768 	/*
1769 	 * The handle is gone now. If the socket has a callback configured
1770 	 * for that (e.g., to perform cleanup after request processing),
1771 	 * call it now, or schedule it to run asynchronously.
1772 	 */
1773 	if (sock->closehandle_cb != NULL) {
1774 		if (sock->tid == isc_nm_tid()) {
1775 			sock->closehandle_cb(sock);
1776 		} else {
1777 			isc__netievent_close_t *event =
1778 				isc__nm_get_netievent_close(sock->mgr, sock);
1779 			isc__nm_enqueue_ievent(&sock->mgr->workers[sock->tid],
1780 					       (isc__netievent_t *)event);
1781 		}
1782 	}
1783 
1784 	if (handle == sock->statichandle) {
1785 		/* statichandle is assigned, not attached. */
1786 		sock->statichandle = NULL;
1787 	}
1788 
1789 	isc___nmsocket_detach(&sock FLARG_PASS);
1790 }
1791 
1792 void *
1793 isc_nmhandle_getdata(isc_nmhandle_t *handle) {
1794 	REQUIRE(VALID_NMHANDLE(handle));
1795 
1796 	return (handle->opaque);
1797 }
1798 
1799 int
1800 isc_nmhandle_getfd(isc_nmhandle_t *handle) {
1801 	REQUIRE(VALID_NMHANDLE(handle));
1802 
1803 	return (handle->sock->fd);
1804 }
1805 
1806 void
1807 isc_nmhandle_setdata(isc_nmhandle_t *handle, void *arg,
1808 		     isc_nm_opaquecb_t doreset, isc_nm_opaquecb_t dofree) {
1809 	REQUIRE(VALID_NMHANDLE(handle));
1810 
1811 	handle->opaque = arg;
1812 	handle->doreset = doreset;
1813 	handle->dofree = dofree;
1814 }
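/*
 * A minimal usage sketch for isc_nmhandle_setdata() (illustrative only;
 * the "conn" structure and callback names below are hypothetical and not
 * part of this file):
 *
 *	static void
 *	conn_reset(void *arg) {
 *		conn_t *conn = arg;
 *		clear_per_request_state(conn);
 *	}
 *
 *	static void
 *	conn_free(void *arg) {
 *		free_conn(arg);
 *	}
 *
 *	isc_nmhandle_setdata(handle, conn, conn_reset, conn_free);
 *
 * The doreset callback is invoked when the handle is detached (see
 * nmhandle_detach_cb() above); dofree is presumably invoked when the
 * handle itself is freed.
 */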
1815 
1816 void
1817 isc__nm_alloc_dnsbuf(isc_nmsocket_t *sock, size_t len) {
1818 	REQUIRE(len <= NM_BIG_BUF);
1819 
1820 	if (sock->buf == NULL) {
1821 		/* We don't have the buffer at all */
1822 		size_t alloc_len = len < NM_REG_BUF ? NM_REG_BUF : NM_BIG_BUF;
1823 		sock->buf = isc_mem_allocate(sock->mgr->mctx, alloc_len);
1824 		sock->buf_size = alloc_len;
1825 	} else {
1826 		/* We have the buffer but it's too small */
1827 		sock->buf = isc_mem_reallocate(sock->mgr->mctx, sock->buf,
1828 					       NM_BIG_BUF);
1829 		sock->buf_size = NM_BIG_BUF;
1830 	}
1831 }
1832 
1833 void
1834 isc__nm_failed_send_cb(isc_nmsocket_t *sock, isc__nm_uvreq_t *req,
1835 		       isc_result_t eresult) {
1836 	REQUIRE(VALID_NMSOCK(sock));
1837 	REQUIRE(VALID_UVREQ(req));
1838 
1839 	if (req->cb.send != NULL) {
1840 		isc__nm_sendcb(sock, req, eresult, true);
1841 	} else {
1842 		isc__nm_uvreq_put(&req, sock);
1843 	}
1844 }
1845 
1846 void
1847 isc__nm_failed_accept_cb(isc_nmsocket_t *sock, isc_result_t eresult) {
1848 	REQUIRE(sock->accepting);
1849 	REQUIRE(sock->server);
1850 
1851 	/*
1852 	 * Detach the quota early to make room for other connections;
1853 	 * otherwise it would be detached later, asynchronously, and would
1854 	 * clog the quota unnecessarily.
1855 	 */
1856 	if (sock->quota != NULL) {
1857 		isc_quota_detach(&sock->quota);
1858 	}
1859 
1860 	isc__nmsocket_detach(&sock->server);
1861 
1862 	sock->accepting = false;
1863 
1864 	switch (eresult) {
1865 	case ISC_R_NOTCONNECTED:
1866 		/* IGNORE: The client disconnected before we could accept */
1867 		break;
1868 	default:
1869 		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
1870 			      ISC_LOGMODULE_NETMGR, ISC_LOG_ERROR,
1871 			      "Accepting TCP connection failed: %s",
1872 			      isc_result_totext(eresult));
1873 	}
1874 }
1875 
1876 void
1877 isc__nm_failed_connect_cb(isc_nmsocket_t *sock, isc__nm_uvreq_t *req,
1878 			  isc_result_t eresult, bool async) {
1879 	REQUIRE(VALID_NMSOCK(sock));
1880 	REQUIRE(VALID_UVREQ(req));
1881 	REQUIRE(sock->tid == isc_nm_tid());
1882 	REQUIRE(req->cb.connect != NULL);
1883 
1884 	isc__nmsocket_timer_stop(sock);
1885 	uv_handle_set_data((uv_handle_t *)&sock->read_timer, sock);
1886 
1887 	INSIST(atomic_compare_exchange_strong(&sock->connecting,
1888 					      &(bool){ true }, false));
1889 
1890 	isc__nmsocket_clearcb(sock);
1891 	isc__nm_connectcb(sock, req, eresult, async);
1892 
1893 	isc__nmsocket_prep_destroy(sock);
1894 }
1895 
1896 void
1897 isc__nm_failed_read_cb(isc_nmsocket_t *sock, isc_result_t result, bool async) {
1898 	REQUIRE(VALID_NMSOCK(sock));
1899 	UNUSED(async);
1900 
1901 	switch (sock->type) {
1902 	case isc_nm_udpsocket:
1903 		isc__nm_udp_failed_read_cb(sock, result);
1904 		return;
1905 	case isc_nm_tcpsocket:
1906 		isc__nm_tcp_failed_read_cb(sock, result);
1907 		return;
1908 	case isc_nm_tcpdnssocket:
1909 		isc__nm_tcpdns_failed_read_cb(sock, result);
1910 		return;
1911 	default:
1912 		UNREACHABLE();
1913 	}
1914 }
1915 
1916 void
1917 isc__nmsocket_connecttimeout_cb(uv_timer_t *timer) {
1918 	uv_connect_t *uvreq = uv_handle_get_data((uv_handle_t *)timer);
1919 	isc_nmsocket_t *sock = uv_handle_get_data((uv_handle_t *)uvreq->handle);
1920 	isc__nm_uvreq_t *req = uv_handle_get_data((uv_handle_t *)uvreq);
1921 
1922 	REQUIRE(VALID_NMSOCK(sock));
1923 	REQUIRE(sock->tid == isc_nm_tid());
1924 	REQUIRE(atomic_load(&sock->connecting));
1925 	REQUIRE(VALID_UVREQ(req));
1926 	REQUIRE(VALID_NMHANDLE(req->handle));
1927 
1928 	isc__nmsocket_timer_stop(sock);
1929 
1930 	/*
1931 	 * Mark the connection as timed out and shut down the socket.
1932 	 */
1933 
1934 	INSIST(atomic_compare_exchange_strong(&sock->timedout, &(bool){ false },
1935 					      true));
1936 	isc__nmsocket_clearcb(sock);
1937 	isc__nmsocket_shutdown(sock);
1938 }
1939 
1940 void
1941 isc__nm_accept_connection_log(isc_result_t result, bool can_log_quota) {
1942 	int level;
1943 
1944 	switch (result) {
1945 	case ISC_R_SUCCESS:
1946 	case ISC_R_NOCONN:
1947 		return;
1948 	case ISC_R_QUOTA:
1949 	case ISC_R_SOFTQUOTA:
1950 		if (!can_log_quota) {
1951 			return;
1952 		}
1953 		level = ISC_LOG_INFO;
1954 		break;
1955 	case ISC_R_NOTCONNECTED:
1956 		level = ISC_LOG_INFO;
1957 		break;
1958 	default:
1959 		level = ISC_LOG_ERROR;
1960 	}
1961 
1962 	isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_NETMGR,
1963 		      level, "Accepting TCP connection failed: %s",
1964 		      isc_result_totext(result));
1965 }
1966 
1967 void
1968 isc__nmsocket_writetimeout_cb(void *data, isc_result_t eresult) {
1969 	isc__nm_uvreq_t *req = data;
1970 	isc_nmsocket_t *sock = NULL;
1971 
1972 	REQUIRE(eresult == ISC_R_TIMEDOUT);
1973 	REQUIRE(VALID_UVREQ(req));
1974 	REQUIRE(VALID_NMSOCK(req->sock));
1975 
1976 	sock = req->sock;
1977 
1978 	isc__nmsocket_reset(sock);
1979 }
1980 
1981 void
1982 isc__nmsocket_readtimeout_cb(uv_timer_t *timer) {
1983 	isc_nmsocket_t *sock = uv_handle_get_data((uv_handle_t *)timer);
1984 
1985 	REQUIRE(VALID_NMSOCK(sock));
1986 	REQUIRE(sock->tid == isc_nm_tid());
1987 	REQUIRE(sock->reading);
1988 
1989 	if (atomic_load(&sock->client)) {
1990 		uv_timer_stop(timer);
1991 
1992 		if (sock->recv_cb != NULL) {
1993 			isc__nm_uvreq_t *req = isc__nm_get_read_req(sock, NULL);
1994 			isc__nm_readcb(sock, req, ISC_R_TIMEDOUT);
1995 		}
1996 
1997 		if (!isc__nmsocket_timer_running(sock)) {
1998 			isc__nmsocket_clearcb(sock);
1999 			isc__nm_failed_read_cb(sock, ISC_R_CANCELED, false);
2000 		}
2001 	} else {
2002 		isc__nm_failed_read_cb(sock, ISC_R_TIMEDOUT, false);
2003 	}
2004 }
2005 
2006 void
2007 isc__nmsocket_timer_restart(isc_nmsocket_t *sock) {
2008 	REQUIRE(VALID_NMSOCK(sock));
2009 
2010 	if (atomic_load(&sock->connecting)) {
2011 		int r;
2012 
2013 		if (sock->connect_timeout == 0) {
2014 			return;
2015 		}
2016 
2017 		r = uv_timer_start(&sock->read_timer,
2018 				   isc__nmsocket_connecttimeout_cb,
2019 				   sock->connect_timeout + 10, 0);
2020 		UV_RUNTIME_CHECK(uv_timer_start, r);
2021 
2022 	} else {
2023 		int r;
2024 
2025 		if (sock->read_timeout == 0) {
2026 			return;
2027 		}
2028 
2029 		r = uv_timer_start(&sock->read_timer,
2030 				   isc__nmsocket_readtimeout_cb,
2031 				   sock->read_timeout, 0);
2032 		UV_RUNTIME_CHECK(uv_timer_start, r);
2033 	}
2034 }
2035 
2036 bool
2037 isc__nmsocket_timer_running(isc_nmsocket_t *sock) {
2038 	REQUIRE(VALID_NMSOCK(sock));
2039 
2040 	return (uv_is_active((uv_handle_t *)&sock->read_timer));
2041 }
2042 
2043 void
2044 isc__nmsocket_timer_start(isc_nmsocket_t *sock) {
2045 	REQUIRE(VALID_NMSOCK(sock));
2046 
2047 	if (isc__nmsocket_timer_running(sock)) {
2048 		return;
2049 	}
2050 
2051 	isc__nmsocket_timer_restart(sock);
2052 }
2053 
2054 void
2055 isc__nmsocket_timer_stop(isc_nmsocket_t *sock) {
2056 	int r;
2057 
2058 	REQUIRE(VALID_NMSOCK(sock));
2059 
2060 	/* uv_timer_stop() is idempotent; no need to check whether it is running. */
2061 
2062 	r = uv_timer_stop(&sock->read_timer);
2063 	UV_RUNTIME_CHECK(uv_timer_stop, r);
2064 }
2065 
2066 isc__nm_uvreq_t *
2067 isc__nm_get_read_req(isc_nmsocket_t *sock, isc_sockaddr_t *sockaddr) {
2068 	isc__nm_uvreq_t *req = NULL;
2069 
2070 	req = isc__nm_uvreq_get(sock->mgr, sock);
2071 	req->cb.recv = sock->recv_cb;
2072 	req->cbarg = sock->recv_cbarg;
2073 
2074 	switch (sock->type) {
2075 	case isc_nm_tcpsocket:
2076 		isc_nmhandle_attach(sock->statichandle, &req->handle);
2077 		break;
2078 	default:
2079 		if (atomic_load(&sock->client)) {
2080 			isc_nmhandle_attach(sock->statichandle, &req->handle);
2081 		} else {
2082 			req->handle = isc__nmhandle_get(sock, sockaddr, NULL);
2083 		}
2084 		break;
2085 	}
2086 
2087 	return (req);
2088 }
2089 
2090 /*%<
2091  * Allocator callback for read operations.
2092  *
2093  * Note that this doesn't actually allocate anything; it just assigns the
2094  * worker's receive buffer to a socket and marks it as "in use".
2095  */
2096 void
2097 isc__nm_alloc_cb(uv_handle_t *handle, size_t size, uv_buf_t *buf) {
2098 	isc_nmsocket_t *sock = uv_handle_get_data(handle);
2099 	isc__networker_t *worker = NULL;
2100 
2101 	REQUIRE(VALID_NMSOCK(sock));
2102 	REQUIRE(isc__nm_in_netthread());
2103 	/*
2104 	 * The size provided by libuv is only a suggested size, and it always
2105 	 * defaults to 64 * 1024 in the current versions of libuv (see
2106 	 * src/unix/udp.c and src/unix/stream.c).
2107 	 */
2108 	UNUSED(size);
2109 
2110 	worker = &sock->mgr->workers[sock->tid];
2111 	INSIST(!worker->recvbuf_inuse);
2112 	INSIST(worker->recvbuf != NULL);
2113 
2114 	switch (sock->type) {
2115 	case isc_nm_udpsocket:
2116 		buf->len = ISC_NETMGR_UDP_RECVBUF_SIZE;
2117 		break;
2118 	case isc_nm_tcpsocket:
2119 	case isc_nm_tcpdnssocket:
2120 		buf->len = ISC_NETMGR_TCP_RECVBUF_SIZE;
2121 		break;
2122 	default:
2123 		UNREACHABLE();
2124 	}
2125 
2126 	REQUIRE(buf->len <= ISC_NETMGR_RECVBUF_SIZE);
2127 	buf->base = worker->recvbuf;
2128 
2129 	worker->recvbuf_inuse = true;
2130 }
2131 
2132 isc_result_t
2133 isc__nm_start_reading(isc_nmsocket_t *sock) {
2134 	isc_result_t result = ISC_R_SUCCESS;
2135 	int r;
2136 
2137 	if (sock->reading) {
2138 		return (ISC_R_SUCCESS);
2139 	}
2140 
2141 	switch (sock->type) {
2142 	case isc_nm_udpsocket:
2143 		r = uv_udp_recv_start(&sock->uv_handle.udp, isc__nm_alloc_cb,
2144 				      isc__nm_udp_read_cb);
2145 		break;
2146 	case isc_nm_tcpsocket:
2147 		r = uv_read_start(&sock->uv_handle.stream, isc__nm_alloc_cb,
2148 				  isc__nm_tcp_read_cb);
2149 		break;
2150 	case isc_nm_tcpdnssocket:
2151 		r = uv_read_start(&sock->uv_handle.stream, isc__nm_alloc_cb,
2152 				  isc__nm_tcpdns_read_cb);
2153 		break;
2154 	default:
2155 		UNREACHABLE();
2156 	}
2157 
2158 	if (r != 0) {
2159 		result = isc__nm_uverr2result(r);
2160 	} else {
2161 		sock->reading = true;
2162 	}
2163 
2164 	return (result);
2165 }
2166 
2167 void
2168 isc__nm_stop_reading(isc_nmsocket_t *sock) {
2169 	int r;
2170 
2171 	if (!sock->reading) {
2172 		return;
2173 	}
2174 
2175 	switch (sock->type) {
2176 	case isc_nm_udpsocket:
2177 		r = uv_udp_recv_stop(&sock->uv_handle.udp);
2178 		UV_RUNTIME_CHECK(uv_udp_recv_stop, r);
2179 		break;
2180 	case isc_nm_tcpsocket:
2181 	case isc_nm_tcpdnssocket:
2182 		r = uv_read_stop(&sock->uv_handle.stream);
2183 		UV_RUNTIME_CHECK(uv_read_stop, r);
2184 		break;
2185 	default:
2186 		UNREACHABLE();
2187 	}
2188 	sock->reading = false;
2189 }
2190 
2191 bool
2192 isc__nm_closing(isc_nmsocket_t *sock) {
2193 	return (atomic_load(&sock->mgr->closing));
2194 }
2195 
2196 bool
2197 isc__nmsocket_closing(isc_nmsocket_t *sock) {
2198 	return (!isc__nmsocket_active(sock) || atomic_load(&sock->closing) ||
2199 		atomic_load(&sock->mgr->closing) ||
2200 		(sock->server != NULL && !isc__nmsocket_active(sock->server)));
2201 }
2202 
2203 static isc_result_t
2204 processbuffer(isc_nmsocket_t *sock) {
2205 	switch (sock->type) {
2206 	case isc_nm_tcpdnssocket:
2207 		return (isc__nm_tcpdns_processbuffer(sock));
2208 	default:
2209 		UNREACHABLE();
2210 	}
2211 }
2212 
2213 /*
2214  * Process a DNS message.
2215  *
2216  * If we only have an incomplete DNS message, we don't touch any
2217  * timers. If we do have a full message, reset the timer.
2218  *
2219  * Stop reading if this is a client socket, if the server socket
2220  * has been set to sequential mode, or if the number of queries we are
2221  * processing simultaneously has reached the clients-per-connection
2222  * limit. In this case we'll be called again by resume_processing()
2223  * later.
2224  */
2225 isc_result_t
2226 isc__nm_process_sock_buffer(isc_nmsocket_t *sock) {
2227 	for (;;) {
2228 		int_fast32_t ah = atomic_load(&sock->ah);
2229 		isc_result_t result = processbuffer(sock);
2230 		switch (result) {
2231 		case ISC_R_NOMORE:
2232 			/*
2233 			 * Don't reset the timer until we have a
2234 			 * full DNS message.
2235 			 */
2236 			result = isc__nm_start_reading(sock);
2237 			if (result != ISC_R_SUCCESS) {
2238 				return (result);
2239 			}
2240 			/*
2241 			 * Start the timer only if there are no externally used
2242 			 * active handles; there's always one active handle
2243 			 * attached internally to sock->recv_handle in
2244 			 * accept_connection().
2245 			 */
2246 			if (ah == 1) {
2247 				isc__nmsocket_timer_start(sock);
2248 			}
2249 			goto done;
2250 		case ISC_R_CANCELED:
2251 			isc__nmsocket_timer_stop(sock);
2252 			isc__nm_stop_reading(sock);
2253 			goto done;
2254 		case ISC_R_SUCCESS:
2255 			/*
2256 			 * Stop the timer on a successful message read; this
2257 			 * also allows the timer to be restarted when we have
2258 			 * no more data.
2259 			 */
2260 			isc__nmsocket_timer_stop(sock);
2261 
2262 			if (atomic_load(&sock->client) ||
2263 			    atomic_load(&sock->sequential) ||
2264 			    ah >= STREAM_CLIENTS_PER_CONN)
2265 			{
2266 				isc__nm_stop_reading(sock);
2267 				goto done;
2268 			}
2269 			break;
2270 		default:
2271 			UNREACHABLE();
2272 		}
2273 	}
2274 done:
2275 	return (ISC_R_SUCCESS);
2276 }
2277 
2278 void
2279 isc__nm_resume_processing(void *arg) {
2280 	isc_nmsocket_t *sock = (isc_nmsocket_t *)arg;
2281 
2282 	REQUIRE(VALID_NMSOCK(sock));
2283 	REQUIRE(sock->tid == isc_nm_tid());
2284 	REQUIRE(!atomic_load(&sock->client));
2285 
2286 	if (isc__nmsocket_closing(sock)) {
2287 		return;
2288 	}
2289 
2290 	isc__nm_process_sock_buffer(sock);
2291 }
2292 
2293 void
2294 isc_nmhandle_cleartimeout(isc_nmhandle_t *handle) {
2295 	REQUIRE(VALID_NMHANDLE(handle));
2296 	REQUIRE(VALID_NMSOCK(handle->sock));
2297 
2298 	switch (handle->sock->type) {
2299 	default:
2300 		handle->sock->read_timeout = 0;
2301 
2302 		if (uv_is_active((uv_handle_t *)&handle->sock->read_timer)) {
2303 			isc__nmsocket_timer_stop(handle->sock);
2304 		}
2305 	}
2306 }
2307 
2308 void
2309 isc_nmhandle_settimeout(isc_nmhandle_t *handle, uint32_t timeout) {
2310 	REQUIRE(VALID_NMHANDLE(handle));
2311 	REQUIRE(VALID_NMSOCK(handle->sock));
2312 
2313 	switch (handle->sock->type) {
2314 	default:
2315 		handle->sock->read_timeout = timeout;
2316 		isc__nmsocket_timer_restart(handle->sock);
2317 	}
2318 }
2319 
2320 void
2321 isc_nmhandle_keepalive(isc_nmhandle_t *handle, bool value) {
2322 	isc_nmsocket_t *sock = NULL;
2323 
2324 	REQUIRE(VALID_NMHANDLE(handle));
2325 	REQUIRE(VALID_NMSOCK(handle->sock));
2326 
2327 	sock = handle->sock;
2328 
2329 	switch (sock->type) {
2330 	case isc_nm_tcpsocket:
2331 	case isc_nm_tcpdnssocket:
2332 		atomic_store(&sock->keepalive, value);
2333 		sock->read_timeout = value ? atomic_load(&sock->mgr->keepalive)
2334 					   : atomic_load(&sock->mgr->idle);
2335 		sock->write_timeout = value ? atomic_load(&sock->mgr->keepalive)
2336 					    : atomic_load(&sock->mgr->idle);
2337 		break;
2338 	default:
2339 		/*
2340 		 * For any other protocol, this is a no-op.
2341 		 */
2342 		return;
2343 	}
2344 }
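/*
 * Illustrative sketch (the trigger condition is hypothetical): a server
 * that honors a client's keepalive request might call
 *
 *	if (client_requested_keepalive) {
 *		isc_nmhandle_keepalive(handle, true);
 *	}
 *
 * which switches a TCP/TCPDNS socket from the manager's "idle"
 * read/write timeouts to the longer "keepalive" timeouts, as set above.
 */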
2345 
2346 void *
2347 isc_nmhandle_getextra(isc_nmhandle_t *handle) {
2348 	REQUIRE(VALID_NMHANDLE(handle));
2349 
2350 	return (handle->extra);
2351 }
2352 
2353 isc_sockaddr_t
2354 isc_nmhandle_peeraddr(isc_nmhandle_t *handle) {
2355 	REQUIRE(VALID_NMHANDLE(handle));
2356 
2357 	return (handle->peer);
2358 }
2359 
2360 isc_sockaddr_t
2361 isc_nmhandle_localaddr(isc_nmhandle_t *handle) {
2362 	REQUIRE(VALID_NMHANDLE(handle));
2363 
2364 	return (handle->local);
2365 }
2366 
2367 isc_nm_t *
2368 isc_nmhandle_netmgr(isc_nmhandle_t *handle) {
2369 	REQUIRE(VALID_NMHANDLE(handle));
2370 	REQUIRE(VALID_NMSOCK(handle->sock));
2371 
2372 	return (handle->sock->mgr);
2373 }
2374 
2375 isc__nm_uvreq_t *
2376 isc___nm_uvreq_get(isc_nm_t *mgr, isc_nmsocket_t *sock FLARG) {
2377 	isc__nm_uvreq_t *req = NULL;
2378 
2379 	REQUIRE(VALID_NM(mgr));
2380 	REQUIRE(VALID_NMSOCK(sock));
2381 
2382 	if (sock != NULL && isc__nmsocket_active(sock)) {
2383 		/* Try to reuse one */
2384 		req = isc_astack_pop(sock->inactivereqs);
2385 	}
2386 
2387 	if (req == NULL) {
2388 		req = isc_mem_get(mgr->mctx, sizeof(*req));
2389 	}
2390 
2391 	*req = (isc__nm_uvreq_t){ .magic = 0 };
2392 	ISC_LINK_INIT(req, link);
2393 	req->uv_req.req.data = req;
2394 	isc___nmsocket_attach(sock, &req->sock FLARG_PASS);
2395 	req->magic = UVREQ_MAGIC;
2396 
2397 	return (req);
2398 }
2399 
2400 void
2401 isc___nm_uvreq_put(isc__nm_uvreq_t **req0, isc_nmsocket_t *sock FLARG) {
2402 	isc__nm_uvreq_t *req = NULL;
2403 	isc_nmhandle_t *handle = NULL;
2404 
2405 	REQUIRE(req0 != NULL);
2406 	REQUIRE(VALID_UVREQ(*req0));
2407 
2408 	req = *req0;
2409 	*req0 = NULL;
2410 
2411 	INSIST(sock == req->sock);
2412 
2413 	req->magic = 0;
2414 
2415 	/*
2416 	 * We need to save this first to make sure that handle,
2417 	 * sock, and the netmgr won't all disappear.
2418 	 */
2419 	handle = req->handle;
2420 	req->handle = NULL;
2421 
2422 #if !__SANITIZE_ADDRESS__ && !__SANITIZE_THREAD__
2423 	if (!isc__nmsocket_active(sock) ||
2424 	    !isc_astack_trypush(sock->inactivereqs, req))
2425 	{
2426 		isc_mem_put(sock->mgr->mctx, req, sizeof(*req));
2427 	}
2428 #else  /* !__SANITIZE_ADDRESS__ && !__SANITIZE_THREAD__ */
2429 	isc_mem_put(sock->mgr->mctx, req, sizeof(*req));
2430 #endif /* !__SANITIZE_ADDRESS__ && !__SANITIZE_THREAD__ */
2431 
2432 	if (handle != NULL) {
2433 		isc__nmhandle_detach(&handle FLARG_PASS);
2434 	}
2435 
2436 	isc___nmsocket_detach(&sock FLARG_PASS);
2437 }
2438 
2439 void
2440 isc_nm_send(isc_nmhandle_t *handle, isc_region_t *region, isc_nm_cb_t cb,
2441 	    void *cbarg) {
2442 	REQUIRE(VALID_NMHANDLE(handle));
2443 
2444 	switch (handle->sock->type) {
2445 	case isc_nm_udpsocket:
2446 	case isc_nm_udplistener:
2447 		isc__nm_udp_send(handle, region, cb, cbarg);
2448 		break;
2449 	case isc_nm_tcpsocket:
2450 		isc__nm_tcp_send(handle, region, cb, cbarg);
2451 		break;
2452 	case isc_nm_tcpdnssocket:
2453 		isc__nm_tcpdns_send(handle, region, cb, cbarg);
2454 		break;
2455 	default:
2456 		UNREACHABLE();
2457 	}
2458 }
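/*
 * A minimal, hedged usage sketch for isc_nm_send() (the callback and
 * argument names are hypothetical). The callback signature matches the
 * invocation in isc__nm_async_sendcb() below:
 *
 *	static void
 *	send_done(isc_nmhandle_t *handle, isc_result_t result, void *cbarg) {
 *		check_result_and_release(result, cbarg);
 *	}
 *
 *	isc_region_t region = { .base = buf, .length = buflen };
 *	isc_nm_send(handle, &region, send_done, cbarg);
 *
 * The region is dispatched to the protocol-specific send routine and
 * send_done() runs once the write completes or fails.
 */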
2459 
2460 void
2461 isc_nm_read(isc_nmhandle_t *handle, isc_nm_recv_cb_t cb, void *cbarg) {
2462 	REQUIRE(VALID_NMHANDLE(handle));
2463 
2464 	/*
2465 	 * This is always called via callback (from accept or connect), and
2466 	 * the caller must attach to the handle, so the reference count always
2467 	 * needs to be at least 2.
2468 	 */
2469 	REQUIRE(isc_refcount_current(&handle->references) >= 2);
2470 
2471 	switch (handle->sock->type) {
2472 	case isc_nm_udpsocket:
2473 		isc__nm_udp_read(handle, cb, cbarg);
2474 		break;
2475 	case isc_nm_tcpsocket:
2476 		isc__nm_tcp_read(handle, cb, cbarg);
2477 		break;
2478 	case isc_nm_tcpdnssocket:
2479 		isc__nm_tcpdns_read(handle, cb, cbarg);
2480 		break;
2481 	default:
2482 		UNREACHABLE();
2483 	}
2484 }
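/*
 * Corresponding read sketch (callback name hypothetical); the receive
 * callback gets the handle, a result code, the received region, and
 * cbarg, matching the invocation in isc__nm_async_readcb() below:
 *
 *	static void
 *	read_done(isc_nmhandle_t *handle, isc_result_t result,
 *		  isc_region_t *region, void *cbarg) {
 *		if (result == ISC_R_SUCCESS) {
 *			consume(region->base, region->length, cbarg);
 *		}
 *	}
 *
 *	isc_nm_read(handle, read_done, cbarg);
 */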
2485 
2486 void
2487 isc_nm_cancelread(isc_nmhandle_t *handle) {
2488 	REQUIRE(VALID_NMHANDLE(handle));
2489 
2490 	switch (handle->sock->type) {
2491 	case isc_nm_udpsocket:
2492 		isc__nm_udp_cancelread(handle);
2493 		break;
2494 	case isc_nm_tcpsocket:
2495 		isc__nm_tcp_cancelread(handle);
2496 		break;
2497 	case isc_nm_tcpdnssocket:
2498 		isc__nm_tcpdns_cancelread(handle);
2499 		break;
2500 	default:
2501 		UNREACHABLE();
2502 	}
2503 }
2504 
2505 void
2506 isc_nm_pauseread(isc_nmhandle_t *handle) {
2507 	REQUIRE(VALID_NMHANDLE(handle));
2508 
2509 	isc_nmsocket_t *sock = handle->sock;
2510 
2511 	switch (sock->type) {
2512 	case isc_nm_tcpsocket:
2513 		isc__nm_tcp_pauseread(handle);
2514 		break;
2515 	default:
2516 		UNREACHABLE();
2517 	}
2518 }
2519 
2520 void
2521 isc_nm_resumeread(isc_nmhandle_t *handle) {
2522 	REQUIRE(VALID_NMHANDLE(handle));
2523 
2524 	isc_nmsocket_t *sock = handle->sock;
2525 
2526 	switch (sock->type) {
2527 	case isc_nm_tcpsocket:
2528 		isc__nm_tcp_resumeread(handle);
2529 		break;
2530 	default:
2531 		UNREACHABLE();
2532 	}
2533 }
2534 
2535 void
2536 isc_nm_stoplistening(isc_nmsocket_t *sock) {
2537 	REQUIRE(VALID_NMSOCK(sock));
2538 
2539 	switch (sock->type) {
2540 	case isc_nm_udplistener:
2541 		isc__nm_udp_stoplistening(sock);
2542 		break;
2543 	case isc_nm_tcpdnslistener:
2544 		isc__nm_tcpdns_stoplistening(sock);
2545 		break;
2546 	case isc_nm_tcplistener:
2547 		isc__nm_tcp_stoplistening(sock);
2548 		break;
2549 	default:
2550 		UNREACHABLE();
2551 	}
2552 }
2553 
2554 void
2555 isc__nm_connectcb(isc_nmsocket_t *sock, isc__nm_uvreq_t *uvreq,
2556 		  isc_result_t eresult, bool async) {
2557 	REQUIRE(VALID_NMSOCK(sock));
2558 	REQUIRE(VALID_UVREQ(uvreq));
2559 	REQUIRE(VALID_NMHANDLE(uvreq->handle));
2560 
2561 	if (!async) {
2562 		isc__netievent_connectcb_t ievent = { .sock = sock,
2563 						      .req = uvreq,
2564 						      .result = eresult };
2565 		isc__nm_async_connectcb(NULL, (isc__netievent_t *)&ievent);
2566 	} else {
2567 		isc__netievent_connectcb_t *ievent =
2568 			isc__nm_get_netievent_connectcb(sock->mgr, sock, uvreq,
2569 							eresult);
2570 		isc__nm_enqueue_ievent(&sock->mgr->workers[sock->tid],
2571 				       (isc__netievent_t *)ievent);
2572 	}
2573 }
2574 
2575 void
2576 isc__nm_async_connectcb(isc__networker_t *worker, isc__netievent_t *ev0) {
2577 	isc__netievent_connectcb_t *ievent = (isc__netievent_connectcb_t *)ev0;
2578 	isc_nmsocket_t *sock = ievent->sock;
2579 	isc__nm_uvreq_t *uvreq = ievent->req;
2580 	isc_result_t eresult = ievent->result;
2581 
2582 	UNUSED(worker);
2583 
2584 	REQUIRE(VALID_NMSOCK(sock));
2585 	REQUIRE(VALID_UVREQ(uvreq));
2586 	REQUIRE(VALID_NMHANDLE(uvreq->handle));
2587 	REQUIRE(ievent->sock->tid == isc_nm_tid());
2588 	REQUIRE(uvreq->cb.connect != NULL);
2589 
2590 	uvreq->cb.connect(uvreq->handle, eresult, uvreq->cbarg);
2591 
2592 	isc__nm_uvreq_put(&uvreq, sock);
2593 }
2594 
2595 void
2596 isc__nm_readcb(isc_nmsocket_t *sock, isc__nm_uvreq_t *uvreq,
2597 	       isc_result_t eresult) {
2598 	REQUIRE(VALID_NMSOCK(sock));
2599 	REQUIRE(VALID_UVREQ(uvreq));
2600 	REQUIRE(VALID_NMHANDLE(uvreq->handle));
2601 
2602 	if (eresult == ISC_R_SUCCESS || eresult == ISC_R_TIMEDOUT) {
2603 		isc__netievent_readcb_t ievent = { .sock = sock,
2604 						   .req = uvreq,
2605 						   .result = eresult };
2606 
2607 		isc__nm_async_readcb(NULL, (isc__netievent_t *)&ievent);
2608 	} else {
2609 		isc__netievent_readcb_t *ievent = isc__nm_get_netievent_readcb(
2610 			sock->mgr, sock, uvreq, eresult);
2611 		isc__nm_enqueue_ievent(&sock->mgr->workers[sock->tid],
2612 				       (isc__netievent_t *)ievent);
2613 	}
2614 }
2615 
2616 void
2617 isc__nm_async_readcb(isc__networker_t *worker, isc__netievent_t *ev0) {
2618 	isc__netievent_readcb_t *ievent = (isc__netievent_readcb_t *)ev0;
2619 	isc_nmsocket_t *sock = ievent->sock;
2620 	isc__nm_uvreq_t *uvreq = ievent->req;
2621 	isc_result_t eresult = ievent->result;
2622 	isc_region_t region;
2623 
2624 	UNUSED(worker);
2625 
2626 	REQUIRE(VALID_NMSOCK(sock));
2627 	REQUIRE(VALID_UVREQ(uvreq));
2628 	REQUIRE(VALID_NMHANDLE(uvreq->handle));
2629 	REQUIRE(sock->tid == isc_nm_tid());
2630 
2631 	region.base = (unsigned char *)uvreq->uvbuf.base;
2632 	region.length = uvreq->uvbuf.len;
2633 
2634 	uvreq->cb.recv(uvreq->handle, eresult, &region, uvreq->cbarg);
2635 
2636 	isc__nm_uvreq_put(&uvreq, sock);
2637 }
2638 
2639 void
2640 isc__nm_sendcb(isc_nmsocket_t *sock, isc__nm_uvreq_t *uvreq,
2641 	       isc_result_t eresult, bool async) {
2642 	REQUIRE(VALID_NMSOCK(sock));
2643 	REQUIRE(VALID_UVREQ(uvreq));
2644 	REQUIRE(VALID_NMHANDLE(uvreq->handle));
2645 
2646 	if (!async) {
2647 		isc__netievent_sendcb_t ievent = { .sock = sock,
2648 						   .req = uvreq,
2649 						   .result = eresult };
2650 		isc__nm_async_sendcb(NULL, (isc__netievent_t *)&ievent);
2651 		return;
2652 	}
2653 
2654 	isc__netievent_sendcb_t *ievent =
2655 		isc__nm_get_netievent_sendcb(sock->mgr, sock, uvreq, eresult);
2656 	isc__nm_enqueue_ievent(&sock->mgr->workers[sock->tid],
2657 			       (isc__netievent_t *)ievent);
2658 }
2659 
2660 void
2661 isc__nm_async_sendcb(isc__networker_t *worker, isc__netievent_t *ev0) {
2662 	isc__netievent_sendcb_t *ievent = (isc__netievent_sendcb_t *)ev0;
2663 	isc_nmsocket_t *sock = ievent->sock;
2664 	isc__nm_uvreq_t *uvreq = ievent->req;
2665 	isc_result_t eresult = ievent->result;
2666 
2667 	UNUSED(worker);
2668 
2669 	REQUIRE(VALID_NMSOCK(sock));
2670 	REQUIRE(VALID_UVREQ(uvreq));
2671 	REQUIRE(VALID_NMHANDLE(uvreq->handle));
2672 	REQUIRE(sock->tid == isc_nm_tid());
2673 
2674 	uvreq->cb.send(uvreq->handle, eresult, uvreq->cbarg);
2675 
2676 	isc__nm_uvreq_put(&uvreq, sock);
2677 }
2678 
2679 static void
2680 isc__nm_async_close(isc__networker_t *worker, isc__netievent_t *ev0) {
2681 	isc__netievent_close_t *ievent = (isc__netievent_close_t *)ev0;
2682 	isc_nmsocket_t *sock = ievent->sock;
2683 
2684 	REQUIRE(VALID_NMSOCK(ievent->sock));
2685 	REQUIRE(sock->tid == isc_nm_tid());
2686 	REQUIRE(sock->closehandle_cb != NULL);
2687 
2688 	UNUSED(worker);
2689 
2690 	ievent->sock->closehandle_cb(sock);
2691 }
2692 
2693 void
2694 isc__nm_async_detach(isc__networker_t *worker, isc__netievent_t *ev0) {
2695 	isc__netievent_detach_t *ievent = (isc__netievent_detach_t *)ev0;
2696 	FLARG_IEVENT(ievent);
2697 
2698 	REQUIRE(VALID_NMSOCK(ievent->sock));
2699 	REQUIRE(VALID_NMHANDLE(ievent->handle));
2700 	REQUIRE(ievent->sock->tid == isc_nm_tid());
2701 
2702 	UNUSED(worker);
2703 
2704 	nmhandle_detach_cb(&ievent->handle FLARG_PASS);
2705 }
2706 
2707 static void
2708 reset_shutdown(uv_handle_t *handle) {
2709 	isc_nmsocket_t *sock = uv_handle_get_data(handle);
2710 
2711 	isc__nmsocket_shutdown(sock);
2712 	isc__nmsocket_detach(&sock);
2713 }
2714 
2715 void
2716 isc__nmsocket_reset(isc_nmsocket_t *sock) {
2717 	REQUIRE(VALID_NMSOCK(sock));
2718 
2719 	switch (sock->type) {
2720 	case isc_nm_tcpsocket:
2721 	case isc_nm_tcpdnssocket:
2722 		/*
2723 		 * This can be called from the TCP write timeout.
2724 		 */
2725 		REQUIRE(sock->parent == NULL);
2726 		break;
2727 	default:
2728 		UNREACHABLE();
2729 		break;
2730 	}
2731 
2732 	if (!uv_is_closing(&sock->uv_handle.handle) &&
2733 	    uv_is_active(&sock->uv_handle.handle))
2734 	{
2735 		/*
2736 		 * The real shutdown will be handled in the respective
2737 		 * close functions.
2738 		 */
2739 		isc__nmsocket_attach(sock, &(isc_nmsocket_t *){ NULL });
2740 		int r = uv_tcp_close_reset(&sock->uv_handle.tcp,
2741 					   reset_shutdown);
2742 		UV_RUNTIME_CHECK(uv_tcp_close_reset, r);
2743 	} else {
2744 		isc__nmsocket_shutdown(sock);
2745 	}
2746 }
2747 
2748 void
2749 isc__nmsocket_shutdown(isc_nmsocket_t *sock) {
2750 	REQUIRE(VALID_NMSOCK(sock));
2751 	switch (sock->type) {
2752 	case isc_nm_udpsocket:
2753 		isc__nm_udp_shutdown(sock);
2754 		break;
2755 	case isc_nm_tcpsocket:
2756 		isc__nm_tcp_shutdown(sock);
2757 		break;
2758 	case isc_nm_tcpdnssocket:
2759 		isc__nm_tcpdns_shutdown(sock);
2760 		break;
2761 	case isc_nm_udplistener:
2762 	case isc_nm_tcplistener:
2763 	case isc_nm_tcpdnslistener:
2764 		return;
2765 	default:
2766 		UNREACHABLE();
2767 	}
2768 }
2769 
2770 static void
2771 shutdown_walk_cb(uv_handle_t *handle, void *arg) {
2772 	isc_nmsocket_t *sock = uv_handle_get_data(handle);
2773 	UNUSED(arg);
2774 
2775 	if (uv_is_closing(handle)) {
2776 		return;
2777 	}
2778 
2779 	switch (handle->type) {
2780 	case UV_UDP:
2781 		isc__nmsocket_shutdown(sock);
2782 		return;
2783 	case UV_TCP:
2784 		switch (sock->type) {
2785 		case isc_nm_tcpsocket:
2786 		case isc_nm_tcpdnssocket:
2787 			if (sock->parent == NULL) {
2788 				/* Reset the TCP connections on shutdown */
2789 				isc__nmsocket_reset(sock);
2790 				return;
2791 			}
2792 			FALLTHROUGH;
2793 		default:
2794 			isc__nmsocket_shutdown(sock);
2795 		}
2796 
2797 		return;
2798 	default:
2799 		return;
2800 	}
2801 }
2802 
2803 void
2804 isc__nm_async_shutdown(isc__networker_t *worker, isc__netievent_t *ev0) {
2805 	UNUSED(ev0);
2806 
2807 	uv_walk(&worker->loop, shutdown_walk_cb, NULL);
2808 }
2809 
2810 bool
2811 isc__nm_acquire_interlocked(isc_nm_t *mgr) {
2812 	if (!isc__nm_in_netthread()) {
2813 		return (false);
2814 	}
2815 
2816 	LOCK(&mgr->lock);
2817 	bool success = atomic_compare_exchange_strong(
2818 		&mgr->interlocked, &(int){ ISC_NETMGR_NON_INTERLOCKED },
2819 		isc_nm_tid());
2820 
2821 	UNLOCK(&mgr->lock);
2822 	return (success);
2823 }
2824 
2825 void
2826 isc__nm_drop_interlocked(isc_nm_t *mgr) {
2827 	if (!isc__nm_in_netthread()) {
2828 		return;
2829 	}
2830 
2831 	LOCK(&mgr->lock);
2832 	int tid = atomic_exchange(&mgr->interlocked,
2833 				  ISC_NETMGR_NON_INTERLOCKED);
2834 	INSIST(tid != ISC_NETMGR_NON_INTERLOCKED);
2835 	BROADCAST(&mgr->wkstatecond);
2836 	UNLOCK(&mgr->lock);
2837 }
2838 
2839 void
2840 isc__nm_acquire_interlocked_force(isc_nm_t *mgr) {
2841 	if (!isc__nm_in_netthread()) {
2842 		return;
2843 	}
2844 
2845 	LOCK(&mgr->lock);
2846 	while (!atomic_compare_exchange_strong(
2847 		&mgr->interlocked, &(int){ ISC_NETMGR_NON_INTERLOCKED },
2848 		isc_nm_tid()))
2849 	{
2850 		WAIT(&mgr->wkstatecond, &mgr->lock);
2851 	}
2852 	UNLOCK(&mgr->lock);
2853 }
2854 
2855 void
2856 isc_nm_setstats(isc_nm_t *mgr, isc_stats_t *stats) {
2857 	REQUIRE(VALID_NM(mgr));
2858 	REQUIRE(mgr->stats == NULL);
2859 	REQUIRE(isc_stats_ncounters(stats) == isc_sockstatscounter_max);
2860 
2861 	isc_stats_attach(stats, &mgr->stats);
2862 }
2863 
2864 void
2865 isc__nm_incstats(isc_nm_t *mgr, isc_statscounter_t counterid) {
2866 	REQUIRE(VALID_NM(mgr));
2867 	REQUIRE(counterid != -1);
2868 
2869 	if (mgr->stats != NULL) {
2870 		isc_stats_increment(mgr->stats, counterid);
2871 	}
2872 }
2873 
2874 void
2875 isc__nm_decstats(isc_nm_t *mgr, isc_statscounter_t counterid) {
2876 	REQUIRE(VALID_NM(mgr));
2877 	REQUIRE(counterid != -1);
2878 
2879 	if (mgr->stats != NULL) {
2880 		isc_stats_decrement(mgr->stats, counterid);
2881 	}
2882 }
2883 
2884 isc_result_t
2885 isc__nm_socket(int domain, int type, int protocol, uv_os_sock_t *sockp) {
2886 #ifdef WIN32
2887 	SOCKET sock;
2888 	sock = socket(domain, type, protocol);
2889 	if (sock == INVALID_SOCKET) {
2890 		char strbuf[ISC_STRERRORSIZE];
2891 		DWORD socket_errno = WSAGetLastError();
2892 		switch (socket_errno) {
2893 		case WSAEMFILE:
2894 		case WSAENOBUFS:
2895 			return (ISC_R_NORESOURCES);
2896 
2897 		case WSAEPROTONOSUPPORT:
2898 		case WSAEPFNOSUPPORT:
2899 		case WSAEAFNOSUPPORT:
2900 			return (ISC_R_FAMILYNOSUPPORT);
2901 		default:
2902 			strerror_r(socket_errno, strbuf, sizeof(strbuf));
2903 			UNEXPECTED_ERROR(
2904 				__FILE__, __LINE__,
2905 				"socket() failed with error code %lu: %s",
2906 				socket_errno, strbuf);
2907 			return (ISC_R_UNEXPECTED);
2908 		}
2909 	}
2910 #else
2911 	int sock = socket(domain, type, protocol);
2912 	if (sock < 0) {
2913 		return (isc_errno_toresult(errno));
2914 	}
2915 #endif
2916 	*sockp = (uv_os_sock_t)sock;
2917 	return (ISC_R_SUCCESS);
2918 }
2919 
2920 void
2921 isc__nm_closesocket(uv_os_sock_t sock) {
2922 #ifdef WIN32
2923 	closesocket(sock);
2924 #else
2925 	close(sock);
2926 #endif
2927 }
2928 
2929 #define setsockopt_on(socket, level, name) \
2930 	setsockopt(socket, level, name, &(int){ 1 }, sizeof(int))
2931 
2932 #define setsockopt_off(socket, level, name) \
2933 	setsockopt(socket, level, name, &(int){ 0 }, sizeof(int))
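/*
 * For example, setsockopt_on(fd, IPPROTO_TCP, TCP_NODELAY) expands to
 * setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &(int){ 1 }, sizeof(int)),
 * i.e. it enables the named boolean option; setsockopt_off() passes 0
 * to disable it.
 */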
2934 
2935 isc_result_t
2936 isc__nm_socket_freebind(uv_os_sock_t fd, sa_family_t sa_family) {
2937 	/*
2938 	 * Set IP_FREEBIND (or the equivalent option) on the socket.
2939 	 */
2940 #ifdef IP_FREEBIND
2941 	UNUSED(sa_family);
2942 	if (setsockopt_on(fd, IPPROTO_IP, IP_FREEBIND) == -1) {
2943 		return (ISC_R_FAILURE);
2944 	}
2945 	return (ISC_R_SUCCESS);
2946 #elif defined(IP_BINDANY) || defined(IPV6_BINDANY)
2947 	if (sa_family == AF_INET) {
2948 #if defined(IP_BINDANY)
2949 		if (setsockopt_on(fd, IPPROTO_IP, IP_BINDANY) == -1) {
2950 			return (ISC_R_FAILURE);
2951 		}
2952 		return (ISC_R_SUCCESS);
2953 #endif
2954 	} else if (sa_family == AF_INET6) {
2955 #if defined(IPV6_BINDANY)
2956 		if (setsockopt_on(fd, IPPROTO_IPV6, IPV6_BINDANY) == -1) {
2957 			return (ISC_R_FAILURE);
2958 		}
2959 		return (ISC_R_SUCCESS);
2960 #endif
2961 	}
2962 	return (ISC_R_NOTIMPLEMENTED);
2963 #elif defined(SO_BINDANY)
2964 	UNUSED(sa_family);
2965 	if (setsockopt_on(fd, SOL_SOCKET, SO_BINDANY) == -1) {
2966 		return (ISC_R_FAILURE);
2967 	}
2968 	return (ISC_R_SUCCESS);
2969 #else
2970 	UNUSED(fd);
2971 	UNUSED(sa_family);
2972 	return (ISC_R_NOTIMPLEMENTED);
2973 #endif
2974 }
2975 
2976 isc_result_t
2977 isc__nm_socket_reuse(uv_os_sock_t fd) {
2978 	/*
2979 	 * Generally, the SO_REUSEADDR socket option allows reuse of
2980 	 * local addresses.
2981 	 *
2982 	 * On the BSDs, SO_REUSEPORT implies SO_REUSEADDR but with some
2983 	 * additional refinements for programs that use multicast.
2984 	 *
2985 	 * On Linux, SO_REUSEPORT has different semantics: it _shares_ the port
2986 	 * rather than stealing it from the current listener, so we don't use it
2987 	 * here, but rather in isc__nm_socket_reuse_lb().
2988 	 *
2989 	 * On Windows, SO_REUSEADDR also allows a socket to forcibly bind to a port in use
2990 	 * by another socket.
2991 	 */
2992 
2993 #if defined(SO_REUSEPORT) && !defined(__linux__)
2994 	if (setsockopt_on(fd, SOL_SOCKET, SO_REUSEPORT) == -1) {
2995 		return (ISC_R_FAILURE);
2996 	}
2997 	return (ISC_R_SUCCESS);
2998 #elif defined(SO_REUSEADDR)
2999 	if (setsockopt_on(fd, SOL_SOCKET, SO_REUSEADDR) == -1) {
3000 		return (ISC_R_FAILURE);
3001 	}
3002 	return (ISC_R_SUCCESS);
3003 #else
3004 	UNUSED(fd);
3005 	return (ISC_R_NOTIMPLEMENTED);
3006 #endif
3007 }
3008 
3009 isc_result_t
3010 isc__nm_socket_reuse_lb(uv_os_sock_t fd) {
3011 	/*
3012 	 * On FreeBSD 12+, the SO_REUSEPORT_LB socket option allows sockets to be
3013 	 * bound to an identical socket address. For UDP sockets, the use of
3014 	 * this option can provide better distribution of incoming datagrams to
3015 	 * multiple processes (or threads) as compared to the traditional
3016 	 * technique of having multiple processes compete to receive datagrams
3017 	 * on the same socket.
3018 	 *
3019 	 * On Linux, the same thing is achieved simply with SO_REUSEPORT.
3020 	 */
3021 #if defined(SO_REUSEPORT_LB)
3022 	if (setsockopt_on(fd, SOL_SOCKET, SO_REUSEPORT_LB) == -1) {
3023 		return (ISC_R_FAILURE);
3024 	} else {
3025 		return (ISC_R_SUCCESS);
3026 	}
3027 #elif defined(SO_REUSEPORT) && defined(__linux__)
3028 	if (setsockopt_on(fd, SOL_SOCKET, SO_REUSEPORT) == -1) {
3029 		return (ISC_R_FAILURE);
3030 	} else {
3031 		return (ISC_R_SUCCESS);
3032 	}
3033 #else
3034 	UNUSED(fd);
3035 	return (ISC_R_NOTIMPLEMENTED);
3036 #endif
3037 }
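/*
 * An illustrative (not authoritative) listener setup would call both
 * helpers on a freshly created socket before binding, ignoring
 * ISC_R_NOTIMPLEMENTED on platforms that lack the options:
 *
 *	uv_os_sock_t fd;
 *	if (isc__nm_socket(AF_INET, SOCK_DGRAM, 0, &fd) == ISC_R_SUCCESS) {
 *		(void)isc__nm_socket_reuse(fd);
 *		(void)isc__nm_socket_reuse_lb(fd);
 *		bind_and_hand_off_to_libuv(fd);
 *	}
 */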
3038 
3039 isc_result_t
3040 isc__nm_socket_incoming_cpu(uv_os_sock_t fd) {
3041 #ifdef SO_INCOMING_CPU
3042 	if (setsockopt_on(fd, SOL_SOCKET, SO_INCOMING_CPU) == -1) {
3043 		return (ISC_R_FAILURE);
3044 	} else {
3045 		return (ISC_R_SUCCESS);
3046 	}
3047 #else
3048 	UNUSED(fd);
3049 #endif
3050 	return (ISC_R_NOTIMPLEMENTED);
3051 }
3052 
3053 isc_result_t
3054 isc__nm_socket_disable_pmtud(uv_os_sock_t fd, sa_family_t sa_family) {
3055 	/*
3056 	 * Disable Path MTU Discovery on IP packets.
3057 	 */
3058 	if (sa_family == AF_INET6) {
3059 #if defined(IPV6_DONTFRAG)
3060 		if (setsockopt_off(fd, IPPROTO_IPV6, IPV6_DONTFRAG) == -1) {
3061 			return (ISC_R_FAILURE);
3062 		} else {
3063 			return (ISC_R_SUCCESS);
3064 		}
3065 #elif defined(IPV6_MTU_DISCOVER) && defined(IP_PMTUDISC_OMIT)
3066 		if (setsockopt(fd, IPPROTO_IPV6, IPV6_MTU_DISCOVER,
3067 			       &(int){ IP_PMTUDISC_OMIT }, sizeof(int)) == -1)
3068 		{
3069 			return (ISC_R_FAILURE);
3070 		} else {
3071 			return (ISC_R_SUCCESS);
3072 		}
3073 #else
3074 		UNUSED(fd);
3075 #endif
3076 	} else if (sa_family == AF_INET) {
3077 #if defined(IP_DONTFRAG)
3078 		if (setsockopt_off(fd, IPPROTO_IP, IP_DONTFRAG) == -1) {
3079 			return (ISC_R_FAILURE);
3080 		} else {
3081 			return (ISC_R_SUCCESS);
3082 		}
3083 #elif defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_OMIT)
3084 		if (setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER,
3085 			       &(int){ IP_PMTUDISC_OMIT }, sizeof(int)) == -1)
3086 		{
3087 			return (ISC_R_FAILURE);
3088 		} else {
3089 			return (ISC_R_SUCCESS);
3090 		}
3091 #else
3092 		UNUSED(fd);
3093 #endif
3094 	} else {
3095 		return (ISC_R_FAMILYNOSUPPORT);
3096 	}
3097 
3098 	return (ISC_R_NOTIMPLEMENTED);
3099 }
3100 
3101 #if defined(_WIN32)
3102 #define TIMEOUT_TYPE	DWORD
3103 #define TIMEOUT_DIV	1000
3104 #define TIMEOUT_OPTNAME TCP_MAXRT
3105 #elif defined(TCP_CONNECTIONTIMEOUT)
3106 #define TIMEOUT_TYPE	int
3107 #define TIMEOUT_DIV	1000
3108 #define TIMEOUT_OPTNAME TCP_CONNECTIONTIMEOUT
3109 #elif defined(TCP_RXT_CONNDROPTIME)
3110 #define TIMEOUT_TYPE	int
3111 #define TIMEOUT_DIV	1000
3112 #define TIMEOUT_OPTNAME TCP_RXT_CONNDROPTIME
3113 #elif defined(TCP_USER_TIMEOUT)
3114 #define TIMEOUT_TYPE	unsigned int
3115 #define TIMEOUT_DIV	1
3116 #define TIMEOUT_OPTNAME TCP_USER_TIMEOUT
3117 #elif defined(TCP_KEEPINIT)
3118 #define TIMEOUT_TYPE	int
3119 #define TIMEOUT_DIV	1000
3120 #define TIMEOUT_OPTNAME TCP_KEEPINIT
3121 #endif
3122 
3123 isc_result_t
3124 isc__nm_socket_connectiontimeout(uv_os_sock_t fd, int timeout_ms) {
3125 #if defined(TIMEOUT_OPTNAME)
3126 	TIMEOUT_TYPE timeout = timeout_ms / TIMEOUT_DIV;
3127 
3128 	if (timeout == 0) {
3129 		timeout = 1;
3130 	}
3131 
3132 	if (setsockopt(fd, IPPROTO_TCP, TIMEOUT_OPTNAME, &timeout,
3133 		       sizeof(timeout)) == -1)
3134 	{
3135 		return (ISC_R_FAILURE);
3136 	}
3137 
3138 	return (ISC_R_SUCCESS);
3139 #else
3140 	UNUSED(fd);
3141 	UNUSED(timeout_ms);
3142 
3143 	return (ISC_R_SUCCESS);
3144 #endif
3145 }
3146 
3147 isc_result_t
3148 isc__nm_socket_tcp_nodelay(uv_os_sock_t fd) {
3149 #ifdef TCP_NODELAY
3150 	if (setsockopt_on(fd, IPPROTO_TCP, TCP_NODELAY) == -1) {
3151 		return (ISC_R_FAILURE);
3152 	} else {
3153 		return (ISC_R_SUCCESS);
3154 	}
3155 #else
3156 	UNUSED(fd);
3157 	return (ISC_R_SUCCESS);
3158 #endif
3159 }
3160 
3161 static isc_threadresult_t
3162 isc__nm_work_run(isc_threadarg_t arg) {
3163 	isc__nm_work_t *work = (isc__nm_work_t *)arg;
3164 
3165 	work->cb(work->data);
3166 
3167 	return ((isc_threadresult_t)0);
3168 }
3169 
3170 static void
3171 isc__nm_work_cb(uv_work_t *req) {
3172 	isc__nm_work_t *work = uv_req_get_data((uv_req_t *)req);
3173 
3174 	if (isc_tid_v == SIZE_MAX) {
3175 		isc__trampoline_t *trampoline_arg =
3176 			isc__trampoline_get(isc__nm_work_run, work);
3177 		(void)isc__trampoline_run(trampoline_arg);
3178 	} else {
3179 		(void)isc__nm_work_run((isc_threadarg_t)work);
3180 	}
3181 }
3182 
3183 static void
3184 isc__nm_after_work_cb(uv_work_t *req, int status) {
3185 	isc_result_t result = ISC_R_SUCCESS;
3186 	isc__nm_work_t *work = uv_req_get_data((uv_req_t *)req);
3187 	isc_nm_t *netmgr = work->netmgr;
3188 
3189 	if (status != 0) {
3190 		result = isc__nm_uverr2result(status);
3191 	}
3192 
3193 	work->after_cb(work->data, result);
3194 
3195 	isc_mem_put(netmgr->mctx, work, sizeof(*work));
3196 
3197 	isc_nm_detach(&netmgr);
3198 }
3199 
3200 void
3201 isc_nm_work_offload(isc_nm_t *netmgr, isc_nm_workcb_t work_cb,
3202 		    isc_nm_after_workcb_t after_work_cb, void *data) {
3203 	isc__networker_t *worker = NULL;
3204 	isc__nm_work_t *work = NULL;
3205 	int r;
3206 
3207 	REQUIRE(isc__nm_in_netthread());
3208 	REQUIRE(VALID_NM(netmgr));
3209 
3210 	worker = &netmgr->workers[isc_nm_tid()];
3211 
3212 	work = isc_mem_get(netmgr->mctx, sizeof(*work));
3213 	*work = (isc__nm_work_t){
3214 		.cb = work_cb,
3215 		.after_cb = after_work_cb,
3216 		.data = data,
3217 	};
3218 
3219 	isc_nm_attach(netmgr, &work->netmgr);
3220 
3221 	uv_req_set_data((uv_req_t *)&work->req, work);
3222 
3223 	r = uv_queue_work(&worker->loop, &work->req, isc__nm_work_cb,
3224 			  isc__nm_after_work_cb);
3225 	UV_RUNTIME_CHECK(uv_queue_work, r);
3226 }
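/*
 * A hedged usage sketch (the callback names are hypothetical): offload a
 * blocking task from a network thread to the libuv thread pool and get
 * notified back on the worker loop. The signatures mirror how work->cb
 * and work->after_cb are invoked above:
 *
 *	static void
 *	blocking_work(void *data) {
 *		do_blocking_io(data);
 *	}
 *
 *	static void
 *	work_done(void *data, isc_result_t result) {
 *		finish_up(data, result);
 *	}
 *
 *	isc_nm_work_offload(netmgr, blocking_work, work_done, data);
 */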
3227 
3228 void
3229 isc_nm_timer_create(isc_nmhandle_t *handle, isc_nm_timer_cb cb, void *cbarg,
3230 		    isc_nm_timer_t **timerp) {
3231 	isc__networker_t *worker = NULL;
3232 	isc_nmsocket_t *sock = NULL;
3233 	isc_nm_timer_t *timer = NULL;
3234 	int r;
3235 
3236 	REQUIRE(isc__nm_in_netthread());
3237 	REQUIRE(VALID_NMHANDLE(handle));
3238 	REQUIRE(VALID_NMSOCK(handle->sock));
3239 
3240 	sock = handle->sock;
3241 	worker = &sock->mgr->workers[isc_nm_tid()];
3242 
3243 	timer = isc_mem_get(sock->mgr->mctx, sizeof(*timer));
3244 	*timer = (isc_nm_timer_t){ .cb = cb, .cbarg = cbarg };
3245 	isc_refcount_init(&timer->references, 1);
3246 	isc_nmhandle_attach(handle, &timer->handle);
3247 
3248 	r = uv_timer_init(&worker->loop, &timer->timer);
3249 	UV_RUNTIME_CHECK(uv_timer_init, r);
3250 
3251 	uv_handle_set_data((uv_handle_t *)&timer->timer, timer);
3252 
3253 	*timerp = timer;
3254 }
3255 
3256 void
3257 isc_nm_timer_attach(isc_nm_timer_t *timer, isc_nm_timer_t **timerp) {
3258 	REQUIRE(timer != NULL);
3259 	REQUIRE(timerp != NULL && *timerp == NULL);
3260 
3261 	isc_refcount_increment(&timer->references);
3262 	*timerp = timer;
3263 }
3264 
3265 static void
3266 timer_destroy(uv_handle_t *uvhandle) {
3267 	isc_nm_timer_t *timer = uv_handle_get_data(uvhandle);
3268 	isc_nmhandle_t *handle = timer->handle;
3269 	isc_mem_t *mctx = timer->handle->sock->mgr->mctx;
3270 
3271 	isc_mem_put(mctx, timer, sizeof(*timer));
3272 
3273 	isc_nmhandle_detach(&handle);
3274 }
3275 
3276 void
3277 isc_nm_timer_detach(isc_nm_timer_t **timerp) {
3278 	isc_nm_timer_t *timer = NULL;
3279 	isc_nmhandle_t *handle = NULL;
3280 
3281 	REQUIRE(timerp != NULL && *timerp != NULL);
3282 
3283 	timer = *timerp;
3284 	*timerp = NULL;
3285 
3286 	handle = timer->handle;
3287 
3288 	REQUIRE(isc__nm_in_netthread());
3289 	REQUIRE(VALID_NMHANDLE(handle));
3290 	REQUIRE(VALID_NMSOCK(handle->sock));
3291 
3292 	if (isc_refcount_decrement(&timer->references) == 1) {
3293 		int r = uv_timer_stop(&timer->timer);
3294 		UV_RUNTIME_CHECK(uv_timer_stop, r);
3295 		uv_close((uv_handle_t *)&timer->timer, timer_destroy);
3296 	}
3297 }
3298 
3299 static void
3300 timer_cb(uv_timer_t *uvtimer) {
3301 	isc_nm_timer_t *timer = uv_handle_get_data((uv_handle_t *)uvtimer);
3302 
3303 	REQUIRE(timer->cb != NULL);
3304 
3305 	timer->cb(timer->cbarg, ISC_R_TIMEDOUT);
3306 }
3307 
3308 void
3309 isc_nm_timer_start(isc_nm_timer_t *timer, uint64_t timeout) {
3310 	int r = uv_timer_start(&timer->timer, timer_cb, timeout, 0);
3311 	UV_RUNTIME_CHECK(uv_timer_start, r);
3312 }
3313 
3314 void
3315 isc_nm_timer_stop(isc_nm_timer_t *timer) {
3316 	int r = uv_timer_stop(&timer->timer);
3317 	UV_RUNTIME_CHECK(uv_timer_stop, r);
3318 }
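/*
 * Putting the timer API together, a minimal (illustrative) lifecycle run
 * from a network thread that holds a valid handle; the callback name is
 * hypothetical and the timeout is in milliseconds, as with uv_timer_start():
 *
 *	static void
 *	expired(void *cbarg, isc_result_t result) {
 *		handle_timeout(cbarg, result);
 *	}
 *
 *	isc_nm_timer_t *timer = NULL;
 *	isc_nm_timer_create(handle, expired, cbarg, &timer);
 *	isc_nm_timer_start(timer, 5000);
 *	...
 *	isc_nm_timer_stop(timer);
 *	isc_nm_timer_detach(&timer);
 *
 * The callback receives ISC_R_TIMEDOUT (see timer_cb() above).
 */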
3319 
3320 #ifdef NETMGR_TRACE
3321 /*
3322  * Dump all active sockets in netmgr. We output to stderr
3323  * as the logger might already be shut down.
3324  */
3325 
3326 static const char *
3327 nmsocket_type_totext(isc_nmsocket_type type) {
3328 	switch (type) {
3329 	case isc_nm_udpsocket:
3330 		return ("isc_nm_udpsocket");
3331 	case isc_nm_udplistener:
3332 		return ("isc_nm_udplistener");
3333 	case isc_nm_tcpsocket:
3334 		return ("isc_nm_tcpsocket");
3335 	case isc_nm_tcplistener:
3336 		return ("isc_nm_tcplistener");
3337 	case isc_nm_tcpdnslistener:
3338 		return ("isc_nm_tcpdnslistener");
3339 	case isc_nm_tcpdnssocket:
3340 		return ("isc_nm_tcpdnssocket");
3341 	default:
3342 		UNREACHABLE();
3343 	}
3344 }
3345 
3346 static void
3347 nmhandle_dump(isc_nmhandle_t *handle) {
3348 	fprintf(stderr, "Active handle %p, refs %" PRIuFAST32 "\n", handle,
3349 		isc_refcount_current(&handle->references));
3350 	fprintf(stderr, "Created by:\n");
3351 	backtrace_symbols_fd(handle->backtrace, handle->backtrace_size,
3352 			     STDERR_FILENO);
3353 	fprintf(stderr, "\n\n");
3354 }
3355 
3356 static void
3357 nmsocket_dump(isc_nmsocket_t *sock) {
3358 	isc_nmhandle_t *handle = NULL;
3359 
3360 	LOCK(&sock->lock);
3361 	fprintf(stderr, "\n=================\n");
3362 	fprintf(stderr, "Active %s socket %p, type %s, refs %" PRIuFAST32 "\n",
3363 		atomic_load(&sock->client) ? "client" : "server", sock,
3364 		nmsocket_type_totext(sock->type),
3365 		isc_refcount_current(&sock->references));
3366 	fprintf(stderr,
3367 		"Parent %p, listener %p, server %p, statichandle = "
3368 		"%p\n",
3369 		sock->parent, sock->listener, sock->server, sock->statichandle);
3370 	fprintf(stderr, "Flags:%s%s%s%s%s\n",
3371 		atomic_load(&sock->active) ? " active" : "",
3372 		atomic_load(&sock->closing) ? " closing" : "",
3373 		atomic_load(&sock->destroying) ? " destroying" : "",
3374 		atomic_load(&sock->connecting) ? " connecting" : "",
3375 		sock->accepting ? " accepting" : "");
3376 	fprintf(stderr, "Created by:\n");
3377 	backtrace_symbols_fd(sock->backtrace, sock->backtrace_size,
3378 			     STDERR_FILENO);
3379 	fprintf(stderr, "\n");
3380 
3381 	for (handle = ISC_LIST_HEAD(sock->active_handles); handle != NULL;
3382 	     handle = ISC_LIST_NEXT(handle, active_link))
3383 	{
3384 		static bool first = true;
3385 		if (first) {
3386 			fprintf(stderr, "Active handles:\n");
3387 			first = false;
3388 		}
3389 		nmhandle_dump(handle);
3390 	}
3391 
3392 	fprintf(stderr, "\n");
3393 	UNLOCK(&sock->lock);
3394 }
3395 
3396 void
3397 isc__nm_dump_active(isc_nm_t *nm) {
3398 	isc_nmsocket_t *sock = NULL;
3399 
3400 	REQUIRE(VALID_NM(nm));
3401 
3402 	LOCK(&nm->lock);
3403 	for (sock = ISC_LIST_HEAD(nm->active_sockets); sock != NULL;
3404 	     sock = ISC_LIST_NEXT(sock, active_link))
3405 	{
3406 		static bool first = true;
3407 		if (first) {
3408 			fprintf(stderr, "Outstanding sockets\n");
3409 			first = false;
3410 		}
3411 		nmsocket_dump(sock);
3412 	}
3413 	UNLOCK(&nm->lock);
3414 }
3415 #endif
3416