/*	$NetBSD: kqueue.c,v 1.7 2024/08/18 20:47:21 christos Exp $	*/

/*	$OpenBSD: kqueue.c,v 1.5 2002/07/10 14:41:31 art Exp $	*/

/*
 * Copyright 2000-2007 Niels Provos <provos@citi.umich.edu>
 * Copyright 2007-2012 Niels Provos and Nick Mathewson
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include "event2/event-config.h"
#include "evconfig-private.h"

#ifdef EVENT__HAVE_KQUEUE

#include <sys/types.h>
#ifdef EVENT__HAVE_SYS_TIME_H
#include <sys/time.h>
#endif
#include <sys/queue.h>
#include <sys/event.h>
#include <limits.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
#ifdef EVENT__HAVE_INTTYPES_H
#include <inttypes.h>
#endif

/* Some platforms apparently define the udata field of struct kevent as
 * intptr_t, whereas others define it as void*.  There doesn't seem to be an
 * easy way to tell them apart via autoconf, so we need to use OS macros. */
#if defined(__NetBSD__)
#define PTR_TO_UDATA(x) ((typeof(((struct kevent *)0)->udata))(x))
#define INT_TO_UDATA(x) ((typeof(((struct kevent *)0)->udata))(intptr_t)(x))
#elif defined(EVENT__HAVE_INTTYPES_H) && !defined(__OpenBSD__) && !defined(__FreeBSD__) && !defined(__darwin__) && !defined(__APPLE__) && !defined(__CloudABI__)
#define PTR_TO_UDATA(x) ((intptr_t)(x))
#define INT_TO_UDATA(x) ((intptr_t)(x))
#else
#define PTR_TO_UDATA(x) (x)
#define INT_TO_UDATA(x) ((void*)(x))
#endif

#include "event-internal.h"
#include "log-internal.h"
#include "evmap-internal.h"
#include "event2/thread.h"
#include "event2/util.h"
#include "evthread-internal.h"
#include "changelist-internal.h"

#include "kqueue-internal.h"

#define NEVENT 64

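/* Per-base state for the kqueue backend: the changelist we hand to
 * kevent(), the buffer that receives triggered events (both grown on
 * demand from NEVENT entries), the kqueue descriptor itself, and the
 * pid of the process that created that descriptor (see kqop_free()
 * below for why the pid matters). */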
struct kqop {
	struct kevent *changes;
	int changes_size;

	struct kevent *events;
	int events_size;
	int kq;
	int notify_event_added;
	pid_t pid;
};

static void kqop_free(struct kqop *kqop);

static void *kq_init(struct event_base *);
static int kq_sig_add(struct event_base *, int, short, short, void *);
static int kq_sig_del(struct event_base *, int, short, short, void *);
static int kq_dispatch(struct event_base *, struct timeval *);
static void kq_dealloc(struct event_base *);

const struct eventop kqops = {
	"kqueue",
	kq_init,
	event_changelist_add_,
	event_changelist_del_,
	kq_dispatch,
	kq_dealloc,
	1 /* need reinit */,
	EV_FEATURE_ET|EV_FEATURE_O1|EV_FEATURE_FDS,
	EVENT_CHANGELIST_FDINFO_SIZE
};

static const struct eventop kqsigops = {
	"kqueue_signal",
	NULL,
	kq_sig_add,
	kq_sig_del,
	NULL,
	NULL,
	1 /* need reinit */,
	0,
	0
};

static void *
kq_init(struct event_base *base)
{
	int kq = -1;
	struct kqop *kqueueop = NULL;

	if (!(kqueueop = mm_calloc(1, sizeof(struct kqop))))
		return (NULL);

	/* Initialize the kernel queue */

	if ((kq = kqueue()) == -1) {
		event_warn("kqueue");
		goto err;
	}

	kqueueop->kq = kq;

	kqueueop->pid = getpid();

	/* Initialize fields */
	kqueueop->changes = mm_calloc(NEVENT, sizeof(struct kevent));
	if (kqueueop->changes == NULL)
		goto err;
	kqueueop->events = mm_calloc(NEVENT, sizeof(struct kevent));
	if (kqueueop->events == NULL)
		goto err;
	kqueueop->events_size = kqueueop->changes_size = NEVENT;

	/* Check for Mac OS X kqueue bug. */
	memset(&kqueueop->changes[0], 0, sizeof kqueueop->changes[0]);
	kqueueop->changes[0].ident = -1;
	kqueueop->changes[0].filter = EVFILT_READ;
	kqueueop->changes[0].flags = EV_ADD;
	/*
	 * If kqueue works, then kevent will succeed, and it will
	 * stick an error in events[0].  If kqueue is broken, then
	 * kevent will fail.
	 */
	if (kevent(kq,
		kqueueop->changes, 1, kqueueop->events, NEVENT, NULL) != 1 ||
	    (int)kqueueop->events[0].ident != -1 ||
	    !(kqueueop->events[0].flags & EV_ERROR)) {
		event_warn("%s: detected broken kqueue; not using.", __func__);
		goto err;
	}

	base->evsigsel = &kqsigops;

	return (kqueueop);
err:
	if (kqueueop)
		kqop_free(kqueueop);

	return (NULL);
}

#define ADD_UDATA 0x30303

static void
kq_setup_kevent(struct kevent *out, evutil_socket_t fd, int filter, short change)
{
	memset(out, 0, sizeof(struct kevent));
	out->ident = fd;
	out->filter = filter;

	if (change & EV_CHANGE_ADD) {
		out->flags = EV_ADD;
		/* We set a magic number here so that we can tell 'add'
		 * errors from 'del' errors. */
		out->udata = INT_TO_UDATA(ADD_UDATA);
		if (change & EV_ET)
			out->flags |= EV_CLEAR;
#ifdef NOTE_EOF
		/* Make it behave like select() and poll() */
		if (filter == EVFILT_READ)
			out->fflags = NOTE_EOF;
#endif
	} else {
		EVUTIL_ASSERT(change & EV_CHANGE_DEL);
		out->flags = EV_DELETE;
	}
}

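/* Translate libevent's changelist into kevents in kqop->changes.
 * Each event_change can expand to two kevents (one for EVFILT_READ
 * and one for EVFILT_WRITE), which is why we grow the array whenever
 * fewer than two free slots remain (the "changes_size - 1" check),
 * doubling its size each time.  Returns the number of kevents built,
 * or -1 on overflow or allocation failure. */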
static int
kq_build_changes_list(const struct event_changelist *changelist,
    struct kqop *kqop)
{
	int i;
	int n_changes = 0;

	for (i = 0; i < changelist->n_changes; ++i) {
		struct event_change *in_ch = &changelist->changes[i];
		struct kevent *out_ch;
		if (n_changes >= kqop->changes_size - 1) {
			int newsize;
			struct kevent *newchanges;

			if (kqop->changes_size > INT_MAX / 2 ||
			    (size_t)kqop->changes_size * 2 > EV_SIZE_MAX /
			    sizeof(struct kevent)) {
				event_warnx("%s: int overflow", __func__);
				return (-1);
			}

			newsize = kqop->changes_size * 2;
			newchanges = mm_realloc(kqop->changes,
			    newsize * sizeof(struct kevent));
			if (newchanges == NULL) {
				event_warn("%s: realloc", __func__);
				return (-1);
			}
			kqop->changes = newchanges;
			kqop->changes_size = newsize;
		}
		if (in_ch->read_change) {
			out_ch = &kqop->changes[n_changes++];
			kq_setup_kevent(out_ch, in_ch->fd, EVFILT_READ,
			    in_ch->read_change);
		}
		if (in_ch->write_change) {
			out_ch = &kqop->changes[n_changes++];
			kq_setup_kevent(out_ch, in_ch->fd, EVFILT_WRITE,
			    in_ch->write_change);
		}
	}
	return n_changes;
}

static int
kq_grow_events(struct kqop *kqop, size_t new_size)
{
	struct kevent *newresult;

	newresult = mm_realloc(kqop->events,
	    new_size * sizeof(struct kevent));

	if (newresult) {
		kqop->events = newresult;
		kqop->events_size = new_size;
		return 0;
	} else {
		return -1;
	}
}

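/* Poll for events: flush the pending changelist, drop the base lock
 * around the (possibly blocking) kevent() call, then translate the
 * results back into EV_READ/EV_WRITE/EV_SIGNAL activations via the
 * evmap layer.  Errors on individual changes come back as EV_ERROR
 * entries in "events" and are handled case by case below. */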
static int
kq_dispatch(struct event_base *base, struct timeval *tv)
{
	struct kqop *kqop = base->evbase;
	struct kevent *events = kqop->events;
	struct kevent *changes;
	struct timespec ts, *ts_p = NULL;
	int i, n_changes, res;

	if (tv != NULL) {
		ts.tv_sec = tv->tv_sec;
		ts.tv_nsec = tv->tv_usec * 1000;
		ts_p = &ts;
	}

	/* Build "changes" from "base->changes" */
	EVUTIL_ASSERT(kqop->changes);
	n_changes = kq_build_changes_list(&base->changelist, kqop);
	if (n_changes < 0)
		return -1;

	event_changelist_remove_all_(&base->changelist, base);

	/* steal the changes array in case some broken code tries to call
	 * dispatch twice at once. */
	changes = kqop->changes;
	kqop->changes = NULL;

	/* Make sure that 'events' is at least as long as the list of changes:
	 * otherwise errors in the changes can get reported as a -1 return
	 * value from kevent() rather than as EV_ERROR events in the events
	 * array.
	 *
	 * (We could instead handle -1 return values from kevent() by
	 * retrying with a smaller changes array or a larger events array,
	 * but this approach seems less risky for now.)
	 */
	if (kqop->events_size < n_changes) {
		int new_size = kqop->events_size;
		do {
			new_size *= 2;
		} while (new_size < n_changes);

		kq_grow_events(kqop, new_size);
		events = kqop->events;
	}

	EVBASE_RELEASE_LOCK(base, th_base_lock);

	res = kevent(kqop->kq, changes, n_changes,
	    events, kqop->events_size, ts_p);

	EVBASE_ACQUIRE_LOCK(base, th_base_lock);

	EVUTIL_ASSERT(kqop->changes == NULL);
	kqop->changes = changes;

	if (res == -1) {
		if (errno != EINTR) {
			event_warn("kevent");
			return (-1);
		}

		return (0);
	}

	event_debug(("%s: kevent reports %d", __func__, res));

	for (i = 0; i < res; i++) {
		int which = 0;

		if (events[i].flags & EV_ERROR) {
			switch (events[i].data) {

			/* Can occur on delete if we are not currently
			 * watching any events on this fd.  That can
			 * happen when the fd was closed and another
			 * file was opened with that fd. */
			case ENOENT:
			/* Can occur for reasons not fully understood
			 * on FreeBSD. */
			case EINVAL:
				continue;
#if defined(__FreeBSD__) && defined(ENOTCAPABLE)
			/*
			 * This currently occurs if an FD is closed
			 * before the EV_DELETE makes it out via kevent().
			 * The FreeBSD capabilities code sees the blank
			 * capability set and rejects the request to
			 * modify an event.
			 *
			 * To be strictly correct - when an FD is closed,
			 * all the registered events are also removed.
			 * Queuing EV_DELETE to a closed FD is wrong.
			 * The event(s) should just be deleted from
			 * the pending changelist.
			 */
			case ENOTCAPABLE:
				continue;
#endif

			/* Can occur on a delete if the fd is closed. */
			case EBADF:
				/* XXXX On NetBSD, we can also get EBADF if we
				 * try to add the write side of a pipe, but
				 * the read side has already been closed.
				 * Other BSDs call this situation 'EPIPE'.  It
				 * would be good if we had a way to report
				 * this situation. */
				continue;
			/* These two can occur on an add if the fd was one side
			 * of a pipe, and the other side was closed. */
			case EPERM:
			case EPIPE:
				/* Report read events, if we're listening for
				 * them, so that the user can learn about any
				 * add errors.  (If the operation was a
				 * delete, then udata should be cleared.) */
				if (events[i].udata) {
					/* The operation was an add:
					 * report the error as a read. */
					which |= EV_READ;
					break;
				} else {
					/* The operation was a del:
					 * report nothing. */
					continue;
				}

			/* Other errors shouldn't occur. */
			default:
				errno = events[i].data;
				return (-1);
			}
		} else if (events[i].filter == EVFILT_READ) {
			which |= EV_READ;
		} else if (events[i].filter == EVFILT_WRITE) {
			which |= EV_WRITE;
		} else if (events[i].filter == EVFILT_SIGNAL) {
			which |= EV_SIGNAL;
#ifdef EVFILT_USER
		} else if (events[i].filter == EVFILT_USER) {
			base->is_notify_pending = 0;
#endif
		}

		if (!which)
			continue;

		if (events[i].filter == EVFILT_SIGNAL) {
			evmap_signal_active_(base, events[i].ident, 1);
		} else {
			evmap_io_active_(base, events[i].ident, which | EV_ET);
		}
	}

	if (res == kqop->events_size) {
		/* We used all the events space that we have.  Maybe we should
		   make it bigger. */
		kq_grow_events(kqop, kqop->events_size * 2);
	}

	return (0);
}

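/* Free the backend state.  We only close the kqueue descriptor from
 * the process that created it: kqueue descriptors are not inherited
 * across fork(), so in a forked child the stored fd number is either
 * invalid or already refers to something else. */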
static void
kqop_free(struct kqop *kqop)
{
	if (kqop->changes)
		mm_free(kqop->changes);
	if (kqop->events)
		mm_free(kqop->events);
	if (kqop->kq >= 0 && kqop->pid == getpid())
		close(kqop->kq);
	memset(kqop, 0, sizeof(struct kqop));
	mm_free(kqop);
}

static void
kq_dealloc(struct event_base *base)
{
	struct kqop *kqop = base->evbase;
	evsig_dealloc_(base);
	kqop_free(kqop);
}

/* signal handling */
static int
kq_sig_add(struct event_base *base, int nsignal, short old, short events, void *p)
{
	struct kqop *kqop = base->evbase;
	struct kevent kev;
	struct timespec timeout = { 0, 0 };
	(void)p;

	EVUTIL_ASSERT(nsignal >= 0 && nsignal < NSIG);

	memset(&kev, 0, sizeof(kev));
	kev.ident = nsignal;
	kev.filter = EVFILT_SIGNAL;
	kev.flags = EV_ADD;

	/* Be ready for the signal if it is sent any
	 * time between now and the next call to
	 * kq_dispatch. */
	if (kevent(kqop->kq, &kev, 1, NULL, 0, &timeout) == -1)
		return (-1);

	/* We can set the handler for most signals to SIG_IGN and
	 * still have them reported to us in the queue.  However,
	 * if the handler for SIGCHLD is SIG_IGN, the system reaps
	 * zombie processes for us, and we don't get any notification.
	 * This appears to be the only signal with this quirk. */
	if (evsig_set_handler_(base, nsignal,
		nsignal == SIGCHLD ? SIG_DFL : SIG_IGN) == -1)
		return (-1);

	return (0);
}

static int
kq_sig_del(struct event_base *base, int nsignal, short old, short events, void *p)
{
	struct kqop *kqop = base->evbase;
	struct kevent kev;

	struct timespec timeout = { 0, 0 };
	(void)p;

	EVUTIL_ASSERT(nsignal >= 0 && nsignal < NSIG);

	memset(&kev, 0, sizeof(kev));
	kev.ident = nsignal;
	kev.filter = EVFILT_SIGNAL;
	kev.flags = EV_DELETE;

	/* Because we insert signal events
	 * immediately, we need to delete them
	 * immediately, too */
	if (kevent(kqop->kq, &kev, 1, NULL, 0, &timeout) == -1)
		return (-1);

	if (evsig_restore_handler_(base, nsignal) == -1)
		return (-1);

	return (0);
}


/* OSX 10.6 and FreeBSD 8.1 add support for EVFILT_USER, which we can use
 * to wake up the event loop from another thread. */
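/* The mechanism is a two-step handshake: event_kq_add_notify_event_()
 * registers a single EVFILT_USER event with EV_ADD|EV_CLEAR once per
 * base, and event_kq_notify_base_() later posts NOTE_TRIGGER against
 * the same ident, which makes a kevent() call blocked in kq_dispatch()
 * return immediately.  EV_CLEAR resets the event after delivery, so
 * one registration can be triggered any number of times. */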
/* Magic number we use for our filter ID. */
#define NOTIFY_IDENT 42

int
event_kq_add_notify_event_(struct event_base *base)
{
	struct kqop *kqop = base->evbase;
#if defined(EVFILT_USER) && defined(NOTE_TRIGGER)
	struct kevent kev;
	struct timespec timeout = { 0, 0 };
#endif

	if (kqop->notify_event_added)
		return 0;

#if defined(EVFILT_USER) && defined(NOTE_TRIGGER)
	memset(&kev, 0, sizeof(kev));
	kev.ident = NOTIFY_IDENT;
	kev.filter = EVFILT_USER;
	kev.flags = EV_ADD | EV_CLEAR;

	if (kevent(kqop->kq, &kev, 1, NULL, 0, &timeout) == -1) {
		event_warn("kevent: adding EVFILT_USER event");
		return -1;
	}

	kqop->notify_event_added = 1;

	return 0;
#else
	return -1;
#endif
}

int
event_kq_notify_base_(struct event_base *base)
{
	struct kqop *kqop = base->evbase;
#if defined(EVFILT_USER) && defined(NOTE_TRIGGER)
	struct kevent kev;
	struct timespec timeout = { 0, 0 };
#endif
	if (! kqop->notify_event_added)
		return -1;

#if defined(EVFILT_USER) && defined(NOTE_TRIGGER)
	memset(&kev, 0, sizeof(kev));
	kev.ident = NOTIFY_IDENT;
	kev.filter = EVFILT_USER;
	kev.fflags = NOTE_TRIGGER;

	if (kevent(kqop->kq, &kev, 1, NULL, 0, &timeout) == -1) {
		event_warn("kevent: triggering EVFILT_USER event");
		return -1;
	}

	return 0;
#else
	return -1;
#endif
}

#endif /* EVENT__HAVE_KQUEUE */
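
/* Usage sketch (illustrative only, not compiled): applications never
 * name this backend directly.  event_base_new() selects it at runtime
 * on platforms where EVENT__HAVE_KQUEUE is defined, unless it has been
 * disabled (e.g., via the EVENT_NOKQUEUE environment variable when
 * environment checks are enabled).
 *
 *	#include <stdio.h>
 *	#include <event2/event.h>
 *
 *	int main(void)
 *	{
 *		struct event_base *base = event_base_new();
 *		// Expected to print "kqueue" on BSD/macOS.
 *		printf("using backend: %s\n", event_base_get_method(base));
 *		event_base_dispatch(base);
 *		event_base_free(base);
 *		return 0;
 *	}
 */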