/*	$NetBSD: sys_eventfd.c,v 1.9 2022/02/17 16:28:29 thorpej Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sys_eventfd.c,v 1.9 2022/02/17 16:28:29 thorpej Exp $");

/*
 * eventfd
 *
 * Eventfd objects present a simple counter associated with a file
 * descriptor.  Writes add to the count and reads consume it.  When
 * the count is non-zero, the descriptor is considered "readable",
 * and when it is less than the max value (EVENTFD_MAXVAL), it is
 * considered "writable".
 *
 * This implementation is API compatible with the Linux eventfd(2)
 * interface.
 */

#include <sys/param.h>
#include <sys/types.h>
#include <sys/condvar.h>
#include <sys/eventfd.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/kauth.h>
#include <sys/kmem.h>		/* kmem_zalloc() / kmem_free() */
#include <sys/mutex.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/select.h>
#include <sys/stat.h>
#include <sys/syscallargs.h>
#include <sys/uio.h>

struct eventfd {
	kmutex_t	efd_lock;
	kcondvar_t	efd_read_wait;
	kcondvar_t	efd_write_wait;
	struct selinfo	efd_read_sel;
	struct selinfo	efd_write_sel;
	eventfd_t	efd_val;
	int64_t		efd_nwaiters;
	bool		efd_restarting;
	bool		efd_has_read_waiters;
	bool		efd_has_write_waiters;
	bool		efd_is_semaphore;

	/*
	 * Information kept for stat(2).
	 */
	struct timespec	efd_btime;	/* time created */
	struct timespec	efd_mtime;	/* last write */
	struct timespec	efd_atime;	/* last read */
};

#define	EVENTFD_MAXVAL	(UINT64_MAX - 1)
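/*
 * Userland usage, as a hedged sketch (illustrative only, not part of
 * this file; assumes the eventfd(2) prototype from <sys/eventfd.h> and
 * read(2)/write(2) from <unistd.h>): writers post events by adding to
 * the counter, and a reader drains it.  In the default (non-semaphore)
 * mode a read returns the entire count and resets it to zero.
 *
 *	#include <sys/eventfd.h>
 *	#include <unistd.h>
 *
 *	int fd = eventfd(0, EFD_CLOEXEC);
 *	eventfd_t v = 3;
 *	(void)write(fd, &v, sizeof(v));	// counter becomes 3
 *	(void)read(fd, &v, sizeof(v));	// v == 3; counter resets to 0
 */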
/*
 * eventfd_create:
 *
 *	Create an eventfd object.
 */
static struct eventfd *
eventfd_create(unsigned int const val, int const flags)
{
	struct eventfd * const efd = kmem_zalloc(sizeof(*efd), KM_SLEEP);

	mutex_init(&efd->efd_lock, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&efd->efd_read_wait, "efdread");
	cv_init(&efd->efd_write_wait, "efdwrite");
	selinit(&efd->efd_read_sel);
	selinit(&efd->efd_write_sel);
	efd->efd_val = val;
	efd->efd_is_semaphore = !!(flags & EFD_SEMAPHORE);
	getnanotime(&efd->efd_btime);

	/* Caller deals with EFD_CLOEXEC and EFD_NONBLOCK. */

	return efd;
}

/*
 * eventfd_destroy:
 *
 *	Destroy an eventfd object.
 */
static void
eventfd_destroy(struct eventfd * const efd)
{

	KASSERT(efd->efd_nwaiters == 0);
	KASSERT(efd->efd_has_read_waiters == false);
	KASSERT(efd->efd_has_write_waiters == false);

	cv_destroy(&efd->efd_read_wait);
	cv_destroy(&efd->efd_write_wait);

	seldestroy(&efd->efd_read_sel);
	seldestroy(&efd->efd_write_sel);

	mutex_destroy(&efd->efd_lock);

	kmem_free(efd, sizeof(*efd));
}

/*
 * eventfd_wait:
 *
 *	Block on an eventfd.  Handles non-blocking, as well as
 *	the restart cases.
 */
static int
eventfd_wait(struct eventfd * const efd, int const fflag, bool const is_write)
{
	kcondvar_t *waitcv;
	int error;

	if (fflag & FNONBLOCK) {
		return EAGAIN;
	}

	/*
	 * We're going to block.  Check if we need to return ERESTART.
	 */
	if (efd->efd_restarting) {
		return ERESTART;
	}

	if (is_write) {
		efd->efd_has_write_waiters = true;
		waitcv = &efd->efd_write_wait;
	} else {
		efd->efd_has_read_waiters = true;
		waitcv = &efd->efd_read_wait;
	}

	efd->efd_nwaiters++;
	KASSERT(efd->efd_nwaiters > 0);
	error = cv_wait_sig(waitcv, &efd->efd_lock);
	efd->efd_nwaiters--;
	KASSERT(efd->efd_nwaiters >= 0);

	/*
	 * If a restart was triggered while we were asleep, we need
	 * to return ERESTART if no other error was returned.
	 */
	if (efd->efd_restarting) {
		if (error == 0) {
			error = ERESTART;
		}
	}

	return error;
}
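/*
 * Expected calling pattern for eventfd_wait(), as a hedged sketch
 * (mirrors eventfd_fop_read()/eventfd_fop_write() below; the predicate
 * name is hypothetical): the caller holds efd_lock and re-tests its
 * condition in a loop, because cv_wait_sig() can return without the
 * condition having become true.
 *
 *	mutex_enter(&efd->efd_lock);
 *	while (!condition_met(efd)) {		// hypothetical predicate
 *		error = eventfd_wait(efd, fflag, is_write);
 *		if (error != 0) {
 *			mutex_exit(&efd->efd_lock);
 *			return error;
 *		}
 *	}
 *	... consume or update efd->efd_val ...
 *	mutex_exit(&efd->efd_lock);
 */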
/*
 * eventfd_wake:
 *
 *	Wake LWPs blocked on an eventfd.
 */
static void
eventfd_wake(struct eventfd * const efd, bool const is_write)
{
	kcondvar_t *waitcv = NULL;
	struct selinfo *sel;
	int pollev;

	if (is_write) {
		if (efd->efd_has_read_waiters) {
			waitcv = &efd->efd_read_wait;
			efd->efd_has_read_waiters = false;
		}
		sel = &efd->efd_read_sel;
		pollev = POLLIN | POLLRDNORM;
	} else {
		if (efd->efd_has_write_waiters) {
			waitcv = &efd->efd_write_wait;
			efd->efd_has_write_waiters = false;
		}
		sel = &efd->efd_write_sel;
		pollev = POLLOUT | POLLWRNORM;
	}
	if (waitcv != NULL) {
		cv_broadcast(waitcv);
	}
	selnotify(sel, pollev, NOTE_SUBMIT);
}

/*
 * eventfd file operations
 */

static int
eventfd_fop_read(file_t * const fp, off_t * const offset,
    struct uio * const uio, kauth_cred_t const cred, int const flags)
{
	struct eventfd * const efd = fp->f_eventfd;
	int const fflag = fp->f_flag;
	eventfd_t return_value;
	int error;

	if (uio->uio_resid < sizeof(eventfd_t)) {
		return EINVAL;
	}

	mutex_enter(&efd->efd_lock);

	while (efd->efd_val == 0) {
		if ((error = eventfd_wait(efd, fflag, false)) != 0) {
			mutex_exit(&efd->efd_lock);
			return error;
		}
	}

	if (efd->efd_is_semaphore) {
		return_value = 1;
		efd->efd_val--;
	} else {
		return_value = efd->efd_val;
		efd->efd_val = 0;
	}

	getnanotime(&efd->efd_atime);
	eventfd_wake(efd, false);

	mutex_exit(&efd->efd_lock);

	error = uiomove(&return_value, sizeof(return_value), uio);

	return error;
}

static int
eventfd_fop_write(file_t * const fp, off_t * const offset,
    struct uio * const uio, kauth_cred_t const cred, int const flags)
{
	struct eventfd * const efd = fp->f_eventfd;
	int const fflag = fp->f_flag;
	eventfd_t write_value;
	int error;

	if (uio->uio_resid < sizeof(eventfd_t)) {
		return EINVAL;
	}

	if ((error = uiomove(&write_value, sizeof(write_value), uio)) != 0) {
		return error;
	}

	if (write_value > EVENTFD_MAXVAL) {
		error = EINVAL;
		goto out;
	}

	mutex_enter(&efd->efd_lock);

	KASSERT(efd->efd_val <= EVENTFD_MAXVAL);
	while ((EVENTFD_MAXVAL - efd->efd_val) < write_value) {
		if ((error = eventfd_wait(efd, fflag, true)) != 0) {
			mutex_exit(&efd->efd_lock);
			goto out;
		}
	}

	efd->efd_val += write_value;
	KASSERT(efd->efd_val <= EVENTFD_MAXVAL);

	getnanotime(&efd->efd_mtime);
	eventfd_wake(efd, true);

	mutex_exit(&efd->efd_lock);

 out:
	if (error) {
		/*
		 * Undo the effect of uiomove() so that the error
		 * gets reported correctly; see dofilewrite().
		 */
		uio->uio_resid += sizeof(write_value);
	}
	return error;
}
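/*
 * I/O framing, as a hedged userland sketch (illustrative only; assumes
 * read(2)/write(2) from <unistd.h>): every transfer is exactly one
 * 8-byte eventfd_t.  Shorter buffers fail with EINVAL, and with
 * EFD_NONBLOCK set, a write that would push the counter past
 * EVENTFD_MAXVAL fails with EAGAIN instead of blocking.
 *
 *	eventfd_t v = 1;
 *	ssize_t n = write(fd, &v, sizeof(v));	// n == 8 on success
 *	n = write(fd, &v, 4);			// fails: EINVAL
 */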
static int
eventfd_ioctl(file_t * const fp, u_long const cmd, void * const data)
{
	struct eventfd * const efd = fp->f_eventfd;

	switch (cmd) {
	case FIONBIO:
		return 0;

	case FIONREAD:
		mutex_enter(&efd->efd_lock);
		*(int *)data = efd->efd_val != 0 ? sizeof(eventfd_t) : 0;
		mutex_exit(&efd->efd_lock);
		return 0;

	case FIONWRITE:
		*(int *)data = 0;
		return 0;

	case FIONSPACE:
		/*
		 * FIONSPACE doesn't really work for eventfd: whether
		 * a write can proceed depends on the value being
		 * written, not on a fixed amount of free buffer space.
		 */
		break;

	default:
		break;
	}

	return EPASSTHROUGH;
}

static int
eventfd_fop_poll(file_t * const fp, int const events)
{
	struct eventfd * const efd = fp->f_eventfd;
	int revents = 0;

	/*
	 * Note that Linux will return POLLERR if the eventfd count
	 * overflows, but that is not possible in the normal read/write
	 * API, only with Linux kernel-internal interfaces.  So, this
	 * implementation never returns POLLERR.
	 *
	 * Also note that the Linux eventfd(2) man page does not
	 * specifically discuss returning POLLRDNORM, but we check
	 * for that event in addition to POLLIN.
	 */

	mutex_enter(&efd->efd_lock);

	if (events & (POLLIN | POLLRDNORM)) {
		if (efd->efd_val != 0) {
			revents |= events & (POLLIN | POLLRDNORM);
		} else {
			selrecord(curlwp, &efd->efd_read_sel);
		}
	}

	if (events & (POLLOUT | POLLWRNORM)) {
		if (efd->efd_val < EVENTFD_MAXVAL) {
			revents |= events & (POLLOUT | POLLWRNORM);
		} else {
			selrecord(curlwp, &efd->efd_write_sel);
		}
	}

	mutex_exit(&efd->efd_lock);

	return revents;
}

static int
eventfd_fop_stat(file_t * const fp, struct stat * const st)
{
	struct eventfd * const efd = fp->f_eventfd;

	memset(st, 0, sizeof(*st));

	mutex_enter(&efd->efd_lock);
	st->st_size = (off_t)efd->efd_val;
	st->st_blksize = sizeof(eventfd_t);
	st->st_mode = S_IFIFO | S_IRUSR | S_IWUSR;
	st->st_blocks = 1;
	st->st_birthtimespec = st->st_ctimespec = efd->efd_btime;
	st->st_atimespec = efd->efd_atime;
	st->st_mtimespec = efd->efd_mtime;
	st->st_uid = kauth_cred_geteuid(fp->f_cred);
	st->st_gid = kauth_cred_getegid(fp->f_cred);
	mutex_exit(&efd->efd_lock);

	return 0;
}

static int
eventfd_fop_close(file_t * const fp)
{
	struct eventfd * const efd = fp->f_eventfd;

	fp->f_eventfd = NULL;
	eventfd_destroy(efd);

	return 0;
}

static void
eventfd_filt_read_detach(struct knote * const kn)
{
	struct eventfd * const efd = ((file_t *)kn->kn_obj)->f_eventfd;

	mutex_enter(&efd->efd_lock);
	KASSERT(kn->kn_hook == efd);
	selremove_knote(&efd->efd_read_sel, kn);
	mutex_exit(&efd->efd_lock);
}

static int
eventfd_filt_read(struct knote * const kn, long const hint)
{
	struct eventfd * const efd = ((file_t *)kn->kn_obj)->f_eventfd;
	int rv;

	if (hint & NOTE_SUBMIT) {
		KASSERT(mutex_owned(&efd->efd_lock));
	} else {
		mutex_enter(&efd->efd_lock);
	}

	kn->kn_data = (int64_t)efd->efd_val;
	rv = (eventfd_t)kn->kn_data > 0;

	if ((hint & NOTE_SUBMIT) == 0) {
		mutex_exit(&efd->efd_lock);
	}

	return rv;
}

static const struct filterops eventfd_read_filterops = {
	.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_detach = eventfd_filt_read_detach,
	.f_event = eventfd_filt_read,
};
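/*
 * kqueue usage, as a hedged userland sketch (illustrative only; assumes
 * the kqueue(2)/kevent(2) API from <sys/event.h>): an eventfd can be
 * watched like a pipe, and for EVFILT_READ the returned kev.data
 * carries the current counter value (see eventfd_filt_read() above).
 *
 *	struct kevent kev;
 *	int kq = kqueue();
 *	EV_SET(&kev, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	(void)kevent(kq, &kev, 1, NULL, 0, NULL);	// register
 *	(void)kevent(kq, NULL, 0, &kev, 1, NULL);	// wait; kev.data
 *							// holds the count
 */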
static void
eventfd_filt_write_detach(struct knote * const kn)
{
	struct eventfd * const efd = ((file_t *)kn->kn_obj)->f_eventfd;

	mutex_enter(&efd->efd_lock);
	KASSERT(kn->kn_hook == efd);
	selremove_knote(&efd->efd_write_sel, kn);
	mutex_exit(&efd->efd_lock);
}

static int
eventfd_filt_write(struct knote * const kn, long const hint)
{
	struct eventfd * const efd = ((file_t *)kn->kn_obj)->f_eventfd;
	int rv;

	if (hint & NOTE_SUBMIT) {
		KASSERT(mutex_owned(&efd->efd_lock));
	} else {
		mutex_enter(&efd->efd_lock);
	}

	kn->kn_data = (int64_t)efd->efd_val;
	rv = (eventfd_t)kn->kn_data < EVENTFD_MAXVAL;

	if ((hint & NOTE_SUBMIT) == 0) {
		mutex_exit(&efd->efd_lock);
	}

	return rv;
}

static const struct filterops eventfd_write_filterops = {
	.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_detach = eventfd_filt_write_detach,
	.f_event = eventfd_filt_write,
};

static int
eventfd_fop_kqfilter(file_t * const fp, struct knote * const kn)
{
	struct eventfd * const efd = ((file_t *)kn->kn_obj)->f_eventfd;
	struct selinfo *sel;

	switch (kn->kn_filter) {
	case EVFILT_READ:
		sel = &efd->efd_read_sel;
		kn->kn_fop = &eventfd_read_filterops;
		break;

	case EVFILT_WRITE:
		sel = &efd->efd_write_sel;
		kn->kn_fop = &eventfd_write_filterops;
		break;

	default:
		return EINVAL;
	}

	kn->kn_hook = efd;

	mutex_enter(&efd->efd_lock);
	selrecord_knote(sel, kn);
	mutex_exit(&efd->efd_lock);

	return 0;
}

static void
eventfd_fop_restart(file_t * const fp)
{
	struct eventfd * const efd = fp->f_eventfd;

	/*
	 * Unblock blocked reads/writes in order to allow close()
	 * to complete.  System calls return ERESTART so that the
	 * fd is revalidated.
	 */

	mutex_enter(&efd->efd_lock);

	if (efd->efd_nwaiters != 0) {
		efd->efd_restarting = true;
		if (efd->efd_has_read_waiters) {
			cv_broadcast(&efd->efd_read_wait);
			efd->efd_has_read_waiters = false;
		}
		if (efd->efd_has_write_waiters) {
			cv_broadcast(&efd->efd_write_wait);
			efd->efd_has_write_waiters = false;
		}
	}

	mutex_exit(&efd->efd_lock);
}

static const struct fileops eventfd_fileops = {
	.fo_name = "eventfd",
	.fo_read = eventfd_fop_read,
	.fo_write = eventfd_fop_write,
	.fo_ioctl = eventfd_ioctl,
	.fo_fcntl = fnullop_fcntl,
	.fo_poll = eventfd_fop_poll,
	.fo_stat = eventfd_fop_stat,
	.fo_close = eventfd_fop_close,
	.fo_kqfilter = eventfd_fop_kqfilter,
	.fo_restart = eventfd_fop_restart,
};

/*
 * eventfd(2) system call
 */
int
do_eventfd(struct lwp * const l, unsigned int const val, int const flags,
    register_t *retval)
{
	file_t *fp;
	int fd, error;

	if (flags & ~(EFD_CLOEXEC | EFD_NONBLOCK | EFD_SEMAPHORE)) {
		return EINVAL;
	}

	if ((error = fd_allocfile(&fp, &fd)) != 0) {
		return error;
	}

	fp->f_flag = FREAD | FWRITE;
	if (flags & EFD_NONBLOCK) {
		fp->f_flag |= FNONBLOCK;
	}
	fp->f_type = DTYPE_EVENTFD;
	fp->f_ops = &eventfd_fileops;
	fp->f_eventfd = eventfd_create(val, flags);
	fd_set_exclose(l, fd, !!(flags & EFD_CLOEXEC));
	fd_affix(curproc, fp, fd);

	*retval = fd;
	return 0;
}

int
sys_eventfd(struct lwp *l, const struct sys_eventfd_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(unsigned int) val;
		syscallarg(int) flags;
	} */

	return do_eventfd(l, SCARG(uap, val), SCARG(uap, flags), retval);
}
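/*
 * EFD_SEMAPHORE behavior, as a hedged userland sketch (illustrative
 * only): with the flag set at creation, each read returns 1 and
 * decrements the counter by one instead of draining it, turning the
 * eventfd into a counting semaphore (see eventfd_fop_read() above).
 *
 *	int fd = eventfd(3, EFD_SEMAPHORE);	// three units available
 *	eventfd_t v;
 *	(void)read(fd, &v, sizeof(v));		// v == 1; two units left
 *	(void)read(fd, &v, sizeof(v));		// v == 1; one unit left
 */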