/*	$NetBSD: sys_eventfd.c,v 1.7 2021/09/27 00:40:49 thorpej Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sys_eventfd.c,v 1.7 2021/09/27 00:40:49 thorpej Exp $");

/*
 * eventfd
 *
 * Eventfd objects present a simple counting object associated with a
 * file descriptor.  Writes and reads to this file descriptor increment
 * and decrement the count, respectively.  When the count is non-zero,
 * the descriptor is considered "readable", and when less than the max
 * value (EVENTFD_MAXVAL), is considered "writable".
 *
 * This implementation is API compatible with the Linux eventfd(2)
 * interface.
 */

#include <sys/param.h>
#include <sys/types.h>
#include <sys/condvar.h>
#include <sys/eventfd.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/kauth.h>
#include <sys/kmem.h>
#include <sys/mutex.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/select.h>
#include <sys/stat.h>
#include <sys/syscallargs.h>
#include <sys/uio.h>

struct eventfd {
	kmutex_t	efd_lock;
	kcondvar_t	efd_read_wait;
	kcondvar_t	efd_write_wait;
	kcondvar_t	efd_restart_wait;
	struct selinfo	efd_read_sel;
	struct selinfo	efd_write_sel;
	eventfd_t	efd_val;
	int64_t		efd_nwaiters;
	bool		efd_restarting;
	bool		efd_has_read_waiters;
	bool		efd_has_write_waiters;
	bool		efd_is_semaphore;

	/*
	 * Information kept for stat(2).
	 */
	struct timespec	efd_btime;	/* time created */
	struct timespec	efd_mtime;	/* last write */
	struct timespec	efd_atime;	/* last read */
};

#define	EVENTFD_MAXVAL	(UINT64_MAX - 1)
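
/*
 * Illustrative userland sketch of the counter semantics described
 * above (not part of this file's build; assumes the eventfd(2)
 * syscall and <sys/eventfd.h>, as on NetBSD 10 and Linux).  Writes
 * add to the count; in the default (non-semaphore) mode, a read
 * returns the whole count and resets it to zero:
 *
 *	#include <sys/eventfd.h>
 *	#include <err.h>
 *	#include <inttypes.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	int
 *	main(void)
 *	{
 *		eventfd_t v;
 *		int fd = eventfd(0, 0);		// count starts at 0
 *
 *		if (fd == -1)
 *			err(1, "eventfd");
 *
 *		v = 3;
 *		write(fd, &v, sizeof(v));	// count = 3
 *		v = 4;
 *		write(fd, &v, sizeof(v));	// count = 7
 *
 *		read(fd, &v, sizeof(v));	// v = 7, count resets to 0
 *		printf("read %" PRIu64 "\n", v);
 *		return 0;
 *	}
 */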

/*
 * eventfd_create:
 *
 *	Create an eventfd object.
 */
static struct eventfd *
eventfd_create(unsigned int const val, int const flags)
{
	struct eventfd * const efd = kmem_zalloc(sizeof(*efd), KM_SLEEP);

	mutex_init(&efd->efd_lock, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&efd->efd_read_wait, "efdread");
	cv_init(&efd->efd_write_wait, "efdwrite");
	cv_init(&efd->efd_restart_wait, "efdrstrt");
	selinit(&efd->efd_read_sel);
	selinit(&efd->efd_write_sel);
	efd->efd_val = val;
	efd->efd_is_semaphore = !!(flags & EFD_SEMAPHORE);
	getnanotime(&efd->efd_btime);

	/* Caller deals with EFD_CLOEXEC and EFD_NONBLOCK. */

	return efd;
}

/*
 * eventfd_destroy:
 *
 *	Destroy an eventfd object.
 */
static void
eventfd_destroy(struct eventfd * const efd)
{

	KASSERT(efd->efd_nwaiters == 0);
	KASSERT(efd->efd_restarting == false);
	KASSERT(efd->efd_has_read_waiters == false);
	KASSERT(efd->efd_has_write_waiters == false);

	cv_destroy(&efd->efd_read_wait);
	cv_destroy(&efd->efd_write_wait);
	cv_destroy(&efd->efd_restart_wait);

	seldestroy(&efd->efd_read_sel);
	seldestroy(&efd->efd_write_sel);

	mutex_destroy(&efd->efd_lock);

	kmem_free(efd, sizeof(*efd));
}

/*
 * eventfd_wait:
 *
 *	Block on an eventfd.  Handles non-blocking, as well as
 *	the restart cases.
 */
static int
eventfd_wait(struct eventfd * const efd, int const fflag, bool const is_write)
{
	kcondvar_t *waitcv;
	int error;

	if (fflag & FNONBLOCK) {
		return EAGAIN;
	}

	/*
	 * We're going to block.  If there is a restart in-progress,
	 * wait for that to complete first.
	 */
	while (efd->efd_restarting) {
		cv_wait(&efd->efd_restart_wait, &efd->efd_lock);
	}

	if (is_write) {
		efd->efd_has_write_waiters = true;
		waitcv = &efd->efd_write_wait;
	} else {
		efd->efd_has_read_waiters = true;
		waitcv = &efd->efd_read_wait;
	}

	efd->efd_nwaiters++;
	KASSERT(efd->efd_nwaiters > 0);
	error = cv_wait_sig(waitcv, &efd->efd_lock);
	efd->efd_nwaiters--;
	KASSERT(efd->efd_nwaiters >= 0);

	/*
	 * If a restart was triggered while we were asleep, we need
	 * to return ERESTART if no other error was returned.  If we
	 * are the last waiter coming out of the restart drain, clear
	 * the condition.
	 */
	if (efd->efd_restarting) {
		if (error == 0) {
			error = ERESTART;
		}
		if (efd->efd_nwaiters == 0) {
			efd->efd_restarting = false;
			cv_broadcast(&efd->efd_restart_wait);
		}
	}

	return error;
}
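
/*
 * The expected calling pattern for eventfd_wait(), as used by the
 * read and write paths below (a sketch, not a new interface): the
 * caller holds efd_lock, re-tests its predicate in a loop, and bails
 * out on any error, which may be EAGAIN (non-blocking descriptor) or
 * ERESTART (a concurrent fo_restart):
 *
 *	mutex_enter(&efd->efd_lock);
 *	while (predicate-not-yet-true) {
 *		if ((error = eventfd_wait(efd, fflag, is_write)) != 0) {
 *			mutex_exit(&efd->efd_lock);
 *			return error;
 *		}
 *	}
 */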

/*
 * eventfd_wake:
 *
 *	Wake LWPs blocked on an eventfd.
 */
static void
eventfd_wake(struct eventfd * const efd, bool const is_write)
{
	kcondvar_t *waitcv = NULL;
	struct selinfo *sel;
	int pollev;

	if (is_write) {
		if (efd->efd_has_read_waiters) {
			waitcv = &efd->efd_read_wait;
			efd->efd_has_read_waiters = false;
		}
		sel = &efd->efd_read_sel;
		pollev = POLLIN | POLLRDNORM;
	} else {
		if (efd->efd_has_write_waiters) {
			waitcv = &efd->efd_write_wait;
			efd->efd_has_write_waiters = false;
		}
		sel = &efd->efd_write_sel;
		pollev = POLLOUT | POLLWRNORM;
	}
	if (waitcv != NULL) {
		cv_broadcast(waitcv);
	}
	selnotify(sel, pollev, NOTE_SUBMIT);
}

/*
 * eventfd file operations
 */

static int
eventfd_fop_read(file_t * const fp, off_t * const offset,
    struct uio * const uio, kauth_cred_t const cred, int const flags)
{
	struct eventfd * const efd = fp->f_eventfd;
	int const fflag = fp->f_flag;
	eventfd_t return_value;
	int error;

	if (uio->uio_resid < sizeof(eventfd_t)) {
		return EINVAL;
	}

	mutex_enter(&efd->efd_lock);

	while (efd->efd_val == 0) {
		if ((error = eventfd_wait(efd, fflag, false)) != 0) {
			mutex_exit(&efd->efd_lock);
			return error;
		}
	}

	if (efd->efd_is_semaphore) {
		return_value = 1;
		efd->efd_val--;
	} else {
		return_value = efd->efd_val;
		efd->efd_val = 0;
	}

	getnanotime(&efd->efd_atime);
	eventfd_wake(efd, false);

	mutex_exit(&efd->efd_lock);

	error = uiomove(&return_value, sizeof(return_value), uio);

	return error;
}

static int
eventfd_fop_write(file_t * const fp, off_t * const offset,
    struct uio * const uio, kauth_cred_t const cred, int const flags)
{
	struct eventfd * const efd = fp->f_eventfd;
	int const fflag = fp->f_flag;
	eventfd_t write_value;
	int error;

	if (uio->uio_resid < sizeof(eventfd_t)) {
		return EINVAL;
	}

	if ((error = uiomove(&write_value, sizeof(write_value), uio)) != 0) {
		return error;
	}

	if (write_value > EVENTFD_MAXVAL) {
		error = EINVAL;
		goto out;
	}

	mutex_enter(&efd->efd_lock);

	KASSERT(efd->efd_val <= EVENTFD_MAXVAL);
	while ((EVENTFD_MAXVAL - efd->efd_val) < write_value) {
		if ((error = eventfd_wait(efd, fflag, true)) != 0) {
			mutex_exit(&efd->efd_lock);
			goto out;
		}
	}

	efd->efd_val += write_value;
	KASSERT(efd->efd_val <= EVENTFD_MAXVAL);

	getnanotime(&efd->efd_mtime);
	eventfd_wake(efd, true);

	mutex_exit(&efd->efd_lock);

 out:
	if (error) {
		/*
		 * Undo the effect of uiomove() so that the error
		 * gets reported correctly; see dofilewrite().
		 */
		uio->uio_resid += sizeof(write_value);
	}
	return error;
}
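
/*
 * Illustrative userland sketch of the I/O error semantics implemented
 * above (not part of this file's build; assumes a NetBSD or Linux
 * userland with eventfd(2)).  Transfers must cover a full eventfd_t,
 * the value UINT64_MAX is rejected, and a non-blocking read of a zero
 * counter fails instead of sleeping:
 *
 *	#include <sys/eventfd.h>
 *	#include <stdint.h>
 *	#include <unistd.h>
 *
 *	uint64_t v;
 *	char small[4];
 *	int fd = eventfd(0, EFD_NONBLOCK);
 *
 *	read(fd, small, sizeof(small));	// fails, EINVAL: buffer < 8 bytes
 *	read(fd, &v, sizeof(v));	// fails, EAGAIN: count is 0
 *	v = UINT64_MAX;			// exceeds EVENTFD_MAXVAL
 *	write(fd, &v, sizeof(v));	// fails, EINVAL
 */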

static int
eventfd_fop_poll(file_t * const fp, int const events)
{
	struct eventfd * const efd = fp->f_eventfd;
	int revents = 0;

	/*
	 * Note that Linux will return POLLERR if the eventfd count
	 * overflows, but that is not possible in the normal read/write
	 * API, only with Linux kernel-internal interfaces.  So, this
	 * implementation never returns POLLERR.
	 *
	 * Also note that the Linux eventfd(2) man page does not
	 * specifically discuss returning POLLRDNORM, but we check
	 * for that event in addition to POLLIN.
	 */

	mutex_enter(&efd->efd_lock);

	if (events & (POLLIN | POLLRDNORM)) {
		if (efd->efd_val != 0) {
			revents |= events & (POLLIN | POLLRDNORM);
		} else {
			selrecord(curlwp, &efd->efd_read_sel);
		}
	}

	if (events & (POLLOUT | POLLWRNORM)) {
		if (efd->efd_val < EVENTFD_MAXVAL) {
			revents |= events & (POLLOUT | POLLWRNORM);
		} else {
			selrecord(curlwp, &efd->efd_write_sel);
		}
	}

	mutex_exit(&efd->efd_lock);

	return revents;
}

static int
eventfd_fop_stat(file_t * const fp, struct stat * const st)
{
	struct eventfd * const efd = fp->f_eventfd;

	memset(st, 0, sizeof(*st));

	mutex_enter(&efd->efd_lock);
	st->st_size = (off_t)efd->efd_val;
	st->st_blksize = sizeof(eventfd_t);
	st->st_mode = S_IFIFO | S_IRUSR | S_IWUSR;
	st->st_blocks = 1;
	st->st_birthtimespec = st->st_ctimespec = efd->efd_btime;
	st->st_atimespec = efd->efd_atime;
	st->st_mtimespec = efd->efd_mtime;
	st->st_uid = kauth_cred_geteuid(fp->f_cred);
	st->st_gid = kauth_cred_getegid(fp->f_cred);
	mutex_exit(&efd->efd_lock);

	return 0;
}

static int
eventfd_fop_close(file_t * const fp)
{
	struct eventfd * const efd = fp->f_eventfd;

	fp->f_eventfd = NULL;
	eventfd_destroy(efd);

	return 0;
}

static void
eventfd_filt_read_detach(struct knote * const kn)
{
	struct eventfd * const efd = ((file_t *)kn->kn_obj)->f_eventfd;

	mutex_enter(&efd->efd_lock);
	KASSERT(kn->kn_hook == efd);
	selremove_knote(&efd->efd_read_sel, kn);
	mutex_exit(&efd->efd_lock);
}

static int
eventfd_filt_read(struct knote * const kn, long const hint)
{
	struct eventfd * const efd = ((file_t *)kn->kn_obj)->f_eventfd;
	int rv;

	if (hint & NOTE_SUBMIT) {
		KASSERT(mutex_owned(&efd->efd_lock));
	} else {
		mutex_enter(&efd->efd_lock);
	}

	kn->kn_data = (int64_t)efd->efd_val;
	rv = (eventfd_t)kn->kn_data > 0;

	if ((hint & NOTE_SUBMIT) == 0) {
		mutex_exit(&efd->efd_lock);
	}

	return rv;
}

static const struct filterops eventfd_read_filterops = {
	.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_detach = eventfd_filt_read_detach,
	.f_event = eventfd_filt_read,
};

static void
eventfd_filt_write_detach(struct knote * const kn)
{
	struct eventfd * const efd = ((file_t *)kn->kn_obj)->f_eventfd;

	mutex_enter(&efd->efd_lock);
	KASSERT(kn->kn_hook == efd);
	selremove_knote(&efd->efd_write_sel, kn);
	mutex_exit(&efd->efd_lock);
}

static int
eventfd_filt_write(struct knote * const kn, long const hint)
{
	struct eventfd * const efd = ((file_t *)kn->kn_obj)->f_eventfd;
	int rv;

	if (hint & NOTE_SUBMIT) {
		KASSERT(mutex_owned(&efd->efd_lock));
	} else {
		mutex_enter(&efd->efd_lock);
	}

	kn->kn_data = (int64_t)efd->efd_val;
	rv = (eventfd_t)kn->kn_data < EVENTFD_MAXVAL;

	if ((hint & NOTE_SUBMIT) == 0) {
		mutex_exit(&efd->efd_lock);
	}

	return rv;
}

static const struct filterops eventfd_write_filterops = {
	.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_detach = eventfd_filt_write_detach,
	.f_event = eventfd_filt_write,
};
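
/*
 * Illustrative userland sketch of watching an eventfd with the read
 * filter registered above (not part of this file's build).  Once the
 * counter becomes non-zero, kevent() reports the descriptor, and
 * kev.data carries the counter value, mirroring eventfd_filt_read():
 *
 *	#include <sys/event.h>
 *	#include <sys/eventfd.h>
 *	#include <err.h>
 *	#include <stdint.h>
 *	#include <unistd.h>
 *
 *	int
 *	main(void)
 *	{
 *		struct kevent kev;
 *		uint64_t v = 1;
 *		int fd = eventfd(0, 0);
 *		int kq = kqueue();
 *
 *		if (fd == -1 || kq == -1)
 *			err(1, "setup");
 *
 *		EV_SET(&kev, fd, EVFILT_READ, EV_ADD, 0, 0, 0);
 *		if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
 *			err(1, "kevent");
 *
 *		write(fd, &v, sizeof(v));	// counter now 1: readable
 *
 *		if (kevent(kq, NULL, 0, &kev, 1, NULL) == 1) {
 *			// kev.data == 1, the current counter value
 *		}
 *		return 0;
 *	}
 */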

static int
eventfd_fop_kqfilter(file_t * const fp, struct knote * const kn)
{
	struct eventfd * const efd = ((file_t *)kn->kn_obj)->f_eventfd;
	struct selinfo *sel;

	switch (kn->kn_filter) {
	case EVFILT_READ:
		sel = &efd->efd_read_sel;
		kn->kn_fop = &eventfd_read_filterops;
		break;

	case EVFILT_WRITE:
		sel = &efd->efd_write_sel;
		kn->kn_fop = &eventfd_write_filterops;
		break;

	default:
		return EINVAL;
	}

	kn->kn_hook = efd;

	mutex_enter(&efd->efd_lock);
	selrecord_knote(sel, kn);
	mutex_exit(&efd->efd_lock);

	return 0;
}

static void
eventfd_fop_restart(file_t * const fp)
{
	struct eventfd * const efd = fp->f_eventfd;

	/*
	 * Unblock blocked reads/writes in order to allow close() to complete.
	 * System calls return ERESTART so that the fd is revalidated.
	 */

	mutex_enter(&efd->efd_lock);

	if (efd->efd_nwaiters != 0) {
		efd->efd_restarting = true;
		if (efd->efd_has_read_waiters) {
			cv_broadcast(&efd->efd_read_wait);
			efd->efd_has_read_waiters = false;
		}
		if (efd->efd_has_write_waiters) {
			cv_broadcast(&efd->efd_write_wait);
			efd->efd_has_write_waiters = false;
		}
	}

	mutex_exit(&efd->efd_lock);
}

static const struct fileops eventfd_fileops = {
	.fo_name = "eventfd",
	.fo_read = eventfd_fop_read,
	.fo_write = eventfd_fop_write,
	.fo_ioctl = fbadop_ioctl,
	.fo_fcntl = fnullop_fcntl,
	.fo_poll = eventfd_fop_poll,
	.fo_stat = eventfd_fop_stat,
	.fo_close = eventfd_fop_close,
	.fo_kqfilter = eventfd_fop_kqfilter,
	.fo_restart = eventfd_fop_restart,
};

/*
 * eventfd(2) system call
 */
int
do_eventfd(struct lwp * const l, unsigned int const val, int const flags,
    register_t *retval)
{
	file_t *fp;
	int fd, error;

	if (flags & ~(EFD_CLOEXEC | EFD_NONBLOCK | EFD_SEMAPHORE)) {
		return EINVAL;
	}

	if ((error = fd_allocfile(&fp, &fd)) != 0) {
		return error;
	}

	fp->f_flag = FREAD | FWRITE;
	if (flags & EFD_NONBLOCK) {
		fp->f_flag |= FNONBLOCK;
	}
	fp->f_type = DTYPE_EVENTFD;
	fp->f_ops = &eventfd_fileops;
	fp->f_eventfd = eventfd_create(val, flags);
	fd_set_exclose(l, fd, !!(flags & EFD_CLOEXEC));
	fd_affix(curproc, fp, fd);

	*retval = fd;
	return 0;
}

int
sys_eventfd(struct lwp *l, const struct sys_eventfd_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(unsigned int) val;
		syscallarg(int) flags;
	} */

	return do_eventfd(l, SCARG(uap, val), SCARG(uap, flags), retval);
}
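
/*
 * Illustrative userland sketch of EFD_SEMAPHORE mode as handled by
 * do_eventfd() and eventfd_fop_read() above (not part of this file's
 * build).  Each read consumes exactly one unit of the count and
 * returns 1; a fourth read here would block (or fail with EAGAIN
 * under EFD_NONBLOCK):
 *
 *	#include <sys/eventfd.h>
 *	#include <err.h>
 *	#include <inttypes.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	int
 *	main(void)
 *	{
 *		eventfd_t v;
 *		int i, fd = eventfd(3, EFD_SEMAPHORE);	// three "tokens"
 *
 *		if (fd == -1)
 *			err(1, "eventfd");
 *
 *		for (i = 0; i < 3; i++) {
 *			read(fd, &v, sizeof(v));
 *			printf("token %d, v = %" PRIu64 "\n", i, v); // v == 1
 *		}
 *		return 0;
 *	}
 */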