1 /* $NetBSD: sys_eventfd.c,v 1.8 2021/11/24 16:35:33 thorpej Exp $ */ 2 3 /*- 4 * Copyright (c) 2020 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Jason R. Thorpe. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 #include <sys/cdefs.h> 33 __KERNEL_RCSID(0, "$NetBSD: sys_eventfd.c,v 1.8 2021/11/24 16:35:33 thorpej Exp $"); 34 35 /* 36 * eventfd 37 * 38 * Eventfd objects present a simple counting object associated with a 39 * file descriptor. Writes and reads to this file descriptor increment 40 * and decrement the count, respectively. When the count is non-zero, 41 * the descriptor is considered "readable", and when less than the max 42 * value (EVENTFD_MAXVAL), is considered "writable". 43 * 44 * This implementation is API compatible with the Linux eventfd(2) 45 * interface. 46 */ 47 48 #include <sys/param.h> 49 #include <sys/types.h> 50 #include <sys/condvar.h> 51 #include <sys/eventfd.h> 52 #include <sys/file.h> 53 #include <sys/filedesc.h> 54 #include <sys/kauth.h> 55 #include <sys/mutex.h> 56 #include <sys/poll.h> 57 #include <sys/proc.h> 58 #include <sys/select.h> 59 #include <sys/stat.h> 60 #include <sys/syscallargs.h> 61 #include <sys/uio.h> 62 63 struct eventfd { 64 kmutex_t efd_lock; 65 kcondvar_t efd_read_wait; 66 kcondvar_t efd_write_wait; 67 struct selinfo efd_read_sel; 68 struct selinfo efd_write_sel; 69 eventfd_t efd_val; 70 int64_t efd_nwaiters; 71 bool efd_restarting; 72 bool efd_has_read_waiters; 73 bool efd_has_write_waiters; 74 bool efd_is_semaphore; 75 76 /* 77 * Information kept for stat(2). 78 */ 79 struct timespec efd_btime; /* time created */ 80 struct timespec efd_mtime; /* last write */ 81 struct timespec efd_atime; /* last read */ 82 }; 83 84 #define EVENTFD_MAXVAL (UINT64_MAX - 1) 85 86 /* 87 * eventfd_create: 88 * 89 * Create an eventfd object. 90 */ 91 static struct eventfd * 92 eventfd_create(unsigned int const val, int const flags) 93 { 94 struct eventfd * const efd = kmem_zalloc(sizeof(*efd), KM_SLEEP); 95 96 mutex_init(&efd->efd_lock, MUTEX_DEFAULT, IPL_NONE); 97 cv_init(&efd->efd_read_wait, "efdread"); 98 cv_init(&efd->efd_write_wait, "efdwrite"); 99 selinit(&efd->efd_read_sel); 100 selinit(&efd->efd_write_sel); 101 efd->efd_val = val; 102 efd->efd_is_semaphore = !!(flags & EFD_SEMAPHORE); 103 getnanotime(&efd->efd_btime); 104 105 /* Caller deals with EFD_CLOEXEC and EFD_NONBLOCK. */ 106 107 return efd; 108 } 109 110 /* 111 * eventfd_destroy: 112 * 113 * Destroy an eventfd object. 114 */ 115 static void 116 eventfd_destroy(struct eventfd * const efd) 117 { 118 119 KASSERT(efd->efd_nwaiters == 0); 120 KASSERT(efd->efd_has_read_waiters == false); 121 KASSERT(efd->efd_has_write_waiters == false); 122 123 cv_destroy(&efd->efd_read_wait); 124 cv_destroy(&efd->efd_write_wait); 125 126 seldestroy(&efd->efd_read_sel); 127 seldestroy(&efd->efd_write_sel); 128 129 mutex_destroy(&efd->efd_lock); 130 131 kmem_free(efd, sizeof(*efd)); 132 } 133 134 /* 135 * eventfd_wait: 136 * 137 * Block on an eventfd. Handles non-blocking, as well as 138 * the restart cases. 139 */ 140 static int 141 eventfd_wait(struct eventfd * const efd, int const fflag, bool const is_write) 142 { 143 kcondvar_t *waitcv; 144 int error; 145 146 if (fflag & FNONBLOCK) { 147 return EAGAIN; 148 } 149 150 /* 151 * We're going to block. Check if we need to return ERESTART. 152 */ 153 if (efd->efd_restarting) { 154 return ERESTART; 155 } 156 157 if (is_write) { 158 efd->efd_has_write_waiters = true; 159 waitcv = &efd->efd_write_wait; 160 } else { 161 efd->efd_has_read_waiters = true; 162 waitcv = &efd->efd_read_wait; 163 } 164 165 efd->efd_nwaiters++; 166 KASSERT(efd->efd_nwaiters > 0); 167 error = cv_wait_sig(waitcv, &efd->efd_lock); 168 efd->efd_nwaiters--; 169 KASSERT(efd->efd_nwaiters >= 0); 170 171 /* 172 * If a restart was triggered while we were asleep, we need 173 * to return ERESTART if no other error was returned. 174 */ 175 if (efd->efd_restarting) { 176 if (error == 0) { 177 error = ERESTART; 178 } 179 } 180 181 return error; 182 } 183 184 /* 185 * eventfd_wake: 186 * 187 * Wake LWPs block on an eventfd. 188 */ 189 static void 190 eventfd_wake(struct eventfd * const efd, bool const is_write) 191 { 192 kcondvar_t *waitcv = NULL; 193 struct selinfo *sel; 194 int pollev; 195 196 if (is_write) { 197 if (efd->efd_has_read_waiters) { 198 waitcv = &efd->efd_read_wait; 199 efd->efd_has_read_waiters = false; 200 } 201 sel = &efd->efd_read_sel; 202 pollev = POLLIN | POLLRDNORM; 203 } else { 204 if (efd->efd_has_write_waiters) { 205 waitcv = &efd->efd_write_wait; 206 efd->efd_has_write_waiters = false; 207 } 208 sel = &efd->efd_write_sel; 209 pollev = POLLOUT | POLLWRNORM; 210 } 211 if (waitcv != NULL) { 212 cv_broadcast(waitcv); 213 } 214 selnotify(sel, pollev, NOTE_SUBMIT); 215 } 216 217 /* 218 * eventfd file operations 219 */ 220 221 static int 222 eventfd_fop_read(file_t * const fp, off_t * const offset, 223 struct uio * const uio, kauth_cred_t const cred, int const flags) 224 { 225 struct eventfd * const efd = fp->f_eventfd; 226 int const fflag = fp->f_flag; 227 eventfd_t return_value; 228 int error; 229 230 if (uio->uio_resid < sizeof(eventfd_t)) { 231 return EINVAL; 232 } 233 234 mutex_enter(&efd->efd_lock); 235 236 while (efd->efd_val == 0) { 237 if ((error = eventfd_wait(efd, fflag, false)) != 0) { 238 mutex_exit(&efd->efd_lock); 239 return error; 240 } 241 } 242 243 if (efd->efd_is_semaphore) { 244 return_value = 1; 245 efd->efd_val--; 246 } else { 247 return_value = efd->efd_val; 248 efd->efd_val = 0; 249 } 250 251 getnanotime(&efd->efd_atime); 252 eventfd_wake(efd, false); 253 254 mutex_exit(&efd->efd_lock); 255 256 error = uiomove(&return_value, sizeof(return_value), uio); 257 258 return error; 259 } 260 261 static int 262 eventfd_fop_write(file_t * const fp, off_t * const offset, 263 struct uio * const uio, kauth_cred_t const cred, int const flags) 264 { 265 struct eventfd * const efd = fp->f_eventfd; 266 int const fflag = fp->f_flag; 267 eventfd_t write_value; 268 int error; 269 270 if (uio->uio_resid < sizeof(eventfd_t)) { 271 return EINVAL; 272 } 273 274 if ((error = uiomove(&write_value, sizeof(write_value), uio)) != 0) { 275 return error; 276 } 277 278 if (write_value > EVENTFD_MAXVAL) { 279 error = EINVAL; 280 goto out; 281 } 282 283 mutex_enter(&efd->efd_lock); 284 285 KASSERT(efd->efd_val <= EVENTFD_MAXVAL); 286 while ((EVENTFD_MAXVAL - efd->efd_val) < write_value) { 287 if ((error = eventfd_wait(efd, fflag, true)) != 0) { 288 mutex_exit(&efd->efd_lock); 289 goto out; 290 } 291 } 292 293 efd->efd_val += write_value; 294 KASSERT(efd->efd_val <= EVENTFD_MAXVAL); 295 296 getnanotime(&efd->efd_mtime); 297 eventfd_wake(efd, true); 298 299 mutex_exit(&efd->efd_lock); 300 301 out: 302 if (error) { 303 /* 304 * Undo the effect of uiomove() so that the error 305 * gets reported correctly; see dofilewrite(). 306 */ 307 uio->uio_resid += sizeof(write_value); 308 } 309 return error; 310 } 311 312 static int 313 eventfd_fop_poll(file_t * const fp, int const events) 314 { 315 struct eventfd * const efd = fp->f_eventfd; 316 int revents = 0; 317 318 /* 319 * Note that Linux will return POLLERR if the eventfd count 320 * overflows, but that is not possible in the normal read/write 321 * API, only with Linux kernel-internal interfaces. So, this 322 * implementation never returns POLLERR. 323 * 324 * Also note that the Linux eventfd(2) man page does not 325 * specifically discuss returning POLLRDNORM, but we check 326 * for that event in addition to POLLIN. 327 */ 328 329 mutex_enter(&efd->efd_lock); 330 331 if (events & (POLLIN | POLLRDNORM)) { 332 if (efd->efd_val != 0) { 333 revents |= events & (POLLIN | POLLRDNORM); 334 } else { 335 selrecord(curlwp, &efd->efd_read_sel); 336 } 337 } 338 339 if (events & (POLLOUT | POLLWRNORM)) { 340 if (efd->efd_val < EVENTFD_MAXVAL) { 341 revents |= events & (POLLOUT | POLLWRNORM); 342 } else { 343 selrecord(curlwp, &efd->efd_write_sel); 344 } 345 } 346 347 mutex_exit(&efd->efd_lock); 348 349 return revents; 350 } 351 352 static int 353 eventfd_fop_stat(file_t * const fp, struct stat * const st) 354 { 355 struct eventfd * const efd = fp->f_eventfd; 356 357 memset(st, 0, sizeof(*st)); 358 359 mutex_enter(&efd->efd_lock); 360 st->st_size = (off_t)efd->efd_val; 361 st->st_blksize = sizeof(eventfd_t); 362 st->st_mode = S_IFIFO | S_IRUSR | S_IWUSR; 363 st->st_blocks = 1; 364 st->st_birthtimespec = st->st_ctimespec = efd->efd_btime; 365 st->st_atimespec = efd->efd_atime; 366 st->st_mtimespec = efd->efd_mtime; 367 st->st_uid = kauth_cred_geteuid(fp->f_cred); 368 st->st_gid = kauth_cred_getegid(fp->f_cred); 369 mutex_exit(&efd->efd_lock); 370 371 return 0; 372 } 373 374 static int 375 eventfd_fop_close(file_t * const fp) 376 { 377 struct eventfd * const efd = fp->f_eventfd; 378 379 fp->f_eventfd = NULL; 380 eventfd_destroy(efd); 381 382 return 0; 383 } 384 385 static void 386 eventfd_filt_read_detach(struct knote * const kn) 387 { 388 struct eventfd * const efd = ((file_t *)kn->kn_obj)->f_eventfd; 389 390 mutex_enter(&efd->efd_lock); 391 KASSERT(kn->kn_hook == efd); 392 selremove_knote(&efd->efd_read_sel, kn); 393 mutex_exit(&efd->efd_lock); 394 } 395 396 static int 397 eventfd_filt_read(struct knote * const kn, long const hint) 398 { 399 struct eventfd * const efd = ((file_t *)kn->kn_obj)->f_eventfd; 400 int rv; 401 402 if (hint & NOTE_SUBMIT) { 403 KASSERT(mutex_owned(&efd->efd_lock)); 404 } else { 405 mutex_enter(&efd->efd_lock); 406 } 407 408 kn->kn_data = (int64_t)efd->efd_val; 409 rv = (eventfd_t)kn->kn_data > 0; 410 411 if ((hint & NOTE_SUBMIT) == 0) { 412 mutex_exit(&efd->efd_lock); 413 } 414 415 return rv; 416 } 417 418 static const struct filterops eventfd_read_filterops = { 419 .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE, 420 .f_detach = eventfd_filt_read_detach, 421 .f_event = eventfd_filt_read, 422 }; 423 424 static void 425 eventfd_filt_write_detach(struct knote * const kn) 426 { 427 struct eventfd * const efd = ((file_t *)kn->kn_obj)->f_eventfd; 428 429 mutex_enter(&efd->efd_lock); 430 KASSERT(kn->kn_hook == efd); 431 selremove_knote(&efd->efd_write_sel, kn); 432 mutex_exit(&efd->efd_lock); 433 } 434 435 static int 436 eventfd_filt_write(struct knote * const kn, long const hint) 437 { 438 struct eventfd * const efd = ((file_t *)kn->kn_obj)->f_eventfd; 439 int rv; 440 441 if (hint & NOTE_SUBMIT) { 442 KASSERT(mutex_owned(&efd->efd_lock)); 443 } else { 444 mutex_enter(&efd->efd_lock); 445 } 446 447 kn->kn_data = (int64_t)efd->efd_val; 448 rv = (eventfd_t)kn->kn_data < EVENTFD_MAXVAL; 449 450 if ((hint & NOTE_SUBMIT) == 0) { 451 mutex_exit(&efd->efd_lock); 452 } 453 454 return rv; 455 } 456 457 static const struct filterops eventfd_write_filterops = { 458 .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE, 459 .f_detach = eventfd_filt_write_detach, 460 .f_event = eventfd_filt_write, 461 }; 462 463 static int 464 eventfd_fop_kqfilter(file_t * const fp, struct knote * const kn) 465 { 466 struct eventfd * const efd = ((file_t *)kn->kn_obj)->f_eventfd; 467 struct selinfo *sel; 468 469 switch (kn->kn_filter) { 470 case EVFILT_READ: 471 sel = &efd->efd_read_sel; 472 kn->kn_fop = &eventfd_read_filterops; 473 break; 474 475 case EVFILT_WRITE: 476 sel = &efd->efd_write_sel; 477 kn->kn_fop = &eventfd_write_filterops; 478 break; 479 480 default: 481 return EINVAL; 482 } 483 484 kn->kn_hook = efd; 485 486 mutex_enter(&efd->efd_lock); 487 selrecord_knote(sel, kn); 488 mutex_exit(&efd->efd_lock); 489 490 return 0; 491 } 492 493 static void 494 eventfd_fop_restart(file_t * const fp) 495 { 496 struct eventfd * const efd = fp->f_eventfd; 497 498 /* 499 * Unblock blocked reads/writes in order to allow close() to complete. 500 * System calls return ERESTART so that the fd is revalidated. 501 */ 502 503 mutex_enter(&efd->efd_lock); 504 505 if (efd->efd_nwaiters != 0) { 506 efd->efd_restarting = true; 507 if (efd->efd_has_read_waiters) { 508 cv_broadcast(&efd->efd_read_wait); 509 efd->efd_has_read_waiters = false; 510 } 511 if (efd->efd_has_write_waiters) { 512 cv_broadcast(&efd->efd_write_wait); 513 efd->efd_has_write_waiters = false; 514 } 515 } 516 517 mutex_exit(&efd->efd_lock); 518 } 519 520 static const struct fileops eventfd_fileops = { 521 .fo_name = "eventfd", 522 .fo_read = eventfd_fop_read, 523 .fo_write = eventfd_fop_write, 524 .fo_ioctl = fbadop_ioctl, 525 .fo_fcntl = fnullop_fcntl, 526 .fo_poll = eventfd_fop_poll, 527 .fo_stat = eventfd_fop_stat, 528 .fo_close = eventfd_fop_close, 529 .fo_kqfilter = eventfd_fop_kqfilter, 530 .fo_restart = eventfd_fop_restart, 531 }; 532 533 /* 534 * eventfd(2) system call 535 */ 536 int 537 do_eventfd(struct lwp * const l, unsigned int const val, int const flags, 538 register_t *retval) 539 { 540 file_t *fp; 541 int fd, error; 542 543 if (flags & ~(EFD_CLOEXEC | EFD_NONBLOCK | EFD_SEMAPHORE)) { 544 return EINVAL; 545 } 546 547 if ((error = fd_allocfile(&fp, &fd)) != 0) { 548 return error; 549 } 550 551 fp->f_flag = FREAD | FWRITE; 552 if (flags & EFD_NONBLOCK) { 553 fp->f_flag |= FNONBLOCK; 554 } 555 fp->f_type = DTYPE_EVENTFD; 556 fp->f_ops = &eventfd_fileops; 557 fp->f_eventfd = eventfd_create(val, flags); 558 fd_set_exclose(l, fd, !!(flags & EFD_CLOEXEC)); 559 fd_affix(curproc, fp, fd); 560 561 *retval = fd; 562 return 0; 563 } 564 565 int 566 sys_eventfd(struct lwp *l, const struct sys_eventfd_args *uap, 567 register_t *retval) 568 { 569 /* { 570 syscallarg(unsigned int) val; 571 syscallarg(int) flags; 572 } */ 573 574 return do_eventfd(l, SCARG(uap, val), SCARG(uap, flags), retval); 575 } 576