1 /* $NetBSD: sys_timerfd.c,v 1.11 2024/12/19 23:50:22 riastradh Exp $ */ 2 3 /*- 4 * Copyright (c) 2020 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Jason R. Thorpe. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 #include <sys/cdefs.h> 33 __KERNEL_RCSID(0, "$NetBSD: sys_timerfd.c,v 1.11 2024/12/19 23:50:22 riastradh Exp $"); 34 35 /* 36 * timerfd 37 * 38 * Timerfd objects are similar to POSIX timers, except they are associated 39 * with a file descriptor rather than a process. Timerfd objects are 40 * created with the timerfd_create(2) system call, similar to timer_create(2). 41 * The timerfd analogues for timer_gettime(2) and timer_settime(2) are 42 * timerfd_gettime(2) and timerfd_settime(2), respectively. 43 * 44 * When a timerfd object's timer fires, an internal counter is incremented. 45 * When this counter is non-zero, the descriptor associated with the timerfd 46 * object is "readable". Note that this is slightly different than the 47 * POSIX timer "overrun" counter, which only increments if the timer fires 48 * again while the notification signal is already pending. Thus, we are 49 * responsible for incrementing the "overrun" counter each time the timerfd 50 * timer fires. 51 * 52 * This implementation is API compatible with the Linux timerfd interface. 53 */ 54 55 #include <sys/param.h> 56 #include <sys/types.h> 57 #include <sys/condvar.h> 58 #include <sys/file.h> 59 #include <sys/filedesc.h> 60 #include <sys/kauth.h> 61 #include <sys/mutex.h> 62 #include <sys/poll.h> 63 #include <sys/proc.h> 64 #include <sys/select.h> 65 #include <sys/stat.h> 66 #include <sys/syscallargs.h> 67 #include <sys/timerfd.h> 68 #include <sys/uio.h> 69 70 /* N.B. all timerfd state is protected by itimer_lock() */ 71 struct timerfd { 72 struct itimer tfd_itimer; 73 kcondvar_t tfd_read_wait; 74 struct selinfo tfd_read_sel; 75 int64_t tfd_nwaiters; 76 bool tfd_cancel_on_set; 77 bool tfd_cancelled; 78 bool tfd_restarting; 79 80 /* 81 * Information kept for stat(2). 82 */ 83 struct timespec tfd_btime; /* time created */ 84 struct timespec tfd_mtime; /* last timerfd_settime() */ 85 struct timespec tfd_atime; /* last read */ 86 }; 87 88 static void timerfd_wake(struct timerfd *); 89 90 static inline uint64_t 91 timerfd_fire_count(const struct timerfd * const tfd) 92 { 93 return (unsigned int)tfd->tfd_itimer.it_overruns; 94 } 95 96 static inline bool 97 timerfd_is_readable(const struct timerfd * const tfd) 98 { 99 return tfd->tfd_itimer.it_overruns != 0 || tfd->tfd_cancelled; 100 } 101 102 /* 103 * timerfd_fire: 104 * 105 * Called when the timerfd's timer fires. 106 * 107 * Called from a callout with itimer lock held. 108 */ 109 static void 110 timerfd_fire(struct itimer * const it) 111 { 112 struct timerfd * const tfd = 113 container_of(it, struct timerfd, tfd_itimer); 114 115 it->it_overruns++; 116 timerfd_wake(tfd); 117 } 118 119 /* 120 * timerfd_realtime_changed: 121 * 122 * Called when CLOCK_REALTIME is changed with clock_settime() 123 * or settimeofday(). 124 * 125 * Called with itimer lock held. 126 */ 127 static void 128 timerfd_realtime_changed(struct itimer * const it) 129 { 130 struct timerfd * const tfd = 131 container_of(it, struct timerfd, tfd_itimer); 132 133 /* Should only be called when timer is armed. */ 134 KASSERT(timespecisset(&it->it_time.it_value)); 135 136 if (tfd->tfd_cancel_on_set) { 137 tfd->tfd_cancelled = true; 138 timerfd_wake(tfd); 139 } 140 } 141 142 static const struct itimer_ops timerfd_itimer_monotonic_ops = { 143 .ito_fire = timerfd_fire, 144 }; 145 146 static const struct itimer_ops timerfd_itimer_realtime_ops = { 147 .ito_fire = timerfd_fire, 148 .ito_realtime_changed = timerfd_realtime_changed, 149 }; 150 151 /* 152 * timerfd_create: 153 * 154 * Create a timerfd object. 155 */ 156 static struct timerfd * 157 timerfd_create(clockid_t const clock_id, int const flags) 158 { 159 struct timerfd * const tfd = kmem_zalloc(sizeof(*tfd), KM_SLEEP); 160 161 KASSERT(clock_id == CLOCK_REALTIME || clock_id == CLOCK_MONOTONIC); 162 163 cv_init(&tfd->tfd_read_wait, "tfdread"); 164 selinit(&tfd->tfd_read_sel); 165 getnanotime(&tfd->tfd_btime); 166 167 /* Caller deals with TFD_CLOEXEC and TFD_NONBLOCK. */ 168 169 itimer_lock(); 170 itimer_init(&tfd->tfd_itimer, 171 clock_id == CLOCK_REALTIME ? &timerfd_itimer_realtime_ops 172 : &timerfd_itimer_monotonic_ops, 173 clock_id, NULL); 174 itimer_unlock(); 175 176 return tfd; 177 } 178 179 /* 180 * timerfd_destroy: 181 * 182 * Destroy a timerfd object. 183 */ 184 static void 185 timerfd_destroy(struct timerfd * const tfd) 186 { 187 188 KASSERT(tfd->tfd_nwaiters == 0); 189 190 itimer_lock(); 191 itimer_poison(&tfd->tfd_itimer); 192 itimer_fini(&tfd->tfd_itimer); /* drops itimer lock */ 193 194 cv_destroy(&tfd->tfd_read_wait); 195 196 seldestroy(&tfd->tfd_read_sel); 197 198 kmem_free(tfd, sizeof(*tfd)); 199 } 200 201 /* 202 * timerfd_wait: 203 * 204 * Block on a timerfd. Handles non-blocking, as well as 205 * the restart cases. 206 */ 207 static int 208 timerfd_wait(struct timerfd * const tfd, int const fflag) 209 { 210 extern kmutex_t itimer_mutex; /* XXX */ 211 int error; 212 213 if (fflag & FNONBLOCK) { 214 return EAGAIN; 215 } 216 217 /* 218 * We're going to block. Check if we need to return ERESTART. 219 */ 220 if (tfd->tfd_restarting) { 221 return ERESTART; 222 } 223 224 tfd->tfd_nwaiters++; 225 KASSERT(tfd->tfd_nwaiters > 0); 226 error = cv_wait_sig(&tfd->tfd_read_wait, &itimer_mutex); 227 tfd->tfd_nwaiters--; 228 KASSERT(tfd->tfd_nwaiters >= 0); 229 230 /* 231 * If a restart was triggered while we were asleep, we need 232 * to return ERESTART if no other error was returned. 233 */ 234 if (tfd->tfd_restarting) { 235 if (error == 0) { 236 error = ERESTART; 237 } 238 } 239 240 return error; 241 } 242 243 /* 244 * timerfd_wake: 245 * 246 * Wake LWPs blocked on a timerfd. 247 */ 248 static void 249 timerfd_wake(struct timerfd * const tfd) 250 { 251 252 if (tfd->tfd_nwaiters) { 253 cv_broadcast(&tfd->tfd_read_wait); 254 } 255 selnotify(&tfd->tfd_read_sel, POLLIN | POLLRDNORM, NOTE_SUBMIT); 256 } 257 258 /* 259 * timerfd file operations 260 */ 261 262 static int 263 timerfd_fop_read(file_t * const fp, off_t * const offset, 264 struct uio * const uio, kauth_cred_t const cred, int const flags) 265 { 266 struct timerfd * const tfd = fp->f_timerfd; 267 struct itimer * const it = &tfd->tfd_itimer; 268 int const fflag = fp->f_flag; 269 uint64_t return_value; 270 int error; 271 272 if (uio->uio_resid < sizeof(uint64_t)) { 273 return EINVAL; 274 } 275 276 itimer_lock(); 277 278 while (!timerfd_is_readable(tfd)) { 279 if ((error = timerfd_wait(tfd, fflag)) != 0) { 280 itimer_unlock(); 281 return error; 282 } 283 } 284 285 if (tfd->tfd_cancelled) { 286 itimer_unlock(); 287 return ECANCELED; 288 } 289 290 return_value = timerfd_fire_count(tfd); 291 it->it_overruns = 0; 292 293 getnanotime(&tfd->tfd_atime); 294 295 itimer_unlock(); 296 297 error = uiomove(&return_value, sizeof(return_value), uio); 298 299 return error; 300 } 301 302 static int 303 timerfd_fop_ioctl(file_t * const fp, unsigned long const cmd, void * const data) 304 { 305 struct timerfd * const tfd = fp->f_timerfd; 306 int error = 0; 307 308 switch (cmd) { 309 case FIONBIO: 310 break; 311 312 case FIONREAD: 313 itimer_lock(); 314 *(int *)data = timerfd_is_readable(tfd) ? sizeof(uint64_t) : 0; 315 itimer_unlock(); 316 break; 317 318 case TFD_IOC_SET_TICKS: { 319 const uint64_t * const new_ticksp = data; 320 if (*new_ticksp > INT_MAX) { 321 return EINVAL; 322 } 323 itimer_lock(); 324 tfd->tfd_itimer.it_overruns = (int)*new_ticksp; 325 itimer_unlock(); 326 break; 327 } 328 329 default: 330 error = EPASSTHROUGH; 331 } 332 333 return error; 334 } 335 336 static int 337 timerfd_fop_poll(file_t * const fp, int const events) 338 { 339 struct timerfd * const tfd = fp->f_timerfd; 340 int revents = 0; 341 342 if (events & (POLLIN | POLLRDNORM)) { 343 itimer_lock(); 344 if (timerfd_is_readable(tfd)) { 345 revents |= events & (POLLIN | POLLRDNORM); 346 } else { 347 selrecord(curlwp, &tfd->tfd_read_sel); 348 } 349 itimer_unlock(); 350 } 351 352 return revents; 353 } 354 355 static int 356 timerfd_fop_stat(file_t * const fp, struct stat * const st) 357 { 358 struct timerfd * const tfd = fp->f_timerfd; 359 360 memset(st, 0, sizeof(*st)); 361 362 itimer_lock(); 363 st->st_size = (off_t)timerfd_fire_count(tfd); 364 st->st_atimespec = tfd->tfd_atime; 365 st->st_mtimespec = tfd->tfd_mtime; 366 itimer_unlock(); 367 368 st->st_blksize = sizeof(uint64_t); 369 st->st_mode = S_IFIFO | S_IRUSR | S_IWUSR; 370 st->st_blocks = 1; 371 st->st_birthtimespec = tfd->tfd_btime; 372 st->st_ctimespec = st->st_mtimespec; 373 st->st_uid = kauth_cred_geteuid(fp->f_cred); 374 st->st_gid = kauth_cred_getegid(fp->f_cred); 375 376 return 0; 377 } 378 379 static int 380 timerfd_fop_close(file_t * const fp) 381 { 382 struct timerfd * const tfd = fp->f_timerfd; 383 384 fp->f_timerfd = NULL; 385 timerfd_destroy(tfd); 386 387 return 0; 388 } 389 390 static void 391 timerfd_filt_read_detach(struct knote * const kn) 392 { 393 struct timerfd * const tfd = ((file_t *)kn->kn_obj)->f_timerfd; 394 395 itimer_lock(); 396 KASSERT(kn->kn_hook == tfd); 397 selremove_knote(&tfd->tfd_read_sel, kn); 398 itimer_unlock(); 399 } 400 401 static int 402 timerfd_filt_read(struct knote * const kn, long const hint) 403 { 404 struct timerfd * const tfd = ((file_t *)kn->kn_obj)->f_timerfd; 405 int rv; 406 407 if (hint & NOTE_SUBMIT) { 408 KASSERT(itimer_lock_held()); 409 } else { 410 itimer_lock(); 411 } 412 413 kn->kn_data = (int64_t)timerfd_fire_count(tfd); 414 rv = kn->kn_data != 0; 415 416 if ((hint & NOTE_SUBMIT) == 0) { 417 itimer_unlock(); 418 } 419 420 return rv; 421 } 422 423 static const struct filterops timerfd_read_filterops = { 424 .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE, 425 .f_detach = timerfd_filt_read_detach, 426 .f_event = timerfd_filt_read, 427 }; 428 429 static int 430 timerfd_fop_kqfilter(file_t * const fp, struct knote * const kn) 431 { 432 struct timerfd * const tfd = ((file_t *)kn->kn_obj)->f_timerfd; 433 struct selinfo *sel; 434 435 switch (kn->kn_filter) { 436 case EVFILT_READ: 437 sel = &tfd->tfd_read_sel; 438 kn->kn_fop = &timerfd_read_filterops; 439 break; 440 441 default: 442 return EINVAL; 443 } 444 445 kn->kn_hook = tfd; 446 447 itimer_lock(); 448 selrecord_knote(sel, kn); 449 itimer_unlock(); 450 451 return 0; 452 } 453 454 static void 455 timerfd_fop_restart(file_t * const fp) 456 { 457 struct timerfd * const tfd = fp->f_timerfd; 458 459 /* 460 * Unblock blocked reads in order to allow close() to complete. 461 * System calls return ERESTART so that the fd is revalidated. 462 */ 463 464 itimer_lock(); 465 466 if (tfd->tfd_nwaiters != 0) { 467 tfd->tfd_restarting = true; 468 cv_broadcast(&tfd->tfd_read_wait); 469 } 470 471 itimer_unlock(); 472 } 473 474 static const struct fileops timerfd_fileops = { 475 .fo_name = "timerfd", 476 .fo_read = timerfd_fop_read, 477 .fo_write = fbadop_write, 478 .fo_ioctl = timerfd_fop_ioctl, 479 .fo_fcntl = fnullop_fcntl, 480 .fo_poll = timerfd_fop_poll, 481 .fo_stat = timerfd_fop_stat, 482 .fo_close = timerfd_fop_close, 483 .fo_kqfilter = timerfd_fop_kqfilter, 484 .fo_restart = timerfd_fop_restart, 485 }; 486 487 /* 488 * timerfd_create(2) system call 489 */ 490 int 491 do_timerfd_create(struct lwp * const l, clockid_t const clock_id, 492 int const flags, register_t *retval) 493 { 494 file_t *fp; 495 int fd, error; 496 497 if (flags & ~(TFD_CLOEXEC | TFD_NONBLOCK)) { 498 return EINVAL; 499 } 500 501 switch (clock_id) { 502 case CLOCK_REALTIME: 503 case CLOCK_MONOTONIC: 504 /* allowed */ 505 break; 506 507 default: 508 return EINVAL; 509 } 510 511 if ((error = fd_allocfile(&fp, &fd)) != 0) { 512 return error; 513 } 514 515 fp->f_flag = FREAD; 516 if (flags & TFD_NONBLOCK) { 517 fp->f_flag |= FNONBLOCK; 518 } 519 fp->f_type = DTYPE_TIMERFD; 520 fp->f_ops = &timerfd_fileops; 521 fp->f_timerfd = timerfd_create(clock_id, flags); 522 fd_set_exclose(l, fd, !!(flags & TFD_CLOEXEC)); 523 fd_affix(curproc, fp, fd); 524 525 *retval = fd; 526 return 0; 527 } 528 529 int 530 sys_timerfd_create(struct lwp *l, const struct sys_timerfd_create_args *uap, 531 register_t *retval) 532 { 533 /* { 534 syscallarg(clockid_t) clock_id; 535 syscallarg(int) flags; 536 } */ 537 538 return do_timerfd_create(l, SCARG(uap, clock_id), SCARG(uap, flags), 539 retval); 540 } 541 542 /* 543 * timerfd_gettime(2) system call. 544 */ 545 int 546 do_timerfd_gettime(struct lwp *l, int fd, struct itimerspec *curr_value, 547 register_t *retval) 548 { 549 file_t *fp; 550 551 if ((fp = fd_getfile(fd)) == NULL) { 552 return EBADF; 553 } 554 555 if (fp->f_ops != &timerfd_fileops) { 556 fd_putfile(fd); 557 return EINVAL; 558 } 559 560 struct timerfd * const tfd = fp->f_timerfd; 561 itimer_lock(); 562 itimer_gettime(&tfd->tfd_itimer, curr_value); 563 itimer_unlock(); 564 565 fd_putfile(fd); 566 return 0; 567 } 568 569 int 570 sys_timerfd_gettime(struct lwp *l, const struct sys_timerfd_gettime_args *uap, 571 register_t *retval) 572 { 573 /* { 574 syscallarg(int) fd; 575 syscallarg(struct itimerspec *) curr_value; 576 } */ 577 578 struct itimerspec oits; 579 int error; 580 581 error = do_timerfd_gettime(l, SCARG(uap, fd), &oits, retval); 582 if (error == 0) { 583 error = copyout(&oits, SCARG(uap, curr_value), sizeof(oits)); 584 } 585 return error; 586 } 587 588 /* 589 * timerfd_settime(2) system call. 590 */ 591 int 592 do_timerfd_settime(struct lwp *l, int fd, int flags, 593 const struct itimerspec *new_value, struct itimerspec *old_value, 594 register_t *retval) 595 { 596 struct itimerspec value = *new_value; 597 file_t *fp; 598 int error; 599 600 if (flags & ~(TFD_TIMER_ABSTIME | TFD_TIMER_CANCEL_ON_SET)) { 601 return EINVAL; 602 } 603 if (itimespecfix(&value.it_value) != 0 || 604 itimespecfix(&value.it_interval) != 0) { 605 return EINVAL; 606 } 607 608 if ((fp = fd_getfile(fd)) == NULL) { 609 return EBADF; 610 } 611 612 if (fp->f_ops != &timerfd_fileops) { 613 fd_putfile(fd); 614 return EINVAL; 615 } 616 617 struct timerfd * const tfd = fp->f_timerfd; 618 struct itimer * const it = &tfd->tfd_itimer; 619 620 itimer_lock(); 621 622 restart: 623 if (old_value != NULL) { 624 itimer_gettime(it, old_value); 625 } 626 it->it_time = value; 627 628 /* 629 * If we've been passed a relative value, convert it to an 630 * absolute, as that's what the itimer facility expects for 631 * non-virtual timers. Also ensure that this doesn't set it 632 * to zero or lets it go negative. 633 * XXXJRT re-factor. 634 */ 635 if (timespecisset(&it->it_time.it_value) && 636 (flags & TFD_TIMER_ABSTIME) == 0) { 637 struct timespec now; 638 if (it->it_clockid == CLOCK_REALTIME) { 639 getnanotime(&now); 640 } else { /* CLOCK_MONOTONIC */ 641 getnanouptime(&now); 642 } 643 timespecadd(&it->it_time.it_value, &now, 644 &it->it_time.it_value); 645 } 646 647 error = itimer_settime(it); 648 if (error == ERESTART) { 649 goto restart; 650 } 651 KASSERT(error == 0); 652 653 /* Reset the expirations counter. */ 654 it->it_overruns = 0; 655 656 if (it->it_clockid == CLOCK_REALTIME) { 657 tfd->tfd_cancelled = false; 658 tfd->tfd_cancel_on_set = !!(flags & TFD_TIMER_CANCEL_ON_SET); 659 } 660 661 getnanotime(&tfd->tfd_mtime); 662 itimer_unlock(); 663 664 fd_putfile(fd); 665 return error; 666 } 667 668 int 669 sys_timerfd_settime(struct lwp *l, const struct sys_timerfd_settime_args *uap, 670 register_t *retval) 671 { 672 /* { 673 syscallarg(int) fd; 674 syscallarg(int) flags; 675 syscallarg(const struct itimerspec *) new_value; 676 syscallarg(struct itimerspec *) old_value; 677 } */ 678 679 struct itimerspec nits, oits, *oitsp = NULL; 680 int error; 681 682 error = copyin(SCARG(uap, new_value), &nits, sizeof(nits)); 683 if (error) { 684 return error; 685 } 686 687 if (SCARG(uap, old_value) != NULL) { 688 oitsp = &oits; 689 } 690 691 error = do_timerfd_settime(l, SCARG(uap, fd), SCARG(uap, flags), 692 &nits, oitsp, retval); 693 if (error == 0 && oitsp != NULL) { 694 error = copyout(oitsp, SCARG(uap, old_value), sizeof(*oitsp)); 695 } 696 return error; 697 } 698