1 /* $NetBSD: sys_timerfd.c,v 1.6 2021/09/27 00:40:49 thorpej Exp $ */ 2 3 /*- 4 * Copyright (c) 2020 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Jason R. Thorpe. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 #include <sys/cdefs.h> 33 __KERNEL_RCSID(0, "$NetBSD: sys_timerfd.c,v 1.6 2021/09/27 00:40:49 thorpej Exp $"); 34 35 /* 36 * timerfd 37 * 38 * Timerfd objects are similar to POSIX timers, except they are associated 39 * with a file descriptor rather than a process. Timerfd objects are 40 * created with the timerfd_create(2) system call, similar to timer_create(2). 41 * The timerfd analogues for timer_gettime(2) and timer_settime(2) are 42 * timerfd_gettime(2) and timerfd_settime(2), respectively. 43 * 44 * When a timerfd object's timer fires, an internal counter is incremented. 45 * When this counter is non-zero, the descriptor associated with the timerfd 46 * object is "readable". Note that this is slightly different than the 47 * POSIX timer "overrun" counter, which only increments if the timer fires 48 * again while the notification signal is already pending. Thus, we are 49 * responsible for incrementing the "overrun" counter each time the timerfd 50 * timer fires. 51 * 52 * This implementation is API compatible with the Linux timerfd interface. 53 */ 54 55 #include <sys/param.h> 56 #include <sys/types.h> 57 #include <sys/condvar.h> 58 #include <sys/file.h> 59 #include <sys/filedesc.h> 60 #include <sys/kauth.h> 61 #include <sys/mutex.h> 62 #include <sys/poll.h> 63 #include <sys/proc.h> 64 #include <sys/select.h> 65 #include <sys/stat.h> 66 #include <sys/syscallargs.h> 67 #include <sys/timerfd.h> 68 #include <sys/uio.h> 69 70 /* N.B. all timerfd state is protected by itimer_lock() */ 71 struct timerfd { 72 struct itimer tfd_itimer; 73 kcondvar_t tfd_read_wait; 74 kcondvar_t tfd_restart_wait; 75 struct selinfo tfd_read_sel; 76 int64_t tfd_nwaiters; 77 bool tfd_cancel_on_set; 78 bool tfd_cancelled; 79 bool tfd_restarting; 80 81 /* 82 * Information kept for stat(2). 83 */ 84 struct timespec tfd_btime; /* time created */ 85 struct timespec tfd_mtime; /* last timerfd_settime() */ 86 struct timespec tfd_atime; /* last read */ 87 }; 88 89 static void timerfd_wake(struct timerfd *); 90 91 static inline uint64_t 92 timerfd_fire_count(const struct timerfd * const tfd) 93 { 94 return (unsigned int)tfd->tfd_itimer.it_overruns; 95 } 96 97 static inline bool 98 timerfd_is_readable(const struct timerfd * const tfd) 99 { 100 return tfd->tfd_itimer.it_overruns != 0 || tfd->tfd_cancelled; 101 } 102 103 /* 104 * timerfd_fire: 105 * 106 * Called when the timerfd's timer fires. 107 * 108 * Called from a callout with itimer lock held. 109 */ 110 static void 111 timerfd_fire(struct itimer * const it) 112 { 113 struct timerfd * const tfd = 114 container_of(it, struct timerfd, tfd_itimer); 115 116 it->it_overruns++; 117 timerfd_wake(tfd); 118 } 119 120 /* 121 * timerfd_realtime_changed: 122 * 123 * Called when CLOCK_REALTIME is changed with clock_settime() 124 * or settimeofday(). 125 * 126 * Called with itimer lock held. 127 */ 128 static void 129 timerfd_realtime_changed(struct itimer * const it) 130 { 131 struct timerfd * const tfd = 132 container_of(it, struct timerfd, tfd_itimer); 133 134 /* Should only be called when timer is armed. */ 135 KASSERT(timespecisset(&it->it_time.it_value)); 136 137 if (tfd->tfd_cancel_on_set) { 138 tfd->tfd_cancelled = true; 139 timerfd_wake(tfd); 140 } 141 } 142 143 static const struct itimer_ops timerfd_itimer_monotonic_ops = { 144 .ito_fire = timerfd_fire, 145 }; 146 147 static const struct itimer_ops timerfd_itimer_realtime_ops = { 148 .ito_fire = timerfd_fire, 149 .ito_realtime_changed = timerfd_realtime_changed, 150 }; 151 152 /* 153 * timerfd_create: 154 * 155 * Create a timerfd object. 156 */ 157 static struct timerfd * 158 timerfd_create(clockid_t const clock_id, int const flags) 159 { 160 struct timerfd * const tfd = kmem_zalloc(sizeof(*tfd), KM_SLEEP); 161 162 KASSERT(clock_id == CLOCK_REALTIME || clock_id == CLOCK_MONOTONIC); 163 164 cv_init(&tfd->tfd_read_wait, "tfdread"); 165 cv_init(&tfd->tfd_restart_wait, "tfdrstrt"); 166 selinit(&tfd->tfd_read_sel); 167 getnanotime(&tfd->tfd_btime); 168 169 /* Caller deals with TFD_CLOEXEC and TFD_NONBLOCK. */ 170 171 itimer_lock(); 172 itimer_init(&tfd->tfd_itimer, 173 clock_id == CLOCK_REALTIME ? &timerfd_itimer_realtime_ops 174 : &timerfd_itimer_monotonic_ops, 175 clock_id, NULL); 176 itimer_unlock(); 177 178 return tfd; 179 } 180 181 /* 182 * timerfd_destroy: 183 * 184 * Destroy a timerfd object. 185 */ 186 static void 187 timerfd_destroy(struct timerfd * const tfd) 188 { 189 190 KASSERT(tfd->tfd_nwaiters == 0); 191 KASSERT(tfd->tfd_restarting == false); 192 193 itimer_lock(); 194 itimer_poison(&tfd->tfd_itimer); 195 itimer_fini(&tfd->tfd_itimer); /* drops itimer lock */ 196 197 cv_destroy(&tfd->tfd_read_wait); 198 cv_destroy(&tfd->tfd_restart_wait); 199 200 seldestroy(&tfd->tfd_read_sel); 201 202 kmem_free(tfd, sizeof(*tfd)); 203 } 204 205 /* 206 * timerfd_wait: 207 * 208 * Block on a timerfd. Handles non-blocking, as well as 209 * the restart cases. 210 */ 211 static int 212 timerfd_wait(struct timerfd * const tfd, int const fflag) 213 { 214 extern kmutex_t itimer_mutex; /* XXX */ 215 int error; 216 217 if (fflag & FNONBLOCK) { 218 return EAGAIN; 219 } 220 221 /* 222 * We're going to block. If there is a restart in-progress, 223 * wait for that to complete first. 224 */ 225 while (tfd->tfd_restarting) { 226 cv_wait(&tfd->tfd_restart_wait, &itimer_mutex); 227 } 228 229 tfd->tfd_nwaiters++; 230 KASSERT(tfd->tfd_nwaiters > 0); 231 error = cv_wait_sig(&tfd->tfd_read_wait, &itimer_mutex); 232 tfd->tfd_nwaiters--; 233 KASSERT(tfd->tfd_nwaiters >= 0); 234 235 /* 236 * If a restart was triggered while we were asleep, we need 237 * to return ERESTART if no other error was returned. If we 238 * are the last waiter coming out of the restart drain, clear 239 * the condition. 240 */ 241 if (tfd->tfd_restarting) { 242 if (error == 0) { 243 error = ERESTART; 244 } 245 if (tfd->tfd_nwaiters == 0) { 246 tfd->tfd_restarting = false; 247 cv_broadcast(&tfd->tfd_restart_wait); 248 } 249 } 250 251 return error; 252 } 253 254 /* 255 * timerfd_wake: 256 * 257 * Wake LWPs blocked on a timerfd. 258 */ 259 static void 260 timerfd_wake(struct timerfd * const tfd) 261 { 262 263 if (tfd->tfd_nwaiters) { 264 cv_broadcast(&tfd->tfd_read_wait); 265 } 266 selnotify(&tfd->tfd_read_sel, POLLIN | POLLRDNORM, NOTE_SUBMIT); 267 } 268 269 /* 270 * timerfd file operations 271 */ 272 273 static int 274 timerfd_fop_read(file_t * const fp, off_t * const offset, 275 struct uio * const uio, kauth_cred_t const cred, int const flags) 276 { 277 struct timerfd * const tfd = fp->f_timerfd; 278 struct itimer * const it = &tfd->tfd_itimer; 279 int const fflag = fp->f_flag; 280 uint64_t return_value; 281 int error; 282 283 if (uio->uio_resid < sizeof(uint64_t)) { 284 return EINVAL; 285 } 286 287 itimer_lock(); 288 289 while (!timerfd_is_readable(tfd)) { 290 if ((error = timerfd_wait(tfd, fflag)) != 0) { 291 itimer_unlock(); 292 return error; 293 } 294 } 295 296 if (tfd->tfd_cancelled) { 297 itimer_unlock(); 298 return ECANCELED; 299 } 300 301 return_value = timerfd_fire_count(tfd); 302 it->it_overruns = 0; 303 304 getnanotime(&tfd->tfd_atime); 305 306 itimer_unlock(); 307 308 error = uiomove(&return_value, sizeof(return_value), uio); 309 310 return error; 311 } 312 313 static int 314 timerfd_fop_ioctl(file_t * const fp, unsigned long const cmd, void * const data) 315 { 316 struct timerfd * const tfd = fp->f_timerfd; 317 int error = 0; 318 319 switch (cmd) { 320 case TFD_IOC_SET_TICKS: { 321 const uint64_t * const new_ticksp = data; 322 if (*new_ticksp > INT_MAX) { 323 return EINVAL; 324 } 325 itimer_lock(); 326 tfd->tfd_itimer.it_overruns = (int)*new_ticksp; 327 itimer_unlock(); 328 break; 329 } 330 331 default: 332 error = EPASSTHROUGH; 333 } 334 335 return error; 336 } 337 338 static int 339 timerfd_fop_poll(file_t * const fp, int const events) 340 { 341 struct timerfd * const tfd = fp->f_timerfd; 342 int revents = events & (POLLOUT | POLLWRNORM); 343 344 if (events & (POLLIN | POLLRDNORM)) { 345 itimer_lock(); 346 if (timerfd_is_readable(tfd)) { 347 revents |= events & (POLLIN | POLLRDNORM); 348 } else { 349 selrecord(curlwp, &tfd->tfd_read_sel); 350 } 351 itimer_unlock(); 352 } 353 354 return revents; 355 } 356 357 static int 358 timerfd_fop_stat(file_t * const fp, struct stat * const st) 359 { 360 struct timerfd * const tfd = fp->f_timerfd; 361 362 memset(st, 0, sizeof(*st)); 363 364 itimer_lock(); 365 st->st_size = (off_t)timerfd_fire_count(tfd); 366 st->st_atimespec = tfd->tfd_atime; 367 st->st_mtimespec = tfd->tfd_mtime; 368 itimer_unlock(); 369 370 st->st_blksize = sizeof(uint64_t); 371 st->st_mode = S_IFIFO | S_IRUSR | S_IWUSR; 372 st->st_blocks = 1; 373 st->st_birthtimespec = tfd->tfd_btime; 374 st->st_ctimespec = st->st_mtimespec; 375 st->st_uid = kauth_cred_geteuid(fp->f_cred); 376 st->st_gid = kauth_cred_getegid(fp->f_cred); 377 378 return 0; 379 } 380 381 static int 382 timerfd_fop_close(file_t * const fp) 383 { 384 struct timerfd * const tfd = fp->f_timerfd; 385 386 fp->f_timerfd = NULL; 387 timerfd_destroy(tfd); 388 389 return 0; 390 } 391 392 static void 393 timerfd_filt_read_detach(struct knote * const kn) 394 { 395 struct timerfd * const tfd = ((file_t *)kn->kn_obj)->f_timerfd; 396 397 itimer_lock(); 398 KASSERT(kn->kn_hook == tfd); 399 selremove_knote(&tfd->tfd_read_sel, kn); 400 itimer_unlock(); 401 } 402 403 static int 404 timerfd_filt_read(struct knote * const kn, long const hint) 405 { 406 struct timerfd * const tfd = ((file_t *)kn->kn_obj)->f_timerfd; 407 int rv; 408 409 if (hint & NOTE_SUBMIT) { 410 KASSERT(itimer_lock_held()); 411 } else { 412 itimer_lock(); 413 } 414 415 kn->kn_data = (int64_t)timerfd_fire_count(tfd); 416 rv = kn->kn_data != 0; 417 418 if ((hint & NOTE_SUBMIT) == 0) { 419 itimer_unlock(); 420 } 421 422 return rv; 423 } 424 425 static const struct filterops timerfd_read_filterops = { 426 .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE, 427 .f_detach = timerfd_filt_read_detach, 428 .f_event = timerfd_filt_read, 429 }; 430 431 static int 432 timerfd_fop_kqfilter(file_t * const fp, struct knote * const kn) 433 { 434 struct timerfd * const tfd = ((file_t *)kn->kn_obj)->f_timerfd; 435 struct selinfo *sel; 436 437 switch (kn->kn_filter) { 438 case EVFILT_READ: 439 sel = &tfd->tfd_read_sel; 440 kn->kn_fop = &timerfd_read_filterops; 441 break; 442 443 default: 444 return EINVAL; 445 } 446 447 kn->kn_hook = tfd; 448 449 itimer_lock(); 450 selrecord_knote(sel, kn); 451 itimer_unlock(); 452 453 return 0; 454 } 455 456 static void 457 timerfd_fop_restart(file_t * const fp) 458 { 459 struct timerfd * const tfd = fp->f_timerfd; 460 461 /* 462 * Unblock blocked reads in order to allow close() to complete. 463 * System calls return ERESTART so that the fd is revalidated. 464 */ 465 466 itimer_lock(); 467 468 if (tfd->tfd_nwaiters != 0) { 469 tfd->tfd_restarting = true; 470 cv_broadcast(&tfd->tfd_read_wait); 471 } 472 473 itimer_unlock(); 474 } 475 476 static const struct fileops timerfd_fileops = { 477 .fo_name = "timerfd", 478 .fo_read = timerfd_fop_read, 479 .fo_write = fbadop_write, 480 .fo_ioctl = timerfd_fop_ioctl, 481 .fo_fcntl = fnullop_fcntl, 482 .fo_poll = timerfd_fop_poll, 483 .fo_stat = timerfd_fop_stat, 484 .fo_close = timerfd_fop_close, 485 .fo_kqfilter = timerfd_fop_kqfilter, 486 .fo_restart = timerfd_fop_restart, 487 }; 488 489 /* 490 * timerfd_create(2) system call 491 */ 492 int 493 do_timerfd_create(struct lwp * const l, clockid_t const clock_id, 494 int const flags, register_t *retval) 495 { 496 file_t *fp; 497 int fd, error; 498 499 if (flags & ~(TFD_CLOEXEC | TFD_NONBLOCK)) { 500 return EINVAL; 501 } 502 503 switch (clock_id) { 504 case CLOCK_REALTIME: 505 case CLOCK_MONOTONIC: 506 /* allowed */ 507 break; 508 509 default: 510 return EINVAL; 511 } 512 513 if ((error = fd_allocfile(&fp, &fd)) != 0) { 514 return error; 515 } 516 517 fp->f_flag = FREAD; 518 if (flags & TFD_NONBLOCK) { 519 fp->f_flag |= FNONBLOCK; 520 } 521 fp->f_type = DTYPE_TIMERFD; 522 fp->f_ops = &timerfd_fileops; 523 fp->f_timerfd = timerfd_create(clock_id, flags); 524 fd_set_exclose(l, fd, !!(flags & TFD_CLOEXEC)); 525 fd_affix(curproc, fp, fd); 526 527 *retval = fd; 528 return 0; 529 } 530 531 int 532 sys_timerfd_create(struct lwp *l, const struct sys_timerfd_create_args *uap, 533 register_t *retval) 534 { 535 /* { 536 syscallarg(clockid_t) clock_id; 537 syscallarg(int) flags; 538 } */ 539 540 return do_timerfd_create(l, SCARG(uap, clock_id), SCARG(uap, flags), 541 retval); 542 } 543 544 /* 545 * timerfd_gettime(2) system call. 546 */ 547 int 548 do_timerfd_gettime(struct lwp *l, int fd, struct itimerspec *curr_value, 549 register_t *retval) 550 { 551 file_t *fp; 552 553 if ((fp = fd_getfile(fd)) == NULL) { 554 return EBADF; 555 } 556 557 if (fp->f_ops != &timerfd_fileops) { 558 fd_putfile(fd); 559 return EINVAL; 560 } 561 562 struct timerfd * const tfd = fp->f_timerfd; 563 itimer_lock(); 564 itimer_gettime(&tfd->tfd_itimer, curr_value); 565 itimer_unlock(); 566 567 fd_putfile(fd); 568 return 0; 569 } 570 571 int 572 sys_timerfd_gettime(struct lwp *l, const struct sys_timerfd_gettime_args *uap, 573 register_t *retval) 574 { 575 /* { 576 syscallarg(int) fd; 577 syscallarg(struct itimerspec *) curr_value; 578 } */ 579 580 struct itimerspec oits; 581 int error; 582 583 error = do_timerfd_gettime(l, SCARG(uap, fd), &oits, retval); 584 if (error == 0) { 585 error = copyout(&oits, SCARG(uap, curr_value), sizeof(oits)); 586 } 587 return error; 588 } 589 590 /* 591 * timerfd_settime(2) system call. 592 */ 593 int 594 do_timerfd_settime(struct lwp *l, int fd, int flags, 595 const struct itimerspec *new_value, struct itimerspec *old_value, 596 register_t *retval) 597 { 598 file_t *fp; 599 int error; 600 601 if (flags & ~(TFD_TIMER_ABSTIME | TFD_TIMER_CANCEL_ON_SET)) { 602 return EINVAL; 603 } 604 605 if ((fp = fd_getfile(fd)) == NULL) { 606 return EBADF; 607 } 608 609 if (fp->f_ops != &timerfd_fileops) { 610 fd_putfile(fd); 611 return EINVAL; 612 } 613 614 struct timerfd * const tfd = fp->f_timerfd; 615 struct itimer * const it = &tfd->tfd_itimer; 616 617 itimer_lock(); 618 619 restart: 620 if (old_value != NULL) { 621 *old_value = it->it_time; 622 } 623 it->it_time = *new_value; 624 625 /* 626 * If we've been passed a relative value, convert it to an 627 * absolute, as that's what the itimer facility expects for 628 * non-virtual timers. Also ensure that this doesn't set it 629 * to zero or lets it go negative. 630 * XXXJRT re-factor. 631 */ 632 if (timespecisset(&it->it_time.it_value) && 633 (flags & TFD_TIMER_ABSTIME) == 0) { 634 struct timespec now; 635 if (it->it_clockid == CLOCK_REALTIME) { 636 getnanotime(&now); 637 } else { /* CLOCK_MONOTONIC */ 638 getnanouptime(&now); 639 } 640 timespecadd(&it->it_time.it_value, &now, 641 &it->it_time.it_value); 642 } 643 644 error = itimer_settime(it); 645 if (error == ERESTART) { 646 goto restart; 647 } 648 KASSERT(error == 0); 649 650 /* Reset the expirations counter. */ 651 it->it_overruns = 0; 652 653 if (it->it_clockid == CLOCK_REALTIME) { 654 tfd->tfd_cancelled = false; 655 tfd->tfd_cancel_on_set = !!(flags & TFD_TIMER_CANCEL_ON_SET); 656 } 657 658 getnanotime(&tfd->tfd_mtime); 659 itimer_unlock(); 660 661 fd_putfile(fd); 662 return error; 663 } 664 665 int 666 sys_timerfd_settime(struct lwp *l, const struct sys_timerfd_settime_args *uap, 667 register_t *retval) 668 { 669 /* { 670 syscallarg(int) fd; 671 syscallarg(int) flags; 672 syscallarg(const struct itimerspec *) new_value; 673 syscallarg(struct itimerspec *) old_value; 674 } */ 675 676 struct itimerspec nits, oits, *oitsp = NULL; 677 int error; 678 679 error = copyin(SCARG(uap, new_value), &nits, sizeof(nits)); 680 if (error) { 681 return error; 682 } 683 684 if (SCARG(uap, old_value) != NULL) { 685 oitsp = &oits; 686 } 687 688 error = do_timerfd_settime(l, SCARG(uap, fd), SCARG(uap, flags), 689 &nits, oitsp, retval); 690 if (error == 0 && oitsp != NULL) { 691 error = copyout(oitsp, SCARG(uap, old_value), sizeof(*oitsp)); 692 } 693 return error; 694 } 695