/*	$NetBSD: sys_epoll.c,v 1.4 2023/07/30 18:31:13 christos Exp $	*/

/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2007 Roman Divacky
 * Copyright (c) 2014 Dmitry Chagin <dchagin@FreeBSD.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sys_epoll.c,v 1.4 2023/07/30 18:31:13 christos Exp $");

#include <sys/param.h>
#include <sys/types.h>
#include <sys/bitops.h>
#include <sys/epoll.h>
#include <sys/event.h>
#include <sys/eventvar.h>
#include <sys/errno.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/fcntl.h>
#include <sys/kmem.h>
#include <sys/proc.h>
#include <sys/signal.h>
#include <sys/signalvar.h>
#include <sys/vnode.h>

#include <sys/syscallargs.h>

#define	EPOLL_MAX_DEPTH	5

#define	EPOLL_EVRD	(EPOLLIN|EPOLLRDNORM)
#define	EPOLL_EVWR	(EPOLLOUT|EPOLLWRNORM)
#define	EPOLL_EVSUP	(EPOLLET|EPOLLONESHOT|EPOLLHUP|EPOLLERR|EPOLLPRI \
			|EPOLL_EVRD|EPOLL_EVWR|EPOLLRDHUP)

#define	kext_data	ext[0]
#define	kext_epfd	ext[1]
#define	kext_fd		ext[2]
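
/*
 * The kext_* macros name the spare kevent extension slots (ext[])
 * used to carry epoll bookkeeping: the user-supplied data word, the
 * epoll fd doing the watching, and the watched fd.  The latter two
 * are what lets epoll_recover_watch_tree() below rebuild the watch
 * graph from the knotes.
 */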

#if DEBUG
#define	DPRINTF(x) uprintf x
#else
#define	DPRINTF(x) __nothing
#endif

struct epoll_edge {
	int epfd;
	int fd;
};

__BITMAP_TYPE(epoll_seen, char, 1);

static int	epoll_to_kevent(int, int, struct epoll_event *, struct kevent *,
    int *);
static void	kevent_to_epoll(struct kevent *, struct epoll_event *);
static int	epoll_kev_put_events(void *, struct kevent *, struct kevent *,
    size_t, int);
static int	epoll_kev_fetch_changes(void *, const struct kevent *,
    struct kevent *, size_t, int);
static int	epoll_kev_fetch_timeout(const void *, void *, size_t);
static int	epoll_register_kevent(register_t *, int, int, int,
    unsigned int);
static int	epoll_fd_registered(register_t *, int, int);
static int	epoll_delete_all_events(register_t *, int, int);
static int	epoll_recover_watch_tree(struct epoll_edge *, size_t, size_t);
static int	epoll_dfs(struct epoll_edge *, size_t, struct epoll_seen *,
    size_t, int, int);
static int	epoll_check_loop_and_depth(struct lwp *, int, int);

/*
 * epoll_create1(2).  Parse the flags and then create a kqueue instance.
 */
int
sys_epoll_create1(struct lwp *l, const struct sys_epoll_create1_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(int) flags;
	} */
	struct sys_kqueue1_args kqa;

	if ((SCARG(uap, flags) & ~(EPOLL_CLOEXEC)) != 0)
		return EINVAL;

	SCARG(&kqa, flags) = 0;
	if (SCARG(uap, flags) & EPOLL_CLOEXEC)
		SCARG(&kqa, flags) |= O_CLOEXEC;

	return sys_kqueue1(l, &kqa, retval);
}

/*
 * Structure conversion from an epoll event to one or more kevents.
 */
static int
epoll_to_kevent(int epfd, int fd, struct epoll_event *l_event,
    struct kevent *kevent, int *nkevents)
{
	uint32_t levents = l_event->events;
	uint32_t kev_flags = EV_ADD | EV_ENABLE;

	/* flags related to how the event is registered */
	if ((levents & EPOLLONESHOT) != 0)
		kev_flags |= EV_DISPATCH;
	if ((levents & EPOLLET) != 0)
		kev_flags |= EV_CLEAR;
	if ((levents & EPOLLERR) != 0)
		kev_flags |= EV_ERROR;
	if ((levents & EPOLLRDHUP) != 0)
		kev_flags |= EV_EOF;

	/* flags related to what event is registered */
	if ((levents & EPOLL_EVRD) != 0) {
		EV_SET(kevent, fd, EVFILT_READ, kev_flags, 0, 0, 0);
		kevent->kext_data = l_event->data;
		kevent->kext_epfd = epfd;
		kevent->kext_fd = fd;
		++kevent;
		++(*nkevents);
	}
	if ((levents & EPOLL_EVWR) != 0) {
		EV_SET(kevent, fd, EVFILT_WRITE, kev_flags, 0, 0, 0);
		kevent->kext_data = l_event->data;
		kevent->kext_epfd = epfd;
		kevent->kext_fd = fd;
		++kevent;
		++(*nkevents);
	}
	/* zero event mask is legal */
	if ((levents & (EPOLL_EVRD | EPOLL_EVWR)) == 0) {
		EV_SET(kevent++, fd, EVFILT_READ, EV_ADD|EV_DISABLE, 0, 0, 0);
		++(*nkevents);
	}

	if ((levents & ~(EPOLL_EVSUP)) != 0) {
		return EINVAL;
	}

	return 0;
}

/*
 * Structure conversion from a kevent to an epoll event.  In case this
 * is called on an error in registration, we store the error in
 * event->data and pick it up later in sys_epoll_ctl().
 */
static void
kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event)
{

	l_event->data = kevent->kext_data;

	if ((kevent->flags & EV_ERROR) != 0) {
		l_event->events = EPOLLERR;
		return;
	}

	/* XXX EPOLLPRI, EPOLLHUP */
	switch (kevent->filter) {
	case EVFILT_READ:
		l_event->events = EPOLLIN;
		if ((kevent->flags & EV_EOF) != 0)
			l_event->events |= EPOLLRDHUP;
		break;
	case EVFILT_WRITE:
		l_event->events = EPOLLOUT;
		break;
	default:
		DPRINTF(("%s: unhandled kevent filter %d\n", __func__,
		    kevent->filter));
		break;
	}
}
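
/*
 * An illustrative sketch of the conversion above (not compiled): a
 * registration such as
 *
 *	struct epoll_event ev = { .events = EPOLLIN|EPOLLOUT|EPOLLET };
 *
 * expands to the two changes (fd, EVFILT_READ, EV_ADD|EV_ENABLE|EV_CLEAR)
 * and (fd, EVFILT_WRITE, EV_ADD|EV_ENABLE|EV_CLEAR), while an empty
 * event mask still registers a disabled EVFILT_READ kevent so that the
 * fd remains known to the kqueue.
 */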

/*
 * Copyout callback used by kevent.  This converts kevent events to
 * epoll events that are located in args->eventlist.
 */
static int
epoll_kev_put_events(void *ctx, struct kevent *events,
    struct kevent *eventlist, size_t index, int n)
{
	int i;
	struct epoll_event *eep = (struct epoll_event *)eventlist;

	KASSERT(n >= 0 && n < EPOLL_MAX_EVENTS);

	for (i = 0; i < n; i++)
		kevent_to_epoll(events + i, eep + index + i);

	return 0;
}

/*
 * Copyin callback used by kevent.  This copies already-converted
 * filters from kernel memory to kevent's internal kernel memory,
 * hence the memcpy instead of copyin.
 */
static int
epoll_kev_fetch_changes(void *ctx, const struct kevent *changelist,
    struct kevent *changes, size_t index, int n)
{
	KASSERT(n >= 0 && n < EPOLL_MAX_EVENTS);

	memcpy(changes, changelist + index, n * sizeof(*changes));

	return 0;
}

/*
 * Timeout copy callback used by kevent.  Copies an already-converted
 * timeout from kernel memory to kevent memory, hence the memcpy
 * instead of copyin.
 */
static int
epoll_kev_fetch_timeout(const void *src, void *dest, size_t size)
{
	memcpy(dest, src, size);

	return 0;
}

/*
 * Load the epoll filter, convert it to a kevent filter and load it
 * into the kevent subsystem.
 *
 * event must point to kernel memory or be NULL.
 */
int
epoll_ctl_common(struct lwp *l, register_t *retval, int epfd, int op, int fd,
    struct epoll_event *event)
{
	struct kevent kev[2];
	struct kevent_ops k_ops = {
		.keo_private = NULL,
		.keo_fetch_timeout = NULL,
		.keo_fetch_changes = epoll_kev_fetch_changes,
		.keo_put_events = NULL,
	};
	file_t *epfp, *fp;
	int error = 0;
	int nchanges = 0;

	/*
	 * Need to validate epfd and fd separately from kevent1 to match
	 * Linux's errno behaviour.
	 */
	epfp = fd_getfile(epfd);
	if (epfp == NULL)
		return EBADF;
	if (epfp->f_type != DTYPE_KQUEUE)
		error = EINVAL;
	fd_putfile(epfd);
	if (error != 0)
		return error;

	fp = fd_getfile(fd);
	if (fp == NULL)
		return EBADF;
	if (fp->f_type == DTYPE_VNODE) {
		switch (fp->f_vnode->v_type) {
		case VREG:
		case VDIR:
		case VBLK:
		case VLNK:
			error = EPERM;
			break;

		default:
			break;
		}
	}
	fd_putfile(fd);
	if (error != 0)
		return error;

	/* Linux disallows an epoll instance watching itself. */
	if (epfd == fd) {
		return EINVAL;
	}

	if (op != EPOLL_CTL_DEL) {
		error = epoll_to_kevent(epfd, fd, event, kev, &nchanges);
		if (error != 0)
			return error;
	}

	switch (op) {
	case EPOLL_CTL_MOD:
		error = epoll_delete_all_events(retval, epfd, fd);
		if (error != 0)
			return error;
		break;

	case EPOLL_CTL_ADD:
		if (epoll_fd_registered(retval, epfd, fd))
			return EEXIST;
		error = epoll_check_loop_and_depth(l, epfd, fd);
		if (error != 0)
			return error;
		break;

	case EPOLL_CTL_DEL:
		/* CTL_DEL means unregister this fd from this epoll */
		return epoll_delete_all_events(retval, epfd, fd);

	default:
		DPRINTF(("%s: invalid op %d\n", __func__, op));
		return EINVAL;
	}

	error = kevent1(retval, epfd, kev, nchanges, NULL, 0, NULL, &k_ops);

	if (error == EOPNOTSUPP) {
		error = EPERM;
	}

	return error;
}
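
/*
 * An illustrative userland view of the ctl path above (a sketch,
 * assuming the epoll wrappers declared in <sys/epoll.h>; "sock"
 * stands in for any pollable descriptor):
 *
 *	struct epoll_event ev = { .events = EPOLLIN };
 *	if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) == -1 &&
 *	    errno == EEXIST)
 *		(void)epoll_ctl(epfd, EPOLL_CTL_MOD, sock, &ev);
 */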

/*
 * epoll_ctl(2).  Copyin the event if necessary and then call
 * epoll_ctl_common().
 */
int
sys_epoll_ctl(struct lwp *l, const struct sys_epoll_ctl_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(int) epfd;
		syscallarg(int) op;
		syscallarg(int) fd;
		syscallarg(struct epoll_event *) event;
	} */
	struct epoll_event ee;
	struct epoll_event *eep;
	int error;

	if (SCARG(uap, op) != EPOLL_CTL_DEL) {
		error = copyin(SCARG(uap, event), &ee, sizeof(ee));
		if (error != 0)
			return error;

		eep = &ee;
	} else
		eep = NULL;

	return epoll_ctl_common(l, retval, SCARG(uap, epfd), SCARG(uap, op),
	    SCARG(uap, fd), eep);
}

/*
 * Wait for a filter to be triggered on the epoll file descriptor.
 * All of the epoll_*wait* syscalls eventually end up here.
 *
 * events, tsp, and nssp must point to kernel memory (or be NULL).
 */
int
epoll_wait_common(struct lwp *l, register_t *retval, int epfd,
    struct epoll_event *events, int maxevents, struct timespec *tsp,
    const sigset_t *nssp)
{
	struct kevent_ops k_ops = {
		.keo_private = NULL,
		.keo_fetch_timeout = epoll_kev_fetch_timeout,
		.keo_fetch_changes = NULL,
		.keo_put_events = epoll_kev_put_events,
	};
	struct proc *p = l->l_proc;
	file_t *epfp;
	sigset_t oss;
	int error = 0;

	if (maxevents <= 0 || maxevents > EPOLL_MAX_EVENTS)
		return EINVAL;

	/*
	 * Need to validate epfd separately from kevent1 to match
	 * Linux's errno behaviour.
	 */
	epfp = fd_getfile(epfd);
	if (epfp == NULL)
		return EBADF;
	if (epfp->f_type != DTYPE_KQUEUE)
		error = EINVAL;
	fd_putfile(epfd);
	if (error != 0)
		return error;

	if (nssp != NULL) {
		mutex_enter(p->p_lock);
		error = sigprocmask1(l, SIG_SETMASK, nssp, &oss);
		mutex_exit(p->p_lock);
		if (error != 0)
			return error;
	}

	error = kevent1(retval, epfd, NULL, 0, (struct kevent *)events,
	    maxevents, tsp, &k_ops);
	/*
	 * Since we're not registering any events, ENOMEM should not
	 * be possible for this specific kevent1 call.
	 */
	KASSERT(error != ENOMEM);

	if (nssp != NULL) {
		int error2;

		mutex_enter(p->p_lock);
		error2 = sigprocmask1(l, SIG_SETMASK, &oss, NULL);
		mutex_exit(p->p_lock);
		/*
		 * Don't let a successful mask restore clobber an
		 * error (e.g. EINTR) returned by kevent1.
		 */
		if (error == 0)
			error = error2;
	}

	return error;
}
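
/*
 * An illustrative wait-side sketch (epoll_pwait2(2) is the variant
 * that reaches this function with a struct timespec timeout; handle()
 * stands in for the caller's dispatch):
 *
 *	struct epoll_event evs[64];
 *	struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
 *	int n = epoll_pwait2(epfd, evs, 64, &ts, NULL);
 *	for (int i = 0; i < n; i++)
 *		handle(&evs[i]);
 */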

/*
 * epoll_pwait2(2).
 */
int
sys_epoll_pwait2(struct lwp *l, const struct sys_epoll_pwait2_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(int) epfd;
		syscallarg(struct epoll_event *) events;
		syscallarg(int) maxevents;
		syscallarg(struct timespec *) timeout;
		syscallarg(sigset_t *) sigmask;
	} */
	struct epoll_event *events;
	struct timespec ts, *tsp;
	sigset_t ss, *ssp;
	int error;
	const int maxevents = SCARG(uap, maxevents);

	if (maxevents <= 0 || maxevents >= EPOLL_MAX_EVENTS)
		return EINVAL;

	if (SCARG(uap, timeout) != NULL) {
		error = copyin(SCARG(uap, timeout), &ts, sizeof(ts));
		if (error != 0)
			return error;

		tsp = &ts;
	} else
		tsp = NULL;

	if (SCARG(uap, sigmask) != NULL) {
		error = copyin(SCARG(uap, sigmask), &ss, sizeof(ss));
		if (error != 0)
			return error;

		ssp = &ss;
	} else
		ssp = NULL;

	events = kmem_alloc(maxevents * sizeof(*events), KM_SLEEP);

	error = epoll_wait_common(l, retval, SCARG(uap, epfd), events,
	    maxevents, tsp, ssp);
	if (error == 0)
		error = copyout(events, SCARG(uap, events),
		    *retval * sizeof(*events));

	kmem_free(events, maxevents * sizeof(*events));
	return error;
}

/*
 * Helper that registers a single kevent.
 */
static int
epoll_register_kevent(register_t *retval, int epfd, int fd, int filter,
    unsigned int flags)
{
	struct kevent kev;
	struct kevent_ops k_ops = {
		.keo_private = NULL,
		.keo_fetch_timeout = NULL,
		.keo_fetch_changes = epoll_kev_fetch_changes,
		.keo_put_events = NULL,
	};

	EV_SET(&kev, fd, filter, flags, 0, 0, 0);

	return kevent1(retval, epfd, &kev, 1, NULL, 0, NULL, &k_ops);
}

/*
 * Check if an fd is already registered in the kqueue referenced by epfd.
 */
static int
epoll_fd_registered(register_t *retval, int epfd, int fd)
{
	/*
	 * Set empty filter flags to avoid accidental modification of
	 * already registered events.  In the case of event
	 * re-registration:
	 * 1. If the event does not exist, kevent() does nothing and
	 *    returns ENOENT.
	 * 2. If the event does exist, its enabled/disabled state is
	 *    preserved, but the fflags, data and udata fields are
	 *    overwritten.  So we cannot set socket low watermarks or
	 *    store the user's context pointer in udata.
	 */
	if (epoll_register_kevent(retval, epfd, fd, EVFILT_READ, 0) != ENOENT ||
	    epoll_register_kevent(retval, epfd, fd, EVFILT_WRITE, 0) != ENOENT)
		return 1;

	return 0;
}

/*
 * Remove all events in the kqueue referenced by epfd that depend on
 * fd.
 */
static int
epoll_delete_all_events(register_t *retval, int epfd, int fd)
{
	int error1, error2;

	error1 = epoll_register_kevent(retval, epfd, fd, EVFILT_READ,
	    EV_DELETE);
	error2 = epoll_register_kevent(retval, epfd, fd, EVFILT_WRITE,
	    EV_DELETE);

	/* Return 0 if at least one of the deletions succeeded. */
	return error1 == 0 ? 0 : error2;
}
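
/*
 * A note on the probe in epoll_fd_registered() above: a change with a
 * zero flags word neither adds nor deletes, so kevent1 returns ENOENT
 * when no matching knote exists and 0 when one does.  E.g. for an fd
 * registered only for EPOLL_EVRD, the EVFILT_READ probe returns 0,
 * the EVFILT_WRITE probe returns ENOENT, and the fd is reported as
 * registered.
 */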

/*
 * Iterate through all the knotes and recover a directed graph of
 * which kqueues are watching each other.
 *
 * If edges is NULL, the number of edges is still counted but no graph
 * is assembled.
 */
static int
epoll_recover_watch_tree(struct epoll_edge *edges, size_t nedges, size_t nfds)
{
	file_t *currfp, *targetfp;
	struct knote *kn, *tmpkn;
	size_t i, nedges_so_far = 0;

	for (i = 0; i < nfds && (edges == NULL || nedges_so_far < nedges); i++)
	{
		currfp = fd_getfile(i);
		if (currfp == NULL)
			continue;
		if (currfp->f_type != DTYPE_KQUEUE)
			goto continue_count_outer;

		SLIST_FOREACH_SAFE(kn, &currfp->f_kqueue->kq_sel.sel_klist,
		    kn_selnext, tmpkn) {
			targetfp = fd_getfile(kn->kn_kevent.kext_epfd);
			if (targetfp == NULL)
				continue;
			if (targetfp->f_type == DTYPE_KQUEUE) {
				if (edges != NULL) {
					edges[nedges_so_far].epfd =
					    kn->kn_kevent.kext_epfd;
					edges[nedges_so_far].fd =
					    kn->kn_kevent.kext_fd;
				}
				nedges_so_far++;
			}

			fd_putfile(kn->kn_kevent.kext_epfd);
		}

continue_count_outer:
		fd_putfile(i);
	}

	return nedges_so_far;
}

/*
 * Run a dfs on the graph described by edges, checking for loops and
 * for a depth greater than EPOLL_MAX_DEPTH.
 */
static int
epoll_dfs(struct epoll_edge *edges, size_t nedges, struct epoll_seen *seen,
    size_t nseen, int currfd, int depth)
{
	int error;
	size_t i;

	KASSERT(edges != NULL);
	KASSERT(seen != NULL);
	KASSERT(nedges > 0);
	KASSERT(currfd < nseen);
	KASSERT(0 <= depth && depth <= EPOLL_MAX_DEPTH + 1);

	if (__BITMAP_ISSET(currfd, seen))
		return ELOOP;

	__BITMAP_SET(currfd, seen);

	depth++;
	if (depth > EPOLL_MAX_DEPTH)
		return EINVAL;

	for (i = 0; i < nedges; i++) {
		if (edges[i].epfd != currfd)
			continue;

		error = epoll_dfs(edges, nedges, seen, nseen,
		    edges[i].fd, depth);
		if (error != 0)
			return error;
	}

	return 0;
}

/*
 * Check if adding fd to epfd would violate the maximum depth or
 * create a loop.
 */
static int
epoll_check_loop_and_depth(struct lwp *l, int epfd, int fd)
{
	int error;
	file_t *fp;
	struct epoll_edge *edges;
	struct epoll_seen *seen;
	size_t nedges, nfds, seen_size;
	bool fdirrelevant;

	/* If the target isn't another kqueue, we can skip this check. */
	fp = fd_getfile(fd);
	if (fp == NULL)
		return 0;
	fdirrelevant = fp->f_type != DTYPE_KQUEUE;
	fd_putfile(fd);
	if (fdirrelevant)
		return 0;

	nfds = l->l_proc->p_fd->fd_lastfile + 1;

	/*
	 * We call epoll_recover_watch_tree twice, once to find the
	 * number of edges, and once to actually fill them in.  We add
	 * one because we want to include the edge epfd->fd.
	 */
	nedges = 1 + epoll_recover_watch_tree(NULL, 0, nfds);

	edges = kmem_zalloc(nedges * sizeof(*edges), KM_SLEEP);

	epoll_recover_watch_tree(edges + 1, nedges - 1, nfds);

	edges[0].epfd = epfd;
	edges[0].fd = fd;

	seen_size = __BITMAP_SIZE(char, nfds);
	seen = kmem_zalloc(seen_size, KM_SLEEP);

	error = epoll_dfs(edges, nedges, seen, nfds, epfd, 0);

	kmem_free(seen, seen_size);
	kmem_free(edges, nedges * sizeof(*edges));

	return error;
}
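
/*
 * A worked example of what the check above rejects: if kqueue A
 * already watches kqueue B (edge A->B), then adding A to B seeds the
 * graph with the edge B->A, and the dfs walks B -> A -> B, finds B
 * already marked as seen, and returns ELOOP.  A watch chain nested
 * more than EPOLL_MAX_DEPTH levels deep is rejected with EINVAL
 * instead.
 */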