/*	$NetBSD: epoll.c,v 1.6 2024/08/18 20:47:21 christos Exp $	*/

/*
 * Copyright 2000-2007 Niels Provos <provos@citi.umich.edu>
 * Copyright 2007-2012 Niels Provos, Nick Mathewson
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * epoll.c: the Linux epoll(7) backend for libevent.  Implements the
 * "eventop" backend interface both with and without the changelist
 * optimization, and optionally uses a timerfd for finer-grained timeouts.
 */
#include "event2/event-config.h"
#include "evconfig-private.h"

#ifdef EVENT__HAVE_EPOLL

#include <stdint.h>
#include <sys/types.h>
#include <sys/resource.h>
#ifdef EVENT__HAVE_SYS_TIME_H
#include <sys/time.h>
#endif
#include <sys/queue.h>
#include <sys/epoll.h>
#include <signal.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
#ifdef EVENT__HAVE_FCNTL_H
#include <fcntl.h>
#endif
#ifdef EVENT__HAVE_SYS_TIMERFD_H
#include <sys/timerfd.h>
#endif

#include "event-internal.h"
#include "evsignal-internal.h"
#include "event2/thread.h"
#include "evthread-internal.h"
#include "log-internal.h"
#include "evmap-internal.h"
#include "changelist-internal.h"
#include "time-internal.h"

/* Since Linux 2.6.17, epoll is able to report about peer half-closed connection
   using special EPOLLRDHUP flag on a read event.
*/
#if !defined(EPOLLRDHUP)
#define EPOLLRDHUP 0
#define EARLY_CLOSE_IF_HAVE_RDHUP 0
#else
#define EARLY_CLOSE_IF_HAVE_RDHUP EV_FEATURE_EARLY_CLOSE
#endif

#include "epolltable-internal.h"

#if defined(EVENT__HAVE_SYS_TIMERFD_H) &&			  \
	defined(EVENT__HAVE_TIMERFD_CREATE) &&			  \
	defined(HAVE_POSIX_MONOTONIC) && defined(TFD_NONBLOCK) && \
	defined(TFD_CLOEXEC)
/* Note that we only use timerfd if TFD_NONBLOCK and TFD_CLOEXEC are available
   and working.  This means that we can't support it on 2.6.25 (where timerfd
   was introduced) or 2.6.26, since 2.6.27 introduced those flags.
*/
#define USING_TIMERFD
#endif

/* Per-event_base state for the epoll backend. */
struct epollop {
	struct epoll_event *events;	/* buffer handed to epoll_wait() */
	int nevents;			/* capacity of the events buffer */
	int epfd;			/* the epoll instance's fd */
#ifdef USING_TIMERFD
	int timerfd;			/* high-precision timer fd, or -1 if unused */
#endif
};

static void *epoll_init(struct event_base *);
static int epoll_dispatch(struct event_base *, struct timeval *);
static void epoll_dealloc(struct event_base *);

/* Backend variant that batches add/del operations through the changelist
 * and flushes them in epoll_dispatch(); selected at runtime by epoll_init(). */
static const struct eventop epollops_changelist = {
	"epoll (with changelist)",
	epoll_init,
	event_changelist_add_,
	event_changelist_del_,
	epoll_dispatch,
	epoll_dealloc,
	1, /* need reinit */
	EV_FEATURE_ET|EV_FEATURE_O1| EARLY_CLOSE_IF_HAVE_RDHUP,
	EVENT_CHANGELIST_FDINFO_SIZE
};


static int epoll_nochangelist_add(struct event_base *base, evutil_socket_t fd,
    short old, short events, void *p);
static int epoll_nochangelist_del(struct event_base *base, evutil_socket_t fd,
    short old, short events, void *p);

/* Default backend variant: each add/del is applied to the kernel immediately. */
const struct eventop epollops = {
	"epoll",
	epoll_init,
	epoll_nochangelist_add,
	epoll_nochangelist_del,
	epoll_dispatch,
	epoll_dealloc,
	1, /* need reinit */
	EV_FEATURE_ET|EV_FEATURE_O1|EV_FEATURE_EARLY_CLOSE,
	0
};

#define INITIAL_NEVENT 32
#define MAX_NEVENT 4096

/* On Linux kernels at least up to 2.6.24.4, epoll can't handle timeout
 * values bigger than (LONG_MAX - 999ULL)/HZ.  HZ in the wild can be
 * as big as 1000, and LONG_MAX can be as small as (1<<31)-1, so the
 * largest number of msec we can support here is 2147482.  Let's
 * round that down by 47 seconds.
 */
#define MAX_EPOLL_TIMEOUT_MSEC (35*60*1000)

/*
 * Backend-init hook: create the epoll instance and allocate the epollop.
 * May switch base->evsel to the changelist variant, and may create a
 * timerfd for sub-millisecond timeouts.  Returns the new epollop, or
 * NULL on failure.
 */
static void *
epoll_init(struct event_base *base)
{
	int epfd = -1;
	struct epollop *epollop;

#ifdef EVENT__HAVE_EPOLL_CREATE1
	/* First, try the shiny new epoll_create1 interface, if we have it. */
	epfd = epoll_create1(EPOLL_CLOEXEC);
#endif
	if (epfd == -1) {
		/* Initialize the kernel queue using the old interface.  (The
		   size field is ignored   since 2.6.8.) */
		if ((epfd = epoll_create(32000)) == -1) {
			if (errno != ENOSYS)
				event_warn("epoll_create");
			return (NULL);
		}
		/* epoll_create() has no CLOEXEC flag; set it ourselves. */
		evutil_make_socket_closeonexec(epfd);
	}

	if (!(epollop = mm_calloc(1, sizeof(struct epollop)))) {
		close(epfd);
		return (NULL);
	}

	epollop->epfd = epfd;

	/* Initialize fields */
	epollop->events = mm_calloc(INITIAL_NEVENT, sizeof(struct epoll_event));
	if (epollop->events == NULL) {
		mm_free(epollop);
		close(epfd);
		return (NULL);
	}
	epollop->nevents = INITIAL_NEVENT;

	/* Opt into the changelist variant when the base flag or the
	 * EVENT_EPOLL_USE_CHANGELIST environment variable asks for it
	 * (the env var is honored only if IGNORE_ENV is not set). */
	if ((base->flags & EVENT_BASE_FLAG_EPOLL_USE_CHANGELIST) != 0 ||
	    ((base->flags & EVENT_BASE_FLAG_IGNORE_ENV) == 0 &&
		evutil_getenv_("EVENT_EPOLL_USE_CHANGELIST") != NULL)) {

		base->evsel = &epollops_changelist;
	}

#ifdef USING_TIMERFD
	/*
	  The epoll interface ordinarily gives us one-millisecond precision,
	  so on Linux it makes perfect sense to use the CLOCK_MONOTONIC_COARSE
	  timer.  But when the user has set the new PRECISE_TIMER flag for an
	  event_base, we can try to use timerfd to give them finer granularity.
	*/
	if ((base->flags & EVENT_BASE_FLAG_PRECISE_TIMER) &&
	    base->monotonic_timer.monotonic_clock == CLOCK_MONOTONIC) {
		int fd;
		fd = epollop->timerfd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK|TFD_CLOEXEC);
		if (epollop->timerfd >= 0) {
			struct epoll_event epev;
			memset(&epev, 0, sizeof(epev));
			epev.data.fd = epollop->timerfd;
			epev.events = EPOLLIN;
			/* Register the timerfd with epoll so its expiry wakes
			 * up epoll_wait(); on failure fall back to plain
			 * millisecond timeouts. */
			if (epoll_ctl(epollop->epfd, EPOLL_CTL_ADD, fd, &epev) < 0) {
				event_warn("epoll_ctl(timerfd)");
				close(fd);
				epollop->timerfd = -1;
			}
		} else {
			if (errno != EINVAL && errno != ENOSYS) {
				/* These errors probably mean that we were
				 * compiled with timerfd/TFD_* support, but
				 * we're running on a kernel that lacks those.
				 */
				event_warn("timerfd_create");
			}
			epollop->timerfd = -1;
		}
	} else {
		epollop->timerfd = -1;
	}
#endif

	evsig_init_(base);

	return (epollop);
}

/* Human-readable name of an EV_CHANGE_ADD/EV_CHANGE_DEL value, for debug
 * and warning messages. */
static const char *
change_to_string(int change)
{
	change &= (EV_CHANGE_ADD|EV_CHANGE_DEL);
	if (change == EV_CHANGE_ADD) {
		return "add";
	} else if (change == EV_CHANGE_DEL) {
		return "del";
	} else if (change == 0) {
		return "none";
	} else {
		return "???";
	}
}

/* Human-readable name of an EPOLL_CTL_* opcode, for debug and warning
 * messages. */
static const char *
epoll_op_to_string(int op)
{
	return op == EPOLL_CTL_ADD?"ADD":
	    op == EPOLL_CTL_DEL?"DEL":
	    op == EPOLL_CTL_MOD?"MOD":
	    "???";
}

/* Expands to the format string plus arguments describing one epoll_ctl
 * attempt; used as the argument list for event_debug()/event_warn(). */
#define PRINT_CHANGES(op, events, ch, status)  \
	"Epoll %s(%d) on fd %d " status ". "   \
	"Old events were %d; "                 \
	"read change was %d (%s); "            \
	"write change was %d (%s); "           \
	"close change was %d (%s)",            \
	epoll_op_to_string(op),                \
	events,                                \
	ch->fd,                                \
	ch->old_events,                        \
	ch->read_change,                       \
	change_to_string(ch->read_change),     \
	ch->write_change,                      \
	change_to_string(ch->write_change),    \
	ch->close_change,                      \
	change_to_string(ch->close_change)

/*
 * Translate one event_change into an epoll_ctl() call and apply it,
 * recovering from the known spurious-failure cases (see the comments on
 * each errno below).  Returns 0 on success, -1 on unrecoverable failure.
 */
static int
epoll_apply_one_change(struct event_base *base,
    struct epollop *epollop,
    const struct event_change *ch)
{
	struct epoll_event epev;
	int op, events = 0;
	int idx;

	/* The op/events lookup table maps the combination of old events and
	 * requested read/write/close changes to the right EPOLL_CTL_* call. */
	idx = EPOLL_OP_TABLE_INDEX(ch);
	op = epoll_op_table[idx].op;
	events = epoll_op_table[idx].events;

	if (!events) {
		EVUTIL_ASSERT(op == 0);
		return 0;
	}

	if ((ch->read_change|ch->write_change|ch->close_change) & EV_CHANGE_ET)
		events |= EPOLLET;

	memset(&epev, 0, sizeof(epev));
	epev.data.fd = ch->fd;
	epev.events = events;
	if (epoll_ctl(epollop->epfd, op, ch->fd, &epev) == 0) {
		event_debug((PRINT_CHANGES(op, epev.events, ch, "okay")));
		return 0;
	}

	/* The first epoll_ctl() failed; some failures are expected and can
	 * be retried with a different opcode. */
	switch (op) {
	case EPOLL_CTL_MOD:
		if (errno == ENOENT) {
			/* If a MOD operation fails with ENOENT, the
			 * fd was probably closed and re-opened.  We
			 * should retry the operation as an ADD.
			 */
			if (epoll_ctl(epollop->epfd, EPOLL_CTL_ADD, ch->fd, &epev) == -1) {
				event_warn("Epoll MOD(%d) on %d retried as ADD; that failed too",
				    (int)epev.events, ch->fd);
				return -1;
			} else {
				event_debug(("Epoll MOD(%d) on %d retried as ADD; succeeded.",
					(int)epev.events,
					ch->fd));
				return 0;
			}
		}
		break;
	case EPOLL_CTL_ADD:
		if (errno == EEXIST) {
			/* If an ADD operation fails with EEXIST,
			 * either the operation was redundant (as with a
			 * precautionary add), or we ran into a fun
			 * kernel bug where using dup*() to duplicate the
			 * same file into the same fd gives you the same epitem
			 * rather than a fresh one.  For the second case,
			 * we must retry with MOD. */
			if (epoll_ctl(epollop->epfd, EPOLL_CTL_MOD, ch->fd, &epev) == -1) {
				event_warn("Epoll ADD(%d) on %d retried as MOD; that failed too",
				    (int)epev.events, ch->fd);
				return -1;
			} else {
				event_debug(("Epoll ADD(%d) on %d retried as MOD; succeeded.",
					(int)epev.events,
					ch->fd));
				return 0;
			}
		}
		break;
	case EPOLL_CTL_DEL:
		if (errno == ENOENT || errno == EBADF || errno == EPERM) {
			/* If a delete fails with one of these errors,
			 * that's fine too: we closed the fd before we
			 * got around to calling epoll_dispatch. */
			event_debug(("Epoll DEL(%d) on fd %d gave %s: DEL was unnecessary.",
				(int)epev.events,
				ch->fd,
				strerror(errno)));
			return 0;
		}
		break;
	default:
		break;
	}

	event_warn(PRINT_CHANGES(op, epev.events, ch, "failed"));
	return -1;
}

/*
 * Flush every pending change in base->changelist to the kernel.  Returns
 * 0 if all changes applied cleanly, -1 if any of them failed (but still
 * attempts every change).
 */
static int
epoll_apply_changes(struct event_base *base)
{
	struct event_changelist *changelist = &base->changelist;
	struct epollop *epollop = base->evbase;
	struct event_change *ch;

	int r = 0;
	int i;

	for (i = 0; i < changelist->n_changes; ++i) {
		ch = &changelist->changes[i];
		if (epoll_apply_one_change(base, epollop, ch) < 0)
			r = -1;
	}

	return (r);
}

/*
 * Non-changelist "add" hook: build a one-off event_change describing the
 * requested additions and apply it to the kernel immediately.
 */
static int
epoll_nochangelist_add(struct event_base *base, evutil_socket_t fd,
    short old, short events, void *p)
{
	struct event_change ch;
	ch.fd = fd;
	ch.old_events = old;
	ch.read_change = ch.write_change = ch.close_change = 0;
	if (events & EV_WRITE)
		ch.write_change = EV_CHANGE_ADD |
		    (events & EV_ET);
	if (events & EV_READ)
		ch.read_change = EV_CHANGE_ADD |
		    (events & EV_ET);
	if (events & EV_CLOSED)
		ch.close_change = EV_CHANGE_ADD |
		    (events & EV_ET);

	return epoll_apply_one_change(base, base->evbase, &ch);
}

/*
 * Non-changelist "del" hook: build a one-off event_change describing the
 * requested deletions and apply it to the kernel immediately.
 */
static int
epoll_nochangelist_del(struct event_base *base, evutil_socket_t fd,
    short old, short events, void *p)
{
	struct event_change ch;
	ch.fd = fd;
	ch.old_events = old;
	ch.read_change = ch.write_change = ch.close_change = 0;
	if (events & EV_WRITE)
		ch.write_change = EV_CHANGE_DEL |
		    (events & EV_ET);
	if (events & EV_READ)
		ch.read_change = EV_CHANGE_DEL |
		    (events & EV_ET);
	if (events & EV_CLOSED)
		ch.close_change = EV_CHANGE_DEL |
		    (events & EV_ET);

	return epoll_apply_one_change(base, base->evbase, &ch);
}

/*
 * Backend dispatch hook: flush pending changes, wait for events (for at
 * most *tv, or forever if tv is NULL), and activate the corresponding
 * libevent events.  Returns 0 on success, -1 on epoll_wait failure.
 */
static int
epoll_dispatch(struct event_base *base, struct timeval *tv)
{
	struct epollop *epollop = base->evbase;
	struct epoll_event *events = epollop->events;
	int i, res;
	long timeout = -1;

#ifdef USING_TIMERFD
	if (epollop->timerfd >= 0) {
		struct itimerspec is;
		is.it_interval.tv_sec = 0;
		is.it_interval.tv_nsec = 0;
		if (tv == NULL) {
			/* No timeout; disarm the timer. */
			is.it_value.tv_sec = 0;
			is.it_value.tv_nsec = 0;
		} else {
			if (tv->tv_sec == 0 && tv->tv_usec == 0) {
				/* we need to exit immediately; timerfd can't
				 * do that. */
				timeout = 0;
			}
			is.it_value.tv_sec = tv->tv_sec;
			is.it_value.tv_nsec = tv->tv_usec * 1000;
		}
		/* TODO: we could avoid unnecessary syscalls here by only
		   calling timerfd_settime when the top timeout changes, or
		   when we're called with a different timeval.
		*/
		if (timerfd_settime(epollop->timerfd, 0, &is, NULL) < 0) {
			event_warn("timerfd_settime");
		}
	} else
#endif
	if (tv != NULL) {
		timeout = evutil_tv_to_msec_(tv);
		if (timeout < 0 || timeout > MAX_EPOLL_TIMEOUT_MSEC) {
			/* Linux kernels can wait forever if the timeout is
			 * too big; see comment on MAX_EPOLL_TIMEOUT_MSEC. */
			timeout = MAX_EPOLL_TIMEOUT_MSEC;
		}
	}

	epoll_apply_changes(base);
	event_changelist_remove_all_(&base->changelist, base);

	/* Drop the base lock while blocked in the kernel so other threads
	 * can add/delete events meanwhile. */
	EVBASE_RELEASE_LOCK(base, th_base_lock);

	res = epoll_wait(epollop->epfd, events, epollop->nevents, timeout);

	EVBASE_ACQUIRE_LOCK(base, th_base_lock);

	if (res == -1) {
		if (errno != EINTR) {
			event_warn("epoll_wait");
			return (-1);
		}

		return (0);
	}

	event_debug(("%s: epoll_wait reports %d", __func__, res));
	EVUTIL_ASSERT(res <= epollop->nevents);

	for (i = 0; i < res; i++) {
		int what = events[i].events;
		short ev = 0;
#ifdef USING_TIMERFD
		/* The timerfd only exists to wake epoll_wait on time; it is
		 * not a user event, so don't activate anything for it. */
		if (events[i].data.fd == epollop->timerfd)
			continue;
#endif

		if (what & EPOLLERR) {
			ev = EV_READ | EV_WRITE;
		} else if ((what & EPOLLHUP) && !(what & EPOLLRDHUP)) {
			/* Full hangup: report both directions readable and
			 * writable so callers see the error on their next op. */
			ev = EV_READ | EV_WRITE;
		} else {
			if (what & EPOLLIN)
				ev |= EV_READ;
			if (what & EPOLLOUT)
				ev |= EV_WRITE;
			if (what & EPOLLRDHUP)
				ev |= EV_CLOSED;
		}

		if (!ev)
			continue;

		evmap_io_active_(base, events[i].data.fd, ev | EV_ET);
	}

	if (res == epollop->nevents && epollop->nevents < MAX_NEVENT) {
		/* We used all of the event space this time.  We should
		   be ready for more events next time. */
		int new_nevents = epollop->nevents * 2;
		struct epoll_event *new_events;

		new_events = mm_realloc(epollop->events,
		    new_nevents * sizeof(struct epoll_event));
		if (new_events) {
			epollop->events = new_events;
			epollop->nevents = new_nevents;
		}
	}

	return (0);
}


/*
 * Backend teardown hook: release everything epoll_init() created — the
 * signal handling state, the event buffer, the epoll fd, and (if present)
 * the timerfd.
 */
static void
epoll_dealloc(struct event_base *base)
{
	struct epollop *epollop = base->evbase;

	evsig_dealloc_(base);
	if (epollop->events)
		mm_free(epollop->events);
	if (epollop->epfd >= 0)
		close(epollop->epfd);
#ifdef USING_TIMERFD
	if (epollop->timerfd >= 0)
		close(epollop->timerfd);
#endif

	/* Scrub before freeing to make use-after-free bugs fail fast. */
	memset(epollop, 0, sizeof(struct epollop));
	mm_free(epollop);
}

#endif /* EVENT__HAVE_EPOLL */