1 /* $NetBSD: epoll.c,v 1.4 2016/01/08 21:35:40 christos Exp $ */ 2 3 /* 4 * Copyright 2000-2007 Niels Provos <provos@citi.umich.edu> 5 * Copyright 2007-2012 Niels Provos, Nick Mathewson 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 3. The name of the author may not be used to endorse or promote products 16 * derived from this software without specific prior written permission. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 19 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 20 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 21 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 22 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 23 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 27 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 */ 29 #include "event2/event-config.h" 30 #include "evconfig-private.h" 31 32 #ifdef EVENT__HAVE_EPOLL 33 34 #include <stdint.h> 35 #include <sys/types.h> 36 #include <sys/resource.h> 37 #ifdef EVENT__HAVE_SYS_TIME_H 38 #include <sys/time.h> 39 #endif 40 #include <sys/queue.h> 41 #include <sys/epoll.h> 42 #include <signal.h> 43 #include <limits.h> 44 #include <stdio.h> 45 #include <stdlib.h> 46 #include <string.h> 47 #include <unistd.h> 48 #include <errno.h> 49 #ifdef EVENT__HAVE_FCNTL_H 50 #include <fcntl.h> 51 #endif 52 #ifdef EVENT__HAVE_SYS_TIMERFD_H 53 #include <sys/timerfd.h> 54 #endif 55 56 #include "event-internal.h" 57 #include "evsignal-internal.h" 58 #include "event2/thread.h" 59 #include "evthread-internal.h" 60 #include "log-internal.h" 61 #include "evmap-internal.h" 62 #include "changelist-internal.h" 63 #include "time-internal.h" 64 65 /* Since Linux 2.6.17, epoll is able to report about peer half-closed connection 66 using special EPOLLRDHUP flag on a read event. 67 */ 68 #if !defined(EPOLLRDHUP) 69 #define EPOLLRDHUP 0 70 #define EARLY_CLOSE_IF_HAVE_RDHUP 0 71 #else 72 #define EARLY_CLOSE_IF_HAVE_RDHUP EV_FEATURE_EARLY_CLOSE 73 #endif 74 75 #include "epolltable-internal.h" 76 77 #if defined(EVENT__HAVE_SYS_TIMERFD_H) && \ 78 defined(EVENT__HAVE_TIMERFD_CREATE) && \ 79 defined(HAVE_POSIX_MONOTONIC) && defined(TFD_NONBLOCK) && \ 80 defined(TFD_CLOEXEC) 81 /* Note that we only use timerfd if TFD_NONBLOCK and TFD_CLOEXEC are available 82 and working. This means that we can't support it on 2.6.25 (where timerfd 83 was introduced) or 2.6.26, since 2.6.27 introduced those flags. 84 */ 85 #define USING_TIMERFD 86 #endif 87 88 struct epollop { 89 struct epoll_event *events; 90 int nevents; 91 int epfd; 92 #ifdef USING_TIMERFD 93 int timerfd; 94 #endif 95 }; 96 97 static void *epoll_init(struct event_base *); 98 static int epoll_dispatch(struct event_base *, struct timeval *); 99 static void epoll_dealloc(struct event_base *); 100 101 static const struct eventop epollops_changelist = { 102 "epoll (with changelist)", 103 epoll_init, 104 event_changelist_add_, 105 event_changelist_del_, 106 epoll_dispatch, 107 epoll_dealloc, 108 1, /* need reinit */ 109 EV_FEATURE_ET|EV_FEATURE_O1| EARLY_CLOSE_IF_HAVE_RDHUP, 110 EVENT_CHANGELIST_FDINFO_SIZE 111 }; 112 113 114 static int epoll_nochangelist_add(struct event_base *base, evutil_socket_t fd, 115 short old, short events, void *p); 116 static int epoll_nochangelist_del(struct event_base *base, evutil_socket_t fd, 117 short old, short events, void *p); 118 119 const struct eventop epollops = { 120 "epoll", 121 epoll_init, 122 epoll_nochangelist_add, 123 epoll_nochangelist_del, 124 epoll_dispatch, 125 epoll_dealloc, 126 1, /* need reinit */ 127 EV_FEATURE_ET|EV_FEATURE_O1|EV_FEATURE_EARLY_CLOSE, 128 0 129 }; 130 131 #define INITIAL_NEVENT 32 132 #define MAX_NEVENT 4096 133 134 /* On Linux kernels at least up to 2.6.24.4, epoll can't handle timeout 135 * values bigger than (LONG_MAX - 999ULL)/HZ. HZ in the wild can be 136 * as big as 1000, and LONG_MAX can be as small as (1<<31)-1, so the 137 * largest number of msec we can support here is 2147482. Let's 138 * round that down by 47 seconds. 139 */ 140 #define MAX_EPOLL_TIMEOUT_MSEC (35*60*1000) 141 142 static void * 143 epoll_init(struct event_base *base) 144 { 145 int epfd = -1; 146 struct epollop *epollop; 147 148 #ifdef EVENT__HAVE_EPOLL_CREATE1 149 /* First, try the shiny new epoll_create1 interface, if we have it. */ 150 epfd = epoll_create1(EPOLL_CLOEXEC); 151 #endif 152 if (epfd == -1) { 153 /* Initialize the kernel queue using the old interface. (The 154 size field is ignored since 2.6.8.) */ 155 if ((epfd = epoll_create(32000)) == -1) { 156 if (errno != ENOSYS) 157 event_warn("epoll_create"); 158 return (NULL); 159 } 160 evutil_make_socket_closeonexec(epfd); 161 } 162 163 if (!(epollop = mm_calloc(1, sizeof(struct epollop)))) { 164 close(epfd); 165 return (NULL); 166 } 167 168 epollop->epfd = epfd; 169 170 /* Initialize fields */ 171 epollop->events = mm_calloc(INITIAL_NEVENT, sizeof(struct epoll_event)); 172 if (epollop->events == NULL) { 173 mm_free(epollop); 174 close(epfd); 175 return (NULL); 176 } 177 epollop->nevents = INITIAL_NEVENT; 178 179 if ((base->flags & EVENT_BASE_FLAG_EPOLL_USE_CHANGELIST) != 0 || 180 ((base->flags & EVENT_BASE_FLAG_IGNORE_ENV) == 0 && 181 evutil_getenv_("EVENT_EPOLL_USE_CHANGELIST") != NULL)) { 182 183 base->evsel = &epollops_changelist; 184 } 185 186 #ifdef USING_TIMERFD 187 /* 188 The epoll interface ordinarily gives us one-millisecond precision, 189 so on Linux it makes perfect sense to use the CLOCK_MONOTONIC_COARSE 190 timer. But when the user has set the new PRECISE_TIMER flag for an 191 event_base, we can try to use timerfd to give them finer granularity. 192 */ 193 if ((base->flags & EVENT_BASE_FLAG_PRECISE_TIMER) && 194 base->monotonic_timer.monotonic_clock == CLOCK_MONOTONIC) { 195 int fd; 196 fd = epollop->timerfd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK|TFD_CLOEXEC); 197 if (epollop->timerfd >= 0) { 198 struct epoll_event epev; 199 memset(&epev, 0, sizeof(epev)); 200 epev.data.fd = epollop->timerfd; 201 epev.events = EPOLLIN; 202 if (epoll_ctl(epollop->epfd, EPOLL_CTL_ADD, fd, &epev) < 0) { 203 event_warn("epoll_ctl(timerfd)"); 204 close(fd); 205 epollop->timerfd = -1; 206 } 207 } else { 208 if (errno != EINVAL && errno != ENOSYS) { 209 /* These errors probably mean that we were 210 * compiled with timerfd/TFD_* support, but 211 * we're running on a kernel that lacks those. 212 */ 213 event_warn("timerfd_create"); 214 } 215 epollop->timerfd = -1; 216 } 217 } else { 218 epollop->timerfd = -1; 219 } 220 #endif 221 222 evsig_init_(base); 223 224 return (epollop); 225 } 226 227 static const char * 228 change_to_string(int change) 229 { 230 change &= (EV_CHANGE_ADD|EV_CHANGE_DEL); 231 if (change == EV_CHANGE_ADD) { 232 return "add"; 233 } else if (change == EV_CHANGE_DEL) { 234 return "del"; 235 } else if (change == 0) { 236 return "none"; 237 } else { 238 return "???"; 239 } 240 } 241 242 static const char * 243 epoll_op_to_string(int op) 244 { 245 return op == EPOLL_CTL_ADD?"ADD": 246 op == EPOLL_CTL_DEL?"DEL": 247 op == EPOLL_CTL_MOD?"MOD": 248 "???"; 249 } 250 251 static int 252 epoll_apply_one_change(struct event_base *base, 253 struct epollop *epollop, 254 const struct event_change *ch) 255 { 256 struct epoll_event epev; 257 int op, events = 0; 258 int idx; 259 260 idx = EPOLL_OP_TABLE_INDEX(ch); 261 op = epoll_op_table[idx].op; 262 events = epoll_op_table[idx].events; 263 264 if (!events) { 265 EVUTIL_ASSERT(op == 0); 266 return 0; 267 } 268 269 if ((ch->read_change|ch->write_change) & EV_CHANGE_ET) 270 events |= EPOLLET; 271 272 memset(&epev, 0, sizeof(epev)); 273 epev.data.fd = ch->fd; 274 epev.events = events; 275 if (epoll_ctl(epollop->epfd, op, ch->fd, &epev) == 0) { 276 event_debug(("Epoll %s(%d) on fd %d okay. [old events were %d; read change was %d; write change was %d; close change was %d]", 277 epoll_op_to_string(op), 278 (int)epev.events, 279 (int)ch->fd, 280 ch->old_events, 281 ch->read_change, 282 ch->write_change, 283 ch->close_change)); 284 return 0; 285 } 286 287 switch (op) { 288 case EPOLL_CTL_MOD: 289 if (errno == ENOENT) { 290 /* If a MOD operation fails with ENOENT, the 291 * fd was probably closed and re-opened. We 292 * should retry the operation as an ADD. 293 */ 294 if (epoll_ctl(epollop->epfd, EPOLL_CTL_ADD, ch->fd, &epev) == -1) { 295 event_warn("Epoll MOD(%d) on %d retried as ADD; that failed too", 296 (int)epev.events, ch->fd); 297 return -1; 298 } else { 299 event_debug(("Epoll MOD(%d) on %d retried as ADD; succeeded.", 300 (int)epev.events, 301 ch->fd)); 302 return 0; 303 } 304 } 305 break; 306 case EPOLL_CTL_ADD: 307 if (errno == EEXIST) { 308 /* If an ADD operation fails with EEXIST, 309 * either the operation was redundant (as with a 310 * precautionary add), or we ran into a fun 311 * kernel bug where using dup*() to duplicate the 312 * same file into the same fd gives you the same epitem 313 * rather than a fresh one. For the second case, 314 * we must retry with MOD. */ 315 if (epoll_ctl(epollop->epfd, EPOLL_CTL_MOD, ch->fd, &epev) == -1) { 316 event_warn("Epoll ADD(%d) on %d retried as MOD; that failed too", 317 (int)epev.events, ch->fd); 318 return -1; 319 } else { 320 event_debug(("Epoll ADD(%d) on %d retried as MOD; succeeded.", 321 (int)epev.events, 322 ch->fd)); 323 return 0; 324 } 325 } 326 break; 327 case EPOLL_CTL_DEL: 328 if (errno == ENOENT || errno == EBADF || errno == EPERM) { 329 /* If a delete fails with one of these errors, 330 * that's fine too: we closed the fd before we 331 * got around to calling epoll_dispatch. */ 332 event_debug(("Epoll DEL(%d) on fd %d gave %s: DEL was unnecessary.", 333 (int)epev.events, 334 ch->fd, 335 strerror(errno))); 336 return 0; 337 } 338 break; 339 default: 340 break; 341 } 342 343 event_warn("Epoll %s(%d) on fd %d failed. Old events were %d; read change was %d (%s); write change was %d (%s); close change was %d (%s)", 344 epoll_op_to_string(op), 345 (int)epev.events, 346 ch->fd, 347 ch->old_events, 348 ch->read_change, 349 change_to_string(ch->read_change), 350 ch->write_change, 351 change_to_string(ch->write_change), 352 ch->close_change, 353 change_to_string(ch->close_change)); 354 355 return -1; 356 } 357 358 static int 359 epoll_apply_changes(struct event_base *base) 360 { 361 struct event_changelist *changelist = &base->changelist; 362 struct epollop *epollop = base->evbase; 363 struct event_change *ch; 364 365 int r = 0; 366 int i; 367 368 for (i = 0; i < changelist->n_changes; ++i) { 369 ch = &changelist->changes[i]; 370 if (epoll_apply_one_change(base, epollop, ch) < 0) 371 r = -1; 372 } 373 374 return (r); 375 } 376 377 static int 378 epoll_nochangelist_add(struct event_base *base, evutil_socket_t fd, 379 short old, short events, void *p) 380 { 381 struct event_change ch; 382 ch.fd = fd; 383 ch.old_events = old; 384 ch.read_change = ch.write_change = ch.close_change = 0; 385 if (events & EV_WRITE) 386 ch.write_change = EV_CHANGE_ADD | 387 (events & EV_ET); 388 if (events & EV_READ) 389 ch.read_change = EV_CHANGE_ADD | 390 (events & EV_ET); 391 if (events & EV_CLOSED) 392 ch.close_change = EV_CHANGE_ADD | 393 (events & EV_ET); 394 395 return epoll_apply_one_change(base, base->evbase, &ch); 396 } 397 398 static int 399 epoll_nochangelist_del(struct event_base *base, evutil_socket_t fd, 400 short old, short events, void *p) 401 { 402 struct event_change ch; 403 ch.fd = fd; 404 ch.old_events = old; 405 ch.read_change = ch.write_change = ch.close_change = 0; 406 if (events & EV_WRITE) 407 ch.write_change = EV_CHANGE_DEL; 408 if (events & EV_READ) 409 ch.read_change = EV_CHANGE_DEL; 410 if (events & EV_CLOSED) 411 ch.close_change = EV_CHANGE_DEL; 412 413 return epoll_apply_one_change(base, base->evbase, &ch); 414 } 415 416 static int 417 epoll_dispatch(struct event_base *base, struct timeval *tv) 418 { 419 struct epollop *epollop = base->evbase; 420 struct epoll_event *events = epollop->events; 421 int i, res; 422 long timeout = -1; 423 424 #ifdef USING_TIMERFD 425 if (epollop->timerfd >= 0) { 426 struct itimerspec is; 427 is.it_interval.tv_sec = 0; 428 is.it_interval.tv_nsec = 0; 429 if (tv == NULL) { 430 /* No timeout; disarm the timer. */ 431 is.it_value.tv_sec = 0; 432 is.it_value.tv_nsec = 0; 433 } else { 434 if (tv->tv_sec == 0 && tv->tv_usec == 0) { 435 /* we need to exit immediately; timerfd can't 436 * do that. */ 437 timeout = 0; 438 } 439 is.it_value.tv_sec = tv->tv_sec; 440 is.it_value.tv_nsec = tv->tv_usec * 1000; 441 } 442 /* TODO: we could avoid unnecessary syscalls here by only 443 calling timerfd_settime when the top timeout changes, or 444 when we're called with a different timeval. 445 */ 446 if (timerfd_settime(epollop->timerfd, 0, &is, NULL) < 0) { 447 event_warn("timerfd_settime"); 448 } 449 } else 450 #endif 451 if (tv != NULL) { 452 timeout = evutil_tv_to_msec_(tv); 453 if (timeout < 0 || timeout > MAX_EPOLL_TIMEOUT_MSEC) { 454 /* Linux kernels can wait forever if the timeout is 455 * too big; see comment on MAX_EPOLL_TIMEOUT_MSEC. */ 456 timeout = MAX_EPOLL_TIMEOUT_MSEC; 457 } 458 } 459 460 epoll_apply_changes(base); 461 event_changelist_remove_all_(&base->changelist, base); 462 463 EVBASE_RELEASE_LOCK(base, th_base_lock); 464 465 res = epoll_wait(epollop->epfd, events, epollop->nevents, timeout); 466 467 EVBASE_ACQUIRE_LOCK(base, th_base_lock); 468 469 if (res == -1) { 470 if (errno != EINTR) { 471 event_warn("epoll_wait"); 472 return (-1); 473 } 474 475 return (0); 476 } 477 478 event_debug(("%s: epoll_wait reports %d", __func__, res)); 479 EVUTIL_ASSERT(res <= epollop->nevents); 480 481 for (i = 0; i < res; i++) { 482 int what = events[i].events; 483 short ev = 0; 484 #ifdef USING_TIMERFD 485 if (events[i].data.fd == epollop->timerfd) 486 continue; 487 #endif 488 489 if (what & (EPOLLHUP|EPOLLERR)) { 490 ev = EV_READ | EV_WRITE; 491 } else { 492 if (what & EPOLLIN) 493 ev |= EV_READ; 494 if (what & EPOLLOUT) 495 ev |= EV_WRITE; 496 if (what & EPOLLRDHUP) 497 ev |= EV_CLOSED; 498 } 499 500 if (!ev) 501 continue; 502 503 evmap_io_active_(base, events[i].data.fd, ev | EV_ET); 504 } 505 506 if (res == epollop->nevents && epollop->nevents < MAX_NEVENT) { 507 /* We used all of the event space this time. We should 508 be ready for more events next time. */ 509 int new_nevents = epollop->nevents * 2; 510 struct epoll_event *new_events; 511 512 new_events = mm_realloc(epollop->events, 513 new_nevents * sizeof(struct epoll_event)); 514 if (new_events) { 515 epollop->events = new_events; 516 epollop->nevents = new_nevents; 517 } 518 } 519 520 return (0); 521 } 522 523 524 static void 525 epoll_dealloc(struct event_base *base) 526 { 527 struct epollop *epollop = base->evbase; 528 529 evsig_dealloc_(base); 530 if (epollop->events) 531 mm_free(epollop->events); 532 if (epollop->epfd >= 0) 533 close(epollop->epfd); 534 #ifdef USING_TIMERFD 535 if (epollop->timerfd >= 0) 536 close(epollop->timerfd); 537 #endif 538 539 memset(epollop, 0, sizeof(struct epollop)); 540 mm_free(epollop); 541 } 542 543 #endif /* EVENT__HAVE_EPOLL */ 544