1 /* $NetBSD: epoll.c,v 1.1.1.1 2013/12/27 23:31:19 christos Exp $ */ 2 3 /* 4 * Copyright 2000-2007 Niels Provos <provos@citi.umich.edu> 5 * Copyright 2007-2012 Niels Provos, Nick Mathewson 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 3. The name of the author may not be used to endorse or promote products 16 * derived from this software without specific prior written permission. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 19 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 20 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 21 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 22 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 23 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 27 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
28 */ 29 #include "event2/event-config.h" 30 #include "evconfig-private.h" 31 32 #ifdef EVENT__HAVE_EPOLL 33 34 #include <stdint.h> 35 #include <sys/types.h> 36 #include <sys/resource.h> 37 #ifdef EVENT__HAVE_SYS_TIME_H 38 #include <sys/time.h> 39 #endif 40 #include <sys/queue.h> 41 #include <sys/epoll.h> 42 #include <signal.h> 43 #include <limits.h> 44 #include <stdio.h> 45 #include <stdlib.h> 46 #include <string.h> 47 #include <unistd.h> 48 #include <errno.h> 49 #ifdef EVENT__HAVE_FCNTL_H 50 #include <fcntl.h> 51 #endif 52 #ifdef EVENT__HAVE_SYS_TIMERFD_H 53 #include <sys/timerfd.h> 54 #endif 55 56 #include "event-internal.h" 57 #include "evsignal-internal.h" 58 #include "event2/thread.h" 59 #include "evthread-internal.h" 60 #include "log-internal.h" 61 #include "evmap-internal.h" 62 #include "changelist-internal.h" 63 #include "time-internal.h" 64 65 #if defined(EVENT__HAVE_SYS_TIMERFD_H) && \ 66 defined(EVENT__HAVE_TIMERFD_CREATE) && \ 67 defined(HAVE_POSIX_MONOTONIC) && defined(TFD_NONBLOCK) && \ 68 defined(TFD_CLOEXEC) 69 /* Note that we only use timerfd if TFD_NONBLOCK and TFD_CLOEXEC are available 70 and working. This means that we can't support it on 2.6.25 (where timerfd 71 was introduced) or 2.6.26, since 2.6.27 introduced those flags. 
72 */ 73 #define USING_TIMERFD 74 #endif 75 76 struct epollop { 77 struct epoll_event *events; 78 int nevents; 79 int epfd; 80 #ifdef USING_TIMERFD 81 int timerfd; 82 #endif 83 }; 84 85 static void *epoll_init(struct event_base *); 86 static int epoll_dispatch(struct event_base *, struct timeval *); 87 static void epoll_dealloc(struct event_base *); 88 89 static const struct eventop epollops_changelist = { 90 "epoll (with changelist)", 91 epoll_init, 92 event_changelist_add_, 93 event_changelist_del_, 94 epoll_dispatch, 95 epoll_dealloc, 96 1, /* need reinit */ 97 EV_FEATURE_ET|EV_FEATURE_O1, 98 EVENT_CHANGELIST_FDINFO_SIZE 99 }; 100 101 102 static int epoll_nochangelist_add(struct event_base *base, evutil_socket_t fd, 103 short old, short events, void *p); 104 static int epoll_nochangelist_del(struct event_base *base, evutil_socket_t fd, 105 short old, short events, void *p); 106 107 const struct eventop epollops = { 108 "epoll", 109 epoll_init, 110 epoll_nochangelist_add, 111 epoll_nochangelist_del, 112 epoll_dispatch, 113 epoll_dealloc, 114 1, /* need reinit */ 115 EV_FEATURE_ET|EV_FEATURE_O1, 116 0 117 }; 118 119 #define INITIAL_NEVENT 32 120 #define MAX_NEVENT 4096 121 122 /* On Linux kernels at least up to 2.6.24.4, epoll can't handle timeout 123 * values bigger than (LONG_MAX - 999ULL)/HZ. HZ in the wild can be 124 * as big as 1000, and LONG_MAX can be as small as (1<<31)-1, so the 125 * largest number of msec we can support here is 2147482. Let's 126 * round that down by 47 seconds. 127 */ 128 #define MAX_EPOLL_TIMEOUT_MSEC (35*60*1000) 129 130 static void * 131 epoll_init(struct event_base *base) 132 { 133 int epfd = -1; 134 struct epollop *epollop; 135 136 #ifdef EVENT__HAVE_EPOLL_CREATE1 137 /* First, try the shiny new epoll_create1 interface, if we have it. */ 138 epfd = epoll_create1(EPOLL_CLOEXEC); 139 #endif 140 if (epfd == -1) { 141 /* Initialize the kernel queue using the old interface. (The 142 size field is ignored since 2.6.8.) 
*/ 143 if ((epfd = epoll_create(32000)) == -1) { 144 if (errno != ENOSYS) 145 event_warn("epoll_create"); 146 return (NULL); 147 } 148 evutil_make_socket_closeonexec(epfd); 149 } 150 151 if (!(epollop = mm_calloc(1, sizeof(struct epollop)))) { 152 close(epfd); 153 return (NULL); 154 } 155 156 epollop->epfd = epfd; 157 158 /* Initialize fields */ 159 epollop->events = mm_calloc(INITIAL_NEVENT, sizeof(struct epoll_event)); 160 if (epollop->events == NULL) { 161 mm_free(epollop); 162 close(epfd); 163 return (NULL); 164 } 165 epollop->nevents = INITIAL_NEVENT; 166 167 if ((base->flags & EVENT_BASE_FLAG_EPOLL_USE_CHANGELIST) != 0 || 168 ((base->flags & EVENT_BASE_FLAG_IGNORE_ENV) == 0 && 169 evutil_getenv_("EVENT_EPOLL_USE_CHANGELIST") != NULL)) { 170 171 base->evsel = &epollops_changelist; 172 } 173 174 #ifdef USING_TIMERFD 175 /* 176 The epoll interface ordinarily gives us one-millisecond precision, 177 so on Linux it makes perfect sense to use the CLOCK_MONOTONIC_COARSE 178 timer. But when the user has set the new PRECISE_TIMER flag for an 179 event_base, we can try to use timerfd to give them finer granularity. 180 */ 181 if ((base->flags & EVENT_BASE_FLAG_PRECISE_TIMER) && 182 base->monotonic_timer.monotonic_clock == CLOCK_MONOTONIC) { 183 int fd; 184 fd = epollop->timerfd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK|TFD_CLOEXEC); 185 if (epollop->timerfd >= 0) { 186 struct epoll_event epev; 187 memset(&epev, 0, sizeof(epev)); 188 epev.data.fd = epollop->timerfd; 189 epev.events = EPOLLIN; 190 if (epoll_ctl(epollop->epfd, EPOLL_CTL_ADD, fd, &epev) < 0) { 191 event_warn("epoll_ctl(timerfd)"); 192 close(fd); 193 epollop->timerfd = -1; 194 } 195 } else { 196 if (errno != EINVAL && errno != ENOSYS) { 197 /* These errors probably mean that we were 198 * compiled with timerfd/TFD_* support, but 199 * we're running on a kernel that lacks those. 
200 */ 201 event_warn("timerfd_create"); 202 } 203 epollop->timerfd = -1; 204 } 205 } else { 206 epollop->timerfd = -1; 207 } 208 #endif 209 210 evsig_init_(base); 211 212 return (epollop); 213 } 214 215 static const char * 216 change_to_string(int change) 217 { 218 change &= (EV_CHANGE_ADD|EV_CHANGE_DEL); 219 if (change == EV_CHANGE_ADD) { 220 return "add"; 221 } else if (change == EV_CHANGE_DEL) { 222 return "del"; 223 } else if (change == 0) { 224 return "none"; 225 } else { 226 return "???"; 227 } 228 } 229 230 static const char * 231 epoll_op_to_string(int op) 232 { 233 return op == EPOLL_CTL_ADD?"ADD": 234 op == EPOLL_CTL_DEL?"DEL": 235 op == EPOLL_CTL_MOD?"MOD": 236 "???"; 237 } 238 239 /* 240 Here are the values we're masking off to decide what operations to do. 241 Note that since EV_READ|EV_WRITE. 242 243 Note also that this table is a little sparse, since ADD+DEL is 244 nonsensical ("xxx" in the list below.) 245 246 Note also also that we are shifting old_events by only 3 bits, since 247 EV_READ is 2 and EV_WRITE is 4. 248 249 The table was auto-generated with a python script, according to this 250 pseudocode: 251 252 If either the read or the write change is add+del: 253 This is impossible; Set op==-1, events=0. 254 Else, if either the read or the write change is add: 255 Set events to 0. 256 If the read change is add, or 257 (the read change is not del, and ev_read is in old_events): 258 Add EPOLLIN to events. 259 If the write change is add, or 260 (the write change is not del, and ev_write is in old_events): 261 Add EPOLLOUT to events. 262 263 If old_events is set: 264 Set op to EPOLL_CTL_MOD [*1,*2] 265 Else: 266 Set op to EPOLL_CTL_ADD [*3] 267 268 Else, if the read or the write change is del: 269 Set op to EPOLL_CTL_DEL. 
         If the read change is del:
             If the write change is del:
                 Set events to EPOLLIN|EPOLLOUT
             Else if ev_write is in old_events:
                 Set events to EPOLLOUT
                 Set op to EPOLL_CTL_MOD
             Else
                 Set events to EPOLLIN
         Else:
             {The write change is del.}
             If ev_read is in old_events:
                 Set events to EPOLLIN
                 Set op to EPOLL_CTL_MOD
             Else:
                 Set the events to EPOLLOUT

      Else:
           There is no read or write change; set op to 0 and events to 0.

  The logic is a little tricky, since if we had no events set on the fd before,
  we need to set op="ADD" and set events=the events we want to add.  If we
  had any events set on the fd before, and we want any events to remain on
  the fd, we need to say op="MOD" and set events=the events we want to
  remain.  But if we want to delete the last event, we say op="DEL" and
  set events=(any non-null pointer).

  [*1] This MOD is only a guess.  MOD might fail with ENOENT if the file was
       closed and a new file was opened with the same fd.  If so, we'll retry
       with ADD.

  [*2] We can't replace this with a no-op even if old_events is the same as
       the new events: if the file was closed and reopened, we need to retry
       with an ADD.  (We do a MOD in this case since "no change" is more
       common than "close and reopen", so we'll usually wind up doing 1
       syscall instead of 2.)

  [*3] This ADD is only a guess.  There is a fun Linux kernel issue where if
       you have two fds for the same file (via dup) and you ADD one to an
       epfd, then close it, then re-create it with the same fd (via dup2 or an
       unlucky dup), then try to ADD it again, you'll get an EEXIST, since the
       struct epitem is not actually removed from the struct eventpoll until
       the file itself is closed.
312 313 EV_CHANGE_ADD==1 314 EV_CHANGE_DEL==2 315 EV_READ ==2 316 EV_WRITE ==4 317 Bit 0: read change is add 318 Bit 1: read change is del 319 Bit 2: write change is add 320 Bit 3: write change is del 321 Bit 4: old events had EV_READ 322 Bit 5: old events had EV_WRITE 323 */ 324 325 #define INDEX(c) \ 326 ( (((c)->read_change&(EV_CHANGE_ADD|EV_CHANGE_DEL))) | \ 327 (((c)->write_change&(EV_CHANGE_ADD|EV_CHANGE_DEL)) << 2) | \ 328 (((c)->old_events&(EV_READ|EV_WRITE)) << 3) ) 329 330 #if EV_READ != 2 || EV_WRITE != 4 || EV_CHANGE_ADD != 1 || EV_CHANGE_DEL != 2 331 #error "Libevent's internals changed! Regenerate the op_table in epoll.c" 332 #endif 333 334 static const struct operation { 335 int events; 336 int op; 337 } op_table[] = { 338 { 0, 0 }, /* old= 0, write: 0, read: 0 */ 339 { EPOLLIN, EPOLL_CTL_ADD }, /* old= 0, write: 0, read:add */ 340 { EPOLLIN, EPOLL_CTL_DEL }, /* old= 0, write: 0, read:del */ 341 { 0, -1 }, /* old= 0, write: 0, read:xxx */ 342 { EPOLLOUT, EPOLL_CTL_ADD }, /* old= 0, write:add, read: 0 */ 343 { EPOLLIN|EPOLLOUT, EPOLL_CTL_ADD },/* old= 0, write:add, read:add */ 344 { EPOLLOUT, EPOLL_CTL_ADD }, /* old= 0, write:add, read:del */ 345 { 0, -1 }, /* old= 0, write:add, read:xxx */ 346 { EPOLLOUT, EPOLL_CTL_DEL }, /* old= 0, write:del, read: 0 */ 347 { EPOLLIN, EPOLL_CTL_ADD }, /* old= 0, write:del, read:add */ 348 { EPOLLIN|EPOLLOUT, EPOLL_CTL_DEL },/* old= 0, write:del, read:del */ 349 { 0, -1 }, /* old= 0, write:del, read:xxx */ 350 { 0, -1 }, /* old= 0, write:xxx, read: 0 */ 351 { 0, -1 }, /* old= 0, write:xxx, read:add */ 352 { 0, -1 }, /* old= 0, write:xxx, read:del */ 353 { 0, -1 }, /* old= 0, write:xxx, read:xxx */ 354 { 0, 0 }, /* old= r, write: 0, read: 0 */ 355 { EPOLLIN, EPOLL_CTL_MOD }, /* old= r, write: 0, read:add */ 356 { EPOLLIN, EPOLL_CTL_DEL }, /* old= r, write: 0, read:del */ 357 { 0, -1 }, /* old= r, write: 0, read:xxx */ 358 { EPOLLIN|EPOLLOUT, EPOLL_CTL_MOD },/* old= r, write:add, read: 0 */ 359 { EPOLLIN|EPOLLOUT, 
EPOLL_CTL_MOD },/* old= r, write:add, read:add */ 360 { EPOLLOUT, EPOLL_CTL_MOD }, /* old= r, write:add, read:del */ 361 { 0, -1 }, /* old= r, write:add, read:xxx */ 362 { EPOLLIN, EPOLL_CTL_MOD }, /* old= r, write:del, read: 0 */ 363 { EPOLLIN, EPOLL_CTL_MOD }, /* old= r, write:del, read:add */ 364 { EPOLLIN|EPOLLOUT, EPOLL_CTL_DEL },/* old= r, write:del, read:del */ 365 { 0, -1 }, /* old= r, write:del, read:xxx */ 366 { 0, -1 }, /* old= r, write:xxx, read: 0 */ 367 { 0, -1 }, /* old= r, write:xxx, read:add */ 368 { 0, -1 }, /* old= r, write:xxx, read:del */ 369 { 0, -1 }, /* old= r, write:xxx, read:xxx */ 370 { 0, 0 }, /* old= w, write: 0, read: 0 */ 371 { EPOLLIN|EPOLLOUT, EPOLL_CTL_MOD },/* old= w, write: 0, read:add */ 372 { EPOLLOUT, EPOLL_CTL_MOD }, /* old= w, write: 0, read:del */ 373 { 0, -1 }, /* old= w, write: 0, read:xxx */ 374 { EPOLLOUT, EPOLL_CTL_MOD }, /* old= w, write:add, read: 0 */ 375 { EPOLLIN|EPOLLOUT, EPOLL_CTL_MOD },/* old= w, write:add, read:add */ 376 { EPOLLOUT, EPOLL_CTL_MOD }, /* old= w, write:add, read:del */ 377 { 0, -1 }, /* old= w, write:add, read:xxx */ 378 { EPOLLOUT, EPOLL_CTL_DEL }, /* old= w, write:del, read: 0 */ 379 { EPOLLIN, EPOLL_CTL_MOD }, /* old= w, write:del, read:add */ 380 { EPOLLIN|EPOLLOUT, EPOLL_CTL_DEL },/* old= w, write:del, read:del */ 381 { 0, -1 }, /* old= w, write:del, read:xxx */ 382 { 0, -1 }, /* old= w, write:xxx, read: 0 */ 383 { 0, -1 }, /* old= w, write:xxx, read:add */ 384 { 0, -1 }, /* old= w, write:xxx, read:del */ 385 { 0, -1 }, /* old= w, write:xxx, read:xxx */ 386 { 0, 0 }, /* old=rw, write: 0, read: 0 */ 387 { EPOLLIN|EPOLLOUT, EPOLL_CTL_MOD },/* old=rw, write: 0, read:add */ 388 { EPOLLOUT, EPOLL_CTL_MOD }, /* old=rw, write: 0, read:del */ 389 { 0, -1 }, /* old=rw, write: 0, read:xxx */ 390 { EPOLLIN|EPOLLOUT, EPOLL_CTL_MOD },/* old=rw, write:add, read: 0 */ 391 { EPOLLIN|EPOLLOUT, EPOLL_CTL_MOD },/* old=rw, write:add, read:add */ 392 { EPOLLOUT, EPOLL_CTL_MOD }, /* old=rw, write:add, read:del 
*/ 393 { 0, -1 }, /* old=rw, write:add, read:xxx */ 394 { EPOLLIN, EPOLL_CTL_MOD }, /* old=rw, write:del, read: 0 */ 395 { EPOLLIN, EPOLL_CTL_MOD }, /* old=rw, write:del, read:add */ 396 { EPOLLIN|EPOLLOUT, EPOLL_CTL_DEL },/* old=rw, write:del, read:del */ 397 { 0, -1 }, /* old=rw, write:del, read:xxx */ 398 { 0, -1 }, /* old=rw, write:xxx, read: 0 */ 399 { 0, -1 }, /* old=rw, write:xxx, read:add */ 400 { 0, -1 }, /* old=rw, write:xxx, read:del */ 401 { 0, -1 }, /* old=rw, write:xxx, read:xxx */ 402 }; 403 404 static int 405 epoll_apply_one_change(struct event_base *base, 406 struct epollop *epollop, 407 const struct event_change *ch) 408 { 409 struct epoll_event epev; 410 int op, events = 0; 411 int idx; 412 413 idx = INDEX(ch); 414 op = op_table[idx].op; 415 events = op_table[idx].events; 416 417 if (!events) { 418 EVUTIL_ASSERT(op == 0); 419 return 0; 420 } 421 422 if ((ch->read_change|ch->write_change) & EV_CHANGE_ET) 423 events |= EPOLLET; 424 425 memset(&epev, 0, sizeof(epev)); 426 epev.data.fd = ch->fd; 427 epev.events = events; 428 if (epoll_ctl(epollop->epfd, op, ch->fd, &epev) == 0) { 429 event_debug(("Epoll %s(%d) on fd %d okay. [old events were %d; read change was %d; write change was %d]", 430 epoll_op_to_string(op), 431 (int)epev.events, 432 (int)ch->fd, 433 ch->old_events, 434 ch->read_change, 435 ch->write_change)); 436 return 0; 437 } 438 439 switch (op) { 440 case EPOLL_CTL_MOD: 441 if (errno == ENOENT) { 442 /* If a MOD operation fails with ENOENT, the 443 * fd was probably closed and re-opened. We 444 * should retry the operation as an ADD. 
445 */ 446 if (epoll_ctl(epollop->epfd, EPOLL_CTL_ADD, ch->fd, &epev) == -1) { 447 event_warn("Epoll MOD(%d) on %d retried as ADD; that failed too", 448 (int)epev.events, ch->fd); 449 return -1; 450 } else { 451 event_debug(("Epoll MOD(%d) on %d retried as ADD; succeeded.", 452 (int)epev.events, 453 ch->fd)); 454 return 0; 455 } 456 } 457 break; 458 case EPOLL_CTL_ADD: 459 if (errno == EEXIST) { 460 /* If an ADD operation fails with EEXIST, 461 * either the operation was redundant (as with a 462 * precautionary add), or we ran into a fun 463 * kernel bug where using dup*() to duplicate the 464 * same file into the same fd gives you the same epitem 465 * rather than a fresh one. For the second case, 466 * we must retry with MOD. */ 467 if (epoll_ctl(epollop->epfd, EPOLL_CTL_MOD, ch->fd, &epev) == -1) { 468 event_warn("Epoll ADD(%d) on %d retried as MOD; that failed too", 469 (int)epev.events, ch->fd); 470 return -1; 471 } else { 472 event_debug(("Epoll ADD(%d) on %d retried as MOD; succeeded.", 473 (int)epev.events, 474 ch->fd)); 475 return 0; 476 } 477 } 478 break; 479 case EPOLL_CTL_DEL: 480 if (errno == ENOENT || errno == EBADF || errno == EPERM) { 481 /* If a delete fails with one of these errors, 482 * that's fine too: we closed the fd before we 483 * got around to calling epoll_dispatch. */ 484 event_debug(("Epoll DEL(%d) on fd %d gave %s: DEL was unnecessary.", 485 (int)epev.events, 486 ch->fd, 487 strerror(errno))); 488 return 0; 489 } 490 break; 491 default: 492 break; 493 } 494 495 event_warn("Epoll %s(%d) on fd %d failed. 
Old events were %d; read change was %d (%s); write change was %d (%s)", 496 epoll_op_to_string(op), 497 (int)epev.events, 498 ch->fd, 499 ch->old_events, 500 ch->read_change, 501 change_to_string(ch->read_change), 502 ch->write_change, 503 change_to_string(ch->write_change)); 504 505 return -1; 506 } 507 508 static int 509 epoll_apply_changes(struct event_base *base) 510 { 511 struct event_changelist *changelist = &base->changelist; 512 struct epollop *epollop = base->evbase; 513 struct event_change *ch; 514 515 int r = 0; 516 int i; 517 518 for (i = 0; i < changelist->n_changes; ++i) { 519 ch = &changelist->changes[i]; 520 if (epoll_apply_one_change(base, epollop, ch) < 0) 521 r = -1; 522 } 523 524 return (r); 525 } 526 527 static int 528 epoll_nochangelist_add(struct event_base *base, evutil_socket_t fd, 529 short old, short events, void *p) 530 { 531 struct event_change ch; 532 ch.fd = fd; 533 ch.old_events = old; 534 ch.read_change = ch.write_change = 0; 535 if (events & EV_WRITE) 536 ch.write_change = EV_CHANGE_ADD | 537 (events & EV_ET); 538 if (events & EV_READ) 539 ch.read_change = EV_CHANGE_ADD | 540 (events & EV_ET); 541 542 return epoll_apply_one_change(base, base->evbase, &ch); 543 } 544 545 static int 546 epoll_nochangelist_del(struct event_base *base, evutil_socket_t fd, 547 short old, short events, void *p) 548 { 549 struct event_change ch; 550 ch.fd = fd; 551 ch.old_events = old; 552 ch.read_change = ch.write_change = 0; 553 if (events & EV_WRITE) 554 ch.write_change = EV_CHANGE_DEL; 555 if (events & EV_READ) 556 ch.read_change = EV_CHANGE_DEL; 557 558 return epoll_apply_one_change(base, base->evbase, &ch); 559 } 560 561 static int 562 epoll_dispatch(struct event_base *base, struct timeval *tv) 563 { 564 struct epollop *epollop = base->evbase; 565 struct epoll_event *events = epollop->events; 566 int i, res; 567 long timeout = -1; 568 569 #ifdef USING_TIMERFD 570 if (epollop->timerfd >= 0) { 571 struct itimerspec is; 572 is.it_interval.tv_sec = 0; 
573 is.it_interval.tv_nsec = 0; 574 if (tv == NULL) { 575 /* No timeout; disarm the timer. */ 576 is.it_value.tv_sec = 0; 577 is.it_value.tv_nsec = 0; 578 } else { 579 if (tv->tv_sec == 0 && tv->tv_usec == 0) { 580 /* we need to exit immediately; timerfd can't 581 * do that. */ 582 timeout = 0; 583 } 584 is.it_value.tv_sec = tv->tv_sec; 585 is.it_value.tv_nsec = tv->tv_usec * 1000; 586 } 587 /* TODO: we could avoid unnecessary syscalls here by only 588 calling timerfd_settime when the top timeout changes, or 589 when we're called with a different timeval. 590 */ 591 if (timerfd_settime(epollop->timerfd, 0, &is, NULL) < 0) { 592 event_warn("timerfd_settime"); 593 } 594 } else 595 #endif 596 if (tv != NULL) { 597 timeout = evutil_tv_to_msec_(tv); 598 if (timeout < 0 || timeout > MAX_EPOLL_TIMEOUT_MSEC) { 599 /* Linux kernels can wait forever if the timeout is 600 * too big; see comment on MAX_EPOLL_TIMEOUT_MSEC. */ 601 timeout = MAX_EPOLL_TIMEOUT_MSEC; 602 } 603 } 604 605 epoll_apply_changes(base); 606 event_changelist_remove_all_(&base->changelist, base); 607 608 EVBASE_RELEASE_LOCK(base, th_base_lock); 609 610 res = epoll_wait(epollop->epfd, events, epollop->nevents, timeout); 611 612 EVBASE_ACQUIRE_LOCK(base, th_base_lock); 613 614 if (res == -1) { 615 if (errno != EINTR) { 616 event_warn("epoll_wait"); 617 return (-1); 618 } 619 620 return (0); 621 } 622 623 event_debug(("%s: epoll_wait reports %d", __func__, res)); 624 EVUTIL_ASSERT(res <= epollop->nevents); 625 626 for (i = 0; i < res; i++) { 627 int what = events[i].events; 628 short ev = 0; 629 #ifdef USING_TIMERFD 630 if (events[i].data.fd == epollop->timerfd) 631 continue; 632 #endif 633 634 if (what & (EPOLLHUP|EPOLLERR)) { 635 ev = EV_READ | EV_WRITE; 636 } else { 637 if (what & EPOLLIN) 638 ev |= EV_READ; 639 if (what & EPOLLOUT) 640 ev |= EV_WRITE; 641 } 642 643 if (!ev) 644 continue; 645 646 evmap_io_active_(base, events[i].data.fd, ev | EV_ET); 647 } 648 649 if (res == epollop->nevents && 
epollop->nevents < MAX_NEVENT) { 650 /* We used all of the event space this time. We should 651 be ready for more events next time. */ 652 int new_nevents = epollop->nevents * 2; 653 struct epoll_event *new_events; 654 655 new_events = mm_realloc(epollop->events, 656 new_nevents * sizeof(struct epoll_event)); 657 if (new_events) { 658 epollop->events = new_events; 659 epollop->nevents = new_nevents; 660 } 661 } 662 663 return (0); 664 } 665 666 667 static void 668 epoll_dealloc(struct event_base *base) 669 { 670 struct epollop *epollop = base->evbase; 671 672 evsig_dealloc_(base); 673 if (epollop->events) 674 mm_free(epollop->events); 675 if (epollop->epfd >= 0) 676 close(epollop->epfd); 677 #ifdef USING_TIMERFD 678 if (epollop->timerfd >= 0) 679 close(epollop->timerfd); 680 #endif 681 682 memset(epollop, 0, sizeof(struct epollop)); 683 mm_free(epollop); 684 } 685 686 #endif /* EVENT__HAVE_EPOLL */ 687