xref: /netbsd-src/external/bsd/ntp/dist/sntp/libevent/epoll.c (revision 6a493d6bc668897c91594964a732d38505b70cbb)
1 /*	$NetBSD: epoll.c,v 1.1.1.1 2013/12/27 23:31:19 christos Exp $	*/
2 
3 /*
4  * Copyright 2000-2007 Niels Provos <provos@citi.umich.edu>
5  * Copyright 2007-2012 Niels Provos, Nick Mathewson
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. The name of the author may not be used to endorse or promote products
16  *    derived from this software without specific prior written permission.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
19  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
20  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
21  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
22  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
23  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
27  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28  */
29 #include "event2/event-config.h"
30 #include "evconfig-private.h"
31 
32 #ifdef EVENT__HAVE_EPOLL
33 
34 #include <stdint.h>
35 #include <sys/types.h>
36 #include <sys/resource.h>
37 #ifdef EVENT__HAVE_SYS_TIME_H
38 #include <sys/time.h>
39 #endif
40 #include <sys/queue.h>
41 #include <sys/epoll.h>
42 #include <signal.h>
43 #include <limits.h>
44 #include <stdio.h>
45 #include <stdlib.h>
46 #include <string.h>
47 #include <unistd.h>
48 #include <errno.h>
49 #ifdef EVENT__HAVE_FCNTL_H
50 #include <fcntl.h>
51 #endif
52 #ifdef EVENT__HAVE_SYS_TIMERFD_H
53 #include <sys/timerfd.h>
54 #endif
55 
56 #include "event-internal.h"
57 #include "evsignal-internal.h"
58 #include "event2/thread.h"
59 #include "evthread-internal.h"
60 #include "log-internal.h"
61 #include "evmap-internal.h"
62 #include "changelist-internal.h"
63 #include "time-internal.h"
64 
65 #if defined(EVENT__HAVE_SYS_TIMERFD_H) &&			  \
66 	defined(EVENT__HAVE_TIMERFD_CREATE) &&			  \
67 	defined(HAVE_POSIX_MONOTONIC) && defined(TFD_NONBLOCK) && \
68 	defined(TFD_CLOEXEC)
69 /* Note that we only use timerfd if TFD_NONBLOCK and TFD_CLOEXEC are available
70    and working.  This means that we can't support it on 2.6.25 (where timerfd
71    was introduced) or 2.6.26, since 2.6.27 introduced those flags.
72  */
73 #define USING_TIMERFD
74 #endif
75 
/* Per-event_base state for the epoll backend; epoll_init() allocates it and
 * returns it, and the other backend functions read it from base->evbase. */
struct epollop {
	/* Buffer handed to epoll_wait() to receive ready events. */
	struct epoll_event *events;
	/* Number of entries allocated in 'events'. */
	int nevents;
	/* File descriptor of the epoll instance. */
	int epfd;
#ifdef USING_TIMERFD
	/* timerfd registered with epfd and armed in epoll_dispatch(), or -1
	 * when timerfd is not in use. */
	int timerfd;
#endif
};
84 
/* Forward declarations of the backend entry points referenced by the
 * eventop tables in this file. */
static void *epoll_init(struct event_base *);
static int epoll_dispatch(struct event_base *, struct timeval *);
static void epoll_dealloc(struct event_base *);

/* Variant of the epoll backend whose add/del entries are the generic
 * changelist functions; the queued changes are applied to the kernel from
 * epoll_dispatch().  epoll_init() switches base->evsel to this table when
 * EVENT_BASE_FLAG_EPOLL_USE_CHANGELIST is set, or when the
 * EVENT_EPOLL_USE_CHANGELIST environment variable is set and the
 * environment is not being ignored. */
static const struct eventop epollops_changelist = {
	"epoll (with changelist)",
	epoll_init,
	event_changelist_add_,
	event_changelist_del_,
	epoll_dispatch,
	epoll_dealloc,
	1, /* need reinit */
	EV_FEATURE_ET|EV_FEATURE_O1,
	EVENT_CHANGELIST_FDINFO_SIZE
};
100 
101 
/* Forward declarations of the add/del entry points that apply each change
 * to the kernel immediately instead of queueing it. */
static int epoll_nochangelist_add(struct event_base *base, evutil_socket_t fd,
    short old, short events, void *p);
static int epoll_nochangelist_del(struct event_base *base, evutil_socket_t fd,
    short old, short events, void *p);

/* Default (non-static) epoll backend table: every add/del is turned into
 * an epoll_ctl() call right away. */
const struct eventop epollops = {
	"epoll",
	epoll_init,
	epoll_nochangelist_add,
	epoll_nochangelist_del,
	epoll_dispatch,
	epoll_dealloc,
	1, /* need reinit */
	EV_FEATURE_ET|EV_FEATURE_O1,
	0
};
118 
/* Number of entries initially allocated for the epoll_wait() result
 * buffer. */
#define INITIAL_NEVENT 32
/* epoll_dispatch() stops doubling the result buffer once its size is no
 * longer below this value. */
#define MAX_NEVENT 4096

/* On Linux kernels at least up to 2.6.24.4, epoll can't handle timeout
 * values bigger than (LONG_MAX - 999ULL)/HZ.  HZ in the wild can be
 * as big as 1000, and LONG_MAX can be as small as (1<<31)-1, so the
 * largest number of msec we can support here is 2147482.  Let's
 * round that down by 47 seconds.
 *
 * epoll_dispatch() clamps negative or larger millisecond timeouts to this
 * value.
 */
#define MAX_EPOLL_TIMEOUT_MSEC (35*60*1000)
129 
130 static void *
131 epoll_init(struct event_base *base)
132 {
133 	int epfd = -1;
134 	struct epollop *epollop;
135 
136 #ifdef EVENT__HAVE_EPOLL_CREATE1
137 	/* First, try the shiny new epoll_create1 interface, if we have it. */
138 	epfd = epoll_create1(EPOLL_CLOEXEC);
139 #endif
140 	if (epfd == -1) {
141 		/* Initialize the kernel queue using the old interface.  (The
142 		size field is ignored   since 2.6.8.) */
143 		if ((epfd = epoll_create(32000)) == -1) {
144 			if (errno != ENOSYS)
145 				event_warn("epoll_create");
146 			return (NULL);
147 		}
148 		evutil_make_socket_closeonexec(epfd);
149 	}
150 
151 	if (!(epollop = mm_calloc(1, sizeof(struct epollop)))) {
152 		close(epfd);
153 		return (NULL);
154 	}
155 
156 	epollop->epfd = epfd;
157 
158 	/* Initialize fields */
159 	epollop->events = mm_calloc(INITIAL_NEVENT, sizeof(struct epoll_event));
160 	if (epollop->events == NULL) {
161 		mm_free(epollop);
162 		close(epfd);
163 		return (NULL);
164 	}
165 	epollop->nevents = INITIAL_NEVENT;
166 
167 	if ((base->flags & EVENT_BASE_FLAG_EPOLL_USE_CHANGELIST) != 0 ||
168 	    ((base->flags & EVENT_BASE_FLAG_IGNORE_ENV) == 0 &&
169 		evutil_getenv_("EVENT_EPOLL_USE_CHANGELIST") != NULL)) {
170 
171 		base->evsel = &epollops_changelist;
172 	}
173 
174 #ifdef USING_TIMERFD
175 	/*
176 	  The epoll interface ordinarily gives us one-millisecond precision,
177 	  so on Linux it makes perfect sense to use the CLOCK_MONOTONIC_COARSE
178 	  timer.  But when the user has set the new PRECISE_TIMER flag for an
179 	  event_base, we can try to use timerfd to give them finer granularity.
180 	*/
181 	if ((base->flags & EVENT_BASE_FLAG_PRECISE_TIMER) &&
182 	    base->monotonic_timer.monotonic_clock == CLOCK_MONOTONIC) {
183 		int fd;
184 		fd = epollop->timerfd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK|TFD_CLOEXEC);
185 		if (epollop->timerfd >= 0) {
186 			struct epoll_event epev;
187 			memset(&epev, 0, sizeof(epev));
188 			epev.data.fd = epollop->timerfd;
189 			epev.events = EPOLLIN;
190 			if (epoll_ctl(epollop->epfd, EPOLL_CTL_ADD, fd, &epev) < 0) {
191 				event_warn("epoll_ctl(timerfd)");
192 				close(fd);
193 				epollop->timerfd = -1;
194 			}
195 		} else {
196 			if (errno != EINVAL && errno != ENOSYS) {
197 				/* These errors probably mean that we were
198 				 * compiled with timerfd/TFD_* support, but
199 				 * we're running on a kernel that lacks those.
200 				 */
201 				event_warn("timerfd_create");
202 			}
203 			epollop->timerfd = -1;
204 		}
205 	} else {
206 		epollop->timerfd = -1;
207 	}
208 #endif
209 
210 	evsig_init_(base);
211 
212 	return (epollop);
213 }
214 
215 static const char *
216 change_to_string(int change)
217 {
218 	change &= (EV_CHANGE_ADD|EV_CHANGE_DEL);
219 	if (change == EV_CHANGE_ADD) {
220 		return "add";
221 	} else if (change == EV_CHANGE_DEL) {
222 		return "del";
223 	} else if (change == 0) {
224 		return "none";
225 	} else {
226 		return "???";
227 	}
228 }
229 
/*
 * Return the name of an epoll_ctl() operation code for log messages, or
 * "???" if 'op' is not one of EPOLL_CTL_ADD, EPOLL_CTL_DEL, EPOLL_CTL_MOD.
 */
static const char *
epoll_op_to_string(int op)
{
	switch (op) {
	case EPOLL_CTL_ADD:
		return "ADD";
	case EPOLL_CTL_DEL:
		return "DEL";
	case EPOLL_CTL_MOD:
		return "MOD";
	default:
		return "???";
	}
}
238 
239 /*
240   Here are the values we're masking off to decide what operations to do.
241   Note that only the EV_READ|EV_WRITE bits of old_events are used.
242 
243   Note also that this table is a little sparse, since ADD+DEL is
244   nonsensical ("xxx" in the list below.)
245 
246   Note also also that we are shifting old_events by only 3 bits, since
247   EV_READ is 2 and EV_WRITE is 4.
248 
249   The table was auto-generated with a python script, according to this
250   pseudocode:
251 
252       If either the read or the write change is add+del:
253 	 This is impossible; Set op==-1, events=0.
254       Else, if either the read or the write change is add:
255 	 Set events to 0.
256 	 If the read change is add, or
257 	    (the read change is not del, and ev_read is in old_events):
258 	       Add EPOLLIN to events.
259 	 If the write change is add, or
260 	    (the write change is not del, and ev_write is in old_events):
261 	       Add EPOLLOUT to events.
262 
263 	 If old_events is set:
264 	       Set op to EPOLL_CTL_MOD [*1,*2]
265 	Else:
266 	       Set op to EPOLL_CTL_ADD [*3]
267 
268       Else, if the read or the write change is del:
269 	 Set op to EPOLL_CTL_DEL.
270 	 If the read change is del:
271 	     If the write change is del:
272 		 Set events to EPOLLIN|EPOLLOUT
273 	     Else if ev_write is in old_events:
274 		 Set events to EPOLLOUT
275 		Set op to EPOLL_CTL_MOD
276 	     Else
277 		 Set events to EPOLLIN
278 	 Else:
279 	     {The write change is del.}
280 	    If ev_read is in old_events:
281 		 Set events to EPOLLIN
282 		Set op to EPOLL_CTL_MOD
283 	    Else:
284 		Set the events to EPOLLOUT
285 
286       Else:
287 	   There is no read or write change; set op to 0 and events to 0.
288 
289       The logic is a little tricky: if we had no events set on the fd before,
290       we need to set op="ADD" and set events=the events we want to add.  If we
291       had any events set on the fd before, and we want any events to remain on
292       the fd, we need to say op="MOD" and set events=the events we want to
293       remain.  But if we want to delete the last event, we say op="DEL" and
294       set events=(any non-null pointer).
295 
296   [*1] This MOD is only a guess.  MOD might fail with ENOENT if the file was
297        closed and a new file was opened with the same fd.  If so, we'll retry
298        with ADD.
299 
300   [*2] We can't replace this with a no-op even if old_events is the same as
301        the new events: if the file was closed and reopened, we need to retry
302        with an ADD.  (We do a MOD in this case since "no change" is more
303        common than "close and reopen", so we'll usually wind up doing 1
304        syscall instead of 2.)
305 
306   [*3] This ADD is only a guess.  There is a fun Linux kernel issue where if
307        you have two fds for the same file (via dup) and you ADD one to an
308        epfd, then close it, then re-create it with the same fd (via dup2 or an
309        unlucky dup), then try to ADD it again, you'll get an EEXIST, since the
310        struct epitem is not actually removed from the struct eventpoll until
311        the file itself is closed.
312 
313   EV_CHANGE_ADD==1
314   EV_CHANGE_DEL==2
315   EV_READ      ==2
316   EV_WRITE     ==4
317   Bit 0: read change is add
318   Bit 1: read change is del
319   Bit 2: write change is add
320   Bit 3: write change is del
321   Bit 4: old events had EV_READ
322   Bit 5: old events had EV_WRITE
323 */
324 
/* Compute the op_table index for the struct event_change pointed to by
 * 'c': the add/del bits of read_change land in bits 0-1, the add/del bits
 * of write_change in bits 2-3, and the EV_READ/EV_WRITE bits of old_events
 * in bits 4-5.  Note that 'c' is evaluated more than once. */
#define INDEX(c) \
	(   (((c)->read_change&(EV_CHANGE_ADD|EV_CHANGE_DEL))) |       \
	    (((c)->write_change&(EV_CHANGE_ADD|EV_CHANGE_DEL)) << 2) | \
	    (((c)->old_events&(EV_READ|EV_WRITE)) << 3) )

/* The shifts in INDEX() and the layout of op_table depend on these exact
 * constant values; refuse to build if any of them ever changes. */
#if EV_READ != 2 || EV_WRITE != 4 || EV_CHANGE_ADD != 1 || EV_CHANGE_DEL != 2
#error "Libevent's internals changed!  Regenerate the op_table in epoll.c"
#endif
333 
/*
 * Lookup table, indexed by INDEX(change), that maps a combination of
 * previously-set events, read change and write change to the epoll_ctl()
 * call that implements it.
 *
 *   events: EPOLLIN/EPOLLOUT mask to pass to epoll_ctl(); 0 means there is
 *           nothing to do.
 *   op:     EPOLL_CTL_ADD, EPOLL_CTL_MOD or EPOLL_CTL_DEL; 0 when nothing
 *           changes; -1 for the impossible add+del combinations ("xxx").
 *
 * The trailing comment on each row names the combination that the row's
 * index encodes.
 */
static const struct operation {
	int events;
	int op;
} op_table[] = {
	{ 0, 0 },                           /* old= 0, write:  0, read:  0 */
	{ EPOLLIN, EPOLL_CTL_ADD },         /* old= 0, write:  0, read:add */
	{ EPOLLIN, EPOLL_CTL_DEL },         /* old= 0, write:  0, read:del */
	{ 0, -1 },                          /* old= 0, write:  0, read:xxx */
	{ EPOLLOUT, EPOLL_CTL_ADD },        /* old= 0, write:add, read:  0 */
	{ EPOLLIN|EPOLLOUT, EPOLL_CTL_ADD },/* old= 0, write:add, read:add */
	{ EPOLLOUT, EPOLL_CTL_ADD },        /* old= 0, write:add, read:del */
	{ 0, -1 },                          /* old= 0, write:add, read:xxx */
	{ EPOLLOUT, EPOLL_CTL_DEL },        /* old= 0, write:del, read:  0 */
	{ EPOLLIN, EPOLL_CTL_ADD },         /* old= 0, write:del, read:add */
	{ EPOLLIN|EPOLLOUT, EPOLL_CTL_DEL },/* old= 0, write:del, read:del */
	{ 0, -1 },                          /* old= 0, write:del, read:xxx */
	{ 0, -1 },                          /* old= 0, write:xxx, read:  0 */
	{ 0, -1 },                          /* old= 0, write:xxx, read:add */
	{ 0, -1 },                          /* old= 0, write:xxx, read:del */
	{ 0, -1 },                          /* old= 0, write:xxx, read:xxx */
	{ 0, 0 },                           /* old= r, write:  0, read:  0 */
	{ EPOLLIN, EPOLL_CTL_MOD },         /* old= r, write:  0, read:add */
	{ EPOLLIN, EPOLL_CTL_DEL },         /* old= r, write:  0, read:del */
	{ 0, -1 },                          /* old= r, write:  0, read:xxx */
	{ EPOLLIN|EPOLLOUT, EPOLL_CTL_MOD },/* old= r, write:add, read:  0 */
	{ EPOLLIN|EPOLLOUT, EPOLL_CTL_MOD },/* old= r, write:add, read:add */
	{ EPOLLOUT, EPOLL_CTL_MOD },        /* old= r, write:add, read:del */
	{ 0, -1 },                          /* old= r, write:add, read:xxx */
	{ EPOLLIN, EPOLL_CTL_MOD },         /* old= r, write:del, read:  0 */
	{ EPOLLIN, EPOLL_CTL_MOD },         /* old= r, write:del, read:add */
	{ EPOLLIN|EPOLLOUT, EPOLL_CTL_DEL },/* old= r, write:del, read:del */
	{ 0, -1 },                          /* old= r, write:del, read:xxx */
	{ 0, -1 },                          /* old= r, write:xxx, read:  0 */
	{ 0, -1 },                          /* old= r, write:xxx, read:add */
	{ 0, -1 },                          /* old= r, write:xxx, read:del */
	{ 0, -1 },                          /* old= r, write:xxx, read:xxx */
	{ 0, 0 },                           /* old= w, write:  0, read:  0 */
	{ EPOLLIN|EPOLLOUT, EPOLL_CTL_MOD },/* old= w, write:  0, read:add */
	{ EPOLLOUT, EPOLL_CTL_MOD },        /* old= w, write:  0, read:del */
	{ 0, -1 },                          /* old= w, write:  0, read:xxx */
	{ EPOLLOUT, EPOLL_CTL_MOD },        /* old= w, write:add, read:  0 */
	{ EPOLLIN|EPOLLOUT, EPOLL_CTL_MOD },/* old= w, write:add, read:add */
	{ EPOLLOUT, EPOLL_CTL_MOD },        /* old= w, write:add, read:del */
	{ 0, -1 },                          /* old= w, write:add, read:xxx */
	{ EPOLLOUT, EPOLL_CTL_DEL },        /* old= w, write:del, read:  0 */
	{ EPOLLIN, EPOLL_CTL_MOD },         /* old= w, write:del, read:add */
	{ EPOLLIN|EPOLLOUT, EPOLL_CTL_DEL },/* old= w, write:del, read:del */
	{ 0, -1 },                          /* old= w, write:del, read:xxx */
	{ 0, -1 },                          /* old= w, write:xxx, read:  0 */
	{ 0, -1 },                          /* old= w, write:xxx, read:add */
	{ 0, -1 },                          /* old= w, write:xxx, read:del */
	{ 0, -1 },                          /* old= w, write:xxx, read:xxx */
	{ 0, 0 },                           /* old=rw, write:  0, read:  0 */
	{ EPOLLIN|EPOLLOUT, EPOLL_CTL_MOD },/* old=rw, write:  0, read:add */
	{ EPOLLOUT, EPOLL_CTL_MOD },        /* old=rw, write:  0, read:del */
	{ 0, -1 },                          /* old=rw, write:  0, read:xxx */
	{ EPOLLIN|EPOLLOUT, EPOLL_CTL_MOD },/* old=rw, write:add, read:  0 */
	{ EPOLLIN|EPOLLOUT, EPOLL_CTL_MOD },/* old=rw, write:add, read:add */
	{ EPOLLOUT, EPOLL_CTL_MOD },        /* old=rw, write:add, read:del */
	{ 0, -1 },                          /* old=rw, write:add, read:xxx */
	{ EPOLLIN, EPOLL_CTL_MOD },         /* old=rw, write:del, read:  0 */
	{ EPOLLIN, EPOLL_CTL_MOD },         /* old=rw, write:del, read:add */
	{ EPOLLIN|EPOLLOUT, EPOLL_CTL_DEL },/* old=rw, write:del, read:del */
	{ 0, -1 },                          /* old=rw, write:del, read:xxx */
	{ 0, -1 },                          /* old=rw, write:xxx, read:  0 */
	{ 0, -1 },                          /* old=rw, write:xxx, read:add */
	{ 0, -1 },                          /* old=rw, write:xxx, read:del */
	{ 0, -1 },                          /* old=rw, write:xxx, read:xxx */
};
403 
404 static int
405 epoll_apply_one_change(struct event_base *base,
406     struct epollop *epollop,
407     const struct event_change *ch)
408 {
409 	struct epoll_event epev;
410 	int op, events = 0;
411 	int idx;
412 
413 	idx = INDEX(ch);
414 	op = op_table[idx].op;
415 	events = op_table[idx].events;
416 
417 	if (!events) {
418 		EVUTIL_ASSERT(op == 0);
419 		return 0;
420 	}
421 
422 	if ((ch->read_change|ch->write_change) & EV_CHANGE_ET)
423 		events |= EPOLLET;
424 
425 	memset(&epev, 0, sizeof(epev));
426 	epev.data.fd = ch->fd;
427 	epev.events = events;
428 	if (epoll_ctl(epollop->epfd, op, ch->fd, &epev) == 0) {
429 		event_debug(("Epoll %s(%d) on fd %d okay. [old events were %d; read change was %d; write change was %d]",
430 			epoll_op_to_string(op),
431 			(int)epev.events,
432 			(int)ch->fd,
433 			ch->old_events,
434 			ch->read_change,
435 			ch->write_change));
436 		return 0;
437 	}
438 
439 	switch (op) {
440 	case EPOLL_CTL_MOD:
441 		if (errno == ENOENT) {
442 			/* If a MOD operation fails with ENOENT, the
443 			 * fd was probably closed and re-opened.  We
444 			 * should retry the operation as an ADD.
445 			 */
446 			if (epoll_ctl(epollop->epfd, EPOLL_CTL_ADD, ch->fd, &epev) == -1) {
447 				event_warn("Epoll MOD(%d) on %d retried as ADD; that failed too",
448 				    (int)epev.events, ch->fd);
449 				return -1;
450 			} else {
451 				event_debug(("Epoll MOD(%d) on %d retried as ADD; succeeded.",
452 					(int)epev.events,
453 					ch->fd));
454 				return 0;
455 			}
456 		}
457 		break;
458 	case EPOLL_CTL_ADD:
459 		if (errno == EEXIST) {
460 			/* If an ADD operation fails with EEXIST,
461 			 * either the operation was redundant (as with a
462 			 * precautionary add), or we ran into a fun
463 			 * kernel bug where using dup*() to duplicate the
464 			 * same file into the same fd gives you the same epitem
465 			 * rather than a fresh one.  For the second case,
466 			 * we must retry with MOD. */
467 			if (epoll_ctl(epollop->epfd, EPOLL_CTL_MOD, ch->fd, &epev) == -1) {
468 				event_warn("Epoll ADD(%d) on %d retried as MOD; that failed too",
469 				    (int)epev.events, ch->fd);
470 				return -1;
471 			} else {
472 				event_debug(("Epoll ADD(%d) on %d retried as MOD; succeeded.",
473 					(int)epev.events,
474 					ch->fd));
475 				return 0;
476 			}
477 		}
478 		break;
479 	case EPOLL_CTL_DEL:
480 		if (errno == ENOENT || errno == EBADF || errno == EPERM) {
481 			/* If a delete fails with one of these errors,
482 			 * that's fine too: we closed the fd before we
483 			 * got around to calling epoll_dispatch. */
484 			event_debug(("Epoll DEL(%d) on fd %d gave %s: DEL was unnecessary.",
485 				(int)epev.events,
486 				ch->fd,
487 				strerror(errno)));
488 			return 0;
489 		}
490 		break;
491 	default:
492 		break;
493 	}
494 
495 	event_warn("Epoll %s(%d) on fd %d failed.  Old events were %d; read change was %d (%s); write change was %d (%s)",
496 	    epoll_op_to_string(op),
497 	    (int)epev.events,
498 	    ch->fd,
499 	    ch->old_events,
500 	    ch->read_change,
501 	    change_to_string(ch->read_change),
502 	    ch->write_change,
503 	    change_to_string(ch->write_change));
504 
505 	return -1;
506 }
507 
508 static int
509 epoll_apply_changes(struct event_base *base)
510 {
511 	struct event_changelist *changelist = &base->changelist;
512 	struct epollop *epollop = base->evbase;
513 	struct event_change *ch;
514 
515 	int r = 0;
516 	int i;
517 
518 	for (i = 0; i < changelist->n_changes; ++i) {
519 		ch = &changelist->changes[i];
520 		if (epoll_apply_one_change(base, epollop, ch) < 0)
521 			r = -1;
522 	}
523 
524 	return (r);
525 }
526 
527 static int
528 epoll_nochangelist_add(struct event_base *base, evutil_socket_t fd,
529     short old, short events, void *p)
530 {
531 	struct event_change ch;
532 	ch.fd = fd;
533 	ch.old_events = old;
534 	ch.read_change = ch.write_change = 0;
535 	if (events & EV_WRITE)
536 		ch.write_change = EV_CHANGE_ADD |
537 		    (events & EV_ET);
538 	if (events & EV_READ)
539 		ch.read_change = EV_CHANGE_ADD |
540 		    (events & EV_ET);
541 
542 	return epoll_apply_one_change(base, base->evbase, &ch);
543 }
544 
545 static int
546 epoll_nochangelist_del(struct event_base *base, evutil_socket_t fd,
547     short old, short events, void *p)
548 {
549 	struct event_change ch;
550 	ch.fd = fd;
551 	ch.old_events = old;
552 	ch.read_change = ch.write_change = 0;
553 	if (events & EV_WRITE)
554 		ch.write_change = EV_CHANGE_DEL;
555 	if (events & EV_READ)
556 		ch.read_change = EV_CHANGE_DEL;
557 
558 	return epoll_apply_one_change(base, base->evbase, &ch);
559 }
560 
561 static int
562 epoll_dispatch(struct event_base *base, struct timeval *tv)
563 {
564 	struct epollop *epollop = base->evbase;
565 	struct epoll_event *events = epollop->events;
566 	int i, res;
567 	long timeout = -1;
568 
569 #ifdef USING_TIMERFD
570 	if (epollop->timerfd >= 0) {
571 		struct itimerspec is;
572 		is.it_interval.tv_sec = 0;
573 		is.it_interval.tv_nsec = 0;
574 		if (tv == NULL) {
575 			/* No timeout; disarm the timer. */
576 			is.it_value.tv_sec = 0;
577 			is.it_value.tv_nsec = 0;
578 		} else {
579 			if (tv->tv_sec == 0 && tv->tv_usec == 0) {
580 				/* we need to exit immediately; timerfd can't
581 				 * do that. */
582 				timeout = 0;
583 			}
584 			is.it_value.tv_sec = tv->tv_sec;
585 			is.it_value.tv_nsec = tv->tv_usec * 1000;
586 		}
587 		/* TODO: we could avoid unnecessary syscalls here by only
588 		   calling timerfd_settime when the top timeout changes, or
589 		   when we're called with a different timeval.
590 		*/
591 		if (timerfd_settime(epollop->timerfd, 0, &is, NULL) < 0) {
592 			event_warn("timerfd_settime");
593 		}
594 	} else
595 #endif
596 	if (tv != NULL) {
597 		timeout = evutil_tv_to_msec_(tv);
598 		if (timeout < 0 || timeout > MAX_EPOLL_TIMEOUT_MSEC) {
599 			/* Linux kernels can wait forever if the timeout is
600 			 * too big; see comment on MAX_EPOLL_TIMEOUT_MSEC. */
601 			timeout = MAX_EPOLL_TIMEOUT_MSEC;
602 		}
603 	}
604 
605 	epoll_apply_changes(base);
606 	event_changelist_remove_all_(&base->changelist, base);
607 
608 	EVBASE_RELEASE_LOCK(base, th_base_lock);
609 
610 	res = epoll_wait(epollop->epfd, events, epollop->nevents, timeout);
611 
612 	EVBASE_ACQUIRE_LOCK(base, th_base_lock);
613 
614 	if (res == -1) {
615 		if (errno != EINTR) {
616 			event_warn("epoll_wait");
617 			return (-1);
618 		}
619 
620 		return (0);
621 	}
622 
623 	event_debug(("%s: epoll_wait reports %d", __func__, res));
624 	EVUTIL_ASSERT(res <= epollop->nevents);
625 
626 	for (i = 0; i < res; i++) {
627 		int what = events[i].events;
628 		short ev = 0;
629 #ifdef USING_TIMERFD
630 		if (events[i].data.fd == epollop->timerfd)
631 			continue;
632 #endif
633 
634 		if (what & (EPOLLHUP|EPOLLERR)) {
635 			ev = EV_READ | EV_WRITE;
636 		} else {
637 			if (what & EPOLLIN)
638 				ev |= EV_READ;
639 			if (what & EPOLLOUT)
640 				ev |= EV_WRITE;
641 		}
642 
643 		if (!ev)
644 			continue;
645 
646 		evmap_io_active_(base, events[i].data.fd, ev | EV_ET);
647 	}
648 
649 	if (res == epollop->nevents && epollop->nevents < MAX_NEVENT) {
650 		/* We used all of the event space this time.  We should
651 		   be ready for more events next time. */
652 		int new_nevents = epollop->nevents * 2;
653 		struct epoll_event *new_events;
654 
655 		new_events = mm_realloc(epollop->events,
656 		    new_nevents * sizeof(struct epoll_event));
657 		if (new_events) {
658 			epollop->events = new_events;
659 			epollop->nevents = new_nevents;
660 		}
661 	}
662 
663 	return (0);
664 }
665 
666 
667 static void
668 epoll_dealloc(struct event_base *base)
669 {
670 	struct epollop *epollop = base->evbase;
671 
672 	evsig_dealloc_(base);
673 	if (epollop->events)
674 		mm_free(epollop->events);
675 	if (epollop->epfd >= 0)
676 		close(epollop->epfd);
677 #ifdef USING_TIMERFD
678 	if (epollop->timerfd >= 0)
679 		close(epollop->timerfd);
680 #endif
681 
682 	memset(epollop, 0, sizeof(struct epollop));
683 	mm_free(epollop);
684 }
685 
686 #endif /* EVENT__HAVE_EPOLL */
687