1 /* $NetBSD: epoll.c,v 1.1.1.1 2013/04/11 16:43:19 christos Exp $ */
2 /*
3 * Copyright 2000-2007 Niels Provos <provos@citi.umich.edu>
4 * Copyright 2007-2012 Niels Provos, Nick Mathewson
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * 3. The name of the author may not be used to endorse or promote products
15 * derived from this software without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28 #include "event2/event-config.h"
29 #include <sys/cdefs.h>
30 __RCSID("$NetBSD: epoll.c,v 1.1.1.1 2013/04/11 16:43:19 christos Exp $");
31
32 #include <stdint.h>
33 #include <sys/types.h>
34 #include <sys/resource.h>
35 #ifdef _EVENT_HAVE_SYS_TIME_H
36 #include <sys/time.h>
37 #endif
38 #include <sys/queue.h>
39 #include <sys/epoll.h>
40 #include <signal.h>
41 #include <limits.h>
42 #include <stdio.h>
43 #include <stdlib.h>
44 #include <string.h>
45 #include <unistd.h>
46 #include <errno.h>
47 #ifdef _EVENT_HAVE_FCNTL_H
48 #include <fcntl.h>
49 #endif
50
51 #include "event-internal.h"
52 #include "evsignal-internal.h"
53 #include "event2/thread.h"
54 #include "evthread-internal.h"
55 #include "log-internal.h"
56 #include "evmap-internal.h"
57 #include "changelist-internal.h"
58
/* Per-base private state for the epoll backend. */
struct epollop {
	/* Buffer handed to epoll_wait() to receive ready events. */
	struct epoll_event *events;
	/* Number of entries currently allocated in events. */
	int nevents;
	/* Descriptor obtained from epoll_create(). */
	int epfd;
};
64
/* Backend entry points; the definitions appear later in this file. */
static void *epoll_init(struct event_base *base);
static int epoll_dispatch(struct event_base *base, struct timeval *tv);
static void epoll_dealloc(struct event_base *base);
68
69 static const struct eventop epollops_changelist = {
70 "epoll (with changelist)",
71 epoll_init,
72 event_changelist_add,
73 event_changelist_del,
74 epoll_dispatch,
75 epoll_dealloc,
76 1, /* need reinit */
77 EV_FEATURE_ET|EV_FEATURE_O1,
78 EVENT_CHANGELIST_FDINFO_SIZE
79 };
80
81
82 static int epoll_nochangelist_add(struct event_base *base, evutil_socket_t fd,
83 short old, short events, void *p);
84 static int epoll_nochangelist_del(struct event_base *base, evutil_socket_t fd,
85 short old, short events, void *p);
86
87 const struct eventop epollops = {
88 "epoll",
89 epoll_init,
90 epoll_nochangelist_add,
91 epoll_nochangelist_del,
92 epoll_dispatch,
93 epoll_dealloc,
94 1, /* need reinit */
95 EV_FEATURE_ET|EV_FEATURE_O1,
96 0
97 };
98
/* Size of the epoll_wait() result buffer allocated by epoll_init(). */
#define INITIAL_NEVENT 32
/* epoll_dispatch() stops doubling the result buffer once it reaches this. */
#define MAX_NEVENT 4096

/*
 * Upper bound on the timeout handed to epoll_wait().
 *
 * On Linux kernels at least up to 2.6.24.4, epoll can't handle timeout
 * values bigger than (LONG_MAX - 999ULL)/HZ.  HZ in the wild can be as
 * big as 1000, and LONG_MAX can be as small as (1<<31)-1, so the largest
 * number of msec we can support here is 2147482.  Let's round that down
 * by 47 seconds.
 */
#define MAX_EPOLL_TIMEOUT_MSEC (35*60*1000)
109
110 static void *
epoll_init(struct event_base * base)111 epoll_init(struct event_base *base)
112 {
113 int epfd;
114 struct epollop *epollop;
115
116 /* Initialize the kernel queue. (The size field is ignored since
117 * 2.6.8.) */
118 if ((epfd = epoll_create(32000)) == -1) {
119 if (errno != ENOSYS)
120 event_warn("epoll_create");
121 return (NULL);
122 }
123
124 evutil_make_socket_closeonexec(epfd);
125
126 if (!(epollop = mm_calloc(1, sizeof(struct epollop)))) {
127 close(epfd);
128 return (NULL);
129 }
130
131 epollop->epfd = epfd;
132
133 /* Initialize fields */
134 epollop->events = mm_calloc(INITIAL_NEVENT, sizeof(struct epoll_event));
135 if (epollop->events == NULL) {
136 mm_free(epollop);
137 close(epfd);
138 return (NULL);
139 }
140 epollop->nevents = INITIAL_NEVENT;
141
142 if ((base->flags & EVENT_BASE_FLAG_EPOLL_USE_CHANGELIST) != 0 ||
143 ((base->flags & EVENT_BASE_FLAG_IGNORE_ENV) == 0 &&
144 evutil_getenv("EVENT_EPOLL_USE_CHANGELIST") != NULL))
145 base->evsel = &epollops_changelist;
146
147 evsig_init(base);
148
149 return (epollop);
150 }
151
152 static const char *
change_to_string(int change)153 change_to_string(int change)
154 {
155 change &= (EV_CHANGE_ADD|EV_CHANGE_DEL);
156 if (change == EV_CHANGE_ADD) {
157 return "add";
158 } else if (change == EV_CHANGE_DEL) {
159 return "del";
160 } else if (change == 0) {
161 return "none";
162 } else {
163 return "???";
164 }
165 }
166
/*
 * Map an epoll_ctl() operation code to a short name for log messages.
 * Unrecognized codes yield "???".
 */
static const char *
epoll_op_to_string(int op)
{
	switch (op) {
	case EPOLL_CTL_ADD:
		return "ADD";
	case EPOLL_CTL_DEL:
		return "DEL";
	case EPOLL_CTL_MOD:
		return "MOD";
	default:
		return "???";
	}
}
175
176 static int
epoll_apply_one_change(struct event_base * base,struct epollop * epollop,const struct event_change * ch)177 epoll_apply_one_change(struct event_base *base,
178 struct epollop *epollop,
179 const struct event_change *ch)
180 {
181 struct epoll_event epev;
182 int op, events = 0;
183
184 if (1) {
185 /* The logic here is a little tricky. If we had no events set
186 on the fd before, we need to set op="ADD" and set
187 events=the events we want to add. If we had any events set
188 on the fd before, and we want any events to remain on the
189 fd, we need to say op="MOD" and set events=the events we
190 want to remain. But if we want to delete the last event,
191 we say op="DEL" and set events=the remaining events. What
192 fun!
193 */
194
195 /* TODO: Turn this into a switch or a table lookup. */
196
197 if ((ch->read_change & EV_CHANGE_ADD) ||
198 (ch->write_change & EV_CHANGE_ADD)) {
199 /* If we are adding anything at all, we'll want to do
200 * either an ADD or a MOD. */
201 events = 0;
202 op = EPOLL_CTL_ADD;
203 if (ch->read_change & EV_CHANGE_ADD) {
204 events |= EPOLLIN;
205 } else if (ch->read_change & EV_CHANGE_DEL) {
206 ;
207 } else if (ch->old_events & EV_READ) {
208 events |= EPOLLIN;
209 }
210 if (ch->write_change & EV_CHANGE_ADD) {
211 events |= EPOLLOUT;
212 } else if (ch->write_change & EV_CHANGE_DEL) {
213 ;
214 } else if (ch->old_events & EV_WRITE) {
215 events |= EPOLLOUT;
216 }
217 if ((ch->read_change|ch->write_change) & EV_ET)
218 events |= EPOLLET;
219
220 if (ch->old_events) {
221 /* If MOD fails, we retry as an ADD, and if
222 * ADD fails we will retry as a MOD. So the
223 * only hard part here is to guess which one
224 * will work. As a heuristic, we'll try
225 * MOD first if we think there were old
226 * events and ADD if we think there were none.
227 *
228 * We can be wrong about the MOD if the file
229 * has in fact been closed and re-opened.
230 *
231 * We can be wrong about the ADD if the
232 * the fd has been re-created with a dup()
233 * of the same file that it was before.
234 */
235 op = EPOLL_CTL_MOD;
236 }
237 } else if ((ch->read_change & EV_CHANGE_DEL) ||
238 (ch->write_change & EV_CHANGE_DEL)) {
239 /* If we're deleting anything, we'll want to do a MOD
240 * or a DEL. */
241 op = EPOLL_CTL_DEL;
242
243 if (ch->read_change & EV_CHANGE_DEL) {
244 if (ch->write_change & EV_CHANGE_DEL) {
245 events = EPOLLIN|EPOLLOUT;
246 } else if (ch->old_events & EV_WRITE) {
247 events = EPOLLOUT;
248 op = EPOLL_CTL_MOD;
249 } else {
250 events = EPOLLIN;
251 }
252 } else if (ch->write_change & EV_CHANGE_DEL) {
253 if (ch->old_events & EV_READ) {
254 events = EPOLLIN;
255 op = EPOLL_CTL_MOD;
256 } else {
257 events = EPOLLOUT;
258 }
259 }
260 }
261
262 if (!events)
263 return 0;
264
265 memset(&epev, 0, sizeof(epev));
266 epev.data.fd = ch->fd;
267 epev.events = events;
268 if (epoll_ctl(epollop->epfd, op, ch->fd, &epev) == -1) {
269 if (op == EPOLL_CTL_MOD && errno == ENOENT) {
270 /* If a MOD operation fails with ENOENT, the
271 * fd was probably closed and re-opened. We
272 * should retry the operation as an ADD.
273 */
274 if (epoll_ctl(epollop->epfd, EPOLL_CTL_ADD, ch->fd, &epev) == -1) {
275 event_warn("Epoll MOD(%d) on %d retried as ADD; that failed too",
276 (int)epev.events, ch->fd);
277 return -1;
278 } else {
279 event_debug(("Epoll MOD(%d) on %d retried as ADD; succeeded.",
280 (int)epev.events,
281 ch->fd));
282 }
283 } else if (op == EPOLL_CTL_ADD && errno == EEXIST) {
284 /* If an ADD operation fails with EEXIST,
285 * either the operation was redundant (as with a
286 * precautionary add), or we ran into a fun
287 * kernel bug where using dup*() to duplicate the
288 * same file into the same fd gives you the same epitem
289 * rather than a fresh one. For the second case,
290 * we must retry with MOD. */
291 if (epoll_ctl(epollop->epfd, EPOLL_CTL_MOD, ch->fd, &epev) == -1) {
292 event_warn("Epoll ADD(%d) on %d retried as MOD; that failed too",
293 (int)epev.events, ch->fd);
294 return -1;
295 } else {
296 event_debug(("Epoll ADD(%d) on %d retried as MOD; succeeded.",
297 (int)epev.events,
298 ch->fd));
299 }
300 } else if (op == EPOLL_CTL_DEL &&
301 (errno == ENOENT || errno == EBADF ||
302 errno == EPERM)) {
303 /* If a delete fails with one of these errors,
304 * that's fine too: we closed the fd before we
305 * got around to calling epoll_dispatch. */
306 event_debug(("Epoll DEL(%d) on fd %d gave %s: DEL was unnecessary.",
307 (int)epev.events,
308 ch->fd,
309 strerror(errno)));
310 } else {
311 event_warn("Epoll %s(%d) on fd %d failed. Old events were %d; read change was %d (%s); write change was %d (%s)",
312 epoll_op_to_string(op),
313 (int)epev.events,
314 ch->fd,
315 ch->old_events,
316 ch->read_change,
317 change_to_string(ch->read_change),
318 ch->write_change,
319 change_to_string(ch->write_change));
320 return -1;
321 }
322 } else {
323 event_debug(("Epoll %s(%d) on fd %d okay. [old events were %d; read change was %d; write change was %d]",
324 epoll_op_to_string(op),
325 (int)epev.events,
326 (int)ch->fd,
327 ch->old_events,
328 ch->read_change,
329 ch->write_change));
330 }
331 }
332 return 0;
333 }
334
335 static int
epoll_apply_changes(struct event_base * base)336 epoll_apply_changes(struct event_base *base)
337 {
338 struct event_changelist *changelist = &base->changelist;
339 struct epollop *epollop = base->evbase;
340 struct event_change *ch;
341
342 int r = 0;
343 int i;
344
345 for (i = 0; i < changelist->n_changes; ++i) {
346 ch = &changelist->changes[i];
347 if (epoll_apply_one_change(base, epollop, ch) < 0)
348 r = -1;
349 }
350
351 return (r);
352 }
353
354 static int
epoll_nochangelist_add(struct event_base * base,evutil_socket_t fd,short old,short events,void * p)355 epoll_nochangelist_add(struct event_base *base, evutil_socket_t fd,
356 short old, short events, void *p)
357 {
358 struct event_change ch;
359 ch.fd = fd;
360 ch.old_events = old;
361 ch.read_change = ch.write_change = 0;
362 if (events & EV_WRITE)
363 ch.write_change = EV_CHANGE_ADD |
364 (events & EV_ET);
365 if (events & EV_READ)
366 ch.read_change = EV_CHANGE_ADD |
367 (events & EV_ET);
368
369 return epoll_apply_one_change(base, base->evbase, &ch);
370 }
371
372 static int
epoll_nochangelist_del(struct event_base * base,evutil_socket_t fd,short old,short events,void * p)373 epoll_nochangelist_del(struct event_base *base, evutil_socket_t fd,
374 short old, short events, void *p)
375 {
376 struct event_change ch;
377 ch.fd = fd;
378 ch.old_events = old;
379 ch.read_change = ch.write_change = 0;
380 if (events & EV_WRITE)
381 ch.write_change = EV_CHANGE_DEL;
382 if (events & EV_READ)
383 ch.read_change = EV_CHANGE_DEL;
384
385 return epoll_apply_one_change(base, base->evbase, &ch);
386 }
387
388 static int
epoll_dispatch(struct event_base * base,struct timeval * tv)389 epoll_dispatch(struct event_base *base, struct timeval *tv)
390 {
391 struct epollop *epollop = base->evbase;
392 struct epoll_event *events = epollop->events;
393 int i, res;
394 long timeout = -1;
395
396 if (tv != NULL) {
397 timeout = evutil_tv_to_msec(tv);
398 if (timeout < 0 || timeout > MAX_EPOLL_TIMEOUT_MSEC) {
399 /* Linux kernels can wait forever if the timeout is
400 * too big; see comment on MAX_EPOLL_TIMEOUT_MSEC. */
401 timeout = MAX_EPOLL_TIMEOUT_MSEC;
402 }
403 }
404
405 epoll_apply_changes(base);
406 event_changelist_remove_all(&base->changelist, base);
407
408 EVBASE_RELEASE_LOCK(base, th_base_lock);
409
410 res = epoll_wait(epollop->epfd, events, epollop->nevents, timeout);
411
412 EVBASE_ACQUIRE_LOCK(base, th_base_lock);
413
414 if (res == -1) {
415 if (errno != EINTR) {
416 event_warn("epoll_wait");
417 return (-1);
418 }
419
420 return (0);
421 }
422
423 event_debug(("%s: epoll_wait reports %d", __func__, res));
424 EVUTIL_ASSERT(res <= epollop->nevents);
425
426 for (i = 0; i < res; i++) {
427 int what = events[i].events;
428 short ev = 0;
429
430 if (what & (EPOLLHUP|EPOLLERR)) {
431 ev = EV_READ | EV_WRITE;
432 } else {
433 if (what & EPOLLIN)
434 ev |= EV_READ;
435 if (what & EPOLLOUT)
436 ev |= EV_WRITE;
437 }
438
439 if (!ev)
440 continue;
441
442 evmap_io_active(base, events[i].data.fd, ev | EV_ET);
443 }
444
445 if (res == epollop->nevents && epollop->nevents < MAX_NEVENT) {
446 /* We used all of the event space this time. We should
447 be ready for more events next time. */
448 int new_nevents = epollop->nevents * 2;
449 struct epoll_event *new_events;
450
451 new_events = mm_realloc(epollop->events,
452 new_nevents * sizeof(struct epoll_event));
453 if (new_events) {
454 epollop->events = new_events;
455 epollop->nevents = new_nevents;
456 }
457 }
458
459 return (0);
460 }
461
462
463 static void
epoll_dealloc(struct event_base * base)464 epoll_dealloc(struct event_base *base)
465 {
466 struct epollop *epollop = base->evbase;
467
468 evsig_dealloc(base);
469 if (epollop->events)
470 mm_free(epollop->events);
471 if (epollop->epfd >= 0)
472 close(epollop->epfd);
473
474 memset(epollop, 0, sizeof(struct epollop));
475 mm_free(epollop);
476 }
477