/* Copyright libuv contributors. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "uv.h"
#include "internal.h"
#include <errno.h>
#include <sys/epoll.h>

int uv__epoll_init(uv_loop_t* loop) {
  int fd;
  fd = epoll_create1(O_CLOEXEC);

  /* epoll_create1() can fail either because it's not implemented (old kernel)
   * or because it doesn't understand the O_CLOEXEC flag.
   */
  if (fd == -1 && (errno == ENOSYS || errno == EINVAL)) {
    fd = epoll_create(256);

    if (fd != -1)
      uv__cloexec(fd, 1);
  }

  loop->backend_fd = fd;
  if (fd == -1)
    return UV__ERR(errno);

  return 0;
}


void uv__platform_invalidate_fd(uv_loop_t* loop, int fd) {
  struct epoll_event* events;
  struct epoll_event dummy;
  uintptr_t i;
  uintptr_t nfds;

  assert(loop->watchers != NULL);
  assert(fd >= 0);

  events = (struct epoll_event*) loop->watchers[loop->nwatchers];
  nfds = (uintptr_t) loop->watchers[loop->nwatchers + 1];
  if (events != NULL)
    /* Invalidate events with same file descriptor */
    for (i = 0; i < nfds; i++)
      if (events[i].data.fd == fd)
        events[i].data.fd = -1;

  /* Remove the file descriptor from the epoll.
   * This avoids a problem where the same file description remains open
   * in another process, causing repeated junk epoll events.
   *
   * We pass in a dummy epoll_event, to work around a bug in old kernels.
   */
  if (loop->backend_fd >= 0) {
    /* Work around a bug in kernels 3.10 to 3.19 where passing a struct that
     * has the EPOLLWAKEUP flag set generates spurious audit syslog warnings.
     */
    memset(&dummy, 0, sizeof(dummy));
    epoll_ctl(loop->backend_fd, EPOLL_CTL_DEL, fd, &dummy);
  }
}


int uv__io_check_fd(uv_loop_t* loop, int fd) {
  struct epoll_event e;
  int rc;

  memset(&e, 0, sizeof(e));
  e.events = POLLIN;
  e.data.fd = -1;

  rc = 0;
  if (epoll_ctl(loop->backend_fd, EPOLL_CTL_ADD, fd, &e))
    if (errno != EEXIST)
      rc = UV__ERR(errno);

  if (rc == 0)
    if (epoll_ctl(loop->backend_fd, EPOLL_CTL_DEL, fd, &e))
      abort();

  return rc;
}

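
/* Usage sketch (illustrative, not called from this file): callers such as the
 * uv_poll_init() path probe a descriptor with uv__io_check_fd() before
 * committing to watch it, e.g.
 *
 *   err = uv__io_check_fd(loop, fd);
 *   if (err)
 *     return err;
 *
 * epoll_ctl() rejects descriptors it cannot poll (regular files and
 * directories fail with EPERM), so the ADD/DEL round trip above is a cheap
 * way to detect them without leaving the fd registered.
 */
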

void uv__io_poll(uv_loop_t* loop, int timeout) {
  /* A bug in kernels < 2.6.37 makes timeouts larger than ~30 minutes
   * effectively infinite on 32-bit architectures. To avoid blocking
   * indefinitely, we cap the timeout and poll again if necessary.
   *
   * Note that "30 minutes" is a simplification because it depends on
   * the value of CONFIG_HZ. The magic constant assumes CONFIG_HZ=1200,
   * that being the largest value I have seen in the wild (and only once.)
   */
  static const int max_safe_timeout = 1789569;
  static int no_epoll_pwait_cached;
  static int no_epoll_wait_cached;
  int no_epoll_pwait;
  int no_epoll_wait;
  struct epoll_event events[1024];
  struct epoll_event* pe;
  struct epoll_event e;
  int real_timeout;
  QUEUE* q;
  uv__io_t* w;
  sigset_t sigset;
  uint64_t sigmask;
  uint64_t base;
  int have_signals;
  int nevents;
  int count;
  int nfds;
  int fd;
  int op;
  int i;
  int user_timeout;
  int reset_timeout;

  if (loop->nfds == 0) {
    assert(QUEUE_EMPTY(&loop->watcher_queue));
    return;
  }

  memset(&e, 0, sizeof(e));

  while (!QUEUE_EMPTY(&loop->watcher_queue)) {
    q = QUEUE_HEAD(&loop->watcher_queue);
    QUEUE_REMOVE(q);
    QUEUE_INIT(q);

    w = QUEUE_DATA(q, uv__io_t, watcher_queue);
    assert(w->pevents != 0);
    assert(w->fd >= 0);
    assert(w->fd < (int) loop->nwatchers);

    e.events = w->pevents;
    e.data.fd = w->fd;

    if (w->events == 0)
      op = EPOLL_CTL_ADD;
    else
      op = EPOLL_CTL_MOD;

    /* XXX Future optimization: do EPOLL_CTL_MOD lazily if we stop watching
     * events, skip the syscall and squelch the events after epoll_wait().
     */
    if (epoll_ctl(loop->backend_fd, op, w->fd, &e)) {
      if (errno != EEXIST)
        abort();

      assert(op == EPOLL_CTL_ADD);

      /* We've reactivated a file descriptor that's been watched before. */
      if (epoll_ctl(loop->backend_fd, EPOLL_CTL_MOD, w->fd, &e))
        abort();
    }

    w->events = w->pevents;
  }

  sigmask = 0;
  if (loop->flags & UV_LOOP_BLOCK_SIGPROF) {
    sigemptyset(&sigset);
    sigaddset(&sigset, SIGPROF);
    sigmask |= 1 << (SIGPROF - 1);
  }

  assert(timeout >= -1);
  base = loop->time;
  count = 48; /* Benchmarks suggest this gives the best throughput. */
  real_timeout = timeout;

  if (uv__get_internal_fields(loop)->flags & UV_METRICS_IDLE_TIME) {
    reset_timeout = 1;
    user_timeout = timeout;
    timeout = 0;
  } else {
    reset_timeout = 0;
    user_timeout = 0;
  }

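  /* Background (illustrative): UV_METRICS_IDLE_TIME is opted into by the
   * embedder, e.g.
   *
   *   uv_loop_configure(loop, UV_METRICS_IDLE_TIME);
   *   ...
   *   idle_ns = uv_metrics_idle_time(loop);
   *
   * When it is set we poll once with a zero timeout first, so events that are
   * already pending are dispatched without being counted as idle time;
   * user_timeout/reset_timeout restore the real timeout afterwards.
   */
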
  /* You could argue there is a dependency between these two but
   * ultimately we don't care about their ordering with respect
   * to one another. Worst case, we make a few system calls that
   * could have been avoided because another thread already knows
   * they fail with ENOSYS. Hardly the end of the world.
   */
  no_epoll_pwait = uv__load_relaxed(&no_epoll_pwait_cached);
  no_epoll_wait = uv__load_relaxed(&no_epoll_wait_cached);

  for (;;) {
    /* Only need to set the provider_entry_time if timeout != 0. The function
     * will return early if the loop isn't configured with UV_METRICS_IDLE_TIME.
     */
    if (timeout != 0)
      uv__metrics_set_provider_entry_time(loop);

    /* See the comment for max_safe_timeout for an explanation of why
     * this is necessary. Executive summary: kernel bug workaround.
     */
    if (sizeof(int32_t) == sizeof(long) && timeout >= max_safe_timeout)
      timeout = max_safe_timeout;

    if (sigmask != 0 && no_epoll_pwait != 0)
      if (pthread_sigmask(SIG_BLOCK, &sigset, NULL))
        abort();

    if (no_epoll_wait != 0 || (sigmask != 0 && no_epoll_pwait == 0)) {
      nfds = epoll_pwait(loop->backend_fd,
                         events,
                         ARRAY_SIZE(events),
                         timeout,
                         &sigset);
      if (nfds == -1 && errno == ENOSYS) {
        uv__store_relaxed(&no_epoll_pwait_cached, 1);
        no_epoll_pwait = 1;
      }
    } else {
      nfds = epoll_wait(loop->backend_fd,
                        events,
                        ARRAY_SIZE(events),
                        timeout);
      if (nfds == -1 && errno == ENOSYS) {
        uv__store_relaxed(&no_epoll_wait_cached, 1);
        no_epoll_wait = 1;
      }
    }

    if (sigmask != 0 && no_epoll_pwait != 0)
      if (pthread_sigmask(SIG_UNBLOCK, &sigset, NULL))
        abort();

    /* Update loop->time unconditionally. It's tempting to skip the update when
     * timeout == 0 (i.e. non-blocking poll) but there is no guarantee that the
     * operating system didn't reschedule our process while in the syscall.
     */
    SAVE_ERRNO(uv__update_time(loop));

    if (nfds == 0) {
      assert(timeout != -1);

      if (reset_timeout != 0) {
        timeout = user_timeout;
        reset_timeout = 0;
      }

      if (timeout == -1)
        continue;

      if (timeout == 0)
        return;

      /* We may have been inside the system call for longer than |timeout|
       * milliseconds so we need to update the timestamp to avoid drift.
       */
      goto update_timeout;
    }

    if (nfds == -1) {
      if (errno == ENOSYS) {
        /* epoll_wait() or epoll_pwait() failed, try the other system call. */
        assert(no_epoll_wait == 0 || no_epoll_pwait == 0);
        continue;
      }

      if (errno != EINTR)
        abort();

      if (reset_timeout != 0) {
        timeout = user_timeout;
        reset_timeout = 0;
      }

      if (timeout == -1)
        continue;

      if (timeout == 0)
        return;

      /* Interrupted by a signal. Update timeout and poll again. */
      goto update_timeout;
    }

    have_signals = 0;
    nevents = 0;

    {
      /* Squelch a -Waddress-of-packed-member warning with gcc >= 9. */
      union {
        struct epoll_event* events;
        uv__io_t* watchers;
      } x;

      x.events = events;
      assert(loop->watchers != NULL);
      loop->watchers[loop->nwatchers] = x.watchers;
      loop->watchers[loop->nwatchers + 1] = (void*) (uintptr_t) nfds;
    }

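    /* Note: the two extra watchers slots written above are the same ones that
     * uv__platform_invalidate_fd() reads back (see earlier in this file), so
     * when a callback closes a handle mid-iteration, the pending events for
     * that fd are marked with data.fd == -1 and skipped below.
     */
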
    for (i = 0; i < nfds; i++) {
      pe = events + i;
      fd = pe->data.fd;

      /* Skip invalidated events, see uv__platform_invalidate_fd */
      if (fd == -1)
        continue;

      assert(fd >= 0);
      assert((unsigned) fd < loop->nwatchers);

      w = loop->watchers[fd];

      if (w == NULL) {
        /* File descriptor that we've stopped watching, disarm it.
         *
         * Ignore all errors because we may be racing with another thread
         * when the file descriptor is closed.
         */
        epoll_ctl(loop->backend_fd, EPOLL_CTL_DEL, fd, pe);
        continue;
      }

      /* Give users only events they're interested in. Prevents spurious
       * callbacks when a previous callback invocation in this loop has
       * stopped the current watcher. Also filters out events that the user
       * has not requested us to watch.
       */
      pe->events &= w->pevents | POLLERR | POLLHUP;

      /* Work around an epoll quirk where it sometimes reports just the
       * EPOLLERR or EPOLLHUP event. In order to force the event loop to
       * move forward, we merge in the read/write events that the watcher
       * is interested in; uv__read() and uv__write() will then deal with
       * the error or hangup in the usual fashion.
       *
       * Note to self: happens when epoll reports EPOLLIN|EPOLLHUP, the user
       * reads the available data, calls uv_read_stop(), then sometime later
       * calls uv_read_start() again. By then, libuv has forgotten about the
       * hangup and the kernel won't report EPOLLIN again because there's
       * nothing left to read. If anything, libuv is to blame here. The
       * current hack is just a quick bandaid; to properly fix it, libuv
       * needs to remember the error/hangup event. We should get that for
       * free when we switch over to edge-triggered I/O.
       */
      if (pe->events == POLLERR || pe->events == POLLHUP)
        pe->events |=
          w->pevents & (POLLIN | POLLOUT | UV__POLLRDHUP | UV__POLLPRI);

      if (pe->events != 0) {
        /* Run signal watchers last. This also affects child process watchers
         * because those are implemented in terms of signal watchers.
         */
        if (w == &loop->signal_io_watcher) {
          have_signals = 1;
        } else {
          uv__metrics_update_idle_time(loop);
          w->cb(loop, w, pe->events);
        }

        nevents++;
      }
    }

    if (reset_timeout != 0) {
      timeout = user_timeout;
      reset_timeout = 0;
    }

    if (have_signals != 0) {
      uv__metrics_update_idle_time(loop);
      loop->signal_io_watcher.cb(loop, &loop->signal_io_watcher, POLLIN);
    }

    loop->watchers[loop->nwatchers] = NULL;
    loop->watchers[loop->nwatchers + 1] = NULL;

    if (have_signals != 0)
      return;  /* Event loop should cycle now so don't poll again. */

    if (nevents != 0) {
      if (nfds == ARRAY_SIZE(events) && --count != 0) {
        /* Poll for more events but don't block this time. */
        timeout = 0;
        continue;
      }
      return;
    }

    if (timeout == 0)
      return;

    if (timeout == -1)
      continue;

update_timeout:
    assert(timeout > 0);

    real_timeout -= (loop->time - base);
    if (real_timeout <= 0)
      return;

    timeout = real_timeout;
  }
}
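
/* For orientation (sketch, not compiled into libuv): uv_run() in
 * src/unix/core.c drives this poller roughly as
 *
 *   timeout = uv_backend_timeout(loop);   (-1, 0, or ms until the next timer)
 *   uv__io_poll(loop, timeout);
 *
 * which is why timeout == -1 means "block until an fd is ready", timeout == 0
 * means "poll once and return", and both values bypass the update_timeout
 * bookkeeping above.
 */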