/* Socket event dispatching library - by D.C. van Moolenbroek */

#include <minix/drivers.h>
#include <minix/sockdriver.h>
#include <minix/sockevent.h>
#include <sys/ioctl.h>

#include "sockevent_proc.h"

#define US		1000000UL	/* microseconds per second */

#define SOCKHASH_SLOTS	256	/* # slots in ID-to-sock hash table */

static SLIST_HEAD(, sock) sockhash[SOCKHASH_SLOTS];

static SLIST_HEAD(, sock) socktimer;

static minix_timer_t sockevent_timer;

static SIMPLEQ_HEAD(, sock) sockevent_pending;

static sockevent_socket_cb_t sockevent_socket_cb = NULL;

static int sockevent_working;

static void socktimer_del(struct sock * sock);
static void sockevent_cancel_send(struct sock * sock,
	struct sockevent_proc * spr, int err);
static void sockevent_cancel_recv(struct sock * sock,
	struct sockevent_proc * spr, int err);

/*
 * Initialize the hash table of sock objects.
 */
static void
sockhash_init(void)
{
	unsigned int slot;

	for (slot = 0; slot < __arraycount(sockhash); slot++)
		SLIST_INIT(&sockhash[slot]);
}

/*
 * Given a socket identifier, return a hash table slot number.
 */
static unsigned int
sockhash_slot(sockid_t id)
{

	/*
	 * The idea of the shift is that a socket driver may offer multiple
	 * classes of sockets, and put the class in the higher bits.  The
	 * shift aims to prevent that all classes' first sockets end up in
	 * the same hash slot.
	 */
	return (id + (id >> 16)) % SOCKHASH_SLOTS;
}
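/*
 * Illustrative arithmetic, not part of the library: suppose a socket driver
 * encodes a socket class in the upper 16 bits of the identifier, so that the
 * first socket of classes 0, 1 and 2 gets IDs 0x00000, 0x10000 and 0x20000.
 * A plain (id % SOCKHASH_SLOTS) would map all three to slot 0.  With the
 * shift added, the slots become (0x00000 + 0) % 256 = 0,
 * (0x10000 + 1) % 256 = 1, and (0x20000 + 2) % 256 = 2, spreading the
 * classes' first sockets across different slots.
 */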
/*
 * Obtain a sock object from the hash table using its unique identifier.
 * Return a pointer to the object if found, or NULL otherwise.
 */
static struct sock *
sockhash_get(sockid_t id)
{
	struct sock *sock;
	unsigned int slot;

	slot = sockhash_slot(id);

	SLIST_FOREACH(sock, &sockhash[slot], sock_hash) {
		if (sock->sock_id == id)
			return sock;
	}

	return NULL;
}

/*
 * Add a sock object to the hash table.  The sock object must have a valid ID
 * in its 'sock_id' field, and must not be in the hash table already.
 */
static void
sockhash_add(struct sock * sock)
{
	unsigned int slot;

	slot = sockhash_slot(sock->sock_id);

	SLIST_INSERT_HEAD(&sockhash[slot], sock, sock_hash);
}

/*
 * Remove a sock object from the hash table.  The sock object must be in the
 * hash table.
 */
static void
sockhash_del(struct sock * sock)
{
	unsigned int slot;

	slot = sockhash_slot(sock->sock_id);

	/* This macro is O(n). */
	SLIST_REMOVE(&sockhash[slot], sock, sock, sock_hash);
}

/*
 * Reset a socket object to a proper initial state, with a particular socket
 * identifier, a SOCK_ type, and a socket operations table.  The socket is
 * added to the ID-to-object hash table.  This function always succeeds.
 */
static void
sockevent_reset(struct sock * sock, sockid_t id, int domain, int type,
	const struct sockevent_ops * ops)
{

	assert(sock != NULL);

	memset(sock, 0, sizeof(*sock));

	sock->sock_id = id;
	sock->sock_domain = domain;
	sock->sock_type = type;

	sock->sock_slowat = 1;
	sock->sock_rlowat = 1;

	sock->sock_ops = ops;
	sock->sock_proc = NULL;
	sock->sock_select.ss_endpt = NONE;

	sockhash_add(sock);
}

/*
 * Initialize a new socket that will serve as an accepted socket on the given
 * listening socket 'sock'.  The new socket is given as 'newsock', and its new
 * socket identifier is given as 'newid'.  This function always succeeds.
 */
void
sockevent_clone(struct sock * sock, struct sock * newsock, sockid_t newid)
{

	sockevent_reset(newsock, newid, (int)sock->sock_domain,
	    sock->sock_type, sock->sock_ops);

	/* These are the settings that are currently inherited. */
	newsock->sock_opt = sock->sock_opt & ~SO_ACCEPTCONN;
	newsock->sock_linger = sock->sock_linger;
	newsock->sock_stimeo = sock->sock_stimeo;
	newsock->sock_rtimeo = sock->sock_rtimeo;
	newsock->sock_slowat = sock->sock_slowat;
	newsock->sock_rlowat = sock->sock_rlowat;

	newsock->sock_flags |= SFL_CLONED;
}
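/*
 * A minimal usage sketch, with hypothetical driver types and helpers: a
 * driver that allocates connection sockets before they are accepted (for
 * example, upon arrival of an incoming connection) may clone them off the
 * listening socket right away.  The SFL_CLONED flag set above then tells
 * sockevent_accepted() later that the socket is already in the hash table.
 */
#if 0
static void
example_incoming_connection(struct example_conn * conn,
	struct sock * listener)
{

	/* 'conn' embeds a sock object; example_next_id() is hypothetical. */
	sockevent_clone(listener, &conn->conn_sock, example_next_id());
}
#endif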
/*
 * A new socket has just been accepted.  The corresponding listening socket is
 * given as 'sock'.  The new socket has ID 'newid', and if it had not already
 * been added to the hash table through sockevent_clone() before, 'newsock' is
 * a non-NULL pointer which identifies the socket object to clone into.
 */
static void
sockevent_accepted(struct sock * sock, struct sock * newsock, sockid_t newid)
{

	if (newsock == NULL) {
		if ((newsock = sockhash_get(newid)) == NULL)
			panic("libsockdriver: socket driver returned unknown "
			    "ID %d from accept callback", newid);
	} else
		sockevent_clone(sock, newsock, newid);

	assert(newsock->sock_flags & SFL_CLONED);
	newsock->sock_flags &= ~SFL_CLONED;
}

/*
 * Allocate a sock object, by asking the socket driver for one.  On success,
 * return OK, with a pointer to the new object stored in 'sockp'.  This new
 * object has all its fields set to initial values, in part based on the given
 * parameters.  On failure, return an error code.  Failure has two typical
 * causes: either the given domain, type, protocol combination is not
 * supported, or the socket driver is out of sockets (globally or for this
 * combination).
 */
static int
sockevent_alloc(int domain, int type, int protocol, endpoint_t user_endpt,
	struct sock ** sockp)
{
	struct sock *sock;
	const struct sockevent_ops *ops;
	sockid_t r;

	/*
	 * Verify that the given domain is sane.  Unlike the type and
	 * protocol, the domain is already verified by VFS, so we do not limit
	 * ourselves here.  The result is that we can store the domain in just
	 * a byte.
	 */
	if (domain < 0 || domain > UINT8_MAX)
		return EAFNOSUPPORT;

	/* Make sure that the library has actually been initialized. */
	if (sockevent_socket_cb == NULL)
		panic("libsockevent: not initialized");

	sock = NULL;
	ops = NULL;

	/*
	 * Ask the socket driver to create a socket for the given combination
	 * of domain, type, and protocol.  If so, let it return a new sock
	 * object, a unique socket identifier for that object, and an
	 * operations table for it.
	 */
	if ((r = sockevent_socket_cb(domain, type, protocol, user_endpt, &sock,
	    &ops)) < 0)
		return r;

	assert(sock != NULL);
	assert(ops != NULL);

	sockevent_reset(sock, r, domain, type, ops);

	*sockp = sock;
	return OK;
}

/*
 * Free a previously allocated sock object.
 */
static void
sockevent_free(struct sock * sock)
{
	const struct sockevent_ops *ops;

	assert(sock->sock_proc == NULL);

	socktimer_del(sock);

	sockhash_del(sock);

	/*
	 * Invalidate the operations table on the socket, before freeing the
	 * socket.  This allows us to detect cases where sockevent functions
	 * are called on sockets that have already been freed.
	 */
	ops = sock->sock_ops;
	sock->sock_ops = NULL;

	assert(ops != NULL);
	assert(ops->sop_free != NULL);

	ops->sop_free(sock);
}
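/*
 * A minimal sketch of the creation callback that a socket driver registers
 * at library initialization time (it ends up in 'sockevent_socket_cb'
 * above).  The 'example_sock' container, its allocator, and PF_EXAMPLE are
 * hypothetical; only the callback signature and the convention of returning
 * either a negative error or a nonnegative socket identifier are taken from
 * this library.
 */
#if 0
static sockid_t
example_socket_cb(int domain, int type, int protocol, endpoint_t user_endpt,
	struct sock ** sockp, const struct sockevent_ops ** opsp)
{
	struct example_sock *esock;

	if (domain != PF_EXAMPLE || type != SOCK_DGRAM)
		return EPROTONOSUPPORT;

	if ((esock = example_sock_alloc()) == NULL)
		return ENOBUFS;

	*sockp = &esock->es_sock;	/* embedded sock object */
	*opsp = &example_sockevent_ops;	/* static operations table */
	return esock->es_id;		/* nonnegative socket identifier */
}
#endif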
/*
 * Create a new socket.
 */
static sockid_t
sockevent_socket(int domain, int type, int protocol, endpoint_t user_endpt)
{
	struct sock *sock;
	int r;

	if ((r = sockevent_alloc(domain, type, protocol, user_endpt,
	    &sock)) != OK)
		return r;

	return sock->sock_id;
}

/*
 * Create a pair of connected sockets.
 */
static int
sockevent_socketpair(int domain, int type, int protocol, endpoint_t user_endpt,
	sockid_t id[2])
{
	struct sock *sock1, *sock2;
	int r;

	if ((r = sockevent_alloc(domain, type, protocol, user_endpt,
	    &sock1)) != OK)
		return r;

	/* Creating socket pairs is not always supported. */
	if (sock1->sock_ops->sop_pair == NULL) {
		sockevent_free(sock1);

		return EOPNOTSUPP;
	}

	if ((r = sockevent_alloc(domain, type, protocol, user_endpt,
	    &sock2)) != OK) {
		sockevent_free(sock1);

		return r;
	}

	assert(sock1->sock_ops == sock2->sock_ops);

	r = sock1->sock_ops->sop_pair(sock1, sock2, user_endpt);

	if (r != OK) {
		sockevent_free(sock2);
		sockevent_free(sock1);

		return r;
	}

	id[0] = sock1->sock_id;
	id[1] = sock2->sock_id;
	return OK;
}

/*
 * A send request returned EPIPE.  If desired, send a SIGPIPE signal to the
 * user process that issued the request.
 */
static void
sockevent_sigpipe(struct sock * sock, endpoint_t user_endpt, int flags)
{

	/*
	 * POSIX says that pipe signals should be generated for SOCK_STREAM
	 * sockets.  Linux does just this, NetBSD raises signals for all
	 * socket types.
	 */
	if (sock->sock_type != SOCK_STREAM)
		return;

	/*
	 * Why would there be fewer than four ways to do the same thing?
	 * O_NOSIGPIPE, MSG_NOSIGNAL, SO_NOSIGPIPE, and of course blocking
	 * SIGPIPE.  VFS already sets MSG_NOSIGNAL for calls on sockets with
	 * O_NOSIGPIPE.  The fact that SO_NOSIGPIPE is a thing is also the
	 * reason why we cannot let VFS handle signal generation altogether.
	 */
	if (flags & MSG_NOSIGNAL)
		return;
	if (sock->sock_opt & SO_NOSIGPIPE)
		return;

	/*
	 * Send a SIGPIPE signal to the user process.  Unfortunately we cannot
	 * guarantee that the SIGPIPE reaches the user process before the send
	 * call returns.  Usually, the scheduling priorities of system
	 * services are such that the signal is likely to arrive first anyway,
	 * but if timely arrival of the signal is required, a more fundamental
	 * change to the system would be needed.
	 */
	sys_kill(user_endpt, SIGPIPE);
}
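/*
 * For illustration, the user-side counterparts of the suppression paths
 * tested above; standard POSIX/NetBSD-style calls, not part of this library
 * (O_NOSIGPIPE itself is translated to MSG_NOSIGNAL by VFS, as noted above):
 */
#if 0
static void
example_suppress_sigpipe(int fd, const char * buf, size_t len)
{
	int on = 1;

	(void)send(fd, buf, len, MSG_NOSIGNAL);	/* per-call flag */

	(void)setsockopt(fd, SOL_SOCKET, SO_NOSIGPIPE, &on, sizeof(on));

	(void)signal(SIGPIPE, SIG_IGN);	/* or block/ignore the signal */
}
#endif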
/*
 * Suspend a request without data, that is, a bind, connect, accept, or close
 * request.
 */
static void
sockevent_suspend(struct sock * sock, unsigned int event,
	const struct sockdriver_call * __restrict call, endpoint_t user_endpt)
{
	struct sockevent_proc *spr, **sprp;

	/* There is one slot for each process, so this should never fail. */
	if ((spr = sockevent_proc_alloc()) == NULL)
		panic("libsockevent: too many suspended processes");

	spr->spr_next = NULL;
	spr->spr_event = event;
	spr->spr_timer = FALSE;
	spr->spr_call = *call;
	spr->spr_endpt = user_endpt;

	/*
	 * Add the request to the tail of the queue.  This operation is O(n),
	 * but the number of suspended requests per socket is expected to be
	 * low at all times.
	 */
	for (sprp = &sock->sock_proc; *sprp != NULL;
	    sprp = &(*sprp)->spr_next);
	*sprp = spr;
}

/*
 * Suspend a request with data, that is, a send or receive request.
 */
static void
sockevent_suspend_data(struct sock * sock, unsigned int event, int timer,
	const struct sockdriver_call * __restrict call, endpoint_t user_endpt,
	const struct sockdriver_data * __restrict data, size_t len, size_t off,
	const struct sockdriver_data * __restrict ctl, socklen_t ctl_len,
	socklen_t ctl_off, int flags, int rflags, clock_t time)
{
	struct sockevent_proc *spr, **sprp;

	/* There is one slot for each process, so this should never fail. */
	if ((spr = sockevent_proc_alloc()) == NULL)
		panic("libsockevent: too many suspended processes");

	spr->spr_next = NULL;
	spr->spr_event = event;
	spr->spr_timer = timer;
	spr->spr_call = *call;
	spr->spr_endpt = user_endpt;
	sockdriver_pack_data(&spr->spr_data, call, data, len);
	spr->spr_datalen = len;
	spr->spr_dataoff = off;
	sockdriver_pack_data(&spr->spr_ctl, call, ctl, ctl_len);
	spr->spr_ctllen = ctl_len;
	spr->spr_ctloff = ctl_off;
	spr->spr_flags = flags;
	spr->spr_rflags = rflags;
	spr->spr_time = time;

	/*
	 * Add the request to the tail of the queue.  This operation is O(n),
	 * but the number of suspended requests per socket is expected to be
	 * low at all times.
	 */
	for (sprp = &sock->sock_proc; *sprp != NULL;
	    sprp = &(*sprp)->spr_next);
	*sprp = spr;
}

/*
 * Return TRUE if there are any suspended requests on the given socket's queue
 * that match any of the events in the given event mask, or FALSE otherwise.
 */
static int
sockevent_has_suspended(struct sock * sock, unsigned int mask)
{
	struct sockevent_proc *spr;

	for (spr = sock->sock_proc; spr != NULL; spr = spr->spr_next)
		if (spr->spr_event & mask)
			return TRUE;

	return FALSE;
}

/*
 * Check whether the given call is on the given socket's queue of suspended
 * requests.  If so, remove it from the queue and return a pointer to the
 * suspension data structure.  The caller is then responsible for freeing that
 * data structure using sockevent_proc_free().  If the call was not found, the
 * function returns NULL.
 */
static struct sockevent_proc *
sockevent_unsuspend(struct sock * sock, const struct sockdriver_call * call)
{
	struct sockevent_proc *spr, **sprp;

	/* Find the suspended request being canceled. */
	for (sprp = &sock->sock_proc; (spr = *sprp) != NULL;
	    sprp = &spr->spr_next) {
		if (spr->spr_call.sc_endpt == call->sc_endpt &&
		    spr->spr_call.sc_req == call->sc_req) {
			/* Found; remove and return it. */
			*sprp = spr->spr_next;

			return spr;
		}
	}

	return NULL;
}
/*
 * Attempt to resume the given suspended request for the given socket object.
 * Return TRUE if the suspended request has been fully resumed and can be
 * removed from the queue of suspended requests, or FALSE if it has not been
 * fully resumed and should stay on the queue.  In the latter case, no
 * resumption will be attempted for other suspended requests of the same type.
 */
static int
sockevent_resume(struct sock * sock, struct sockevent_proc * spr)
{
	struct sock *newsock;
	struct sockdriver_data data, ctl;
	char addr[SOCKADDR_MAX];
	socklen_t addr_len;
	size_t len, min;
	sockid_t r;

	switch (spr->spr_event) {
	case SEV_CONNECT:
		/*
		 * If the connect call was suspended for the purpose of
		 * intercepting resumption, simply remove it from the queue.
		 */
		if (spr->spr_call.sc_endpt == NONE)
			return TRUE;

		/* FALLTHROUGH */
	case SEV_BIND:
		if ((r = sock->sock_err) != OK)
			sock->sock_err = OK;

		sockdriver_reply_generic(&spr->spr_call, r);

		return TRUE;

	case SEV_ACCEPT:
		/*
		 * A previous accept call may not have blocked on a socket
		 * that was not in listening mode.
		 */
		assert(sock->sock_opt & SO_ACCEPTCONN);

		addr_len = 0;
		newsock = NULL;

		/*
		 * This call is suspended, which implies that the call table
		 * pointer has already been tested to be non-NULL.
		 */
		if ((r = sock->sock_ops->sop_accept(sock,
		    (struct sockaddr *)&addr, &addr_len, spr->spr_endpt,
		    &newsock)) == SUSPEND)
			return FALSE;

		if (r >= 0) {
			assert(addr_len <= sizeof(addr));

			sockevent_accepted(sock, newsock, r);
		}

		sockdriver_reply_accept(&spr->spr_call, r,
		    (struct sockaddr *)&addr, addr_len);

		return TRUE;

	case SEV_SEND:
		if (sock->sock_err != OK || (sock->sock_flags & SFL_SHUT_WR)) {
			if (spr->spr_dataoff > 0 || spr->spr_ctloff > 0)
				r = (int)spr->spr_dataoff;
			else if ((r = sock->sock_err) != OK)
				sock->sock_err = OK;
			else
				r = EPIPE;
		} else {
			sockdriver_unpack_data(&data, &spr->spr_call,
			    &spr->spr_data, spr->spr_datalen);
			sockdriver_unpack_data(&ctl, &spr->spr_call,
			    &spr->spr_ctl, spr->spr_ctllen);

			len = spr->spr_datalen - spr->spr_dataoff;

			min = sock->sock_slowat;
			if (min > len)
				min = len;

			/*
			 * As mentioned elsewhere, we do not save the address
			 * upon suspension, so we cannot supply it anymore
			 * here.
			 */
			r = sock->sock_ops->sop_send(sock, &data, len,
			    &spr->spr_dataoff, &ctl,
			    spr->spr_ctllen - spr->spr_ctloff,
			    &spr->spr_ctloff, NULL, 0, spr->spr_endpt,
			    spr->spr_flags, min);

			assert(r <= 0);

			if (r == SUSPEND)
				return FALSE;

			/*
			 * If an error occurred but some data were already
			 * sent, return the progress rather than the error.
			 * Note that if the socket driver detects an
			 * asynchronous error during the send, it itself must
			 * perform this check and call sockevent_set_error()
			 * as needed, to make sure the error does not get
			 * lost.
			 */
			if (spr->spr_dataoff > 0 || spr->spr_ctloff > 0)
				r = spr->spr_dataoff;
		}

		if (r == EPIPE)
			sockevent_sigpipe(sock, spr->spr_endpt,
			    spr->spr_flags);

		sockdriver_reply_generic(&spr->spr_call, r);

		return TRUE;

	case SEV_RECV:
		addr_len = 0;

		if (sock->sock_flags & SFL_SHUT_RD)
			r = SOCKEVENT_EOF;
		else {
			len = spr->spr_datalen - spr->spr_dataoff;

			if (sock->sock_err == OK) {
				min = sock->sock_rlowat;
				if (min > len)
					min = len;
			} else
				min = 0;

			sockdriver_unpack_data(&data, &spr->spr_call,
			    &spr->spr_data, spr->spr_datalen);
			sockdriver_unpack_data(&ctl, &spr->spr_call,
			    &spr->spr_ctl, spr->spr_ctllen);

			r = sock->sock_ops->sop_recv(sock, &data, len,
			    &spr->spr_dataoff, &ctl,
			    spr->spr_ctllen - spr->spr_ctloff,
			    &spr->spr_ctloff, (struct sockaddr *)&addr,
			    &addr_len, spr->spr_endpt, spr->spr_flags, min,
			    &spr->spr_rflags);

			/*
			 * If the call remains suspended but a socket error is
			 * pending, return the pending socket error instead.
			 */
			if (r == SUSPEND) {
				if (sock->sock_err == OK)
					return FALSE;

				r = SOCKEVENT_EOF;
			}

			assert(addr_len <= sizeof(addr));
		}

		/*
		 * If the receive call reported success, or if some data were
		 * already received, return the (partial) result.  Otherwise,
		 * return a pending error if any, or otherwise a regular error
		 * or 0 for EOF.
		 */
		if (r == OK || spr->spr_dataoff > 0 || spr->spr_ctloff > 0)
			r = (int)spr->spr_dataoff;
		else if (sock->sock_err != OK) {
			r = sock->sock_err;

			sock->sock_err = OK;
		} else if (r == SOCKEVENT_EOF)
			r = 0;	/* EOF */

		sockdriver_reply_recv(&spr->spr_call, r, spr->spr_ctloff,
		    (struct sockaddr *)&addr, addr_len, spr->spr_rflags);

		return TRUE;

	case SEV_CLOSE:
		sockdriver_reply_generic(&spr->spr_call, OK);

		return TRUE;

	default:
		panic("libsockevent: process suspended on unknown event 0x%x",
		    spr->spr_event);
	}
}
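/*
 * A worked example of the partial-send rule above (numbers are illustrative):
 * a process blocks in a 4096-byte send, of which 1024 bytes have been taken
 * so far (spr_dataoff = 1024), when the driver detects a connection reset and
 * calls sockevent_set_error(sock, ECONNRESET).  The resumed send then replies
 * 1024 to the process, while the error stays pending in 'sock_err' and is
 * returned by the next send or receive call on the socket.
 */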
/*
 * Return TRUE if the given socket is ready for reading for a select call, or
 * FALSE otherwise.
 */
static int
sockevent_test_readable(struct sock * sock)
{
	int r;

	/*
	 * The meaning of "ready-to-read" depends on whether the socket is a
	 * listening socket or not.  For the former, it is a test on whether
	 * there are any new sockets to accept.  However, shutdown flags take
	 * precedence in both cases.
	 */
	if (sock->sock_flags & SFL_SHUT_RD)
		return TRUE;

	if (sock->sock_err != OK)
		return TRUE;

	/*
	 * Depending on whether this is a listening-mode socket, test whether
	 * either accepts or receives would block.
	 */
	if (sock->sock_opt & SO_ACCEPTCONN) {
		if (sock->sock_ops->sop_test_accept == NULL)
			return TRUE;

		r = sock->sock_ops->sop_test_accept(sock);
	} else {
		if (sock->sock_ops->sop_test_recv == NULL)
			return TRUE;

		r = sock->sock_ops->sop_test_recv(sock, sock->sock_rlowat,
		    NULL);
	}

	return (r != SUSPEND);
}

/*
 * Return TRUE if the given socket is ready for writing for a select call, or
 * FALSE otherwise.
 */
static int
sockevent_test_writable(struct sock * sock)
{
	int r;

	if (sock->sock_err != OK)
		return TRUE;

	if (sock->sock_flags & SFL_SHUT_WR)
		return TRUE;

	if (sock->sock_ops->sop_test_send == NULL)
		return TRUE;

	/*
	 * Test whether sends would block.  The low send watermark is relevant
	 * for stream-type sockets only.
	 */
	r = sock->sock_ops->sop_test_send(sock, sock->sock_slowat);

	return (r != SUSPEND);
}

/*
 * Test whether any of the given select operations are ready on the given
 * socket.  Return the subset of ready operations; zero if none.
 */
static unsigned int
sockevent_test_select(struct sock * sock, unsigned int ops)
{
	unsigned int ready_ops;

	assert(!(ops & ~(SDEV_OP_RD | SDEV_OP_WR | SDEV_OP_ERR)));

	/*
	 * We do not support the "bind in progress" case here.  If a blocking
	 * bind call is in progress, the file descriptor should not be ready
	 * for either reading or writing.  Currently, socket drivers will have
	 * to cover this case themselves.  Otherwise we would have to check
	 * the queue of suspended calls, or create a custom flag for this.
	 */

	ready_ops = 0;

	if ((ops & SDEV_OP_RD) && sockevent_test_readable(sock))
		ready_ops |= SDEV_OP_RD;

	if ((ops & SDEV_OP_WR) && sockevent_test_writable(sock))
		ready_ops |= SDEV_OP_WR;

	/* TODO: OOB receive support. */

	return ready_ops;
}
/*
 * Fire the given mask of events on the given socket object now.
 */
static void
sockevent_fire(struct sock * sock, unsigned int mask)
{
	struct sockevent_proc *spr, **sprp;
	unsigned int r, flag, ops;

	/*
	 * A completed connection attempt (successful or not) also always
	 * implies that the socket becomes writable.  For convenience we
	 * enforce this rule here, because it is easy to forget.  Note that in
	 * any case, a suspended connect request should be the first in the
	 * list, so we do not risk returning 0 from a connect call as a result
	 * of sock_err getting eaten by another resumed call.
	 */
	if (mask & SEV_CONNECT)
		mask |= SEV_SEND;

	/*
	 * First try resuming regular system calls.
	 */
	for (sprp = &sock->sock_proc; (spr = *sprp) != NULL; ) {
		flag = spr->spr_event;

		if ((mask & flag) && sockevent_resume(sock, spr)) {
			*sprp = spr->spr_next;

			sockevent_proc_free(spr);
		} else {
			mask &= ~flag;

			sprp = &spr->spr_next;
		}
	}

	/*
	 * Then see if we can satisfy pending select queries.
	 */
	if ((mask & (SEV_ACCEPT | SEV_SEND | SEV_RECV)) &&
	    sock->sock_select.ss_endpt != NONE) {
		assert(sock->sock_selops != 0);

		/*
		 * Only retest select operations that, based on the given
		 * event mask, could possibly be satisfied now.
		 */
		ops = sock->sock_selops;
		if (!(mask & (SEV_ACCEPT | SEV_RECV)))
			ops &= ~SDEV_OP_RD;
		if (!(mask & SEV_SEND))
			ops &= ~SDEV_OP_WR;
		if (!(0))	/* TODO: OOB receive support */
			ops &= ~SDEV_OP_ERR;

		/* Are there any operations to test? */
		if (ops != 0) {
			/* Test those operations. */
			r = sockevent_test_select(sock, ops);

			/* Were any satisfied? */
			if (r != 0) {
				/* Let the caller know. */
				sockdriver_reply_select(&sock->sock_select,
				    sock->sock_id, r);

				sock->sock_selops &= ~r;

				/* Are there any saved operations left now? */
				if (sock->sock_selops == 0)
					sock->sock_select.ss_endpt = NONE;
			}
		}
	}

	/*
	 * Finally, a SEV_CLOSE event unconditionally frees the sock object.
	 * This event should be fired only for sockets that are either not
	 * yet, or not anymore, in use by userland.
	 */
	if (mask & SEV_CLOSE) {
		assert(sock->sock_flags & (SFL_CLONED | SFL_CLOSING));

		sockevent_free(sock);
	}
}

/*
 * Process all pending events.  Events must still be blocked, so that if
 * handling one event generates a new event, that event is handled from here
 * rather than immediately.
 */
static void
sockevent_pump(void)
{
	struct sock *sock;
	unsigned int mask;

	assert(sockevent_working);

	while (!SIMPLEQ_EMPTY(&sockevent_pending)) {
		sock = SIMPLEQ_FIRST(&sockevent_pending);
		SIMPLEQ_REMOVE_HEAD(&sockevent_pending, sock_next);

		mask = sock->sock_events;
		assert(mask != 0);
		sock->sock_events = 0;

		sockevent_fire(sock, mask);
		/*
		 * At this point, the sock object may already have been
		 * re-added to the event list, or even be deallocated
		 * altogether.
		 */
	}
}

/*
 * Return TRUE if any events are pending on any sockets, or FALSE otherwise.
 */
static int
sockevent_has_events(void)
{

	return (!SIMPLEQ_EMPTY(&sockevent_pending));
}
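/*
 * A minimal sketch of how a socket driver is expected to feed this event
 * machinery from its own processing paths; the RX handler and its helper are
 * hypothetical, but sockevent_raise() and SEV_RECV are this library's API.
 */
#if 0
static void
example_packet_arrived(struct example_sock * esock)
{

	example_enqueue_data(esock);	/* hypothetical receive queue */

	/*
	 * Wake up any suspended receive calls and satisfied select queries.
	 * If this runs from within a socket message handler, the event is
	 * merely queued; otherwise it is processed right away.
	 */
	sockevent_raise(&esock->es_sock, SEV_RECV);
}
#endif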
/*
 * Raise the given bitwise-OR'ed set of events on the given socket object.
 * Depending on the context of the call, the events may or may not be
 * processed immediately.
 */
void
sockevent_raise(struct sock * sock, unsigned int mask)
{

	assert(sock->sock_ops != NULL);

	/*
	 * Handle SEV_CLOSE first.  This event must not be deferred, so as to
	 * let socket drivers recycle sock objects as they are needed.  For
	 * example, a user-closed TCP socket may stay open to transmit the
	 * remainder of its send buffer, until the TCP driver runs out of
	 * sockets, in which case the connection is aborted.  The driver would
	 * then raise SEV_CLOSE on the sock object so as to clean it up, and
	 * immediately reuse it afterward.  If the close event were to be
	 * deferred, this immediate reuse would not be possible.
	 *
	 * The sop_free() callback routine may not raise new events, and thus,
	 * the state of 'sockevent_working' need not be checked or set here.
	 */
	if (mask & SEV_CLOSE) {
		assert(mask == SEV_CLOSE);

		sockevent_fire(sock, mask);

		return;
	}

	/*
	 * If we are currently processing a socket message, store the event
	 * for later.  If not, this call is not coming from inside
	 * libsockevent, and we must handle the event immediately.
	 */
	if (sockevent_working) {
		assert(mask != 0);
		assert(mask <= UCHAR_MAX);	/* sock_events field size check */

		if (sock->sock_events == 0)
			SIMPLEQ_INSERT_TAIL(&sockevent_pending, sock,
			    sock_next);

		sock->sock_events |= mask;
	} else {
		sockevent_working = TRUE;

		sockevent_fire(sock, mask);

		if (sockevent_has_events())
			sockevent_pump();

		sockevent_working = FALSE;
	}
}

/*
 * Set a pending error on the socket object, and wake up any suspended
 * operations that are affected by this.
 */
void
sockevent_set_error(struct sock * sock, int err)
{

	assert(err < 0);
	assert(sock->sock_ops != NULL);

	/* If an error was set already, it will be overridden. */
	sock->sock_err = err;

	sockevent_raise(sock, SEV_BIND | SEV_CONNECT | SEV_SEND | SEV_RECV);
}

/*
 * Initialize timer-related data structures.
 */
static void
socktimer_init(void)
{

	SLIST_INIT(&socktimer);

	init_timer(&sockevent_timer);
}
/*
 * Check whether the given socket object has any suspended requests that have
 * now expired.  If so, cancel them.  Also, if the socket object has any
 * suspended requests with a timeout that has not yet expired, return the
 * earliest (relative) timeout of all of them, or TMR_NEVER if no such
 * requests are present.
 */
static clock_t
sockevent_expire(struct sock * sock, clock_t now)
{
	struct sockevent_proc *spr, **sprp;
	clock_t lowest, left;
	int r;

	/*
	 * First handle the case that the socket is closed.  In this case,
	 * there may be a linger timer, although the socket may also simply
	 * still be on the timer list because of a request that did not time
	 * out right before the socket was closed.
	 */
	if (sock->sock_flags & SFL_CLOSING) {
		/* Was there a linger timer and has it expired? */
		if ((sock->sock_opt & SO_LINGER) &&
		    tmr_is_first(sock->sock_linger, now)) {
			assert(sock->sock_ops->sop_close != NULL);

			/*
			 * Whatever happens next, we must now resume the
			 * pending close operation, if it was not canceled
			 * earlier.  As before, we return OK rather than the
			 * standardized EWOULDBLOCK, to ensure that the user
			 * process knows the file descriptor has been closed.
			 */
			if ((spr = sock->sock_proc) != NULL) {
				assert(spr->spr_event == SEV_CLOSE);
				assert(spr->spr_next == NULL);

				sock->sock_proc = NULL;

				sockdriver_reply_generic(&spr->spr_call, OK);

				sockevent_proc_free(spr);
			}

			/*
			 * Tell the socket driver that closing the socket is
			 * now a bit more desired than the last time we asked.
			 */
			r = sock->sock_ops->sop_close(sock, TRUE /*force*/);

			assert(r == OK || r == SUSPEND);

			/*
			 * The linger timer fires once.  After that, the
			 * socket driver is free to decide that it still will
			 * not close the socket.  If it does, do not fire the
			 * linger timer again.
			 */
			if (r == SUSPEND)
				sock->sock_opt &= ~SO_LINGER;
			else
				sockevent_free(sock);
		}

		return TMR_NEVER;
	}

	/*
	 * Then see if any send and/or receive requests have expired.  Also
	 * see if there are any send and/or receive requests left that have
	 * not yet expired but do have a timeout, so that we can return the
	 * lowest of those timeouts.
	 */
	lowest = TMR_NEVER;

	for (sprp = &sock->sock_proc; (spr = *sprp) != NULL; ) {
		/* Skip requests without a timeout. */
		if (spr->spr_timer == 0) {
			sprp = &spr->spr_next;

			continue;
		}

		assert(spr->spr_event == SEV_SEND ||
		    spr->spr_event == SEV_RECV);

		/*
		 * If the request has expired, cancel it and remove it from
		 * the list.  Otherwise, see if the request has the lowest
		 * number of ticks until its timeout so far.
		 */
		if (tmr_is_first(spr->spr_time, now)) {
			*sprp = spr->spr_next;

			if (spr->spr_event == SEV_SEND)
				sockevent_cancel_send(sock, spr, EWOULDBLOCK);
			else
				sockevent_cancel_recv(sock, spr, EWOULDBLOCK);

			sockevent_proc_free(spr);
		} else {
			left = spr->spr_time - now;

			if (lowest == TMR_NEVER || lowest > left)
				lowest = left;

			sprp = &spr->spr_next;
		}
	}

	return lowest;
}
/*
 * The socket event alarm went off.  Go through the set of socket objects with
 * timers, and see if any of their requests have now expired.  Set a new alarm
 * as necessary.
 */
static void
socktimer_expire(int arg __unused)
{
	SLIST_HEAD(, sock) oldtimer;
	struct sock *sock, *tsock;
	clock_t now, lowest, left;
	int working;

	/*
	 * This function may or may not be called from a context where we are
	 * already deferring events, so we have to cover both cases here.
	 */
	if ((working = sockevent_working) == FALSE)
		sockevent_working = TRUE;

	/* Start a new list. */
	memcpy(&oldtimer, &socktimer, sizeof(oldtimer));
	SLIST_INIT(&socktimer);

	now = getticks();
	lowest = TMR_NEVER;

	/*
	 * Go through all sockets that have or had a request with a timeout,
	 * canceling any expired requests and building a new list of sockets
	 * that still have requests with timeouts as we go.
	 */
	SLIST_FOREACH_SAFE(sock, &oldtimer, sock_timer, tsock) {
		assert(sock->sock_flags & SFL_TIMER);
		sock->sock_flags &= ~SFL_TIMER;

		left = sockevent_expire(sock, now);
		/*
		 * The sock object may already have been deallocated now.
		 * If 'left' is TMR_NEVER, do not touch 'sock' anymore.
		 */

		if (left != TMR_NEVER) {
			if (lowest == TMR_NEVER || lowest > left)
				lowest = left;

			SLIST_INSERT_HEAD(&socktimer, sock, sock_timer);

			sock->sock_flags |= SFL_TIMER;
		}
	}

	/* If there is a new lowest timeout at all, set a new timer. */
	if (lowest != TMR_NEVER)
		set_timer(&sockevent_timer, lowest, socktimer_expire, 0);

	if (!working) {
		/* If any new events were raised, process them now. */
		if (sockevent_has_events())
			sockevent_pump();

		sockevent_working = FALSE;
	}
}

/*
 * Set a timer for the given (relative) number of clock ticks, adding the
 * associated socket object to the set of socket objects with timers, if it
 * was not already in that set.  Set a new alarm if necessary, and return the
 * absolute timeout for the timer.  Since the timers list is maintained
 * lazily, the caller need not take the object off the set if the call was
 * canceled later; see also socktimer_del().
 */
static clock_t
socktimer_add(struct sock * sock, clock_t ticks)
{
	clock_t now;

	/*
	 * Relative time comparisons require that any two times are no more
	 * than half the comparison space (clock_t, unsigned long) apart.
	 */
	assert(ticks <= TMRDIFF_MAX);

	/* If the socket was not already on the timers list, put it on. */
	if (!(sock->sock_flags & SFL_TIMER)) {
		SLIST_INSERT_HEAD(&socktimer, sock, sock_timer);

		sock->sock_flags |= SFL_TIMER;
	}

	/*
	 * (Re)set the timer if either it was not running at all or this new
	 * timeout will occur sooner than the currently scheduled alarm.  Note
	 * that setting a timer that was already set is allowed.
	 */
	now = getticks();

	if (!tmr_is_set(&sockevent_timer) ||
	    tmr_is_first(now + ticks, tmr_exp_time(&sockevent_timer)))
		set_timer(&sockevent_timer, ticks, socktimer_expire, 0);

	/* Return the absolute timeout. */
	return now + ticks;
}

/*
 * Remove a socket object from the set of socket objects with timers.  Since
 * the timer list is maintained lazily, this needs to be done only right
 * before the socket object is freed.
 */
static void
socktimer_del(struct sock * sock)
{

	if (sock->sock_flags & SFL_TIMER) {
		/* This macro is O(n). */
		SLIST_REMOVE(&socktimer, sock, sock, sock_timer);

		sock->sock_flags &= ~SFL_TIMER;
	}
}
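/*
 * Worked example, with illustrative numbers: at a system clock frequency of
 * 100 Hz, a socket receive timeout (SO_RCVTIMEO) of 2.5 seconds makes a
 * suspended receive call invoke socktimer_add(sock, 250).  If getticks()
 * returns 12000, the call records an absolute expiry time of 12250 in
 * 'spr_time', and the single global alarm is (re)set only if it is not
 * already due to fire at or before tick 12250.
 */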
/*
 * Bind a socket to a local address.
 */
static int
sockevent_bind(sockid_t id, const struct sockaddr * __restrict addr,
	socklen_t addr_len, endpoint_t user_endpt,
	const struct sockdriver_call * __restrict call)
{
	struct sock *sock;
	int r;

	if ((sock = sockhash_get(id)) == NULL)
		return EINVAL;

	if (sock->sock_ops->sop_bind == NULL)
		return EOPNOTSUPP;

	/* Binding a socket in listening mode is never supported. */
	if (sock->sock_opt & SO_ACCEPTCONN)
		return EINVAL;

	r = sock->sock_ops->sop_bind(sock, addr, addr_len, user_endpt);

	if (r == SUSPEND) {
		if (call == NULL)
			return EINPROGRESS;

		sockevent_suspend(sock, SEV_BIND, call, user_endpt);
	}

	return r;
}

/*
 * Connect a socket to a remote address.
 */
static int
sockevent_connect(sockid_t id, const struct sockaddr * __restrict addr,
	socklen_t addr_len, endpoint_t user_endpt,
	const struct sockdriver_call * call)
{
	struct sockdriver_call fakecall;
	struct sockevent_proc *spr;
	struct sock *sock;
	int r;

	if ((sock = sockhash_get(id)) == NULL)
		return EINVAL;

	if (sock->sock_ops->sop_connect == NULL)
		return EOPNOTSUPP;

	/* Connecting a socket in listening mode is never supported. */
	if (sock->sock_opt & SO_ACCEPTCONN)
		return EOPNOTSUPP;

	/*
	 * The upcoming connect call may fire an accept event for which the
	 * handler may in turn fire a connect event on this socket.  Since we
	 * delay event processing until after processing calls, this would
	 * create the problem that even if the connection is accepted right
	 * away, non-blocking connect requests would return EINPROGRESS.  For
	 * UDS, this is undesirable behavior.  To remedy this, we use a hack:
	 * we temporarily suspend the connect even if non-blocking, then
	 * process events, and then cancel the connect request again.  If the
	 * connection was accepted immediately, the cancellation will have no
	 * effect, since the request has already been replied to.  In order
	 * not to violate libsockdriver rules with this hack, we fabricate a
	 * fake 'call' object.
	 */
	r = sock->sock_ops->sop_connect(sock, addr, addr_len, user_endpt);

	if (r == SUSPEND) {
		if (call != NULL || sockevent_has_events()) {
			if (call == NULL) {
				fakecall.sc_endpt = NONE;

				call = &fakecall;
			}

			assert(!sockevent_has_suspended(sock,
			    SEV_SEND | SEV_RECV));

			sockevent_suspend(sock, SEV_CONNECT, call, user_endpt);

			if (call == &fakecall) {
				/* Process any pending events first now. */
				sockevent_pump();

				/*
				 * If the connect request has not been resumed
				 * yet now, we must remove it from the queue
				 * again, and return EINPROGRESS ourselves.
				 * Otherwise, return OK or a pending error.
				 */
				spr = sockevent_unsuspend(sock, call);
				if (spr != NULL) {
					sockevent_proc_free(spr);

					r = EINPROGRESS;
				} else if ((r = sock->sock_err) != OK)
					sock->sock_err = OK;
			}
		} else
			r = EINPROGRESS;
	}

	if (r == OK) {
		/*
		 * A completed connection attempt also always implies that the
		 * socket becomes writable.  For convenience we enforce this
		 * rule here, because it is easy to forget.
		 */
		sockevent_raise(sock, SEV_SEND);
	}

	return r;
}
/*
 * Put a socket in listening mode.
 */
static int
sockevent_listen(sockid_t id, int backlog)
{
	struct sock *sock;
	int r;

	if ((sock = sockhash_get(id)) == NULL)
		return EINVAL;

	if (sock->sock_ops->sop_listen == NULL)
		return EOPNOTSUPP;

	/*
	 * Perform a general adjustment on the backlog value, applying the
	 * customary BSD "fudge factor" of 1.5x.  Keep the value within bounds
	 * though.  POSIX imposes that a negative backlog value is equal to a
	 * backlog value of zero.  A backlog value of zero, in turn, may mean
	 * anything; we take it to be one.  POSIX also imposes that all socket
	 * drivers accept up to at least SOMAXCONN connections on the queue.
	 * (See the worked example after this function.)
	 */
	if (backlog < 0)
		backlog = 0;
	if (backlog < SOMAXCONN)
		backlog += 1 + ((unsigned int)backlog >> 1);
	if (backlog > SOMAXCONN)
		backlog = SOMAXCONN;

	r = sock->sock_ops->sop_listen(sock, backlog);

	/*
	 * On success, the socket is now in listening mode.  As part of that,
	 * a select(2) ready-to-read condition now indicates that a connection
	 * may be accepted on the socket, rather than that data may be read.
	 * Since libsockevent is responsible for this distinction, we keep
	 * track of the listening mode at this level.  Conveniently, there is
	 * a socket option for this, which we support out of the box as a
	 * result.
	 */
	if (r == OK) {
		sock->sock_opt |= SO_ACCEPTCONN;

		/*
		 * For the extremely unlikely case that right after the socket
		 * is put into listening mode, it has a connection ready to
		 * accept, we retest blocked ready-to-read select queries now.
		 */
		sockevent_raise(sock, SEV_ACCEPT);
	}

	return r;
}
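/*
 * Worked example of the backlog adjustment above, assuming SOMAXCONN is 128:
 * listen(fd, -5) and listen(fd, 0) both yield an effective backlog of 1;
 * listen(fd, 4) yields 4 + 1 + (4 >> 1) = 7; listen(fd, 100) yields
 * 100 + 1 + 50 = 151, which is then clamped to 128; and any requested value
 * of 128 or more is passed through as exactly 128.
 */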
/*
 * Accept a connection on a listening socket, creating a new socket.
 */
static sockid_t
sockevent_accept(sockid_t id, struct sockaddr * __restrict addr,
	socklen_t * __restrict addr_len, endpoint_t user_endpt,
	const struct sockdriver_call * __restrict call)
{
	struct sock *sock, *newsock;
	sockid_t r;

	if ((sock = sockhash_get(id)) == NULL)
		return EINVAL;

	if (sock->sock_ops->sop_accept == NULL)
		return EOPNOTSUPP;

	/*
	 * Attempt to accept a connection.  The socket driver is responsible
	 * for allocating a sock object (and identifier) on success.  It may
	 * already have done so before, in which case it should leave newsock
	 * filled with NULL; otherwise, the returned sock object is cloned
	 * from the listening socket.  The socket driver is also responsible
	 * for failing the call if the socket is not in listening mode,
	 * because it must specify the error to return: EOPNOTSUPP or EINVAL.
	 */
	newsock = NULL;

	if ((r = sock->sock_ops->sop_accept(sock, addr, addr_len, user_endpt,
	    &newsock)) == SUSPEND) {
		assert(sock->sock_opt & SO_ACCEPTCONN);

		if (call == NULL)
			return EWOULDBLOCK;

		sockevent_suspend(sock, SEV_ACCEPT, call, user_endpt);

		return SUSPEND;
	}

	if (r >= 0)
		sockevent_accepted(sock, newsock, r);

	return r;
}

/*
 * Send regular and/or control data.
 */
static int
sockevent_send(sockid_t id, const struct sockdriver_data * __restrict data,
	size_t len, const struct sockdriver_data * __restrict ctl_data,
	socklen_t ctl_len, const struct sockaddr * __restrict addr,
	socklen_t addr_len, endpoint_t user_endpt, int flags,
	const struct sockdriver_call * __restrict call)
{
	struct sock *sock;
	clock_t time;
	size_t min, off;
	socklen_t ctl_off;
	int r, timer;

	if ((sock = sockhash_get(id)) == NULL)
		return EINVAL;

	/*
	 * The order of the following checks is not necessarily fixed, and may
	 * be changed later.  As far as applicable, they should match the
	 * order of the checks during call resumption, though.
	 */
	if ((r = sock->sock_err) != OK) {
		sock->sock_err = OK;

		return r;
	}

	if (sock->sock_flags & SFL_SHUT_WR) {
		sockevent_sigpipe(sock, user_endpt, flags);

		return EPIPE;
	}

	/*
	 * Translate the sticky SO_DONTROUTE option to a per-request
	 * MSG_DONTROUTE flag.  This achieves two purposes: socket drivers
	 * have to check only one flag, and socket drivers that do not support
	 * the flag will fail send requests in a consistent way.
	 */
	if (sock->sock_opt & SO_DONTROUTE)
		flags |= MSG_DONTROUTE;

	/*
	 * Check if this is a valid send request as far as the socket driver
	 * is concerned.  We do this separately from sop_send for the reason
	 * that this send request may immediately be queued behind other
	 * pending send requests (without a call to sop_send), which means
	 * even invalid requests would be queued and not return failure until
	 * much later.
	 */
	if (sock->sock_ops->sop_pre_send != NULL &&
	    (r = sock->sock_ops->sop_pre_send(sock, len, ctl_len, addr,
	    addr_len, user_endpt,
	    flags & ~(MSG_DONTWAIT | MSG_NOSIGNAL))) != OK)
		return r;

	if (sock->sock_ops->sop_send == NULL)
		return EOPNOTSUPP;

	off = 0;
	ctl_off = 0;

	/*
	 * Sending out-of-band data is treated differently from regular data:
	 *
	 * - sop_send is called immediately, even if a partial non-OOB send
	 *   operation is currently suspended (TODO: it may have to be aborted
	 *   in order to maintain atomicity guarantees - that should be easy);
	 * - sop_send must not return SUSPEND; instead, if it cannot process
	 *   the OOB data immediately, it must return an appropriate error;
	 * - the send low watermark is ignored.
	 *
	 * Given that none of the current socket drivers support OOB data at
	 * all, more sophisticated approaches would have no added value now.
	 */
	if (flags & MSG_OOB) {
		r = sock->sock_ops->sop_send(sock, data, len, &off, ctl_data,
		    ctl_len, &ctl_off, addr, addr_len, user_endpt, flags, 0);

		if (r == SUSPEND)
			panic("libsockevent: MSG_OOB send calls may not be "
			    "suspended");

		return (r == OK) ? (int)off : r;
	}

	/*
	 * Only call the actual sop_send function now if no other send calls
	 * are suspended already.
	 *
	 * Call sop_send with 'min' set to the minimum of the request size and
	 * the socket's send low water mark, but only if the call is non-
	 * blocking.  For stream-oriented sockets, this should have the effect
	 * that non-blocking calls fail with EWOULDBLOCK if not at least that
	 * much can be sent immediately.  For consistency, we choose to apply
	 * the same threshold to blocking calls.  For datagram-oriented
	 * sockets, the minimum is not a factor to be considered.
	 */
	if (!sockevent_has_suspended(sock, SEV_SEND)) {
		min = sock->sock_slowat;
		if (min > len)
			min = len;

		r = sock->sock_ops->sop_send(sock, data, len, &off, ctl_data,
		    ctl_len, &ctl_off, addr, addr_len, user_endpt, flags, min);
	} else
		r = SUSPEND;

	if (r == SUSPEND) {
		/*
		 * We do not store the target's address on suspension, because
		 * that would add significantly to the per-process suspension
		 * state.  As a result, we disallow socket drivers from
		 * suspending send calls with addresses, because we would no
		 * longer have the address for proper call resumption.
		 * However, we do not know here whether the socket is in
		 * connection-oriented mode; if it is, the address is to be
		 * ignored altogether.  Therefore, there is no test on 'addr'
		 * here.  Resumed calls will get a NULL address pointer, and
		 * the socket driver is expected to do the right thing.
		 */

		/*
		 * For non-blocking socket calls, return an error only if we
		 * were not able to send anything at all.  If only control
		 * data were sent, the return value is therefore zero.
		 */
		if (call != NULL) {
			if (sock->sock_stimeo != 0) {
				timer = TRUE;
				time = socktimer_add(sock, sock->sock_stimeo);
			} else {
				timer = FALSE;
				time = 0;
			}

			sockevent_suspend_data(sock, SEV_SEND, timer, call,
			    user_endpt, data, len, off, ctl_data, ctl_len,
			    ctl_off, flags, 0, time);
		} else
			r = (off > 0 || ctl_off > 0) ? OK : EWOULDBLOCK;
	} else if (r == EPIPE)
		sockevent_sigpipe(sock, user_endpt, flags);

	return (r == OK) ? (int)off : r;
}
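/*
 * Worked example of the 'min' threshold above, with illustrative numbers: on
 * a stream socket with SO_SNDLOWAT set to 1024, a send request for 4096 bytes
 * calls sop_send with min = 1024, so the driver may accept a partial amount
 * but returns SUSPEND if it cannot take at least 1024 bytes.  A request for
 * 256 bytes calls sop_send with min = 256, since the low water mark is capped
 * to the request size.
 */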
/*
 * The inner part of the receive request handler.  An error returned from here
 * may be overridden by an error pending on the socket, although data returned
 * from here trumps such pending errors.
 */
static int
sockevent_recv_inner(struct sock * sock,
	const struct sockdriver_data * __restrict data,
	size_t len, size_t * __restrict off,
	const struct sockdriver_data * __restrict ctl_data,
	socklen_t ctl_len, socklen_t * __restrict ctl_off,
	struct sockaddr * __restrict addr,
	socklen_t * __restrict addr_len, endpoint_t user_endpt,
	int * __restrict flags, const struct sockdriver_call * __restrict call)
{
	clock_t time;
	size_t min;
	int r, oob, inflags, timer;

	/*
	 * Check if this is a valid receive request as far as the socket
	 * driver is concerned.  We do this separately from sop_recv for the
	 * reason that this receive request may immediately be queued behind
	 * other pending receive requests (without a call to sop_recv), which
	 * means even invalid requests would be queued and not return failure
	 * until much later.
	 */
	inflags = *flags;
	*flags = 0;

	if (sock->sock_ops->sop_pre_recv != NULL &&
	    (r = sock->sock_ops->sop_pre_recv(sock, user_endpt,
	    inflags & ~(MSG_DONTWAIT | MSG_NOSIGNAL))) != OK)
		return r;

	/*
	 * The order of the following checks is not necessarily fixed, and may
	 * be changed later.  As far as applicable, they should match the
	 * order of the checks during call resumption, though.
	 */
	if (sock->sock_flags & SFL_SHUT_RD)
		return SOCKEVENT_EOF;

	if (sock->sock_ops->sop_recv == NULL)
		return EOPNOTSUPP;

	/*
	 * Receiving out-of-band data is treated differently from regular
	 * data:
	 *
	 * - sop_recv is called immediately, even if a partial non-OOB receive
	 *   operation is currently suspended (TODO: it may have to be aborted
	 *   in order to maintain atomicity guarantees - that should be easy);
	 * - sop_recv must not return SUSPEND; instead, if it cannot return
	 *   any of the OOB data immediately, it must return an appropriate
	 *   error;
	 * - the receive low watermark is ignored.
	 *
	 * Given that none of the current socket drivers support OOB data at
	 * all, more sophisticated approaches would have no added value now.
	 */
	oob = (inflags & MSG_OOB);

	if (oob && (sock->sock_opt & SO_OOBINLINE))
		return EINVAL;

	/*
	 * Only call the actual sop_recv function now if no other receive
	 * calls are suspended already.
	 *
	 * Call sop_recv with 'min' set to the minimum of the request size and
	 * the socket's receive low water mark, unless there is a pending
	 * error.  As a result, blocking calls will block, and non-blocking
	 * calls will yield EWOULDBLOCK, if not at least that much can be
	 * received, unless another condition (EOF or that pending error)
	 * prevents more from being received anyway.  For datagram-oriented
	 * sockets, the minimum is not a factor to be considered.
	 */
	if (oob || !sockevent_has_suspended(sock, SEV_RECV)) {
		if (!oob && sock->sock_err == OK) {
			min = sock->sock_rlowat;
			if (min > len)
				min = len;
		} else
			min = 0;	/* receive even no-data segments */

		r = sock->sock_ops->sop_recv(sock, data, len, off, ctl_data,
		    ctl_len, ctl_off, addr, addr_len, user_endpt, inflags, min,
		    flags);
	} else
		r = SUSPEND;

	assert(r <= 0 || r == SOCKEVENT_EOF);

	if (r == SUSPEND) {
		if (oob)
			panic("libsockevent: MSG_OOB receive calls may not be "
			    "suspended");

		/*
		 * For non-blocking socket calls, return EWOULDBLOCK only if
		 * we did not receive anything at all.  If only control data
		 * were received, the return value is therefore zero.
		 * Suspension implies that there is nothing to read.  For the
		 * purpose of the calling wrapper function, never suspend a
		 * call when there is a pending error.
		 */
		if (call != NULL && sock->sock_err == OK) {
			if (sock->sock_rtimeo != 0) {
				timer = TRUE;
				time = socktimer_add(sock, sock->sock_rtimeo);
			} else {
				timer = FALSE;
				time = 0;
			}

			sockevent_suspend_data(sock, SEV_RECV, timer, call,
			    user_endpt, data, len, *off, ctl_data,
			    ctl_len, *ctl_off, inflags, *flags, time);
		} else
			r = EWOULDBLOCK;
	}

	return r;
}
/*
 * Receive regular and/or control data.
 */
static int
sockevent_recv(sockid_t id, const struct sockdriver_data * __restrict data,
	size_t len, const struct sockdriver_data * __restrict ctl_data,
	socklen_t * __restrict ctl_len, struct sockaddr * __restrict addr,
	socklen_t * __restrict addr_len, endpoint_t user_endpt,
	int * __restrict flags, const struct sockdriver_call * __restrict call)
{
	struct sock *sock;
	size_t off;
	socklen_t ctl_inlen;
	int r;

	if ((sock = sockhash_get(id)) == NULL)
		return EINVAL;

	/*
	 * This function is a wrapper around the actual receive functionality.
	 * The reason for this is that receiving data should take precedence
	 * over a pending socket error, while a pending socket error should
	 * take precedence over both regular errors as well as EOF.  In other
	 * words: if there is a pending error, we must try to receive anything
	 * at all; if receiving does not work, we must fail the call with the
	 * pending error.  However, until we call the receive callback, we
	 * have no way of telling whether any data can be received.  So we
	 * must try that before we can decide whether to return a pending
	 * error.
	 */
	off = 0;
	ctl_inlen = *ctl_len;
	*ctl_len = 0;

	/*
	 * Attempt to perform the actual receive call.
	 */
	r = sockevent_recv_inner(sock, data, len, &off, ctl_data, ctl_inlen,
	    ctl_len, addr, addr_len, user_endpt, flags, call);

	/*
	 * If the receive request succeeded, or it failed but yielded a
	 * partial result, then return the (partial) result.  Otherwise, if
	 * an error is pending, return that error.  Otherwise, return either
	 * a regular error or 0 for EOF.
	 */
	if (r == OK || (r != SUSPEND && (off > 0 || *ctl_len > 0)))
		r = (int)off;
	else if (sock->sock_err != OK) {
		assert(r != SUSPEND);

		r = sock->sock_err;

		sock->sock_err = OK;
	} else if (r == SOCKEVENT_EOF)
		r = 0;

	return r;
}
/*
 * Process an I/O control call.
 */
static int
sockevent_ioctl(sockid_t id, unsigned long request,
	const struct sockdriver_data * __restrict data, endpoint_t user_endpt,
	const struct sockdriver_call * __restrict call __unused)
{
	struct sock *sock;
	size_t size;
	int r, val;

	if ((sock = sockhash_get(id)) == NULL)
		return EINVAL;

	/* We handle a very small subset of generic IOCTLs here. */
	switch (request) {
	case FIONREAD:
		size = 0;
		if (!(sock->sock_flags & SFL_SHUT_RD) &&
		    sock->sock_ops->sop_test_recv != NULL)
			(void)sock->sock_ops->sop_test_recv(sock, 0, &size);

		val = (int)size;

		return sockdriver_copyout(data, 0, &val, sizeof(val));
	}

	if (sock->sock_ops->sop_ioctl == NULL)
		return ENOTTY;

	r = sock->sock_ops->sop_ioctl(sock, request, data, user_endpt);

	/*
	 * Suspending IOCTL requests is not currently supported by this
	 * library, even though the VFS protocol and libsockdriver do support
	 * it.  The reason is that IOCTLs do not match our process suspension
	 * model: they can be neither queued nor repeated.  For now, it seems
	 * that this feature is not needed by the socket drivers either.
	 * Thus, even though there are possible solutions, we defer
	 * implementing them until we know what exactly is needed.
	 */
	if (r == SUSPEND)
		panic("libsockevent: socket driver suspended IOCTL 0x%lx",
		    request);

	return r;
}
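/*
 * For illustration, the user-side call that ends up in the FIONREAD case
 * above (standard ioctl(2) usage, not part of this library): the reported
 * count is whatever the driver's sop_test_recv callback says is immediately
 * available, and 0 after shutdown(fd, SHUT_RD).
 */
#if 0
static void
example_fionread(int fd)
{
	int avail;

	if (ioctl(fd, FIONREAD, &avail) == 0)
		printf("%d bytes can be read without blocking\n", avail);
}
#endif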
1872 			 */
1873 			if (sock->sock_ops->sop_setsockmask != NULL)
1874 				sock->sock_ops->sop_setsockmask(sock,
1875 				    sock->sock_opt);
1876 
1877 			/*
1878 			 * The inlining of OOB data may make new data available
1879 			 * through regular receive calls.  Thus, see if we can
1880 			 * wake up any suspended receive calls now.
1881 			 */
1882 			if (name == SO_OOBINLINE && val)
1883 				sockevent_raise(sock, SEV_RECV);
1884 
1885 			return OK;
1886 
1887 		case SO_LINGER:
1888 			/* The only on-off option with an associated value. */
1889 			if ((r = sockdriver_copyin_opt(data, &linger,
1890 			    sizeof(linger), len)) != OK)
1891 				return r;
1892 
1893 			if (linger.l_onoff) {
1894 				if (linger.l_linger < 0)
1895 					return EINVAL;
1896 				/* EDOM is the closest applicable error. */
1897 				secs = (clock_t)linger.l_linger;
1898 				if (secs >= TMRDIFF_MAX / sys_hz())
1899 					return EDOM;
1900 
1901 				sock->sock_opt |= SO_LINGER;
1902 				sock->sock_linger = secs * sys_hz();
1903 			} else {
1904 				sock->sock_opt &= ~SO_LINGER;
1905 				sock->sock_linger = 0;
1906 			}
1907 
1908 			return OK;
1909 
1910 		case SO_SNDLOWAT:
1911 		case SO_RCVLOWAT:
1912 			if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
1913 			    len)) != OK)
1914 				return r;
1915 
1916 			if (val <= 0)
1917 				return EINVAL;
1918 
1919 			/*
1920 			 * Setting these values may allow suspended operations
1921 			 * (send, recv, select) to be resumed, so recheck.
1922 			 */
1923 			if (name == SO_SNDLOWAT) {
1924 				sock->sock_slowat = (size_t)val;
1925 
1926 				sockevent_raise(sock, SEV_SEND);
1927 			} else {
1928 				sock->sock_rlowat = (size_t)val;
1929 
1930 				sockevent_raise(sock, SEV_RECV);
1931 			}
1932 
1933 			return OK;
1934 
1935 		case SO_SNDTIMEO:
1936 		case SO_RCVTIMEO:
1937 			if ((r = sockdriver_copyin_opt(data, &tv, sizeof(tv),
1938 			    len)) != OK)
1939 				return r;
1940 
1941 			if (tv.tv_sec < 0 || tv.tv_usec < 0 ||
1942 			    (unsigned long)tv.tv_usec >= US)
1943 				return EINVAL;
1944 			if (tv.tv_sec >= TMRDIFF_MAX / sys_hz())
1945 				return EDOM;
1946 
1947 			ticks = tv.tv_sec * sys_hz() +
1948 			    (tv.tv_usec * sys_hz() + US - 1) / US;
1949 
1950 			if (name == SO_SNDTIMEO)
1951 				sock->sock_stimeo = ticks;
1952 			else
1953 				sock->sock_rtimeo = ticks;
1954 
1955 			/*
1956 			 * The timeouts for any calls already in progress for
1957 			 * this socket are left as is.
1958 			 */
1959 			return OK;
1960 
1961 		case SO_ACCEPTCONN:
1962 		case SO_ERROR:
1963 		case SO_TYPE:
1964 			/* These options may be retrieved but not set. */
1965 			return ENOPROTOOPT;
1966 
1967 		default:
1968 			/*
1969 			 * The remaining options either cannot be handled in a
1970 			 * generic way, or are not recognized altogether.  Pass
1971 			 * them to the socket driver, which should handle what
1972 			 * it knows and reject the rest.
1973 			 */
1974 			break;
1975 		}
1976 	}
1977 
1978 	if (sock->sock_ops->sop_setsockopt == NULL)
1979 		return ENOPROTOOPT;
1980 
1981 	/*
1982 	 * The socket driver must return ENOPROTOOPT for all options it does
1983 	 * not recognize.
1984 	 */
1985 	return sock->sock_ops->sop_setsockopt(sock, level, name, data, len);
1986 }
1987 
1988 /*
1989  * Retrieve socket options.
1990  */
1991 static int
1992 sockevent_getsockopt(sockid_t id, int level, int name,
1993     const struct sockdriver_data * __restrict data,
1994     socklen_t * __restrict len)
1995 {
1996 	struct sock *sock;
1997 	struct linger linger;
1998 	struct timeval tv;
1999 	clock_t ticks;
2000 	int val;
2001 
2002 	if ((sock = sockhash_get(id)) == NULL)
2003 		return EINVAL;
2004 
2005 	if (level == SOL_SOCKET) {
2006 		/*
2007 		 * As with setting, handle a subset of the socket-level options
2008 		 * here.  The rest is to be taken care of by the socket driver.
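		 *
		 * As a worked example of the timeout conversions (assuming a
		 * clock frequency of sys_hz() == 100): an SO_RCVTIMEO of 2.5
		 * seconds was stored by the setting code above as 2 * 100 +
		 * (500000 * 100 + 999999) / 1000000 = 250 ticks, and is
		 * converted back below to tv_sec = 250 / 100 = 2 and
		 * tv_usec = (250 % 100) * 1000000 / 100 = 500000.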
2009 */ 2010 switch (name) { 2011 case SO_DEBUG: 2012 case SO_ACCEPTCONN: 2013 case SO_REUSEADDR: 2014 case SO_KEEPALIVE: 2015 case SO_DONTROUTE: 2016 case SO_BROADCAST: 2017 case SO_OOBINLINE: 2018 case SO_REUSEPORT: 2019 case SO_NOSIGPIPE: 2020 case SO_TIMESTAMP: 2021 val = !!(sock->sock_opt & (unsigned int)name); 2022 2023 return sockdriver_copyout_opt(data, &val, sizeof(val), 2024 len); 2025 2026 case SO_LINGER: 2027 linger.l_onoff = !!(sock->sock_opt & SO_LINGER); 2028 linger.l_linger = sock->sock_linger / sys_hz(); 2029 2030 return sockdriver_copyout_opt(data, &linger, 2031 sizeof(linger), len); 2032 2033 case SO_ERROR: 2034 if ((val = -sock->sock_err) != OK) 2035 sock->sock_err = OK; 2036 2037 return sockdriver_copyout_opt(data, &val, sizeof(val), 2038 len); 2039 2040 case SO_TYPE: 2041 val = sock->sock_type; 2042 2043 return sockdriver_copyout_opt(data, &val, sizeof(val), 2044 len); 2045 2046 case SO_SNDLOWAT: 2047 val = (int)sock->sock_slowat; 2048 2049 return sockdriver_copyout_opt(data, &val, sizeof(val), 2050 len); 2051 2052 case SO_RCVLOWAT: 2053 val = (int)sock->sock_rlowat; 2054 2055 return sockdriver_copyout_opt(data, &val, sizeof(val), 2056 len); 2057 2058 case SO_SNDTIMEO: 2059 case SO_RCVTIMEO: 2060 if (name == SO_SNDTIMEO) 2061 ticks = sock->sock_stimeo; 2062 else 2063 ticks = sock->sock_rtimeo; 2064 2065 tv.tv_sec = ticks / sys_hz(); 2066 tv.tv_usec = (ticks % sys_hz()) * US / sys_hz(); 2067 2068 return sockdriver_copyout_opt(data, &tv, sizeof(tv), 2069 len); 2070 2071 default: 2072 break; 2073 } 2074 } 2075 2076 if (sock->sock_ops->sop_getsockopt == NULL) 2077 return ENOPROTOOPT; 2078 2079 /* 2080 * The socket driver must return ENOPROTOOPT for all options it does 2081 * not recognize. 2082 */ 2083 return sock->sock_ops->sop_getsockopt(sock, level, name, data, len); 2084 } 2085 2086 /* 2087 * Retrieve a socket's local address. 2088 */ 2089 static int 2090 sockevent_getsockname(sockid_t id, struct sockaddr * __restrict addr, 2091 socklen_t * __restrict addr_len) 2092 { 2093 struct sock *sock; 2094 2095 if ((sock = sockhash_get(id)) == NULL) 2096 return EINVAL; 2097 2098 if (sock->sock_ops->sop_getsockname == NULL) 2099 return EOPNOTSUPP; 2100 2101 return sock->sock_ops->sop_getsockname(sock, addr, addr_len); 2102 } 2103 2104 /* 2105 * Retrieve a socket's remote address. 2106 */ 2107 static int 2108 sockevent_getpeername(sockid_t id, struct sockaddr * __restrict addr, 2109 socklen_t * __restrict addr_len) 2110 { 2111 struct sock *sock; 2112 2113 if ((sock = sockhash_get(id)) == NULL) 2114 return EINVAL; 2115 2116 /* Listening-mode sockets cannot possibly have a peer address. */ 2117 if (sock->sock_opt & SO_ACCEPTCONN) 2118 return ENOTCONN; 2119 2120 if (sock->sock_ops->sop_getpeername == NULL) 2121 return EOPNOTSUPP; 2122 2123 return sock->sock_ops->sop_getpeername(sock, addr, addr_len); 2124 } 2125 2126 /* 2127 * Mark the socket object as shut down for sending and/or receiving. The flags 2128 * parameter may be a bitwise-OR'ed combination of SFL_SHUT_RD and SFL_SHUT_WR. 2129 * This function will wake up any suspended requests affected by this change, 2130 * but it will not invoke the sop_shutdown() callback function on the socket. 2131 * The function may in fact be called from sop_shutdown() before completion to 2132 * mark the socket as shut down as reflected by sockevent_is_shutdown(). 
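 *
 * As a hypothetical example of that last pattern, a socket driver could mark
 * the socket as shut down first, so that its own teardown helpers already
 * observe the new state (the mydrv_ names are made up):
 *
 *	static int
 *	mydrv_shutdown(struct sock * sock, unsigned int flags)
 *	{
 *		sockevent_set_shutdown(sock, flags);
 *
 *		if (flags & SFL_SHUT_WR)
 *			mydrv_abort_sends(sock);
 *
 *		return OK;
 *	}
 *
 * Since this function ignores flags that are already set, the second call
 * that sockevent_shutdown() makes once sop_shutdown() returns OK is
 * harmless.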
2133 */ 2134 void 2135 sockevent_set_shutdown(struct sock * sock, unsigned int flags) 2136 { 2137 unsigned int mask; 2138 2139 assert(sock->sock_ops != NULL); 2140 assert(!(flags & ~(SFL_SHUT_RD | SFL_SHUT_WR))); 2141 2142 /* Look at the newly set flags only. */ 2143 flags &= ~(unsigned int)sock->sock_flags; 2144 2145 if (flags != 0) { 2146 sock->sock_flags |= flags; 2147 2148 /* 2149 * Wake up any blocked calls that are affected by the shutdown. 2150 * Shutting down listening sockets causes ongoing accept calls 2151 * to be rechecked. 2152 */ 2153 mask = 0; 2154 if (flags & SFL_SHUT_RD) 2155 mask |= SEV_RECV; 2156 if (flags & SFL_SHUT_WR) 2157 mask |= SEV_SEND; 2158 if (sock->sock_opt & SO_ACCEPTCONN) 2159 mask |= SEV_ACCEPT; 2160 2161 assert(mask != 0); 2162 sockevent_raise(sock, mask); 2163 } 2164 } 2165 2166 /* 2167 * Shut down socket send and receive operations. 2168 */ 2169 static int 2170 sockevent_shutdown(sockid_t id, int how) 2171 { 2172 struct sock *sock; 2173 unsigned int flags; 2174 int r; 2175 2176 if ((sock = sockhash_get(id)) == NULL) 2177 return EINVAL; 2178 2179 /* Convert the request to a set of flags. */ 2180 flags = 0; 2181 if (how == SHUT_RD || how == SHUT_RDWR) 2182 flags |= SFL_SHUT_RD; 2183 if (how == SHUT_WR || how == SHUT_RDWR) 2184 flags |= SFL_SHUT_WR; 2185 2186 if (sock->sock_ops->sop_shutdown != NULL) 2187 r = sock->sock_ops->sop_shutdown(sock, flags); 2188 else 2189 r = OK; 2190 2191 /* On success, update our internal state as well. */ 2192 if (r == OK) 2193 sockevent_set_shutdown(sock, flags); 2194 2195 return r; 2196 } 2197 2198 /* 2199 * Close a socket. 2200 */ 2201 static int 2202 sockevent_close(sockid_t id, const struct sockdriver_call * call) 2203 { 2204 struct sock *sock; 2205 int r, force; 2206 2207 if ((sock = sockhash_get(id)) == NULL) 2208 return EINVAL; 2209 2210 assert(sock->sock_proc == NULL); 2211 sock->sock_select.ss_endpt = NONE; 2212 2213 /* 2214 * There are several scenarios when it comes to closing sockets. First 2215 * of all, we never actually force the socket driver to close a socket. 2216 * The driver may always suspend the close call and take as long as it 2217 * wants. After a suspension, it signals its completion of the close 2218 * through the SEV_CLOSE socket event. 2219 * 2220 * With that said, we offer two levels of urgency regarding the close 2221 * request: regular and forced. The former allows for a graceful 2222 * close; the latter urges the socket driver to close the socket as 2223 * soon as possible. A socket that has been requested to be closed 2224 * gracefully can, as long as it is still open (i.e., no SEV_CLOSE was 2225 * fired yet), later be requested to be closed forcefully. This is how 2226 * SO_LINGER with a nonzero timeout is implemented. If SO_LINGER is 2227 * set with a zero timeout, the socket is force-closed immediately. 2228 * Finally, if SO_LINGER is not set, the socket will be closed normally 2229 * and never be forced--akin to SO_LINGER with an infinite timeout. 2230 * 2231 * The return value of the caller's close(2) may only ever be either 2232 * OK or EINPROGRESS, to ensure that the caller knows that the file 2233 * descriptor is freed up, as per Austin Group Defect #529. In fact, 2234 * EINPROGRESS is to be returned only on signal interruption (i.e., 2235 * cancel). For that reason, this function only ever returns OK. 
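	 *
	 * For illustration, a user process can trigger the forced close
	 * described above by setting a zero linger timeout before closing:
	 *
	 *	struct linger l = { .l_onoff = 1, .l_linger = 0 };
	 *
	 *	setsockopt(fd, SOL_SOCKET, SO_LINGER, &l, sizeof(l));
	 *	close(fd);
	 *
	 * Even then, close(2) returns OK right away; the zero timeout merely
	 * urges the socket driver to drop the connection immediately.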
2236 */ 2237 force = ((sock->sock_opt & SO_LINGER) && sock->sock_linger == 0); 2238 2239 if (sock->sock_ops->sop_close != NULL) 2240 r = sock->sock_ops->sop_close(sock, force); 2241 else 2242 r = OK; 2243 2244 assert(r == OK || r == SUSPEND); 2245 2246 if (r == SUSPEND) { 2247 sock->sock_flags |= SFL_CLOSING; 2248 2249 /* 2250 * If we were requested to force-close the socket immediately, 2251 * but the socket driver needs more time anyway, then tell the 2252 * caller that the socket was closed right away. 2253 */ 2254 if (force) 2255 return OK; 2256 2257 /* 2258 * If we are to force-close the socket only after a specific 2259 * linger timeout, set the timer for that now, even if the call 2260 * is non-blocking. This also means that we cannot associate 2261 * the linger timeout with the close call. Instead, we convert 2262 * the sock_linger value from a (relative) duration to an 2263 * (absolute) timeout time, and use the SFL_CLOSING flag (along 2264 * with SFL_TIMER) to tell the difference. Since the socket is 2265 * otherwise unreachable from userland at this point, the 2266 * conversion is never visible in any way. 2267 * 2268 * The socket may already be in the timers list, so we must 2269 * always check the SO_LINGER flag before checking sock_linger. 2270 * 2271 * If SO_LINGER is not set, we must never suspend the call. 2272 */ 2273 if (sock->sock_opt & SO_LINGER) { 2274 sock->sock_linger = 2275 socktimer_add(sock, sock->sock_linger); 2276 } else 2277 call = NULL; 2278 2279 /* 2280 * A non-blocking close is completed asynchronously. The 2281 * caller is not told about this with EWOULDBLOCK as usual, for 2282 * the reasons mentioned above. 2283 */ 2284 if (call != NULL) 2285 sockevent_suspend(sock, SEV_CLOSE, call, NONE); 2286 else 2287 r = OK; 2288 } else if (r == OK) 2289 sockevent_free(sock); 2290 2291 return r; 2292 } 2293 2294 /* 2295 * Cancel a suspended send request. 2296 */ 2297 static void 2298 sockevent_cancel_send(struct sock * sock, struct sockevent_proc * spr, int err) 2299 { 2300 int r; 2301 2302 /* 2303 * If any regular or control data were sent, return the number of data 2304 * bytes sent--possibly zero. Otherwise return the given error code. 2305 */ 2306 if (spr->spr_dataoff > 0 || spr->spr_ctloff > 0) 2307 r = (int)spr->spr_dataoff; 2308 else 2309 r = err; 2310 2311 sockdriver_reply_generic(&spr->spr_call, r); 2312 2313 /* 2314 * In extremely rare circumstances, one send may be queued behind 2315 * another send even though the former can actually be sent on the 2316 * socket right away. For this reason, we retry sending when canceling 2317 * a send. We need to do this only when the first send in the queue 2318 * was canceled, but multiple blocked sends on a single socket should 2319 * be rare anyway. 2320 */ 2321 sockevent_raise(sock, SEV_SEND); 2322 } 2323 2324 /* 2325 * Cancel a suspended receive request. 2326 */ 2327 static void 2328 sockevent_cancel_recv(struct sock * sock, struct sockevent_proc * spr, int err) 2329 { 2330 int r; 2331 2332 /* 2333 * If any regular or control data were received, return the number of 2334 * data bytes received--possibly zero. Otherwise return the given 2335 * error code. 2336 */ 2337 if (spr->spr_dataoff > 0 || spr->spr_ctloff > 0) 2338 r = (int)spr->spr_dataoff; 2339 else 2340 r = err; 2341 2342 /* 2343 * Also return any flags set for the data received so far, e.g. 2344 * MSG_CTRUNC. 
	 * Do not return an address: receive calls on unconnected
2345 	 * sockets must never block after receiving some data--instead, they
2346 	 * are supposed to return MSG_TRUNC if not all data were copied out.
2347 	 */
2348 	sockdriver_reply_recv(&spr->spr_call, r, spr->spr_ctloff, NULL, 0,
2349 	    spr->spr_rflags);
2350 
2351 	/*
2352 	 * The same story as for sends (see above) applies to receives,
2353 	 * although this case should be even more rare in practice.
2354 	 */
2355 	sockevent_raise(sock, SEV_RECV);
2356 }
2357 
2358 /*
2359  * Cancel a previous request that may currently be suspended.  The cancel
2360  * operation itself does not have a reply.  Instead, if the given request was
2361  * found to be suspended, that request must be aborted and an appropriate reply
2362  * must be sent for the request.  If no matching request was found, no reply
2363  * must be sent at all.
2364  */
2365 static void
2366 sockevent_cancel(sockid_t id, const struct sockdriver_call * call)
2367 {
2368 	struct sockevent_proc *spr;
2369 	struct sock *sock;
2370 
2371 	/*
2372 	 * Due to asynchronous close(2) operations, not even the sock object
2373 	 * may be found.  In this (entirely legitimate) case, do not send any
2374 	 * reply.
2375 	 */
2376 	if ((sock = sockhash_get(id)) == NULL)
2377 		return;
2378 
2379 	/*
2380 	 * The request may already have completed by the time we receive the
2381 	 * cancel request, in which case we can not find it.  In this (entirely
2382 	 * legitimate) case, do not send any reply.
2383 	 */
2384 	if ((spr = sockevent_unsuspend(sock, call)) == NULL)
2385 		return;
2386 
2387 	/*
2388 	 * We found the operation.  Cancel it according to its call type.
2389 	 * Then, once fully done with it, free the suspension data structure.
2390 	 *
2391 	 * Note that we have to use the call structure from the suspension data
2392 	 * structure rather than the given 'call' pointer: only the former
2393 	 * includes all the information necessary to resume the request!
2394 	 */
2395 	switch (spr->spr_event) {
2396 	case SEV_BIND:
2397 	case SEV_CONNECT:
2398 		assert(spr->spr_call.sc_endpt != NONE);
2399 
2400 		sockdriver_reply_generic(&spr->spr_call, EINTR);
2401 
2402 		break;
2403 
2404 	case SEV_ACCEPT:
2405 		sockdriver_reply_accept(&spr->spr_call, EINTR, NULL, 0);
2406 
2407 		break;
2408 
2409 	case SEV_SEND:
2410 		sockevent_cancel_send(sock, spr, EINTR);
2411 
2412 		break;
2413 
2414 	case SEV_RECV:
2415 		sockevent_cancel_recv(sock, spr, EINTR);
2416 
2417 		break;
2418 
2419 	case SEV_CLOSE:
2420 		/*
2421 		 * Return EINPROGRESS rather than EINTR, so that the user
2422 		 * process can tell from the close(2) result that the file
2423 		 * descriptor has in fact been closed.
2424 		 */
2425 		sockdriver_reply_generic(&spr->spr_call, EINPROGRESS);
2426 
2427 		/*
2428 		 * Do not free the sock object here: the socket driver will
2429 		 * complete the close in the background, and fire SEV_CLOSE
2430 		 * once it is done.  Only then is the sock object freed.
2431 		 */
2432 		break;
2433 
2434 	default:
2435 		panic("libsockevent: process suspended on unknown event 0x%x",
2436 		    spr->spr_event);
2437 	}
2438 
2439 	sockevent_proc_free(spr);
2440 }
2441 
2442 /*
2443  * Process a select request.
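 *
 * For example, a query for (SDEV_OP_RD | SDEV_OP_WR | SDEV_NOTIFY) on a
 * socket that currently has enough data to read but no room to send, yields
 * SDEV_OP_RD as its immediate result, while SDEV_OP_WR is saved in
 * 'sock_selops' and answered later with a select notification once the
 * socket becomes ready for writing.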
2444 */ 2445 static int 2446 sockevent_select(sockid_t id, unsigned int ops, 2447 const struct sockdriver_select * sel) 2448 { 2449 struct sock *sock; 2450 unsigned int r, notify; 2451 2452 if ((sock = sockhash_get(id)) == NULL) 2453 return EINVAL; 2454 2455 notify = (ops & SDEV_NOTIFY); 2456 ops &= (SDEV_OP_RD | SDEV_OP_WR | SDEV_OP_ERR); 2457 2458 /* 2459 * See if any of the requested select operations can be satisfied 2460 * immediately. 2461 */ 2462 r = sockevent_test_select(sock, ops); 2463 2464 /* 2465 * If select operations were pending, the new results must not indicate 2466 * that any of those were satisfied, as that would indicate an internal 2467 * logic error: the socket driver is supposed to update its state 2468 * proactively, and thus, discovering that things have changed here is 2469 * not something that should ever happen. 2470 */ 2471 assert(!(sock->sock_selops & r)); 2472 2473 /* 2474 * If any select operations are not satisfied immediately, and we are 2475 * asked to notify the caller when they are satisfied later, save them 2476 * for later retesting. 2477 */ 2478 ops &= ~r; 2479 2480 if (notify && ops != 0) { 2481 /* 2482 * For now, we support only one caller when it comes to select 2483 * queries: VFS. If we want to support a networked file system 2484 * (or so) directly calling select as well, this library will 2485 * have to be extended accordingly (should not be too hard). 2486 */ 2487 if (sock->sock_select.ss_endpt != NONE) { 2488 if (sock->sock_select.ss_endpt != sel->ss_endpt) { 2489 printf("libsockevent: no support for multiple " 2490 "select callers yet\n"); 2491 2492 return EIO; 2493 } 2494 2495 /* 2496 * If a select query was already pending for this 2497 * caller, we must simply merge in the new operations. 2498 */ 2499 sock->sock_selops |= ops; 2500 } else { 2501 assert(sel->ss_endpt != NONE); 2502 2503 sock->sock_select = *sel; 2504 sock->sock_selops = ops; 2505 } 2506 } 2507 2508 return r; 2509 } 2510 2511 /* 2512 * An alarm has triggered. Expire any timers. Socket drivers that do not pass 2513 * clock notification messages to libsockevent must call expire_timers(3) 2514 * themselves instead. 2515 */ 2516 static void 2517 sockevent_alarm(clock_t now) 2518 { 2519 2520 expire_timers(now); 2521 } 2522 2523 static const struct sockdriver sockevent_tab = { 2524 .sdr_socket = sockevent_socket, 2525 .sdr_socketpair = sockevent_socketpair, 2526 .sdr_bind = sockevent_bind, 2527 .sdr_connect = sockevent_connect, 2528 .sdr_listen = sockevent_listen, 2529 .sdr_accept = sockevent_accept, 2530 .sdr_send = sockevent_send, 2531 .sdr_recv = sockevent_recv, 2532 .sdr_ioctl = sockevent_ioctl, 2533 .sdr_setsockopt = sockevent_setsockopt, 2534 .sdr_getsockopt = sockevent_getsockopt, 2535 .sdr_getsockname = sockevent_getsockname, 2536 .sdr_getpeername = sockevent_getpeername, 2537 .sdr_shutdown = sockevent_shutdown, 2538 .sdr_close = sockevent_close, 2539 .sdr_cancel = sockevent_cancel, 2540 .sdr_select = sockevent_select, 2541 .sdr_alarm = sockevent_alarm 2542 }; 2543 2544 /* 2545 * Initialize the socket event library. 2546 */ 2547 void 2548 sockevent_init(sockevent_socket_cb_t socket_cb) 2549 { 2550 2551 sockhash_init(); 2552 2553 socktimer_init(); 2554 2555 sockevent_proc_init(); 2556 2557 SIMPLEQ_INIT(&sockevent_pending); 2558 2559 assert(socket_cb != NULL); 2560 sockevent_socket_cb = socket_cb; 2561 2562 /* Announce we are up. */ 2563 sockdriver_announce(); 2564 2565 sockevent_working = FALSE; 2566 } 2567 2568 /* 2569 * Process a socket driver request message. 
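 *
 * For illustration, a minimal socket driver main loop built on this library
 * could look as follows (hypothetical driver code; SEF startup details, in
 * which sockevent_init() is normally called, are omitted here):
 *
 *	int
 *	main(void)
 *	{
 *		message m;
 *		int r, ipc_status;
 *
 *		sockevent_init(mydrv_socket);
 *
 *		for (;;) {
 *			if ((r = sef_receive_status(ANY, &m,
 *			    &ipc_status)) != OK)
 *				panic("sef_receive_status failed: %d", r);
 *
 *			sockevent_process(&m, ipc_status);
 *		}
 *	}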
2570 */ 2571 void 2572 sockevent_process(const message * m_ptr, int ipc_status) 2573 { 2574 2575 /* Block events until after we have processed the request. */ 2576 assert(!sockevent_working); 2577 sockevent_working = TRUE; 2578 2579 /* Actually process the request. */ 2580 sockdriver_process(&sockevent_tab, m_ptr, ipc_status); 2581 2582 /* 2583 * If any events were fired while processing the request, they will 2584 * have been queued for later. Go through them now. 2585 */ 2586 if (sockevent_has_events()) 2587 sockevent_pump(); 2588 2589 sockevent_working = FALSE; 2590 } 2591
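/*
 * As a closing illustration (hypothetical driver code, not part of this
 * library): the socket creation callback registered through sockevent_init()
 * is expected to check the domain-type-protocol combination, allocate a
 * driver-level socket object with an embedded sock object, and return the
 * new socket's identifier along with that object and its operations table.
 * A sketch, in which all mydrv_ names are made up:
 *
 *	static sockid_t
 *	mydrv_socket(int domain, int type, int protocol,
 *		endpoint_t user_endpt, struct sock ** sockp,
 *		const struct sockevent_ops ** opsp)
 *	{
 *		struct mydrv_sock *ms;
 *
 *		if (domain != PF_UNIX)
 *			return EAFNOSUPPORT;
 *		if (type != SOCK_DGRAM)
 *			return EPROTOTYPE;
 *
 *		if ((ms = mydrv_get_free_sock()) == NULL)
 *			return ENOBUFS;
 *
 *		*sockp = &ms->ms_sock;
 *		*opsp = &mydrv_ops;
 *		return mydrv_get_id(ms);
 *	}
 *
 * With the system-side errno definitions, the error constants above are
 * negative values, which is how this library tells failures apart from
 * returned socket identifiers.
 */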