1 /* UNIX Domain Sockets - io.c - sending and receiving */ 2 3 #include "uds.h" 4 #include <sys/mman.h> 5 6 /* 7 * Our UDS sockets do not have a send buffer. They only have a receive buffer. 8 * This receive buffer, when not empty, is split up in segments. Each segment 9 * may contain regular data, ancillary data, both, or (for SOCK_SEQPACKET and 10 * (SOCK_DGRAM) neither. There are two types of ancillary data: in-flight file 11 * descriptors and sender credentials. In addition, for SOCK_DGRAM sockets, 12 * the segment may contain the sender's socket path (if the sender's socket is 13 * bound). Each segment has a header, containing the full segment size, the 14 * size of the actual data in the segment (if any), and a flags field that 15 * states which ancillary are associated with the segment (if any). For 16 * SOCK_STREAM type sockets, new data may be merged into a previous segment, 17 * but only if it has no ancillary data. For the other two socket types, each 18 * packet has its own header. The resulting behavior should be in line with 19 * the POSIX "Socket Receive Queue" specification. 20 * 21 * More specifically, each segment consists of the following parts: 22 * - always a five-byte header, containing a two-byte segment length (including 23 * the header, so always non-zero), a two-byte regular data length (zero or 24 * more), and a one-byte flags field which is a bitwise combination of 25 * UDS_HAS_{FD,CRED,PATH} flags; 26 * - next, if UDS_HAS_CRED is set in the segment header: a sockcred structure; 27 * since this structure is variable-size, the structure is prepended by a 28 * single byte that contains the length of the structure (excluding the byte 29 * itself, thus ranging from sizeof(struct sockcred) to UDS_MAXCREDLEN); 30 * - next, if UDS_HAS_PATH is set in the segment header: 31 * - next, if the data length is non-zero, the actual regular data. 32 * If the segment is not the last in the receive buffer, it is followed by the 33 * next segment immediately afterward. There is no alignment. 34 * 35 * It is the sender's responsibility to merge new data into the last segment 36 * whenever possible, so that the receiver side never needs to consider more 37 * than one segment at once. In order to allow such merging, each receive 38 * buffer has not only a tail and in-use length (pointing to the head when 39 * combined) but also an offset from the tail to the last header, if any. Note 40 * that the receiver may over time still look at multiple segments for a single 41 * request: this happens when a MSG_WAITALL request empties the buffer and then 42 * blocks - the next piece of arriving data can then obviously not be merged. 43 * 44 * If a segment has the UDS_HAS_FD flag set, then one or more in-flight file 45 * descriptors are associated with the segment. These are stored in a separate 46 * data structure, mainly to simplify cleaning up when the socket is shut down 47 * for reading or closed. That structure also contains the number of file 48 * descriptors associated with the current segment, so this is not stored in 49 * the segment itself. As mentioned later, this may be changed in the future. 50 * 51 * On the sender side, there is a trade-off between fully utilizing the receive 52 * buffer, and not repeatedly performing expensive actions for the same call: 53 * it may be costly to determine exactly how many in-flight file descriptors 54 * there will be (if any) and/or how much space is needed to store credentials. 55 * We currently use the policy that we rather block/reject a send request that 56 * may (just) have fit in the remaining part of the receive buffer, than obtain 57 * the same information multiple times or keep state between callbacks. In 58 * practice this is not expected to make a difference, especially since 59 * transfer of ancillary data should be rare anyway. 60 */ 61 /* 62 * The current layout of the segment header is as follows. 63 * 64 * The first byte contains the upper eight bits of the total segment length. 65 * The second byte contains the lower eight bits of the total segment length. 66 * The third byte contains the upper eight bits of the data length. 67 * The fourth byte contains the lower eight bits of the data length. 68 * The fifth byte is a bitmask for ancillary data associated with the segment. 69 */ 70 #define UDS_HDRLEN 5 71 72 #define UDS_HAS_FDS 0x01 /* segment has in-flight file descriptors */ 73 #define UDS_HAS_CRED 0x02 /* segment has sender credentials */ 74 #define UDS_HAS_PATH 0x04 /* segment has source socket path */ 75 76 #define UDS_MAXCREDLEN SOCKCREDSIZE(NGROUPS_MAX) 77 78 #define uds_get_head(uds) \ 79 ((size_t)((uds)->uds_tail + (uds)->uds_len) % UDS_BUF) 80 #define uds_get_last(uds) \ 81 ((size_t)((uds)->uds_tail + (uds)->uds_last) % UDS_BUF) 82 #define uds_advance(pos,add) (((pos) + (add)) % UDS_BUF) 83 84 /* 85 * All in-flight file descriptors are (co-)owned by the UDS driver itself, as 86 * local open file descriptors. Like any other process, the UDS driver can not 87 * have more than OPEN_MAX open file descriptors at any time. Thus, this is 88 * also the inherent maximum number of in-flight file descriptors. Therefore, 89 * we maintain a single pool of in-flight FD structures, and we associate these 90 * structures with sockets as needed. 91 */ 92 static struct uds_fd uds_fds[OPEN_MAX]; 93 static SIMPLEQ_HEAD(uds_freefds, uds_fd) uds_freefds; 94 95 static char uds_ctlbuf[UDS_CTL_MAX]; 96 static int uds_ctlfds[UDS_CTL_MAX / sizeof(int)]; 97 98 /* 99 * Initialize the input/output part of the UDS service. 100 */ 101 void 102 uds_io_init(void) 103 { 104 unsigned int slot; 105 106 SIMPLEQ_INIT(&uds_freefds); 107 108 for (slot = 0; slot < __arraycount(uds_fds); slot++) 109 SIMPLEQ_INSERT_TAIL(&uds_freefds, &uds_fds[slot], ufd_next); 110 } 111 112 /* 113 * Set up all input/output state for the given socket, which has just been 114 * allocated. As part of this, allocate memory for the receive buffer of the 115 * socket. Return OK or a negative error code. 116 */ 117 int 118 uds_io_setup(struct udssock * uds) 119 { 120 121 /* TODO: decide if we should preallocate the memory. */ 122 if ((uds->uds_buf = mmap(NULL, UDS_BUF, PROT_READ | PROT_WRITE, 123 MAP_ANON | MAP_PRIVATE, -1, 0)) == MAP_FAILED) 124 return ENOMEM; 125 126 uds->uds_tail = 0; 127 uds->uds_len = 0; 128 uds->uds_last = 0; 129 130 SIMPLEQ_INIT(&uds->uds_fds); 131 132 return OK; 133 } 134 135 /* 136 * Clean up the input/output state for the given socket, which is about to be 137 * freed. As part of this, deallocate memory for the receive buffer and close 138 * any file descriptors still in flight on the socket. 139 */ 140 void 141 uds_io_cleanup(struct udssock * uds) 142 { 143 144 /* Close any in-flight file descriptors. */ 145 uds_io_reset(uds); 146 147 /* Free the receive buffer memory. */ 148 if (munmap(uds->uds_buf, UDS_BUF) != 0) 149 panic("UDS: munmap failed: %d", errno); 150 } 151 152 /* 153 * The socket is being closed or shut down for reading. If there are still any 154 * in-flight file descriptors, theey will never be received anymore, so close 155 * them now. 156 */ 157 void 158 uds_io_reset(struct udssock * uds) 159 { 160 struct uds_fd *ufd; 161 162 /* 163 * The UDS service may have the last and only reference to any of these 164 * file descriptors here. For that reason, we currently disallow 165 * transfer of UDS file descriptors, because the close(2) here could 166 * block on a socket close operation back to us, leading to a deadlock. 167 * Also, we use a non-blocking variant of close(2), to prevent that we 168 * end up hanging on sockets with SO_LINGER turned on. 169 */ 170 SIMPLEQ_FOREACH(ufd, &uds->uds_fds, ufd_next) { 171 dprintf(("UDS: closing local fd %d\n", ufd->ufd_fd)); 172 173 closenb(ufd->ufd_fd); 174 } 175 176 SIMPLEQ_CONCAT(&uds_freefds, &uds->uds_fds); 177 178 /* 179 * If this reset happens as part of a shutdown, it might be done 180 * again on close, so ensure that it will find a clean state. The 181 * receive buffer should never be looked at again either way, but reset 182 * it too just to be sure. 183 */ 184 uds->uds_tail = 0; 185 uds->uds_len = 0; 186 uds->uds_last = 0; 187 188 SIMPLEQ_INIT(&uds->uds_fds); 189 } 190 191 /* 192 * Return the maximum usable part of the receive buffer, in bytes. The return 193 * value is used for the SO_SNDBUF and SO_RCVBUF socket options. 194 */ 195 size_t 196 uds_io_buflen(void) 197 { 198 199 /* 200 * TODO: it would be nicer if at least for SOCK_STREAM-type sockets, we 201 * could use the full receive buffer for data. This would require that 202 * we store up to one header in the socket object rather than in the 203 * receive buffer. 204 */ 205 return UDS_BUF - UDS_HDRLEN; 206 } 207 208 /* 209 * Fetch 'len' bytes starting from absolute position 'pos' into the receive 210 * buffer of socket 'uds', and copy them into the buffer pointed to by 'ptr'. 211 * Return the absolute position of the first byte after the fetched data in the 212 * receive buffer. 213 */ 214 static size_t 215 uds_fetch(struct udssock * uds, size_t off, void * ptr, size_t len) 216 { 217 size_t left; 218 219 assert(off < UDS_BUF); 220 221 left = UDS_BUF - off; 222 if (len >= left) { 223 memcpy(ptr, &uds->uds_buf[off], left); 224 225 if ((len -= left) > 0) 226 memcpy((char *)ptr + left, &uds->uds_buf[0], len); 227 228 return len; 229 } else { 230 memcpy(ptr, &uds->uds_buf[off], len); 231 232 return off + len; 233 } 234 } 235 236 /* 237 * Store 'len' bytes from the buffer pointed to by 'ptr' into the receive 238 * buffer of socket 'uds', starting at absolute position 'pos' into the receive 239 * buffer. Return the absolute position of the first byte after the stored 240 * data in the receive buffer. 241 */ 242 static size_t 243 uds_store(struct udssock * uds, size_t off, const void * ptr, size_t len) 244 { 245 size_t left; 246 247 assert(off < UDS_BUF); 248 249 left = UDS_BUF - off; 250 if (len >= left) { 251 memcpy(&uds->uds_buf[off], ptr, left); 252 253 if ((len -= left) > 0) 254 memcpy(&uds->uds_buf[0], (const char *)ptr + left, 255 len); 256 257 return len; 258 } else { 259 memcpy(&uds->uds_buf[off], ptr, len); 260 261 return off + len; 262 } 263 } 264 265 /* 266 * Fetch a segment header previously stored in the receive buffer of socket 267 * 'uds' at absolute position 'off'. Return the absolute position of the first 268 * byte after the header, as well as the entire segment length in 'seglen', the 269 * length of the data in the segment in 'datalen', and the segment flags in 270 * 'segflags'. 271 */ 272 static size_t 273 uds_fetch_hdr(struct udssock * uds, size_t off, size_t * seglen, 274 size_t * datalen, unsigned int * segflags) 275 { 276 unsigned char hdr[UDS_HDRLEN]; 277 278 off = uds_fetch(uds, off, hdr, sizeof(hdr)); 279 280 *seglen = ((size_t)hdr[0] << 8) | (size_t)hdr[1]; 281 *datalen = ((size_t)hdr[2] << 8) | (size_t)hdr[3]; 282 *segflags = hdr[4]; 283 284 assert(*seglen >= UDS_HDRLEN); 285 assert(*seglen <= uds->uds_len); 286 assert(*datalen <= *seglen - UDS_HDRLEN); 287 assert(*segflags != 0 || *datalen == *seglen - UDS_HDRLEN); 288 assert(!(*segflags & ~(UDS_HAS_FDS | UDS_HAS_CRED | UDS_HAS_PATH))); 289 290 return off; 291 } 292 293 /* 294 * Store a segment header in the receive buffer of socket 'uds' at absolute 295 * position 'off', with the segment length 'seglen', the segment data length 296 * 'datalen', and the segment flags 'segflags'. Return the absolute receive 297 * buffer position of the first data byte after the stored header. 298 */ 299 static size_t 300 uds_store_hdr(struct udssock * uds, size_t off, size_t seglen, size_t datalen, 301 unsigned int segflags) 302 { 303 unsigned char hdr[UDS_HDRLEN]; 304 305 assert(seglen <= USHRT_MAX); 306 assert(datalen <= seglen); 307 assert(segflags <= UCHAR_MAX); 308 assert(!(segflags & ~(UDS_HAS_FDS | UDS_HAS_CRED | UDS_HAS_PATH))); 309 310 hdr[0] = (seglen >> 8) & 0xff; 311 hdr[1] = seglen & 0xff; 312 hdr[2] = (datalen >> 8) & 0xff; 313 hdr[3] = datalen & 0xff; 314 hdr[4] = segflags; 315 316 return uds_store(uds, off, hdr, sizeof(hdr)); 317 } 318 319 /* 320 * Perform initial checks on a send request, before it may potentially be 321 * suspended. Return OK if this send request is valid, or a negative error 322 * code if it is not. 323 */ 324 int 325 uds_pre_send(struct sock * sock, size_t len, socklen_t ctl_len __unused, 326 const struct sockaddr * addr, socklen_t addr_len __unused, 327 endpoint_t user_endpt __unused, int flags) 328 { 329 struct udssock *uds = (struct udssock *)sock; 330 size_t pathlen; 331 332 /* 333 * Reject calls with unknown flags. Besides the flags handled entirely 334 * by libsockevent (which are not part of 'flags' here), that is all of 335 * them. TODO: ensure that we should really reject all other flags 336 * rather than ignore them. 337 */ 338 if (flags != 0) 339 return EOPNOTSUPP; 340 341 /* 342 * Perform very basic address and message size checks on the send call. 343 * For non-stream sockets, we must reject packets that may never fit in 344 * the receive buffer, or otherwise (at least for SOCK_SEQPACKET) the 345 * send call may end up being suspended indefinitely. Therefore, we 346 * assume the worst-case scenario, which is that a full set of 347 * credentials must be associated with the packet. As a result, we may 348 * reject some large packets that could actually just fit. Checking 349 * the peer's LOCAL_CREDS setting here is not safe: even if we know the 350 * peer already at all (for SOCK_DGRAM we do not), the send may still 351 * block and the option toggled before it unblocks. 352 */ 353 switch (uds_get_type(uds)) { 354 case SOCK_STREAM: 355 /* Nothing to check for this case. */ 356 break; 357 358 case SOCK_SEQPACKET: 359 if (len > UDS_BUF - UDS_HDRLEN - 1 - UDS_MAXCREDLEN) 360 return EMSGSIZE; 361 362 break; 363 364 case SOCK_DGRAM: 365 if (!uds_has_link(uds) && addr == NULL) 366 return EDESTADDRREQ; 367 368 /* 369 * The path is stored without null terminator, but with leading 370 * byte containing the path length--if there is a path at all. 371 */ 372 pathlen = (size_t)uds->uds_pathlen; 373 if (pathlen > 0) 374 pathlen++; 375 376 if (len > UDS_BUF - UDS_HDRLEN - pathlen - 1 - UDS_MAXCREDLEN) 377 return EMSGSIZE; 378 379 break; 380 381 default: 382 assert(0); 383 } 384 385 return OK; 386 } 387 388 /* 389 * Determine whether the (real or pretend) send request should be processed 390 * now, suspended until later, or rejected based on the current socket state. 391 * Return OK if the send request should be processed now. Return SUSPEND if 392 * the send request should be retried later. Return an appropriate negative 393 * error code if the send request should fail. 394 */ 395 static int 396 uds_send_test(struct udssock * uds, size_t len, socklen_t ctl_len, size_t min, 397 int partial) 398 { 399 struct udssock *conn; 400 size_t avail, hdrlen, credlen; 401 402 assert(!uds_is_shutdown(uds, SFL_SHUT_WR)); 403 404 if (uds_get_type(uds) != SOCK_DGRAM) { 405 if (uds_is_connecting(uds)) 406 return SUSPEND; 407 if (!uds_is_connected(uds) && !uds_is_disconnected(uds)) 408 return ENOTCONN; 409 if (!uds_has_conn(uds)) 410 return EPIPE; 411 412 conn = uds->uds_conn; 413 414 if (uds_is_shutdown(conn, SFL_SHUT_RD)) 415 return EPIPE; 416 417 /* 418 * For connection-type sockets, we now have to check if there 419 * is enough room in the receive buffer. For SOCK_STREAM 420 * sockets, we must check if at least 'min' bytes can be moved 421 * into the receive buffer, at least if that is a reasonable 422 * value for ever making any forward progress at all. For 423 * SOCK_SEQPACKET sockets, we must check if the entire packet 424 * of size 'len' can be stored in the receive buffer. In both 425 * cases, we must take into account any metadata to store along 426 * with the data. 427 * 428 * Unlike in uds_pre_send(), we can now check safely whether 429 * the peer is expecting credentials, but we still don't know 430 * the actual size of the credentials, so again we take the 431 * maximum possible size. The same applies to file descriptors 432 * transferred via control data: all we have the control length 433 * right now, which if non-zero we assume to mean there might 434 * be file descriptors. 435 * 436 * In both cases, the reason of overestimating is that actually 437 * getting accurate sizes, by obtaining credentials or copying 438 * in control data, is very costly. We want to do that only 439 * when we are sure we will not suspend the send call after 440 * all. It is no problem to overestimate how much space will 441 * be needed here, but not to underestimate: that could cause 442 * applications that use select(2) and non-blocking sockets to 443 * end up in a busy-wait loop. 444 */ 445 if (!partial && (conn->uds_flags & UDSF_PASSCRED)) 446 credlen = 1 + UDS_MAXCREDLEN; 447 else 448 credlen = 0; 449 450 avail = UDS_BUF - conn->uds_len; 451 452 if (uds_get_type(uds) == SOCK_STREAM) { 453 /* 454 * Limit the low threshold to the maximum that can ever 455 * be sent at once. 456 */ 457 if (min > UDS_BUF - UDS_HDRLEN - credlen) 458 min = UDS_BUF - UDS_HDRLEN - credlen; 459 460 /* 461 * Suspend the call only if not even the low threshold 462 * is met. Otherwise we may make (partial) progress. 463 */ 464 if (len > min) 465 len = min; 466 467 /* 468 * If the receive buffer already has at least one 469 * segment, and there are certainly no file descriptors 470 * to transfer now, and we do not have to store 471 * credentials either, then this segment can be merged 472 * with the previous one. In that case, we need no 473 * space for a header. That is certainly the case if 474 * we are resuming an already partially completed send. 475 */ 476 hdrlen = (avail == UDS_BUF || ctl_len != 0 || 477 credlen > 0) ? UDS_HDRLEN : 0; 478 } else 479 hdrlen = UDS_HDRLEN; 480 481 if (avail < hdrlen + credlen + len) 482 return SUSPEND; 483 } 484 485 return OK; 486 } 487 488 /* 489 * Get the destination peer for a send request. The send test has already been 490 * performed first. On success, return OK, with a pointer to the peer socket 491 * stored in 'peerp'. On failure, return an appropriate error code. 492 */ 493 static int 494 uds_send_peer(struct udssock * uds, const struct sockaddr * addr, 495 socklen_t addr_len, endpoint_t user_endpt, struct udssock ** peerp) 496 { 497 struct udssock *peer; 498 int r; 499 500 if (uds_get_type(uds) == SOCK_DGRAM) { 501 if (!uds_has_link(uds)) { 502 /* This was already checked in uds_pre_check(). */ 503 assert(addr != NULL); 504 505 /* 506 * Find the socket identified by the given address. 507 * If it exists at all, see if it is a proper match. 508 */ 509 if ((r = uds_lookup(uds, addr, addr_len, user_endpt, 510 &peer)) != OK) 511 return r; 512 513 /* 514 * If the peer socket is connected to a target, it 515 * must be this socket. Unfortunately, POSIX does not 516 * specify an error code for this. We borrow Linux's. 517 */ 518 if (uds_has_link(peer) && peer->uds_link != uds) 519 return EPERM; 520 } else 521 peer = uds->uds_link; 522 523 /* 524 * If the receiving end will never receive this packet, we 525 * might as well not send it, so drop it immeiately. Indicate 526 * as such to the caller, using NetBSD's chosen error code. 527 */ 528 if (uds_is_shutdown(peer, SFL_SHUT_RD)) 529 return ENOBUFS; 530 } else { 531 assert(uds_has_conn(uds)); 532 533 peer = uds->uds_conn; 534 } 535 536 *peerp = peer; 537 return OK; 538 } 539 540 /* 541 * Generate a new segment for the current send request, or arrange things such 542 * that new data can be merged with a previous segment. As part of this, 543 * decide whether we can merge data at all. The segment will be merged if, and 544 * only if, all of the following requirements are met: 545 * 546 * 1) the socket is of type SOCK_STREAM; 547 * 2) there is a previous segment in the receive buffer; 548 * 3) there is no ancillary data for the current send request. 549 * 550 * Also copy in regular data (if any), retrieve the sender's credentials (if 551 * needed), and copy over the source path (if applicable). However, do not yet 552 * commit the segment (or the new part to be merged), because the send request 553 * may still fail for other reasons. 554 * 555 * On success, return the length of the new segment (or, when merging, the 556 * length to be added to the last segment), as well as a flag indicating 557 * whether we are merging into the last segment in 'mergep', the length of the 558 * (new) data in the segment in 'datalenp', and the new segment's flags in 559 * 'segflagsp' (always zero when merging). Note that a return value of zero 560 * implies that we are merging zero extra bytes into the last segment, which 561 * means that effectively nothing changes; in that case the send call will be 562 * cut short and return zero to the caller as well. On failure, return a 563 * negative error code. 564 */ 565 static int 566 uds_send_data(struct udssock * uds, struct udssock * peer, 567 const struct sockdriver_data * data, size_t len, size_t off, 568 endpoint_t user_endpt, unsigned int nfds, int * __restrict mergep, 569 size_t * __restrict datalenp, unsigned int * __restrict segflagsp) 570 { 571 struct sockcred sockcred; 572 gid_t groups[NGROUPS_MAX]; 573 iovec_t iov[2]; 574 unsigned int iovcnt, segflags; 575 unsigned char lenbyte; 576 size_t credlen, pathlen, datalen, seglen; 577 size_t avail, pos, left; 578 int r, merge; 579 580 /* 581 * At this point we should add the data to the peer's receive buffer. 582 * In the case of SOCK_STREAM sockets, we should add as much of the 583 * data as possible and suspend the call to send the rest later, if 584 * applicable. In the case of SOCK_DGRAM sockets, we should drop the 585 * packet if it does not fit in the buffer. 586 * 587 * Due to the checks in uds_can_send(), we know for sure that we no 588 * longer have to suspend without making any progress at this point. 589 */ 590 segflags = (nfds > 0) ? UDS_HAS_FDS : 0; 591 592 /* 593 * Obtain the credentials now. Doing so allows us to determine how 594 * much space we actually need for them. 595 */ 596 if (off == 0 && (peer->uds_flags & UDSF_PASSCRED)) { 597 memset(&sockcred, 0, sizeof(sockcred)); 598 599 if ((r = getsockcred(user_endpt, &sockcred, groups, 600 __arraycount(groups))) != OK) 601 return r; 602 603 /* 604 * getsockcred(3) returns the total number of groups for the 605 * process, which may exceed the size of the given array. Our 606 * groups array should always be large enough for all groups, 607 * but we check to be sure anyway. 608 */ 609 assert(sockcred.sc_ngroups <= (int)__arraycount(groups)); 610 611 credlen = 1 + SOCKCREDSIZE(sockcred.sc_ngroups); 612 613 segflags |= UDS_HAS_CRED; 614 } else 615 credlen = 0; 616 617 /* For bound source datagram sockets, include the source path. */ 618 if (uds_get_type(uds) == SOCK_DGRAM && uds->uds_pathlen != 0) { 619 pathlen = (size_t)uds->uds_pathlen + 1; 620 621 segflags |= UDS_HAS_PATH; 622 } else 623 pathlen = 0; 624 625 avail = UDS_BUF - peer->uds_len; 626 627 if (uds_get_type(uds) == SOCK_STREAM) { 628 /* 629 * Determine whether we can merge data into the previous 630 * segment. This is a more refined version of the test in 631 * uds_can_send(), as we now know whether there are actually 632 * any FDs to transfer. 633 */ 634 merge = (peer->uds_len != 0 && nfds == 0 && credlen == 0); 635 636 /* Determine how much we can send at once. */ 637 if (!merge) { 638 assert(avail > UDS_HDRLEN + credlen); 639 datalen = avail - UDS_HDRLEN - credlen; 640 } else 641 datalen = avail; 642 643 if (datalen > len) 644 datalen = len; 645 646 /* If we cannot make progress, we should have suspended.. */ 647 assert(datalen != 0 || len == 0); 648 } else { 649 merge = FALSE; 650 651 datalen = len; 652 } 653 assert(datalen <= len); 654 assert(datalen <= UDS_BUF); 655 656 /* 657 * Compute the total amount of space we need for the segment in the 658 * receive buffer. Given that we have done will-it-fit tests in 659 * uds_can_send() for SOCK_STREAM and SOCK_SEQPACKET, there is only one 660 * case left where the result may not fit, and that is for SOCK_DGRAM 661 * packets. In that case, we drop the packet. POSIX says we should 662 * throw an error in that case, and that is also what NetBSD does. 663 */ 664 if (!merge) 665 seglen = UDS_HDRLEN + credlen + pathlen + datalen; 666 else 667 seglen = datalen; 668 669 if (seglen > avail) { 670 assert(uds_get_type(uds) == SOCK_DGRAM); 671 672 /* Drop the packet, borrowing NetBSD's chosen error code. */ 673 return ENOBUFS; 674 } 675 676 /* 677 * Generate the full segment, but do not yet update the buffer head. 678 * We may still run into an error (copying in file descriptors) or even 679 * decide that nothing gets sent after all (if there are no data or 680 * file descriptors). If we are merging the new data into the previous 681 * segment, do not generate a header. 682 */ 683 pos = uds_get_head(peer); 684 685 /* Generate the header, if needed. */ 686 if (!merge) 687 pos = uds_store_hdr(peer, pos, seglen, datalen, segflags); 688 else 689 assert(segflags == 0); 690 691 /* Copy in and store the sender's credentials, if desired. */ 692 if (credlen > 0) { 693 assert(credlen >= 1 + sizeof(sockcred)); 694 assert(credlen <= UCHAR_MAX); 695 696 lenbyte = credlen - 1; 697 pos = uds_store(peer, pos, &lenbyte, 1); 698 699 if (sockcred.sc_ngroups > 0) { 700 pos = uds_store(peer, pos, &sockcred, 701 offsetof(struct sockcred, sc_groups)); 702 pos = uds_store(peer, pos, groups, 703 sockcred.sc_ngroups * sizeof(gid_t)); 704 } else 705 pos = uds_store(peer, pos, &sockcred, 706 sizeof(sockcred)); 707 } 708 709 /* Store the sender's address if any. Datagram sockets only. */ 710 if (pathlen > 0) { 711 assert(pathlen > 1); 712 assert(pathlen <= UCHAR_MAX); 713 714 lenbyte = uds->uds_pathlen; 715 pos = uds_store(peer, pos, &lenbyte, 1); 716 pos = uds_store(peer, pos, uds->uds_path, pathlen - 1); 717 } 718 719 /* Lastly, copy in the actual data (if any) from the caller. */ 720 if (datalen > 0) { 721 iov[0].iov_addr = (vir_bytes)&peer->uds_buf[pos]; 722 left = UDS_BUF - pos; 723 724 if (left < datalen) { 725 assert(left > 0); 726 iov[0].iov_size = left; 727 iov[1].iov_addr = (vir_bytes)&peer->uds_buf[0]; 728 iov[1].iov_size = datalen - left; 729 iovcnt = 2; 730 } else { 731 iov[0].iov_size = datalen; 732 iovcnt = 1; 733 } 734 735 if ((r = sockdriver_vcopyin(data, off, iov, iovcnt)) != OK) 736 return r; 737 } 738 739 *mergep = merge; 740 *datalenp = datalen; 741 *segflagsp = segflags; 742 return seglen; 743 } 744 745 /* 746 * Copy in control data for the current send request, and extract any file 747 * descriptors to be transferred. Do not yet duplicate the file descriptors, 748 * but rather store a list in a temporary buffer: the send request may still 749 * fail in which case we want to avoid having to undo the duplication. 750 * 751 * On success, return the number of (zero or more) file descriptors extracted 752 * from the request and stored in the temporary buffer. On failure, return a 753 * negative error code. 754 */ 755 static int 756 uds_send_ctl(const struct sockdriver_data * ctl, socklen_t ctl_len, 757 endpoint_t user_endpt) 758 { 759 struct msghdr msghdr; 760 struct cmsghdr *cmsg; 761 socklen_t left; 762 unsigned int i, n, nfds; 763 int r; 764 765 /* 766 * Copy in the control data. We can spend a lot of effort copying in 767 * the data in small chunks, and change the receiving side to do the 768 * same, but it is really not worth it: applications never send a whole 769 * lot of file descriptors at once, and the buffer size is currently 770 * such that the UDS service itself will exhaust its OPEN_MAX limit 771 * anyway if they do. 772 */ 773 if (ctl_len > sizeof(uds_ctlbuf)) 774 return ENOBUFS; 775 776 if ((r = sockdriver_copyin(ctl, 0, uds_ctlbuf, ctl_len)) != OK) 777 return r; 778 779 if (ctl_len < sizeof(uds_ctlbuf)) 780 memset(&uds_ctlbuf[ctl_len], 0, sizeof(uds_ctlbuf) - ctl_len); 781 782 /* 783 * Look for any file descriptors, and store their remote file 784 * descriptor numbers into a temporary array. 785 */ 786 memset(&msghdr, 0, sizeof(msghdr)); 787 msghdr.msg_control = uds_ctlbuf; 788 msghdr.msg_controllen = ctl_len; 789 790 nfds = 0; 791 r = OK; 792 793 /* 794 * The sender may provide file descriptors in multiple chunks. 795 * Currently we do not preserve these chunk boundaries, instead 796 * generating one single chunk with all file descriptors for the 797 * segment upon receipt. If needed, we can fairly easily adapt this 798 * later. 799 */ 800 for (cmsg = CMSG_FIRSTHDR(&msghdr); cmsg != NULL; 801 cmsg = CMSG_NXTHDR(&msghdr, cmsg)) { 802 /* 803 * Check for bogus lengths. There is no excuse for this; 804 * either the caller does not know what they are doing or we 805 * are looking at a hacking attempt. 806 */ 807 assert((socklen_t)((char *)cmsg - uds_ctlbuf) <= ctl_len); 808 left = ctl_len - (socklen_t)((char *)cmsg - uds_ctlbuf); 809 assert(left >= CMSG_LEN(0)); /* guaranteed by CMSG_xxHDR */ 810 811 if (cmsg->cmsg_len < CMSG_LEN(0) || cmsg->cmsg_len > left) { 812 printf("UDS: malformed control data from %u\n", 813 user_endpt); 814 r = EINVAL; 815 break; 816 } 817 818 if (cmsg->cmsg_level != SOL_SOCKET || 819 cmsg->cmsg_type != SCM_RIGHTS) 820 continue; 821 822 n = (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int); 823 824 for (i = 0; i < n; i++) { 825 /* 826 * Copy the file descriptor to the temporary buffer, 827 * whose size is based on the control data buffer, so 828 * it is always large enough to contain all FDs. 829 */ 830 assert(nfds < __arraycount(uds_ctlfds)); 831 832 memcpy(&uds_ctlfds[nfds], 833 &((int *)CMSG_DATA(cmsg))[i], sizeof(int)); 834 835 nfds++; 836 } 837 } 838 839 return nfds; 840 } 841 842 /* 843 * Actually duplicate any file descriptors that we extracted from the sender's 844 * control data and stored in our temporary buffer. On success, return OK, 845 * with all file descriptors stored in file descriptor objects that are 846 * appended to the socket's list of in-flight FD objects. Thus, on success, 847 * the send request may no longer fail. On failure, return a negative error 848 * code, with any partial duplication undone. 849 */ 850 static int 851 uds_send_fds(struct udssock * peer, unsigned int nfds, endpoint_t user_endpt) 852 { 853 SIMPLEQ_HEAD(, uds_fd) fds; 854 struct uds_fd *ufd; 855 unsigned int i; 856 int r; 857 858 SIMPLEQ_INIT(&fds); 859 860 for (i = 0; i < nfds; i++) { 861 if (SIMPLEQ_EMPTY(&uds_freefds)) { 862 /* UDS itself may already have OPEN_MAX FDs. */ 863 r = ENFILE; 864 break; 865 } 866 867 /* 868 * The caller may have given an invalid FD, or UDS itself may 869 * unexpectedly have run out of available file descriptors etc. 870 */ 871 if ((r = copyfd(user_endpt, uds_ctlfds[i], COPYFD_FROM)) < 0) 872 break; 873 874 ufd = SIMPLEQ_FIRST(&uds_freefds); 875 SIMPLEQ_REMOVE_HEAD(&uds_freefds, ufd_next); 876 877 ufd->ufd_fd = r; 878 ufd->ufd_count = 0; 879 880 SIMPLEQ_INSERT_TAIL(&fds, ufd, ufd_next); 881 882 dprintf(("UDS: copied in fd %d -> %d\n", uds_ctlfds[i], r)); 883 } 884 885 /* Did we experience an error while copying in the file descriptors? */ 886 if (r < 0) { 887 /* Revert the successful copyfd() calls made so far. */ 888 SIMPLEQ_FOREACH(ufd, &fds, ufd_next) { 889 dprintf(("UDS: closing local fd %d\n", ufd->ufd_fd)); 890 891 closenb(ufd->ufd_fd); 892 } 893 894 SIMPLEQ_CONCAT(&uds_freefds, &fds); 895 896 return r; 897 } 898 899 /* 900 * Success. If there were any file descriptors at all, add them to the 901 * peer's list of in-flight file descriptors. Assign the number of 902 * file descriptors copied in to the first file descriptor object, so 903 * that we know how many to copy out (or discard) for this segment. 904 * Also set the UDS_HAS_FDS flag on the segment. 905 */ 906 ufd = SIMPLEQ_FIRST(&fds); 907 ufd->ufd_count = nfds; 908 909 SIMPLEQ_CONCAT(&peer->uds_fds, &fds); 910 911 return OK; 912 } 913 914 /* 915 * The current send request is successful or at least has made progress. 916 * Commit the new segment or, if we decided to merge the new data into the last 917 * segment, update the header of the last segment. Also wake up the receiving 918 * side, because there will now be new data to receive. 919 */ 920 static void 921 uds_send_advance(struct udssock * uds, struct udssock * peer, size_t datalen, 922 int merge, size_t seglen, unsigned int segflags) 923 { 924 size_t pos, prevseglen, prevdatalen; 925 926 /* 927 * For non-datagram sockets, credentials are sent only once after 928 * setting the LOCAL_CREDS option. After that, the option is unset. 929 */ 930 if ((segflags & UDS_HAS_CRED) && uds_get_type(uds) != SOCK_DGRAM) 931 peer->uds_flags &= ~UDSF_PASSCRED; 932 933 if (merge) { 934 assert(segflags == 0); 935 936 pos = uds_get_last(peer); 937 938 (void)uds_fetch_hdr(peer, pos, &prevseglen, &prevdatalen, 939 &segflags); 940 941 peer->uds_len += seglen; 942 assert(peer->uds_len <= UDS_BUF); 943 944 seglen += prevseglen; 945 datalen += prevdatalen; 946 assert(seglen <= UDS_BUF); 947 948 uds_store_hdr(peer, pos, seglen, datalen, segflags); 949 } else { 950 peer->uds_last = peer->uds_len; 951 952 peer->uds_len += seglen; 953 assert(peer->uds_len <= UDS_BUF); 954 } 955 956 /* Now that there are new data, wake up the receiver side. */ 957 sockevent_raise(&peer->uds_sock, SEV_RECV); 958 } 959 960 /* 961 * Process a send request. Return OK if the send request has successfully 962 * completed, SUSPEND if it should be tried again later, or a negative error 963 * code on failure. In all cases, the values of 'off' and 'ctl_off' must be 964 * updated if any progress has been made; if either is non-zero, libsockevent 965 * will return the partial progress rather than an error code. 966 */ 967 int 968 uds_send(struct sock * sock, const struct sockdriver_data * data, size_t len, 969 size_t * off, const struct sockdriver_data * ctl, socklen_t ctl_len, 970 socklen_t * ctl_off, const struct sockaddr * addr, socklen_t addr_len, 971 endpoint_t user_endpt, int flags __unused, size_t min) 972 { 973 struct udssock *uds = (struct udssock *)sock; 974 struct udssock *peer; 975 size_t seglen, datalen = 0 /*gcc*/; 976 unsigned int nfds, segflags = 0 /*gcc*/; 977 int r, partial, merge = 0 /*gcc*/; 978 979 dprintf(("UDS: send(%d,%zu,%zu,%u,%u,0x%x)\n", 980 uds_get_id(uds), len, (off != NULL) ? *off : 0, ctl_len, 981 (ctl_off != NULL) ? *ctl_off : 0, flags)); 982 983 partial = (off != NULL && *off > 0); 984 985 /* 986 * First see whether we can process this send call at all right now. 987 * Most importantly, for connected sockets, if the peer's receive 988 * buffer is full, we may have to suspend the call until some space has 989 * been freed up. 990 */ 991 if ((r = uds_send_test(uds, len, ctl_len, min, partial)) != OK) 992 return r; 993 994 /* 995 * Then get the peer socket. For connected sockets, this is trivial. 996 * For unconnected sockets, it may involve a lookup of the given 997 * address. 998 */ 999 if ((r = uds_send_peer(uds, addr, addr_len, user_endpt, &peer)) != OK) 1000 return r; 1001 1002 /* 1003 * We now know for sure that we will not suspend this call without 1004 * making any progress. However, the call may still fail. Copy in 1005 * control data first now, so that we know whether there are any file 1006 * descriptors to transfer. This aspect may determine whether or not 1007 * we can merge data with a previous segment. Do not actually copy in 1008 * the actual file descriptors yet, because that is much harder to undo 1009 * in case of a failure later on. 1010 */ 1011 if (ctl_len > 0) { 1012 /* We process control data once, in full. */ 1013 assert(*ctl_off == 0); 1014 1015 if ((r = uds_send_ctl(ctl, ctl_len, user_endpt)) < 0) 1016 return r; 1017 nfds = (unsigned int)r; 1018 } else 1019 nfds = 0; 1020 1021 /* 1022 * Now generate a new segment, or (if possible) merge new data into the 1023 * last segment. Since the call may still fail, prepare the segment 1024 * but do not update the buffer head yet. Note that the segment 1025 * contains not just regular data (in fact it may contain no data at 1026 * all) but (also) certain ancillary data. 1027 */ 1028 if ((r = uds_send_data(uds, peer, data, len, *off, user_endpt, nfds, 1029 &merge, &datalen, &segflags)) <= 0) 1030 return r; 1031 seglen = (size_t)r; 1032 1033 /* 1034 * If we extracted any file descriptors from the control data earlier, 1035 * copy them over to ourselves now. The resulting in-flight file 1036 * descriptors are stored in a separate data structure. This is the 1037 * last point where the send call may actually fail. 1038 */ 1039 if (nfds > 0) { 1040 if ((r = uds_send_fds(peer, nfds, user_endpt)) != OK) 1041 return r; 1042 } 1043 1044 /* 1045 * The transmission is now known to be (partially) successful. Commit 1046 * the new work by moving the receive buffer head. 1047 */ 1048 uds_send_advance(uds, peer, datalen, merge, seglen, segflags); 1049 1050 /* 1051 * Register the result. For stream-type sockets, the expected behavior 1052 * is that all data be sent, and so we may still have to suspend the 1053 * call after partial progress. Otherwise, we are now done. Either 1054 * way, we are done with the control data, so mark it as consumed. 1055 */ 1056 *off += datalen; 1057 *ctl_off += ctl_len; 1058 if (uds_get_type(uds) == SOCK_STREAM && datalen < len) 1059 return SUSPEND; 1060 else 1061 return OK; 1062 } 1063 1064 /* 1065 * Test whether a send request would block. The given 'min' parameter contains 1066 * the minimum number of bytes that should be possible to send without blocking 1067 * (the low send watermark). Return SUSPEND if the send request would block, 1068 * or any other error code if it would not. 1069 */ 1070 int 1071 uds_test_send(struct sock * sock, size_t min) 1072 { 1073 struct udssock *uds = (struct udssock *)sock; 1074 1075 return uds_send_test(uds, min, 0, min, FALSE /*partial*/); 1076 } 1077 1078 /* 1079 * Perform initial checks on a receive request, before it may potentially be 1080 * suspended. Return OK if this receive request is valid, or a negative error 1081 * code if it is not. 1082 */ 1083 int 1084 uds_pre_recv(struct sock * sock __unused, endpoint_t user_endpt __unused, 1085 int flags) 1086 { 1087 1088 /* 1089 * Reject calls with unknown flags. TODO: ensure that we should really 1090 * reject all other flags rather than ignore them. 1091 */ 1092 if ((flags & ~(MSG_PEEK | MSG_WAITALL | MSG_CMSG_CLOEXEC)) != 0) 1093 return EOPNOTSUPP; 1094 1095 return OK; 1096 } 1097 1098 /* 1099 * Determine whether the (real or pretend) receive request should be processed 1100 * now, suspended until later, or rejected based on the current socket state. 1101 * Return OK if the receive request should be processed now, along with a first 1102 * indication whether the call may still be suspended later in 'may_block'. 1103 * Return SUSPEND if the receive request should be retried later. Return an 1104 * appropriate negative error code if the receive request should fail. 1105 */ 1106 static int 1107 uds_recv_test(struct udssock * uds, size_t len, size_t min, int partial, 1108 int * may_block) 1109 { 1110 size_t seglen, datalen; 1111 unsigned int segflags; 1112 int r; 1113 1114 /* 1115 * If there are any pending data, those should always be received 1116 * first. However, if there is nothing to receive, then whether we 1117 * should suspend the receive call or fail immediately depends on other 1118 * conditions. We first look at these other conditions. 1119 */ 1120 r = OK; 1121 1122 if (uds_get_type(uds) != SOCK_DGRAM) { 1123 if (uds_is_connecting(uds)) 1124 r = SUSPEND; 1125 else if (!uds_is_connected(uds) && !uds_is_disconnected(uds)) 1126 r = ENOTCONN; 1127 else if (!uds_has_conn(uds) || 1128 uds_is_shutdown(uds->uds_conn, SFL_SHUT_WR)) 1129 r = SOCKEVENT_EOF; 1130 } 1131 1132 if (uds->uds_len == 0) { 1133 /* 1134 * For stream-type sockets, we use the policy: if no regular 1135 * data is requested, then end the call without receiving 1136 * anything. For packet-type sockets, the request should block 1137 * until there is a packet to discard, though. 1138 */ 1139 if (r != OK || (uds_get_type(uds) == SOCK_STREAM && len == 0)) 1140 return r; 1141 1142 return SUSPEND; 1143 } 1144 1145 /* 1146 * For stream-type sockets, we should still suspend the call if fewer 1147 * than 'min' bytes are available right now, and there is a possibility 1148 * that more data may arrive later. More may arrive later iff 'r' is 1149 * OK (i.e., no EOF or error will follow) and, in case we already 1150 * received some partial results, there is not already a next segment 1151 * with ancillary data (i.e, nonzero segment flags), or in any case 1152 * there isn't more than one segment in the buffer. Limit 'min' to the 1153 * maximum that can ever be received, though. Since that is difficult 1154 * in our case, we check whether the buffer is entirely full instead. 1155 */ 1156 if (r == OK && uds_get_type(uds) == SOCK_STREAM && min > 0 && 1157 uds->uds_len < UDS_BUF) { 1158 assert(uds->uds_len >= UDS_HDRLEN); 1159 1160 (void)uds_fetch_hdr(uds, uds->uds_tail, &seglen, &datalen, 1161 &segflags); 1162 1163 if (datalen < min && seglen == uds->uds_len && 1164 (!partial || segflags == 0)) 1165 return SUSPEND; 1166 } 1167 1168 /* 1169 * Also start the decision process as to whether we should suspend the 1170 * current call if MSG_WAITALL is given. Unfortunately there is no one 1171 * place where we can conveniently do all the required checks. 1172 */ 1173 if (may_block != NULL) 1174 *may_block = (r == OK && uds_get_type(uds) == SOCK_STREAM); 1175 return OK; 1176 } 1177 1178 /* 1179 * Receive regular data, and possibly the source path, from the tail segment in 1180 * the receive buffer. On success, return the positive non-zero length of the 1181 * tail segment, with 'addr' and 'addr_len' modified to store the source 1182 * address if applicable, the result flags in 'rflags' updated as appropriate, 1183 * the tail segment's data length stored in 'datalen', the number of received 1184 * regular data bytes stored in 'reslen', the segment flags stored in 1185 * 'segflags', and the absolute receive buffer position of the credentials in 1186 * the segment stored in 'credpos' if applicable. Since the receive call may 1187 * still fail, this function must not yet update the tail or any other aspect 1188 * of the receive buffer. Return zero if the current receive call was already 1189 * partially successful (due to MSG_WAITALL) and can no longer make progress, 1190 * and thus should be ended. Return a negative error code on failure. 1191 */ 1192 static int 1193 uds_recv_data(struct udssock * uds, const struct sockdriver_data * data, 1194 size_t len, size_t off, struct sockaddr * addr, socklen_t * addr_len, 1195 int * __restrict rflags, size_t * __restrict datalen, 1196 size_t * __restrict reslen, unsigned int * __restrict segflags, 1197 size_t * __restrict credpos) 1198 { 1199 iovec_t iov[2]; 1200 unsigned char lenbyte; 1201 unsigned int iovcnt; 1202 size_t pos, seglen, left; 1203 int r; 1204 1205 pos = uds_fetch_hdr(uds, uds->uds_tail, &seglen, datalen, segflags); 1206 1207 /* 1208 * If a partially completed receive now runs into a segment that cannot 1209 * be logically merged with the previous one (because it has at least 1210 * one segment flag set, meaning it has ancillary data), then we must 1211 * shortcut the receive now. 1212 */ 1213 if (off != 0 && *segflags != 0) 1214 return OK; 1215 1216 /* 1217 * As stated, for stream-type sockets, we choose to ignore zero-size 1218 * receive calls. This has the consequence that reading a zero-sized 1219 * segment (with ancillary data) requires a receive request for at 1220 * least one regular data byte. Such a receive call would then return 1221 * zero. The problem with handling zero-data receive requests is that 1222 * we need to know whether the current segment is terminated (i.e., no 1223 * more data can possibly be merged into it later), which is a test 1224 * that we rather not perform, not in the least because we do not know 1225 * whether there is an error pending on the socket. 1226 * 1227 * For datagrams, we currently allow a zero-size receive call to 1228 * discard the next datagram. 1229 * 1230 * TODO: compare this against policies on other platforms. 1231 */ 1232 if (len == 0 && uds_get_type(uds) == SOCK_STREAM) 1233 return OK; 1234 1235 /* 1236 * We have to skip the credentials for now: these are copied out as 1237 * control data, and thus will (well, may) be looked at when dealing 1238 * with the control data. For the same reason, we do not even look at 1239 * UDS_HAS_FDS here. 1240 */ 1241 if (*segflags & UDS_HAS_CRED) { 1242 *credpos = pos; 1243 1244 pos = uds_fetch(uds, pos, &lenbyte, 1); 1245 pos = uds_advance(pos, (size_t)lenbyte); 1246 } 1247 1248 /* 1249 * Copy out the source address, but only if the (datagram) socket is 1250 * not connected. TODO: even when it is connected, it may still 1251 * receive packets sent to it from other sockets *before* being 1252 * connected, and the receiver has no way of knowing that those packets 1253 * did not come from its new peer. Ideally, the older packets should 1254 * be dropped.. 1255 */ 1256 if (*segflags & UDS_HAS_PATH) { 1257 pos = uds_fetch(uds, pos, &lenbyte, 1); 1258 1259 if (uds_get_type(uds) == SOCK_DGRAM && !uds_has_link(uds)) 1260 uds_make_addr((const char *)&uds->uds_buf[pos], 1261 (size_t)lenbyte, addr, addr_len); 1262 1263 pos = uds_advance(pos, (size_t)lenbyte); 1264 } 1265 1266 /* 1267 * We can receive no more data than those that are present in the 1268 * segment, obviously. For stream-type sockets, any more data that 1269 * could have been received along with the current data would have been 1270 * merged in the current segment, so we need not search for any next 1271 * segments. 1272 * 1273 * For non-stream sockets, the caller may receive less than a whole 1274 * packet if it supplied a small buffer. In that case, the rest of the 1275 * packet will be discarded (but not here yet!) and the caller gets 1276 * the MSG_TRUNC flag in its result, if it was using sendmsg(2) anyway. 1277 */ 1278 if (len > *datalen) 1279 len = *datalen; 1280 else if (len < *datalen && uds_get_type(uds) != SOCK_STREAM) 1281 *rflags |= MSG_TRUNC; 1282 1283 /* Copy out the data to the caller. */ 1284 if (len > 0) { 1285 iov[0].iov_addr = (vir_bytes)&uds->uds_buf[pos]; 1286 left = UDS_BUF - pos; 1287 1288 if (left < len) { 1289 iov[0].iov_size = left; 1290 iov[1].iov_addr = (vir_bytes)&uds->uds_buf[0]; 1291 iov[1].iov_size = len - left; 1292 iovcnt = 2; 1293 } else { 1294 iov[0].iov_size = len; 1295 iovcnt = 1; 1296 } 1297 1298 if ((r = sockdriver_vcopyout(data, off, iov, iovcnt)) != OK) 1299 return r; 1300 } 1301 1302 *reslen = len; 1303 assert(seglen > 0 && seglen <= INT_MAX); 1304 return (int)seglen; 1305 } 1306 1307 /* 1308 * The current segment has associated file descriptors. If possible, copy out 1309 * all file descriptors to the receiver, and generate and copy out a chunk of 1310 * control data that contains their file descriptor numbers. If not all 1311 * file descriptors fit in the receiver's buffer, or if any error occurs, no 1312 * file descriptors are copied out. 1313 */ 1314 static int 1315 uds_recv_fds(struct udssock * uds, const struct sockdriver_data * ctl, 1316 socklen_t ctl_len, socklen_t ctl_off, endpoint_t user_endpt, int flags) 1317 { 1318 struct msghdr msghdr; 1319 struct cmsghdr *cmsg; 1320 struct uds_fd *ufd; 1321 unsigned int i, nfds; 1322 socklen_t chunklen, chunkspace; 1323 int r, fd, what; 1324 1325 /* See how many file descriptors should be part of this chunk. */ 1326 assert(!SIMPLEQ_EMPTY(&uds->uds_fds)); 1327 ufd = SIMPLEQ_FIRST(&uds->uds_fds); 1328 nfds = ufd->ufd_count; 1329 assert(nfds > 0); 1330 1331 /* 1332 * We produce and copy out potentially unaligned chunks, using 1333 * CMSG_LEN, but return the aligned size at the end, using CMSG_SPACE. 1334 * This may leave "gap" bytes unchanged in userland, but that should 1335 * not be a problem. By producing unaligned chunks, we eliminate a 1336 * potential boundary case where the unaligned chunk passed in (by the 1337 * sender) no longer fits in the same buffer after being aligned here. 1338 */ 1339 chunklen = CMSG_LEN(sizeof(int) * nfds); 1340 chunkspace = CMSG_SPACE(sizeof(int) * nfds); 1341 assert(chunklen <= sizeof(uds_ctlbuf)); 1342 if (chunklen > ctl_len) 1343 return 0; /* chunk would not fit, so produce nothing instead */ 1344 if (chunkspace > ctl_len) 1345 chunkspace = ctl_len; 1346 1347 memset(&msghdr, 0, sizeof(msghdr)); 1348 msghdr.msg_control = uds_ctlbuf; 1349 msghdr.msg_controllen = sizeof(uds_ctlbuf); 1350 1351 memset(uds_ctlbuf, 0, chunklen); 1352 cmsg = CMSG_FIRSTHDR(&msghdr); 1353 cmsg->cmsg_len = chunklen; 1354 cmsg->cmsg_level = SOL_SOCKET; 1355 cmsg->cmsg_type = SCM_RIGHTS; 1356 1357 /* 1358 * Copy the group's local file descriptors to the target endpoint, and 1359 * store the resulting remote file descriptors in the chunk buffer. 1360 */ 1361 r = OK; 1362 1363 for (i = 0; i < nfds; i++) { 1364 assert(ufd != SIMPLEQ_END(&uds->uds_fds)); 1365 assert(i == 0 || ufd->ufd_count == 0); 1366 1367 what = COPYFD_TO; 1368 if (flags & MSG_CMSG_CLOEXEC) 1369 what |= COPYFD_CLOEXEC; 1370 1371 /* Failure may happen legitimately here (e.g., EMFILE). */ 1372 if ((r = copyfd(user_endpt, ufd->ufd_fd, what)) < 0) 1373 break; /* we keep our progress so far in 'i' */ 1374 1375 fd = r; 1376 1377 dprintf(("UDS: copied out fd %d -> %d\n", ufd->ufd_fd, fd)); 1378 1379 memcpy(&((int *)CMSG_DATA(cmsg))[i], &fd, sizeof(int)); 1380 1381 ufd = SIMPLEQ_NEXT(ufd, ufd_next); 1382 } 1383 1384 /* If everything went well so far, copy out the produced chunk. */ 1385 if (r >= 0) 1386 r = sockdriver_copyout(ctl, ctl_off, uds_ctlbuf, chunklen); 1387 1388 /* 1389 * Handle errors. At this point, the 'i' variable contains the number 1390 * of file descriptors that have already been successfully copied out. 1391 */ 1392 if (r < 0) { 1393 /* Revert the successful copyfd() calls made so far. */ 1394 while (i-- > 0) { 1395 memcpy(&fd, &((int *)CMSG_DATA(cmsg))[i], sizeof(int)); 1396 1397 (void)copyfd(user_endpt, fd, COPYFD_CLOSE); 1398 } 1399 1400 return r; 1401 } 1402 1403 /* 1404 * Success. Return the aligned size of the produced chunk, if the 1405 * given length permits it. From here on, the receive call may no 1406 * longer fail, as that would result in lost file descriptors. 1407 */ 1408 return chunkspace; 1409 } 1410 1411 /* 1412 * Generate and copy out a chunk of control data with the sender's credentials. 1413 * Return the aligned chunk size on success, or a negative error code on 1414 * failure. 1415 */ 1416 static int 1417 uds_recv_cred(struct udssock * uds, const struct sockdriver_data * ctl, 1418 socklen_t ctl_len, socklen_t ctl_off, size_t credpos) 1419 { 1420 struct msghdr msghdr; 1421 struct cmsghdr *cmsg; 1422 socklen_t chunklen, chunkspace; 1423 unsigned char lenbyte; 1424 size_t credlen; 1425 int r; 1426 1427 /* 1428 * Since the sender side already did the hard work of producing the 1429 * (variable-size) sockcred structure as it should be received, there 1430 * is relatively little work to be done here. 1431 */ 1432 credpos = uds_fetch(uds, credpos, &lenbyte, 1); 1433 credlen = (size_t)lenbyte; 1434 1435 chunklen = CMSG_LEN(credlen); 1436 chunkspace = CMSG_SPACE(credlen); 1437 assert(chunklen <= sizeof(uds_ctlbuf)); 1438 if (chunklen > ctl_len) 1439 return 0; /* chunk would not fit, so produce nothing instead */ 1440 if (chunkspace > ctl_len) 1441 chunkspace = ctl_len; 1442 1443 memset(&msghdr, 0, sizeof(msghdr)); 1444 msghdr.msg_control = uds_ctlbuf; 1445 msghdr.msg_controllen = sizeof(uds_ctlbuf); 1446 1447 memset(uds_ctlbuf, 0, chunklen); 1448 cmsg = CMSG_FIRSTHDR(&msghdr); 1449 cmsg->cmsg_len = chunklen; 1450 cmsg->cmsg_level = SOL_SOCKET; 1451 cmsg->cmsg_type = SCM_CREDS; 1452 1453 uds_fetch(uds, credpos, CMSG_DATA(cmsg), credlen); 1454 1455 if ((r = sockdriver_copyout(ctl, ctl_off, uds_ctlbuf, chunklen)) != OK) 1456 return r; 1457 1458 return chunkspace; 1459 } 1460 1461 /* 1462 * Copy out control data for the ancillary data associated with the current 1463 * segment, if any. Return OK on success, at which point the current receive 1464 * call may no longer fail. 'rflags' may be updated with additional result 1465 * flags. Return a negative error code on failure. 1466 */ 1467 static int 1468 uds_recv_ctl(struct udssock * uds, const struct sockdriver_data * ctl, 1469 socklen_t ctl_len, socklen_t * ctl_off, endpoint_t user_endpt, 1470 int flags, unsigned int segflags, size_t credpos, int * rflags) 1471 { 1472 int r; 1473 1474 /* 1475 * We first copy out all file descriptors, if any. We put them in one 1476 * SCM_RIGHTS chunk, even if the sender put them in separate SCM_RIGHTS 1477 * chunks. We believe that this should not cause application-level 1478 * issues, but if it does, we can change that later with some effort. 1479 * We then copy out credentials, if any. 1480 * 1481 * We copy out each control chunk independently of the others, and also 1482 * perform error recovery on a per-chunk basis. This implies the 1483 * following. If producing or copying out the first chunk fails, the 1484 * entire recvmsg(2) call will fail with an appropriate error. If 1485 * producing or copying out any subsequent chunk fails, the recvmsg(2) 1486 * call will still return the previously generated chunks (a "short 1487 * control read" if you will) as well as the MSG_CTRUNC flag. This 1488 * approach is simple and clean, and it guarantees that we can always 1489 * copy out at least as many file descriptors as we copied in for this 1490 * segment, even if credentials are present as well. However, the 1491 * approach does cause slightly more overhead when there are multiple 1492 * chunks per call, as those are copied out separately. 1493 * 1494 * Since the generated SCM_RIGHTS chunk is never larger than the 1495 * originally received SCM_RIGHTS chunk, the temporary "uds_ctlbuf" 1496 * buffer is always large enough to contain the chunk in its entirety. 1497 * SCM_CREDS chunks should always fit easily as well. 1498 * 1499 * The MSG_CTRUNC flag will be returned iff not the entire user-given 1500 * control buffer was filled and not all control chunks were delivered. 1501 * Our current implementation does not deliver partial chunks. NetBSD 1502 * does, except for SCM_RIGHTS chunks. 1503 * 1504 * TODO: get rid of the redundancy in processing return values. 1505 */ 1506 if (segflags & UDS_HAS_FDS) { 1507 r = uds_recv_fds(uds, ctl, ctl_len, *ctl_off, user_endpt, 1508 flags); 1509 1510 /* 1511 * At this point, 'r' contains one of the following: 1512 * 1513 * r > 0 a chunk of 'r' bytes was added successfully. 1514 * r == 0 not enough space left; the chunk was not added. 1515 * r < 0 an error occurred; the chunk was not added. 1516 */ 1517 if (r < 0 && *ctl_off == 0) 1518 return r; 1519 1520 if (r > 0) { 1521 ctl_len -= r; 1522 *ctl_off += r; 1523 } else 1524 *rflags |= MSG_CTRUNC; 1525 } 1526 1527 if (segflags & UDS_HAS_CRED) { 1528 r = uds_recv_cred(uds, ctl, ctl_len, *ctl_off, credpos); 1529 1530 /* As above. */ 1531 if (r < 0 && *ctl_off == 0) 1532 return r; 1533 1534 if (r > 0) { 1535 ctl_len -= r; 1536 *ctl_off += r; 1537 } else 1538 *rflags |= MSG_CTRUNC; 1539 } 1540 1541 return OK; 1542 } 1543 1544 /* 1545 * The current receive request is successful or, in the case of MSG_WAITALL, 1546 * has made progress. Advance the receive buffer tail, either by discarding 1547 * the entire tail segment or by generating a new, smaller tail segment that 1548 * contains only the regular data left to be received from the original tail 1549 * segment. Also wake up the sending side for connection-oriented sockets if 1550 * applicable, because there may now be room for more data to be sent. Update 1551 * 'may_block' if we are now sure that the call may not block on MSG_WAITALL 1552 * after all. 1553 */ 1554 static void 1555 uds_recv_advance(struct udssock * uds, size_t seglen, size_t datalen, 1556 size_t reslen, unsigned int segflags, int * may_block) 1557 { 1558 struct udssock *conn; 1559 struct uds_fd *ufd; 1560 size_t delta, nseglen, advance; 1561 unsigned int nfds; 1562 1563 /* Note that 'reslen' may be legitimately zero. */ 1564 assert(reslen <= datalen); 1565 1566 if (uds_get_type(uds) != SOCK_STREAM && reslen < datalen) 1567 reslen = datalen; 1568 1569 delta = datalen - reslen; 1570 1571 if (delta == 0) { 1572 /* 1573 * Fully consume the tail segment. We advance the tail by the 1574 * full segment length, thus moving up to either the next 1575 * segment in the receive buffer, or an empty receive buffer. 1576 */ 1577 advance = seglen; 1578 1579 uds->uds_tail = uds_advance(uds->uds_tail, advance); 1580 } else { 1581 /* 1582 * Partially consume the tail segment. We put a new segment 1583 * header right in front of the remaining data, which obviously 1584 * always fits. Since any ancillary data was consumed along 1585 * with the first data byte of the segment, the new segment has 1586 * no ancillary data anymore (and thus a zero flags field). 1587 */ 1588 nseglen = UDS_HDRLEN + delta; 1589 assert(nseglen < seglen); 1590 1591 advance = seglen - nseglen; 1592 1593 uds->uds_tail = uds_advance(uds->uds_tail, advance); 1594 1595 uds_store_hdr(uds, uds->uds_tail, nseglen, delta, 0); 1596 } 1597 1598 /* 1599 * For datagram-oriented sockets, we always consume at least a header. 1600 * For stream-type sockets, we either consume a zero-data segment along 1601 * with its ancillary data, or we consume at least one byte from a 1602 * segment that does have regular data. In all other cases, the 1603 * receive call has already been ended by now. Thus, we always advance 1604 * the tail of the receive buffer here. 1605 */ 1606 assert(advance > 0); 1607 1608 /* 1609 * The receive buffer's used length (uds_len) and pointer to the 1610 * previous segment header (uds_last) are offsets from the tail. Now 1611 * that we have moved the tail, we need to adjust these accordingly. 1612 * If the buffer is now empty, reset the tail to the buffer start so as 1613 * to avoid splitting inter-process copies whenever possible. 1614 */ 1615 assert(uds->uds_len >= advance); 1616 uds->uds_len -= advance; 1617 1618 if (uds->uds_len == 0) 1619 uds->uds_tail = 0; 1620 1621 /* 1622 * If uds_last is zero here, it was pointing to the segment we just 1623 * (partially) consumed. By leaving it zero, it will still point to 1624 * the new or next segment. 1625 */ 1626 if (uds->uds_last > 0) { 1627 assert(uds->uds_len > 0); 1628 assert(uds->uds_last >= advance); 1629 uds->uds_last -= advance; 1630 } 1631 1632 /* 1633 * If there were any file descriptors associated with this segment, 1634 * close and free them now. 1635 */ 1636 if (segflags & UDS_HAS_FDS) { 1637 assert(!SIMPLEQ_EMPTY(&uds->uds_fds)); 1638 ufd = SIMPLEQ_FIRST(&uds->uds_fds); 1639 nfds = ufd->ufd_count; 1640 assert(nfds > 0); 1641 1642 while (nfds-- > 0) { 1643 assert(!SIMPLEQ_EMPTY(&uds->uds_fds)); 1644 ufd = SIMPLEQ_FIRST(&uds->uds_fds); 1645 SIMPLEQ_REMOVE_HEAD(&uds->uds_fds, ufd_next); 1646 1647 dprintf(("UDS: closing local fd %d\n", ufd->ufd_fd)); 1648 1649 closenb(ufd->ufd_fd); 1650 1651 SIMPLEQ_INSERT_TAIL(&uds_freefds, ufd, ufd_next); 1652 } 1653 } 1654 1655 /* 1656 * If there is now any data left in the receive buffer, then there has 1657 * been a reason that we haven't received it. For stream sockets, that 1658 * reason is that the next segment has ancillary data. In any case, 1659 * this means we should never block the current receive operation 1660 * waiting for more data. Otherwise, we may block on MSG_WAITALL. 1661 */ 1662 if (uds->uds_len > 0) 1663 *may_block = FALSE; 1664 1665 /* 1666 * If the (non-datagram) socket has a peer that is not shut down for 1667 * writing, see if it can be woken up to send more data. Note that 1668 * the event will never be processed immediately. 1669 */ 1670 if (uds_is_connected(uds)) { 1671 assert(uds_get_type(uds) != SOCK_DGRAM); 1672 1673 conn = uds->uds_conn; 1674 1675 if (!uds_is_shutdown(conn, SFL_SHUT_WR)) 1676 sockevent_raise(&conn->uds_sock, SEV_SEND); 1677 } 1678 } 1679 1680 /* 1681 * Process a receive request. Return OK if the receive request has completed 1682 * successfully, SUSPEND if it should be tried again later, SOCKEVENT_EOF if an 1683 * end-of-file condition is reached, or a negative error code on failure. In 1684 * all cases, the values of 'off' and 'ctl_off' must be updated if any progress 1685 * has been made; if either is non-zero, libsockevent will return the partial 1686 * progress rather than an error code or EOF. 1687 */ 1688 int 1689 uds_recv(struct sock * sock, const struct sockdriver_data * data, size_t len, 1690 size_t * off, const struct sockdriver_data * ctl, socklen_t ctl_len, 1691 socklen_t * ctl_off, struct sockaddr * addr, socklen_t * addr_len, 1692 endpoint_t user_endpt, int flags, size_t min, int * rflags) 1693 { 1694 struct udssock *uds = (struct udssock *)sock; 1695 size_t seglen, datalen, reslen = 0 /*gcc*/, credpos = 0 /*gcc*/; 1696 unsigned int segflags; 1697 int r, partial, may_block = 0 /*gcc*/; 1698 1699 dprintf(("UDS: recv(%d,%zu,%zu,%u,%u,0x%x)\n", 1700 uds_get_id(uds), len, (off != NULL) ? *off : 0, ctl_len, 1701 (ctl_off != NULL) ? *ctl_off : 0, flags)); 1702 1703 /* 1704 * Start by testing whether anything can be received at all, or whether 1705 * an error or EOF should be returned instead, or whether the receive 1706 * call should be suspended until later otherwise. If no (regular or 1707 * control) data can be received, or if this was a test for select, 1708 * we bail out right after. 1709 */ 1710 partial = (off != NULL && *off > 0); 1711 1712 if ((r = uds_recv_test(uds, len, min, partial, &may_block)) != OK) 1713 return r; 1714 1715 /* 1716 * Copy out regular data, if any. Do this before copying out control 1717 * data, because the latter is harder to undo on failure. This data 1718 * copy function returns returns OK (0) if we are to return a result of 1719 * zero bytes (which is *not* EOF) to the caller without doing anything 1720 * else. The function returns a nonzero positive segment length if we 1721 * should carry on with the receive call (as it happens, all its other 1722 * returned values may in fact be zero). 1723 */ 1724 if ((r = uds_recv_data(uds, data, len, *off, addr, addr_len, rflags, 1725 &datalen, &reslen, &segflags, &credpos)) <= 0) 1726 return r; 1727 seglen = (size_t)r; 1728 1729 /* 1730 * Copy out control data, if any: transfer and copy out records of file 1731 * descriptors, and/or copy out sender credentials. This is the last 1732 * part of the call that may fail. 1733 */ 1734 if ((r = uds_recv_ctl(uds, ctl, ctl_len, ctl_off, user_endpt, flags, 1735 segflags, credpos, rflags)) != OK) 1736 return r; 1737 1738 /* 1739 * Now that the call has succeeded, move the tail of the receive 1740 * buffer, unless we were merely peeking. 1741 */ 1742 if (!(flags & MSG_PEEK)) 1743 uds_recv_advance(uds, seglen, datalen, reslen, segflags, 1744 &may_block); 1745 else 1746 may_block = FALSE; 1747 1748 /* 1749 * If the MSG_WAITALL flag was given, we may still have to suspend the 1750 * call after partial success. In particular, the receive call may 1751 * suspend after partial success if all of these conditions are met: 1752 * 1753 * 1) the socket is a stream-type socket; 1754 * 2) MSG_WAITALL is set; 1755 * 3) MSG_PEEK is not set; 1756 * 4) MSG_DONTWAIT is not set (tested upon return); 1757 * 5) the socket must not have a pending error (tested upon return); 1758 * 6) the socket must not be shut down for reading (tested later); 1759 * 7) the socket must still be connected to a peer (no EOF); 1760 * 8) the peer must not have been shut down for writing (no EOF); 1761 * 9) the next segment, if any, contains no ancillary data. 1762 * 1763 * Together, these points guarantee that the call could conceivably 1764 * receive more after being resumed. Points 4 to 6 are covered by 1765 * libsockevent, which will end the call even if we return SUSPEND 1766 * here. Due to segment merging, we cover point 9 by checking that 1767 * there is currently no next segment at all. Once a new segment 1768 * arrives, the ancillary-data test is done then. 1769 */ 1770 *off += reslen; 1771 if ((flags & MSG_WAITALL) && reslen < len && may_block) 1772 return SUSPEND; 1773 else 1774 return OK; 1775 } 1776 1777 /* 1778 * Test whether a receive request would block. The given 'min' parameter 1779 * contains the minimum number of bytes that should be possible to receive 1780 * without blocking (the low receive watermark). Return SUSPEND if the send 1781 * request would block. Otherwise, return any other error code (including OK 1782 * or SOCKEVENT_EOF), and if 'size' is not a NULL pointer, it should be filled 1783 * with the number of bytes available for receipt right now (if not zero). 1784 * Note that if 'size' is not NULL, 'min' will always be zero. 1785 */ 1786 int 1787 uds_test_recv(struct sock * sock, size_t min, size_t * size) 1788 { 1789 struct udssock *uds = (struct udssock *)sock; 1790 size_t seglen; 1791 unsigned int segflags; 1792 int r; 1793 1794 if ((r = uds_recv_test(uds, min, min, FALSE /*partial*/, 1795 NULL /*may_block*/)) == SUSPEND) 1796 return r; 1797 1798 if (size != NULL && uds->uds_len > 0) 1799 (void)uds_fetch_hdr(uds, uds->uds_tail, &seglen, size, 1800 &segflags); 1801 1802 return r; 1803 } 1804