/*	$OpenBSD: uipc_socket2.c,v 1.158 2024/07/12 19:50:35 bluhm Exp $	*/
/*	$NetBSD: uipc_socket2.c,v 1.11 1996/02/04 02:17:55 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket2.c	8.1 (Berkeley) 6/10/93
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/domain.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/pool.h>

/*
 * Primitive routines for operating on sockets and socket buffers
 */

u_long	sb_max = SB_MAX;		/* patchable */

extern struct pool mclpools[];
extern struct pool mbpool;

/*
 * Procedures to manipulate state flags of socket
 * and do appropriate wakeups.  Normal sequence from the
 * active (originating) side is that soisconnecting() is
 * called during processing of connect() call,
 * resulting in an eventual call to soisconnected() if/when the
 * connection is established.  When the connection is torn down
 * soisdisconnecting() is called during processing of disconnect() call,
 * and soisdisconnected() is called when the connection to the peer
 * is totally severed.  The semantics of these routines are such that
 * connectionless protocols can call soisconnected() and soisdisconnected()
 * only, bypassing the in-progress calls when setting up a ``connection''
 * takes no time.
 *
 * From the passive side, a socket is created with
 * two queues of sockets: so_q0 for connections in progress
 * and so_q for connections already made and awaiting user acceptance.
 * As a protocol is preparing incoming connections, it creates a socket
 * structure queued on so_q0 by calling sonewconn().  When the connection
 * is established, soisconnected() is called, and transfers the
 * socket structure to so_q, making it available to accept().
 *
 * If a socket is closed with sockets on either
 * so_q0 or so_q, these sockets are dropped.
 *
 * If higher level protocols are implemented in
 * the kernel, the wakeups done here will sometimes
 * cause software-interrupt process scheduling.
 */
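
/*
 * Illustrative sketch, not part of the original file and never
 * compiled: how a hypothetical protocol might drive the state
 * transitions described above.  The example_* names are invented
 * for illustration only.
 */
#if 0
/* Active (originating) side. */
static void
example_connect(struct socket *so)
{
	soisconnecting(so);	/* connect() is in progress */
	/* ... handshake with the peer completes ... */
	soisconnected(so);	/* wakes up the sleeping connect() */
}

/* Passive (listening) side. */
static void
example_incoming(struct socket *head)
{
	struct socket *so;

	/* The new socket is queued on head->so_q0 (incomplete). */
	so = sonewconn(head, 0, M_DONTWAIT);
	if (so == NULL)
		return;		/* queue limit reached or low memory */
	/* ... handshake completes ... */
	soisconnected(so);	/* moves it to head->so_q for accept() */
}
#endif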

void
soisconnecting(struct socket *so)
{
	soassertlocked(so);
	so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= SS_ISCONNECTING;
}

void
soisconnected(struct socket *so)
{
	struct socket *head = so->so_head;

	soassertlocked(so);
	so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING);
	so->so_state |= SS_ISCONNECTED;

	if (head != NULL && so->so_onq == &head->so_q0) {
		int persocket = solock_persocket(so);

		if (persocket) {
			soref(so);
			soref(head);

			sounlock(so);
			solock(head);
			solock(so);

			if (so->so_onq != &head->so_q0) {
				sounlock(head);
				sorele(head);
				sorele(so);

				return;
			}

			sorele(head);
			sorele(so);
		}

		soqremque(so, 0);
		soqinsque(head, so, 1);
		sorwakeup(head);
		wakeup_one(&head->so_timeo);

		if (persocket)
			sounlock(head);
	} else {
		wakeup(&so->so_timeo);
		sorwakeup(so);
		sowwakeup(so);
	}
}

void
soisdisconnecting(struct socket *so)
{
	soassertlocked(so);
	so->so_state &= ~SS_ISCONNECTING;
	so->so_state |= SS_ISDISCONNECTING;

	mtx_enter(&so->so_rcv.sb_mtx);
	so->so_rcv.sb_state |= SS_CANTRCVMORE;
	mtx_leave(&so->so_rcv.sb_mtx);

	mtx_enter(&so->so_snd.sb_mtx);
	so->so_snd.sb_state |= SS_CANTSENDMORE;
	mtx_leave(&so->so_snd.sb_mtx);

	wakeup(&so->so_timeo);
	sowwakeup(so);
	sorwakeup(so);
}

void
soisdisconnected(struct socket *so)
{
	soassertlocked(so);

	mtx_enter(&so->so_rcv.sb_mtx);
	so->so_rcv.sb_state |= SS_CANTRCVMORE;
	mtx_leave(&so->so_rcv.sb_mtx);

	mtx_enter(&so->so_snd.sb_mtx);
	so->so_snd.sb_state |= SS_CANTSENDMORE;
	mtx_leave(&so->so_snd.sb_mtx);

	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= SS_ISDISCONNECTED;

	wakeup(&so->so_timeo);
	sowwakeup(so);
	sorwakeup(so);
}

/*
 * When an attempt at a new connection is noted on a socket
 * which accepts connections, sonewconn is called.  If the
 * connection is possible (subject to space constraints, etc.)
 * then we allocate a new structure, properly linked into the
 * data structure of the original socket, and return this.
 * Connstatus may be 0 or SS_ISCONNECTED.
 */
struct socket *
sonewconn(struct socket *head, int connstatus, int wait)
{
	struct socket *so;
	int persocket = solock_persocket(head);
	int soqueue = connstatus ? 1 : 0;

	/*
	 * XXXSMP as long as `so' and `head' share the same lock, we
	 * can call soreserve() and pr_attach() below w/o explicitly
	 * locking `so'.
	 */
	soassertlocked(head);

	if (m_pool_used() > 95)
		return (NULL);
	if (head->so_qlen + head->so_q0len > head->so_qlimit * 3)
		return (NULL);
	so = soalloc(head->so_proto, wait);
	if (so == NULL)
		return (NULL);
	so->so_type = head->so_type;
	so->so_options = head->so_options &~ SO_ACCEPTCONN;
	so->so_linger = head->so_linger;
	so->so_state = head->so_state | SS_NOFDREF;
	so->so_proto = head->so_proto;
	so->so_timeo = head->so_timeo;
	so->so_euid = head->so_euid;
	so->so_ruid = head->so_ruid;
	so->so_egid = head->so_egid;
	so->so_rgid = head->so_rgid;
	so->so_cpid = head->so_cpid;

	/*
	 * Lock order will be `head' -> `so' while these sockets are linked.
	 */
	if (persocket)
		solock(so);

	/*
	 * Inherit watermarks but those may get clamped in low mem situations.
	 */
	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat))
		goto fail;

	mtx_enter(&head->so_snd.sb_mtx);
	so->so_snd.sb_wat = head->so_snd.sb_wat;
	so->so_snd.sb_lowat = head->so_snd.sb_lowat;
	so->so_snd.sb_timeo_nsecs = head->so_snd.sb_timeo_nsecs;
	mtx_leave(&head->so_snd.sb_mtx);

	mtx_enter(&head->so_rcv.sb_mtx);
	so->so_rcv.sb_wat = head->so_rcv.sb_wat;
	so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
	so->so_rcv.sb_timeo_nsecs = head->so_rcv.sb_timeo_nsecs;
	mtx_leave(&head->so_rcv.sb_mtx);

	sigio_copy(&so->so_sigio, &head->so_sigio);

	soqinsque(head, so, soqueue);
	if (pru_attach(so, 0, wait) != 0) {
		soqremque(so, soqueue);
		goto fail;
	}
	if (connstatus) {
		so->so_state |= connstatus;
		sorwakeup(head);
		wakeup(&head->so_timeo);
	}

	if (persocket)
		sounlock(so);

	return (so);

fail:
	if (persocket)
		sounlock(so);
	sigio_free(&so->so_sigio);
	klist_free(&so->so_rcv.sb_klist);
	klist_free(&so->so_snd.sb_klist);
	pool_put(&socket_pool, so);

	return (NULL);
}

void
soqinsque(struct socket *head, struct socket *so, int q)
{
	soassertlocked(head);
	soassertlocked(so);

	KASSERT(so->so_onq == NULL);

	so->so_head = head;
	if (q == 0) {
		head->so_q0len++;
		so->so_onq = &head->so_q0;
	} else {
		head->so_qlen++;
		so->so_onq = &head->so_q;
	}
	TAILQ_INSERT_TAIL(so->so_onq, so, so_qe);
}

int
soqremque(struct socket *so, int q)
{
	struct socket *head = so->so_head;

	soassertlocked(so);
	soassertlocked(head);

	if (q == 0) {
		if (so->so_onq != &head->so_q0)
			return (0);
		head->so_q0len--;
	} else {
		if (so->so_onq != &head->so_q)
			return (0);
		head->so_qlen--;
	}
	TAILQ_REMOVE(so->so_onq, so, so_qe);
	so->so_onq = NULL;
	so->so_head = NULL;
	return (1);
}
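
/*
 * Illustrative sketch, never compiled: dropping a connection that is
 * aborted while still on the partial queue.  soqremque() returns 0 if
 * soisconnected() already moved the socket from so_q0 to so_q, so the
 * caller can tell the two cases apart.
 */
#if 0
static void
example_abort_partial(struct socket *so)
{
	/* Both `so' and its listener are locked; see soqremque(). */
	if (soqremque(so, 0) == 0)
		return;		/* already completed, now on so_q */
	/* ... tear down protocol state and free the socket ... */
}
#endif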

/*
 * Socantsendmore indicates that no more data will be sent on the
 * socket; it would normally be applied to a socket when the user
 * informs the system that no more data is to be sent, by the protocol
 * code (in case PRU_SHUTDOWN).  Socantrcvmore indicates that no more data
 * will be received, and will normally be applied to the socket by a
 * protocol when it detects that the peer will send no more data.
 * Data queued for reading in the socket may yet be read.
 */

void
socantsendmore(struct socket *so)
{
	soassertlocked(so);
	mtx_enter(&so->so_snd.sb_mtx);
	so->so_snd.sb_state |= SS_CANTSENDMORE;
	mtx_leave(&so->so_snd.sb_mtx);
	sowwakeup(so);
}

void
socantrcvmore(struct socket *so)
{
	if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0)
		soassertlocked(so);

	mtx_enter(&so->so_rcv.sb_mtx);
	so->so_rcv.sb_state |= SS_CANTRCVMORE;
	mtx_leave(&so->so_rcv.sb_mtx);
	sorwakeup(so);
}

void
solock(struct socket *so)
{
	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		NET_LOCK();
		break;
	default:
		rw_enter_write(&so->so_lock);
		break;
	}
}

void
solock_shared(struct socket *so)
{
	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		if (ISSET(so->so_proto->pr_flags, PR_MPSOCKET)) {
			NET_LOCK_SHARED();
			rw_enter_write(&so->so_lock);
		} else
			NET_LOCK();
		break;
	default:
		rw_enter_write(&so->so_lock);
		break;
	}
}

int
solock_persocket(struct socket *so)
{
	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		return 0;
	default:
		return 1;
	}
}

void
solock_pair(struct socket *so1, struct socket *so2)
{
	KASSERT(so1 != so2);
	KASSERT(so1->so_type == so2->so_type);
	KASSERT(solock_persocket(so1));

	if (so1 < so2) {
		solock(so1);
		solock(so2);
	} else {
		solock(so2);
		solock(so1);
	}
}

void
sounlock(struct socket *so)
{
	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		NET_UNLOCK();
		break;
	default:
		rw_exit_write(&so->so_lock);
		break;
	}
}

void
sounlock_shared(struct socket *so)
{
	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		if (ISSET(so->so_proto->pr_flags, PR_MPSOCKET)) {
			rw_exit_write(&so->so_lock);
			NET_UNLOCK_SHARED();
		} else
			NET_UNLOCK();
		break;
	default:
		rw_exit_write(&so->so_lock);
		break;
	}
}

void
soassertlocked_readonly(struct socket *so)
{
	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		NET_ASSERT_LOCKED();
		break;
	default:
		rw_assert_wrlock(&so->so_lock);
		break;
	}
}

void
soassertlocked(struct socket *so)
{
	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		if (rw_status(&netlock) == RW_READ) {
			NET_ASSERT_LOCKED();

			if (splassert_ctl > 0 &&
			    rw_status(&so->so_lock) != RW_WRITE)
				splassert_fail(0, RW_WRITE, __func__);
		} else
			NET_ASSERT_LOCKED_EXCLUSIVE();
		break;
	default:
		rw_assert_wrlock(&so->so_lock);
		break;
	}
}
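
/*
 * Illustrative sketch, never compiled: the usual pattern for waiting
 * on a socket condition with the socket lock held.  sosleep_nsec()
 * below drops and reacquires whichever lock the socket's domain uses.
 */
#if 0
static int
example_wait_connected(struct socket *so)
{
	int error = 0;

	solock(so);
	while ((so->so_state & SS_ISCONNECTED) == 0) {
		error = sosleep_nsec(so, &so->so_timeo, PSOCK | PCATCH,
		    "exconn", INFSLP);
		if (error)
			break;
	}
	sounlock(so);

	return (error);
}
#endif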

int
sosleep_nsec(struct socket *so, void *ident, int prio, const char *wmesg,
    uint64_t nsecs)
{
	int ret;

	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		if (ISSET(so->so_proto->pr_flags, PR_MPSOCKET) &&
		    rw_status(&netlock) == RW_READ) {
			rw_exit_write(&so->so_lock);
		}
		ret = rwsleep_nsec(ident, &netlock, prio, wmesg, nsecs);
		if (ISSET(so->so_proto->pr_flags, PR_MPSOCKET) &&
		    rw_status(&netlock) == RW_READ) {
			rw_enter_write(&so->so_lock);
		}
		break;
	default:
		ret = rwsleep_nsec(ident, &so->so_lock, prio, wmesg, nsecs);
		break;
	}

	return ret;
}

void
sbmtxassertlocked(struct socket *so, struct sockbuf *sb)
{
	if (sb->sb_flags & SB_MTXLOCK) {
		if (splassert_ctl > 0 && mtx_owned(&sb->sb_mtx) == 0)
			splassert_fail(0, RW_WRITE, __func__);
	} else
		soassertlocked(so);
}

/*
 * Wait for data to arrive at/drain from a socket buffer.
 */
int
sbwait(struct socket *so, struct sockbuf *sb)
{
	uint64_t timeo_nsecs;
	int prio = (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH;

	if (sb->sb_flags & SB_MTXLOCK) {
		MUTEX_ASSERT_LOCKED(&sb->sb_mtx);

		sb->sb_flags |= SB_WAIT;
		return msleep_nsec(&sb->sb_cc, &sb->sb_mtx, prio, "sbwait",
		    sb->sb_timeo_nsecs);
	}

	soassertlocked(so);

	mtx_enter(&sb->sb_mtx);
	timeo_nsecs = sb->sb_timeo_nsecs;
	sb->sb_flags |= SB_WAIT;
	mtx_leave(&sb->sb_mtx);

	return sosleep_nsec(so, &sb->sb_cc, prio, "netio", timeo_nsecs);
}

int
sblock(struct sockbuf *sb, int flags)
{
	int rwflags = RW_WRITE, error;

	if (!(flags & SBL_NOINTR || sb->sb_flags & SB_NOINTR))
		rwflags |= RW_INTR;
	if (!(flags & SBL_WAIT))
		rwflags |= RW_NOSLEEP;

	error = rw_enter(&sb->sb_lock, rwflags);
	if (error == EBUSY)
		error = EWOULDBLOCK;

	return error;
}

void
sbunlock(struct sockbuf *sb)
{
	rw_exit(&sb->sb_lock);
}

/*
 * Wakeup processes waiting on a socket buffer.
 * Do asynchronous notification via SIGIO
 * if the socket buffer has the SB_ASYNC flag set.
 */
void
sowakeup(struct socket *so, struct sockbuf *sb)
{
	int dowakeup = 0, dopgsigio = 0;

	mtx_enter(&sb->sb_mtx);
	if (sb->sb_flags & SB_WAIT) {
		sb->sb_flags &= ~SB_WAIT;
		dowakeup = 1;
	}
	if (sb->sb_flags & SB_ASYNC)
		dopgsigio = 1;

	knote_locked(&sb->sb_klist, 0);
	mtx_leave(&sb->sb_mtx);

	if (dowakeup)
		wakeup(&sb->sb_cc);

	if (dopgsigio)
		pgsigio(&so->so_sigio, SIGIO, 0);
}

/*
 * Socket buffer (struct sockbuf) utility routines.
 *
 * Each socket contains two socket buffers: one for sending data and
 * one for receiving data.  Each buffer contains a queue of mbufs,
 * information about the number of mbufs and amount of data in the
 * queue, and other fields allowing select() statements and notification
 * on data availability to be implemented.
 *
 * Data stored in a socket buffer is maintained as a list of records.
 * Each record is a list of mbufs chained together with the m_next
 * field.  Records are chained together with the m_nextpkt field.  The upper
 * level routine soreceive() expects the following conventions to be
 * observed when placing information in the receive buffer:
 *
 * 1. If the protocol requires each message be preceded by the sender's
 *    name, then a record containing that name must be present before
 *    any associated data (mbuf's must be of type MT_SONAME).
 * 2. If the protocol supports the exchange of ``access rights'' (really
 *    just additional data associated with the message), and there are
 *    ``rights'' to be received, then a record containing this data
 *    should be present (mbuf's must be of type MT_CONTROL).
 * 3. If a name or rights record exists, then it must be followed by
 *    a data record, perhaps of zero length.
 *
 * Before using a new socket structure it is first necessary to reserve
 * buffer space to the socket, by calling sbreserve().  This should commit
 * some of the available buffer space in the system buffer pool for the
 * socket (currently, it does nothing but enforce limits).  The space
 * should be released by calling sbrelease() when the socket is destroyed.
 */
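
/*
 * Illustrative sketch, never compiled: a typical pru_attach()
 * implementation commits buffer space with soreserve(), as described
 * above.  The watermarks below are arbitrary example values.
 */
#if 0
static int
example_attach(struct socket *so)
{
	/* Fails with ENOBUFS if the requested space is unavailable. */
	return (soreserve(so, 8192, 8192));
}
#endif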

int
soreserve(struct socket *so, u_long sndcc, u_long rcvcc)
{
	soassertlocked(so);

	mtx_enter(&so->so_rcv.sb_mtx);
	mtx_enter(&so->so_snd.sb_mtx);
	if (sbreserve(so, &so->so_snd, sndcc))
		goto bad;
	so->so_snd.sb_wat = sndcc;
	if (so->so_snd.sb_lowat == 0)
		so->so_snd.sb_lowat = MCLBYTES;
	if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
		so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
	if (sbreserve(so, &so->so_rcv, rcvcc))
		goto bad2;
	so->so_rcv.sb_wat = rcvcc;
	if (so->so_rcv.sb_lowat == 0)
		so->so_rcv.sb_lowat = 1;
	mtx_leave(&so->so_snd.sb_mtx);
	mtx_leave(&so->so_rcv.sb_mtx);

	return (0);
bad2:
	sbrelease(so, &so->so_snd);
bad:
	mtx_leave(&so->so_snd.sb_mtx);
	mtx_leave(&so->so_rcv.sb_mtx);
	return (ENOBUFS);
}

/*
 * Allot mbufs to a sockbuf.
 * Attempt to scale mbmax so that mbcnt doesn't become limiting
 * if buffering efficiency is near the normal case.
 */
int
sbreserve(struct socket *so, struct sockbuf *sb, u_long cc)
{
	sbmtxassertlocked(so, sb);

	if (cc == 0 || cc > sb_max)
		return (1);
	sb->sb_hiwat = cc;
	sb->sb_mbmax = max(3 * MAXMCLBYTES, cc * 8);
	if (sb->sb_lowat > sb->sb_hiwat)
		sb->sb_lowat = sb->sb_hiwat;
	return (0);
}

/*
 * In a low memory situation, do not accept any greater than normal request.
 */
int
sbcheckreserve(u_long cnt, u_long defcnt)
{
	if (cnt > defcnt && sbchecklowmem())
		return (ENOBUFS);
	return (0);
}

int
sbchecklowmem(void)
{
	static int sblowmem;
	unsigned int used = m_pool_used();

	if (used < 60)
		sblowmem = 0;
	else if (used > 80)
		sblowmem = 1;

	return (sblowmem);
}

/*
 * Free mbufs held by a socket, and reserved mbuf space.
 */
void
sbrelease(struct socket *so, struct sockbuf *sb)
{

	sbflush(so, sb);
	sb->sb_hiwat = sb->sb_mbmax = 0;
}

/*
 * Routines to add and remove
 * data from an mbuf queue.
 *
 * The routines sbappend() or sbappendrecord() are normally called to
 * append new mbufs to a socket buffer, after checking that adequate
 * space is available, comparing the function sbspace() with the amount
 * of data to be added.  sbappendrecord() differs from sbappend() in
 * that data supplied is treated as the beginning of a new record.
 * To place a sender's address, optional access rights, and data in a
 * socket receive buffer, sbappendaddr() should be used.  To place
 * access rights and data in a socket receive buffer, sbappendrights()
 * should be used.  In either case, the new data begins a new record.
 * Note that unlike sbappend() and sbappendrecord(), these routines check
 * for the caller that there will be enough space to store the data.
 * Each fails if there is not enough space, or if it cannot find mbufs
 * to store additional information in.
 *
 * Reliable protocols may use the socket send buffer to hold data
 * awaiting acknowledgement.  Data is normally copied from a socket
 * send buffer in a protocol with m_copym for output to a peer, and
 * then removed from the socket buffer with sbdrop() or sbdroprecord()
 * when the data is acknowledged by the peer.
 */
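
/*
 * Illustrative sketch, never compiled: the canonical append sequence
 * described above for a stream protocol's receive path -- check
 * sbspace(), append, then wake the reader.
 */
#if 0
static void
example_stream_input(struct socket *so, struct mbuf *m)
{
	/* The caller holds the socket lock. */
	if (m->m_pkthdr.len > sbspace(so, &so->so_rcv)) {
		m_freem(m);		/* no room: drop the data */
		return;
	}
	sbappendstream(so, &so->so_rcv, m);
	sorwakeup(so);
}
#endif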

#ifdef SOCKBUF_DEBUG
void
sblastrecordchk(struct sockbuf *sb, const char *where)
{
	struct mbuf *m = sb->sb_mb;

	while (m && m->m_nextpkt)
		m = m->m_nextpkt;

	if (m != sb->sb_lastrecord) {
		printf("sblastrecordchk: sb_mb %p sb_lastrecord %p last %p\n",
		    sb->sb_mb, sb->sb_lastrecord, m);
		printf("packet chain:\n");
		for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt)
			printf("\t%p\n", m);
		panic("sblastrecordchk from %s", where);
	}
}

void
sblastmbufchk(struct sockbuf *sb, const char *where)
{
	struct mbuf *m = sb->sb_mb;
	struct mbuf *n;

	while (m && m->m_nextpkt)
		m = m->m_nextpkt;

	while (m && m->m_next)
		m = m->m_next;

	if (m != sb->sb_mbtail) {
		printf("sblastmbufchk: sb_mb %p sb_mbtail %p last %p\n",
		    sb->sb_mb, sb->sb_mbtail, m);
		printf("packet tree:\n");
		for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) {
			printf("\t");
			for (n = m; n != NULL; n = n->m_next)
				printf("%p ", n);
			printf("\n");
		}
		panic("sblastmbufchk from %s", where);
	}
}
#endif /* SOCKBUF_DEBUG */

#define	SBLINKRECORD(sb, m0)						\
do {									\
	if ((sb)->sb_lastrecord != NULL)				\
		(sb)->sb_lastrecord->m_nextpkt = (m0);			\
	else								\
		(sb)->sb_mb = (m0);					\
	(sb)->sb_lastrecord = (m0);					\
} while (/*CONSTCOND*/0)

/*
 * Append mbuf chain m to the last record in the
 * socket buffer sb.  The additional space associated with
 * the mbuf chain is recorded in sb.  Empty mbufs are
 * discarded and mbufs are compacted where possible.
 */
void
sbappend(struct socket *so, struct sockbuf *sb, struct mbuf *m)
{
	struct mbuf *n;

	if (m == NULL)
		return;

	sbmtxassertlocked(so, sb);
	SBLASTRECORDCHK(sb, "sbappend 1");

	if ((n = sb->sb_lastrecord) != NULL) {
		/*
		 * XXX Would like to simply use sb_mbtail here, but
		 * XXX I need to verify that I won't miss an EOR that
		 * XXX way.
		 */
		do {
			if (n->m_flags & M_EOR) {
				sbappendrecord(so, sb, m); /* XXXXXX!!!! */
				return;
			}
		} while (n->m_next && (n = n->m_next));
	} else {
		/*
		 * If this is the first record in the socket buffer, it's
		 * also the last record.
		 */
		sb->sb_lastrecord = m;
	}
	sbcompress(so, sb, m, n);
	SBLASTRECORDCHK(sb, "sbappend 2");
}

/*
 * This version of sbappend() should only be used when the caller
 * absolutely knows that there will never be more than one record
 * in the socket buffer, that is, a stream protocol (such as TCP).
 */
void
sbappendstream(struct socket *so, struct sockbuf *sb, struct mbuf *m)
{
	KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
	soassertlocked(so);
	KDASSERT(m->m_nextpkt == NULL);
	KASSERT(sb->sb_mb == sb->sb_lastrecord);

	SBLASTMBUFCHK(sb, __func__);

	sbcompress(so, sb, m, sb->sb_mbtail);

	sb->sb_lastrecord = sb->sb_mb;
	SBLASTRECORDCHK(sb, __func__);
}

#ifdef SOCKBUF_DEBUG
void
sbcheck(struct socket *so, struct sockbuf *sb)
{
	struct mbuf *m, *n;
	u_long len = 0, mbcnt = 0;

	for (m = sb->sb_mb; m; m = m->m_nextpkt) {
		for (n = m; n; n = n->m_next) {
			len += n->m_len;
			mbcnt += MSIZE;
			if (n->m_flags & M_EXT)
				mbcnt += n->m_ext.ext_size;
			if (m != n && n->m_nextpkt)
				panic("sbcheck nextpkt");
		}
	}
	if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
		printf("cc %lu != %lu || mbcnt %lu != %lu\n", len, sb->sb_cc,
		    mbcnt, sb->sb_mbcnt);
		panic("sbcheck");
	}
}
#endif

/*
 * As above, except the mbuf chain
 * begins a new record.
 */
void
sbappendrecord(struct socket *so, struct sockbuf *sb, struct mbuf *m0)
{
	struct mbuf *m;

	sbmtxassertlocked(so, sb);

	if (m0 == NULL)
		return;

	/*
	 * Put the first mbuf on the queue.
	 * Note this permits zero length records.
	 */
	sballoc(so, sb, m0);
	SBLASTRECORDCHK(sb, "sbappendrecord 1");
	SBLINKRECORD(sb, m0);
	m = m0->m_next;
	m0->m_next = NULL;
	if (m && (m0->m_flags & M_EOR)) {
		m0->m_flags &= ~M_EOR;
		m->m_flags |= M_EOR;
	}
	sbcompress(so, sb, m, m0);
	SBLASTRECORDCHK(sb, "sbappendrecord 2");
}
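
/*
 * Illustrative sketch, never compiled: a record-oriented protocol
 * delivering each message as its own record with sbappendrecord(),
 * in contrast to the stream case above.
 */
#if 0
static void
example_record_input(struct socket *so, struct mbuf *m)
{
	/* The caller holds the appropriate socket buffer lock. */
	if (m->m_pkthdr.len > sbspace(so, &so->so_rcv)) {
		m_freem(m);
		return;
	}
	sbappendrecord(so, &so->so_rcv, m);	/* begins a new record */
	sorwakeup(so);
}
#endif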

/*
 * Append address and data, and optionally, control (ancillary) data
 * to the receive queue of a socket.  If present,
 * m0 must include a packet header with total length.
 * Returns 0 if no space in sockbuf or insufficient mbufs.
 */
int
sbappendaddr(struct socket *so, struct sockbuf *sb, const struct sockaddr *asa,
    struct mbuf *m0, struct mbuf *control)
{
	struct mbuf *m, *n, *nlast;
	int space = asa->sa_len;

	sbmtxassertlocked(so, sb);

	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
		panic("sbappendaddr");
	if (m0)
		space += m0->m_pkthdr.len;
	for (n = control; n; n = n->m_next) {
		space += n->m_len;
		if (n->m_next == NULL)	/* keep pointer to last control buf */
			break;
	}
	if (space > sbspace_locked(so, sb))
		return (0);
	if (asa->sa_len > MLEN)
		return (0);
	MGET(m, M_DONTWAIT, MT_SONAME);
	if (m == NULL)
		return (0);
	m->m_len = asa->sa_len;
	memcpy(mtod(m, caddr_t), asa, asa->sa_len);
	if (n)
		n->m_next = m0;		/* concatenate data to control */
	else
		control = m0;
	m->m_next = control;

	SBLASTRECORDCHK(sb, "sbappendaddr 1");

	for (n = m; n->m_next != NULL; n = n->m_next)
		sballoc(so, sb, n);
	sballoc(so, sb, n);
	nlast = n;
	SBLINKRECORD(sb, m);

	sb->sb_mbtail = nlast;
	SBLASTMBUFCHK(sb, "sbappendaddr");

	SBLASTRECORDCHK(sb, "sbappendaddr 2");

	return (1);
}

int
sbappendcontrol(struct socket *so, struct sockbuf *sb, struct mbuf *m0,
    struct mbuf *control)
{
	struct mbuf *m, *mlast, *n;
	int eor = 0, space = 0;

	sbmtxassertlocked(so, sb);

	if (control == NULL)
		panic("sbappendcontrol");
	for (m = control; ; m = m->m_next) {
		space += m->m_len;
		if (m->m_next == NULL)
			break;
	}
	n = m;			/* save pointer to last control buffer */
	for (m = m0; m; m = m->m_next) {
		space += m->m_len;
		eor |= m->m_flags & M_EOR;
		if (eor) {
			if (m->m_next == NULL)
				m->m_flags |= M_EOR;
			else
				m->m_flags &= ~M_EOR;
		}
	}
	if (space > sbspace_locked(so, sb))
		return (0);
	n->m_next = m0;			/* concatenate data to control */

	SBLASTRECORDCHK(sb, "sbappendcontrol 1");

	for (m = control; m->m_next != NULL; m = m->m_next)
		sballoc(so, sb, m);
	sballoc(so, sb, m);
	mlast = m;
	SBLINKRECORD(sb, control);

	sb->sb_mbtail = mlast;
	SBLASTMBUFCHK(sb, "sbappendcontrol");

	SBLASTRECORDCHK(sb, "sbappendcontrol 2");

	return (1);
}
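
/*
 * Illustrative sketch, never compiled: a datagram protocol queueing a
 * received packet together with the sender's address, following the
 * record conventions soreceive() expects.
 */
#if 0
static void
example_dgram_input(struct socket *so, struct mbuf *m,
    struct sockaddr *from)
{
	/* The caller holds the appropriate socket buffer lock. */
	if (sbappendaddr(so, &so->so_rcv, from, m, NULL) == 0) {
		m_freem(m);	/* buffer full or mbuf shortage */
		return;
	}
	sorwakeup(so);
}
#endif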

/*
 * Compress mbuf chain m into the socket
 * buffer sb following mbuf n.  If n
 * is null, the buffer is presumed empty.
 */
void
sbcompress(struct socket *so, struct sockbuf *sb, struct mbuf *m,
    struct mbuf *n)
{
	int eor = 0;
	struct mbuf *o;

	while (m) {
		eor |= m->m_flags & M_EOR;
		if (m->m_len == 0 &&
		    (eor == 0 ||
		    (((o = m->m_next) || (o = n)) &&
		    o->m_type == m->m_type))) {
			if (sb->sb_lastrecord == m)
				sb->sb_lastrecord = m->m_next;
			m = m_free(m);
			continue;
		}
		if (n && (n->m_flags & M_EOR) == 0 &&
		    /* m_trailingspace() checks buffer writeability */
		    m->m_len <= ((n->m_flags & M_EXT)? n->m_ext.ext_size :
		    MCLBYTES) / 4 && /* XXX Don't copy too much */
		    m->m_len <= m_trailingspace(n) &&
		    n->m_type == m->m_type) {
			memcpy(mtod(n, caddr_t) + n->m_len, mtod(m, caddr_t),
			    m->m_len);
			n->m_len += m->m_len;
			sb->sb_cc += m->m_len;
			if (m->m_type != MT_CONTROL && m->m_type != MT_SONAME)
				sb->sb_datacc += m->m_len;
			m = m_free(m);
			continue;
		}
		if (n)
			n->m_next = m;
		else
			sb->sb_mb = m;
		sb->sb_mbtail = m;
		sballoc(so, sb, m);
		n = m;
		m->m_flags &= ~M_EOR;
		m = m->m_next;
		n->m_next = NULL;
	}
	if (eor) {
		if (n)
			n->m_flags |= eor;
		else
			printf("semi-panic: sbcompress");
	}
	SBLASTMBUFCHK(sb, __func__);
}

/*
 * Free all mbufs in a sockbuf.
 * Check that all resources are reclaimed.
 */
void
sbflush(struct socket *so, struct sockbuf *sb)
{
	KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
	rw_assert_unlocked(&sb->sb_lock);

	while (sb->sb_mbcnt)
		sbdrop(so, sb, (int)sb->sb_cc);

	KASSERT(sb->sb_cc == 0);
	KASSERT(sb->sb_datacc == 0);
	KASSERT(sb->sb_mb == NULL);
	KASSERT(sb->sb_mbtail == NULL);
	KASSERT(sb->sb_lastrecord == NULL);
}

/*
 * Drop data from (the front of) a sockbuf.
 */
void
sbdrop(struct socket *so, struct sockbuf *sb, int len)
{
	struct mbuf *m, *mn;
	struct mbuf *next;

	sbmtxassertlocked(so, sb);

	next = (m = sb->sb_mb) ? m->m_nextpkt : NULL;
	while (len > 0) {
		if (m == NULL) {
			if (next == NULL)
				panic("sbdrop");
			m = next;
			next = m->m_nextpkt;
			continue;
		}
		if (m->m_len > len) {
			m->m_len -= len;
			m->m_data += len;
			sb->sb_cc -= len;
			if (m->m_type != MT_CONTROL && m->m_type != MT_SONAME)
				sb->sb_datacc -= len;
			break;
		}
		len -= m->m_len;
		sbfree(so, sb, m);
		mn = m_free(m);
		m = mn;
	}
	while (m && m->m_len == 0) {
		sbfree(so, sb, m);
		mn = m_free(m);
		m = mn;
	}
	if (m) {
		sb->sb_mb = m;
		m->m_nextpkt = next;
	} else
		sb->sb_mb = next;
	/*
	 * First part is an inline SB_EMPTY_FIXUP().  Second part
	 * makes sure sb_lastrecord is up-to-date if we dropped
	 * part of the last record.
	 */
	m = sb->sb_mb;
	if (m == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (m->m_nextpkt == NULL)
		sb->sb_lastrecord = m;
}

/*
 * Drop a record off the front of a sockbuf
 * and move the next record to the front.
 */
void
sbdroprecord(struct socket *so, struct sockbuf *sb)
{
	struct mbuf *m, *mn;

	m = sb->sb_mb;
	if (m) {
		sb->sb_mb = m->m_nextpkt;
		do {
			sbfree(so, sb, m);
			mn = m_free(m);
		} while ((m = mn) != NULL);
	}
	SB_EMPTY_FIXUP(sb);
}
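
/*
 * Illustrative sketch, never compiled: a reliable protocol releasing
 * acknowledged data from the front of its send buffer, as described
 * in the comment above sbappend().
 */
#if 0
static void
example_acked(struct socket *so, int acked)
{
	/* The caller holds the appropriate socket buffer lock. */
	sbdrop(so, &so->so_snd, acked);
	sowwakeup(so);		/* writers may be waiting for space */
}
#endif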

/*
 * Create a "control" mbuf containing the specified data
 * with the specified type for presentation on a socket buffer.
 */
struct mbuf *
sbcreatecontrol(const void *p, size_t size, int type, int level)
{
	struct cmsghdr *cp;
	struct mbuf *m;

	if (CMSG_SPACE(size) > MCLBYTES) {
		printf("sbcreatecontrol: message too large %zu\n", size);
		return (NULL);
	}

	if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL)
		return (NULL);
	if (CMSG_SPACE(size) > MLEN) {
		MCLGET(m, M_DONTWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return NULL;
		}
	}
	cp = mtod(m, struct cmsghdr *);
	memset(cp, 0, CMSG_SPACE(size));
	memcpy(CMSG_DATA(cp), p, size);
	m->m_len = CMSG_SPACE(size);
	cp->cmsg_len = CMSG_LEN(size);
	cp->cmsg_level = level;
	cp->cmsg_type = type;
	return (m);
}
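
/*
 * Illustrative sketch, never compiled: building a control mbuf with
 * sbcreatecontrol(), here carrying a packet receive timestamp as
 * SCM_TIMESTAMP at level SOL_SOCKET.
 */
#if 0
static struct mbuf *
example_timestamp(void)
{
	struct timeval tv;

	getmicrotime(&tv);
	return (sbcreatecontrol(&tv, sizeof(tv), SCM_TIMESTAMP,
	    SOL_SOCKET));
}
#endif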