/*	$OpenBSD: uipc_socket2.c,v 1.164 2025/01/05 12:36:48 bluhm Exp $	*/
/*	$NetBSD: uipc_socket2.c,v 1.11 1996/02/04 02:17:55 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket2.c	8.1 (Berkeley) 6/10/93
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/domain.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/pool.h>

/*
 * Primitive routines for operating on sockets and socket buffers
 */

u_long	sb_max = SB_MAX;		/* [I] patchable */

extern struct pool mclpools[];
extern struct pool mbpool;

/*
 * Procedures to manipulate state flags of socket
 * and do appropriate wakeups.  Normal sequence from the
 * active (originating) side is that soisconnecting() is
 * called during processing of connect() call,
 * resulting in an eventual call to soisconnected() if/when the
 * connection is established.  When the connection is torn down
 * soisdisconnecting() is called during processing of disconnect() call,
 * and soisdisconnected() is called when the connection to the peer
 * is totally severed.  The semantics of these routines are such that
 * connectionless protocols can call soisconnected() and soisdisconnected()
 * only, bypassing the in-progress calls when setting up a ``connection''
 * takes no time.
 *
 * From the passive side, a socket is created with
 * two queues of sockets: so_q0 for connections in progress
 * and so_q for connections already made and awaiting user acceptance.
 * As a protocol is preparing incoming connections, it creates a socket
 * structure queued on so_q0 by calling sonewconn().  When the connection
 * is established, soisconnected() is called, and transfers the
 * socket structure to so_q, making it available to accept().
 *
 * If a socket is closed with sockets on either
 * so_q0 or so_q, these sockets are dropped.
 *
 * If higher level protocols are implemented in
 * the kernel, the wakeups done here will sometimes
 * cause software-interrupt process scheduling.
 */
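
/*
 * Illustrative passive-side lifecycle (a sketch of the sequence described
 * above, not a verbatim excerpt from any protocol; error handling elided):
 *
 *	listener:	socket(); bind(); listen();	SO_ACCEPTCONN set
 *	protocol input:	so = sonewconn(head, 0, M_NOWAIT);  queued on so_q0
 *	handshake done:	soisconnected(so);		moved to so_q
 *	user:		accept() dequeues so from so_q
 */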

void
soisconnecting(struct socket *so)
{
	soassertlocked(so);
	so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= SS_ISCONNECTING;
}

void
soisconnected(struct socket *so)
{
	struct socket *head = so->so_head;

	soassertlocked(so);
	so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING);
	so->so_state |= SS_ISCONNECTED;

	if (head != NULL && so->so_onq == &head->so_q0) {
		int persocket = solock_persocket(so);

		if (persocket) {
			soref(head);

			sounlock(so);
			solock(head);
			solock(so);

			if (so->so_onq != &head->so_q0) {
				sounlock(head);
				sorele(head);
				return;
			}

		}

		soqremque(so, 0);
		soqinsque(head, so, 1);
		sorwakeup(head);
		wakeup_one(&head->so_timeo);

		if (persocket) {
			sounlock(head);
			sorele(head);
		}
	} else {
		wakeup(&so->so_timeo);
		sorwakeup(so);
		sowwakeup(so);
	}
}

void
soisdisconnecting(struct socket *so)
{
	soassertlocked(so);
	so->so_state &= ~SS_ISCONNECTING;
	so->so_state |= SS_ISDISCONNECTING;

	mtx_enter(&so->so_rcv.sb_mtx);
	so->so_rcv.sb_state |= SS_CANTRCVMORE;
	mtx_leave(&so->so_rcv.sb_mtx);

	mtx_enter(&so->so_snd.sb_mtx);
	so->so_snd.sb_state |= SS_CANTSENDMORE;
	mtx_leave(&so->so_snd.sb_mtx);

	wakeup(&so->so_timeo);
	sowwakeup(so);
	sorwakeup(so);
}

void
soisdisconnected(struct socket *so)
{
	soassertlocked(so);

	mtx_enter(&so->so_rcv.sb_mtx);
	so->so_rcv.sb_state |= SS_CANTRCVMORE;
	mtx_leave(&so->so_rcv.sb_mtx);

	mtx_enter(&so->so_snd.sb_mtx);
	so->so_snd.sb_state |= SS_CANTSENDMORE;
	mtx_leave(&so->so_snd.sb_mtx);

	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= SS_ISDISCONNECTED;

	wakeup(&so->so_timeo);
	sowwakeup(so);
	sorwakeup(so);
}

/*
 * When an attempt at a new connection is noted on a socket
 * which accepts connections, sonewconn is called.  If the
 * connection is possible (subject to space constraints, etc.)
 * then we allocate a new structure, properly linked into the
 * data structure of the original socket, and return this.
 * Connstatus may be 0 or SS_ISCONNECTED.
 */
struct socket *
sonewconn(struct socket *head, int connstatus, int wait)
{
	struct socket *so;
	int persocket = solock_persocket(head);
	int soqueue = connstatus ? 1 : 0;

	/*
	 * XXXSMP as long as `so' and `head' share the same lock, we
	 * can call soreserve() and pr_attach() below w/o explicitly
	 * locking `so'.
	 */
	soassertlocked(head);

	if (m_pool_used() > 95)
		return (NULL);
	if (head->so_qlen + head->so_q0len > head->so_qlimit * 3)
		return (NULL);
	so = soalloc(head->so_proto, wait);
	if (so == NULL)
		return (NULL);
	so->so_type = head->so_type;
	so->so_options = head->so_options &~ SO_ACCEPTCONN;
	so->so_linger = head->so_linger;
	so->so_state = head->so_state | SS_NOFDREF;
	so->so_proto = head->so_proto;
	so->so_timeo = head->so_timeo;
	so->so_euid = head->so_euid;
	so->so_ruid = head->so_ruid;
	so->so_egid = head->so_egid;
	so->so_rgid = head->so_rgid;
	so->so_cpid = head->so_cpid;

	/*
	 * Lock order will be `head' -> `so' while these sockets are linked.
	 */
	if (persocket)
		solock(so);

	/*
	 * Inherit watermarks but those may get clamped in low mem situations.
	 */
	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat))
		goto fail;

	mtx_enter(&head->so_snd.sb_mtx);
	so->so_snd.sb_wat = head->so_snd.sb_wat;
	so->so_snd.sb_lowat = head->so_snd.sb_lowat;
	so->so_snd.sb_timeo_nsecs = head->so_snd.sb_timeo_nsecs;
	mtx_leave(&head->so_snd.sb_mtx);

	mtx_enter(&head->so_rcv.sb_mtx);
	so->so_rcv.sb_wat = head->so_rcv.sb_wat;
	so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
	so->so_rcv.sb_timeo_nsecs = head->so_rcv.sb_timeo_nsecs;
	mtx_leave(&head->so_rcv.sb_mtx);

	sigio_copy(&so->so_sigio, &head->so_sigio);

	soqinsque(head, so, soqueue);
	if (pru_attach(so, 0, wait) != 0) {
		soqremque(so, soqueue);
		goto fail;
	}
	if (connstatus) {
		so->so_state |= connstatus;
		sorwakeup(head);
		wakeup(&head->so_timeo);
	}

	if (persocket)
		sounlock(so);

	return (so);

fail:
	if (persocket)
		sounlock(so);
	sigio_free(&so->so_sigio);
	klist_free(&so->so_rcv.sb_klist);
	klist_free(&so->so_snd.sb_klist);
	pool_put(&socket_pool, so);

	return (NULL);
}

void
soqinsque(struct socket *head, struct socket *so, int q)
{
	soassertlocked(head);
	soassertlocked(so);

	KASSERT(so->so_onq == NULL);

	so->so_head = head;
	if (q == 0) {
		head->so_q0len++;
		so->so_onq = &head->so_q0;
	} else {
		head->so_qlen++;
		so->so_onq = &head->so_q;
	}
	TAILQ_INSERT_TAIL(so->so_onq, so, so_qe);
}

int
soqremque(struct socket *so, int q)
{
	struct socket *head = so->so_head;

	soassertlocked(so);
	soassertlocked(head);

	if (q == 0) {
		if (so->so_onq != &head->so_q0)
			return (0);
		head->so_q0len--;
	} else {
		if (so->so_onq != &head->so_q)
			return (0);
		head->so_qlen--;
	}
	TAILQ_REMOVE(so->so_onq, so, so_qe);
	so->so_onq = NULL;
	so->so_head = NULL;
	return (1);
}

/*
 * Socantsendmore indicates that no more data will be sent on the
 * socket; it is normally applied to a socket by the protocol code
 * when the user informs the system that no more data is to be sent
 * (in the case of PRU_SHUTDOWN).  Socantrcvmore indicates that no
 * more data will be received, and will normally be applied to the
 * socket by a protocol when it detects that the peer will send no
 * more data.  Data queued for reading in the socket may yet be read.
 */
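
/*
 * Usage sketch (illustrative; not taken from a particular protocol):
 *
 *	shutdown(SHUT_WR) path:			socantsendmore(so);
 *	peer will send no more (e.g. TCP FIN):	socantrcvmore(so);
 *	data already queued in so_rcv remains readable.
 */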

void
socantsendmore(struct socket *so)
{
	soassertlocked(so);
	mtx_enter(&so->so_snd.sb_mtx);
	so->so_snd.sb_state |= SS_CANTSENDMORE;
	mtx_leave(&so->so_snd.sb_mtx);
	sowwakeup(so);
}

void
socantrcvmore(struct socket *so)
{
	if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0)
		soassertlocked(so);

	mtx_enter(&so->so_rcv.sb_mtx);
	so->so_rcv.sb_state |= SS_CANTRCVMORE;
	mtx_leave(&so->so_rcv.sb_mtx);
	sorwakeup(so);
}

/*
 * Acquire the exclusive socket lock: the net lock for inet sockets,
 * the per-socket rwlock otherwise.
 */
void
solock(struct socket *so)
{
	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		NET_LOCK();
		break;
	default:
		rw_enter_write(&so->so_lock);
		break;
	}
}

/*
 * Like solock(), but inet sockets take the shared net lock together
 * with the per-socket rwlock.
 */
void
solock_shared(struct socket *so)
{
	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		NET_LOCK_SHARED();
		rw_enter_write(&so->so_lock);
		break;
	default:
		rw_enter_write(&so->so_lock);
		break;
	}
}

/*
 * Return non-zero if the socket is protected by its own rwlock only,
 * i.e. does not rely on the net lock.
 */
int
solock_persocket(struct socket *so)
{
	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		return 0;
	default:
		return 1;
	}
}

/*
 * Lock two per-socket-locked sockets in a consistent (address) order
 * to avoid deadlock.
 */
void
solock_pair(struct socket *so1, struct socket *so2)
{
	KASSERT(so1 != so2);
	KASSERT(so1->so_type == so2->so_type);
	KASSERT(solock_persocket(so1));

	if (so1 < so2) {
		solock(so1);
		solock(so2);
	} else {
		solock(so2);
		solock(so1);
	}
}

void
sounlock(struct socket *so)
{
	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		NET_UNLOCK();
		break;
	default:
		rw_exit_write(&so->so_lock);
		break;
	}
}

void
sounlock_shared(struct socket *so)
{
	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		rw_exit_write(&so->so_lock);
		NET_UNLOCK_SHARED();
		break;
	default:
		rw_exit_write(&so->so_lock);
		break;
	}
}

void
soassertlocked_readonly(struct socket *so)
{
	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		NET_ASSERT_LOCKED();
		break;
	default:
		rw_assert_wrlock(&so->so_lock);
		break;
	}
}

void
soassertlocked(struct socket *so)
{
	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		if (rw_status(&netlock) == RW_READ) {
			NET_ASSERT_LOCKED();

			if (splassert_ctl > 0 &&
			    rw_status(&so->so_lock) != RW_WRITE)
				splassert_fail(0, RW_WRITE, __func__);
		} else
			NET_ASSERT_LOCKED_EXCLUSIVE();
		break;
	default:
		rw_assert_wrlock(&so->so_lock);
		break;
	}
}

/*
 * Sleep on ident, releasing the socket lock for the duration.
 */
int
sosleep_nsec(struct socket *so, void *ident, int prio, const char *wmesg,
    uint64_t nsecs)
{
	int ret;

	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		if (rw_status(&netlock) == RW_READ)
			rw_exit_write(&so->so_lock);
		ret = rwsleep_nsec(ident, &netlock, prio, wmesg, nsecs);
		if (rw_status(&netlock) == RW_READ)
			rw_enter_write(&so->so_lock);
		break;
	default:
		ret = rwsleep_nsec(ident, &so->so_lock, prio, wmesg, nsecs);
		break;
	}

	return ret;
}

void
sbmtxassertlocked(struct socket *so, struct sockbuf *sb)
{
	if (sb->sb_flags & SB_MTXLOCK) {
		if (splassert_ctl > 0 && mtx_owned(&sb->sb_mtx) == 0)
			splassert_fail(0, RW_WRITE, __func__);
	} else
		soassertlocked(so);
}
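
/*
 * Usage sketch for the locking helpers above (illustrative only):
 * two per-socket-locked sockets, e.g. a unix(4) socketpair, are
 * locked through solock_pair() and released individually:
 *
 *	solock_pair(so1, so2);
 *	...				both sockets locked here
 *	sounlock(so2);
 *	sounlock(so1);
 */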

/*
 * Wait for data to arrive at/drain from a socket buffer.
 */
int
sbwait(struct socket *so, struct sockbuf *sb)
{
	uint64_t timeo_nsecs;
	int prio = (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH;

	if (sb->sb_flags & SB_MTXLOCK) {
		MUTEX_ASSERT_LOCKED(&sb->sb_mtx);

		sb->sb_flags |= SB_WAIT;
		return msleep_nsec(&sb->sb_cc, &sb->sb_mtx, prio, "sbwait",
		    sb->sb_timeo_nsecs);
	}

	soassertlocked(so);

	mtx_enter(&sb->sb_mtx);
	timeo_nsecs = sb->sb_timeo_nsecs;
	sb->sb_flags |= SB_WAIT;
	mtx_leave(&sb->sb_mtx);

	return sosleep_nsec(so, &sb->sb_cc, prio, "netio", timeo_nsecs);
}

int
sblock(struct sockbuf *sb, int flags)
{
	int rwflags = RW_WRITE, error;

	if (!(flags & SBL_NOINTR || sb->sb_flags & SB_NOINTR))
		rwflags |= RW_INTR;
	if (!(flags & SBL_WAIT))
		rwflags |= RW_NOSLEEP;

	error = rw_enter(&sb->sb_lock, rwflags);
	if (error == EBUSY)
		error = EWOULDBLOCK;

	return error;
}

void
sbunlock(struct sockbuf *sb)
{
	rw_exit(&sb->sb_lock);
}

/*
 * Wakeup processes waiting on a socket buffer.
 * Do asynchronous notification via SIGIO
 * if the socket buffer has the SB_ASYNC flag set.
 */
void
sowakeup(struct socket *so, struct sockbuf *sb)
{
	int dowakeup = 0, dopgsigio = 0;

	mtx_enter(&sb->sb_mtx);
	if (sb->sb_flags & SB_WAIT) {
		sb->sb_flags &= ~SB_WAIT;
		dowakeup = 1;
	}
	if (sb->sb_flags & SB_ASYNC)
		dopgsigio = 1;

	knote_locked(&sb->sb_klist, 0);
	mtx_leave(&sb->sb_mtx);

	if (dowakeup)
		wakeup(&sb->sb_cc);

	if (dopgsigio)
		pgsigio(&so->so_sigio, SIGIO, 0);
}

/*
 * Socket buffer (struct sockbuf) utility routines.
 *
 * Each socket contains two socket buffers: one for sending data and
 * one for receiving data.  Each buffer contains a queue of mbufs,
 * information about the number of mbufs and amount of data in the
 * queue, and other fields allowing select() statements and notification
 * on data availability to be implemented.
 *
 * Data stored in a socket buffer is maintained as a list of records.
 * Each record is a list of mbufs chained together with the m_next
 * field.  Records are chained together with the m_nextpkt field.  The upper
 * level routine soreceive() expects the following conventions to be
 * observed when placing information in the receive buffer:
 *
 * 1. If the protocol requires each message be preceded by the sender's
 *    name, then a record containing that name must be present before
 *    any associated data (mbuf's must be of type MT_SONAME).
 * 2. If the protocol supports the exchange of ``access rights'' (really
 *    just additional data associated with the message), and there are
 *    ``rights'' to be received, then a record containing this data
 *    should be present (mbuf's must be of type MT_CONTROL).
 * 3. If a name or rights record exists, then it must be followed by
 *    a data record, perhaps of zero length.
 *
 * Before using a new socket structure it is first necessary to reserve
 * buffer space to the socket, by calling sbreserve().  This should commit
 * some of the available buffer space in the system buffer pool for the
 * socket (currently, it does nothing but enforce limits).  The space
 * should be released by calling sbrelease() when the socket is destroyed.
 */
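
/*
 * Record layout sketch following the conventions above (illustrative):
 * two datagrams queued in a receive buffer, each record a chain of
 * mbufs linked by m_next, the records linked by m_nextpkt:
 *
 *	sb_mb -> [MT_SONAME]-m_next->[MT_CONTROL]-m_next->[MT_DATA]
 *	             |m_nextpkt
 *	             v
 *	         [MT_SONAME]-m_next->[MT_DATA]	<- sb_lastrecord
 */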

int
soreserve(struct socket *so, u_long sndcc, u_long rcvcc)
{
	soassertlocked(so);

	mtx_enter(&so->so_rcv.sb_mtx);
	mtx_enter(&so->so_snd.sb_mtx);
	if (sbreserve(so, &so->so_snd, sndcc))
		goto bad;
	so->so_snd.sb_wat = sndcc;
	if (so->so_snd.sb_lowat == 0)
		so->so_snd.sb_lowat = MCLBYTES;
	if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
		so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
	if (sbreserve(so, &so->so_rcv, rcvcc))
		goto bad2;
	so->so_rcv.sb_wat = rcvcc;
	if (so->so_rcv.sb_lowat == 0)
		so->so_rcv.sb_lowat = 1;
	mtx_leave(&so->so_snd.sb_mtx);
	mtx_leave(&so->so_rcv.sb_mtx);

	return (0);
bad2:
	sbrelease(so, &so->so_snd);
bad:
	mtx_leave(&so->so_snd.sb_mtx);
	mtx_leave(&so->so_rcv.sb_mtx);
	return (ENOBUFS);
}

/*
 * Allot mbufs to a sockbuf.
 * Attempt to scale mbmax so that mbcnt doesn't become limiting
 * if buffering efficiency is near the normal case.
 */
int
sbreserve(struct socket *so, struct sockbuf *sb, u_long cc)
{
	sbmtxassertlocked(so, sb);

	if (cc == 0 || cc > sb_max)
		return (1);
	sb->sb_hiwat = cc;
	sb->sb_mbmax = max(3 * MAXMCLBYTES, cc * 8);
	if (sb->sb_lowat > sb->sb_hiwat)
		sb->sb_lowat = sb->sb_hiwat;
	return (0);
}

/*
 * In a low memory situation, do not accept any request greater than normal.
 */
int
sbcheckreserve(u_long cnt, u_long defcnt)
{
	if (cnt > defcnt && sbchecklowmem())
		return (ENOBUFS);
	return (0);
}

int
sbchecklowmem(void)
{
	static int sblowmem;
	unsigned int used;

	/*
	 * m_pool_used() is thread safe.  Global variable sblowmem is updated
	 * by multiple CPUs, but most times with the same value.  And even
	 * if the value is not correct for a short time, it does not matter.
	 */
	used = m_pool_used();
	if (used < 60)
		atomic_store_int(&sblowmem, 0);
	else if (used > 80)
		atomic_store_int(&sblowmem, 1);

	return (atomic_load_int(&sblowmem));
}

/*
 * Free mbufs held by a socket, and reserved mbuf space.
 */
void
sbrelease(struct socket *so, struct sockbuf *sb)
{
	sbflush(so, sb);
	sb->sb_hiwat = sb->sb_mbmax = 0;
}

/*
 * Routines to add and remove
 * data from an mbuf queue.
 *
 * The routines sbappend() or sbappendrecord() are normally called to
 * append new mbufs to a socket buffer, after checking that adequate
 * space is available, comparing the function sbspace() with the amount
 * of data to be added.  sbappendrecord() differs from sbappend() in
 * that data supplied is treated as the beginning of a new record.
 * To place a sender's address, optional access rights, and data in a
 * socket receive buffer, sbappendaddr() should be used.  To place
 * access rights and data in a socket receive buffer, sbappendcontrol()
 * should be used.  In either case, the new data begins a new record.
 * Note that unlike sbappend() and sbappendrecord(), these routines check
 * for the caller that there will be enough space to store the data.
 * Each fails if there is not enough space, or if it cannot find mbufs
 * to store additional information in.
 *
 * Reliable protocols may use the socket send buffer to hold data
 * awaiting acknowledgement.  Data is normally copied from a socket
 * send buffer in a protocol with m_copym for output to a peer, and
 * then removed from the socket buffer with sbdrop() or sbdroprecord()
 * when the data is acknowledged by the peer.
 */
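
/*
 * Sketch of that send-buffer cycle for a reliable protocol
 * (illustrative pseudo-code, not taken from a particular protocol):
 *
 *	if (sbspace(so, &so->so_snd) >= len)
 *		sbappend(so, &so->so_snd, m);	   queue for transmission
 *	...
 *	m = m_copym(so->so_snd.sb_mb, off, len, M_DONTWAIT);   (re)send
 *	...
 *	sbdrop(so, &so->so_snd, acked);		   peer acknowledged
 */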

#ifdef SOCKBUF_DEBUG
void
sblastrecordchk(struct sockbuf *sb, const char *where)
{
	struct mbuf *m = sb->sb_mb;

	while (m && m->m_nextpkt)
		m = m->m_nextpkt;

	if (m != sb->sb_lastrecord) {
		printf("sblastrecordchk: sb_mb %p sb_lastrecord %p last %p\n",
		    sb->sb_mb, sb->sb_lastrecord, m);
		printf("packet chain:\n");
		for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt)
			printf("\t%p\n", m);
		panic("sblastrecordchk from %s", where);
	}
}

void
sblastmbufchk(struct sockbuf *sb, const char *where)
{
	struct mbuf *m = sb->sb_mb;
	struct mbuf *n;

	while (m && m->m_nextpkt)
		m = m->m_nextpkt;

	while (m && m->m_next)
		m = m->m_next;

	if (m != sb->sb_mbtail) {
		printf("sblastmbufchk: sb_mb %p sb_mbtail %p last %p\n",
		    sb->sb_mb, sb->sb_mbtail, m);
		printf("packet tree:\n");
		for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) {
			printf("\t");
			for (n = m; n != NULL; n = n->m_next)
				printf("%p ", n);
			printf("\n");
		}
		panic("sblastmbufchk from %s", where);
	}
}
#endif /* SOCKBUF_DEBUG */

#define	SBLINKRECORD(sb, m0)						\
do {									\
	if ((sb)->sb_lastrecord != NULL)				\
		(sb)->sb_lastrecord->m_nextpkt = (m0);			\
	else								\
		(sb)->sb_mb = (m0);					\
	(sb)->sb_lastrecord = (m0);					\
} while (/*CONSTCOND*/0)

/*
 * Append mbuf chain m to the last record in the
 * socket buffer sb.  The additional space associated with the
 * mbuf chain is recorded in sb.  Empty mbufs are
 * discarded and mbufs are compacted where possible.
 */
void
sbappend(struct socket *so, struct sockbuf *sb, struct mbuf *m)
{
	struct mbuf *n;

	if (m == NULL)
		return;

	sbmtxassertlocked(so, sb);
	SBLASTRECORDCHK(sb, "sbappend 1");

	if ((n = sb->sb_lastrecord) != NULL) {
		/*
		 * XXX Would like to simply use sb_mbtail here, but
		 * XXX I need to verify that I won't miss an EOR that
		 * XXX way.
		 */
		do {
			if (n->m_flags & M_EOR) {
				sbappendrecord(so, sb, m); /* XXXXXX!!!! */
				return;
			}
		} while (n->m_next && (n = n->m_next));
	} else {
		/*
		 * If this is the first record in the socket buffer, it's
		 * also the last record.
		 */
		sb->sb_lastrecord = m;
	}
	sbcompress(so, sb, m, n);
	SBLASTRECORDCHK(sb, "sbappend 2");
}

/*
 * This version of sbappend() should only be used when the caller
 * absolutely knows that there will never be more than one record
 * in the socket buffer, that is, a stream protocol (such as TCP).
 */
void
sbappendstream(struct socket *so, struct sockbuf *sb, struct mbuf *m)
{
	sbmtxassertlocked(so, sb);
	KDASSERT(m->m_nextpkt == NULL);
	KASSERT(sb->sb_mb == sb->sb_lastrecord);

	SBLASTMBUFCHK(sb, __func__);

	sbcompress(so, sb, m, sb->sb_mbtail);

	sb->sb_lastrecord = sb->sb_mb;
	SBLASTRECORDCHK(sb, __func__);
}

#ifdef SOCKBUF_DEBUG
void
sbcheck(struct socket *so, struct sockbuf *sb)
{
	struct mbuf *m, *n;
	u_long len = 0, mbcnt = 0;

	for (m = sb->sb_mb; m; m = m->m_nextpkt) {
		for (n = m; n; n = n->m_next) {
			len += n->m_len;
			mbcnt += MSIZE;
			if (n->m_flags & M_EXT)
				mbcnt += n->m_ext.ext_size;
			if (m != n && n->m_nextpkt)
				panic("sbcheck nextpkt");
		}
	}
	if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
		printf("cc %lu != %lu || mbcnt %lu != %lu\n", len, sb->sb_cc,
		    mbcnt, sb->sb_mbcnt);
		panic("sbcheck");
	}
}
#endif

/*
 * As above, except the mbuf chain
 * begins a new record.
 */
void
sbappendrecord(struct socket *so, struct sockbuf *sb, struct mbuf *m0)
{
	struct mbuf *m;

	sbmtxassertlocked(so, sb);

	if (m0 == NULL)
		return;

	/*
	 * Put the first mbuf on the queue.
	 * Note this permits zero length records.
	 */
	sballoc(so, sb, m0);
	SBLASTRECORDCHK(sb, "sbappendrecord 1");
	SBLINKRECORD(sb, m0);
	m = m0->m_next;
	m0->m_next = NULL;
	if (m && (m0->m_flags & M_EOR)) {
		m0->m_flags &= ~M_EOR;
		m->m_flags |= M_EOR;
	}
	sbcompress(so, sb, m, m0);
	SBLASTRECORDCHK(sb, "sbappendrecord 2");
}

/*
 * Append address and data, and optionally, control (ancillary) data
 * to the receive queue of a socket.  If present,
 * m0 must include a packet header with total length.
 * Returns 0 if no space in sockbuf or insufficient mbufs.
 */
int
sbappendaddr(struct socket *so, struct sockbuf *sb, const struct sockaddr *asa,
    struct mbuf *m0, struct mbuf *control)
{
	struct mbuf *m, *n, *nlast;
	int space = asa->sa_len;

	sbmtxassertlocked(so, sb);

	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
		panic("sbappendaddr");
	if (m0)
		space += m0->m_pkthdr.len;
	for (n = control; n; n = n->m_next) {
		space += n->m_len;
		if (n->m_next == NULL)	/* keep pointer to last control buf */
			break;
	}
	if (space > sbspace_locked(so, sb))
		return (0);
	if (asa->sa_len > MLEN)
		return (0);
	MGET(m, M_DONTWAIT, MT_SONAME);
	if (m == NULL)
		return (0);
	m->m_len = asa->sa_len;
	memcpy(mtod(m, caddr_t), asa, asa->sa_len);
	if (n)
		n->m_next = m0;		/* concatenate data to control */
	else
		control = m0;
	m->m_next = control;

	SBLASTRECORDCHK(sb, "sbappendaddr 1");

	for (n = m; n->m_next != NULL; n = n->m_next)
		sballoc(so, sb, n);
	sballoc(so, sb, n);
	nlast = n;
	SBLINKRECORD(sb, m);

	sb->sb_mbtail = nlast;
	SBLASTMBUFCHK(sb, "sbappendaddr");

	SBLASTRECORDCHK(sb, "sbappendaddr 2");

	return (1);
}

int
sbappendcontrol(struct socket *so, struct sockbuf *sb, struct mbuf *m0,
    struct mbuf *control)
{
	struct mbuf *m, *mlast, *n;
	int eor = 0, space = 0;

	sbmtxassertlocked(so, sb);

	if (control == NULL)
		panic("sbappendcontrol");
	for (m = control; ; m = m->m_next) {
		space += m->m_len;
		if (m->m_next == NULL)
			break;
	}
	n = m;			/* save pointer to last control buffer */
	for (m = m0; m; m = m->m_next) {
		space += m->m_len;
		eor |= m->m_flags & M_EOR;
		if (eor) {
			if (m->m_next == NULL)
				m->m_flags |= M_EOR;
			else
				m->m_flags &= ~M_EOR;
		}
	}
	if (space > sbspace_locked(so, sb))
		return (0);
	n->m_next = m0;		/* concatenate data to control */

	SBLASTRECORDCHK(sb, "sbappendcontrol 1");

	for (m = control; m->m_next != NULL; m = m->m_next)
		sballoc(so, sb, m);
	sballoc(so, sb, m);
	mlast = m;
	SBLINKRECORD(sb, control);

	sb->sb_mbtail = mlast;
	SBLASTMBUFCHK(sb, "sbappendcontrol");

	SBLASTRECORDCHK(sb, "sbappendcontrol 2");

	return (1);
}
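
/*
 * Usage sketch (illustrative, modeled on a datagram input path; the
 * srcsa and opts variables are assumptions of the example):
 *
 *	if (sbappendaddr(so, &so->so_rcv, srcsa, m, opts) == 0) {
 *		m_freem(m);		 no room: drop the datagram
 *		m_freem(opts);
 *	} else
 *		sorwakeup(so);
 */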

/*
 * Compress mbuf chain m into the socket
 * buffer sb following mbuf n.  If n
 * is null, the buffer is presumed empty.
 */
void
sbcompress(struct socket *so, struct sockbuf *sb, struct mbuf *m,
    struct mbuf *n)
{
	int eor = 0;
	struct mbuf *o;

	while (m) {
		eor |= m->m_flags & M_EOR;
		if (m->m_len == 0 &&
		    (eor == 0 ||
		    (((o = m->m_next) || (o = n)) &&
		    o->m_type == m->m_type))) {
			if (sb->sb_lastrecord == m)
				sb->sb_lastrecord = m->m_next;
			m = m_free(m);
			continue;
		}
		if (n && (n->m_flags & M_EOR) == 0 &&
		    /* m_trailingspace() checks buffer writeability */
		    m->m_len <= ((n->m_flags & M_EXT)? n->m_ext.ext_size :
		    MCLBYTES) / 4 && /* XXX Don't copy too much */
		    m->m_len <= m_trailingspace(n) &&
		    n->m_type == m->m_type) {
			memcpy(mtod(n, caddr_t) + n->m_len, mtod(m, caddr_t),
			    m->m_len);
			n->m_len += m->m_len;
			sb->sb_cc += m->m_len;
			if (m->m_type != MT_CONTROL && m->m_type != MT_SONAME)
				sb->sb_datacc += m->m_len;
			m = m_free(m);
			continue;
		}
		if (n)
			n->m_next = m;
		else
			sb->sb_mb = m;
		sb->sb_mbtail = m;
		sballoc(so, sb, m);
		n = m;
		m->m_flags &= ~M_EOR;
		m = m->m_next;
		n->m_next = NULL;
	}
	if (eor) {
		if (n)
			n->m_flags |= eor;
		else
			printf("semi-panic: sbcompress");
	}
	SBLASTMBUFCHK(sb, __func__);
}

/*
 * Free all mbufs in a sockbuf.
 * Check that all resources are reclaimed.
 */
void
sbflush(struct socket *so, struct sockbuf *sb)
{
	KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
	rw_assert_unlocked(&sb->sb_lock);

	while (sb->sb_mbcnt)
		sbdrop(so, sb, (int)sb->sb_cc);

	KASSERT(sb->sb_cc == 0);
	KASSERT(sb->sb_datacc == 0);
	KASSERT(sb->sb_mb == NULL);
	KASSERT(sb->sb_mbtail == NULL);
	KASSERT(sb->sb_lastrecord == NULL);
}

/*
 * Drop data from (the front of) a sockbuf.
 */
void
sbdrop(struct socket *so, struct sockbuf *sb, int len)
{
	struct mbuf *m, *mn;
	struct mbuf *next;

	sbmtxassertlocked(so, sb);

	next = (m = sb->sb_mb) ? m->m_nextpkt : NULL;
	while (len > 0) {
		if (m == NULL) {
			if (next == NULL)
				panic("sbdrop");
			m = next;
			next = m->m_nextpkt;
			continue;
		}
		if (m->m_len > len) {
			m->m_len -= len;
			m->m_data += len;
			sb->sb_cc -= len;
			if (m->m_type != MT_CONTROL && m->m_type != MT_SONAME)
				sb->sb_datacc -= len;
			break;
		}
		len -= m->m_len;
		sbfree(so, sb, m);
		mn = m_free(m);
		m = mn;
	}
	while (m && m->m_len == 0) {
		sbfree(so, sb, m);
		mn = m_free(m);
		m = mn;
	}
	if (m) {
		sb->sb_mb = m;
		m->m_nextpkt = next;
	} else
		sb->sb_mb = next;
	/*
	 * First part is an inline SB_EMPTY_FIXUP().  Second part
	 * makes sure sb_lastrecord is up-to-date if we dropped
	 * part of the last record.
	 */
	m = sb->sb_mb;
	if (m == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (m->m_nextpkt == NULL)
		sb->sb_lastrecord = m;
}

/*
 * Drop a record off the front of a sockbuf
 * and move the next record to the front.
 */
void
sbdroprecord(struct socket *so, struct sockbuf *sb)
{
	struct mbuf *m, *mn;

	m = sb->sb_mb;
	if (m) {
		sb->sb_mb = m->m_nextpkt;
		do {
			sbfree(so, sb, m);
			mn = m_free(m);
		} while ((m = mn) != NULL);
	}
	SB_EMPTY_FIXUP(sb);
}

/*
 * Create a "control" mbuf containing the specified data
 * with the specified type for presentation on a socket buffer.
 */
struct mbuf *
sbcreatecontrol(const void *p, size_t size, int type, int level)
{
	struct cmsghdr *cp;
	struct mbuf *m;

	if (CMSG_SPACE(size) > MCLBYTES) {
		printf("sbcreatecontrol: message too large %zu\n", size);
		return (NULL);
	}

	if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL)
		return (NULL);
	if (CMSG_SPACE(size) > MLEN) {
		MCLGET(m, M_DONTWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return NULL;
		}
	}
	cp = mtod(m, struct cmsghdr *);
	memset(cp, 0, CMSG_SPACE(size));
	memcpy(CMSG_DATA(cp), p, size);
	m->m_len = CMSG_SPACE(size);
	cp->cmsg_len = CMSG_LEN(size);
	cp->cmsg_level = level;
	cp->cmsg_type = type;
	return (m);
}
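
/*
 * Usage sketch (illustrative): a protocol delivering an SCM_TIMESTAMP
 * cmsg could build the control mbuf roughly as follows, where tv is a
 * struct timeval filled in by the caller:
 *
 *	control = sbcreatecontrol(&tv, sizeof(tv), SCM_TIMESTAMP,
 *	    SOL_SOCKET);
 *	if (control != NULL)
 *		... hand off to sbappendaddr() or sbappendcontrol() ...
 */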