/*	$OpenBSD: uipc_socket2.c,v 1.171 2025/01/27 14:57:13 mvs Exp $	*/
/*	$NetBSD: uipc_socket2.c,v 1.11 1996/02/04 02:17:55 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket2.c	8.1 (Berkeley) 6/10/93
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/domain.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/pool.h>

/*
 * Primitive routines for operating on sockets and socket buffers
 */

u_long	sb_max = SB_MAX;		/* [I] patchable */

extern struct pool mclpools[];
extern struct pool mbpool;

/*
 * Procedures to manipulate the state flags of a socket
 * and do appropriate wakeups.  The normal sequence from the
 * active (originating) side is that soisconnecting() is
 * called during processing of a connect() call,
 * resulting in an eventual call to soisconnected() if/when the
 * connection is established.  When the connection is torn down
 * soisdisconnecting() is called during processing of a disconnect() call,
 * and soisdisconnected() is called when the connection to the peer
 * is totally severed.  The semantics of these routines are such that
 * connectionless protocols can call soisconnected() and soisdisconnected()
 * only, bypassing the in-progress calls when setting up a ``connection''
 * takes no time.
 *
 * From the passive side, a socket is created with
 * two queues of sockets: so_q0 for connections in progress
 * and so_q for connections already made and awaiting user acceptance.
 * As a protocol is preparing incoming connections, it creates a socket
 * structure queued on so_q0 by calling sonewconn().  When the connection
 * is established, soisconnected() is called, and transfers the
 * socket structure to so_q, making it available to accept().
 *
 * If a socket is closed with sockets on either
 * so_q0 or so_q, these sockets are dropped.
 *
 * If higher level protocols are implemented in
 * the kernel, the wakeups done here will sometimes
 * cause software-interrupt process scheduling.
 */
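
/*
 * Illustrative sketch (not compiled): how a connection-oriented
 * protocol might drive the active-side transitions described above.
 * The surrounding protocol events are hypothetical; only the state
 * routines are real, and `so' stands for the connecting socket.
 */
#if 0
	solock(so);
	soisconnecting(so);	/* connect() issued, handshake begun */
	/* ... the protocol exchanges its handshake with the peer ... */
	soisconnected(so);	/* wakes sleepers in connect()/poll() */
	sounlock(so);
#endif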

void
soisconnecting(struct socket *so)
{
	soassertlocked(so);
	so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= SS_ISCONNECTING;
}

void
soisconnected(struct socket *so)
{
	struct socket *head = so->so_head;

	soassertlocked(so);
	so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING);
	so->so_state |= SS_ISCONNECTED;

	if (head != NULL && so->so_onq == &head->so_q0) {
		soref(head);
		sounlock(so);
		solock(head);
		solock(so);

		if (so->so_onq != &head->so_q0) {
			sounlock(head);
			sorele(head);
			return;
		}

		soqremque(so, 0);
		soqinsque(head, so, 1);
		sorwakeup(head);
		wakeup_one(&head->so_timeo);

		sounlock(head);
		sorele(head);
	} else {
		wakeup(&so->so_timeo);
		sorwakeup(so);
		sowwakeup(so);
	}
}

void
soisdisconnecting(struct socket *so)
{
	soassertlocked(so);
	so->so_state &= ~SS_ISCONNECTING;
	so->so_state |= SS_ISDISCONNECTING;

	mtx_enter(&so->so_rcv.sb_mtx);
	so->so_rcv.sb_state |= SS_CANTRCVMORE;
	mtx_leave(&so->so_rcv.sb_mtx);

	mtx_enter(&so->so_snd.sb_mtx);
	so->so_snd.sb_state |= SS_CANTSENDMORE;
	mtx_leave(&so->so_snd.sb_mtx);

	wakeup(&so->so_timeo);
	sowwakeup(so);
	sorwakeup(so);
}

void
soisdisconnected(struct socket *so)
{
	soassertlocked(so);

	mtx_enter(&so->so_rcv.sb_mtx);
	so->so_rcv.sb_state |= SS_CANTRCVMORE;
	mtx_leave(&so->so_rcv.sb_mtx);

	mtx_enter(&so->so_snd.sb_mtx);
	so->so_snd.sb_state |= SS_CANTSENDMORE;
	mtx_leave(&so->so_snd.sb_mtx);

	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= SS_ISDISCONNECTED;

	wakeup(&so->so_timeo);
	sowwakeup(so);
	sorwakeup(so);
}
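
/*
 * Illustrative sketch (not compiled): the teardown mirror of the
 * connect-side sequence, as a protocol might invoke it.  The events
 * in the comments are hypothetical protocol input.
 */
#if 0
	solock(so);
	soisdisconnecting(so);	/* teardown begun; no more I/O queued */
	/* ... the final protocol exchange with the peer completes ... */
	soisdisconnected(so);	/* connection fully severed */
	sounlock(so);
#endif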

/*
 * When an attempt at a new connection is noted on a socket
 * which accepts connections, sonewconn is called.  If the
 * connection is possible (subject to space constraints, etc.)
 * then we allocate a new structure, properly linked into the
 * data structure of the original socket, and return this.
 * Connstatus may be 0 or SS_ISCONNECTED.
 */
struct socket *
sonewconn(struct socket *head, int connstatus, int wait)
{
	struct socket *so;
	int soqueue = connstatus ? 1 : 0;

	soassertlocked(head);

	if (m_pool_used() > 95)
		return (NULL);
	if (head->so_qlen + head->so_q0len > head->so_qlimit * 3)
		return (NULL);
	so = soalloc(head->so_proto, wait);
	if (so == NULL)
		return (NULL);
	so->so_type = head->so_type;
	so->so_options = head->so_options &~ SO_ACCEPTCONN;
	so->so_linger = head->so_linger;
	so->so_state = head->so_state | SS_NOFDREF;
	so->so_proto = head->so_proto;
	so->so_timeo = head->so_timeo;
	so->so_euid = head->so_euid;
	so->so_ruid = head->so_ruid;
	so->so_egid = head->so_egid;
	so->so_rgid = head->so_rgid;
	so->so_cpid = head->so_cpid;

	/*
	 * Lock order will be `head' -> `so' while these sockets are linked.
	 */
	solock_nonet(so);

	/*
	 * Inherit watermarks but those may get clamped in low mem situations.
	 */
	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat))
		goto fail;

	mtx_enter(&head->so_snd.sb_mtx);
	so->so_snd.sb_wat = head->so_snd.sb_wat;
	so->so_snd.sb_lowat = head->so_snd.sb_lowat;
	so->so_snd.sb_timeo_nsecs = head->so_snd.sb_timeo_nsecs;
	mtx_leave(&head->so_snd.sb_mtx);

	mtx_enter(&head->so_rcv.sb_mtx);
	so->so_rcv.sb_wat = head->so_rcv.sb_wat;
	so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
	so->so_rcv.sb_timeo_nsecs = head->so_rcv.sb_timeo_nsecs;
	mtx_leave(&head->so_rcv.sb_mtx);

	sigio_copy(&so->so_sigio, &head->so_sigio);

	soqinsque(head, so, soqueue);
	if (pru_attach(so, 0, wait) != 0) {
		soqremque(so, soqueue);
		goto fail;
	}
	if (connstatus) {
		so->so_state |= connstatus;
		sorwakeup(head);
		wakeup(&head->so_timeo);
	}

	return (so);

fail:
	sounlock_nonet(so);
	sigio_free(&so->so_sigio);
	klist_free(&so->so_rcv.sb_klist);
	klist_free(&so->so_snd.sb_klist);
	pool_put(&socket_pool, so);

	return (NULL);
}

void
soqinsque(struct socket *head, struct socket *so, int q)
{
	soassertlocked(head);
	soassertlocked(so);

	KASSERT(so->so_onq == NULL);

	so->so_head = head;
	if (q == 0) {
		head->so_q0len++;
		so->so_onq = &head->so_q0;
	} else {
		head->so_qlen++;
		so->so_onq = &head->so_q;
	}
	TAILQ_INSERT_TAIL(so->so_onq, so, so_qe);
}

int
soqremque(struct socket *so, int q)
{
	struct socket *head = so->so_head;

	soassertlocked(so);
	soassertlocked(head);

	if (q == 0) {
		if (so->so_onq != &head->so_q0)
			return (0);
		head->so_q0len--;
	} else {
		if (so->so_onq != &head->so_q)
			return (0);
		head->so_qlen--;
	}
	TAILQ_REMOVE(so->so_onq, so, so_qe);
	so->so_onq = NULL;
	so->so_head = NULL;
	return (1);
}
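
/*
 * Illustrative sketch (not compiled, locking simplified): a
 * hypothetical passive open.  A protocol noting a connection request
 * on a listening socket `head' would do something like this.
 */
#if 0
	struct socket *so;

	so = sonewconn(head, 0, M_DONTWAIT);	/* queued on so_q0 */
	if (so == NULL) {
		/* queue full or memory shortage: refuse the connection */
	}
	/* ... later, once the handshake with the peer completes ... */
	soisconnected(so);	/* moves so to so_q for accept() */
#endif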

/*
 * Socantsendmore indicates that no more data will be sent on the
 * socket; it would normally be applied to a socket when the user
 * informs the system that no more data is to be sent, by the protocol
 * code (in case PRU_SHUTDOWN).  Socantrcvmore indicates that no more data
 * will be received, and will normally be applied to the socket by a
 * protocol when it detects that the peer will send no more data.
 * Data queued for reading in the socket may yet be read.
 */

void
socantsendmore(struct socket *so)
{
	soassertlocked(so);
	mtx_enter(&so->so_snd.sb_mtx);
	so->so_snd.sb_state |= SS_CANTSENDMORE;
	mtx_leave(&so->so_snd.sb_mtx);
	sowwakeup(so);
}

void
socantrcvmore(struct socket *so)
{
	mtx_enter(&so->so_rcv.sb_mtx);
	so->so_rcv.sb_state |= SS_CANTRCVMORE;
	mtx_leave(&so->so_rcv.sb_mtx);
	sorwakeup(so);
}

void
solock(struct socket *so)
{
	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		NET_LOCK();
		break;
	default:
		rw_enter_write(&so->so_lock);
		break;
	}
}

void
solock_shared(struct socket *so)
{
	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		NET_LOCK_SHARED();
		break;
	}
	rw_enter_write(&so->so_lock);
}

void
solock_nonet(struct socket *so)
{
	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		NET_ASSERT_LOCKED();
		break;
	}
	rw_enter_write(&so->so_lock);
}

int
solock_persocket(struct socket *so)
{
	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		return 0;
	default:
		return 1;
	}
}

void
solock_pair(struct socket *so1, struct socket *so2)
{
	KASSERT(so1 != so2);
	KASSERT(so1->so_type == so2->so_type);
	KASSERT(solock_persocket(so1));

	if (so1 < so2) {
		solock(so1);
		solock(so2);
	} else {
		solock(so2);
		solock(so1);
	}
}

void
sounlock(struct socket *so)
{
	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		NET_UNLOCK();
		break;
	default:
		rw_exit_write(&so->so_lock);
		break;
	}
}

void
sounlock_shared(struct socket *so)
{
	rw_exit_write(&so->so_lock);
	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		NET_UNLOCK_SHARED();
		break;
	}
}

void
sounlock_nonet(struct socket *so)
{
	rw_exit_write(&so->so_lock);
}

void
soassertlocked_readonly(struct socket *so)
{
	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		NET_ASSERT_LOCKED();
		break;
	default:
		rw_assert_wrlock(&so->so_lock);
		break;
	}
}

void
soassertlocked(struct socket *so)
{
	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		if (rw_status(&netlock) == RW_READ) {
			NET_ASSERT_LOCKED();

			if (splassert_ctl > 0 &&
			    rw_status(&so->so_lock) != RW_WRITE)
				splassert_fail(0, RW_WRITE, __func__);
		} else
			NET_ASSERT_LOCKED_EXCLUSIVE();
		break;
	default:
		rw_assert_wrlock(&so->so_lock);
		break;
	}
}

int
sosleep_nsec(struct socket *so, void *ident, int prio, const char *wmesg,
    uint64_t nsecs)
{
	int ret;

	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		if (rw_status(&netlock) == RW_READ)
			rw_exit_write(&so->so_lock);
		ret = rwsleep_nsec(ident, &netlock, prio, wmesg, nsecs);
		if (rw_status(&netlock) == RW_READ)
			rw_enter_write(&so->so_lock);
		break;
	default:
		ret = rwsleep_nsec(ident, &so->so_lock, prio, wmesg, nsecs);
		break;
	}

	return ret;
}
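
/*
 * Illustrative sketch (not compiled): the usual pattern for waiting
 * on a socket condition under the socket lock.  The condition(),
 * wait channel and timeout are hypothetical.
 */
#if 0
	int error = 0;

	solock(so);
	while (!condition(so)) {
		error = sosleep_nsec(so, &so->so_timeo, PSOCK | PCATCH,
		    "netcon", SEC_TO_NSEC(30));
		if (error)	/* EINTR, ERESTART or EWOULDBLOCK */
			break;
	}
	sounlock(so);
#endif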

void
sbmtxassertlocked(struct sockbuf *sb)
{
	if (splassert_ctl > 0 && mtx_owned(&sb->sb_mtx) == 0)
		splassert_fail(0, RW_WRITE, __func__);
}

/*
 * Wait for data to arrive at/drain from a socket buffer.
 */
int
sbwait(struct sockbuf *sb)
{
	int prio = (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH;

	MUTEX_ASSERT_LOCKED(&sb->sb_mtx);

	sb->sb_flags |= SB_WAIT;
	return msleep_nsec(&sb->sb_cc, &sb->sb_mtx, prio, "sbwait",
	    sb->sb_timeo_nsecs);
}

int
sblock(struct sockbuf *sb, int flags)
{
	int rwflags = RW_WRITE, error;

	if (!(flags & SBL_NOINTR || sb->sb_flags & SB_NOINTR))
		rwflags |= RW_INTR;
	if (!(flags & SBL_WAIT))
		rwflags |= RW_NOSLEEP;

	error = rw_enter(&sb->sb_lock, rwflags);
	if (error == EBUSY)
		error = EWOULDBLOCK;

	return error;
}

void
sbunlock(struct sockbuf *sb)
{
	rw_exit(&sb->sb_lock);
}

/*
 * Wakeup processes waiting on a socket buffer.
 * Do asynchronous notification via SIGIO
 * if the socket buffer has the SB_ASYNC flag set.
 */
void
sowakeup(struct socket *so, struct sockbuf *sb)
{
	int dowakeup = 0, dopgsigio = 0;

	mtx_enter(&sb->sb_mtx);
	if (sb->sb_flags & SB_WAIT) {
		sb->sb_flags &= ~SB_WAIT;
		dowakeup = 1;
	}
	if (sb->sb_flags & SB_ASYNC)
		dopgsigio = 1;

	knote_locked(&sb->sb_klist, 0);
	mtx_leave(&sb->sb_mtx);

	if (dowakeup)
		wakeup(&sb->sb_cc);

	if (dopgsigio)
		pgsigio(&so->so_sigio, SIGIO, 0);
}

/*
 * Socket buffer (struct sockbuf) utility routines.
 *
 * Each socket contains two socket buffers: one for sending data and
 * one for receiving data.  Each buffer contains a queue of mbufs,
 * information about the number of mbufs and amount of data in the
 * queue, and other fields allowing select() statements and notification
 * on data availability to be implemented.
 *
 * Data stored in a socket buffer is maintained as a list of records.
 * Each record is a list of mbufs chained together with the m_next
 * field.  Records are chained together with the m_nextpkt field.  The upper
 * level routine soreceive() expects the following conventions to be
 * observed when placing information in the receive buffer:
 *
 * 1. If the protocol requires each message be preceded by the sender's
 *    name, then a record containing that name must be present before
 *    any associated data (mbuf's must be of type MT_SONAME).
 * 2. If the protocol supports the exchange of ``access rights'' (really
 *    just additional data associated with the message), and there are
 *    ``rights'' to be received, then a record containing this data
 *    should be present (mbuf's must be of type MT_CONTROL).
 * 3. If a name or rights record exists, then it must be followed by
 *    a data record, perhaps of zero length.
 *
 * Before using a new socket structure it is first necessary to reserve
 * buffer space to the socket, by calling sbreserve().  This should commit
 * some of the available buffer space in the system buffer pool for the
 * socket (currently, it does nothing but enforce limits).  The space
 * should be released by calling sbrelease() when the socket is destroyed.
 */
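
/*
 * Illustrative example of the record conventions above, for one
 * incoming datagram carrying a sender name, control data and payload
 * (the mbufs shown are hypothetical):
 *
 *	sb_mb -> [MT_SONAME] -> [MT_CONTROL] -> [MT_DATA] -> [MT_DATA]
 *	   |          (m_next links within one record)
 *	   +- m_nextpkt -> next record (the next datagram), if any
 */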

int
soreserve(struct socket *so, u_long sndcc, u_long rcvcc)
{
	soassertlocked(so);

	mtx_enter(&so->so_rcv.sb_mtx);
	mtx_enter(&so->so_snd.sb_mtx);
	if (sbreserve(so, &so->so_snd, sndcc))
		goto bad;
	so->so_snd.sb_wat = sndcc;
	if (so->so_snd.sb_lowat == 0)
		so->so_snd.sb_lowat = MCLBYTES;
	if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
		so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
	if (sbreserve(so, &so->so_rcv, rcvcc))
		goto bad2;
	so->so_rcv.sb_wat = rcvcc;
	if (so->so_rcv.sb_lowat == 0)
		so->so_rcv.sb_lowat = 1;
	mtx_leave(&so->so_snd.sb_mtx);
	mtx_leave(&so->so_rcv.sb_mtx);

	return (0);
bad2:
	sbrelease(so, &so->so_snd);
bad:
	mtx_leave(&so->so_snd.sb_mtx);
	mtx_leave(&so->so_rcv.sb_mtx);
	return (ENOBUFS);
}

/*
 * Allot mbufs to a sockbuf.
 * Attempt to scale mbmax so that mbcnt doesn't become limiting
 * if buffering efficiency is near the normal case.
 */
int
sbreserve(struct socket *so, struct sockbuf *sb, u_long cc)
{
	sbmtxassertlocked(sb);

	if (cc == 0 || cc > sb_max)
		return (1);
	sb->sb_hiwat = cc;
	sb->sb_mbmax = max(3 * MAXMCLBYTES, cc * 8);
	if (sb->sb_lowat > sb->sb_hiwat)
		sb->sb_lowat = sb->sb_hiwat;
	return (0);
}

/*
 * In a low memory situation, do not accept any greater than normal request.
 */
int
sbcheckreserve(u_long cnt, u_long defcnt)
{
	if (cnt > defcnt && sbchecklowmem())
		return (ENOBUFS);
	return (0);
}

int
sbchecklowmem(void)
{
	static int sblowmem;
	unsigned int used;

	/*
	 * m_pool_used() is thread safe.  Global variable sblowmem is updated
	 * by multiple CPUs, but most times with the same value.  And even
	 * if the value is not correct for a short time, it does not matter.
	 */
	used = m_pool_used();
	if (used < 60)
		atomic_store_int(&sblowmem, 0);
	else if (used > 80)
		atomic_store_int(&sblowmem, 1);

	return (atomic_load_int(&sblowmem));
}

/*
 * Free mbufs held by a socket, and reserved mbuf space.
 */
void
sbrelease(struct socket *so, struct sockbuf *sb)
{
	sbflush(so, sb);
	sb->sb_hiwat = sb->sb_mbmax = 0;
}
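
/*
 * Illustrative sketch (not compiled): a protocol attach routine
 * reserving buffer space.  The space constants are hypothetical;
 * real protocols use tunables such as tcp_sendspace/tcp_recvspace.
 */
#if 0
#define	HYP_SENDSPACE	(16 * 1024)
#define	HYP_RECVSPACE	(16 * 1024)

	int error;

	error = soreserve(so, HYP_SENDSPACE, HYP_RECVSPACE);
	if (error)
		return (error);	/* ENOBUFS: request exceeded sb_max */
#endif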

/*
 * Routines to add and remove
 * data from an mbuf queue.
 *
 * The routines sbappend() or sbappendrecord() are normally called to
 * append new mbufs to a socket buffer, after checking that adequate
 * space is available, comparing the function sbspace() with the amount
 * of data to be added.  sbappendrecord() differs from sbappend() in
 * that data supplied is treated as the beginning of a new record.
 * To place a sender's address, optional access rights, and data in a
 * socket receive buffer, sbappendaddr() should be used.  To place
 * access rights and data in a socket receive buffer, sbappendrights()
 * should be used.  In either case, the new data begins a new record.
 * Note that unlike sbappend() and sbappendrecord(), these routines check
 * for the caller that there will be enough space to store the data.
 * Each fails if there is not enough space, or if it cannot find mbufs
 * to store additional information in.
 *
 * Reliable protocols may use the socket send buffer to hold data
 * awaiting acknowledgement.  Data is normally copied from a socket
 * send buffer in a protocol with m_copym for output to a peer,
 * and then removed from the socket buffer with sbdrop()
 * or sbdroprecord() when the data is acknowledged by the peer.
 */

#ifdef SOCKBUF_DEBUG
void
sblastrecordchk(struct sockbuf *sb, const char *where)
{
	struct mbuf *m = sb->sb_mb;

	while (m && m->m_nextpkt)
		m = m->m_nextpkt;

	if (m != sb->sb_lastrecord) {
		printf("sblastrecordchk: sb_mb %p sb_lastrecord %p last %p\n",
		    sb->sb_mb, sb->sb_lastrecord, m);
		printf("packet chain:\n");
		for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt)
			printf("\t%p\n", m);
		panic("sblastrecordchk from %s", where);
	}
}

void
sblastmbufchk(struct sockbuf *sb, const char *where)
{
	struct mbuf *m = sb->sb_mb;
	struct mbuf *n;

	while (m && m->m_nextpkt)
		m = m->m_nextpkt;

	while (m && m->m_next)
		m = m->m_next;

	if (m != sb->sb_mbtail) {
		printf("sblastmbufchk: sb_mb %p sb_mbtail %p last %p\n",
		    sb->sb_mb, sb->sb_mbtail, m);
		printf("packet tree:\n");
		for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) {
			printf("\t");
			for (n = m; n != NULL; n = n->m_next)
				printf("%p ", n);
			printf("\n");
		}
		panic("sblastmbufchk from %s", where);
	}
}
#endif /* SOCKBUF_DEBUG */

#define	SBLINKRECORD(sb, m0)						\
do {									\
	if ((sb)->sb_lastrecord != NULL)				\
		(sb)->sb_lastrecord->m_nextpkt = (m0);			\
	else								\
		(sb)->sb_mb = (m0);					\
	(sb)->sb_lastrecord = (m0);					\
} while (/*CONSTCOND*/0)

/*
 * Append mbuf chain m to the last record in the
 * socket buffer sb.  The additional space associated with
 * the mbuf chain is recorded in sb.  Empty mbufs are
 * discarded and mbufs are compacted where possible.
 */
void
sbappend(struct socket *so, struct sockbuf *sb, struct mbuf *m)
{
	struct mbuf *n;

	if (m == NULL)
		return;

	sbmtxassertlocked(sb);
	SBLASTRECORDCHK(sb, "sbappend 1");

	if ((n = sb->sb_lastrecord) != NULL) {
		/*
		 * XXX Would like to simply use sb_mbtail here, but
		 * XXX I need to verify that I won't miss an EOR that
		 * XXX way.
		 */
		do {
			if (n->m_flags & M_EOR) {
				sbappendrecord(so, sb, m); /* XXXXXX!!!! */
				return;
			}
		} while (n->m_next && (n = n->m_next));
	} else {
		/*
		 * If this is the first record in the socket buffer, it's
		 * also the last record.
		 */
		sb->sb_lastrecord = m;
	}
	sbcompress(so, sb, m, n);
	SBLASTRECORDCHK(sb, "sbappend 2");
}
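
/*
 * Illustrative sketch (not compiled): the caller-side convention
 * described above, i.e. compare sbspace() with the data to be added
 * before appending.
 */
#if 0
	mtx_enter(&so->so_rcv.sb_mtx);
	if (sbspace_locked(so, &so->so_rcv) < m->m_pkthdr.len) {
		mtx_leave(&so->so_rcv.sb_mtx);
		m_freem(m);			/* no room: drop */
	} else {
		sbappend(so, &so->so_rcv, m);
		mtx_leave(&so->so_rcv.sb_mtx);
		sorwakeup(so);
	}
#endif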

/*
 * This version of sbappend() should only be used when the caller
 * absolutely knows that there will never be more than one record
 * in the socket buffer, that is, a stream protocol (such as TCP).
 */
void
sbappendstream(struct socket *so, struct sockbuf *sb, struct mbuf *m)
{
	sbmtxassertlocked(sb);
	KDASSERT(m->m_nextpkt == NULL);
	KASSERT(sb->sb_mb == sb->sb_lastrecord);

	SBLASTMBUFCHK(sb, __func__);

	sbcompress(so, sb, m, sb->sb_mbtail);

	sb->sb_lastrecord = sb->sb_mb;
	SBLASTRECORDCHK(sb, __func__);
}

#ifdef SOCKBUF_DEBUG
void
sbcheck(struct socket *so, struct sockbuf *sb)
{
	struct mbuf *m, *n;
	u_long len = 0, mbcnt = 0;

	for (m = sb->sb_mb; m; m = m->m_nextpkt) {
		for (n = m; n; n = n->m_next) {
			len += n->m_len;
			mbcnt += MSIZE;
			if (n->m_flags & M_EXT)
				mbcnt += n->m_ext.ext_size;
			if (m != n && n->m_nextpkt)
				panic("sbcheck nextpkt");
		}
	}
	if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
		printf("cc %lu != %lu || mbcnt %lu != %lu\n", len, sb->sb_cc,
		    mbcnt, sb->sb_mbcnt);
		panic("sbcheck");
	}
}
#endif

/*
 * As above, except the mbuf chain
 * begins a new record.
 */
void
sbappendrecord(struct socket *so, struct sockbuf *sb, struct mbuf *m0)
{
	struct mbuf *m;

	sbmtxassertlocked(sb);

	if (m0 == NULL)
		return;

	/*
	 * Put the first mbuf on the queue.
	 * Note this permits zero length records.
	 */
	sballoc(sb, m0);
	SBLASTRECORDCHK(sb, "sbappendrecord 1");
	SBLINKRECORD(sb, m0);
	m = m0->m_next;
	m0->m_next = NULL;
	if (m && (m0->m_flags & M_EOR)) {
		m0->m_flags &= ~M_EOR;
		m->m_flags |= M_EOR;
	}
	sbcompress(so, sb, m, m0);
	SBLASTRECORDCHK(sb, "sbappendrecord 2");
}
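
/*
 * Illustrative sketch (not compiled): in-order stream delivery as a
 * TCP-like protocol might do it on input.
 */
#if 0
	mtx_enter(&so->so_rcv.sb_mtx);
	sbappendstream(so, &so->so_rcv, m);	/* data is in sequence */
	mtx_leave(&so->so_rcv.sb_mtx);
	sorwakeup(so);				/* wake readers and kqueue */
#endif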

/*
 * Append address and data, and optionally, control (ancillary) data
 * to the receive queue of a socket.  If present,
 * m0 must include a packet header with total length.
 * Returns 0 if no space in sockbuf or insufficient mbufs.
 */
int
sbappendaddr(struct socket *so, struct sockbuf *sb, const struct sockaddr *asa,
    struct mbuf *m0, struct mbuf *control)
{
	struct mbuf *m, *n, *nlast;
	int space = asa->sa_len;

	sbmtxassertlocked(sb);

	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
		panic("sbappendaddr");
	if (m0)
		space += m0->m_pkthdr.len;
	for (n = control; n; n = n->m_next) {
		space += n->m_len;
		if (n->m_next == NULL)	/* keep pointer to last control buf */
			break;
	}
	if (space > sbspace_locked(so, sb))
		return (0);
	if (asa->sa_len > MLEN)
		return (0);
	MGET(m, M_DONTWAIT, MT_SONAME);
	if (m == NULL)
		return (0);
	m->m_len = asa->sa_len;
	memcpy(mtod(m, caddr_t), asa, asa->sa_len);
	if (n)
		n->m_next = m0;		/* concatenate data to control */
	else
		control = m0;
	m->m_next = control;

	SBLASTRECORDCHK(sb, "sbappendaddr 1");

	for (n = m; n->m_next != NULL; n = n->m_next)
		sballoc(sb, n);
	sballoc(sb, n);
	nlast = n;
	SBLINKRECORD(sb, m);

	sb->sb_mbtail = nlast;
	SBLASTMBUFCHK(sb, "sbappendaddr");

	SBLASTRECORDCHK(sb, "sbappendaddr 2");

	return (1);
}

int
sbappendcontrol(struct socket *so, struct sockbuf *sb, struct mbuf *m0,
    struct mbuf *control)
{
	struct mbuf *m, *mlast, *n;
	int eor = 0, space = 0;

	sbmtxassertlocked(sb);

	if (control == NULL)
		panic("sbappendcontrol");
	for (m = control; ; m = m->m_next) {
		space += m->m_len;
		if (m->m_next == NULL)
			break;
	}
	n = m;			/* save pointer to last control buffer */
	for (m = m0; m; m = m->m_next) {
		space += m->m_len;
		eor |= m->m_flags & M_EOR;
		if (eor) {
			if (m->m_next == NULL)
				m->m_flags |= M_EOR;
			else
				m->m_flags &= ~M_EOR;
		}
	}
	if (space > sbspace_locked(so, sb))
		return (0);
	n->m_next = m0;		/* concatenate data to control */

	SBLASTRECORDCHK(sb, "sbappendcontrol 1");

	for (m = control; m->m_next != NULL; m = m->m_next)
		sballoc(sb, m);
	sballoc(sb, m);
	mlast = m;
	SBLINKRECORD(sb, control);

	sb->sb_mbtail = mlast;
	SBLASTMBUFCHK(sb, "sbappendcontrol");

	SBLASTRECORDCHK(sb, "sbappendcontrol 2");

	return (1);
}
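
/*
 * Illustrative sketch (not compiled): datagram delivery as a UDP-like
 * protocol might do it; `srcsa' is the hypothetical sender address.
 */
#if 0
	mtx_enter(&so->so_rcv.sb_mtx);
	if (sbappendaddr(so, &so->so_rcv, srcsa, m, NULL) == 0) {
		mtx_leave(&so->so_rcv.sb_mtx);
		m_freem(m);		/* receive buffer full: drop */
	} else {
		mtx_leave(&so->so_rcv.sb_mtx);
		sorwakeup(so);
	}
#endif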

/*
 * Compress mbuf chain m into the socket
 * buffer sb following mbuf n.  If n
 * is null, the buffer is presumed empty.
 */
void
sbcompress(struct socket *so, struct sockbuf *sb, struct mbuf *m,
    struct mbuf *n)
{
	int eor = 0;
	struct mbuf *o;

	while (m) {
		eor |= m->m_flags & M_EOR;
		if (m->m_len == 0 &&
		    (eor == 0 ||
		    (((o = m->m_next) || (o = n)) &&
		    o->m_type == m->m_type))) {
			if (sb->sb_lastrecord == m)
				sb->sb_lastrecord = m->m_next;
			m = m_free(m);
			continue;
		}
		if (n && (n->m_flags & M_EOR) == 0 &&
		    /* m_trailingspace() checks buffer writeability */
		    m->m_len <= ((n->m_flags & M_EXT)? n->m_ext.ext_size :
		    MCLBYTES) / 4 && /* XXX Don't copy too much */
		    m->m_len <= m_trailingspace(n) &&
		    n->m_type == m->m_type) {
			memcpy(mtod(n, caddr_t) + n->m_len, mtod(m, caddr_t),
			    m->m_len);
			n->m_len += m->m_len;
			sb->sb_cc += m->m_len;
			if (m->m_type != MT_CONTROL && m->m_type != MT_SONAME)
				sb->sb_datacc += m->m_len;
			m = m_free(m);
			continue;
		}
		if (n)
			n->m_next = m;
		else
			sb->sb_mb = m;
		sb->sb_mbtail = m;
		sballoc(sb, m);
		n = m;
		m->m_flags &= ~M_EOR;
		m = m->m_next;
		n->m_next = NULL;
	}
	if (eor) {
		if (n)
			n->m_flags |= eor;
		else
			printf("semi-panic: sbcompress");
	}
	SBLASTMBUFCHK(sb, __func__);
}

/*
 * Free all mbufs in a sockbuf.
 * Check that all resources are reclaimed.
 */
void
sbflush(struct socket *so, struct sockbuf *sb)
{
	KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
	rw_assert_unlocked(&sb->sb_lock);

	while (sb->sb_mbcnt)
		sbdrop(so, sb, (int)sb->sb_cc);

	KASSERT(sb->sb_cc == 0);
	KASSERT(sb->sb_datacc == 0);
	KASSERT(sb->sb_mb == NULL);
	KASSERT(sb->sb_mbtail == NULL);
	KASSERT(sb->sb_lastrecord == NULL);
}

/*
 * Drop data from (the front of) a sockbuf.
 */
void
sbdrop(struct socket *so, struct sockbuf *sb, int len)
{
	struct mbuf *m, *mn;
	struct mbuf *next;

	sbmtxassertlocked(sb);

	next = (m = sb->sb_mb) ? m->m_nextpkt : NULL;
	while (len > 0) {
		if (m == NULL) {
			if (next == NULL)
				panic("sbdrop");
			m = next;
			next = m->m_nextpkt;
			continue;
		}
		if (m->m_len > len) {
			m->m_len -= len;
			m->m_data += len;
			sb->sb_cc -= len;
			if (m->m_type != MT_CONTROL && m->m_type != MT_SONAME)
				sb->sb_datacc -= len;
			break;
		}
		len -= m->m_len;
		sbfree(sb, m);
		mn = m_free(m);
		m = mn;
	}
	while (m && m->m_len == 0) {
		sbfree(sb, m);
		mn = m_free(m);
		m = mn;
	}
	if (m) {
		sb->sb_mb = m;
		m->m_nextpkt = next;
	} else
		sb->sb_mb = next;
	/*
	 * First part is an inline SB_EMPTY_FIXUP().  Second part
	 * makes sure sb_lastrecord is up-to-date if we dropped
	 * part of the last record.
	 */
	m = sb->sb_mb;
	if (m == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (m->m_nextpkt == NULL)
		sb->sb_lastrecord = m;
}

/*
 * Drop a record off the front of a sockbuf
 * and move the next record to the front.
 */
void
sbdroprecord(struct socket *so, struct sockbuf *sb)
{
	struct mbuf *m, *mn;

	m = sb->sb_mb;
	if (m) {
		sb->sb_mb = m->m_nextpkt;
		do {
			sbfree(sb, m);
			mn = m_free(m);
		} while ((m = mn) != NULL);
	}
	SB_EMPTY_FIXUP(sb);
}
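
/*
 * Illustrative sketch (not compiled): a reliable protocol releasing
 * acknowledged data from the send buffer, as described earlier;
 * `acked' is the hypothetical byte count acknowledged by the peer.
 */
#if 0
	mtx_enter(&so->so_snd.sb_mtx);
	sbdrop(so, &so->so_snd, acked);
	mtx_leave(&so->so_snd.sb_mtx);
	sowwakeup(so);		/* writers may have space again */
#endif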

/*
 * Create a "control" mbuf containing the specified data
 * with the specified type for presentation on a socket buffer.
 */
struct mbuf *
sbcreatecontrol(const void *p, size_t size, int type, int level)
{
	struct cmsghdr *cp;
	struct mbuf *m;

	if (CMSG_SPACE(size) > MCLBYTES) {
		printf("sbcreatecontrol: message too large %zu\n", size);
		return (NULL);
	}

	if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL)
		return (NULL);
	if (CMSG_SPACE(size) > MLEN) {
		MCLGET(m, M_DONTWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return NULL;
		}
	}
	cp = mtod(m, struct cmsghdr *);
	memset(cp, 0, CMSG_SPACE(size));
	memcpy(CMSG_DATA(cp), p, size);
	m->m_len = CMSG_SPACE(size);
	cp->cmsg_len = CMSG_LEN(size);
	cp->cmsg_level = level;
	cp->cmsg_type = type;
	return (m);
}
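
/*
 * Illustrative sketch (not compiled): building an SCM_TIMESTAMP
 * control mbuf the way receive paths typically do.
 */
#if 0
	struct timeval tv;
	struct mbuf *control;

	microtime(&tv);
	control = sbcreatecontrol(&tv, sizeof(tv), SCM_TIMESTAMP,
	    SOL_SOCKET);
	if (control == NULL) {
		/* out of mbufs: deliver the data without the timestamp */
	}
#endif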