/*	$OpenBSD: uipc_socket2.c,v 1.109 2021/05/01 16:13:13 mvs Exp $	*/
/*	$NetBSD: uipc_socket2.c,v 1.11 1996/02/04 02:17:55 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket2.c	8.1 (Berkeley) 6/10/93
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/domain.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/event.h>
#include <sys/pool.h>

/*
 * Primitive routines for operating on sockets and socket buffers
 */

u_long	sb_max = SB_MAX;		/* patchable */

extern struct pool mclpools[];
extern struct pool mbpool;

extern struct rwlock unp_lock;

/*
 * Procedures to manipulate state flags of socket
 * and do appropriate wakeups.  Normal sequence from the
 * active (originating) side is that soisconnecting() is
 * called during processing of connect() call,
 * resulting in an eventual call to soisconnected() if/when the
 * connection is established.  When the connection is torn down
 * soisdisconnecting() is called during processing of disconnect() call,
 * and soisdisconnected() is called when the connection to the peer
 * is totally severed.  The semantics of these routines are such that
 * connectionless protocols can call soisconnected() and soisdisconnected()
 * only, bypassing the in-progress calls when setting up a ``connection''
 * takes no time.
 *
 * From the passive side, a socket is created with
 * two queues of sockets: so_q0 for connections in progress
 * and so_q for connections already made and awaiting user acceptance.
 * As a protocol is preparing incoming connections, it creates a socket
 * structure queued on so_q0 by calling sonewconn().  When the connection
 * is established, soisconnected() is called, and transfers the
 * socket structure to so_q, making it available to accept().
 *
 * If a socket is closed with sockets on either
 * so_q0 or so_q, these sockets are dropped.
 *
 * If higher level protocols are implemented in
 * the kernel, the wakeups done here will sometimes
 * cause software-interrupt process scheduling.
 */
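/*
 * Illustrative sketch (not part of the original file): how a
 * hypothetical connection-oriented protocol would drive the passive
 * side described above.  The function name is an assumption for
 * illustration only; error handling is omitted.
 */
#if 0
static void
example_passive_side(struct socket *head)
{
	struct socket *so;

	soassertlocked(head);

	/* A connection request arrives: queue an embryonic socket. */
	so = sonewconn(head, 0);
	if (so == NULL)
		return;			/* resource shortage */

	/* ... the handshake completes some time later ... */

	/* Move `so' from so_q0 to so_q and wake up accept(2). */
	soisconnected(so);
}
#endif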
void
soisconnecting(struct socket *so)
{
	soassertlocked(so);
	so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= SS_ISCONNECTING;
}

void
soisconnected(struct socket *so)
{
	struct socket *head = so->so_head;

	soassertlocked(so);
	so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING);
	so->so_state |= SS_ISCONNECTED;
	if (head && soqremque(so, 0)) {
		soqinsque(head, so, 1);
		sorwakeup(head);
		wakeup_one(&head->so_timeo);
	} else {
		wakeup(&so->so_timeo);
		sorwakeup(so);
		sowwakeup(so);
	}
}

void
soisdisconnecting(struct socket *so)
{
	soassertlocked(so);
	so->so_state &= ~SS_ISCONNECTING;
	so->so_state |= (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE);
	wakeup(&so->so_timeo);
	sowwakeup(so);
	sorwakeup(so);
}

void
soisdisconnected(struct socket *so)
{
	soassertlocked(so);
	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_ISDISCONNECTED);
	wakeup(&so->so_timeo);
	sowwakeup(so);
	sorwakeup(so);
}
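/*
 * Illustrative sketch (hypothetical): the active-side sequence from
 * the block comment above, as a protocol's connect and input paths
 * would drive it.
 */
#if 0
static void
example_active_side(struct socket *so)
{
	soassertlocked(so);
	soisconnecting(so);	/* during connect(2) processing */

	/* ... the protocol establishes the connection ... */

	soisconnected(so);	/* wakes sleepers in connect(2) */
}
#endif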
/*
 * When an attempt at a new connection is noted on a socket
 * which accepts connections, sonewconn is called.  If the
 * connection is possible (subject to space constraints, etc.)
 * then we allocate a new structure, properly linked into the
 * data structure of the original socket, and return this.
 * Connstatus may be 0 or SS_ISCONNECTED.
 */
struct socket *
sonewconn(struct socket *head, int connstatus)
{
	struct socket *so;
	int soqueue = connstatus ? 1 : 0;

	/*
	 * XXXSMP as long as `so' and `head' share the same lock, we
	 * can call soreserve() and pr_attach() below w/o explicitly
	 * locking `so'.
	 */
	soassertlocked(head);

	if (mclpools[0].pr_nout > mclpools[0].pr_hardlimit * 95 / 100)
		return (NULL);
	if (head->so_qlen + head->so_q0len > head->so_qlimit * 3)
		return (NULL);
	so = pool_get(&socket_pool, PR_NOWAIT|PR_ZERO);
	if (so == NULL)
		return (NULL);
	rw_init(&so->so_lock, "solock");
	so->so_type = head->so_type;
	so->so_options = head->so_options &~ SO_ACCEPTCONN;
	so->so_linger = head->so_linger;
	so->so_state = head->so_state | SS_NOFDREF;
	so->so_proto = head->so_proto;
	so->so_timeo = head->so_timeo;
	so->so_euid = head->so_euid;
	so->so_ruid = head->so_ruid;
	so->so_egid = head->so_egid;
	so->so_rgid = head->so_rgid;
	so->so_cpid = head->so_cpid;

	/*
	 * Inherit watermarks but those may get clamped in low mem situations.
	 */
	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
		pool_put(&socket_pool, so);
		return (NULL);
	}
	so->so_snd.sb_wat = head->so_snd.sb_wat;
	so->so_snd.sb_lowat = head->so_snd.sb_lowat;
	so->so_snd.sb_timeo_nsecs = head->so_snd.sb_timeo_nsecs;
	so->so_rcv.sb_wat = head->so_rcv.sb_wat;
	so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
	so->so_rcv.sb_timeo_nsecs = head->so_rcv.sb_timeo_nsecs;

	sigio_init(&so->so_sigio);
	sigio_copy(&so->so_sigio, &head->so_sigio);

	soqinsque(head, so, soqueue);
	if ((*so->so_proto->pr_attach)(so, 0)) {
		(void) soqremque(so, soqueue);
		sigio_free(&so->so_sigio);
		pool_put(&socket_pool, so);
		return (NULL);
	}
	if (connstatus) {
		sorwakeup(head);
		wakeup(&head->so_timeo);
		so->so_state |= connstatus;
	}
	return (so);
}

void
soqinsque(struct socket *head, struct socket *so, int q)
{
	soassertlocked(head);

#ifdef DIAGNOSTIC
	if (so->so_onq != NULL)
		panic("soqinsque");
#endif

	so->so_head = head;
	if (q == 0) {
		head->so_q0len++;
		so->so_onq = &head->so_q0;
	} else {
		head->so_qlen++;
		so->so_onq = &head->so_q;
	}
	TAILQ_INSERT_TAIL(so->so_onq, so, so_qe);
}

int
soqremque(struct socket *so, int q)
{
	struct socket *head = so->so_head;

	soassertlocked(head);

	if (q == 0) {
		if (so->so_onq != &head->so_q0)
			return (0);
		head->so_q0len--;
	} else {
		if (so->so_onq != &head->so_q)
			return (0);
		head->so_qlen--;
	}
	TAILQ_REMOVE(so->so_onq, so, so_qe);
	so->so_onq = NULL;
	so->so_head = NULL;
	return (1);
}
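/*
 * Illustrative sketch (hypothetical, simplified): how an accept(2)
 * path would drain a completed connection from so_q using
 * soqremque().  File-descriptor plumbing, SO_ACCEPTCONN checks and
 * most error handling are omitted.
 */
#if 0
static struct socket *
example_accept_dequeue(struct socket *head)
{
	struct socket *so;

	soassertlocked(head);
	while ((so = TAILQ_FIRST(&head->so_q)) == NULL) {
		/* Wait for soisconnected() to fill so_q. */
		if (sosleep_nsec(head, &head->so_timeo, PSOCK | PCATCH,
		    "netcon", INFSLP))
			return (NULL);
	}
	soqremque(so, 1);
	return (so);
}
#endif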
/*
 * socantsendmore() indicates that no more data will be sent on the
 * socket; it is normally applied to a socket by the protocol code
 * (in the case of PRU_SHUTDOWN) when the user informs the system that
 * no more data is to be sent.  socantrcvmore() indicates that no more
 * data will be received, and is normally applied to the socket by a
 * protocol when it detects that the peer will send no more data.
 * Data queued for reading in the socket may yet be read.
 */

void
socantsendmore(struct socket *so)
{
	soassertlocked(so);
	so->so_state |= SS_CANTSENDMORE;
	sowwakeup(so);
}

void
socantrcvmore(struct socket *so)
{
	soassertlocked(so);
	so->so_state |= SS_CANTRCVMORE;
	sorwakeup(so);
}

int
solock(struct socket *so)
{
	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		NET_LOCK();
		break;
	case PF_UNIX:
		rw_enter_write(&unp_lock);
		break;
	case PF_ROUTE:
		rw_enter_write(&so->so_lock);
		break;
	case PF_KEY:
	default:
		KERNEL_LOCK();
		break;
	}

	return (SL_LOCKED);
}

void
sounlock(struct socket *so, int s)
{
	KASSERT(s == SL_LOCKED || s == SL_NOUNLOCK);

	if (s != SL_LOCKED)
		return;

	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		NET_UNLOCK();
		break;
	case PF_UNIX:
		rw_exit_write(&unp_lock);
		break;
	case PF_ROUTE:
		rw_exit_write(&so->so_lock);
		break;
	case PF_KEY:
	default:
		KERNEL_UNLOCK();
		break;
	}
}

void
soassertlocked(struct socket *so)
{
	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		NET_ASSERT_LOCKED();
		break;
	case PF_UNIX:
		rw_assert_wrlock(&unp_lock);
		break;
	case PF_ROUTE:
		rw_assert_wrlock(&so->so_lock);
		break;
	case PF_KEY:
	default:
		KERNEL_ASSERT_LOCKED();
		break;
	}
}

int
sosleep_nsec(struct socket *so, void *ident, int prio, const char *wmesg,
    uint64_t nsecs)
{
	int ret;

	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		ret = rwsleep_nsec(ident, &netlock, prio, wmesg, nsecs);
		break;
	case PF_UNIX:
		ret = rwsleep_nsec(ident, &unp_lock, prio, wmesg, nsecs);
		break;
	case PF_ROUTE:
		ret = rwsleep_nsec(ident, &so->so_lock, prio, wmesg, nsecs);
		break;
	case PF_KEY:
	default:
		ret = tsleep_nsec(ident, prio, wmesg, nsecs);
		break;
	}

	return ret;
}
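/*
 * Illustrative sketch (not part of the original file): the caller-side
 * pattern for the per-domain socket lock.  Every routine in this file
 * that calls soassertlocked() expects to run between these two calls.
 */
#if 0
static void
example_locked_operation(struct socket *so)
{
	int s;

	s = solock(so);
	socantsendmore(so);	/* any operation requiring the lock */
	sounlock(so, s);
}
#endif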
/*
 * Wait for data to arrive at/drain from a socket buffer.
 */
int
sbwait(struct socket *so, struct sockbuf *sb)
{
	int prio = (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH;

	soassertlocked(so);

	sb->sb_flags |= SB_WAIT;
	return sosleep_nsec(so, &sb->sb_cc, prio, "netio", sb->sb_timeo_nsecs);
}

int
sblock(struct socket *so, struct sockbuf *sb, int wait)
{
	int error, prio = (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH;

	soassertlocked(so);

	if ((sb->sb_flags & SB_LOCK) == 0) {
		sb->sb_flags |= SB_LOCK;
		return (0);
	}
	if (wait & M_NOWAIT)
		return (EWOULDBLOCK);

	while (sb->sb_flags & SB_LOCK) {
		sb->sb_flags |= SB_WANT;
		error = sosleep_nsec(so, &sb->sb_flags, prio, "netlck", INFSLP);
		if (error)
			return (error);
	}
	sb->sb_flags |= SB_LOCK;
	return (0);
}

void
sbunlock(struct socket *so, struct sockbuf *sb)
{
	soassertlocked(so);

	sb->sb_flags &= ~SB_LOCK;
	if (sb->sb_flags & SB_WANT) {
		sb->sb_flags &= ~SB_WANT;
		wakeup(&sb->sb_flags);
	}
}

/*
 * Wakeup processes waiting on a socket buffer.
 * Do asynchronous notification via SIGIO
 * if the socket has the SS_ASYNC flag set.
 */
void
sowakeup(struct socket *so, struct sockbuf *sb)
{
	soassertlocked(so);

	sb->sb_flags &= ~SB_SEL;
	if (sb->sb_flags & SB_WAIT) {
		sb->sb_flags &= ~SB_WAIT;
		wakeup(&sb->sb_cc);
	}
	if (so->so_state & SS_ASYNC)
		pgsigio(&so->so_sigio, SIGIO, 0);
	selwakeup(&sb->sb_sel);
}

/*
 * Socket buffer (struct sockbuf) utility routines.
 *
 * Each socket contains two socket buffers: one for sending data and
 * one for receiving data.  Each buffer contains a queue of mbufs,
 * information about the number of mbufs and amount of data in the
 * queue, and other fields allowing select() statements and notification
 * on data availability to be implemented.
 *
 * Data stored in a socket buffer is maintained as a list of records.
 * Each record is a list of mbufs chained together with the m_next
 * field.  Records are chained together with the m_nextpkt field.  The upper
 * level routine soreceive() expects the following conventions to be
 * observed when placing information in the receive buffer:
 *
 * 1. If the protocol requires each message be preceded by the sender's
 *    name, then a record containing that name must be present before
 *    any associated data (mbuf's must be of type MT_SONAME).
 * 2. If the protocol supports the exchange of ``access rights'' (really
 *    just additional data associated with the message), and there are
 *    ``rights'' to be received, then a record containing this data
 *    should be present (mbuf's must be of type MT_CONTROL).
 * 3. If a name or rights record exists, then it must be followed by
 *    a data record, perhaps of zero length.
 *
 * Before using a new socket structure it is first necessary to reserve
 * buffer space to the socket, by calling sbreserve().  This should commit
 * some of the available buffer space in the system buffer pool for the
 * socket (currently, it does nothing but enforce limits).  The space
 * should be released by calling sbrelease() when the socket is destroyed.
 */
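/*
 * Illustrative diagram (not from the original file) of the record
 * conventions above, for one datagram carrying a name, control data
 * and payload, as sbappendaddr() below builds it:
 *
 *	sb_mb -> [MT_SONAME] -m_next-> [MT_CONTROL] -m_next-> [MT_DATA]
 *	             |
 *	         m_nextpkt
 *	             |
 *	             v
 *	        next record
 */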
int
soreserve(struct socket *so, u_long sndcc, u_long rcvcc)
{
	soassertlocked(so);

	if (sbreserve(so, &so->so_snd, sndcc))
		goto bad;
	if (sbreserve(so, &so->so_rcv, rcvcc))
		goto bad2;
	so->so_snd.sb_wat = sndcc;
	so->so_rcv.sb_wat = rcvcc;
	if (so->so_rcv.sb_lowat == 0)
		so->so_rcv.sb_lowat = 1;
	if (so->so_snd.sb_lowat == 0)
		so->so_snd.sb_lowat = MCLBYTES;
	if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
		so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
	return (0);
bad2:
	sbrelease(so, &so->so_snd);
bad:
	return (ENOBUFS);
}

/*
 * Allot mbufs to a sockbuf.
 * Attempt to scale mbmax so that mbcnt doesn't become limiting
 * if buffering efficiency is near the normal case.
 */
int
sbreserve(struct socket *so, struct sockbuf *sb, u_long cc)
{
	KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
	soassertlocked(so);

	if (cc == 0 || cc > sb_max)
		return (1);
	sb->sb_hiwat = cc;
	sb->sb_mbmax = max(3 * MAXMCLBYTES, cc * 8);
	if (sb->sb_lowat > sb->sb_hiwat)
		sb->sb_lowat = sb->sb_hiwat;
	return (0);
}

/*
 * In a low-memory situation, do not accept any greater-than-normal request.
 */
int
sbcheckreserve(u_long cnt, u_long defcnt)
{
	if (cnt > defcnt && sbchecklowmem())
		return (ENOBUFS);
	return (0);
}

int
sbchecklowmem(void)
{
	static int sblowmem;

	/* The 60%/80% pool thresholds give the flag hysteresis. */
	if (mclpools[0].pr_nout < mclpools[0].pr_hardlimit * 60 / 100 ||
	    mbpool.pr_nout < mbpool.pr_hardlimit * 60 / 100)
		sblowmem = 0;
	if (mclpools[0].pr_nout > mclpools[0].pr_hardlimit * 80 / 100 ||
	    mbpool.pr_nout > mbpool.pr_hardlimit * 80 / 100)
		sblowmem = 1;
	return (sblowmem);
}

/*
 * Free mbufs held by a socket, and reserved mbuf space.
 */
void
sbrelease(struct socket *so, struct sockbuf *sb)
{
	sbflush(so, sb);
	sb->sb_hiwat = sb->sb_mbmax = 0;
}
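/*
 * Illustrative sketch (hypothetical function and values): a protocol
 * attach routine reserving buffer space.  For a freshly zeroed socket,
 * soreserve() above then leaves sb_lowat at 1 for receiving and at
 * MCLBYTES for sending.
 */
#if 0
static int
example_attach_reserve(struct socket *so)
{
	soassertlocked(so);
	return (soreserve(so, 8192, 8192));	/* ENOBUFS on failure */
}
#endif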
/*
 * Routines to add and remove
 * data from an mbuf queue.
 *
 * The routines sbappend() or sbappendrecord() are normally called to
 * append new mbufs to a socket buffer, after checking that adequate
 * space is available, comparing the function sbspace() with the amount
 * of data to be added.  sbappendrecord() differs from sbappend() in
 * that data supplied is treated as the beginning of a new record.
 * To place a sender's address, optional access rights, and data in a
 * socket receive buffer, sbappendaddr() should be used.  To place
 * access rights and data in a socket receive buffer, sbappendcontrol()
 * should be used.  In either case, the new data begins a new record.
 * Note that unlike sbappend() and sbappendrecord(), these routines check
 * for the caller that there will be enough space to store the data.
 * Each fails if there is not enough space, or if it cannot find mbufs
 * to store additional information in.
 *
 * Reliable protocols may use the socket send buffer to hold data
 * awaiting acknowledgement.  Data is normally copied from a socket
 * send buffer in a protocol with m_copym for output to a peer,
 * and then removing the data from the socket buffer with sbdrop()
 * or sbdroprecord() when the data is acknowledged by the peer.
 */

#ifdef SOCKBUF_DEBUG
void
sblastrecordchk(struct sockbuf *sb, const char *where)
{
	struct mbuf *m = sb->sb_mb;

	while (m && m->m_nextpkt)
		m = m->m_nextpkt;

	if (m != sb->sb_lastrecord) {
		printf("sblastrecordchk: sb_mb %p sb_lastrecord %p last %p\n",
		    sb->sb_mb, sb->sb_lastrecord, m);
		printf("packet chain:\n");
		for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt)
			printf("\t%p\n", m);
		panic("sblastrecordchk from %s", where);
	}
}

void
sblastmbufchk(struct sockbuf *sb, const char *where)
{
	struct mbuf *m = sb->sb_mb;
	struct mbuf *n;

	while (m && m->m_nextpkt)
		m = m->m_nextpkt;

	while (m && m->m_next)
		m = m->m_next;

	if (m != sb->sb_mbtail) {
		printf("sblastmbufchk: sb_mb %p sb_mbtail %p last %p\n",
		    sb->sb_mb, sb->sb_mbtail, m);
		printf("packet tree:\n");
		for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) {
			printf("\t");
			for (n = m; n != NULL; n = n->m_next)
				printf("%p ", n);
			printf("\n");
		}
		panic("sblastmbufchk from %s", where);
	}
}
#endif /* SOCKBUF_DEBUG */

#define	SBLINKRECORD(sb, m0)						\
do {									\
	if ((sb)->sb_lastrecord != NULL)				\
		(sb)->sb_lastrecord->m_nextpkt = (m0);			\
	else								\
		(sb)->sb_mb = (m0);					\
	(sb)->sb_lastrecord = (m0);					\
} while (/*CONSTCOND*/0)

/*
 * Append mbuf chain m to the last record in the
 * socket buffer sb.  The additional space associated with the
 * mbuf chain is recorded in sb.  Empty mbufs are
 * discarded and mbufs are compacted where possible.
 */
void
sbappend(struct socket *so, struct sockbuf *sb, struct mbuf *m)
{
	struct mbuf *n;

	if (m == NULL)
		return;

	soassertlocked(so);
	SBLASTRECORDCHK(sb, "sbappend 1");

	if ((n = sb->sb_lastrecord) != NULL) {
		/*
		 * XXX Would like to simply use sb_mbtail here, but
		 * XXX I need to verify that I won't miss an EOR that
		 * XXX way.
		 */
		do {
			if (n->m_flags & M_EOR) {
				sbappendrecord(so, sb, m); /* XXXXXX!!!! */
				return;
			}
		} while (n->m_next && (n = n->m_next));
	} else {
		/*
		 * If this is the first record in the socket buffer, it's
		 * also the last record.
		 */
		sb->sb_lastrecord = m;
	}
	sbcompress(sb, m, n);
	SBLASTRECORDCHK(sb, "sbappend 2");
}
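/*
 * Illustrative sketch (hypothetical function) of the pattern described
 * above: a protocol compares sbspace() with the amount of data before
 * appending to the receive buffer.  `m' is assumed to carry a packet
 * header; sorwakeup() then notifies any sleeper in sbwait().
 */
#if 0
static int
example_deliver(struct socket *so, struct mbuf *m)
{
	soassertlocked(so);
	if (sbspace(so, &so->so_rcv) < m->m_pkthdr.len) {
		m_freem(m);
		return (ENOBUFS);
	}
	sbappend(so, &so->so_rcv, m);
	sorwakeup(so);
	return (0);
}
#endif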
/*
 * This version of sbappend() should only be used when the caller
 * absolutely knows that there will never be more than one record
 * in the socket buffer, that is, a stream protocol (such as TCP).
 */
void
sbappendstream(struct socket *so, struct sockbuf *sb, struct mbuf *m)
{
	KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
	soassertlocked(so);
	KDASSERT(m->m_nextpkt == NULL);
	KASSERT(sb->sb_mb == sb->sb_lastrecord);

	SBLASTMBUFCHK(sb, __func__);

	sbcompress(sb, m, sb->sb_mbtail);

	sb->sb_lastrecord = sb->sb_mb;
	SBLASTRECORDCHK(sb, __func__);
}

#ifdef SOCKBUF_DEBUG
void
sbcheck(struct sockbuf *sb)
{
	struct mbuf *m, *n;
	u_long len = 0, mbcnt = 0;

	for (m = sb->sb_mb; m; m = m->m_nextpkt) {
		for (n = m; n; n = n->m_next) {
			len += n->m_len;
			mbcnt += MSIZE;
			if (n->m_flags & M_EXT)
				mbcnt += n->m_ext.ext_size;
			if (m != n && n->m_nextpkt)
				panic("sbcheck nextpkt");
		}
	}
	if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
		printf("cc %lu != %lu || mbcnt %lu != %lu\n", len, sb->sb_cc,
		    mbcnt, sb->sb_mbcnt);
		panic("sbcheck");
	}
}
#endif

/*
 * As above, except the mbuf chain
 * begins a new record.
 */
void
sbappendrecord(struct socket *so, struct sockbuf *sb, struct mbuf *m0)
{
	struct mbuf *m;

	KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
	soassertlocked(so);

	if (m0 == NULL)
		return;

	/*
	 * Put the first mbuf on the queue.
	 * Note this permits zero length records.
	 */
	sballoc(sb, m0);
	SBLASTRECORDCHK(sb, "sbappendrecord 1");
	SBLINKRECORD(sb, m0);
	m = m0->m_next;
	m0->m_next = NULL;
	if (m && (m0->m_flags & M_EOR)) {
		m0->m_flags &= ~M_EOR;
		m->m_flags |= M_EOR;
	}
	sbcompress(sb, m, m0);
	SBLASTRECORDCHK(sb, "sbappendrecord 2");
}

/*
 * As above except that OOB data
 * is inserted at the beginning of the sockbuf,
 * but after any other OOB data.
 */
void
sbinsertoob(struct sockbuf *sb, struct mbuf *m0)
{
	struct mbuf *m, **mp;

	if (m0 == NULL)
		return;

	SBLASTRECORDCHK(sb, "sbinsertoob 1");

	for (mp = &sb->sb_mb; (m = *mp) != NULL; mp = &((*mp)->m_nextpkt)) {
	    again:
		switch (m->m_type) {

		case MT_OOBDATA:
			continue;		/* WANT next train */

		case MT_CONTROL:
			if ((m = m->m_next) != NULL)
				goto again;	/* inspect THIS train further */
		}
		break;
	}
	/*
	 * Put the first mbuf on the queue.
	 * Note this permits zero length records.
	 */
	sballoc(sb, m0);
	m0->m_nextpkt = *mp;
	if (*mp == NULL) {
		/* m0 is actually the new tail */
		sb->sb_lastrecord = m0;
	}
	*mp = m0;
	m = m0->m_next;
	m0->m_next = NULL;
	if (m && (m0->m_flags & M_EOR)) {
		m0->m_flags &= ~M_EOR;
		m->m_flags |= M_EOR;
	}
	sbcompress(sb, m, m0);
	SBLASTRECORDCHK(sb, "sbinsertoob 2");
}
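/*
 * Illustrative sketch (hypothetical function): a stream protocol such
 * as TCP delivering in-order data with sbappendstream(), which relies
 * on the single-record invariant asserted above.
 */
#if 0
static void
example_stream_deliver(struct socket *so, struct mbuf *m)
{
	soassertlocked(so);
	sbappendstream(so, &so->so_rcv, m);
	sorwakeup(so);
}
#endif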
/*
 * Append address and data, and optionally, control (ancillary) data
 * to the receive queue of a socket.  If present,
 * m0 must include a packet header with total length.
 * Returns 0 if there is no space in the sockbuf or insufficient mbufs.
 */
int
sbappendaddr(struct socket *so, struct sockbuf *sb, const struct sockaddr *asa,
    struct mbuf *m0, struct mbuf *control)
{
	struct mbuf *m, *n, *nlast;
	int space = asa->sa_len;

	soassertlocked(so);

	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
		panic("sbappendaddr");
	if (m0)
		space += m0->m_pkthdr.len;
	for (n = control; n; n = n->m_next) {
		space += n->m_len;
		if (n->m_next == NULL)	/* keep pointer to last control buf */
			break;
	}
	if (space > sbspace(so, sb))
		return (0);
	if (asa->sa_len > MLEN)
		return (0);
	MGET(m, M_DONTWAIT, MT_SONAME);
	if (m == NULL)
		return (0);
	m->m_len = asa->sa_len;
	memcpy(mtod(m, caddr_t), asa, asa->sa_len);
	if (n)
		n->m_next = m0;		/* concatenate data to control */
	else
		control = m0;
	m->m_next = control;

	SBLASTRECORDCHK(sb, "sbappendaddr 1");

	for (n = m; n->m_next != NULL; n = n->m_next)
		sballoc(sb, n);
	sballoc(sb, n);
	nlast = n;
	SBLINKRECORD(sb, m);

	sb->sb_mbtail = nlast;
	SBLASTMBUFCHK(sb, "sbappendaddr");

	SBLASTRECORDCHK(sb, "sbappendaddr 2");

	return (1);
}

int
sbappendcontrol(struct socket *so, struct sockbuf *sb, struct mbuf *m0,
    struct mbuf *control)
{
	struct mbuf *m, *mlast, *n;
	int space = 0;

	if (control == NULL)
		panic("sbappendcontrol");
	for (m = control; ; m = m->m_next) {
		space += m->m_len;
		if (m->m_next == NULL)
			break;
	}
	n = m;			/* save pointer to last control buffer */
	for (m = m0; m; m = m->m_next)
		space += m->m_len;
	if (space > sbspace(so, sb))
		return (0);
	n->m_next = m0;			/* concatenate data to control */

	SBLASTRECORDCHK(sb, "sbappendcontrol 1");

	for (m = control; m->m_next != NULL; m = m->m_next)
		sballoc(sb, m);
	sballoc(sb, m);
	mlast = m;
	SBLINKRECORD(sb, control);

	sb->sb_mbtail = mlast;
	SBLASTMBUFCHK(sb, "sbappendcontrol");

	SBLASTRECORDCHK(sb, "sbappendcontrol 2");

	return (1);
}

/*
 * Compress mbuf chain m into the socket
 * buffer sb following mbuf n.  If n
 * is null, the buffer is presumed empty.
 */
void
sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n)
{
	int eor = 0;
	struct mbuf *o;

	while (m) {
		eor |= m->m_flags & M_EOR;
		if (m->m_len == 0 &&
		    (eor == 0 ||
		    (((o = m->m_next) || (o = n)) &&
		    o->m_type == m->m_type))) {
			if (sb->sb_lastrecord == m)
				sb->sb_lastrecord = m->m_next;
			m = m_free(m);
			continue;
		}
		if (n && (n->m_flags & M_EOR) == 0 &&
		    /* m_trailingspace() checks buffer writeability */
		    m->m_len <= ((n->m_flags & M_EXT)? n->m_ext.ext_size :
		    MCLBYTES) / 4 && /* XXX Don't copy too much */
		    m->m_len <= m_trailingspace(n) &&
		    n->m_type == m->m_type) {
			memcpy(mtod(n, caddr_t) + n->m_len, mtod(m, caddr_t),
			    m->m_len);
			n->m_len += m->m_len;
			sb->sb_cc += m->m_len;
			if (m->m_type != MT_CONTROL && m->m_type != MT_SONAME)
				sb->sb_datacc += m->m_len;
			m = m_free(m);
			continue;
		}
		if (n)
			n->m_next = m;
		else
			sb->sb_mb = m;
		sb->sb_mbtail = m;
		sballoc(sb, m);
		n = m;
		m->m_flags &= ~M_EOR;
		m = m->m_next;
		n->m_next = NULL;
	}
	if (eor) {
		if (n)
			n->m_flags |= eor;
		else
			printf("semi-panic: sbcompress");
	}
	SBLASTMBUFCHK(sb, __func__);
}
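/*
 * Illustrative sketch (hypothetical function and parameter `from'):
 * a datagram protocol delivering a packet together with the sender's
 * address via sbappendaddr() above, the way UDP-style input code
 * would.
 */
#if 0
static int
example_dgram_deliver(struct socket *so, const struct sockaddr *from,
    struct mbuf *m)
{
	soassertlocked(so);
	if (sbappendaddr(so, &so->so_rcv, from, m, NULL) == 0) {
		m_freem(m);
		return (ENOBUFS);
	}
	sorwakeup(so);
	return (0);
}
#endif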
/*
 * Free all mbufs in a sockbuf.
 * Check that all resources are reclaimed.
 */
void
sbflush(struct socket *so, struct sockbuf *sb)
{
	KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
	KASSERT((sb->sb_flags & SB_LOCK) == 0);

	while (sb->sb_mbcnt)
		sbdrop(so, sb, (int)sb->sb_cc);

	KASSERT(sb->sb_cc == 0);
	KASSERT(sb->sb_datacc == 0);
	KASSERT(sb->sb_mb == NULL);
	KASSERT(sb->sb_mbtail == NULL);
	KASSERT(sb->sb_lastrecord == NULL);
}

/*
 * Drop data from (the front of) a sockbuf.
 */
void
sbdrop(struct socket *so, struct sockbuf *sb, int len)
{
	struct mbuf *m, *mn;
	struct mbuf *next;

	KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
	soassertlocked(so);

	next = (m = sb->sb_mb) ? m->m_nextpkt : NULL;
	while (len > 0) {
		if (m == NULL) {
			if (next == NULL)
				panic("sbdrop");
			m = next;
			next = m->m_nextpkt;
			continue;
		}
		if (m->m_len > len) {
			m->m_len -= len;
			m->m_data += len;
			sb->sb_cc -= len;
			if (m->m_type != MT_CONTROL && m->m_type != MT_SONAME)
				sb->sb_datacc -= len;
			break;
		}
		len -= m->m_len;
		sbfree(sb, m);
		mn = m_free(m);
		m = mn;
	}
	while (m && m->m_len == 0) {
		sbfree(sb, m);
		mn = m_free(m);
		m = mn;
	}
	if (m) {
		sb->sb_mb = m;
		m->m_nextpkt = next;
	} else
		sb->sb_mb = next;
	/*
	 * First part is an inline SB_EMPTY_FIXUP().  Second part
	 * makes sure sb_lastrecord is up-to-date if we dropped
	 * part of the last record.
	 */
	m = sb->sb_mb;
	if (m == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (m->m_nextpkt == NULL)
		sb->sb_lastrecord = m;
}

/*
 * Drop a record off the front of a sockbuf
 * and move the next record to the front.
 */
void
sbdroprecord(struct sockbuf *sb)
{
	struct mbuf *m, *mn;

	m = sb->sb_mb;
	if (m) {
		sb->sb_mb = m->m_nextpkt;
		do {
			sbfree(sb, m);
			mn = m_free(m);
		} while ((m = mn) != NULL);
	}
	SB_EMPTY_FIXUP(sb);
}

/*
 * Create a "control" mbuf containing the specified data
 * with the specified type for presentation on a socket buffer.
 */
struct mbuf *
sbcreatecontrol(const void *p, size_t size, int type, int level)
{
	struct cmsghdr *cp;
	struct mbuf *m;

	if (CMSG_SPACE(size) > MCLBYTES) {
		printf("sbcreatecontrol: message too large %zu\n", size);
		return (NULL);
	}

	if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL)
		return (NULL);
	if (CMSG_SPACE(size) > MLEN) {
		MCLGET(m, M_DONTWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return NULL;
		}
	}
	cp = mtod(m, struct cmsghdr *);
	memset(cp, 0, CMSG_SPACE(size));
	memcpy(CMSG_DATA(cp), p, size);
	m->m_len = CMSG_SPACE(size);
	cp->cmsg_len = CMSG_LEN(size);
	cp->cmsg_level = level;
	cp->cmsg_type = type;
	return (m);
}
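/*
 * Illustrative sketch (not part of the original file): building a
 * control mbuf with sbcreatecontrol().  An SCM_TIMESTAMP message is
 * the kind of payload it carries; <sys/time.h> and microtime(9) are
 * assumed here, and the function name is hypothetical.
 */
#if 0
static struct mbuf *
example_make_timestamp(void)
{
	struct timeval tv;

	microtime(&tv);
	return (sbcreatecontrol(&tv, sizeof(tv), SCM_TIMESTAMP,
	    SOL_SOCKET));
}
#endif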