/*	$OpenBSD: uipc_socket2.c,v 1.113 2021/07/26 05:51:13 mpi Exp $	*/
/*	$NetBSD: uipc_socket2.c,v 1.11 1996/02/04 02:17:55 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket2.c	8.1 (Berkeley) 6/10/93
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/domain.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/event.h>
#include <sys/pool.h>

/*
 * Primitive routines for operating on sockets and socket buffers
 */

u_long	sb_max = SB_MAX;		/* patchable */

extern struct pool mclpools[];
extern struct pool mbpool;

extern struct rwlock unp_lock;

/*
 * Procedures to manipulate state flags of socket
 * and do appropriate wakeups.  Normal sequence from the
 * active (originating) side is that soisconnecting() is
 * called during processing of connect() call,
 * resulting in an eventual call to soisconnected() if/when the
 * connection is established.  When the connection is torn down
 * soisdisconnecting() is called during processing of disconnect() call,
 * and soisdisconnected() is called when the connection to the peer
 * is totally severed.  The semantics of these routines are such that
 * connectionless protocols can call soisconnected() and soisdisconnected()
 * only, bypassing the in-progress calls when setting up a ``connection''
 * takes no time.
 *
 * From the passive side, a socket is created with
 * two queues of sockets: so_q0 for connections in progress
 * and so_q for connections already made and awaiting user acceptance.
 * As a protocol is preparing incoming connections, it creates a socket
 * structure queued on so_q0 by calling sonewconn().  When the connection
 * is established, soisconnected() is called, and transfers the
 * socket structure to so_q, making it available to accept().
 *
 * If a socket is closed with sockets on either
 * so_q0 or so_q, these sockets are dropped.
 *
 * If higher level protocols are implemented in
 * the kernel, the wakeups done here will sometimes
 * cause software-interrupt process scheduling.
 */
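
/*
 * For illustration, the passive-open sequence above as a hypothetical
 * protocol might drive it; only the functions defined in this file are
 * real, the surrounding steps are assumptions:
 *
 *	so = sonewconn(head, 0);	queued on head->so_q0
 *	if (so == NULL)
 *		return;			no resources, drop the attempt
 *	... handshake completes ...
 *	soisconnected(so);		moved to head->so_q; accept()
 *					on head may now return it
 */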

void
soisconnecting(struct socket *so)
{
	soassertlocked(so);
	so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= SS_ISCONNECTING;
}

void
soisconnected(struct socket *so)
{
	struct socket *head = so->so_head;

	soassertlocked(so);
	so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING);
	so->so_state |= SS_ISCONNECTED;
	if (head && soqremque(so, 0)) {
		soqinsque(head, so, 1);
		sorwakeup(head);
		wakeup_one(&head->so_timeo);
	} else {
		wakeup(&so->so_timeo);
		sorwakeup(so);
		sowwakeup(so);
	}
}

void
soisdisconnecting(struct socket *so)
{
	soassertlocked(so);
	so->so_state &= ~SS_ISCONNECTING;
	so->so_state |= (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE);
	wakeup(&so->so_timeo);
	sowwakeup(so);
	sorwakeup(so);
}

void
soisdisconnected(struct socket *so)
{
	soassertlocked(so);
	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_ISDISCONNECTED);
	wakeup(&so->so_timeo);
	sowwakeup(so);
	sorwakeup(so);
}

/*
 * When an attempt at a new connection is noted on a socket
 * which accepts connections, sonewconn is called.  If the
 * connection is possible (subject to space constraints, etc.)
 * then we allocate a new structure, properly linked into the
 * data structure of the original socket, and return this.
 * Connstatus may be 0 or SS_ISCONNECTED.
 */
struct socket *
sonewconn(struct socket *head, int connstatus)
{
	struct socket *so;
	int soqueue = connstatus ? 1 : 0;

	/*
	 * XXXSMP as long as `so' and `head' share the same lock, we
	 * can call soreserve() and pr_attach() below without explicitly
	 * locking `so'.
	 */
	soassertlocked(head);

	if (mclpools[0].pr_nout > mclpools[0].pr_hardlimit * 95 / 100)
		return (NULL);
	if (head->so_qlen + head->so_q0len > head->so_qlimit * 3)
		return (NULL);
	so = pool_get(&socket_pool, PR_NOWAIT|PR_ZERO);
	if (so == NULL)
		return (NULL);
	rw_init(&so->so_lock, "solock");
	so->so_type = head->so_type;
	so->so_options = head->so_options &~ SO_ACCEPTCONN;
	so->so_linger = head->so_linger;
	so->so_state = head->so_state | SS_NOFDREF;
	so->so_proto = head->so_proto;
	so->so_timeo = head->so_timeo;
	so->so_euid = head->so_euid;
	so->so_ruid = head->so_ruid;
	so->so_egid = head->so_egid;
	so->so_rgid = head->so_rgid;
	so->so_cpid = head->so_cpid;

	/*
	 * Inherit watermarks but those may get clamped in low mem situations.
	 */
	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
		pool_put(&socket_pool, so);
		return (NULL);
	}
	so->so_snd.sb_wat = head->so_snd.sb_wat;
	so->so_snd.sb_lowat = head->so_snd.sb_lowat;
	so->so_snd.sb_timeo_nsecs = head->so_snd.sb_timeo_nsecs;
	so->so_rcv.sb_wat = head->so_rcv.sb_wat;
	so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
	so->so_rcv.sb_timeo_nsecs = head->so_rcv.sb_timeo_nsecs;

	sigio_init(&so->so_sigio);
	sigio_copy(&so->so_sigio, &head->so_sigio);

	soqinsque(head, so, soqueue);
	if ((*so->so_proto->pr_attach)(so, 0)) {
		(void) soqremque(so, soqueue);
		sigio_free(&so->so_sigio);
		pool_put(&socket_pool, so);
		return (NULL);
	}
	if (connstatus) {
		sorwakeup(head);
		wakeup(&head->so_timeo);
		so->so_state |= connstatus;
	}
	return (so);
}
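
/*
 * A worked example of the admission checks at the top of sonewconn(),
 * under assumed numbers: with a cluster pool hard limit of 1000, new
 * sockets are refused once more than 950 clusters (95%) are outstanding;
 * with a listen(2) backlog of, say, 128, they are refused once
 * so_qlen + so_q0len exceeds 384 (3 * so_qlimit).
 */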

void
soqinsque(struct socket *head, struct socket *so, int q)
{
	soassertlocked(head);

#ifdef DIAGNOSTIC
	if (so->so_onq != NULL)
		panic("soqinsque");
#endif

	so->so_head = head;
	if (q == 0) {
		head->so_q0len++;
		so->so_onq = &head->so_q0;
	} else {
		head->so_qlen++;
		so->so_onq = &head->so_q;
	}
	TAILQ_INSERT_TAIL(so->so_onq, so, so_qe);
}

int
soqremque(struct socket *so, int q)
{
	struct socket *head = so->so_head;

	soassertlocked(head);

	if (q == 0) {
		if (so->so_onq != &head->so_q0)
			return (0);
		head->so_q0len--;
	} else {
		if (so->so_onq != &head->so_q)
			return (0);
		head->so_qlen--;
	}
	TAILQ_REMOVE(so->so_onq, so, so_qe);
	so->so_onq = NULL;
	so->so_head = NULL;
	return (1);
}

/*
 * Socantsendmore indicates that no more data will be sent on the
 * socket; it is normally applied to a socket by the protocol code,
 * in the case of PRU_SHUTDOWN, when the user informs the system that
 * no more data is to be sent.  Socantrcvmore indicates that no more data
 * will be received, and will normally be applied to the socket by a
 * protocol when it detects that the peer will send no more data.
 * Data queued for reading in the socket may yet be read.
 */

void
socantsendmore(struct socket *so)
{
	soassertlocked(so);
	so->so_state |= SS_CANTSENDMORE;
	sowwakeup(so);
}

void
socantrcvmore(struct socket *so)
{
	soassertlocked(so);
	so->so_state |= SS_CANTRCVMORE;
	sorwakeup(so);
}

int
solock(struct socket *so)
{
	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		NET_LOCK();
		break;
	case PF_UNIX:
		rw_enter_write(&unp_lock);
		break;
	default:
		rw_enter_write(&so->so_lock);
		break;
	}

	return (SL_LOCKED);
}

void
sounlock(struct socket *so, int s)
{
	KASSERT(s == SL_LOCKED || s == SL_NOUNLOCK);

	if (s != SL_LOCKED)
		return;

	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		NET_UNLOCK();
		break;
	case PF_UNIX:
		rw_exit_write(&unp_lock);
		break;
	default:
		rw_exit_write(&so->so_lock);
		break;
	}
}
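
/*
 * The usual calling pattern for the per-domain lock dispatch above is
 * to save the value returned by solock() and hand it back to
 * sounlock(); a sketch:
 *
 *	int s;
 *
 *	s = solock(so);
 *	... operate on so and its buffers ...
 *	sounlock(so, s);
 *
 * A caller that must keep the lock held passes SL_NOUNLOCK instead,
 * which makes sounlock() a no-op.
 */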

void
soassertlocked(struct socket *so)
{
	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		NET_ASSERT_LOCKED();
		break;
	case PF_UNIX:
		rw_assert_wrlock(&unp_lock);
		break;
	default:
		rw_assert_wrlock(&so->so_lock);
		break;
	}
}

int
sosleep_nsec(struct socket *so, void *ident, int prio, const char *wmesg,
    uint64_t nsecs)
{
	int ret;

	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		ret = rwsleep_nsec(ident, &netlock, prio, wmesg, nsecs);
		break;
	case PF_UNIX:
		ret = rwsleep_nsec(ident, &unp_lock, prio, wmesg, nsecs);
		break;
	default:
		ret = rwsleep_nsec(ident, &so->so_lock, prio, wmesg, nsecs);
		break;
	}

	return ret;
}

/*
 * Wait for data to arrive at/drain from a socket buffer.
 */
int
sbwait(struct socket *so, struct sockbuf *sb)
{
	int prio = (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH;

	soassertlocked(so);

	sb->sb_flags |= SB_WAIT;
	return sosleep_nsec(so, &sb->sb_cc, prio, "netio", sb->sb_timeo_nsecs);
}

int
sblock(struct socket *so, struct sockbuf *sb, int wait)
{
	int error, prio = (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH;

	soassertlocked(so);

	if ((sb->sb_flags & SB_LOCK) == 0) {
		sb->sb_flags |= SB_LOCK;
		return (0);
	}
	if (wait & M_NOWAIT)
		return (EWOULDBLOCK);

	while (sb->sb_flags & SB_LOCK) {
		sb->sb_flags |= SB_WANT;
		error = sosleep_nsec(so, &sb->sb_flags, prio, "netlck", INFSLP);
		if (error)
			return (error);
	}
	sb->sb_flags |= SB_LOCK;
	return (0);
}

void
sbunlock(struct socket *so, struct sockbuf *sb)
{
	soassertlocked(so);

	sb->sb_flags &= ~SB_LOCK;
	if (sb->sb_flags & SB_WANT) {
		sb->sb_flags &= ~SB_WANT;
		wakeup(&sb->sb_flags);
	}
}

/*
 * Wakeup processes waiting on a socket buffer.
 * Do asynchronous notification via SIGIO
 * if the socket buffer has the SB_ASYNC flag set.
 */
void
sowakeup(struct socket *so, struct sockbuf *sb)
{
	soassertlocked(so);

	sb->sb_flags &= ~SB_SEL;
	if (sb->sb_flags & SB_WAIT) {
		sb->sb_flags &= ~SB_WAIT;
		wakeup(&sb->sb_cc);
	}
	if (sb->sb_flags & SB_ASYNC)
		pgsigio(&so->so_sigio, SIGIO, 0);
	selwakeup(&sb->sb_sel);
}

/*
 * Socket buffer (struct sockbuf) utility routines.
 *
 * Each socket contains two socket buffers: one for sending data and
 * one for receiving data.  Each buffer contains a queue of mbufs,
 * information about the number of mbufs and amount of data in the
 * queue, and other fields allowing select() statements and notification
 * on data availability to be implemented.
 *
 * Data stored in a socket buffer is maintained as a list of records.
 * Each record is a list of mbufs chained together with the m_next
 * field.  Records are chained together with the m_nextpkt field.  The upper
 * level routine soreceive() expects the following conventions to be
 * observed when placing information in the receive buffer:
 *
 * 1. If the protocol requires each message be preceded by the sender's
 *    name, then a record containing that name must be present before
 *    any associated data (mbuf's must be of type MT_SONAME).
 * 2. If the protocol supports the exchange of ``access rights'' (really
 *    just additional data associated with the message), and there are
 *    ``rights'' to be received, then a record containing this data
 *    should be present (mbuf's must be of type MT_CONTROL).
 * 3. If a name or rights record exists, then it must be followed by
 *    a data record, perhaps of zero length.
 *
 * A sketch of a buffer laid out this way follows this comment.
 *
 * Before using a new socket structure it is first necessary to reserve
 * buffer space for the socket, by calling sbreserve().  This should commit
 * some of the available buffer space in the system buffer pool for the
 * socket (currently, it does nothing but enforce limits).  The space
 * should be released by calling sbrelease() when the socket is destroyed.
 */
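
/*
 * For a datagram socket the receive buffer thus might look like this;
 * each row is one record (as built by sbappendaddr() below), mbufs
 * within a record are linked by m_next:
 *
 *	sb_mb -> MT_SONAME -> MT_CONTROL -> MT_DATA -> MT_DATA
 *	           |
 *	           | m_nextpkt
 *	           v
 *	         MT_SONAME -> MT_DATA
 *	           |
 *	           | m_nextpkt
 *	           v
 *	          NULL		(sb_lastrecord points at the last record,
 *				 sb_mbtail at its last mbuf)
 */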

int
soreserve(struct socket *so, u_long sndcc, u_long rcvcc)
{
	soassertlocked(so);

	if (sbreserve(so, &so->so_snd, sndcc))
		goto bad;
	if (sbreserve(so, &so->so_rcv, rcvcc))
		goto bad2;
	so->so_snd.sb_wat = sndcc;
	so->so_rcv.sb_wat = rcvcc;
	if (so->so_rcv.sb_lowat == 0)
		so->so_rcv.sb_lowat = 1;
	if (so->so_snd.sb_lowat == 0)
		so->so_snd.sb_lowat = MCLBYTES;
	if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
		so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
	return (0);
bad2:
	sbrelease(so, &so->so_snd);
bad:
	return (ENOBUFS);
}

/*
 * Allot mbufs to a sockbuf.
 * Attempt to scale mbmax so that mbcnt doesn't become limiting
 * if buffering efficiency is near the normal case.
 */
int
sbreserve(struct socket *so, struct sockbuf *sb, u_long cc)
{
	KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
	soassertlocked(so);

	if (cc == 0 || cc > sb_max)
		return (1);
	sb->sb_hiwat = cc;
	sb->sb_mbmax = max(3 * MAXMCLBYTES, cc * 8);
	if (sb->sb_lowat > sb->sb_hiwat)
		sb->sb_lowat = sb->sb_hiwat;
	return (0);
}

/*
 * In a low-memory situation, do not accept any request greater
 * than normal.
 */
int
sbcheckreserve(u_long cnt, u_long defcnt)
{
	if (cnt > defcnt && sbchecklowmem())
		return (ENOBUFS);
	return (0);
}

int
sbchecklowmem(void)
{
	static int sblowmem;

	if (mclpools[0].pr_nout < mclpools[0].pr_hardlimit * 60 / 100 ||
	    mbpool.pr_nout < mbpool.pr_hardlimit * 60 / 100)
		sblowmem = 0;
	if (mclpools[0].pr_nout > mclpools[0].pr_hardlimit * 80 / 100 ||
	    mbpool.pr_nout > mbpool.pr_hardlimit * 80 / 100)
		sblowmem = 1;
	return (sblowmem);
}
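
/*
 * The two thresholds in sbchecklowmem() form a hysteresis band.  A
 * worked example with an assumed cluster pool hard limit of 1000: the
 * low-memory flag is raised once more than 800 clusters (80%) are
 * outstanding and cleared only after usage falls below 600 (60%);
 * between 600 and 800 the previous state is retained, so the flag does
 * not flap around a single threshold.
 */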

/*
 * Free mbufs held by a socket, and reserved mbuf space.
 */
void
sbrelease(struct socket *so, struct sockbuf *sb)
{

	sbflush(so, sb);
	sb->sb_hiwat = sb->sb_mbmax = 0;
}

/*
 * Routines to add and remove
 * data from an mbuf queue.
 *
 * The routines sbappend() or sbappendrecord() are normally called to
 * append new mbufs to a socket buffer, after checking that adequate
 * space is available by comparing the result of sbspace() with the
 * amount of data to be added.  sbappendrecord() differs from sbappend()
 * in that the data supplied is treated as the beginning of a new record.
 * To place a sender's address, optional access rights, and data in a
 * socket receive buffer, sbappendaddr() should be used.  To place
 * access rights and data in a socket receive buffer, sbappendcontrol()
 * should be used.  In either case, the new data begins a new record.
 * Note that unlike sbappend() and sbappendrecord(), these routines check
 * for the caller that there will be enough space to store the data.
 * Each fails if there is not enough space, or if it cannot find mbufs
 * to store additional information in.
 *
 * Reliable protocols may use the socket send buffer to hold data
 * awaiting acknowledgement.  Data is normally copied from a socket
 * send buffer by a protocol with m_copym() for output to a peer, and
 * then removed from the socket buffer with sbdrop() or sbdroprecord()
 * when the data is acknowledged by the peer.
 */
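
/*
 * A sender-side sketch of the convention above; "m" and "len" are
 * assumptions standing in for a caller's mbuf chain and its length:
 *
 *	if (sbspace(so, &so->so_snd) < len)
 *		... wait with sbwait() or fail ...
 *	sbappend(so, &so->so_snd, m);
 *
 * A datagram protocol delivering to a receive buffer would instead use
 * sbappendaddr(), which performs the space check itself and always
 * starts a new record.
 */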

#ifdef SOCKBUF_DEBUG
void
sblastrecordchk(struct sockbuf *sb, const char *where)
{
	struct mbuf *m = sb->sb_mb;

	while (m && m->m_nextpkt)
		m = m->m_nextpkt;

	if (m != sb->sb_lastrecord) {
		printf("sblastrecordchk: sb_mb %p sb_lastrecord %p last %p\n",
		    sb->sb_mb, sb->sb_lastrecord, m);
		printf("packet chain:\n");
		for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt)
			printf("\t%p\n", m);
		panic("sblastrecordchk from %s", where);
	}
}

void
sblastmbufchk(struct sockbuf *sb, const char *where)
{
	struct mbuf *m = sb->sb_mb;
	struct mbuf *n;

	while (m && m->m_nextpkt)
		m = m->m_nextpkt;

	while (m && m->m_next)
		m = m->m_next;

	if (m != sb->sb_mbtail) {
		printf("sblastmbufchk: sb_mb %p sb_mbtail %p last %p\n",
		    sb->sb_mb, sb->sb_mbtail, m);
		printf("packet tree:\n");
		for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) {
			printf("\t");
			for (n = m; n != NULL; n = n->m_next)
				printf("%p ", n);
			printf("\n");
		}
		panic("sblastmbufchk from %s", where);
	}
}
#endif /* SOCKBUF_DEBUG */

#define	SBLINKRECORD(sb, m0)						\
do {									\
	if ((sb)->sb_lastrecord != NULL)				\
		(sb)->sb_lastrecord->m_nextpkt = (m0);			\
	else								\
		(sb)->sb_mb = (m0);					\
	(sb)->sb_lastrecord = (m0);					\
} while (/*CONSTCOND*/0)

/*
 * Append mbuf chain m to the last record in the
 * socket buffer sb.  The additional space associated with
 * the mbuf chain is recorded in sb.  Empty mbufs are
 * discarded and mbufs are compacted where possible.
 */
void
sbappend(struct socket *so, struct sockbuf *sb, struct mbuf *m)
{
	struct mbuf *n;

	if (m == NULL)
		return;

	soassertlocked(so);
	SBLASTRECORDCHK(sb, "sbappend 1");

	if ((n = sb->sb_lastrecord) != NULL) {
		/*
		 * XXX Would like to simply use sb_mbtail here, but
		 * XXX I need to verify that I won't miss an EOR that
		 * XXX way.
		 */
		do {
			if (n->m_flags & M_EOR) {
				sbappendrecord(so, sb, m); /* XXXXXX!!!! */
				return;
			}
		} while (n->m_next && (n = n->m_next));
	} else {
		/*
		 * If this is the first record in the socket buffer, it's
		 * also the last record.
		 */
		sb->sb_lastrecord = m;
	}
	sbcompress(so, sb, m, n);
	SBLASTRECORDCHK(sb, "sbappend 2");
}

/*
 * This version of sbappend() should only be used when the caller
 * absolutely knows that there will never be more than one record
 * in the socket buffer, that is, a stream protocol (such as TCP).
 */
void
sbappendstream(struct socket *so, struct sockbuf *sb, struct mbuf *m)
{
	KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
	soassertlocked(so);
	KDASSERT(m->m_nextpkt == NULL);
	KASSERT(sb->sb_mb == sb->sb_lastrecord);

	SBLASTMBUFCHK(sb, __func__);

	sbcompress(so, sb, m, sb->sb_mbtail);

	sb->sb_lastrecord = sb->sb_mb;
	SBLASTRECORDCHK(sb, __func__);
}

#ifdef SOCKBUF_DEBUG
void
sbcheck(struct socket *so, struct sockbuf *sb)
{
	struct mbuf *m, *n;
	u_long len = 0, mbcnt = 0;

	for (m = sb->sb_mb; m; m = m->m_nextpkt) {
		for (n = m; n; n = n->m_next) {
			len += n->m_len;
			mbcnt += MSIZE;
			if (n->m_flags & M_EXT)
				mbcnt += n->m_ext.ext_size;
			if (m != n && n->m_nextpkt)
				panic("sbcheck nextpkt");
		}
	}
	if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
		printf("cc %lu != %lu || mbcnt %lu != %lu\n", len, sb->sb_cc,
		    mbcnt, sb->sb_mbcnt);
		panic("sbcheck");
	}
}
#endif

/*
 * As above, except the mbuf chain
 * begins a new record.
 */
void
sbappendrecord(struct socket *so, struct sockbuf *sb, struct mbuf *m0)
{
	struct mbuf *m;

	KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
	soassertlocked(so);

	if (m0 == NULL)
		return;

	/*
	 * Put the first mbuf on the queue.
	 * Note this permits zero length records.
	 */
	sballoc(so, sb, m0);
	SBLASTRECORDCHK(sb, "sbappendrecord 1");
	SBLINKRECORD(sb, m0);
	m = m0->m_next;
	m0->m_next = NULL;
	if (m && (m0->m_flags & M_EOR)) {
		m0->m_flags &= ~M_EOR;
		m->m_flags |= M_EOR;
	}
	sbcompress(so, sb, m, m0);
	SBLASTRECORDCHK(sb, "sbappendrecord 2");
}
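
/*
 * The M_EOR handling above means a record-oriented caller can mark the
 * end of each message and keep using sbappend(); a sketch, with "m"
 * assumed to be one complete message:
 *
 *	m->m_flags |= M_EOR;
 *	sbappend(so, &so->so_rcv, m);
 *
 * Since the previous record ends in an M_EOR mbuf, sbappend() notices
 * this and diverts to sbappendrecord(), so each message lands in its
 * own record.
 */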

/*
 * Append address and data, and optionally, control (ancillary) data
 * to the receive queue of a socket.  If present,
 * m0 must include a packet header with total length.
 * Returns 0 if no space in sockbuf or insufficient mbufs.
 */
int
sbappendaddr(struct socket *so, struct sockbuf *sb, const struct sockaddr *asa,
    struct mbuf *m0, struct mbuf *control)
{
	struct mbuf *m, *n, *nlast;
	int space = asa->sa_len;

	soassertlocked(so);

	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
		panic("sbappendaddr");
	if (m0)
		space += m0->m_pkthdr.len;
	for (n = control; n; n = n->m_next) {
		space += n->m_len;
		if (n->m_next == NULL)	/* keep pointer to last control buf */
			break;
	}
	if (space > sbspace(so, sb))
		return (0);
	if (asa->sa_len > MLEN)
		return (0);
	MGET(m, M_DONTWAIT, MT_SONAME);
	if (m == NULL)
		return (0);
	m->m_len = asa->sa_len;
	memcpy(mtod(m, caddr_t), asa, asa->sa_len);
	if (n)
		n->m_next = m0;		/* concatenate data to control */
	else
		control = m0;
	m->m_next = control;

	SBLASTRECORDCHK(sb, "sbappendaddr 1");

	for (n = m; n->m_next != NULL; n = n->m_next)
		sballoc(so, sb, n);
	sballoc(so, sb, n);
	nlast = n;
	SBLINKRECORD(sb, m);

	sb->sb_mbtail = nlast;
	SBLASTMBUFCHK(sb, "sbappendaddr");

	SBLASTRECORDCHK(sb, "sbappendaddr 2");

	return (1);
}

int
sbappendcontrol(struct socket *so, struct sockbuf *sb, struct mbuf *m0,
    struct mbuf *control)
{
	struct mbuf *m, *mlast, *n;
	int space = 0;

	if (control == NULL)
		panic("sbappendcontrol");
	for (m = control; ; m = m->m_next) {
		space += m->m_len;
		if (m->m_next == NULL)
			break;
	}
	n = m;			/* save pointer to last control buffer */
	for (m = m0; m; m = m->m_next)
		space += m->m_len;
	if (space > sbspace(so, sb))
		return (0);
	n->m_next = m0;			/* concatenate data to control */

	SBLASTRECORDCHK(sb, "sbappendcontrol 1");

	for (m = control; m->m_next != NULL; m = m->m_next)
		sballoc(so, sb, m);
	sballoc(so, sb, m);
	mlast = m;
	SBLINKRECORD(sb, control);

	sb->sb_mbtail = mlast;
	SBLASTMBUFCHK(sb, "sbappendcontrol");

	SBLASTRECORDCHK(sb, "sbappendcontrol 2");

	return (1);
}
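
/*
 * Datagram input paths typically combine sbappendaddr() with a wakeup
 * and drop the packet when the buffer is full; a sketch, assuming
 * "srcsa" holds the sender's address and "m"/"opts" the data and
 * control chains:
 *
 *	if (sbappendaddr(so, &so->so_rcv, srcsa, m, opts) == 0) {
 *		m_freem(m);
 *		m_freem(opts);
 *	} else
 *		sorwakeup(so);
 */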

/*
 * Compress mbuf chain m into the socket
 * buffer sb following mbuf n.  If n
 * is null, the buffer is presumed empty.
 */
void
sbcompress(struct socket *so, struct sockbuf *sb, struct mbuf *m,
    struct mbuf *n)
{
	int eor = 0;
	struct mbuf *o;

	while (m) {
		eor |= m->m_flags & M_EOR;
		if (m->m_len == 0 &&
		    (eor == 0 ||
		    (((o = m->m_next) || (o = n)) &&
		    o->m_type == m->m_type))) {
			if (sb->sb_lastrecord == m)
				sb->sb_lastrecord = m->m_next;
			m = m_free(m);
			continue;
		}
		if (n && (n->m_flags & M_EOR) == 0 &&
		    /* m_trailingspace() checks buffer writeability */
		    m->m_len <= ((n->m_flags & M_EXT)? n->m_ext.ext_size :
		    MCLBYTES) / 4 && /* XXX Don't copy too much */
		    m->m_len <= m_trailingspace(n) &&
		    n->m_type == m->m_type) {
			memcpy(mtod(n, caddr_t) + n->m_len, mtod(m, caddr_t),
			    m->m_len);
			n->m_len += m->m_len;
			sb->sb_cc += m->m_len;
			if (m->m_type != MT_CONTROL && m->m_type != MT_SONAME)
				sb->sb_datacc += m->m_len;
			m = m_free(m);
			continue;
		}
		if (n)
			n->m_next = m;
		else
			sb->sb_mb = m;
		sb->sb_mbtail = m;
		sballoc(so, sb, m);
		n = m;
		m->m_flags &= ~M_EOR;
		m = m->m_next;
		n->m_next = NULL;
	}
	if (eor) {
		if (n)
			n->m_flags |= eor;
		else
			printf("semi-panic: sbcompress");
	}
	SBLASTMBUFCHK(sb, __func__);
}

/*
 * Free all mbufs in a sockbuf.
 * Check that all resources are reclaimed.
 */
void
sbflush(struct socket *so, struct sockbuf *sb)
{
	KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
	KASSERT((sb->sb_flags & SB_LOCK) == 0);

	while (sb->sb_mbcnt)
		sbdrop(so, sb, (int)sb->sb_cc);

	KASSERT(sb->sb_cc == 0);
	KASSERT(sb->sb_datacc == 0);
	KASSERT(sb->sb_mb == NULL);
	KASSERT(sb->sb_mbtail == NULL);
	KASSERT(sb->sb_lastrecord == NULL);
}

/*
 * Drop data from (the front of) a sockbuf.
 */
void
sbdrop(struct socket *so, struct sockbuf *sb, int len)
{
	struct mbuf *m, *mn;
	struct mbuf *next;

	KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
	soassertlocked(so);

	next = (m = sb->sb_mb) ? m->m_nextpkt : NULL;
	while (len > 0) {
		if (m == NULL) {
			if (next == NULL)
				panic("sbdrop");
			m = next;
			next = m->m_nextpkt;
			continue;
		}
		if (m->m_len > len) {
			m->m_len -= len;
			m->m_data += len;
			sb->sb_cc -= len;
			if (m->m_type != MT_CONTROL && m->m_type != MT_SONAME)
				sb->sb_datacc -= len;
			break;
		}
		len -= m->m_len;
		sbfree(so, sb, m);
		mn = m_free(m);
		m = mn;
	}
	while (m && m->m_len == 0) {
		sbfree(so, sb, m);
		mn = m_free(m);
		m = mn;
	}
	if (m) {
		sb->sb_mb = m;
		m->m_nextpkt = next;
	} else
		sb->sb_mb = next;
	/*
	 * First part is an inline SB_EMPTY_FIXUP().  Second part
	 * makes sure sb_lastrecord is up-to-date if we dropped
	 * part of the last record.
	 */
	m = sb->sb_mb;
	if (m == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (m->m_nextpkt == NULL)
		sb->sb_lastrecord = m;
}

/*
 * Drop a record off the front of a sockbuf
 * and move the next record to the front.
 */
void
sbdroprecord(struct socket *so, struct sockbuf *sb)
{
	struct mbuf *m, *mn;

	m = sb->sb_mb;
	if (m) {
		sb->sb_mb = m->m_nextpkt;
		do {
			sbfree(so, sb, m);
			mn = m_free(m);
		} while ((m = mn) != NULL);
	}
	SB_EMPTY_FIXUP(sb);
}
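
/*
 * sbdrop() is the trim half of the retransmission pattern described
 * earlier: a reliable protocol keeps sent data in so_snd, copies it out
 * with m_copym(), and drops it once acknowledged.  A sketch, assuming
 * "off"/"len" describe the segment being sent and "acked" bytes were
 * just acknowledged by the peer:
 *
 *	m = m_copym(so->so_snd.sb_mb, off, len, M_DONTWAIT);
 *	... transmit m ...
 *	sbdrop(so, &so->so_snd, acked);
 */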

/*
 * Create a "control" mbuf containing the specified data
 * with the specified type for presentation on a socket buffer.
 */
struct mbuf *
sbcreatecontrol(const void *p, size_t size, int type, int level)
{
	struct cmsghdr *cp;
	struct mbuf *m;

	if (CMSG_SPACE(size) > MCLBYTES) {
		printf("sbcreatecontrol: message too large %zu\n", size);
		return (NULL);
	}

	if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL)
		return (NULL);
	if (CMSG_SPACE(size) > MLEN) {
		MCLGET(m, M_DONTWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return NULL;
		}
	}
	cp = mtod(m, struct cmsghdr *);
	memset(cp, 0, CMSG_SPACE(size));
	memcpy(CMSG_DATA(cp), p, size);
	m->m_len = CMSG_SPACE(size);
	cp->cmsg_len = CMSG_LEN(size);
	cp->cmsg_level = level;
	cp->cmsg_type = type;
	return (m);
}
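
/*
 * For example, an input path asked for packet timestamps might build
 * the control record like this before handing it to sbappendaddr();
 * a sketch, with "tv" assumed to be a filled-in struct timeval:
 *
 *	control = sbcreatecontrol(&tv, sizeof(tv), SCM_TIMESTAMP,
 *	    SOL_SOCKET);
 *
 * A NULL return means the message exceeded a cluster or no mbuf was
 * available, and the caller must deliver without the control data.
 */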