/*	$OpenBSD: uipc_socket2.c,v 1.106 2021/02/11 20:28:57 mvs Exp $	*/
/*	$NetBSD: uipc_socket2.c,v 1.11 1996/02/04 02:17:55 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket2.c	8.1 (Berkeley) 6/10/93
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/domain.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/event.h>
#include <sys/pool.h>

/*
 * Primitive routines for operating on sockets and socket buffers
 */

u_long	sb_max = SB_MAX;		/* patchable */

extern struct pool mclpools[];
extern struct pool mbpool;

extern struct rwlock unp_lock;

/*
 * Procedures to manipulate state flags of socket
 * and do appropriate wakeups.  Normal sequence from the
 * active (originating) side is that soisconnecting() is
 * called during processing of connect() call,
 * resulting in an eventual call to soisconnected() if/when the
 * connection is established.  When the connection is torn down
 * soisdisconnecting() is called during processing of disconnect() call,
 * and soisdisconnected() is called when the connection to the peer
 * is totally severed.  The semantics of these routines are such that
 * connectionless protocols can call soisconnected() and soisdisconnected()
 * only, bypassing the in-progress calls when setting up a ``connection''
 * takes no time.
 *
 * From the passive side, a socket is created with
 * two queues of sockets: so_q0 for connections in progress
 * and so_q for connections already made and awaiting user acceptance.
 * As a protocol is preparing incoming connections, it creates a socket
 * structure queued on so_q0 by calling sonewconn().  When the connection
 * is established, soisconnected() is called, and transfers the
 * socket structure to so_q, making it available to accept().
 *
 * If a socket is closed with sockets on either
 * so_q0 or so_q, these sockets are dropped.
 *
 * If higher level protocols are implemented in
 * the kernel, the wakeups done here will sometimes
 * cause software-interrupt process scheduling.
 */
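/*
 * Illustrative sketch of the passive-side sequence described above (the
 * surrounding protocol code is assumed, not taken from this file):
 *
 *	so = sonewconn(head, 0);	queued on head's so_q0
 *	if (so == NULL)
 *		goto drop;		listener full or out of memory
 *	...				protocol completes its handshake
 *	soisconnected(so);		moves so to so_q, wakes accept()
 *
 * A connectionless protocol instead calls soisconnected() and
 * soisdisconnected() directly, as noted above.
 */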
void
soisconnecting(struct socket *so)
{
	soassertlocked(so);
	so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= SS_ISCONNECTING;
}

void
soisconnected(struct socket *so)
{
	struct socket *head = so->so_head;

	soassertlocked(so);
	so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING);
	so->so_state |= SS_ISCONNECTED;
	if (head && soqremque(so, 0)) {
		soqinsque(head, so, 1);
		sorwakeup(head);
		wakeup_one(&head->so_timeo);
	} else {
		wakeup(&so->so_timeo);
		sorwakeup(so);
		sowwakeup(so);
	}
}

void
soisdisconnecting(struct socket *so)
{
	soassertlocked(so);
	so->so_state &= ~SS_ISCONNECTING;
	so->so_state |= (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE);
	wakeup(&so->so_timeo);
	sowwakeup(so);
	sorwakeup(so);
}

void
soisdisconnected(struct socket *so)
{
	soassertlocked(so);
	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_ISDISCONNECTED);
	wakeup(&so->so_timeo);
	sowwakeup(so);
	sorwakeup(so);
}

/*
 * When an attempt at a new connection is noted on a socket
 * which accepts connections, sonewconn is called.  If the
 * connection is possible (subject to space constraints, etc.)
 * then we allocate a new structure, properly linked into the
 * data structure of the original socket, and return this.
 * Connstatus may be 0 or SS_ISCONNECTED.
 */
struct socket *
sonewconn(struct socket *head, int connstatus)
{
	struct socket *so;
	int soqueue = connstatus ? 1 : 0;

	/*
	 * XXXSMP as long as `so' and `head' share the same lock, we
	 * can call soreserve() and pr_attach() below w/o explicitly
	 * locking `so'.
	 */
	soassertlocked(head);

	if (mclpools[0].pr_nout > mclpools[0].pr_hardlimit * 95 / 100)
		return (NULL);
	if (head->so_qlen + head->so_q0len > head->so_qlimit * 3)
		return (NULL);
	so = pool_get(&socket_pool, PR_NOWAIT|PR_ZERO);
	if (so == NULL)
		return (NULL);
	so->so_type = head->so_type;
	so->so_options = head->so_options &~ SO_ACCEPTCONN;
	so->so_linger = head->so_linger;
	so->so_state = head->so_state | SS_NOFDREF;
	so->so_proto = head->so_proto;
	so->so_timeo = head->so_timeo;
	so->so_euid = head->so_euid;
	so->so_ruid = head->so_ruid;
	so->so_egid = head->so_egid;
	so->so_rgid = head->so_rgid;
	so->so_cpid = head->so_cpid;

	/*
	 * Inherit watermarks but those may get clamped in low mem situations.
	 */
	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
		pool_put(&socket_pool, so);
		return (NULL);
	}
	so->so_snd.sb_wat = head->so_snd.sb_wat;
	so->so_snd.sb_lowat = head->so_snd.sb_lowat;
	so->so_snd.sb_timeo_nsecs = head->so_snd.sb_timeo_nsecs;
	so->so_rcv.sb_wat = head->so_rcv.sb_wat;
	so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
	so->so_rcv.sb_timeo_nsecs = head->so_rcv.sb_timeo_nsecs;

	sigio_init(&so->so_sigio);
	sigio_copy(&so->so_sigio, &head->so_sigio);

	soqinsque(head, so, soqueue);
	if ((*so->so_proto->pr_attach)(so, 0)) {
		(void) soqremque(so, soqueue);
		sigio_free(&so->so_sigio);
		pool_put(&socket_pool, so);
		return (NULL);
	}
	if (connstatus) {
		sorwakeup(head);
		wakeup(&head->so_timeo);
		so->so_state |= connstatus;
	}
	return (so);
}
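/*
 * Worked example of the backlog check in sonewconn() above (values
 * assumed): a listener set up with listen(s, 128) gets so_qlimit == 128
 * (subject to the system's somaxconn limit), so new embryonic sockets
 * are refused once so_q0len + so_qlen exceeds 128 * 3 == 384.
 */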
void
soqinsque(struct socket *head, struct socket *so, int q)
{
	soassertlocked(head);

#ifdef DIAGNOSTIC
	if (so->so_onq != NULL)
		panic("soqinsque");
#endif

	so->so_head = head;
	if (q == 0) {
		head->so_q0len++;
		so->so_onq = &head->so_q0;
	} else {
		head->so_qlen++;
		so->so_onq = &head->so_q;
	}
	TAILQ_INSERT_TAIL(so->so_onq, so, so_qe);
}

int
soqremque(struct socket *so, int q)
{
	struct socket *head = so->so_head;

	soassertlocked(head);

	if (q == 0) {
		if (so->so_onq != &head->so_q0)
			return (0);
		head->so_q0len--;
	} else {
		if (so->so_onq != &head->so_q)
			return (0);
		head->so_qlen--;
	}
	TAILQ_REMOVE(so->so_onq, so, so_qe);
	so->so_onq = NULL;
	so->so_head = NULL;
	return (1);
}
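/*
 * Illustrative sketch (the pattern is the one soisconnected() above
 * uses): soqremque()'s return value tells the caller whether the socket
 * was still on the queue it expected, so a move between queues is
 * written as
 *
 *	if (head && soqremque(so, 0)) {
 *		soqinsque(head, so, 1);		from so_q0 to so_q
 *		sorwakeup(head);
 *	}
 */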
/*
 * Socantsendmore indicates that no more data will be sent on the
 * socket; it would normally be applied to a socket when the user
 * informs the system that no more data is to be sent, by the protocol
 * code (in the case of PRU_SHUTDOWN).  Socantrcvmore indicates that no
 * more data will be received, and will normally be applied to the socket
 * by a protocol when it detects that the peer will send no more data.
 * Data queued for reading in the socket may yet be read.
 */

void
socantsendmore(struct socket *so)
{
	soassertlocked(so);
	so->so_state |= SS_CANTSENDMORE;
	sowwakeup(so);
}

void
socantrcvmore(struct socket *so)
{
	soassertlocked(so);
	so->so_state |= SS_CANTRCVMORE;
	sorwakeup(so);
}

int
solock(struct socket *so)
{
	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		NET_LOCK();
		break;
	case PF_UNIX:
		rw_enter_write(&unp_lock);
		break;
	case PF_ROUTE:
	case PF_KEY:
	default:
		KERNEL_LOCK();
		break;
	}

	return (SL_LOCKED);
}

void
sounlock(struct socket *so, int s)
{
	KASSERT(s == SL_LOCKED || s == SL_NOUNLOCK);

	if (s != SL_LOCKED)
		return;

	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		NET_UNLOCK();
		break;
	case PF_UNIX:
		rw_exit_write(&unp_lock);
		break;
	case PF_ROUTE:
	case PF_KEY:
	default:
		KERNEL_UNLOCK();
		break;
	}
}

void
soassertlocked(struct socket *so)
{
	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		NET_ASSERT_LOCKED();
		break;
	case PF_UNIX:
		rw_assert_wrlock(&unp_lock);
		break;
	case PF_ROUTE:
	case PF_KEY:
	default:
		KERNEL_ASSERT_LOCKED();
		break;
	}
}

int
sosleep_nsec(struct socket *so, void *ident, int prio, const char *wmesg,
    uint64_t nsecs)
{
	int ret;

	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		ret = rwsleep_nsec(ident, &netlock, prio, wmesg, nsecs);
		break;
	case PF_UNIX:
		ret = rwsleep_nsec(ident, &unp_lock, prio, wmesg, nsecs);
		break;
	case PF_ROUTE:
	case PF_KEY:
	default:
		ret = tsleep_nsec(ident, prio, wmesg, nsecs);
		break;
	}

	return ret;
}

/*
 * Wait for data to arrive at/drain from a socket buffer.
 */
int
sbwait(struct socket *so, struct sockbuf *sb)
{
	int prio = (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH;

	soassertlocked(so);

	sb->sb_flags |= SB_WAIT;
	return sosleep_nsec(so, &sb->sb_cc, prio, "netio", sb->sb_timeo_nsecs);
}

int
sblock(struct socket *so, struct sockbuf *sb, int wait)
{
	int error, prio = (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH;

	soassertlocked(so);

	if ((sb->sb_flags & SB_LOCK) == 0) {
		sb->sb_flags |= SB_LOCK;
		return (0);
	}
	if (wait & M_NOWAIT)
		return (EWOULDBLOCK);

	while (sb->sb_flags & SB_LOCK) {
		sb->sb_flags |= SB_WANT;
		error = sosleep_nsec(so, &sb->sb_flags, prio, "netlck", INFSLP);
		if (error)
			return (error);
	}
	sb->sb_flags |= SB_LOCK;
	return (0);
}

void
sbunlock(struct socket *so, struct sockbuf *sb)
{
	soassertlocked(so);

	sb->sb_flags &= ~SB_LOCK;
	if (sb->sb_flags & SB_WANT) {
		sb->sb_flags &= ~SB_WANT;
		wakeup(&sb->sb_flags);
	}
}
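/*
 * Illustrative sketch (the caller and its error label are assumptions):
 * the usual pattern for exclusive use of a socket buffer, bracketing
 * the work with sblock()/sbunlock() while the socket lock is held.
 *
 *	error = sblock(so, &so->so_snd, M_WAITOK);
 *	if (error)
 *		goto out;
 *	...					manipulate so->so_snd
 *	sbunlock(so, &so->so_snd);
 */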
/*
 * Wakeup processes waiting on a socket buffer.
 * Do asynchronous notification via SIGIO
 * if the socket has the SS_ASYNC flag set.
 */
void
sowakeup(struct socket *so, struct sockbuf *sb)
{
	soassertlocked(so);

	sb->sb_flags &= ~SB_SEL;
	if (sb->sb_flags & SB_WAIT) {
		sb->sb_flags &= ~SB_WAIT;
		wakeup(&sb->sb_cc);
	}
	if (so->so_state & SS_ASYNC)
		pgsigio(&so->so_sigio, SIGIO, 0);
	selwakeup(&sb->sb_sel);
}

/*
 * Socket buffer (struct sockbuf) utility routines.
 *
 * Each socket contains two socket buffers: one for sending data and
 * one for receiving data.  Each buffer contains a queue of mbufs,
 * information about the number of mbufs and amount of data in the
 * queue, and other fields allowing select() statements and notification
 * on data availability to be implemented.
 *
 * Data stored in a socket buffer is maintained as a list of records.
 * Each record is a list of mbufs chained together with the m_next
 * field.  Records are chained together with the m_nextpkt field.  The upper
 * level routine soreceive() expects the following conventions to be
 * observed when placing information in the receive buffer:
 *
 * 1. If the protocol requires each message be preceded by the sender's
 *    name, then a record containing that name must be present before
 *    any associated data (mbuf's must be of type MT_SONAME).
 * 2. If the protocol supports the exchange of ``access rights'' (really
 *    just additional data associated with the message), and there are
 *    ``rights'' to be received, then a record containing this data
 *    should be present (mbuf's must be of type MT_CONTROL).
 * 3. If a name or rights record exists, then it must be followed by
 *    a data record, perhaps of zero length.
 *
 * Before using a new socket structure it is first necessary to reserve
 * buffer space to the socket, by calling sbreserve().  This should commit
 * some of the available buffer space in the system buffer pool for the
 * socket (currently, it does nothing but enforce limits).  The space
 * should be released by calling sbrelease() when the socket is destroyed.
 */

int
soreserve(struct socket *so, u_long sndcc, u_long rcvcc)
{
	soassertlocked(so);

	if (sbreserve(so, &so->so_snd, sndcc))
		goto bad;
	if (sbreserve(so, &so->so_rcv, rcvcc))
		goto bad2;
	so->so_snd.sb_wat = sndcc;
	so->so_rcv.sb_wat = rcvcc;
	if (so->so_rcv.sb_lowat == 0)
		so->so_rcv.sb_lowat = 1;
	if (so->so_snd.sb_lowat == 0)
		so->so_snd.sb_lowat = MCLBYTES;
	if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
		so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
	return (0);
bad2:
	sbrelease(so, &so->so_snd);
bad:
	return (ENOBUFS);
}

/*
 * Allot mbufs to a sockbuf.
 * Attempt to scale mbmax so that mbcnt doesn't become limiting
 * if buffering efficiency is near the normal case.
 */
int
sbreserve(struct socket *so, struct sockbuf *sb, u_long cc)
{
	KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
	soassertlocked(so);

	if (cc == 0 || cc > sb_max)
		return (1);
	sb->sb_hiwat = cc;
	sb->sb_mbmax = max(3 * MAXMCLBYTES, cc * 8);
	if (sb->sb_lowat > sb->sb_hiwat)
		sb->sb_lowat = sb->sb_hiwat;
	return (0);
}
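/*
 * Illustrative sketch (a hypothetical pr_attach routine; sendspace and
 * recvspace are placeholder tunables, not defined in this file):
 * protocols normally reserve buffer space once, at attach time.
 *
 *	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
 *		error = soreserve(so, sendspace, recvspace);
 *		if (error)
 *			return (error);
 *	}
 */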
/*
 * In low memory situation, do not accept any greater than normal request.
 */
int
sbcheckreserve(u_long cnt, u_long defcnt)
{
	if (cnt > defcnt && sbchecklowmem())
		return (ENOBUFS);
	return (0);
}

int
sbchecklowmem(void)
{
	static int sblowmem;

	if (mclpools[0].pr_nout < mclpools[0].pr_hardlimit * 60 / 100 ||
	    mbpool.pr_nout < mbpool.pr_hardlimit * 60 / 100)
		sblowmem = 0;
	if (mclpools[0].pr_nout > mclpools[0].pr_hardlimit * 80 / 100 ||
	    mbpool.pr_nout > mbpool.pr_hardlimit * 80 / 100)
		sblowmem = 1;
	return (sblowmem);
}

/*
 * Free mbufs held by a socket, and reserved mbuf space.
 */
void
sbrelease(struct socket *so, struct sockbuf *sb)
{

	sbflush(so, sb);
	sb->sb_hiwat = sb->sb_mbmax = 0;
}

/*
 * Routines to add and remove
 * data from an mbuf queue.
 *
 * The routines sbappend() or sbappendrecord() are normally called to
 * append new mbufs to a socket buffer, after checking that adequate
 * space is available, comparing the function sbspace() with the amount
 * of data to be added.  sbappendrecord() differs from sbappend() in
 * that data supplied is treated as the beginning of a new record.
 * To place a sender's address, optional access rights, and data in a
 * socket receive buffer, sbappendaddr() should be used.  To place
 * access rights and data in a socket receive buffer, sbappendcontrol()
 * should be used.  In either case, the new data begins a new record.
 * Note that unlike sbappend() and sbappendrecord(), these routines check
 * for the caller that there will be enough space to store the data.
 * Each fails if there is not enough space, or if it cannot find mbufs
 * to store additional information in.
 *
 * Reliable protocols may use the socket send buffer to hold data
 * awaiting acknowledgement.  Data is normally copied from a socket
 * send buffer in a protocol with m_copym for output to a peer, and
 * then removed from the socket buffer with sbdrop() or sbdroprecord()
 * when the data is acknowledged by the peer.
 */
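/*
 * Illustrative sketch (a receive-side caller is assumed): the space
 * check that the comment above expects callers of sbappend() to make
 * for themselves.
 *
 *	if (sbspace(so, &so->so_rcv) < m->m_pkthdr.len) {
 *		m_freem(m);			no room; drop
 *		return;
 *	}
 *	sbappend(so, &so->so_rcv, m);
 *	sorwakeup(so);
 */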
#ifdef SOCKBUF_DEBUG
void
sblastrecordchk(struct sockbuf *sb, const char *where)
{
	struct mbuf *m = sb->sb_mb;

	while (m && m->m_nextpkt)
		m = m->m_nextpkt;

	if (m != sb->sb_lastrecord) {
		printf("sblastrecordchk: sb_mb %p sb_lastrecord %p last %p\n",
		    sb->sb_mb, sb->sb_lastrecord, m);
		printf("packet chain:\n");
		for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt)
			printf("\t%p\n", m);
		panic("sblastrecordchk from %s", where);
	}
}

void
sblastmbufchk(struct sockbuf *sb, const char *where)
{
	struct mbuf *m = sb->sb_mb;
	struct mbuf *n;

	while (m && m->m_nextpkt)
		m = m->m_nextpkt;

	while (m && m->m_next)
		m = m->m_next;

	if (m != sb->sb_mbtail) {
		printf("sblastmbufchk: sb_mb %p sb_mbtail %p last %p\n",
		    sb->sb_mb, sb->sb_mbtail, m);
		printf("packet tree:\n");
		for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) {
			printf("\t");
			for (n = m; n != NULL; n = n->m_next)
				printf("%p ", n);
			printf("\n");
		}
		panic("sblastmbufchk from %s", where);
	}
}
#endif /* SOCKBUF_DEBUG */

#define	SBLINKRECORD(sb, m0)						\
do {									\
	if ((sb)->sb_lastrecord != NULL)				\
		(sb)->sb_lastrecord->m_nextpkt = (m0);			\
	else								\
		(sb)->sb_mb = (m0);					\
	(sb)->sb_lastrecord = (m0);					\
} while (/*CONSTCOND*/0)

/*
 * Append mbuf chain m to the last record in the
 * socket buffer sb.  The additional space associated with
 * the mbuf chain is recorded in sb.  Empty mbufs are
 * discarded and mbufs are compacted where possible.
 */
void
sbappend(struct socket *so, struct sockbuf *sb, struct mbuf *m)
{
	struct mbuf *n;

	if (m == NULL)
		return;

	soassertlocked(so);
	SBLASTRECORDCHK(sb, "sbappend 1");

	if ((n = sb->sb_lastrecord) != NULL) {
		/*
		 * XXX Would like to simply use sb_mbtail here, but
		 * XXX I need to verify that I won't miss an EOR that
		 * XXX way.
		 */
		do {
			if (n->m_flags & M_EOR) {
				sbappendrecord(so, sb, m); /* XXXXXX!!!! */
				return;
			}
		} while (n->m_next && (n = n->m_next));
	} else {
		/*
		 * If this is the first record in the socket buffer, it's
		 * also the last record.
		 */
		sb->sb_lastrecord = m;
	}
	sbcompress(sb, m, n);
	SBLASTRECORDCHK(sb, "sbappend 2");
}
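/*
 * Illustrative sketch (a record-oriented caller is assumed): marking
 * the end of a record so that, per the M_EOR check in sbappend() above,
 * later data starts a new record rather than extending this one.
 *
 *	m->m_flags |= M_EOR;
 *	sbappendrecord(so, &so->so_rcv, m);
 */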
/*
 * This version of sbappend() should only be used when the caller
 * absolutely knows that there will never be more than one record
 * in the socket buffer, that is, a stream protocol (such as TCP).
 */
void
sbappendstream(struct socket *so, struct sockbuf *sb, struct mbuf *m)
{
	KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
	soassertlocked(so);
	KDASSERT(m->m_nextpkt == NULL);
	KASSERT(sb->sb_mb == sb->sb_lastrecord);

	SBLASTMBUFCHK(sb, __func__);

	sbcompress(sb, m, sb->sb_mbtail);

	sb->sb_lastrecord = sb->sb_mb;
	SBLASTRECORDCHK(sb, __func__);
}

#ifdef SOCKBUF_DEBUG
void
sbcheck(struct sockbuf *sb)
{
	struct mbuf *m, *n;
	u_long len = 0, mbcnt = 0;

	for (m = sb->sb_mb; m; m = m->m_nextpkt) {
		for (n = m; n; n = n->m_next) {
			len += n->m_len;
			mbcnt += MSIZE;
			if (n->m_flags & M_EXT)
				mbcnt += n->m_ext.ext_size;
			if (m != n && n->m_nextpkt)
				panic("sbcheck nextpkt");
		}
	}
	if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
		printf("cc %lu != %lu || mbcnt %lu != %lu\n", len, sb->sb_cc,
		    mbcnt, sb->sb_mbcnt);
		panic("sbcheck");
	}
}
#endif

/*
 * As above, except the mbuf chain
 * begins a new record.
 */
void
sbappendrecord(struct socket *so, struct sockbuf *sb, struct mbuf *m0)
{
	struct mbuf *m;

	KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
	soassertlocked(so);

	if (m0 == NULL)
		return;

	/*
	 * Put the first mbuf on the queue.
	 * Note this permits zero length records.
	 */
	sballoc(sb, m0);
	SBLASTRECORDCHK(sb, "sbappendrecord 1");
	SBLINKRECORD(sb, m0);
	m = m0->m_next;
	m0->m_next = NULL;
	if (m && (m0->m_flags & M_EOR)) {
		m0->m_flags &= ~M_EOR;
		m->m_flags |= M_EOR;
	}
	sbcompress(sb, m, m0);
	SBLASTRECORDCHK(sb, "sbappendrecord 2");
}

/*
 * As above except that OOB data
 * is inserted at the beginning of the sockbuf,
 * but after any other OOB data.
 */
void
sbinsertoob(struct sockbuf *sb, struct mbuf *m0)
{
	struct mbuf *m, **mp;

	if (m0 == NULL)
		return;

	SBLASTRECORDCHK(sb, "sbinsertoob 1");

	for (mp = &sb->sb_mb; (m = *mp) != NULL; mp = &((*mp)->m_nextpkt)) {
	    again:
		switch (m->m_type) {

		case MT_OOBDATA:
			continue;		/* WANT next train */

		case MT_CONTROL:
			if ((m = m->m_next) != NULL)
				goto again;	/* inspect THIS train further */
		}
		break;
	}
	/*
	 * Put the first mbuf on the queue.
	 * Note this permits zero length records.
	 */
	sballoc(sb, m0);
	m0->m_nextpkt = *mp;
	if (*mp == NULL) {
		/* m0 is actually the new tail */
		sb->sb_lastrecord = m0;
	}
	*mp = m0;
	m = m0->m_next;
	m0->m_next = NULL;
	if (m && (m0->m_flags & M_EOR)) {
		m0->m_flags &= ~M_EOR;
		m->m_flags |= M_EOR;
	}
	sbcompress(sb, m, m0);
	SBLASTRECORDCHK(sb, "sbinsertoob 2");
}
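/*
 * Illustrative sketch (a datagram protocol's input path is assumed;
 * srcsa and control are placeholders): sbappendaddr(), defined below,
 * checks space itself, so the caller only handles the failure.
 *
 *	if (sbappendaddr(so, &so->so_rcv, srcsa, m, control) == 0) {
 *		m_freem(m);			no space; count the drop
 *		m_freem(control);
 *	} else
 *		sorwakeup(so);
 */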
/*
 * Append address and data, and optionally, control (ancillary) data
 * to the receive queue of a socket.  If present,
 * m0 must include a packet header with total length.
 * Returns 0 if no space in sockbuf or insufficient mbufs.
 */
int
sbappendaddr(struct socket *so, struct sockbuf *sb, const struct sockaddr *asa,
    struct mbuf *m0, struct mbuf *control)
{
	struct mbuf *m, *n, *nlast;
	int space = asa->sa_len;

	soassertlocked(so);

	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
		panic("sbappendaddr");
	if (m0)
		space += m0->m_pkthdr.len;
	for (n = control; n; n = n->m_next) {
		space += n->m_len;
		if (n->m_next == NULL)	/* keep pointer to last control buf */
			break;
	}
	if (space > sbspace(so, sb))
		return (0);
	if (asa->sa_len > MLEN)
		return (0);
	MGET(m, M_DONTWAIT, MT_SONAME);
	if (m == NULL)
		return (0);
	m->m_len = asa->sa_len;
	memcpy(mtod(m, caddr_t), asa, asa->sa_len);
	if (n)
		n->m_next = m0;		/* concatenate data to control */
	else
		control = m0;
	m->m_next = control;

	SBLASTRECORDCHK(sb, "sbappendaddr 1");

	for (n = m; n->m_next != NULL; n = n->m_next)
		sballoc(sb, n);
	sballoc(sb, n);
	nlast = n;
	SBLINKRECORD(sb, m);

	sb->sb_mbtail = nlast;
	SBLASTMBUFCHK(sb, "sbappendaddr");

	SBLASTRECORDCHK(sb, "sbappendaddr 2");

	return (1);
}

int
sbappendcontrol(struct socket *so, struct sockbuf *sb, struct mbuf *m0,
    struct mbuf *control)
{
	struct mbuf *m, *mlast, *n;
	int space = 0;

	if (control == NULL)
		panic("sbappendcontrol");
	for (m = control; ; m = m->m_next) {
		space += m->m_len;
		if (m->m_next == NULL)
			break;
	}
	n = m;			/* save pointer to last control buffer */
	for (m = m0; m; m = m->m_next)
		space += m->m_len;
	if (space > sbspace(so, sb))
		return (0);
	n->m_next = m0;			/* concatenate data to control */

	SBLASTRECORDCHK(sb, "sbappendcontrol 1");

	for (m = control; m->m_next != NULL; m = m->m_next)
		sballoc(sb, m);
	sballoc(sb, m);
	mlast = m;
	SBLINKRECORD(sb, control);

	sb->sb_mbtail = mlast;
	SBLASTMBUFCHK(sb, "sbappendcontrol");

	SBLASTRECORDCHK(sb, "sbappendcontrol 2");

	return (1);
}

/*
 * Compress mbuf chain m into the socket
 * buffer sb following mbuf n.  If n
 * is null, the buffer is presumed empty.
 */
void
sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n)
{
	int eor = 0;
	struct mbuf *o;

	while (m) {
		eor |= m->m_flags & M_EOR;
		if (m->m_len == 0 &&
		    (eor == 0 ||
		    (((o = m->m_next) || (o = n)) &&
		    o->m_type == m->m_type))) {
			if (sb->sb_lastrecord == m)
				sb->sb_lastrecord = m->m_next;
			m = m_free(m);
			continue;
		}
		if (n && (n->m_flags & M_EOR) == 0 &&
		    /* m_trailingspace() checks buffer writeability */
		    m->m_len <= ((n->m_flags & M_EXT)? n->m_ext.ext_size :
		    MCLBYTES) / 4 && /* XXX Don't copy too much */
		    m->m_len <= m_trailingspace(n) &&
		    n->m_type == m->m_type) {
			memcpy(mtod(n, caddr_t) + n->m_len, mtod(m, caddr_t),
			    m->m_len);
			n->m_len += m->m_len;
			sb->sb_cc += m->m_len;
			if (m->m_type != MT_CONTROL && m->m_type != MT_SONAME)
				sb->sb_datacc += m->m_len;
			m = m_free(m);
			continue;
		}
		if (n)
			n->m_next = m;
		else
			sb->sb_mb = m;
		sb->sb_mbtail = m;
		sballoc(sb, m);
		n = m;
		m->m_flags &= ~M_EOR;
		m = m->m_next;
		n->m_next = NULL;
	}
	if (eor) {
		if (n)
			n->m_flags |= eor;
		else
			printf("semi-panic: sbcompress");
	}
	SBLASTMBUFCHK(sb, __func__);
}
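/*
 * Worked example of the copy heuristic above (sizes assumed): if n is
 * backed by a 2048-byte cluster, the threshold is 2048 / 4 == 512
 * bytes, so a 50-byte mbuf m whose data fits in m_trailingspace(n) is
 * copied into n and freed, instead of being linked in as another mbuf.
 */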
/*
 * Free all mbufs in a sockbuf.
 * Check that all resources are reclaimed.
 */
void
sbflush(struct socket *so, struct sockbuf *sb)
{
	KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
	KASSERT((sb->sb_flags & SB_LOCK) == 0);

	while (sb->sb_mbcnt)
		sbdrop(so, sb, (int)sb->sb_cc);

	KASSERT(sb->sb_cc == 0);
	KASSERT(sb->sb_datacc == 0);
	KASSERT(sb->sb_mb == NULL);
	KASSERT(sb->sb_mbtail == NULL);
	KASSERT(sb->sb_lastrecord == NULL);
}

/*
 * Drop data from (the front of) a sockbuf.
 */
void
sbdrop(struct socket *so, struct sockbuf *sb, int len)
{
	struct mbuf *m, *mn;
	struct mbuf *next;

	KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
	soassertlocked(so);

	next = (m = sb->sb_mb) ? m->m_nextpkt : NULL;
	while (len > 0) {
		if (m == NULL) {
			if (next == NULL)
				panic("sbdrop");
			m = next;
			next = m->m_nextpkt;
			continue;
		}
		if (m->m_len > len) {
			m->m_len -= len;
			m->m_data += len;
			sb->sb_cc -= len;
			if (m->m_type != MT_CONTROL && m->m_type != MT_SONAME)
				sb->sb_datacc -= len;
			break;
		}
		len -= m->m_len;
		sbfree(sb, m);
		mn = m_free(m);
		m = mn;
	}
	while (m && m->m_len == 0) {
		sbfree(sb, m);
		mn = m_free(m);
		m = mn;
	}
	if (m) {
		sb->sb_mb = m;
		m->m_nextpkt = next;
	} else
		sb->sb_mb = next;
	/*
	 * First part is an inline SB_EMPTY_FIXUP().  Second part
	 * makes sure sb_lastrecord is up-to-date if we dropped
	 * part of the last record.
	 */
	m = sb->sb_mb;
	if (m == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (m->m_nextpkt == NULL)
		sb->sb_lastrecord = m;
}

/*
 * Drop a record off the front of a sockbuf
 * and move the next record to the front.
 */
void
sbdroprecord(struct sockbuf *sb)
{
	struct mbuf *m, *mn;

	m = sb->sb_mb;
	if (m) {
		sb->sb_mb = m->m_nextpkt;
		do {
			sbfree(sb, m);
			mn = m_free(m);
		} while ((m = mn) != NULL);
	}
	SB_EMPTY_FIXUP(sb);
}

/*
 * Create a "control" mbuf containing the specified data
 * with the specified type for presentation on a socket buffer.
 */
struct mbuf *
sbcreatecontrol(const void *p, size_t size, int type, int level)
{
	struct cmsghdr *cp;
	struct mbuf *m;

	if (CMSG_SPACE(size) > MCLBYTES) {
		printf("sbcreatecontrol: message too large %zu\n", size);
		return (NULL);
	}

	if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL)
		return (NULL);
	if (CMSG_SPACE(size) > MLEN) {
		MCLGET(m, M_DONTWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return NULL;
		}
	}
	cp = mtod(m, struct cmsghdr *);
	memset(cp, 0, CMSG_SPACE(size));
	memcpy(CMSG_DATA(cp), p, size);
	m->m_len = CMSG_SPACE(size);
	cp->cmsg_len = CMSG_LEN(size);
	cp->cmsg_level = level;
	cp->cmsg_type = type;
	return (m);
}
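/*
 * Illustrative sketch (a hypothetical caller; the timestamp option is
 * just an example payload): building a control mbuf with
 * sbcreatecontrol() and queueing it together with data and the
 * sender's address.
 *
 *	struct timeval tv;
 *	struct mbuf *control;
 *
 *	microtime(&tv);
 *	control = sbcreatecontrol(&tv, sizeof(tv), SCM_TIMESTAMP,
 *	    SOL_SOCKET);
 *	if (sbappendaddr(so, &so->so_rcv, srcsa, m, control) == 0) {
 *		m_freem(m);
 *		m_freem(control);
 *	}
 */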