/*	$NetBSD: uipc_socket2.c,v 1.147 2024/12/07 02:31:14 riastradh Exp $	*/

/*-
 * Copyright (c) 2008 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket2.c	8.2 (Berkeley) 2/14/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uipc_socket2.c,v 1.147 2024/12/07 02:31:14 riastradh Exp $");

#ifdef _KERNEL_OPT
#include "opt_ddb.h"
#include "opt_inet.h"
#include "opt_mbuftrace.h"
#include "opt_sb_max.h"
#endif

#include <sys/param.h>
#include <sys/types.h>

#include <sys/buf.h>
#include <sys/domain.h>
#include <sys/file.h>
#include <sys/kauth.h>
#include <sys/mbuf.h>
#include <sys/poll.h>
#include <sys/pool.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/sdt.h>
#include <sys/signalvar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/systm.h>
#include <sys/uidinfo.h>

#ifdef DDB
#include <sys/filedesc.h>
#include <ddb/db_active.h>
#endif

/*
 * Primitive routines for operating on sockets and socket buffers.
 *
 * Connection life-cycle:
 *
 *	Normal sequence from the active (originating) side:
 *
 *	- soisconnecting() is called during processing of connect() call,
 *	- resulting in an eventual call to soisconnected() if/when the
 *	  connection is established.
 *
 *	When the connection is torn down during processing of disconnect():
 *
 *	- soisdisconnecting() is called and,
 *	- soisdisconnected() is called when the connection to the peer
 *	  is totally severed.
 *
 *	The semantics of these routines are such that connectionless protocols
 *	can call soisconnected() and soisdisconnected() only, bypassing the
 *	in-progress calls when setting up a ``connection'' takes no time.
 *
 *	From the passive side, a socket is created with two queues of sockets:
 *
 *	- so_q0 (0) for partial connections (i.e. connections in progress)
 *	- so_q (1) for connections already made and awaiting user acceptance.
 *
 *	As a protocol is preparing incoming connections, it creates a socket
 *	structure queued on so_q0 by calling sonewconn().  When the connection
 *	is established, soisconnected() is called, and transfers the
 *	socket structure to so_q, making it available to accept().
 *
 *	If a socket is closed with sockets on either so_q0 or so_q, these
 *	sockets are dropped.
 *
 * Locking rules and assumptions:
 *
 * o socket::so_lock can change on the fly.  The low level routines used
 *   to lock sockets are aware of this.  When so_lock is acquired, the
 *   routine locking must check to see if so_lock still points to the
 *   lock that was acquired.  If so_lock has changed in the meantime, the
 *   now irrelevant lock that was acquired must be dropped and the lock
 *   operation retried.  Although not proven here, this is completely safe
 *   on a multiprocessor system, even with relaxed memory ordering, given
 *   the next two rules:
 *
 * o In order to mutate so_lock, the lock pointed to by the current value
 *   of so_lock must be held: i.e., the socket must be held locked by the
 *   changing thread.  The thread must issue membar_release() to prevent
 *   memory accesses being reordered, and can set so_lock to the desired
 *   value.  If the lock pointed to by the new value of so_lock is not
 *   held by the changing thread, the socket must then be considered
 *   unlocked.
 *
 * o If so_lock is mutated, and the previous lock referred to by so_lock
 *   could still be visible to other threads in the system (e.g. via file
 *   descriptor or protocol-internal reference), then the old lock must
 *   remain valid until the socket and/or protocol control block has been
 *   torn down.
 *
 * o If a socket has a non-NULL so_head value (i.e. is in the process of
 *   connecting), then locking the socket must also lock the socket pointed
 *   to by so_head: their lock pointers must match.
 *
 * o If a socket has connections in progress (so_q, so_q0 not empty) then
 *   locking the socket must also lock the sockets attached to both queues.
 *   Again, their lock pointers must match.
 *
 * o Beyond the initial lock assignment in socreate(), assigning locks to
 *   sockets is the responsibility of the individual protocols / protocol
 *   domains.
 */

static pool_cache_t	socket_cache;
u_long			sb_max = SB_MAX;	/* maximum socket buffer size */
static u_long		sb_max_adj;		/* adjusted sb_max */

void
soisconnecting(struct socket *so)
{

	KASSERT(solocked(so));

	so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= SS_ISCONNECTING;
}

void
soisconnected(struct socket *so)
{
	struct socket	*head;

	head = so->so_head;

	KASSERT(solocked(so));
	KASSERT(head == NULL || solocked2(so, head));

	so->so_state &= ~(SS_ISCONNECTING | SS_ISDISCONNECTING);
	so->so_state |= SS_ISCONNECTED;
	if (head && so->so_onq == &head->so_q0) {
		if ((so->so_options & SO_ACCEPTFILTER) == 0) {
			/*
			 * Re-enqueue and wake up any waiters, e.g.
			 * processes blocking on accept().
			 */
			soqremque(so, 0);
			soqinsque(head, so, 1);
			sorwakeup(head);
			cv_broadcast(&head->so_cv);
		} else {
			so->so_upcall =
			    head->so_accf->so_accept_filter->accf_callback;
			so->so_upcallarg = head->so_accf->so_accept_filter_arg;
			so->so_rcv.sb_flags |= SB_UPCALL;
			so->so_options &= ~SO_ACCEPTFILTER;
			(*so->so_upcall)(so, so->so_upcallarg,
			    POLLIN|POLLRDNORM, M_DONTWAIT);
		}
	} else {
		cv_broadcast(&so->so_cv);
		sorwakeup(so);
		sowwakeup(so);
	}
}

void
soisdisconnecting(struct socket *so)
{

	KASSERT(solocked(so));

	so->so_state &= ~SS_ISCONNECTING;
	so->so_state |= (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE);
	cv_broadcast(&so->so_cv);
	sowwakeup(so);
	sorwakeup(so);
}

void
soisdisconnected(struct socket *so)
{

	KASSERT(solocked(so));

	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_ISDISCONNECTED);
	cv_broadcast(&so->so_cv);
	sowwakeup(so);
	sorwakeup(so);
}

void
soinit2(void)
{

	socket_cache = pool_cache_init(sizeof(struct socket), 0, 0, 0,
	    "socket", NULL, IPL_SOFTNET, NULL, NULL, NULL);
}
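
/*
 * Illustrative sketch (not part of the original sources): how a
 * connection-oriented protocol might drive the state transitions
 * described in the comment at the top of this file.  The example_pr_*
 * names and the "established"/"peer_closed" flags are hypothetical;
 * real protocols (e.g. TCP) make these calls from their state machines.
 */
#if 0
static void
example_pr_connect_done(struct socket *so, bool established)
{

	KASSERT(solocked(so));
	if (established)
		soisconnected(so);	/* wakes accept()/connect() waiters */
	else
		soisdisconnected(so);	/* connection attempt failed */
}

static void
example_pr_disconnect(struct socket *so, bool peer_closed)
{

	KASSERT(solocked(so));
	if (peer_closed)
		soisdisconnected(so);	/* peer is gone, tear down now */
	else
		soisdisconnecting(so);	/* wait for the peer to finish */
}
#endif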

/*
 * sonewconn: accept a new connection.
 *
 * When an attempt at a new connection is noted on a socket which accepts
 * connections, sonewconn(9) is called.  If the connection is possible
 * (subject to space constraints, etc) then we allocate a new structure,
 * properly linked into the data structure of the original socket.
 *
 * => If 'soready' is true, then socket will become ready for accept() i.e.
 *    inserted into the so_q queue, SS_ISCONNECTED set and waiters awoken.
 * => May be called from soft-interrupt context.
 * => Listening socket should be locked.
 * => Returns the new socket locked.
 */
struct socket *
sonewconn(struct socket *head, bool soready)
{
	struct socket *so;
	int soqueue, error;

	KASSERT(solocked(head));

	if (head->so_qlen + head->so_q0len > 3 * head->so_qlimit / 2) {
		/*
		 * Listen queue overflow.  If there is an accept filter
		 * active, pass through the oldest cxn it's handling.
		 */
		if (head->so_accf == NULL) {
			return NULL;
		} else {
			struct socket *so2, *next;

			/* Pass the oldest connection waiting in the
			   accept filter */
			for (so2 = TAILQ_FIRST(&head->so_q0);
			     so2 != NULL; so2 = next) {
				next = TAILQ_NEXT(so2, so_qe);
				if (so2->so_upcall == NULL) {
					continue;
				}
				so2->so_upcall = NULL;
				so2->so_upcallarg = NULL;
				so2->so_options &= ~SO_ACCEPTFILTER;
				so2->so_rcv.sb_flags &= ~SB_UPCALL;
				soisconnected(so2);
				break;
			}

			/* If nothing was nudged out of the accept filter,
			 * bail out; otherwise proceed allocating the socket. */
			if (so2 == NULL) {
				return NULL;
			}
		}
	}
	if ((head->so_options & SO_ACCEPTFILTER) != 0) {
		soready = false;
	}
	soqueue = soready ? 1 : 0;

	if ((so = soget(false)) == NULL) {
		return NULL;
	}
	so->so_type = head->so_type;
	so->so_options = head->so_options & ~SO_ACCEPTCONN;
	so->so_linger = head->so_linger;
	so->so_state = head->so_state | SS_NOFDREF;
	so->so_proto = head->so_proto;
	so->so_timeo = head->so_timeo;
	so->so_pgid = head->so_pgid;
	so->so_send = head->so_send;
	so->so_receive = head->so_receive;
	so->so_uidinfo = head->so_uidinfo;
	so->so_egid = head->so_egid;
	so->so_cpid = head->so_cpid;

	/*
	 * Share the lock with the listening-socket, it may get unshared
	 * once the connection is complete.
	 *
	 * so_lock is stable while we hold the socket locked, so no
	 * need for atomic_load_* here.
	 */
	mutex_obj_hold(head->so_lock);
	so->so_lock = head->so_lock;

	/*
	 * Reserve the space for socket buffers.
	 */
#ifdef MBUFTRACE
	so->so_mowner = head->so_mowner;
	so->so_rcv.sb_mowner = head->so_rcv.sb_mowner;
	so->so_snd.sb_mowner = head->so_snd.sb_mowner;
#endif
	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
		goto out;
	}
	so->so_snd.sb_lowat = head->so_snd.sb_lowat;
	so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
	so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
	so->so_snd.sb_timeo = head->so_snd.sb_timeo;
	so->so_rcv.sb_flags |= head->so_rcv.sb_flags & (SB_AUTOSIZE | SB_ASYNC);
	so->so_snd.sb_flags |= head->so_snd.sb_flags & (SB_AUTOSIZE | SB_ASYNC);

	/*
	 * Finally, perform the protocol attach.  Note: a new socket
	 * lock may be assigned at this point (if so, it will be held).
	 */
	error = (*so->so_proto->pr_usrreqs->pr_attach)(so, 0);
	if (error) {
out:
		KASSERT(solocked(so));
		KASSERT(so->so_accf == NULL);
		soput(so);

		/* Note: the listening socket shall stay locked. */
		KASSERT(solocked(head));
		return NULL;
	}
	KASSERT(solocked2(head, so));

	/*
	 * Insert into the queue.  If ready, update the connection status
	 * and wake up any waiters, e.g. processes blocking on accept().
	 */
	soqinsque(head, so, soqueue);
	if (soready) {
		so->so_state |= SS_ISCONNECTED;
		sorwakeup(head);
		cv_broadcast(&head->so_cv);
	}
	return so;
}
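
/*
 * Illustrative sketch (not part of the original sources): a typical
 * passive-open path in a protocol's input routine.  A connection
 * request arrives on a listening socket; the protocol creates the
 * embryonic socket with sonewconn() and later promotes it with
 * soisconnected() once its handshake completes (TCP-style), or it
 * passes soready = true to make the socket acceptable immediately.
 * example_pr_input_listen() is a hypothetical name.
 */
#if 0
static struct socket *
example_pr_input_listen(struct socket *head)
{
	struct socket *so;

	KASSERT(solocked(head));

	/* Queue on so_q0; not yet ready for accept(). */
	so = sonewconn(head, false);
	if (so == NULL)
		return NULL;	/* listen queue full or no memory */

	/* ... protocol handshake runs; when it completes: */
	soisconnected(so);	/* moves the socket to so_q */
	return so;
}
#endif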

struct socket *
soget(bool waitok)
{
	struct socket *so;

	so = pool_cache_get(socket_cache, (waitok ? PR_WAITOK : PR_NOWAIT));
	if (__predict_false(so == NULL))
		return (NULL);
	memset(so, 0, sizeof(*so));
	TAILQ_INIT(&so->so_q0);
	TAILQ_INIT(&so->so_q);
	cv_init(&so->so_cv, "socket");
	cv_init(&so->so_rcv.sb_cv, "netio");
	cv_init(&so->so_snd.sb_cv, "netio");
	selinit(&so->so_rcv.sb_sel);
	selinit(&so->so_snd.sb_sel);
	so->so_rcv.sb_so = so;
	so->so_snd.sb_so = so;
	return so;
}

void
soput(struct socket *so)
{

	KASSERT(!cv_has_waiters(&so->so_cv));
	KASSERT(!cv_has_waiters(&so->so_rcv.sb_cv));
	KASSERT(!cv_has_waiters(&so->so_snd.sb_cv));
	seldestroy(&so->so_rcv.sb_sel);
	seldestroy(&so->so_snd.sb_sel);
	mutex_obj_free(so->so_lock);
	cv_destroy(&so->so_cv);
	cv_destroy(&so->so_rcv.sb_cv);
	cv_destroy(&so->so_snd.sb_cv);
	pool_cache_put(socket_cache, so);
}

/*
 * soqinsque: insert socket of a new connection into the specified
 * accept queue of the listening socket (head).
 *
 *	q = 0: queue of partial connections
 *	q = 1: queue of incoming connections
 */
void
soqinsque(struct socket *head, struct socket *so, int q)
{
	KASSERT(q == 0 || q == 1);
	KASSERT(solocked2(head, so));
	KASSERT(so->so_onq == NULL);
	KASSERT(so->so_head == NULL);

	so->so_head = head;
	if (q == 0) {
		head->so_q0len++;
		so->so_onq = &head->so_q0;
	} else {
		head->so_qlen++;
		so->so_onq = &head->so_q;
	}
	TAILQ_INSERT_TAIL(so->so_onq, so, so_qe);
}

/*
 * soqremque: remove socket from the specified queue.
 *
 * => Returns true if socket was removed from the specified queue.
 * => False if socket was not removed (because it was in other queue).
 */
bool
soqremque(struct socket *so, int q)
{
	struct socket *head = so->so_head;

	KASSERT(q == 0 || q == 1);
	KASSERT(solocked(so));
	KASSERT(so->so_onq != NULL);
	KASSERT(head != NULL);

	if (q == 0) {
		if (so->so_onq != &head->so_q0)
			return false;
		head->so_q0len--;
	} else {
		if (so->so_onq != &head->so_q)
			return false;
		head->so_qlen--;
	}
	KASSERT(solocked2(so, head));
	TAILQ_REMOVE(so->so_onq, so, so_qe);
	so->so_onq = NULL;
	so->so_head = NULL;
	return true;
}

/*
 * socantsendmore: indicates that no more data will be sent on the
 * socket; it would normally be applied to a socket by the protocol
 * code when the user informs the system that no more data is to be
 * sent (i.e. from pr_shutdown()).
 */
void
socantsendmore(struct socket *so)
{
	KASSERT(solocked(so));

	so->so_state |= SS_CANTSENDMORE;
	sowwakeup(so);
}

/*
 * socantrcvmore(): indicates that no more data will be received and
 * will normally be applied to the socket by a protocol when it detects
 * that the peer will send no more data.  Data queued for reading in
 * the socket may yet be read.
 */
void
socantrcvmore(struct socket *so)
{
	KASSERT(solocked(so));

	so->so_state |= SS_CANTRCVMORE;
	sorwakeup(so);
}
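
/*
 * Illustrative sketch (not part of the original sources): how a
 * protocol typically uses the two routines above.  When the local
 * side shuts down writing, pr_shutdown() calls socantsendmore();
 * when the peer signals end-of-stream (e.g. a TCP FIN), the input
 * path calls socantrcvmore().  example_pr_peer_eof() is hypothetical.
 */
#if 0
static void
example_pr_peer_eof(struct socket *so)
{

	KASSERT(solocked(so));
	socantrcvmore(so);	/* readers drain what is queued, then see EOF */
}
#endif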

/*
 * soroverflow(): indicates that incoming data was discarded because
 * the socket's receive buffer overflowed.
 */
void
soroverflow(struct socket *so)
{
	KASSERT(solocked(so));

	so->so_rcv.sb_overflowed++;
	if (so->so_options & SO_RERROR) {
		so->so_rerror = SET_ERROR(ENOBUFS);
		sorwakeup(so);
	}
}

/*
 * Wait for data to arrive at/drain from a socket buffer.
 */
int
sbwait(struct sockbuf *sb)
{
	struct socket *so;
	kmutex_t *lock;
	int error;

	so = sb->sb_so;

	KASSERT(solocked(so));

	sb->sb_flags |= SB_NOTIFY;
	lock = so->so_lock;
	if ((sb->sb_flags & SB_NOINTR) != 0)
		error = cv_timedwait(&sb->sb_cv, lock, sb->sb_timeo);
	else
		error = cv_timedwait_sig(&sb->sb_cv, lock, sb->sb_timeo);
	if (__predict_false(lock != atomic_load_relaxed(&so->so_lock)))
		solockretry(so, lock);
	return error;
}

/*
 * Wakeup processes waiting on a socket buffer.
 * Do asynchronous notification via SIGIO
 * if the socket buffer has the SB_ASYNC flag set.
 */
void
sowakeup(struct socket *so, struct sockbuf *sb, int code)
{
	int band;

	KASSERT(solocked(so));
	KASSERT(sb->sb_so == so);

	switch (code) {
	case POLL_IN:
		band = POLLIN|POLLRDNORM;
		break;

	case POLL_OUT:
		band = POLLOUT|POLLWRNORM;
		break;

	case POLL_HUP:
		band = POLLHUP;
		break;

	default:
		band = 0;
#ifdef DIAGNOSTIC
		printf("bad siginfo code %d in socket notification.\n", code);
#endif
		break;
	}

	sb->sb_flags &= ~SB_NOTIFY;
	selnotify(&sb->sb_sel, band, NOTE_SUBMIT);
	cv_broadcast(&sb->sb_cv);
	if (sb->sb_flags & SB_ASYNC)
		fownsignal(so->so_pgid, SIGIO, code, band, so);
	if (sb->sb_flags & SB_UPCALL)
		(*so->so_upcall)(so, so->so_upcallarg, band, M_DONTWAIT);
}

/*
 * Reset a socket's lock pointer.  Wake all threads waiting on the
 * socket's condition variables so that they can restart their waits
 * using the new lock.  The existing lock must be held.
 *
 * Caller must have issued membar_release before this.
 */
void
solockreset(struct socket *so, kmutex_t *lock)
{

	KASSERT(solocked(so));

	so->so_lock = lock;
	cv_broadcast(&so->so_snd.sb_cv);
	cv_broadcast(&so->so_rcv.sb_cv);
	cv_broadcast(&so->so_cv);
}
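
/*
 * Illustrative sketch (not part of the original sources): the usual
 * producer/consumer pairing of sbwait() and the wakeup macros.  A
 * consumer sleeps on the receive buffer under the socket lock; the
 * protocol's input path appends data and calls sorwakeup(), which
 * ends up in sowakeup().  example_wait_for_data() is hypothetical;
 * soreceive() implements the real thing.
 */
#if 0
static int
example_wait_for_data(struct socket *so)
{
	int error = 0;

	KASSERT(solocked(so));
	while (so->so_rcv.sb_cc == 0 &&
	    (so->so_state & SS_CANTRCVMORE) == 0) {
		error = sbwait(&so->so_rcv);	/* may drop/retake so_lock */
		if (error)
			break;
	}
	return error;
}
#endif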

/*
 * Socket buffer (struct sockbuf) utility routines.
 *
 * Each socket contains two socket buffers: one for sending data and
 * one for receiving data.  Each buffer contains a queue of mbufs,
 * information about the number of mbufs and amount of data in the
 * queue, and other fields allowing poll() statements and notification
 * on data availability to be implemented.
 *
 * Data stored in a socket buffer is maintained as a list of records.
 * Each record is a list of mbufs chained together with the m_next
 * field.  Records are chained together with the m_nextpkt field.  The upper
 * level routine soreceive() expects the following conventions to be
 * observed when placing information in the receive buffer:
 *
 * 1. If the protocol requires each message be preceded by the sender's
 *    name, then a record containing that name must be present before
 *    any associated data (mbuf's must be of type MT_SONAME).
 * 2. If the protocol supports the exchange of ``access rights'' (really
 *    just additional data associated with the message), and there are
 *    ``rights'' to be received, then a record containing this data
 *    should be present (mbuf's must be of type MT_CONTROL).
 * 3. If a name or rights record exists, then it must be followed by
 *    a data record, perhaps of zero length.
 *
 * Before using a new socket structure it is first necessary to reserve
 * buffer space to the socket, by calling sbreserve().  This should commit
 * some of the available buffer space in the system buffer pool for the
 * socket (currently, it does nothing but enforce limits).  The space
 * should be released by calling sbrelease() when the socket is destroyed.
 */

int
sb_max_set(u_long new_sbmax)
{
	int s;

	if (new_sbmax < (16 * 1024))
		return SET_ERROR(EINVAL);

	s = splsoftnet();
	sb_max = new_sbmax;
	sb_max_adj = (u_quad_t)new_sbmax * MCLBYTES / (MSIZE + MCLBYTES);
	splx(s);

	return (0);
}

int
soreserve(struct socket *so, u_long sndcc, u_long rcvcc)
{
	KASSERT(so->so_pcb == NULL || solocked(so));

	/*
	 * There's at least one application (the configure script of
	 * screen) which expects a fifo to be writable even if it
	 * already has "some" bytes in its buffer, so we want to make
	 * sure (hiwat - lowat) >= (some bytes).
	 *
	 * PIPE_BUF here is an arbitrary value chosen as (some bytes) above.
	 * We expect it's large enough for such applications.
	 */
	u_long lowat = MAX(sock_loan_thresh, MCLBYTES);
	u_long hiwat = lowat + PIPE_BUF;

	if (sndcc < hiwat)
		sndcc = hiwat;
	if (sbreserve(&so->so_snd, sndcc, so) == 0)
		goto bad;
	if (sbreserve(&so->so_rcv, rcvcc, so) == 0)
		goto bad2;
	if (so->so_rcv.sb_lowat == 0)
		so->so_rcv.sb_lowat = 1;
	if (so->so_snd.sb_lowat == 0)
		so->so_snd.sb_lowat = lowat;
	if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
		so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
	return (0);
 bad2:
	sbrelease(&so->so_snd, so);
 bad:
	return SET_ERROR(ENOBUFS);
}

/*
 * Allot mbufs to a sockbuf.
 * Attempt to scale mbmax so that mbcnt doesn't become limiting
 * if buffering efficiency is near the normal case.
 */
int
sbreserve(struct sockbuf *sb, u_long cc, struct socket *so)
{
	struct lwp *l = curlwp; /* XXX */
	rlim_t maxcc;
	struct uidinfo *uidinfo;

	KASSERT(so->so_pcb == NULL || solocked(so));
	KASSERT(sb->sb_so == so);
	KASSERT(sb_max_adj != 0);

	if (cc == 0 || cc > sb_max_adj)
		return (0);

	maxcc = l->l_proc->p_rlimit[RLIMIT_SBSIZE].rlim_cur;

	uidinfo = so->so_uidinfo;
	if (!chgsbsize(uidinfo, &sb->sb_hiwat, cc, maxcc))
		return 0;
	sb->sb_mbmax = uimin(cc * 2, sb_max);
	if (sb->sb_lowat > sb->sb_hiwat)
		sb->sb_lowat = sb->sb_hiwat;

	return (1);
}

/*
 * Free mbufs held by a socket, and reserved mbuf space.  We do not assert
 * that the socket is held locked here: see sorflush().
 */
void
sbrelease(struct sockbuf *sb, struct socket *so)
{

	KASSERT(sb->sb_so == so);

	sbflush(sb);
	(void)chgsbsize(so->so_uidinfo, &sb->sb_hiwat, 0, RLIM_INFINITY);
	sb->sb_mbmax = 0;
}
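
/*
 * Illustrative sketch (not part of the original sources): protocols
 * normally call soreserve() from their pr_attach routine to set the
 * initial send and receive buffer sizes.  The EXAMPLE_* constants and
 * example_pr_attach() are hypothetical placeholders; real protocols
 * use tunables such as tcp_sendspace/tcp_recvspace.
 */
#if 0
#define	EXAMPLE_SENDSPACE	(8 * 1024)
#define	EXAMPLE_RECVSPACE	(16 * 1024)

static int
example_pr_attach(struct socket *so, int proto)
{
	int error = 0;

	sosetlock(so);
	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0)
		error = soreserve(so, EXAMPLE_SENDSPACE, EXAMPLE_RECVSPACE);
	return error;
}
#endif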

/*
 * Routines to add and remove
 * data from an mbuf queue.
 *
 * The routines sbappend() or sbappendrecord() are normally called to
 * append new mbufs to a socket buffer, after checking that adequate
 * space is available, by comparing the result of sbspace() with the
 * amount of data to be added.  sbappendrecord() differs from sbappend()
 * in that data supplied is treated as the beginning of a new record.
 * To place a sender's address, optional access rights, and data in a
 * socket receive buffer, sbappendaddr() should be used.  To place
 * access rights and data in a socket receive buffer, sbappendrights()
 * should be used.  In either case, the new data begins a new record.
 * Note that unlike sbappend() and sbappendrecord(), these routines check
 * for the caller that there will be enough space to store the data.
 * Each fails if there is not enough space, or if it cannot find mbufs
 * to store additional information in.
 *
 * Reliable protocols may use the socket send buffer to hold data
 * awaiting acknowledgement.  Data is normally copied from a socket
 * send buffer in a protocol with m_copym for output to a peer; the
 * data is then removed from the socket buffer with sbdrop() or
 * sbdroprecord() when it is acknowledged by the peer.
 */

#ifdef SOCKBUF_DEBUG
void
sblastrecordchk(struct sockbuf *sb, const char *where)
{
	struct mbuf *m = sb->sb_mb;

	KASSERT(solocked(sb->sb_so));

	while (m && m->m_nextpkt)
		m = m->m_nextpkt;

	if (m != sb->sb_lastrecord) {
		printf("sblastrecordchk: sb_mb %p sb_lastrecord %p last %p\n",
		    sb->sb_mb, sb->sb_lastrecord, m);
		printf("packet chain:\n");
		for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt)
			printf("\t%p\n", m);
		panic("sblastrecordchk from %s", where);
	}
}

void
sblastmbufchk(struct sockbuf *sb, const char *where)
{
	struct mbuf *m = sb->sb_mb;
	struct mbuf *n;

	KASSERT(solocked(sb->sb_so));

	while (m && m->m_nextpkt)
		m = m->m_nextpkt;

	while (m && m->m_next)
		m = m->m_next;

	if (m != sb->sb_mbtail) {
		printf("sblastmbufchk: sb_mb %p sb_mbtail %p last %p\n",
		    sb->sb_mb, sb->sb_mbtail, m);
		printf("packet tree:\n");
		for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) {
			printf("\t");
			for (n = m; n != NULL; n = n->m_next)
				printf("%p ", n);
			printf("\n");
		}
		panic("sblastmbufchk from %s", where);
	}
}
#endif /* SOCKBUF_DEBUG */

/*
 * Link a chain of records onto a socket buffer
 */
#define	SBLINKRECORDCHAIN(sb, m0, mlast)				\
do {									\
	if ((sb)->sb_lastrecord != NULL)				\
		(sb)->sb_lastrecord->m_nextpkt = (m0);			\
	else								\
		(sb)->sb_mb = (m0);					\
	(sb)->sb_lastrecord = (mlast);					\
} while (/*CONSTCOND*/0)


#define	SBLINKRECORD(sb, m0)						\
    SBLINKRECORDCHAIN(sb, m0, m0)

/*
 * Append mbuf chain m to the last record in the
 * socket buffer sb.  The additional space associated
 * with the mbuf chain is recorded in sb.  Empty mbufs are
 * discarded and mbufs are compacted where possible.
 */
void
sbappend(struct sockbuf *sb, struct mbuf *m)
{
	struct mbuf	*n;

	KASSERT(solocked(sb->sb_so));

	if (m == NULL)
		return;

#ifdef MBUFTRACE
	m_claimm(m, sb->sb_mowner);
#endif

	SBLASTRECORDCHK(sb, "sbappend 1");

	if ((n = sb->sb_lastrecord) != NULL) {
		/*
		 * XXX Would like to simply use sb_mbtail here, but
		 * XXX I need to verify that I won't miss an EOR that
		 * XXX way.
		 */
		do {
			if (n->m_flags & M_EOR) {
				sbappendrecord(sb, m); /* XXXXXX!!!! */
				return;
			}
		} while (n->m_next && (n = n->m_next));
	} else {
		/*
		 * If this is the first record in the socket buffer, it's
		 * also the last record.
		 */
		sb->sb_lastrecord = m;
	}
	sbcompress(sb, m, n);
	SBLASTRECORDCHK(sb, "sbappend 2");
}

/*
 * This version of sbappend() should only be used when the caller
 * absolutely knows that there will never be more than one record
 * in the socket buffer, that is, a stream protocol (such as TCP).
 */
void
sbappendstream(struct sockbuf *sb, struct mbuf *m)
{

	KASSERT(solocked(sb->sb_so));
	KDASSERT(m->m_nextpkt == NULL);
	KASSERT(sb->sb_mb == sb->sb_lastrecord);

	SBLASTMBUFCHK(sb, __func__);

#ifdef MBUFTRACE
	m_claimm(m, sb->sb_mowner);
#endif

	sbcompress(sb, m, sb->sb_mbtail);

	sb->sb_lastrecord = sb->sb_mb;
	SBLASTRECORDCHK(sb, __func__);
}

#ifdef SOCKBUF_DEBUG
void
sbcheck(struct sockbuf *sb)
{
	struct mbuf	*m, *m2;
	u_long		len, mbcnt;

	KASSERT(solocked(sb->sb_so));

	len = 0;
	mbcnt = 0;
	for (m = sb->sb_mb; m; m = m->m_nextpkt) {
		for (m2 = m; m2 != NULL; m2 = m2->m_next) {
			len += m2->m_len;
			mbcnt += MSIZE;
			if (m2->m_flags & M_EXT)
				mbcnt += m2->m_ext.ext_size;
			if (m2->m_nextpkt != NULL)
				panic("sbcheck nextpkt");
		}
	}
	if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
		printf("cc %lu != %lu || mbcnt %lu != %lu\n", len, sb->sb_cc,
		    mbcnt, sb->sb_mbcnt);
		panic("sbcheck");
	}
}
#endif

/*
 * As above, except the mbuf chain
 * begins a new record.
 */
void
sbappendrecord(struct sockbuf *sb, struct mbuf *m0)
{
	struct mbuf	*m;

	KASSERT(solocked(sb->sb_so));

	if (m0 == NULL)
		return;

#ifdef MBUFTRACE
	m_claimm(m0, sb->sb_mowner);
#endif
	/*
	 * Put the first mbuf on the queue.
	 * Note this permits zero length records.
	 */
	sballoc(sb, m0);
	SBLASTRECORDCHK(sb, "sbappendrecord 1");
	SBLINKRECORD(sb, m0);
	m = m0->m_next;
	m0->m_next = 0;
	if (m && (m0->m_flags & M_EOR)) {
		m0->m_flags &= ~M_EOR;
		m->m_flags |= M_EOR;
	}
	sbcompress(sb, m, m0);
	SBLASTRECORDCHK(sb, "sbappendrecord 2");
}

/*
 * As above except that OOB data
 * is inserted at the beginning of the sockbuf,
 * but after any other OOB data.
 */
void
sbinsertoob(struct sockbuf *sb, struct mbuf *m0)
{
	struct mbuf	*m, **mp;

	KASSERT(solocked(sb->sb_so));

	if (m0 == NULL)
		return;

	SBLASTRECORDCHK(sb, "sbinsertoob 1");

	for (mp = &sb->sb_mb; (m = *mp) != NULL; mp = &((*mp)->m_nextpkt)) {
	    again:
		switch (m->m_type) {

		case MT_OOBDATA:
			continue;		/* WANT next train */

		case MT_CONTROL:
			if ((m = m->m_next) != NULL)
				goto again;	/* inspect THIS train further */
		}
		break;
	}
	/*
	 * Put the first mbuf on the queue.
	 * Note this permits zero length records.
	 */
	sballoc(sb, m0);
	m0->m_nextpkt = *mp;
	if (*mp == NULL) {
		/* m0 is actually the new tail */
		sb->sb_lastrecord = m0;
	}
	*mp = m0;
	m = m0->m_next;
	m0->m_next = 0;
	if (m && (m0->m_flags & M_EOR)) {
		m0->m_flags &= ~M_EOR;
		m->m_flags |= M_EOR;
	}
	sbcompress(sb, m, m0);
	SBLASTRECORDCHK(sb, "sbinsertoob 2");
}
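
/*
 * Illustrative sketch (not part of the original sources): how a
 * datagram protocol's input path typically queues a packet for a
 * receiving socket, in the record format soreceive() expects (an
 * MT_SONAME record, optional MT_CONTROL, then the data).  The
 * example_dgram_deliver() name and the "from" argument are
 * assumptions; udp_input() does essentially this.
 */
#if 0
static void
example_dgram_deliver(struct socket *so, const struct sockaddr *from,
    struct mbuf *m)
{

	KASSERT(solocked(so));
	if (sbappendaddr(&so->so_rcv, from, m, NULL) == 0) {
		soroverflow(so);	/* no room: count it and drop */
		m_freem(m);
		return;
	}
	sorwakeup(so);			/* notify readers and poll()ers */
}
#endif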

/*
 * Append address and data, and optionally, control (ancillary) data
 * to the receive queue of a socket.  If present,
 * m0 must include a packet header with total length.
 * Returns 0 if no space in sockbuf or insufficient mbufs.
 */
int
sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0,
	struct mbuf *control)
{
	struct mbuf	*m, *n, *nlast;
	int		space, len;

	KASSERT(solocked(sb->sb_so));

	space = asa->sa_len;

	if (m0 != NULL) {
		if ((m0->m_flags & M_PKTHDR) == 0)
			panic("sbappendaddr");
		space += m0->m_pkthdr.len;
#ifdef MBUFTRACE
		m_claimm(m0, sb->sb_mowner);
#endif
	}
	for (n = control; n; n = n->m_next) {
		space += n->m_len;
		MCLAIM(n, sb->sb_mowner);
		if (n->m_next == NULL)	/* keep pointer to last control buf */
			break;
	}
	if (space > sbspace(sb))
		return (0);
	m = m_get(M_DONTWAIT, MT_SONAME);
	if (m == NULL)
		return (0);
	MCLAIM(m, sb->sb_mowner);
	/*
	 * XXX avoid 'comparison always true' warning which isn't easily
	 * avoided.
	 */
	len = asa->sa_len;
	if (len > MLEN) {
		MEXTMALLOC(m, asa->sa_len, M_NOWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return (0);
		}
	}
	m->m_len = asa->sa_len;
	memcpy(mtod(m, void *), asa, asa->sa_len);
	if (n)
		n->m_next = m0;		/* concatenate data to control */
	else
		control = m0;
	m->m_next = control;

	SBLASTRECORDCHK(sb, "sbappendaddr 1");

	for (n = m; n->m_next != NULL; n = n->m_next)
		sballoc(sb, n);
	sballoc(sb, n);
	nlast = n;
	SBLINKRECORD(sb, m);

	sb->sb_mbtail = nlast;
	SBLASTMBUFCHK(sb, "sbappendaddr");
	SBLASTRECORDCHK(sb, "sbappendaddr 2");

	return (1);
}

/*
 * Helper for sbappendchainaddr: prepend a struct sockaddr* to
 * an mbuf chain.
 */
static inline struct mbuf *
m_prepend_sockaddr(struct sockbuf *sb, struct mbuf *m0,
		   const struct sockaddr *asa)
{
	struct mbuf *m;
	const int salen = asa->sa_len;

	KASSERT(solocked(sb->sb_so));

	/* only the first in each chain need be a pkthdr */
	m = m_gethdr(M_DONTWAIT, MT_SONAME);
	if (m == NULL)
		return NULL;
	MCLAIM(m, sb->sb_mowner);
#ifdef notyet
	if (salen > MHLEN) {
		MEXTMALLOC(m, salen, M_NOWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return NULL;
		}
	}
#else
	KASSERT(salen <= MHLEN);
#endif
	m->m_len = salen;
	memcpy(mtod(m, void *), asa, salen);
	m->m_next = m0;
	m->m_pkthdr.len = salen + m0->m_pkthdr.len;

	return m;
}

int
sbappendaddrchain(struct sockbuf *sb, const struct sockaddr *asa,
		  struct mbuf *m0, int sbprio)
{
	struct mbuf *m, *n, *n0, *nlast;
	int error;

	KASSERT(solocked(sb->sb_so));

	/*
	 * XXX sbprio reserved for encoding priority of this request:
	 *  SB_PRIO_NONE --> honour normal sb limits
	 *  SB_PRIO_ONESHOT_OVERFLOW --> if socket has any space,
	 *	take whole chain.  Intended for large requests
	 *	that should be delivered atomically (all, or none).
	 *  SB_PRIO_OVERDRAFT --> allow a small (2*MLEN) overflow
	 *	over normal socket limits, for messages indicating
	 *	buffer overflow in earlier normal/lower-priority messages.
	 *  SB_PRIO_BESTEFFORT --> ignore limits entirely.
	 *	Intended for kernel-generated messages only.
	 *	Up to generator to avoid total mbuf resource exhaustion.
	 */
	(void)sbprio;

	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
		panic("sbappendaddrchain");

#ifdef notyet
	space = sbspace(sb);

	/*
	 * Enforce SB_PRIO_* limits as described above.
	 */
#endif

	n0 = NULL;
	nlast = NULL;
	for (m = m0; m; m = m->m_nextpkt) {
		struct mbuf *np;

#ifdef MBUFTRACE
		m_claimm(m, sb->sb_mowner);
#endif

		/* Prepend sockaddr to this record (m) of input chain m0 */
		n = m_prepend_sockaddr(sb, m, asa);
		if (n == NULL) {
			error = SET_ERROR(ENOBUFS);
			goto bad;
		}

		/* Append record (asa+m) to end of new chain n0 */
		if (n0 == NULL) {
			n0 = n;
		} else {
			nlast->m_nextpkt = n;
		}
		/* Keep track of last record on new chain */
		nlast = n;

		for (np = n; np; np = np->m_next)
			sballoc(sb, np);
	}

	SBLASTRECORDCHK(sb, "sbappendaddrchain 1");

	/* Drop the entire chain of (asa+m) records onto the socket */
	SBLINKRECORDCHAIN(sb, n0, nlast);

	SBLASTRECORDCHK(sb, "sbappendaddrchain 2");

	for (m = nlast; m->m_next; m = m->m_next)
		;
	sb->sb_mbtail = m;
	SBLASTMBUFCHK(sb, "sbappendaddrchain");

	return (1);

bad:
	/*
	 * On error, free the prepended addresses.  For consistency
	 * with sbappendaddr(), leave it to our caller to free
	 * the input record chain passed to us as m0.
	 */
	while ((n = n0) != NULL) {
		struct mbuf *np;

		/* Undo the sballoc() of this record */
		for (np = n; np; np = np->m_next)
			sbfree(sb, np);

		n0 = n->m_nextpkt;	/* iterate at next prepended address */
		np = m_free(n);		/* free prepended address (not data) */
	}
	return error;
}


int
sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control)
{
	struct mbuf	*m, *mlast, *n;
	int		space;

	KASSERT(solocked(sb->sb_so));

	space = 0;
	if (control == NULL)
		panic("sbappendcontrol");
	for (m = control; ; m = m->m_next) {
		space += m->m_len;
		MCLAIM(m, sb->sb_mowner);
		if (m->m_next == NULL)
			break;
	}
	n = m;			/* save pointer to last control buffer */
	for (m = m0; m; m = m->m_next) {
		MCLAIM(m, sb->sb_mowner);
		space += m->m_len;
	}
	if (space > sbspace(sb))
		return (0);
	n->m_next = m0;		/* concatenate data to control */

	SBLASTRECORDCHK(sb, "sbappendcontrol 1");

	for (m = control; m->m_next != NULL; m = m->m_next)
		sballoc(sb, m);
	sballoc(sb, m);
	mlast = m;
	SBLINKRECORD(sb, control);

	sb->sb_mbtail = mlast;
	SBLASTMBUFCHK(sb, "sbappendcontrol");
	SBLASTRECORDCHK(sb, "sbappendcontrol 2");

	return (1);
}

/*
 * Compress mbuf chain m into the socket
 * buffer sb following mbuf n.  If n
 * is null, the buffer is presumed empty.
 */
void
sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n)
{
	int		eor;
	struct mbuf	*o;

	KASSERT(solocked(sb->sb_so));

	eor = 0;
	while (m) {
		eor |= m->m_flags & M_EOR;
		if (m->m_len == 0 &&
		    (eor == 0 ||
		    (((o = m->m_next) || (o = n)) &&
		    o->m_type == m->m_type))) {
			if (sb->sb_lastrecord == m)
				sb->sb_lastrecord = m->m_next;
			m = m_free(m);
			continue;
		}
		if (n && (n->m_flags & M_EOR) == 0 &&
		    /* M_TRAILINGSPACE() checks buffer writeability */
		    m->m_len <= MCLBYTES / 4 && /* XXX Don't copy too much */
		    m->m_len <= M_TRAILINGSPACE(n) &&
		    n->m_type == m->m_type) {
			memcpy(mtod(n, char *) + n->m_len, mtod(m, void *),
			    (unsigned)m->m_len);
			n->m_len += m->m_len;
			sb->sb_cc += m->m_len;
			m = m_free(m);
			continue;
		}
		if (n)
			n->m_next = m;
		else
			sb->sb_mb = m;
		sb->sb_mbtail = m;
		sballoc(sb, m);
		n = m;
		m->m_flags &= ~M_EOR;
		m = m->m_next;
		n->m_next = 0;
	}
	if (eor) {
		if (n)
			n->m_flags |= eor;
		else
			printf("semi-panic: sbcompress\n");
	}
	SBLASTMBUFCHK(sb, __func__);
}

/*
 * Free all mbufs in a sockbuf.
 * Check that all resources are reclaimed.
 */
void
sbflush(struct sockbuf *sb)
{

	KASSERT(solocked(sb->sb_so));
	KASSERT((sb->sb_flags & SB_LOCK) == 0);

	while (sb->sb_mbcnt)
		sbdrop(sb, (int)sb->sb_cc);

	KASSERT(sb->sb_cc == 0);
	KASSERT(sb->sb_mb == NULL);
	KASSERT(sb->sb_mbtail == NULL);
	KASSERT(sb->sb_lastrecord == NULL);
}

/*
 * Drop data from (the front of) a sockbuf.
 */
void
sbdrop(struct sockbuf *sb, int len)
{
	struct mbuf	*m, *next;

	KASSERT(solocked(sb->sb_so));

	next = (m = sb->sb_mb) ? m->m_nextpkt : NULL;
	while (len > 0) {
		if (m == NULL) {
			if (next == NULL)
				panic("sbdrop(%p,%d): cc=%lu",
				    sb, len, sb->sb_cc);
			m = next;
			next = m->m_nextpkt;
			continue;
		}
		if (m->m_len > len) {
			m->m_len -= len;
			m->m_data += len;
			sb->sb_cc -= len;
			break;
		}
		len -= m->m_len;
		sbfree(sb, m);
		m = m_free(m);
	}
	while (m && m->m_len == 0) {
		sbfree(sb, m);
		m = m_free(m);
	}
	if (m) {
		sb->sb_mb = m;
		m->m_nextpkt = next;
	} else
		sb->sb_mb = next;
	/*
	 * First part is an inline SB_EMPTY_FIXUP().  Second part
	 * makes sure sb_lastrecord is up-to-date if we dropped
	 * part of the last record.
	 */
	m = sb->sb_mb;
	if (m == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (m->m_nextpkt == NULL)
		sb->sb_lastrecord = m;
}

/*
 * Drop a record off the front of a sockbuf
 * and move the next record to the front.
 */
void
sbdroprecord(struct sockbuf *sb)
{
	struct mbuf	*m, *mn;

	KASSERT(solocked(sb->sb_so));

	m = sb->sb_mb;
	if (m) {
		sb->sb_mb = m->m_nextpkt;
		do {
			sbfree(sb, m);
			mn = m_free(m);
		} while ((m = mn) != NULL);
	}
	SB_EMPTY_FIXUP(sb);
}
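
/*
 * Illustrative sketch (not part of the original sources): the
 * retransmission-buffer pattern described earlier.  A reliable
 * protocol transmits copies of the data sitting in so_snd with
 * m_copym(), and trims acknowledged bytes off the front with
 * sbdrop().  example_pr_ack() is hypothetical; tcp_input() performs
 * the equivalent trimming when an ACK arrives.
 */
#if 0
static void
example_pr_ack(struct socket *so, int acked)
{

	KASSERT(solocked(so));
	if (acked > (int)so->so_snd.sb_cc)
		acked = so->so_snd.sb_cc;
	sbdrop(&so->so_snd, acked);	/* release acknowledged data */
	sowwakeup(so);			/* writers may have space again */
}
#endif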

/*
 * Create a "control" mbuf containing the specified data
 * with the specified type for presentation on a socket buffer.
 */
struct mbuf *
sbcreatecontrol1(void **p, int size, int type, int level, int flags)
{
	struct cmsghdr	*cp;
	struct mbuf	*m;
	int space = CMSG_SPACE(size);

	if ((flags & M_DONTWAIT) && space > MCLBYTES) {
		printf("%s: message too large %d\n", __func__, space);
		return NULL;
	}

	if ((m = m_get(flags, MT_CONTROL)) == NULL)
		return NULL;
	if (space > MLEN) {
		if (space > MCLBYTES)
			MEXTMALLOC(m, space, M_WAITOK);
		else
			MCLGET(m, flags);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return NULL;
		}
	}
	cp = mtod(m, struct cmsghdr *);
	*p = CMSG_DATA(cp);
	m->m_len = space;
	cp->cmsg_len = CMSG_LEN(size);
	cp->cmsg_level = level;
	cp->cmsg_type = type;

	memset(cp + 1, 0, CMSG_LEN(0) - sizeof(*cp));
	memset((uint8_t *)*p + size, 0, CMSG_ALIGN(size) - size);

	return m;
}

struct mbuf *
sbcreatecontrol(void *p, int size, int type, int level)
{
	struct mbuf *m;
	void *v;

	m = sbcreatecontrol1(&v, size, type, level, M_DONTWAIT);
	if (m == NULL)
		return NULL;
	memcpy(v, p, size);
	return m;
}

void
solockretry(struct socket *so, kmutex_t *lock)
{

	while (lock != atomic_load_relaxed(&so->so_lock)) {
		mutex_exit(lock);
		lock = atomic_load_consume(&so->so_lock);
		mutex_enter(lock);
	}
}

bool
solocked(const struct socket *so)
{

	/*
	 * Used only for diagnostic assertions, so so_lock should be
	 * stable at this point, hence no need for atomic_load_*.
	 */
	return mutex_owned(so->so_lock);
}

bool
solocked2(const struct socket *so1, const struct socket *so2)
{
	const kmutex_t *lock;

	/*
	 * Used only for diagnostic assertions, so so_lock should be
	 * stable at this point, hence no need for atomic_load_*.
	 */
	lock = so1->so_lock;
	if (lock != so2->so_lock)
		return false;
	return mutex_owned(lock);
}

/*
 * sosetlock: assign a default lock to a new socket.
 */
void
sosetlock(struct socket *so)
{
	if (so->so_lock == NULL) {
		kmutex_t *lock = softnet_lock;

		so->so_lock = lock;
		mutex_obj_hold(lock);
		mutex_enter(lock);
	}
	KASSERT(solocked(so));
}
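
/*
 * Illustrative sketch (not part of the original sources): giving a
 * socket its own lock, following the locking rules at the top of this
 * file.  The old lock must be held across the switch, a
 * membar_release() must precede the pointer update, and solockreset()
 * wakes any sleepers so they can re-evaluate so_lock (via
 * solockretry()).  example_give_own_lock() is hypothetical; the old
 * lock is kept valid here, per the rules, until teardown.
 */
#if 0
static void
example_give_own_lock(struct socket *so)
{
	kmutex_t *oldlock, *newlock;

	KASSERT(solocked(so));		/* old lock is held */
	oldlock = so->so_lock;
	newlock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SOFTNET);
	mutex_enter(newlock);		/* socket stays locked via newlock */
	membar_release();		/* order prior writes before the switch */
	solockreset(so, newlock);	/* publish the new lock, wake waiters */
	mutex_exit(oldlock);
	/* oldlock must remain valid until the socket/PCB is torn down. */
}
#endif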

/*
 * Set lock on sockbuf sb; sleep if lock is already held.
 * Unless SB_NOINTR is set on sockbuf, sleep is interruptible.
 * Returns error without lock if sleep is interrupted.
 */
int
sblock(struct sockbuf *sb, int wf)
{
	struct socket *so;
	kmutex_t *lock;
	int error;

	KASSERT(solocked(sb->sb_so));

	for (;;) {
		if (__predict_true((sb->sb_flags & SB_LOCK) == 0)) {
			sb->sb_flags |= SB_LOCK;
			return 0;
		}
		if (wf != M_WAITOK)
			return SET_ERROR(EWOULDBLOCK);
		so = sb->sb_so;
		lock = so->so_lock;
		if ((sb->sb_flags & SB_NOINTR) != 0) {
			cv_wait(&so->so_cv, lock);
			error = 0;
		} else
			error = cv_wait_sig(&so->so_cv, lock);
		if (__predict_false(lock != atomic_load_relaxed(&so->so_lock)))
			solockretry(so, lock);
		if (error != 0)
			return error;
	}
}

void
sbunlock(struct sockbuf *sb)
{
	struct socket *so;

	so = sb->sb_so;

	KASSERT(solocked(so));
	KASSERT((sb->sb_flags & SB_LOCK) != 0);

	sb->sb_flags &= ~SB_LOCK;
	cv_broadcast(&so->so_cv);
}

int
sowait(struct socket *so, bool catch_p, int timo)
{
	kmutex_t *lock;
	int error;

	KASSERT(solocked(so));
	KASSERT(catch_p || timo != 0);

	lock = so->so_lock;
	if (catch_p)
		error = cv_timedwait_sig(&so->so_cv, lock, timo);
	else
		error = cv_timedwait(&so->so_cv, lock, timo);
	if (__predict_false(lock != atomic_load_relaxed(&so->so_lock)))
		solockretry(so, lock);
	return error;
}

#ifdef DDB

/*
 * Currently, sofindproc() is used only from DDB.  It could be used
 * from elsewhere by using db_mutex_enter().
 */

static inline int
db_mutex_enter(kmutex_t *mtx)
{
	int rv;

	if (!db_active) {
		mutex_enter(mtx);
		rv = 1;
	} else
		rv = mutex_tryenter(mtx);

	return rv;
}

int
sofindproc(struct socket *so, int all, void (*pr)(const char *, ...))
{
	proc_t *p;
	filedesc_t *fdp;
	fdtab_t *dt;
	fdfile_t *ff;
	file_t *fp = NULL;
	int found = 0;
	int i, t;

	if (so == NULL)
		return 0;

	t = db_mutex_enter(&proc_lock);
	if (!t) {
		pr("could not acquire proc_lock mutex\n");
		return 0;
	}
	PROCLIST_FOREACH(p, &allproc) {
		if (p->p_stat == SIDL)
			continue;
		fdp = p->p_fd;
		t = db_mutex_enter(&fdp->fd_lock);
		if (!t) {
			pr("could not acquire fd_lock mutex\n");
			continue;
		}
		dt = atomic_load_consume(&fdp->fd_dt);
		for (i = 0; i < dt->dt_nfiles; i++) {
			ff = dt->dt_ff[i];
			if (ff == NULL)
				continue;

			fp = atomic_load_consume(&ff->ff_file);
			if (fp == NULL)
				continue;

			t = db_mutex_enter(&fp->f_lock);
			if (!t) {
				pr("could not acquire f_lock mutex\n");
				continue;
			}
			if ((struct socket *)fp->f_data != so) {
				mutex_exit(&fp->f_lock);
				continue;
			}
			found++;
			if (pr)
				pr("socket %p: owner %s(pid=%d)\n",
				    so, p->p_comm, p->p_pid);
			mutex_exit(&fp->f_lock);
			if (all == 0)
				break;
		}
		mutex_exit(&fdp->fd_lock);
		if (all == 0 && found != 0)
			break;
	}
	mutex_exit(&proc_lock);

	return found;
}

void
socket_print(const char *modif, void (*pr)(const char *, ...))
{
	file_t *fp;
	struct socket *so;
	struct sockbuf *sb_snd, *sb_rcv;
	struct mbuf *m_rec, *m;
	bool opt_v = false;
	bool opt_m = false;
	bool opt_a = false;
	bool opt_p = false;
	int nrecs, nmbufs;
	char ch;
	const char *family;

	while ( (ch = *(modif++)) != '\0') {
		switch (ch) {
		case 'v':
			opt_v = true;
			break;
		case 'm':
			opt_m = true;
			break;
		case 'a':
			opt_a = true;
			break;
		case 'p':
			opt_p = true;
			break;
		}
	}
	if (opt_v == false && pr)
		(pr)("Ignore empty sockets. use /v to print all.\n");
	if (opt_p == true && pr)
		(pr)("Don't search owner process.\n");

	LIST_FOREACH(fp, &filehead, f_list) {
		if (fp->f_type != DTYPE_SOCKET)
			continue;
		so = (struct socket *)fp->f_data;
		if (so == NULL)
			continue;

		if (so->so_proto->pr_domain->dom_family == AF_INET)
			family = "INET";
#ifdef INET6
		else if (so->so_proto->pr_domain->dom_family == AF_INET6)
			family = "INET6";
#endif
		else if (so->so_proto->pr_domain->dom_family == pseudo_AF_KEY)
			family = "KEY";
		else if (so->so_proto->pr_domain->dom_family == AF_ROUTE)
			family = "ROUTE";
		else
			continue;

		sb_snd = &so->so_snd;
		sb_rcv = &so->so_rcv;

		if (opt_v != true &&
		    sb_snd->sb_cc == 0 && sb_rcv->sb_cc == 0)
			continue;

		pr("---SOCKET %p: type %s\n", so, family);
		if (opt_p != true)
			sofindproc(so, opt_a == true ? 1 : 0, pr);
		pr("Send Buffer Bytes: %d [bytes]\n", sb_snd->sb_cc);
		pr("Send Buffer mbufs:\n");
		m_rec = m = sb_snd->sb_mb;
		nrecs = 0;
		nmbufs = 0;
		while (m_rec) {
			nrecs++;
			if (opt_m == true)
				pr(" mbuf chain %p\n", m_rec);
			while (m) {
				nmbufs++;
				m = m->m_next;
			}
			m_rec = m = m_rec->m_nextpkt;
		}
		pr(" Total %d records, %d mbufs.\n", nrecs, nmbufs);

		pr("Recv Buffer Usage: %d [bytes]\n", sb_rcv->sb_cc);
		pr("Recv Buffer mbufs:\n");
		m_rec = m = sb_rcv->sb_mb;
		nrecs = 0;
		nmbufs = 0;
		while (m_rec) {
			nrecs++;
			if (opt_m == true)
				pr(" mbuf chain %p\n", m_rec);
			while (m) {
				nmbufs++;
				m = m->m_next;
			}
			m_rec = m = m_rec->m_nextpkt;
		}
		pr(" Total %d records, %d mbufs.\n", nrecs, nmbufs);
	}
}
#endif /* DDB */