/*	$NetBSD: uipc_socket2.c,v 1.63 2004/05/27 19:19:00 jonathan Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket2.c	8.2 (Berkeley) 2/14/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uipc_socket2.c,v 1.63 2004/05/27 19:19:00 jonathan Exp $");

#include "opt_mbuftrace.h"
#include "opt_sb_max.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/buf.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/poll.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>

/*
 * Primitive routines for operating on sockets and socket buffers
 */

/* strings for sleep message: */
const char	netcon[] = "netcon";
const char	netcls[] = "netcls";
const char	netio[] = "netio";
const char	netlck[] = "netlck";

u_long	sb_max = SB_MAX;	/* maximum socket buffer size */
static u_long sb_max_adj;	/* adjusted sb_max */
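/*
 * Note (illustrative; MSIZE and MCLBYTES are machine-dependent, the
 * values below are assumed): sb_max_adj is derived in sb_max_set()
 * below as sb_max * MCLBYTES / (MSIZE + MCLBYTES), i.e. the largest
 * usable byte count once per-mbuf overhead is charged against sb_max.
 * For example, with MSIZE = 256 and MCLBYTES = 2048, an sb_max of
 * 262144 gives sb_max_adj = 262144 * 2048 / 2304 = 233016.
 */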
/*
 * Procedures to manipulate state flags of socket
 * and do appropriate wakeups.  Normal sequence from the
 * active (originating) side is that soisconnecting() is
 * called during processing of a connect() call,
 * resulting in an eventual call to soisconnected() if/when the
 * connection is established.  When the connection is torn down
 * soisdisconnecting() is called during processing of a disconnect() call,
 * and soisdisconnected() is called when the connection to the peer
 * is totally severed.  The semantics of these routines are such that
 * connectionless protocols can call soisconnected() and soisdisconnected()
 * only, bypassing the in-progress calls when setting up a ``connection''
 * takes no time.
 *
 * From the passive side, a socket is created with
 * two queues of sockets: so_q0 for connections in progress
 * and so_q for connections already made and awaiting user acceptance.
 * As a protocol is preparing incoming connections, it creates a socket
 * structure queued on so_q0 by calling sonewconn().  When the connection
 * is established, soisconnected() is called, and transfers the
 * socket structure to so_q, making it available to accept().
 *
 * If a socket is closed with sockets on either
 * so_q0 or so_q, these sockets are dropped.
 *
 * If higher level protocols are implemented in
 * the kernel, the wakeups done here will sometimes
 * cause software-interrupt process scheduling.
 */

void
soisconnecting(struct socket *so)
{

	so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= SS_ISCONNECTING;
}

void
soisconnected(struct socket *so)
{
	struct socket *head;

	head = so->so_head;
	so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
	so->so_state |= SS_ISCONNECTED;
	if (head && soqremque(so, 0)) {
		soqinsque(head, so, 1);
		sorwakeup(head);
		wakeup((caddr_t)&head->so_timeo);
	} else {
		wakeup((caddr_t)&so->so_timeo);
		sorwakeup(so);
		sowwakeup(so);
	}
}

void
soisdisconnecting(struct socket *so)
{

	so->so_state &= ~SS_ISCONNECTING;
	so->so_state |= (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE);
	wakeup((caddr_t)&so->so_timeo);
	sowwakeup(so);
	sorwakeup(so);
}

void
soisdisconnected(struct socket *so)
{

	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_ISDISCONNECTED);
	wakeup((caddr_t)&so->so_timeo);
	sowwakeup(so);
	sorwakeup(so);
}
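/*
 * Illustrative sketch (not part of this file): a stream protocol's
 * passive open typically pairs sonewconn() (below) with a later
 * soisconnected(), matching the so_q0/so_q description above:
 *
 *	so = sonewconn(head, 0);	// queued on head->so_q0
 *	if (so == NULL)
 *		...drop the incoming connection request...
 *	...handshake completes...
 *	soisconnected(so);		// moved to head->so_q for accept()
 */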
/*
 * When an attempt at a new connection is noted on a socket
 * which accepts connections, sonewconn is called.  If the
 * connection is possible (subject to space constraints, etc.)
 * then we allocate a new structure, properly linked into the
 * data structure of the original socket, and return this.
 * Connstatus may be 0, SS_ISCONFIRMING, or SS_ISCONNECTED.
 *
 * Currently, sonewconn() is defined as sonewconn1() in socketvar.h
 * to catch calls that are missing the (new) second parameter.
 */
struct socket *
sonewconn1(struct socket *head, int connstatus)
{
	struct socket *so;
	int soqueue;

	soqueue = connstatus ? 1 : 0;
	if (head->so_qlen + head->so_q0len > 3 * head->so_qlimit / 2)
		return ((struct socket *)0);
	so = pool_get(&socket_pool, PR_NOWAIT);
	if (so == NULL)
		return (NULL);
	memset((caddr_t)so, 0, sizeof(*so));
	so->so_type = head->so_type;
	so->so_options = head->so_options &~ SO_ACCEPTCONN;
	so->so_linger = head->so_linger;
	so->so_state = head->so_state | SS_NOFDREF;
	so->so_proto = head->so_proto;
	so->so_timeo = head->so_timeo;
	so->so_pgid = head->so_pgid;
	so->so_send = head->so_send;
	so->so_receive = head->so_receive;
	so->so_uid = head->so_uid;
#ifdef MBUFTRACE
	so->so_mowner = head->so_mowner;
	so->so_rcv.sb_mowner = head->so_rcv.sb_mowner;
	so->so_snd.sb_mowner = head->so_snd.sb_mowner;
#endif
	(void) soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat);
	soqinsque(head, so, soqueue);
	if ((*so->so_proto->pr_usrreq)(so, PRU_ATTACH,
	    (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0,
	    (struct proc *)0)) {
		(void) soqremque(so, soqueue);
		pool_put(&socket_pool, so);
		return (NULL);
	}
	if (connstatus) {
		sorwakeup(head);
		wakeup((caddr_t)&head->so_timeo);
		so->so_state |= connstatus;
	}
	return (so);
}

void
soqinsque(struct socket *head, struct socket *so, int q)
{

#ifdef DIAGNOSTIC
	if (so->so_onq != NULL)
		panic("soqinsque");
#endif

	so->so_head = head;
	if (q == 0) {
		head->so_q0len++;
		so->so_onq = &head->so_q0;
	} else {
		head->so_qlen++;
		so->so_onq = &head->so_q;
	}
	TAILQ_INSERT_TAIL(so->so_onq, so, so_qe);
}

int
soqremque(struct socket *so, int q)
{
	struct socket *head;

	head = so->so_head;
	if (q == 0) {
		if (so->so_onq != &head->so_q0)
			return (0);
		head->so_q0len--;
	} else {
		if (so->so_onq != &head->so_q)
			return (0);
		head->so_qlen--;
	}
	TAILQ_REMOVE(so->so_onq, so, so_qe);
	so->so_onq = NULL;
	so->so_head = NULL;
	return (1);
}

/*
 * Socantsendmore indicates that no more data will be sent on the
 * socket; it would normally be applied to a socket by the protocol
 * code (in the case of PRU_SHUTDOWN) when the user informs the system
 * that no more data is to be sent.  Socantrcvmore indicates that no
 * more data will be received, and will normally be applied to the
 * socket by a protocol when it detects that the peer will send no more
 * data.  Data queued for reading in the socket may yet be read.
 */

void
socantsendmore(struct socket *so)
{

	so->so_state |= SS_CANTSENDMORE;
	sowwakeup(so);
}

void
socantrcvmore(struct socket *so)
{

	so->so_state |= SS_CANTRCVMORE;
	sorwakeup(so);
}

/*
 * Wait for data to arrive at/drain from a socket buffer.
 */
int
sbwait(struct sockbuf *sb)
{

	sb->sb_flags |= SB_WAIT;
	return (tsleep((caddr_t)&sb->sb_cc,
	    (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, netio,
	    sb->sb_timeo));
}

/*
 * Lock a sockbuf already known to be locked;
 * return any error returned from sleep (EINTR).
 */
int
sb_lock(struct sockbuf *sb)
{
	int error;

	while (sb->sb_flags & SB_LOCK) {
		sb->sb_flags |= SB_WANT;
		error = tsleep((caddr_t)&sb->sb_flags,
		    (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK|PCATCH,
		    netlck, 0);
		if (error)
			return (error);
	}
	sb->sb_flags |= SB_LOCK;
	return (0);
}
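/*
 * Illustrative sketch (assuming the sblock()/sbunlock() macros from
 * <sys/socketvar.h>): callers serialize access to a socket buffer by
 * setting SB_LOCK via sblock(), which falls back on sb_lock() above
 * when the buffer is already locked:
 *
 *	if ((error = sblock(&so->so_rcv, M_WAITOK)) != 0)
 *		return (error);
 *	...examine or consume so_rcv, sbwait() while it is empty...
 *	sbunlock(&so->so_rcv);
 */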
/*
 * Wakeup processes waiting on a socket buffer.
 * Do asynchronous notification via SIGIO
 * if the socket buffer has the SB_ASYNC flag set.
 */
void
sowakeup(struct socket *so, struct sockbuf *sb, int code)
{
	selnotify(&sb->sb_sel, 0);
	sb->sb_flags &= ~SB_SEL;
	if (sb->sb_flags & SB_WAIT) {
		sb->sb_flags &= ~SB_WAIT;
		wakeup((caddr_t)&sb->sb_cc);
	}
	if (sb->sb_flags & SB_ASYNC) {
		int band;
		if (code == POLL_IN)
			band = POLLIN|POLLRDNORM;
		else
			band = POLLOUT|POLLWRNORM;
		fownsignal(so->so_pgid, SIGIO, code, band, so);
	}
	if (sb->sb_flags & SB_UPCALL)
		(*so->so_upcall)(so, so->so_upcallarg, M_DONTWAIT);
}

/*
 * Socket buffer (struct sockbuf) utility routines.
 *
 * Each socket contains two socket buffers: one for sending data and
 * one for receiving data.  Each buffer contains a queue of mbufs,
 * information about the number of mbufs and amount of data in the
 * queue, and other fields allowing poll() statements and notification
 * on data availability to be implemented.
 *
 * Data stored in a socket buffer is maintained as a list of records.
 * Each record is a list of mbufs chained together with the m_next
 * field.  Records are chained together with the m_nextpkt field.  The upper
 * level routine soreceive() expects the following conventions to be
 * observed when placing information in the receive buffer:
 *
 * 1. If the protocol requires each message be preceded by the sender's
 *    name, then a record containing that name must be present before
 *    any associated data (mbuf's must be of type MT_SONAME).
 * 2. If the protocol supports the exchange of ``access rights'' (really
 *    just additional data associated with the message), and there are
 *    ``rights'' to be received, then a record containing this data
 *    should be present (mbuf's must be of type MT_CONTROL).
 * 3. If a name or rights record exists, then it must be followed by
 *    a data record, perhaps of zero length.
 *
 * Before using a new socket structure it is first necessary to reserve
 * buffer space to the socket, by calling sbreserve().  This should commit
 * some of the available buffer space in the system buffer pool for the
 * socket (currently, it does nothing but enforce limits).  The space
 * should be released by calling sbrelease() when the socket is destroyed.
 */

int
sb_max_set(u_long new_sbmax)
{
	int s;

	if (new_sbmax < (16 * 1024))
		return (EINVAL);

	s = splsoftnet();
	sb_max = new_sbmax;
	sb_max_adj = (u_quad_t)new_sbmax * MCLBYTES / (MSIZE + MCLBYTES);
	splx(s);

	return (0);
}

int
soreserve(struct socket *so, u_long sndcc, u_long rcvcc)
{

	if (sbreserve(&so->so_snd, sndcc, so) == 0)
		goto bad;
	if (sbreserve(&so->so_rcv, rcvcc, so) == 0)
		goto bad2;
	if (so->so_rcv.sb_lowat == 0)
		so->so_rcv.sb_lowat = 1;
	if (so->so_snd.sb_lowat == 0)
		so->so_snd.sb_lowat = MCLBYTES;
	if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
		so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
	return (0);
 bad2:
	sbrelease(&so->so_snd, so);
 bad:
	return (ENOBUFS);
}
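/*
 * Illustrative sketch (buffer sizes hypothetical): a protocol's
 * PRU_ATTACH handler reserves its default send and receive space with
 * soreserve() before the socket is used:
 *
 *	error = soreserve(so, 8192, 8192);	// sndcc, rcvcc
 *	if (error)
 *		return (error);			// ENOBUFS if over limits
 */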
/*
 * Allot mbufs to a sockbuf.
 * Attempt to scale mbmax so that mbcnt doesn't become limiting
 * if buffering efficiency is near the normal case.
 */
int
sbreserve(struct sockbuf *sb, u_long cc, struct socket *so)
{
	struct proc *p = curproc; /* XXX */
	rlim_t maxcc;
	uid_t uid;

	KDASSERT(sb_max_adj != 0);
	if (cc == 0 || cc > sb_max_adj)
		return (0);
	if (so) {
		if (p && p->p_ucred->cr_uid == so->so_uid)
			maxcc = p->p_rlimit[RLIMIT_SBSIZE].rlim_cur;
		else
			maxcc = RLIM_INFINITY;
		uid = so->so_uid;
	} else {
		uid = 0; /* XXX: nothing better */
		maxcc = RLIM_INFINITY;
	}
	if (!chgsbsize(uid, &sb->sb_hiwat, cc, maxcc))
		return 0;
	sb->sb_mbmax = min(cc * 2, sb_max);
	if (sb->sb_lowat > sb->sb_hiwat)
		sb->sb_lowat = sb->sb_hiwat;
	return (1);
}

/*
 * Free mbufs held by a socket, and reserved mbuf space.
 */
void
sbrelease(struct sockbuf *sb, struct socket *so)
{

	sbflush(sb);
	(void)chgsbsize(so->so_uid, &sb->sb_hiwat, 0,
	    RLIM_INFINITY);
	sb->sb_mbmax = 0;
}

/*
 * Routines to add and remove
 * data from an mbuf queue.
 *
 * The routines sbappend() or sbappendrecord() are normally called to
 * append new mbufs to a socket buffer, after checking that adequate
 * space is available, comparing the function sbspace() with the amount
 * of data to be added.  sbappendrecord() differs from sbappend() in
 * that data supplied is treated as the beginning of a new record.
 * To place a sender's address, optional access rights, and data in a
 * socket receive buffer, sbappendaddr() should be used.  To place
 * access rights and data in a socket receive buffer, sbappendrights()
 * should be used.  In either case, the new data begins a new record.
 * Note that unlike sbappend() and sbappendrecord(), these routines check
 * for the caller that there will be enough space to store the data.
 * Each fails if there is not enough space, or if it cannot find mbufs
 * to store additional information in.
 *
 * Reliable protocols may use the socket send buffer to hold data
 * awaiting acknowledgement.  Data is normally copied from a socket
 * send buffer in a protocol with m_copy for output to a peer,
 * and the data is then removed from the socket buffer with sbdrop()
 * or sbdroprecord() when it is acknowledged by the peer.
 */
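/*
 * Illustrative sketch of the convention just described (hypothetical
 * input path): check sbspace() before appending, then wake readers:
 *
 *	if (sbspace(&so->so_rcv) < m->m_pkthdr.len) {
 *		m_freem(m);			// no room: drop the data
 *	} else {
 *		sbappend(&so->so_rcv, m);	// extend the last record
 *		sorwakeup(so);			// wakeup/poll/SIGIO
 *	}
 */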
#ifdef SOCKBUF_DEBUG
void
sblastrecordchk(struct sockbuf *sb, const char *where)
{
	struct mbuf *m = sb->sb_mb;

	while (m && m->m_nextpkt)
		m = m->m_nextpkt;

	if (m != sb->sb_lastrecord) {
		printf("sblastrecordchk: sb_mb %p sb_lastrecord %p last %p\n",
		    sb->sb_mb, sb->sb_lastrecord, m);
		printf("packet chain:\n");
		for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt)
			printf("\t%p\n", m);
		panic("sblastrecordchk from %s", where);
	}
}

void
sblastmbufchk(struct sockbuf *sb, const char *where)
{
	struct mbuf *m = sb->sb_mb;
	struct mbuf *n;

	while (m && m->m_nextpkt)
		m = m->m_nextpkt;

	while (m && m->m_next)
		m = m->m_next;

	if (m != sb->sb_mbtail) {
		printf("sblastmbufchk: sb_mb %p sb_mbtail %p last %p\n",
		    sb->sb_mb, sb->sb_mbtail, m);
		printf("packet tree:\n");
		for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) {
			printf("\t");
			for (n = m; n != NULL; n = n->m_next)
				printf("%p ", n);
			printf("\n");
		}
		panic("sblastmbufchk from %s", where);
	}
}
#endif /* SOCKBUF_DEBUG */

/*
 * Link a chain of records onto a socket buffer
 */
#define	SBLINKRECORDCHAIN(sb, m0, mlast)				\
do {									\
	if ((sb)->sb_lastrecord != NULL)				\
		(sb)->sb_lastrecord->m_nextpkt = (m0);			\
	else								\
		(sb)->sb_mb = (m0);					\
	(sb)->sb_lastrecord = (mlast);					\
} while (/*CONSTCOND*/0)


#define	SBLINKRECORD(sb, m0)						\
    SBLINKRECORDCHAIN(sb, m0, m0)

/*
 * Append mbuf chain m to the last record in the
 * socket buffer sb.  The additional space associated
 * with the mbuf chain is recorded in sb.  Empty mbufs are
 * discarded and mbufs are compacted where possible.
 */
void
sbappend(struct sockbuf *sb, struct mbuf *m)
{
	struct mbuf *n;

	if (m == 0)
		return;

#ifdef MBUFTRACE
	m_claim(m, sb->sb_mowner);
#endif

	SBLASTRECORDCHK(sb, "sbappend 1");

	if ((n = sb->sb_lastrecord) != NULL) {
		/*
		 * XXX Would like to simply use sb_mbtail here, but
		 * XXX I need to verify that I won't miss an EOR that
		 * XXX way.
		 */
		do {
			if (n->m_flags & M_EOR) {
				sbappendrecord(sb, m); /* XXXXXX!!!! */
				return;
			}
		} while (n->m_next && (n = n->m_next));
	} else {
		/*
		 * If this is the first record in the socket buffer, it's
		 * also the last record.
		 */
		sb->sb_lastrecord = m;
	}
	sbcompress(sb, m, n);
	SBLASTRECORDCHK(sb, "sbappend 2");
}
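/*
 * Illustrative note: sbappend() honours record boundaries.  Once a
 * chain carrying M_EOR has been appended, the next sbappend() detects
 * the mark on the last record and starts a fresh record via
 * sbappendrecord() (hypothetical caller):
 *
 *	m->m_flags |= M_EOR;		// this chain ends the record
 *	sbappend(&so->so_rcv, m);
 *	...
 *	sbappend(&so->so_rcv, m2);	// lands in a new record
 */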
/*
 * This version of sbappend() should only be used when the caller
 * absolutely knows that there will never be more than one record
 * in the socket buffer, that is, a stream protocol (such as TCP).
 */
void
sbappendstream(struct sockbuf *sb, struct mbuf *m)
{

	KDASSERT(m->m_nextpkt == NULL);
	KASSERT(sb->sb_mb == sb->sb_lastrecord);

	SBLASTMBUFCHK(sb, __func__);

#ifdef MBUFTRACE
	m_claim(m, sb->sb_mowner);
#endif

	sbcompress(sb, m, sb->sb_mbtail);

	sb->sb_lastrecord = sb->sb_mb;
	SBLASTRECORDCHK(sb, __func__);
}

#ifdef SOCKBUF_DEBUG
void
sbcheck(struct sockbuf *sb)
{
	struct mbuf *m;
	u_long len, mbcnt;

	len = 0;
	mbcnt = 0;
	for (m = sb->sb_mb; m; m = m->m_next) {
		len += m->m_len;
		mbcnt += MSIZE;
		if (m->m_flags & M_EXT)
			mbcnt += m->m_ext.ext_size;
		if (m->m_nextpkt)
			panic("sbcheck nextpkt");
	}
	if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
		printf("cc %lu != %lu || mbcnt %lu != %lu\n", len, sb->sb_cc,
		    mbcnt, sb->sb_mbcnt);
		panic("sbcheck");
	}
}
#endif

/*
 * As above, except the mbuf chain
 * begins a new record.
 */
void
sbappendrecord(struct sockbuf *sb, struct mbuf *m0)
{
	struct mbuf *m;

	if (m0 == 0)
		return;

#ifdef MBUFTRACE
	m_claim(m0, sb->sb_mowner);
#endif
	/*
	 * Put the first mbuf on the queue.
	 * Note this permits zero length records.
	 */
	sballoc(sb, m0);
	SBLASTRECORDCHK(sb, "sbappendrecord 1");
	SBLINKRECORD(sb, m0);
	m = m0->m_next;
	m0->m_next = 0;
	if (m && (m0->m_flags & M_EOR)) {
		m0->m_flags &= ~M_EOR;
		m->m_flags |= M_EOR;
	}
	sbcompress(sb, m, m0);
	SBLASTRECORDCHK(sb, "sbappendrecord 2");
}

/*
 * As above except that OOB data
 * is inserted at the beginning of the sockbuf,
 * but after any other OOB data.
 */
void
sbinsertoob(struct sockbuf *sb, struct mbuf *m0)
{
	struct mbuf *m, **mp;

	if (m0 == 0)
		return;

	SBLASTRECORDCHK(sb, "sbinsertoob 1");

	for (mp = &sb->sb_mb; (m = *mp) != NULL; mp = &((*mp)->m_nextpkt)) {
	    again:
		switch (m->m_type) {

		case MT_OOBDATA:
			continue;		/* WANT next train */

		case MT_CONTROL:
			if ((m = m->m_next) != NULL)
				goto again;	/* inspect THIS train further */
		}
		break;
	}
	/*
	 * Put the first mbuf on the queue.
	 * Note this permits zero length records.
	 */
	sballoc(sb, m0);
	m0->m_nextpkt = *mp;
	if (*mp == NULL) {
		/* m0 is actually the new tail */
		sb->sb_lastrecord = m0;
	}
	*mp = m0;
	m = m0->m_next;
	m0->m_next = 0;
	if (m && (m0->m_flags & M_EOR)) {
		m0->m_flags &= ~M_EOR;
		m->m_flags |= M_EOR;
	}
	sbcompress(sb, m, m0);
	SBLASTRECORDCHK(sb, "sbinsertoob 2");
}
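/*
 * Illustrative sketch: a stream protocol that maintains the
 * single-record invariant (e.g. TCP's input path) delivers data with
 * sbappendstream() above instead of sbappend():
 *
 *	sbappendstream(&so->so_rcv, m);
 *	sorwakeup(so);
 */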
/*
 * Append address and data, and optionally, control (ancillary) data
 * to the receive queue of a socket.  If present,
 * m0 must include a packet header with total length.
 * Returns 0 if no space in sockbuf or insufficient mbufs.
 */
int
sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0,
	struct mbuf *control)
{
	struct mbuf *m, *n, *nlast;
	int space, len;

	space = asa->sa_len;

	if (m0 != NULL) {
		if ((m0->m_flags & M_PKTHDR) == 0)
			panic("sbappendaddr");
		space += m0->m_pkthdr.len;
#ifdef MBUFTRACE
		m_claim(m0, sb->sb_mowner);
#endif
	}
	for (n = control; n; n = n->m_next) {
		space += n->m_len;
		MCLAIM(n, sb->sb_mowner);
		if (n->m_next == 0)	/* keep pointer to last control buf */
			break;
	}
	if (space > sbspace(sb))
		return (0);
	MGET(m, M_DONTWAIT, MT_SONAME);
	if (m == 0)
		return (0);
	MCLAIM(m, sb->sb_mowner);
	/*
	 * XXX avoid 'comparison always true' warning which isn't easily
	 * avoided.
	 */
	len = asa->sa_len;
	if (len > MLEN) {
		MEXTMALLOC(m, asa->sa_len, M_NOWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return (0);
		}
	}
	m->m_len = asa->sa_len;
	memcpy(mtod(m, caddr_t), (caddr_t)asa, asa->sa_len);
	if (n)
		n->m_next = m0;		/* concatenate data to control */
	else
		control = m0;
	m->m_next = control;

	SBLASTRECORDCHK(sb, "sbappendaddr 1");

	for (n = m; n->m_next != NULL; n = n->m_next)
		sballoc(sb, n);
	sballoc(sb, n);
	nlast = n;
	SBLINKRECORD(sb, m);

	sb->sb_mbtail = nlast;
	SBLASTMBUFCHK(sb, "sbappendaddr");

	SBLASTRECORDCHK(sb, "sbappendaddr 2");

	return (1);
}

/*
 * Helper for sbappendaddrchain: prepend a struct sockaddr* to
 * an mbuf chain.
 */
static __inline struct mbuf *
m_prepend_sockaddr(struct sockbuf *sb, struct mbuf *m0,
	const struct sockaddr *asa)
{
	struct mbuf *m;
	const int mlen = asa->sa_len;

	/* only the first in each chain need be a pkthdr */
	MGETHDR(m, M_DONTWAIT, MT_SONAME);
	if (m == 0)
		return (0);
	MCLAIM(m, sb->sb_mowner);
	KASSERT(mlen <= MLEN);

	m->m_len = mlen;
	bcopy((caddr_t)asa, mtod(m, caddr_t), mlen);
	m->m_next = m0;
	m->m_pkthdr.len = mlen + m0->m_pkthdr.len;

	return m;
}
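/*
 * Illustrative sketch (modelled on a datagram input path such as
 * UDP's; the variable names are hypothetical): when sbappendaddr()
 * returns 0 the caller still owns, and must free, both chains:
 *
 *	if (sbappendaddr(&so->so_rcv, (struct sockaddr *)&srcsa,
 *	    m, opts) == 0) {
 *		m_freem(m);
 *		m_freem(opts);
 *	} else
 *		sorwakeup(so);
 */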
int
sbappendaddrchain(struct sockbuf *sb, const struct sockaddr *asa,
	struct mbuf *m0, int sbprio)
{
	int space;
	struct mbuf *m, *n, *n0, *nlast;
	int error;

	/*
	 * XXX sbprio reserved for encoding priority of this request:
	 *  SB_PRIO_NONE --> honour normal sb limits
	 *  SB_PRIO_ONESHOT_OVERFLOW --> if socket has any space,
	 *	take whole chain.  Intended for large requests
	 *	that should be delivered atomically (all, or none).
	 *  SB_PRIO_OVERDRAFT --> allow a small (2*MLEN) overflow
	 *	over normal socket limits, for messages indicating
	 *	buffer overflow in earlier normal/lower-priority messages
	 *  SB_PRIO_BESTEFFORT --> ignore limits entirely.
	 *	Intended for kernel-generated messages only.
	 *	Up to generator to avoid total mbuf resource exhaustion.
	 */
	(void)sbprio;

	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
		panic("sbappendaddrchain");

	space = sbspace(sb);

#ifdef notyet
	/*
	 * Enforce SB_PRIO_* limits as described above.
	 */
#endif

	n0 = NULL;
	nlast = NULL;
	for (m = m0; m; m = m->m_nextpkt) {
		struct mbuf *np;

		/* Prepend sockaddr to this record (m) of input chain m0 */
		n = m_prepend_sockaddr(sb, m, asa);
		if (n == NULL) {
			error = ENOBUFS;
			goto bad;
		}

		/* Append record (asa+m) to end of new chain n0 */
		if (n0 == NULL) {
			n0 = n;
		} else {
			nlast->m_nextpkt = n;
		}
		/* Keep track of last record on new chain */
		nlast = n;

		for (np = n; np; np = np->m_next)
			sballoc(sb, np);
	}

	/* Drop the entire chain of (asa+m) records onto the socket */
	SBLINKRECORDCHAIN(sb, n0, nlast);
	for (m = nlast; m->m_next; m = m->m_next)
		;
	sb->sb_mbtail = m;

	return (1);

 bad:
	if (n)
		m_freem(n);
	return 0;
}


int
sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control)
{
	struct mbuf *m, *mlast, *n;
	int space;

	space = 0;
	if (control == 0)
		panic("sbappendcontrol");
	for (m = control; ; m = m->m_next) {
		space += m->m_len;
		MCLAIM(m, sb->sb_mowner);
		if (m->m_next == 0)
			break;
	}
	n = m;			/* save pointer to last control buffer */
	for (m = m0; m; m = m->m_next) {
		MCLAIM(m, sb->sb_mowner);
		space += m->m_len;
	}
	if (space > sbspace(sb))
		return (0);
	n->m_next = m0;		/* concatenate data to control */

	SBLASTRECORDCHK(sb, "sbappendcontrol 1");

	for (m = control; m->m_next != NULL; m = m->m_next)
		sballoc(sb, m);
	sballoc(sb, m);
	mlast = m;
	SBLINKRECORD(sb, control);

	sb->sb_mbtail = mlast;
	SBLASTMBUFCHK(sb, "sbappendcontrol");

	SBLASTRECORDCHK(sb, "sbappendcontrol 2");

	return (1);
}

/*
 * Compress mbuf chain m into the socket
 * buffer sb following mbuf n.  If n
 * is null, the buffer is presumed empty.
 */
void
sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n)
{
	int eor;
	struct mbuf *o;

	eor = 0;
	while (m) {
		eor |= m->m_flags & M_EOR;
		if (m->m_len == 0 &&
		    (eor == 0 ||
		     (((o = m->m_next) || (o = n)) &&
		      o->m_type == m->m_type))) {
			if (sb->sb_lastrecord == m)
				sb->sb_lastrecord = m->m_next;
			m = m_free(m);
			continue;
		}
		if (n && (n->m_flags & M_EOR) == 0 &&
		    /* M_TRAILINGSPACE() checks buffer writeability */
		    m->m_len <= MCLBYTES / 4 && /* XXX Don't copy too much */
		    m->m_len <= M_TRAILINGSPACE(n) &&
		    n->m_type == m->m_type) {
			memcpy(mtod(n, caddr_t) + n->m_len, mtod(m, caddr_t),
			    (unsigned)m->m_len);
			n->m_len += m->m_len;
			sb->sb_cc += m->m_len;
			m = m_free(m);
			continue;
		}
		if (n)
			n->m_next = m;
		else
			sb->sb_mb = m;
		sb->sb_mbtail = m;
		sballoc(sb, m);
		n = m;
		m->m_flags &= ~M_EOR;
		m = m->m_next;
		n->m_next = 0;
	}
	if (eor) {
		if (n)
			n->m_flags |= eor;
		else
			printf("semi-panic: sbcompress\n");
	}
	SBLASTMBUFCHK(sb, __func__);
}

/*
 * Free all mbufs in a sockbuf.
 * Check that all resources are reclaimed.
 */
void
sbflush(struct sockbuf *sb)
{

	KASSERT((sb->sb_flags & SB_LOCK) == 0);

	while (sb->sb_mbcnt)
		sbdrop(sb, (int)sb->sb_cc);

	KASSERT(sb->sb_cc == 0);
	KASSERT(sb->sb_mb == NULL);
	KASSERT(sb->sb_mbtail == NULL);
	KASSERT(sb->sb_lastrecord == NULL);
}
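/*
 * Illustrative sketch (hypothetical caller): sbappendcontrol() above
 * follows the same contract as sbappendaddr(); on failure the caller
 * keeps ownership of both the data and the control chain:
 *
 *	if (sbappendcontrol(&so->so_rcv, m, control) == 0) {
 *		m_freem(m);
 *		m_freem(control);
 *	} else
 *		sorwakeup(so);
 */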
/*
 * Drop data from (the front of) a sockbuf.
 */
void
sbdrop(struct sockbuf *sb, int len)
{
	struct mbuf *m, *mn, *next;

	next = (m = sb->sb_mb) ? m->m_nextpkt : 0;
	while (len > 0) {
		if (m == 0) {
			if (next == 0)
				panic("sbdrop");
			m = next;
			next = m->m_nextpkt;
			continue;
		}
		if (m->m_len > len) {
			m->m_len -= len;
			m->m_data += len;
			sb->sb_cc -= len;
			break;
		}
		len -= m->m_len;
		sbfree(sb, m);
		MFREE(m, mn);
		m = mn;
	}
	while (m && m->m_len == 0) {
		sbfree(sb, m);
		MFREE(m, mn);
		m = mn;
	}
	if (m) {
		sb->sb_mb = m;
		m->m_nextpkt = next;
	} else
		sb->sb_mb = next;
	/*
	 * First part is an inline SB_EMPTY_FIXUP().  Second part
	 * makes sure sb_lastrecord is up-to-date if we dropped
	 * part of the last record.
	 */
	m = sb->sb_mb;
	if (m == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (m->m_nextpkt == NULL)
		sb->sb_lastrecord = m;
}

/*
 * Drop a record off the front of a sockbuf
 * and move the next record to the front.
 */
void
sbdroprecord(struct sockbuf *sb)
{
	struct mbuf *m, *mn;

	m = sb->sb_mb;
	if (m) {
		sb->sb_mb = m->m_nextpkt;
		do {
			sbfree(sb, m);
			MFREE(m, mn);
		} while ((m = mn) != NULL);
	}
	SB_EMPTY_FIXUP(sb);
}

/*
 * Create a "control" mbuf containing the specified data
 * with the specified type for presentation on a socket buffer.
 */
struct mbuf *
sbcreatecontrol(caddr_t p, int size, int type, int level)
{
	struct cmsghdr *cp;
	struct mbuf *m;

	if (CMSG_SPACE(size) > MCLBYTES) {
		printf("sbcreatecontrol: message too large %d\n", size);
		return NULL;
	}

	if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL)
		return ((struct mbuf *) NULL);
	if (CMSG_SPACE(size) > MLEN) {
		MCLGET(m, M_DONTWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return NULL;
		}
	}
	cp = mtod(m, struct cmsghdr *);
	memcpy(CMSG_DATA(cp), p, size);
	m->m_len = CMSG_SPACE(size);
	cp->cmsg_len = CMSG_LEN(size);
	cp->cmsg_level = level;
	cp->cmsg_type = type;
	return (m);
}
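/*
 * Illustrative sketch: a protocol delivering a receive timestamp as
 * ancillary data (modelled on the SO_TIMESTAMP option) might build the
 * control mbuf this way:
 *
 *	struct timeval tv;
 *
 *	microtime(&tv);
 *	control = sbcreatecontrol((caddr_t)&tv, sizeof(tv),
 *	    SCM_TIMESTAMP, SOL_SOCKET);	// NULL when no mbuf is available
 */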