1 /* 2 * Copyright (c) 2005 Jeffrey M. Hsu. All rights reserved. 3 * Copyright (c) 1982, 1986, 1988, 1990, 1993 4 * The Regents of the University of California. All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 3. All advertising materials mentioning features or use of this software 15 * must display the following acknowledgement: 16 * This product includes software developed by the University of 17 * California, Berkeley and its contributors. 18 * 4. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * @(#)uipc_socket2.c 8.1 (Berkeley) 6/10/93 35 * $FreeBSD: src/sys/kern/uipc_socket2.c,v 1.55.2.17 2002/08/31 19:04:55 dwmalone Exp $ 36 * $DragonFly: src/sys/kern/uipc_sockbuf.c,v 1.1 2007/04/22 01:13:10 dillon Exp $ 37 */ 38 39 #include "opt_param.h" 40 #include <sys/param.h> 41 #include <sys/systm.h> 42 #include <sys/domain.h> 43 #include <sys/file.h> /* for maxfiles */ 44 #include <sys/kernel.h> 45 #include <sys/proc.h> 46 #include <sys/malloc.h> 47 #include <sys/mbuf.h> 48 #include <sys/protosw.h> 49 #include <sys/resourcevar.h> 50 #include <sys/stat.h> 51 #include <sys/socket.h> 52 #include <sys/socketvar.h> 53 54 #include <sys/thread2.h> 55 #include <sys/msgport2.h> 56 57 /* 58 * Routines to add and remove 59 * data from an mbuf queue. 60 * 61 * The routines sbappend() or sbappendrecord() are normally called to 62 * append new mbufs to a socket buffer, after checking that adequate 63 * space is available, comparing the function sbspace() with the amount 64 * of data to be added. sbappendrecord() differs from sbappend() in 65 * that data supplied is treated as the beginning of a new record. 66 * To place a sender's address, optional access rights, and data in a 67 * socket receive buffer, sbappendaddr() should be used. To place 68 * access rights and data in a socket receive buffer, sbappendrights() 69 * should be used. In either case, the new data begins a new record. 70 * Note that unlike sbappend() and sbappendrecord(), these routines check 71 * for the caller that there will be enough space to store the data. 72 * Each fails if there is not enough space, or if it cannot find mbufs 73 * to store additional information in. 74 * 75 * Reliable protocols may use the socket send buffer to hold data 76 * awaiting acknowledgement. Data is normally copied from a socket 77 * send buffer in a protocol with m_copy for output to a peer, 78 * and then removing the data from the socket buffer with sbdrop() 79 * or sbdroprecord() when the data is acknowledged by the peer. 80 */ 81 82 /* 83 * Append mbuf chain m to the last record in the 84 * socket buffer sb. The additional space associated 85 * the mbuf chain is recorded in sb. Empty mbufs are 86 * discarded and mbufs are compacted where possible. 87 */ 88 void 89 sbappend(struct sockbuf *sb, struct mbuf *m) 90 { 91 struct mbuf *n; 92 93 if (m) { 94 n = sb->sb_mb; 95 if (n) { 96 while (n->m_nextpkt) 97 n = n->m_nextpkt; 98 do { 99 if (n->m_flags & M_EOR) { 100 /* XXXXXX!!!! */ 101 sbappendrecord(sb, m); 102 return; 103 } 104 } while (n->m_next && (n = n->m_next)); 105 } 106 sbcompress(sb, m, n); 107 } 108 } 109 110 /* 111 * sbappendstream() is an optimized form of sbappend() for protocols 112 * such as TCP that only have one record in the socket buffer, are 113 * not PR_ATOMIC, nor allow MT_CONTROL data. A protocol that uses 114 * sbappendstream() must use sbappendstream() exclusively. 115 */ 116 void 117 sbappendstream(struct sockbuf *sb, struct mbuf *m) 118 { 119 KKASSERT(m->m_nextpkt == NULL); 120 sbcompress(sb, m, sb->sb_lastmbuf); 121 } 122 123 #ifdef SOCKBUF_DEBUG 124 125 void 126 _sbcheck(struct sockbuf *sb) 127 { 128 struct mbuf *m; 129 struct mbuf *n = NULL; 130 u_long len = 0, mbcnt = 0; 131 132 for (m = sb->sb_mb; m; m = n) { 133 n = m->m_nextpkt; 134 if (n == NULL && sb->sb_lastrecord != m) { 135 kprintf("sockbuf %p mismatched lastrecord %p vs %p\n", sb, sb->sb_lastrecord, m); 136 panic("sbcheck1"); 137 138 } 139 for (; m; m = m->m_next) { 140 len += m->m_len; 141 mbcnt += MSIZE; 142 if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */ 143 mbcnt += m->m_ext.ext_size; 144 if (n == NULL && m->m_next == NULL) { 145 if (sb->sb_lastmbuf != m) { 146 kprintf("sockbuf %p mismatched lastmbuf %p vs %p\n", sb, sb->sb_lastmbuf, m); 147 panic("sbcheck2"); 148 } 149 } 150 } 151 } 152 if (sb->sb_mb == NULL) { 153 if (sb->sb_lastrecord != NULL) { 154 kprintf("sockbuf %p is empty, lastrecord not NULL: %p\n", 155 sb, sb->sb_lastrecord); 156 panic("sbcheck3"); 157 } 158 if (sb->sb_lastmbuf != NULL) { 159 kprintf("sockbuf %p is empty, lastmbuf not NULL: %p\n", 160 sb, sb->sb_lastmbuf); 161 panic("sbcheck4"); 162 } 163 } 164 if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) { 165 kprintf("sockbuf %p cc %ld != %ld || mbcnt %ld != %ld\n", 166 sb, len, sb->sb_cc, mbcnt, sb->sb_mbcnt); 167 panic("sbcheck5"); 168 } 169 } 170 171 #endif 172 173 /* 174 * Same as sbappend(), except the mbuf chain begins a new record. 175 */ 176 void 177 sbappendrecord(struct sockbuf *sb, struct mbuf *m0) 178 { 179 struct mbuf *firstmbuf; 180 struct mbuf *secondmbuf; 181 182 if (m0 == NULL) 183 return; 184 185 sbcheck(sb); 186 187 /* 188 * Break the first mbuf off from the rest of the mbuf chain. 189 */ 190 firstmbuf = m0; 191 secondmbuf = m0->m_next; 192 m0->m_next = NULL; 193 194 /* 195 * Insert the first mbuf of the m0 mbuf chain as the last record of 196 * the sockbuf. Note this permits zero length records! Keep the 197 * sockbuf state consistent. 198 */ 199 if (sb->sb_mb == NULL) 200 sb->sb_mb = firstmbuf; 201 else 202 sb->sb_lastrecord->m_nextpkt = firstmbuf; 203 sb->sb_lastrecord = firstmbuf; /* update hint for new last record */ 204 sb->sb_lastmbuf = firstmbuf; /* update hint for new last mbuf */ 205 206 if ((firstmbuf->m_flags & M_EOR) && (secondmbuf != NULL)) { 207 /* propagate the EOR flag */ 208 firstmbuf->m_flags &= ~M_EOR; 209 secondmbuf->m_flags |= M_EOR; 210 } 211 212 /* 213 * The succeeding call to sbcompress() omits accounting for 214 * the first mbuf, so do it here. 215 */ 216 sballoc(sb, firstmbuf); 217 218 /* Compact the rest of the mbuf chain in after the first mbuf. */ 219 sbcompress(sb, secondmbuf, firstmbuf); 220 } 221 222 /* 223 * Append address and data, and optionally, control (ancillary) data 224 * to the receive queue of a socket. If present, 225 * m0 must include a packet header with total length. 226 * Returns 0 if no space in sockbuf or insufficient mbufs. 227 */ 228 int 229 sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0, 230 struct mbuf *control) 231 { 232 struct mbuf *m, *n; 233 int space = asa->sa_len; 234 235 if (m0 && (m0->m_flags & M_PKTHDR) == 0) 236 panic("sbappendaddr"); 237 sbcheck(sb); 238 239 if (m0) 240 space += m0->m_pkthdr.len; 241 for (n = control; n; n = n->m_next) { 242 space += n->m_len; 243 if (n->m_next == 0) /* keep pointer to last control buf */ 244 break; 245 } 246 #if 0 247 if (space > sbspace(sb)) 248 return (0); 249 #endif 250 if (asa->sa_len > MLEN) 251 return (0); 252 MGET(m, MB_DONTWAIT, MT_SONAME); 253 if (m == NULL) 254 return (0); 255 KKASSERT(m->m_nextpkt == NULL); 256 m->m_len = asa->sa_len; 257 bcopy(asa, mtod(m, caddr_t), asa->sa_len); 258 if (n) 259 n->m_next = m0; /* concatenate data to control */ 260 else 261 control = m0; 262 m->m_next = control; 263 for (n = m; n; n = n->m_next) 264 sballoc(sb, n); 265 266 if (sb->sb_mb == NULL) 267 sb->sb_mb = m; 268 else 269 sb->sb_lastrecord->m_nextpkt = m; 270 sb->sb_lastrecord = m; 271 while (m->m_next) 272 m = m->m_next; 273 sb->sb_lastmbuf = m; 274 275 return (1); 276 } 277 278 /* 279 * Append control information followed by data. 280 * control must be non-null. 281 */ 282 int 283 sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control) 284 { 285 struct mbuf *n; 286 u_int length, cmbcnt, m0mbcnt; 287 288 KASSERT(control != NULL, ("sbappendcontrol")); 289 KKASSERT(control->m_nextpkt == NULL); 290 sbcheck(sb); 291 292 length = m_countm(control, &n, &cmbcnt) + m_countm(m0, NULL, &m0mbcnt); 293 #if 0 294 if (length > sbspace(sb)) 295 return (0); 296 #endif 297 298 n->m_next = m0; /* concatenate data to control */ 299 300 if (sb->sb_mb == NULL) 301 sb->sb_mb = control; 302 else 303 sb->sb_lastrecord->m_nextpkt = control; 304 sb->sb_lastrecord = control; 305 sb->sb_lastmbuf = m0; 306 307 sb->sb_cc += length; 308 sb->sb_mbcnt += cmbcnt + m0mbcnt; 309 310 return (1); 311 } 312 313 /* 314 * Compress mbuf chain m into the socket buffer sb following mbuf tailm. 315 * If tailm is null, the buffer is presumed empty. Also, as a side-effect, 316 * increment the sockbuf counts for each mbuf in the chain. 317 */ 318 void 319 sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *tailm) 320 { 321 int eor = 0; 322 struct mbuf *free_chain = NULL; 323 324 sbcheck(sb); 325 while (m) { 326 struct mbuf *o; 327 328 eor |= m->m_flags & M_EOR; 329 /* 330 * Disregard empty mbufs as long as we don't encounter 331 * an end-of-record or there is a trailing mbuf of 332 * the same type to propagate the EOR flag to. 333 * 334 * Defer the m_free() call because it can block and break 335 * the atomicy of the sockbuf. 336 */ 337 if (m->m_len == 0 && 338 (eor == 0 || 339 (((o = m->m_next) || (o = tailm)) && 340 o->m_type == m->m_type))) { 341 o = m->m_next; 342 m->m_next = free_chain; 343 free_chain = m; 344 m = o; 345 continue; 346 } 347 348 /* See if we can coalesce with preceding mbuf. */ 349 if (tailm && !(tailm->m_flags & M_EOR) && M_WRITABLE(tailm) && 350 m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */ 351 m->m_len <= M_TRAILINGSPACE(tailm) && 352 tailm->m_type == m->m_type) { 353 bcopy(mtod(m, caddr_t), 354 mtod(tailm, caddr_t) + tailm->m_len, 355 (unsigned)m->m_len); 356 tailm->m_len += m->m_len; 357 sb->sb_cc += m->m_len; /* update sb counter */ 358 o = m->m_next; 359 m->m_next = free_chain; 360 free_chain = m; 361 m = o; 362 continue; 363 } 364 365 /* Insert whole mbuf. */ 366 if (tailm == NULL) { 367 KASSERT(sb->sb_mb == NULL, 368 ("sbcompress: sb_mb not NULL")); 369 sb->sb_mb = m; /* only mbuf in sockbuf */ 370 sb->sb_lastrecord = m; /* new last record */ 371 } else { 372 tailm->m_next = m; /* tack m on following tailm */ 373 } 374 sb->sb_lastmbuf = m; /* update last mbuf hint */ 375 376 tailm = m; /* just inserted mbuf becomes the new tail */ 377 m = m->m_next; /* advance to next mbuf */ 378 tailm->m_next = NULL; /* split inserted mbuf off from chain */ 379 380 /* update sb counters for just added mbuf */ 381 sballoc(sb, tailm); 382 383 /* clear EOR on intermediate mbufs */ 384 tailm->m_flags &= ~M_EOR; 385 } 386 387 /* 388 * Propogate EOR to the last mbuf 389 */ 390 if (eor) { 391 if (tailm) 392 tailm->m_flags |= eor; 393 else 394 kprintf("semi-panic: sbcompress"); 395 } 396 397 /* 398 * Clean up any defered frees. 399 */ 400 while (free_chain) 401 free_chain = m_free(free_chain); 402 403 sbcheck(sb); 404 } 405 406 /* 407 * Free all mbufs in a sockbuf. 408 * Check that all resources are reclaimed. 409 */ 410 void 411 sbflush(struct sockbuf *sb) 412 { 413 while (sb->sb_mbcnt) { 414 /* 415 * Don't call sbdrop(sb, 0) if the leading mbuf is non-empty: 416 * we would loop forever. Panic instead. 417 */ 418 if (!sb->sb_cc && (sb->sb_mb == NULL || sb->sb_mb->m_len)) 419 break; 420 sbdrop(sb, (int)sb->sb_cc); 421 } 422 KASSERT(!(sb->sb_cc || sb->sb_mb || sb->sb_mbcnt || sb->sb_lastmbuf), 423 ("sbflush: cc %ld || mb %p || mbcnt %ld || lastmbuf %p", 424 sb->sb_cc, sb->sb_mb, sb->sb_mbcnt, sb->sb_lastmbuf)); 425 } 426 427 /* 428 * Drop data from (the front of) a sockbuf. 429 */ 430 void 431 sbdrop(struct sockbuf *sb, int len) 432 { 433 struct mbuf *m; 434 struct mbuf *free_chain = NULL; 435 436 sbcheck(sb); 437 crit_enter(); 438 439 /* 440 * Remove mbufs from multiple records until the count is exhausted. 441 */ 442 m = sb->sb_mb; 443 while (m && len > 0) { 444 if (m->m_len > len) { 445 m->m_len -= len; 446 m->m_data += len; 447 sb->sb_cc -= len; 448 break; 449 } 450 len -= m->m_len; 451 m = sbunlinkmbuf(sb, m, &free_chain); 452 if (m == NULL && len) 453 m = sb->sb_mb; 454 } 455 456 /* 457 * Remove any trailing 0-length mbufs in the current record. If 458 * the last record for which data was removed is now empty, m will be 459 * NULL. 460 */ 461 while (m && m->m_len == 0) { 462 m = sbunlinkmbuf(sb, m, &free_chain); 463 } 464 crit_exit(); 465 if (free_chain) 466 m_freem(free_chain); 467 sbcheck(sb); 468 } 469 470 /* 471 * Drop a record off the front of a sockbuf and move the next record 472 * to the front. 473 * 474 * Must be called while holding a critical section. 475 */ 476 void 477 sbdroprecord(struct sockbuf *sb) 478 { 479 struct mbuf *m; 480 struct mbuf *n; 481 482 sbcheck(sb); 483 m = sb->sb_mb; 484 if (m) { 485 if ((sb->sb_mb = m->m_nextpkt) == NULL) { 486 sb->sb_lastrecord = NULL; 487 sb->sb_lastmbuf = NULL; 488 } 489 m->m_nextpkt = NULL; 490 for (n = m; n; n = n->m_next) 491 sbfree(sb, n); 492 m_freem(m); 493 sbcheck(sb); 494 } 495 } 496 497 /* 498 * Drop the first mbuf off the sockbuf and move the next mbuf to the front. 499 * Currently only the head mbuf of the sockbuf may be dropped this way. 500 * 501 * The next mbuf in the same record as the mbuf being removed is returned 502 * or NULL if the record is exhausted. Note that other records may remain 503 * in the sockbuf when NULL is returned. 504 * 505 * Must be called while holding a critical section. 506 */ 507 struct mbuf * 508 sbunlinkmbuf(struct sockbuf *sb, struct mbuf *m, struct mbuf **free_chain) 509 { 510 struct mbuf *n; 511 512 KKASSERT(sb->sb_mb == m); 513 sbfree(sb, m); 514 n = m->m_next; 515 if (n) { 516 sb->sb_mb = n; 517 if (sb->sb_lastrecord == m) 518 sb->sb_lastrecord = n; 519 KKASSERT(sb->sb_lastmbuf != m); 520 n->m_nextpkt = m->m_nextpkt; 521 } else { 522 sb->sb_mb = m->m_nextpkt; 523 if (sb->sb_lastrecord == m) { 524 KKASSERT(sb->sb_mb == NULL); 525 sb->sb_lastrecord = NULL; 526 } 527 if (sb->sb_mb == NULL) 528 sb->sb_lastmbuf = NULL; 529 } 530 m->m_nextpkt = NULL; 531 if (free_chain) { 532 m->m_next = *free_chain; 533 *free_chain = m; 534 } else { 535 m->m_next = NULL; 536 } 537 return(n); 538 } 539 540 /* 541 * Create a "control" mbuf containing the specified data 542 * with the specified type for presentation on a socket buffer. 543 */ 544 struct mbuf * 545 sbcreatecontrol(caddr_t p, int size, int type, int level) 546 { 547 struct cmsghdr *cp; 548 struct mbuf *m; 549 550 if (CMSG_SPACE((u_int)size) > MCLBYTES) 551 return (NULL); 552 m = m_getl(CMSG_SPACE((u_int)size), MB_DONTWAIT, MT_CONTROL, 0, NULL); 553 if (m == NULL) 554 return (NULL); 555 m->m_len = CMSG_SPACE(size); 556 cp = mtod(m, struct cmsghdr *); 557 if (p != NULL) 558 memcpy(CMSG_DATA(cp), p, size); 559 cp->cmsg_len = CMSG_LEN(size); 560 cp->cmsg_level = level; 561 cp->cmsg_type = type; 562 return (m); 563 } 564 565