xref: /dflybsd-src/sys/kern/uipc_sockbuf.c (revision 6d49aa6ffaff1e5a2ff3abe70c453cc8b47adb73)
1 /*
2  * Copyright (c) 2005 Jeffrey M. Hsu.  All rights reserved.
3  * Copyright (c) 1982, 1986, 1988, 1990, 1993
4  *	The Regents of the University of California.  All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  * 3. All advertising materials mentioning features or use of this software
15  *    must display the following acknowledgement:
16  *	This product includes software developed by the University of
17  *	California, Berkeley and its contributors.
18  * 4. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  * @(#)uipc_socket2.c	8.1 (Berkeley) 6/10/93
35  * $FreeBSD: src/sys/kern/uipc_socket2.c,v 1.55.2.17 2002/08/31 19:04:55 dwmalone Exp $
36  * $DragonFly: src/sys/kern/uipc_sockbuf.c,v 1.1 2007/04/22 01:13:10 dillon Exp $
37  */
38 
39 #include "opt_param.h"
40 #include <sys/param.h>
41 #include <sys/systm.h>
42 #include <sys/domain.h>
43 #include <sys/file.h>	/* for maxfiles */
44 #include <sys/kernel.h>
45 #include <sys/proc.h>
46 #include <sys/malloc.h>
47 #include <sys/mbuf.h>
48 #include <sys/protosw.h>
49 #include <sys/resourcevar.h>
50 #include <sys/stat.h>
51 #include <sys/socket.h>
52 #include <sys/socketvar.h>
53 
54 #include <sys/thread2.h>
55 #include <sys/msgport2.h>
56 
57 /*
58  * Routines to add and remove
59  * data from an mbuf queue.
60  *
61  * The routines sbappend() or sbappendrecord() are normally called to
62  * append new mbufs to a socket buffer, after checking that adequate
63  * space is available, comparing the function sbspace() with the amount
64  * of data to be added.  sbappendrecord() differs from sbappend() in
65  * that data supplied is treated as the beginning of a new record.
 * To place a sender's address, optional access rights, and data in a
 * socket receive buffer, sbappendaddr() should be used.  To place
 * access rights and data in a socket receive buffer, sbappendcontrol()
 * should be used.  In either case, the new data begins a new record.
 * Note that these routines do not currently enforce the sbspace() limit
 * themselves (their space check is disabled), so, as with sbappend()
 * and sbappendrecord(), any space check must be made by the caller.
 * sbappendaddr() fails, returning 0, if the address does not fit in an
 * mbuf or if it cannot obtain an mbuf to store it in.
74  *
75  * Reliable protocols may use the socket send buffer to hold data
76  * awaiting acknowledgement.  Data is normally copied from a socket
77  * send buffer in a protocol with m_copy for output to a peer,
78  * and then removing the data from the socket buffer with sbdrop()
79  * or sbdroprecord() when the data is acknowledged by the peer.
80  */
81 
82 /*
83  * Append mbuf chain m to the last record in the
84  * socket buffer sb.  The additional space associated
85  * the mbuf chain is recorded in sb.  Empty mbufs are
86  * discarded and mbufs are compacted where possible.
87  */
88 void
89 sbappend(struct sockbuf *sb, struct mbuf *m)
90 {
91 	struct mbuf *n;
92 
93 	if (m) {
94 		n = sb->sb_mb;
95 		if (n) {
96 			while (n->m_nextpkt)
97 				n = n->m_nextpkt;
98 			do {
99 				if (n->m_flags & M_EOR) {
100 					/* XXXXXX!!!! */
101 					sbappendrecord(sb, m);
102 					return;
103 				}
104 			} while (n->m_next && (n = n->m_next));
105 		}
106 		sbcompress(sb, m, n);
107 	}
108 }
109 
110 /*
111  * sbappendstream() is an optimized form of sbappend() for protocols
112  * such as TCP that only have one record in the socket buffer, are
113  * not PR_ATOMIC, nor allow MT_CONTROL data.  A protocol that uses
114  * sbappendstream() must use sbappendstream() exclusively.
115  */
116 void
117 sbappendstream(struct sockbuf *sb, struct mbuf *m)
118 {
119 	KKASSERT(m->m_nextpkt == NULL);
120 	sbcompress(sb, m, sb->sb_lastmbuf);
121 }
122 
123 #ifdef SOCKBUF_DEBUG
124 
125 void
126 _sbcheck(struct sockbuf *sb)
127 {
128 	struct mbuf *m;
129 	struct mbuf *n = NULL;
130 	u_long len = 0, mbcnt = 0;
131 
132 	for (m = sb->sb_mb; m; m = n) {
133 	    n = m->m_nextpkt;
134 	    if (n == NULL && sb->sb_lastrecord != m) {
135 		    kprintf("sockbuf %p mismatched lastrecord %p vs %p\n", sb, sb->sb_lastrecord, m);
136 		    panic("sbcheck1");
137 
138 	    }
139 	    for (; m; m = m->m_next) {
140 		len += m->m_len;
141 		mbcnt += MSIZE;
142 		if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */
143 			mbcnt += m->m_ext.ext_size;
144 		if (n == NULL && m->m_next == NULL) {
145 			if (sb->sb_lastmbuf != m) {
146 				kprintf("sockbuf %p mismatched lastmbuf %p vs %p\n", sb, sb->sb_lastmbuf, m);
147 				panic("sbcheck2");
148 			}
149 		}
150 	    }
151 	}
152 	if (sb->sb_mb == NULL) {
153 	    if (sb->sb_lastrecord != NULL) {
154 		kprintf("sockbuf %p is empty, lastrecord not NULL: %p\n",
155 			sb, sb->sb_lastrecord);
156 		panic("sbcheck3");
157 	    }
158 	    if (sb->sb_lastmbuf != NULL) {
159 		kprintf("sockbuf %p is empty, lastmbuf not NULL: %p\n",
160 			sb, sb->sb_lastmbuf);
161 		panic("sbcheck4");
162 	    }
163 	}
164 	if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
165 		kprintf("sockbuf %p cc %ld != %ld || mbcnt %ld != %ld\n",
166 		    sb, len, sb->sb_cc, mbcnt, sb->sb_mbcnt);
167 		panic("sbcheck5");
168 	}
169 }
170 
171 #endif
172 
173 /*
174  * Same as sbappend(), except the mbuf chain begins a new record.
175  */
176 void
177 sbappendrecord(struct sockbuf *sb, struct mbuf *m0)
178 {
179 	struct mbuf *firstmbuf;
180 	struct mbuf *secondmbuf;
181 
182 	if (m0 == NULL)
183 		return;
184 
185 	sbcheck(sb);
186 
187 	/*
188 	 * Break the first mbuf off from the rest of the mbuf chain.
189 	 */
190 	firstmbuf = m0;
191 	secondmbuf = m0->m_next;
192 	m0->m_next = NULL;
193 
194 	/*
195 	 * Insert the first mbuf of the m0 mbuf chain as the last record of
196 	 * the sockbuf.  Note this permits zero length records!  Keep the
197 	 * sockbuf state consistent.
198 	 */
199 	if (sb->sb_mb == NULL)
200 		sb->sb_mb = firstmbuf;
201 	else
202 		sb->sb_lastrecord->m_nextpkt = firstmbuf;
203 	sb->sb_lastrecord = firstmbuf;	/* update hint for new last record */
204 	sb->sb_lastmbuf = firstmbuf;	/* update hint for new last mbuf */
205 
206 	if ((firstmbuf->m_flags & M_EOR) && (secondmbuf != NULL)) {
207 		/* propagate the EOR flag */
208 		firstmbuf->m_flags &= ~M_EOR;
209 		secondmbuf->m_flags |= M_EOR;
210 	}
211 
212 	/*
213 	 * The succeeding call to sbcompress() omits accounting for
214 	 * the first mbuf, so do it here.
215 	 */
216 	sballoc(sb, firstmbuf);
217 
218 	/* Compact the rest of the mbuf chain in after the first mbuf. */
219 	sbcompress(sb, secondmbuf, firstmbuf);
220 }
221 
222 /*
223  * Append address and data, and optionally, control (ancillary) data
224  * to the receive queue of a socket.  If present,
225  * m0 must include a packet header with total length.
226  * Returns 0 if no space in sockbuf or insufficient mbufs.
227  */
228 int
229 sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0,
230 	     struct mbuf *control)
231 {
232 	struct mbuf *m, *n;
233 	int space = asa->sa_len;
234 
235 	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
236 		panic("sbappendaddr");
237 	sbcheck(sb);
238 
239 	if (m0)
240 		space += m0->m_pkthdr.len;
241 	for (n = control; n; n = n->m_next) {
242 		space += n->m_len;
243 		if (n->m_next == 0)	/* keep pointer to last control buf */
244 			break;
245 	}
246 #if 0
247 	if (space > sbspace(sb))
248 		return (0);
249 #endif
250 	if (asa->sa_len > MLEN)
251 		return (0);
252 	MGET(m, MB_DONTWAIT, MT_SONAME);
253 	if (m == NULL)
254 		return (0);
255 	KKASSERT(m->m_nextpkt == NULL);
256 	m->m_len = asa->sa_len;
257 	bcopy(asa, mtod(m, caddr_t), asa->sa_len);
258 	if (n)
259 		n->m_next = m0;		/* concatenate data to control */
260 	else
261 		control = m0;
262 	m->m_next = control;
263 	for (n = m; n; n = n->m_next)
264 		sballoc(sb, n);
265 
266 	if (sb->sb_mb == NULL)
267 		sb->sb_mb = m;
268 	else
269 		sb->sb_lastrecord->m_nextpkt = m;
270 	sb->sb_lastrecord = m;
271 	while (m->m_next)
272 		m = m->m_next;
273 	sb->sb_lastmbuf = m;
274 
275 	return (1);
276 }
277 
278 /*
279  * Append control information followed by data.
280  * control must be non-null.
281  */
282 int
283 sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control)
284 {
285 	struct mbuf *n;
286 	u_int length, cmbcnt, m0mbcnt;
287 
288 	KASSERT(control != NULL, ("sbappendcontrol"));
289 	KKASSERT(control->m_nextpkt == NULL);
290 	sbcheck(sb);
291 
292 	length = m_countm(control, &n, &cmbcnt) + m_countm(m0, NULL, &m0mbcnt);
293 #if 0
294 	if (length > sbspace(sb))
295 		return (0);
296 #endif
297 
298 	n->m_next = m0;			/* concatenate data to control */
299 
300 	if (sb->sb_mb == NULL)
301 		sb->sb_mb = control;
302 	else
303 		sb->sb_lastrecord->m_nextpkt = control;
304 	sb->sb_lastrecord = control;
305 	sb->sb_lastmbuf = m0;
306 
307 	sb->sb_cc += length;
308 	sb->sb_mbcnt += cmbcnt + m0mbcnt;
309 
310 	return (1);
311 }
312 
313 /*
314  * Compress mbuf chain m into the socket buffer sb following mbuf tailm.
315  * If tailm is null, the buffer is presumed empty.  Also, as a side-effect,
316  * increment the sockbuf counts for each mbuf in the chain.
317  */
318 void
319 sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *tailm)
320 {
321 	int eor = 0;
322 	struct mbuf *free_chain = NULL;
323 
324 	sbcheck(sb);
325 	while (m) {
326 		struct mbuf *o;
327 
328 		eor |= m->m_flags & M_EOR;
329 		/*
330 		 * Disregard empty mbufs as long as we don't encounter
331 		 * an end-of-record or there is a trailing mbuf of
332 		 * the same type to propagate the EOR flag to.
333 		 *
334 		 * Defer the m_free() call because it can block and break
335 		 * the atomicy of the sockbuf.
336 		 */
337 		if (m->m_len == 0 &&
338 		    (eor == 0 ||
339 		     (((o = m->m_next) || (o = tailm)) &&
340 		      o->m_type == m->m_type))) {
341 			o = m->m_next;
342 			m->m_next = free_chain;
343 			free_chain = m;
344 			m = o;
345 			continue;
346 		}
347 
348 		/* See if we can coalesce with preceding mbuf. */
349 		if (tailm && !(tailm->m_flags & M_EOR) && M_WRITABLE(tailm) &&
350 		    m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */
351 		    m->m_len <= M_TRAILINGSPACE(tailm) &&
352 		    tailm->m_type == m->m_type) {
353 			bcopy(mtod(m, caddr_t),
354 			      mtod(tailm, caddr_t) + tailm->m_len,
355 			      (unsigned)m->m_len);
356 			tailm->m_len += m->m_len;
357 			sb->sb_cc += m->m_len;		/* update sb counter */
358 			o = m->m_next;
359 			m->m_next = free_chain;
360 			free_chain = m;
361 			m = o;
362 			continue;
363 		}
364 
365 		/* Insert whole mbuf. */
366 		if (tailm == NULL) {
367 			KASSERT(sb->sb_mb == NULL,
368 				("sbcompress: sb_mb not NULL"));
369 			sb->sb_mb = m;		/* only mbuf in sockbuf */
370 			sb->sb_lastrecord = m;	/* new last record */
371 		} else {
372 			tailm->m_next = m;	/* tack m on following tailm */
373 		}
374 		sb->sb_lastmbuf = m;	/* update last mbuf hint */
375 
376 		tailm = m;	/* just inserted mbuf becomes the new tail */
377 		m = m->m_next;		/* advance to next mbuf */
378 		tailm->m_next = NULL;	/* split inserted mbuf off from chain */
379 
380 		/* update sb counters for just added mbuf */
381 		sballoc(sb, tailm);
382 
383 		/* clear EOR on intermediate mbufs */
384 		tailm->m_flags &= ~M_EOR;
385 	}
386 
387 	/*
388 	 * Propogate EOR to the last mbuf
389 	 */
390 	if (eor) {
391 		if (tailm)
392 			tailm->m_flags |= eor;
393 		else
394 			kprintf("semi-panic: sbcompress");
395 	}
396 
397 	/*
398 	 * Clean up any defered frees.
399 	 */
400 	while (free_chain)
401 		free_chain = m_free(free_chain);
402 
403 	sbcheck(sb);
404 }
405 
406 /*
407  * Free all mbufs in a sockbuf.
408  * Check that all resources are reclaimed.
409  */
410 void
411 sbflush(struct sockbuf *sb)
412 {
413 	while (sb->sb_mbcnt) {
414 		/*
415 		 * Don't call sbdrop(sb, 0) if the leading mbuf is non-empty:
416 		 * we would loop forever. Panic instead.
417 		 */
418 		if (!sb->sb_cc && (sb->sb_mb == NULL || sb->sb_mb->m_len))
419 			break;
420 		sbdrop(sb, (int)sb->sb_cc);
421 	}
422 	KASSERT(!(sb->sb_cc || sb->sb_mb || sb->sb_mbcnt || sb->sb_lastmbuf),
423 	    ("sbflush: cc %ld || mb %p || mbcnt %ld || lastmbuf %p",
424 	    sb->sb_cc, sb->sb_mb, sb->sb_mbcnt, sb->sb_lastmbuf));
425 }
426 
427 /*
428  * Drop data from (the front of) a sockbuf.
429  */
430 void
431 sbdrop(struct sockbuf *sb, int len)
432 {
433 	struct mbuf *m;
434 	struct mbuf *free_chain = NULL;
435 
436 	sbcheck(sb);
437 	crit_enter();
438 
439 	/*
440 	 * Remove mbufs from multiple records until the count is exhausted.
441 	 */
442 	m = sb->sb_mb;
443 	while (m && len > 0) {
444 		if (m->m_len > len) {
445 			m->m_len -= len;
446 			m->m_data += len;
447 			sb->sb_cc -= len;
448 			break;
449 		}
450 		len -= m->m_len;
451 		m = sbunlinkmbuf(sb, m, &free_chain);
452 		if (m == NULL && len)
453 			m = sb->sb_mb;
454 	}
455 
456 	/*
457 	 * Remove any trailing 0-length mbufs in the current record.  If
458 	 * the last record for which data was removed is now empty, m will be
459 	 * NULL.
460 	 */
461 	while (m && m->m_len == 0) {
462 		m = sbunlinkmbuf(sb, m, &free_chain);
463 	}
464 	crit_exit();
465 	if (free_chain)
466 		m_freem(free_chain);
467 	sbcheck(sb);
468 }
469 
470 /*
471  * Drop a record off the front of a sockbuf and move the next record
472  * to the front.
473  *
474  * Must be called while holding a critical section.
475  */
476 void
477 sbdroprecord(struct sockbuf *sb)
478 {
479 	struct mbuf *m;
480 	struct mbuf *n;
481 
482 	sbcheck(sb);
483 	m = sb->sb_mb;
484 	if (m) {
485 		if ((sb->sb_mb = m->m_nextpkt) == NULL) {
486 			sb->sb_lastrecord = NULL;
487 			sb->sb_lastmbuf = NULL;
488 		}
489 		m->m_nextpkt = NULL;
490 		for (n = m; n; n = n->m_next)
491 			sbfree(sb, n);
492 		m_freem(m);
493 		sbcheck(sb);
494 	}
495 }
496 
497 /*
498  * Drop the first mbuf off the sockbuf and move the next mbuf to the front.
499  * Currently only the head mbuf of the sockbuf may be dropped this way.
500  *
501  * The next mbuf in the same record as the mbuf being removed is returned
502  * or NULL if the record is exhausted.  Note that other records may remain
503  * in the sockbuf when NULL is returned.
504  *
505  * Must be called while holding a critical section.
506  */
507 struct mbuf *
508 sbunlinkmbuf(struct sockbuf *sb, struct mbuf *m, struct mbuf **free_chain)
509 {
510 	struct mbuf *n;
511 
512 	KKASSERT(sb->sb_mb == m);
513 	sbfree(sb, m);
514 	n = m->m_next;
515 	if (n) {
516 		sb->sb_mb = n;
517 		if (sb->sb_lastrecord == m)
518 			sb->sb_lastrecord = n;
519 		KKASSERT(sb->sb_lastmbuf != m);
520 		n->m_nextpkt = m->m_nextpkt;
521 	} else {
522 		sb->sb_mb = m->m_nextpkt;
523 		if (sb->sb_lastrecord == m) {
524 			KKASSERT(sb->sb_mb == NULL);
525 			sb->sb_lastrecord = NULL;
526 		}
527 		if (sb->sb_mb == NULL)
528 			sb->sb_lastmbuf = NULL;
529 	}
530 	m->m_nextpkt = NULL;
531 	if (free_chain) {
532 		m->m_next = *free_chain;
533 		*free_chain = m;
534 	} else {
535 		m->m_next = NULL;
536 	}
537 	return(n);
538 }
539 
540 /*
541  * Create a "control" mbuf containing the specified data
542  * with the specified type for presentation on a socket buffer.
543  */
544 struct mbuf *
545 sbcreatecontrol(caddr_t p, int size, int type, int level)
546 {
547 	struct cmsghdr *cp;
548 	struct mbuf *m;
549 
550 	if (CMSG_SPACE((u_int)size) > MCLBYTES)
551 		return (NULL);
552 	m = m_getl(CMSG_SPACE((u_int)size), MB_DONTWAIT, MT_CONTROL, 0, NULL);
553 	if (m == NULL)
554 		return (NULL);
555 	m->m_len = CMSG_SPACE(size);
556 	cp = mtod(m, struct cmsghdr *);
557 	if (p != NULL)
558 		memcpy(CMSG_DATA(cp), p, size);
559 	cp->cmsg_len = CMSG_LEN(size);
560 	cp->cmsg_level = level;
561 	cp->cmsg_type = type;
562 	return (m);
563 }
564 
565