/*	$OpenBSD: uipc_socket2.c,v 1.89 2017/12/30 20:47:00 guenther Exp $	*/
/*	$NetBSD: uipc_socket2.c,v 1.11 1996/02/04 02:17:55 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket2.c	8.1 (Berkeley) 6/10/93
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/domain.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/event.h>
#include <sys/pool.h>

/*
 * Primitive routines for operating on sockets and socket buffers
 */

u_long	sb_max = SB_MAX;		/* patchable */

extern struct pool mclpools[];
extern struct pool mbpool;

/*
 * Procedures to manipulate state flags of socket
 * and do appropriate wakeups.  Normal sequence from the
 * active (originating) side is that soisconnecting() is
 * called during processing of connect() call,
 * resulting in an eventual call to soisconnected() if/when the
 * connection is established.  When the connection is torn down
 * soisdisconnecting() is called during processing of disconnect() call,
 * and soisdisconnected() is called when the connection to the peer
 * is totally severed.  The semantics of these routines are such that
 * connectionless protocols can call soisconnected() and soisdisconnected()
 * only, bypassing the in-progress calls when setting up a ``connection''
 * takes no time.
 *
 * From the passive side, a socket is created with
 * two queues of sockets: so_q0 for connections in progress
 * and so_q for connections already made and awaiting user acceptance.
 * As a protocol is preparing incoming connections, it creates a socket
 * structure queued on so_q0 by calling sonewconn().  When the connection
 * is established, soisconnected() is called, and transfers the
 * socket structure to so_q, making it available to accept().
 *
 * If a socket is closed with sockets on either
 * so_q0 or so_q, these sockets are dropped.
 *
 * If higher level protocols are implemented in
 * the kernel, the wakeups done here will sometimes
 * cause software-interrupt process scheduling.
 */
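
/*
 * Illustrative sketch (an editorial addition, not part of the original
 * comment): a TCP-like listener might drive the passive side roughly as
 * follows; the exact trigger points are assumptions.
 *
 *	listen(2)            head gains SO_ACCEPTCONN
 *	connection request   so = sonewconn(head, 0)   so queued on so_q0
 *	handshake completes  soisconnected(so)         so moved to so_q
 *	accept(2)            dequeues so from head->so_q
 *
 * A connectionless or instantly-completing protocol may instead pass
 * SS_ISCONNECTED to sonewconn() and skip the so_q0 stage entirely.
 */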

void
soisconnecting(struct socket *so)
{
	soassertlocked(so);
	so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= SS_ISCONNECTING;
}

void
soisconnected(struct socket *so)
{
	struct socket *head = so->so_head;

	soassertlocked(so);
	so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING);
	so->so_state |= SS_ISCONNECTED;
	if (head && soqremque(so, 0)) {
		soqinsque(head, so, 1);
		sorwakeup(head);
		wakeup_one(&head->so_timeo);
	} else {
		wakeup(&so->so_timeo);
		sorwakeup(so);
		sowwakeup(so);
	}
}

void
soisdisconnecting(struct socket *so)
{
	soassertlocked(so);
	so->so_state &= ~SS_ISCONNECTING;
	so->so_state |= (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE);
	wakeup(&so->so_timeo);
	sowwakeup(so);
	sorwakeup(so);
}

void
soisdisconnected(struct socket *so)
{
	soassertlocked(so);
	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_ISDISCONNECTED);
	wakeup(&so->so_timeo);
	sowwakeup(so);
	sorwakeup(so);
}

/*
 * When an attempt at a new connection is noted on a socket
 * which accepts connections, sonewconn is called.  If the
 * connection is possible (subject to space constraints, etc.)
 * then we allocate a new structure, properly linked into the
 * data structure of the original socket, and return this.
 * Connstatus may be 0 or SS_ISCONNECTED.
 */
struct socket *
sonewconn(struct socket *head, int connstatus)
{
	struct socket *so;
	int soqueue = connstatus ? 1 : 0;

	/*
	 * XXXSMP as long as `so' and `head' share the same lock, we
	 * can call soreserve() and pr_attach() below w/o explicitly
	 * locking `so'.
	 */
	soassertlocked(head);

	if (mclpools[0].pr_nout > mclpools[0].pr_hardlimit * 95 / 100)
		return (NULL);
	if (head->so_qlen + head->so_q0len > head->so_qlimit * 3)
		return (NULL);
	so = pool_get(&socket_pool, PR_NOWAIT|PR_ZERO);
	if (so == NULL)
		return (NULL);
	so->so_type = head->so_type;
	so->so_options = head->so_options &~ SO_ACCEPTCONN;
	so->so_linger = head->so_linger;
	so->so_state = head->so_state | SS_NOFDREF;
	so->so_proto = head->so_proto;
	so->so_timeo = head->so_timeo;
	so->so_pgid = head->so_pgid;
	so->so_euid = head->so_euid;
	so->so_ruid = head->so_ruid;
	so->so_egid = head->so_egid;
	so->so_rgid = head->so_rgid;
	so->so_cpid = head->so_cpid;
	so->so_siguid = head->so_siguid;
	so->so_sigeuid = head->so_sigeuid;

	/*
	 * Inherit watermarks, but they may be clamped in low-memory
	 * situations.
	 */
	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
		pool_put(&socket_pool, so);
		return (NULL);
	}
	so->so_snd.sb_wat = head->so_snd.sb_wat;
	so->so_snd.sb_lowat = head->so_snd.sb_lowat;
	so->so_snd.sb_timeo = head->so_snd.sb_timeo;
	so->so_rcv.sb_wat = head->so_rcv.sb_wat;
	so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
	so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;

	soqinsque(head, so, soqueue);
	if ((*so->so_proto->pr_attach)(so, 0)) {
		(void) soqremque(so, soqueue);
		pool_put(&socket_pool, so);
		return (NULL);
	}
	if (connstatus) {
		sorwakeup(head);
		wakeup(&head->so_timeo);
		so->so_state |= connstatus;
	}
	return (so);
}
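
/*
 * Hedged usage sketch for sonewconn(): a protocol input path might do
 * something like the following (the surrounding error handling is a
 * hypothetical outline, not a copy of any real call site).
 *
 *	struct socket *so;
 *
 *	so = sonewconn(head, 0);
 *	if (so == NULL)
 *		goto drop;		listen queue full or memory low
 *	...protocol handshake...
 *	soisconnected(so);		moves so from so_q0 to so_q
 */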

void
soqinsque(struct socket *head, struct socket *so, int q)
{
	soassertlocked(head);

#ifdef DIAGNOSTIC
	if (so->so_onq != NULL)
		panic("soqinsque");
#endif

	so->so_head = head;
	if (q == 0) {
		head->so_q0len++;
		so->so_onq = &head->so_q0;
	} else {
		head->so_qlen++;
		so->so_onq = &head->so_q;
	}
	TAILQ_INSERT_TAIL(so->so_onq, so, so_qe);
}

int
soqremque(struct socket *so, int q)
{
	struct socket *head = so->so_head;

	soassertlocked(head);

	if (q == 0) {
		if (so->so_onq != &head->so_q0)
			return (0);
		head->so_q0len--;
	} else {
		if (so->so_onq != &head->so_q)
			return (0);
		head->so_qlen--;
	}
	TAILQ_REMOVE(so->so_onq, so, so_qe);
	so->so_onq = NULL;
	so->so_head = NULL;
	return (1);
}

/*
 * Socantsendmore indicates that no more data will be sent on the
 * socket; it is normally applied to a socket by the protocol code
 * (in the PRU_SHUTDOWN case) when the user informs the system that
 * no more data is to be sent.  Socantrcvmore indicates that no more
 * data will be received, and will normally be applied to the socket
 * by a protocol when it detects that the peer will send no more data.
 * Data queued for reading in the socket may yet be read.
 */

void
socantsendmore(struct socket *so)
{
	soassertlocked(so);
	so->so_state |= SS_CANTSENDMORE;
	sowwakeup(so);
}

void
socantrcvmore(struct socket *so)
{
	soassertlocked(so);
	so->so_state |= SS_CANTRCVMORE;
	sorwakeup(so);
}

int
solock(struct socket *so)
{
	int s = 0;

	if ((so->so_proto->pr_domain->dom_family != PF_LOCAL) &&
	    (so->so_proto->pr_domain->dom_family != PF_ROUTE) &&
	    (so->so_proto->pr_domain->dom_family != PF_KEY))
		NET_LOCK();
	else
		s = -42;

	return (s);
}

void
sounlock(int s)
{
	if (s != -42)
		NET_UNLOCK();
}
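
/*
 * Typical pairing, a sketch only: callers stash solock()'s return value
 * and hand it back to sounlock().
 *
 *	int s = solock(so);	NET_LOCK() unless PF_LOCAL/PF_ROUTE/PF_KEY
 *	...operate on so...
 *	sounlock(s);
 */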

void
soassertlocked(struct socket *so)
{
	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		NET_ASSERT_LOCKED();
		break;
	case PF_LOCAL:
	case PF_ROUTE:
	case PF_KEY:
	default:
		KERNEL_ASSERT_LOCKED();
		break;
	}
}

int
sosleep(struct socket *so, void *ident, int prio, const char *wmesg, int timo)
{
	if ((so->so_proto->pr_domain->dom_family != PF_LOCAL) &&
	    (so->so_proto->pr_domain->dom_family != PF_ROUTE) &&
	    (so->so_proto->pr_domain->dom_family != PF_KEY)) {
		return rwsleep(ident, &netlock, prio, wmesg, timo);
	} else
		return tsleep(ident, prio, wmesg, timo);
}

/*
 * Wait for data to arrive at/drain from a socket buffer.
 */
int
sbwait(struct socket *so, struct sockbuf *sb)
{
	int prio = (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH;

	soassertlocked(so);

	sb->sb_flags |= SB_WAIT;
	return (sosleep(so, &sb->sb_cc, prio, "netio", sb->sb_timeo));
}
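
/*
 * Sketch of the canonical consumer loop; soreceive() performs a more
 * elaborate version of this, so the loop below is illustrative only.
 *
 *	while (so->so_rcv.sb_cc == 0) {
 *		if (so->so_state & SS_CANTRCVMORE)
 *			break;
 *		error = sbwait(so, &so->so_rcv);
 *		if (error)
 *			return (error);
 *	}
 */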

int
sblock(struct socket *so, struct sockbuf *sb, int wait)
{
	int error, prio = (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH;

	soassertlocked(so);

	if ((sb->sb_flags & SB_LOCK) == 0) {
		sb->sb_flags |= SB_LOCK;
		return (0);
	}
	if (wait & M_NOWAIT)
		return (EWOULDBLOCK);

	while (sb->sb_flags & SB_LOCK) {
		sb->sb_flags |= SB_WANT;
		error = sosleep(so, &sb->sb_flags, prio, "netlck", 0);
		if (error)
			return (error);
	}
	sb->sb_flags |= SB_LOCK;
	return (0);
}

void
sbunlock(struct socket *so, struct sockbuf *sb)
{
	soassertlocked(so);

	sb->sb_flags &= ~SB_LOCK;
	if (sb->sb_flags & SB_WANT) {
		sb->sb_flags &= ~SB_WANT;
		wakeup(&sb->sb_flags);
	}
}
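
/*
 * Lock-pairing sketch: the wait argument chooses between sleeping
 * (M_WAITOK) and failing with EWOULDBLOCK (M_NOWAIT).
 *
 *	error = sblock(so, &so->so_snd, M_WAITOK);
 *	if (error)
 *		return (error);
 *	...exclusive use of so->so_snd...
 *	sbunlock(so, &so->so_snd);
 */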

/*
 * Wakeup processes waiting on a socket buffer.
 * Do asynchronous notification via SIGIO
 * if the socket has the SS_ASYNC flag set.
 */
void
sowakeup(struct socket *so, struct sockbuf *sb)
{
	soassertlocked(so);

	sb->sb_flags &= ~SB_SEL;
	if (sb->sb_flags & SB_WAIT) {
		sb->sb_flags &= ~SB_WAIT;
		wakeup(&sb->sb_cc);
	}
	KERNEL_LOCK();
	if (so->so_state & SS_ASYNC)
		csignal(so->so_pgid, SIGIO, so->so_siguid, so->so_sigeuid);
	selwakeup(&sb->sb_sel);
	KERNEL_UNLOCK();
}

/*
 * Socket buffer (struct sockbuf) utility routines.
 *
 * Each socket contains two socket buffers: one for sending data and
 * one for receiving data.  Each buffer contains a queue of mbufs,
 * information about the number of mbufs and amount of data in the
 * queue, and other fields allowing select() statements and notification
 * on data availability to be implemented.
 *
 * Data stored in a socket buffer is maintained as a list of records.
 * Each record is a list of mbufs chained together with the m_next
 * field.  Records are chained together with the m_nextpkt field. The upper
 * level routine soreceive() expects the following conventions to be
 * observed when placing information in the receive buffer:
 *
 * 1. If the protocol requires each message be preceded by the sender's
 *    name, then a record containing that name must be present before
 *    any associated data (mbuf's must be of type MT_SONAME).
 * 2. If the protocol supports the exchange of ``access rights'' (really
 *    just additional data associated with the message), and there are
 *    ``rights'' to be received, then a record containing this data
 *    should be present (mbuf's must be of type MT_CONTROL).
 * 3. If a name or rights record exists, then it must be followed by
 *    a data record, perhaps of zero length.
 *
 * Before using a new socket structure it is first necessary to reserve
 * buffer space to the socket, by calling sbreserve().  This should commit
 * some of the available buffer space in the system buffer pool for the
 * socket (currently, it does nothing but enforce limits).  The space
 * should be released by calling sbrelease() when the socket is destroyed.
 */
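
/*
 * Editorial example of the record conventions above: one datagram queued
 * with sender name, control data and payload forms a single record, with
 * m_next linking the pieces and m_nextpkt linking records (this is the
 * shape sbappendaddr() below constructs).
 *
 *	sb_mb -> [MT_SONAME] -m_next-> [MT_CONTROL] -m_next-> [MT_DATA]
 *	           |
 *	           m_nextpkt -> next record (next datagram)
 */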

int
soreserve(struct socket *so, u_long sndcc, u_long rcvcc)
{
	soassertlocked(so);

	if (sbreserve(so, &so->so_snd, sndcc))
		goto bad;
	if (sbreserve(so, &so->so_rcv, rcvcc))
		goto bad2;
	so->so_snd.sb_wat = sndcc;
	so->so_rcv.sb_wat = rcvcc;
	if (so->so_rcv.sb_lowat == 0)
		so->so_rcv.sb_lowat = 1;
	if (so->so_snd.sb_lowat == 0)
		so->so_snd.sb_lowat = MCLBYTES;
	if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
		so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
	return (0);
bad2:
	sbrelease(so, &so->so_snd);
bad:
	return (ENOBUFS);
}

/*
 * Allot mbufs to a sockbuf.
 * Attempt to scale mbmax so that mbcnt doesn't become limiting
 * if buffering efficiency is near the normal case.
 */
int
sbreserve(struct socket *so, struct sockbuf *sb, u_long cc)
{
	KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
	soassertlocked(so);

	if (cc == 0 || cc > sb_max)
		return (1);
	sb->sb_hiwat = cc;
	sb->sb_mbmax = max(3 * MAXMCLBYTES,
	    min(cc * 2, sb_max + (sb_max / MCLBYTES) * MSIZE));
	if (sb->sb_lowat > sb->sb_hiwat)
		sb->sb_lowat = sb->sb_hiwat;
	return (0);
}
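
/*
 * Worked example (editorial; the constants are machine-dependent
 * assumptions: MSIZE 256, MCLBYTES 2048, MAXMCLBYTES 64K, sb_max 256K):
 * sbreserve(so, sb, 16384) sets sb_hiwat to 16384 and sb_mbmax to
 * max(3 * 65536, min(2 * 16384, 262144 + 128 * 256)) = 196608, so the
 * mbuf-accounting ceiling sits well above the data high-water mark.
 */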

/*
 * In a low-memory situation, do not accept any request greater than
 * normal.
 */
int
sbcheckreserve(u_long cnt, u_long defcnt)
{
	if (cnt > defcnt && sbchecklowmem())
		return (ENOBUFS);
	return (0);
}

int
sbchecklowmem(void)
{
	static int sblowmem;

	if (mclpools[0].pr_nout < mclpools[0].pr_hardlimit * 60 / 100 ||
	    mbpool.pr_nout < mbpool.pr_hardlimit * 60 / 100)
		sblowmem = 0;
	if (mclpools[0].pr_nout > mclpools[0].pr_hardlimit * 80 / 100 ||
	    mbpool.pr_nout > mbpool.pr_hardlimit * 80 / 100)
		sblowmem = 1;
	return (sblowmem);
}
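
/*
 * The 60%/80% pair above gives hysteresis: once cluster or mbuf pool
 * usage crosses 80% of its hard limit the low-memory flag latches on,
 * and it clears only after usage falls back below 60%, so callers near
 * the boundary do not see the state flap on every allocation.
 */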

/*
 * Free mbufs held by a socket, and reserved mbuf space.
 */
void
sbrelease(struct socket *so, struct sockbuf *sb)
{

	sbflush(so, sb);
	sb->sb_hiwat = sb->sb_mbmax = 0;
}

/*
 * Routines to add and remove
 * data from an mbuf queue.
 *
 * The routines sbappend() or sbappendrecord() are normally called to
 * append new mbufs to a socket buffer, after checking that adequate
 * space is available, comparing the function sbspace() with the amount
 * of data to be added.  sbappendrecord() differs from sbappend() in
 * that data supplied is treated as the beginning of a new record.
 * To place a sender's address, optional access rights, and data in a
 * socket receive buffer, sbappendaddr() should be used.  To place
 * access rights and data in a socket receive buffer, sbappendcontrol()
 * should be used.  In either case, the new data begins a new record.
 * Note that unlike sbappend() and sbappendrecord(), these routines check
 * for the caller that there will be enough space to store the data.
 * Each fails if there is not enough space, or if it cannot find mbufs
 * to store additional information in.
 *
 * Reliable protocols may use the socket send buffer to hold data
 * awaiting acknowledgement.  Data is normally copied from a socket
 * send buffer with m_copym for output to a peer, and then removed
 * from the socket buffer with sbdrop() or sbdroprecord() when the
 * data is acknowledged by the peer.
 */
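
/*
 * Sketch of the reliable-protocol pattern just described; "off",
 * "len" and "acked" are hypothetical bookkeeping variables.
 *
 *	m = m_copym(so->so_snd.sb_mb, off, len, M_DONTWAIT);
 *	...transmit m to the peer...
 *	...peer acknowledges "acked" bytes...
 *	sbdrop(so, &so->so_snd, acked);
 */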

#ifdef SOCKBUF_DEBUG
void
sblastrecordchk(struct sockbuf *sb, const char *where)
{
	struct mbuf *m = sb->sb_mb;

	while (m && m->m_nextpkt)
		m = m->m_nextpkt;

	if (m != sb->sb_lastrecord) {
		printf("sblastrecordchk: sb_mb %p sb_lastrecord %p last %p\n",
		    sb->sb_mb, sb->sb_lastrecord, m);
		printf("packet chain:\n");
		for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt)
			printf("\t%p\n", m);
		panic("sblastrecordchk from %s", where);
	}
}

void
sblastmbufchk(struct sockbuf *sb, const char *where)
{
	struct mbuf *m = sb->sb_mb;
	struct mbuf *n;

	while (m && m->m_nextpkt)
		m = m->m_nextpkt;

	while (m && m->m_next)
		m = m->m_next;

	if (m != sb->sb_mbtail) {
		printf("sblastmbufchk: sb_mb %p sb_mbtail %p last %p\n",
		    sb->sb_mb, sb->sb_mbtail, m);
		printf("packet tree:\n");
		for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) {
			printf("\t");
			for (n = m; n != NULL; n = n->m_next)
				printf("%p ", n);
			printf("\n");
		}
		panic("sblastmbufchk from %s", where);
	}
}
#endif /* SOCKBUF_DEBUG */

#define	SBLINKRECORD(sb, m0)						\
do {									\
	if ((sb)->sb_lastrecord != NULL)				\
		(sb)->sb_lastrecord->m_nextpkt = (m0);			\
	else								\
		(sb)->sb_mb = (m0);					\
	(sb)->sb_lastrecord = (m0);					\
} while (/*CONSTCOND*/0)

/*
 * Append mbuf chain m to the last record in the
 * socket buffer sb.  The additional space associated with the
 * mbuf chain is recorded in sb.  Empty mbufs are
 * discarded and mbufs are compacted where possible.
 */
void
sbappend(struct socket *so, struct sockbuf *sb, struct mbuf *m)
{
	struct mbuf *n;

	if (m == NULL)
		return;

	SBLASTRECORDCHK(sb, "sbappend 1");

	if ((n = sb->sb_lastrecord) != NULL) {
		/*
		 * XXX Would like to simply use sb_mbtail here, but
		 * XXX I need to verify that I won't miss an EOR that
		 * XXX way.
		 */
		do {
			if (n->m_flags & M_EOR) {
				sbappendrecord(so, sb, m); /* XXXXXX!!!! */
				return;
			}
		} while (n->m_next && (n = n->m_next));
	} else {
		/*
		 * If this is the first record in the socket buffer, it's
		 * also the last record.
		 */
		sb->sb_lastrecord = m;
	}
	sbcompress(sb, m, n);
	SBLASTRECORDCHK(sb, "sbappend 2");
}

/*
 * This version of sbappend() should only be used when the caller
 * absolutely knows that there will never be more than one record
 * in the socket buffer, that is, a stream protocol (such as TCP).
 */
void
sbappendstream(struct socket *so, struct sockbuf *sb, struct mbuf *m)
{
	KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
	soassertlocked(so);
	KDASSERT(m->m_nextpkt == NULL);
	KASSERT(sb->sb_mb == sb->sb_lastrecord);

	SBLASTMBUFCHK(sb, __func__);

	sbcompress(sb, m, sb->sb_mbtail);

	sb->sb_lastrecord = sb->sb_mb;
	SBLASTRECORDCHK(sb, __func__);
}
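
/*
 * A stream protocol's input path is assumed to look roughly like this
 * (hedged sketch; TCP's real reassembly bookkeeping is more involved):
 *
 *	sbappendstream(so, &so->so_rcv, m);
 *	sorwakeup(so);
 */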

#ifdef SOCKBUF_DEBUG
void
sbcheck(struct sockbuf *sb)
{
	struct mbuf *m, *n;
	u_long len = 0, mbcnt = 0;

	for (m = sb->sb_mb; m; m = m->m_nextpkt) {
		for (n = m; n; n = n->m_next) {
			len += n->m_len;
			mbcnt += MSIZE;
			if (n->m_flags & M_EXT)
				mbcnt += n->m_ext.ext_size;
			if (m != n && n->m_nextpkt)
				panic("sbcheck nextpkt");
		}
	}
	if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
		printf("cc %lu != %lu || mbcnt %lu != %lu\n", len, sb->sb_cc,
		    mbcnt, sb->sb_mbcnt);
		panic("sbcheck");
	}
}
#endif

/*
 * As above, except the mbuf chain
 * begins a new record.
 */
void
sbappendrecord(struct socket *so, struct sockbuf *sb, struct mbuf *m0)
{
	struct mbuf *m;

	KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
	soassertlocked(so);

	if (m0 == NULL)
		return;

	/*
	 * Put the first mbuf on the queue.
	 * Note this permits zero length records.
	 */
	sballoc(sb, m0);
	SBLASTRECORDCHK(sb, "sbappendrecord 1");
	SBLINKRECORD(sb, m0);
	m = m0->m_next;
	m0->m_next = NULL;
	if (m && (m0->m_flags & M_EOR)) {
		m0->m_flags &= ~M_EOR;
		m->m_flags |= M_EOR;
	}
	sbcompress(sb, m, m0);
	SBLASTRECORDCHK(sb, "sbappendrecord 2");
}

/*
 * As above except that OOB data
 * is inserted at the beginning of the sockbuf,
 * but after any other OOB data.
 */
void
sbinsertoob(struct sockbuf *sb, struct mbuf *m0)
{
	struct mbuf *m, **mp;

	if (m0 == NULL)
		return;

	SBLASTRECORDCHK(sb, "sbinsertoob 1");

	for (mp = &sb->sb_mb; (m = *mp) != NULL; mp = &((*mp)->m_nextpkt)) {
	    again:
		switch (m->m_type) {

		case MT_OOBDATA:
			continue;		/* WANT next train */

		case MT_CONTROL:
			if ((m = m->m_next) != NULL)
				goto again;	/* inspect THIS train further */
		}
		break;
	}
	/*
	 * Put the first mbuf on the queue.
	 * Note this permits zero length records.
	 */
	sballoc(sb, m0);
	m0->m_nextpkt = *mp;
	if (*mp == NULL) {
		/* m0 is actually the new tail */
		sb->sb_lastrecord = m0;
	}
	*mp = m0;
	m = m0->m_next;
	m0->m_next = NULL;
	if (m && (m0->m_flags & M_EOR)) {
		m0->m_flags &= ~M_EOR;
		m->m_flags |= M_EOR;
	}
	sbcompress(sb, m, m0);
	SBLASTRECORDCHK(sb, "sbinsertoob 2");
}

/*
 * Append address and data, and optionally, control (ancillary) data
 * to the receive queue of a socket.  If present,
 * m0 must include a packet header with total length.
 * Returns 0 if no space in sockbuf or insufficient mbufs.
 */
int
sbappendaddr(struct socket *so, struct sockbuf *sb, struct sockaddr *asa,
    struct mbuf *m0, struct mbuf *control)
{
	struct mbuf *m, *n, *nlast;
	int space = asa->sa_len;

	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
		panic("sbappendaddr");
	if (m0)
		space += m0->m_pkthdr.len;
	for (n = control; n; n = n->m_next) {
		space += n->m_len;
		if (n->m_next == NULL)	/* keep pointer to last control buf */
			break;
	}
	if (space > sbspace(so, sb))
		return (0);
	if (asa->sa_len > MLEN)
		return (0);
	MGET(m, M_DONTWAIT, MT_SONAME);
	if (m == NULL)
		return (0);
	m->m_len = asa->sa_len;
	memcpy(mtod(m, caddr_t), asa, asa->sa_len);
	if (n)
		n->m_next = m0;		/* concatenate data to control */
	else
		control = m0;
	m->m_next = control;

	SBLASTRECORDCHK(sb, "sbappendaddr 1");

	for (n = m; n->m_next != NULL; n = n->m_next)
		sballoc(sb, n);
	sballoc(sb, n);
	nlast = n;
	SBLINKRECORD(sb, m);

	sb->sb_mbtail = nlast;
	SBLASTMBUFCHK(sb, "sbappendaddr");

	SBLASTRECORDCHK(sb, "sbappendaddr 2");

	return (1);
}
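
/*
 * Datagram delivery sketch, modeled on what a udp_input()-style routine
 * is assumed to do; "srcsa" and "opts" are hypothetical names.
 *
 *	if (sbappendaddr(so, &so->so_rcv, srcsa, m, opts) == 0) {
 *		m_freem(m);		no room: drop the datagram
 *		m_freem(opts);
 *	} else
 *		sorwakeup(so);
 */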

int
sbappendcontrol(struct socket *so, struct sockbuf *sb, struct mbuf *m0,
    struct mbuf *control)
{
	struct mbuf *m, *mlast, *n;
	int space = 0;

	if (control == NULL)
		panic("sbappendcontrol");
	for (m = control; ; m = m->m_next) {
		space += m->m_len;
		if (m->m_next == NULL)
			break;
	}
	n = m;			/* save pointer to last control buffer */
	for (m = m0; m; m = m->m_next)
		space += m->m_len;
	if (space > sbspace(so, sb))
		return (0);
	n->m_next = m0;			/* concatenate data to control */

	SBLASTRECORDCHK(sb, "sbappendcontrol 1");

	for (m = control; m->m_next != NULL; m = m->m_next)
		sballoc(sb, m);
	sballoc(sb, m);
	mlast = m;
	SBLINKRECORD(sb, control);

	sb->sb_mbtail = mlast;
	SBLASTMBUFCHK(sb, "sbappendcontrol");

	SBLASTRECORDCHK(sb, "sbappendcontrol 2");

	return (1);
}

/*
 * Compress mbuf chain m into the socket
 * buffer sb following mbuf n.  If n
 * is null, the buffer is presumed empty.
 */
void
sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n)
{
	int eor = 0;
	struct mbuf *o;

	while (m) {
		eor |= m->m_flags & M_EOR;
		if (m->m_len == 0 &&
		    (eor == 0 ||
		    (((o = m->m_next) || (o = n)) &&
		    o->m_type == m->m_type))) {
			if (sb->sb_lastrecord == m)
				sb->sb_lastrecord = m->m_next;
			m = m_free(m);
			continue;
		}
		if (n && (n->m_flags & M_EOR) == 0 &&
		    /* M_TRAILINGSPACE() checks buffer writeability */
		    m->m_len <= MCLBYTES / 4 && /* XXX Don't copy too much */
		    m->m_len <= M_TRAILINGSPACE(n) &&
		    n->m_type == m->m_type) {
			memcpy(mtod(n, caddr_t) + n->m_len, mtod(m, caddr_t),
			    m->m_len);
			n->m_len += m->m_len;
			sb->sb_cc += m->m_len;
			if (m->m_type != MT_CONTROL && m->m_type != MT_SONAME)
				sb->sb_datacc += m->m_len;
			m = m_free(m);
			continue;
		}
		if (n)
			n->m_next = m;
		else
			sb->sb_mb = m;
		sb->sb_mbtail = m;
		sballoc(sb, m);
		n = m;
		m->m_flags &= ~M_EOR;
		m = m->m_next;
		n->m_next = NULL;
	}
	if (eor) {
		if (n)
			n->m_flags |= eor;
		else
			printf("semi-panic: sbcompress\n");
	}
	SBLASTMBUFCHK(sb, __func__);
}

/*
 * Free all mbufs in a sockbuf.
 * Check that all resources are reclaimed.
 */
void
sbflush(struct socket *so, struct sockbuf *sb)
{
	KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
	KASSERT((sb->sb_flags & SB_LOCK) == 0);

	while (sb->sb_mbcnt)
		sbdrop(so, sb, (int)sb->sb_cc);

	KASSERT(sb->sb_cc == 0);
	KASSERT(sb->sb_datacc == 0);
	KASSERT(sb->sb_mb == NULL);
	KASSERT(sb->sb_mbtail == NULL);
	KASSERT(sb->sb_lastrecord == NULL);
}

/*
 * Drop data from (the front of) a sockbuf.
 */
void
sbdrop(struct socket *so, struct sockbuf *sb, int len)
{
	struct mbuf *m, *mn;
	struct mbuf *next;

	KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
	soassertlocked(so);

	next = (m = sb->sb_mb) ? m->m_nextpkt : 0;
	while (len > 0) {
		if (m == NULL) {
			if (next == NULL)
				panic("sbdrop");
			m = next;
			next = m->m_nextpkt;
			continue;
		}
		if (m->m_len > len) {
			m->m_len -= len;
			m->m_data += len;
			sb->sb_cc -= len;
			if (m->m_type != MT_CONTROL && m->m_type != MT_SONAME)
				sb->sb_datacc -= len;
			break;
		}
		len -= m->m_len;
		sbfree(sb, m);
		mn = m_free(m);
		m = mn;
	}
	while (m && m->m_len == 0) {
		sbfree(sb, m);
		mn = m_free(m);
		m = mn;
	}
	if (m) {
		sb->sb_mb = m;
		m->m_nextpkt = next;
	} else
		sb->sb_mb = next;
	/*
	 * First part is an inline SB_EMPTY_FIXUP().  Second part
	 * makes sure sb_lastrecord is up-to-date if we dropped
	 * part of the last record.
	 */
	m = sb->sb_mb;
	if (m == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (m->m_nextpkt == NULL)
		sb->sb_lastrecord = m;
}
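
/*
 * Worked example (editorial): given a record of three 100-byte mbufs,
 * sbdrop(so, sb, 150) frees the first mbuf outright, then trims the
 * second in place (m_len 100 -> 50, m_data advanced by 50), leaving
 * sb_cc reduced by 150 and the record structure otherwise intact.
 */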

/*
 * Drop a record off the front of a sockbuf
 * and move the next record to the front.
 */
void
sbdroprecord(struct sockbuf *sb)
{
	struct mbuf *m, *mn;

	m = sb->sb_mb;
	if (m) {
		sb->sb_mb = m->m_nextpkt;
		do {
			sbfree(sb, m);
			mn = m_free(m);
		} while ((m = mn) != NULL);
	}
	SB_EMPTY_FIXUP(sb);
}

/*
 * Create a "control" mbuf containing the specified data
 * with the specified type for presentation on a socket buffer.
 */
struct mbuf *
sbcreatecontrol(caddr_t p, int size, int type, int level)
{
	struct cmsghdr *cp;
	struct mbuf *m;

	if (CMSG_SPACE(size) > MCLBYTES) {
		printf("sbcreatecontrol: message too large %d\n", size);
		return NULL;
	}

	if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL)
		return (NULL);
	if (CMSG_SPACE(size) > MLEN) {
		MCLGET(m, M_DONTWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return NULL;
		}
	}
	cp = mtod(m, struct cmsghdr *);
	memset(cp, 0, CMSG_SPACE(size));
	memcpy(CMSG_DATA(cp), p, size);
	m->m_len = CMSG_SPACE(size);
	cp->cmsg_len = CMSG_LEN(size);
	cp->cmsg_level = level;
	cp->cmsg_type = type;
	return (m);
}
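
/*
 * Usage sketch: packaging a received TTL for delivery as ancillary
 * data (the option/level pair is an illustrative assumption).
 *
 *	struct mbuf *cm;
 *	u_int8_t ttl = ip->ip_ttl;
 *
 *	cm = sbcreatecontrol((caddr_t)&ttl, sizeof(ttl),
 *	    IP_RECVTTL, IPPROTO_IP);
 *	if (cm != NULL)
 *		...hand cm to sbappendaddr() or sbappendcontrol()...
 */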