xref: /netbsd-src/sys/dev/kttcp.c (revision aaf4ece63a859a04e37cf3a7229b5fab0157cc06)
1 /*	$NetBSD: kttcp.c,v 1.17 2005/12/11 12:20:53 christos Exp $	*/
2 
3 /*
4  * Copyright (c) 2002 Wasabi Systems, Inc.
5  * All rights reserved.
6  *
7  * Written by Frank van der Linden and Jason R. Thorpe for
8  * Wasabi Systems, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed for the NetBSD Project by
21  *	Wasabi Systems, Inc.
22  * 4. The name of Wasabi Systems, Inc. may not be used to endorse
23  *    or promote products derived from this software without specific prior
24  *    written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
28  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
30  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36  * POSSIBILITY OF SUCH DAMAGE.
37  */
38 
39 /*
40  * kttcp.c --
41  *
42  *	This module provides kernel support for testing network
43  *	throughput from the perspective of the kernel.  It is
44  *	similar in spirit to the classic ttcp network benchmark
45  *	program, the main difference being that with kttcp, the
46  *	kernel is the source and sink of the data.
47  *
48  *	Testing like this is useful for a few reasons:
49  *
50  *	1. This allows us to know what kind of performance we can
51  *	   expect from network applications that run in the kernel
52  *	   space, such as the NFS server or the NFS client.  These
53  *	   applications don't have to move the data to/from userspace,
54  *	   and so benchmark programs which run in userspace don't
55  *	   give us an accurate model.
56  *
57  *	2. Since data received is just thrown away, the receiver
58  *	   is very fast.  This can provide better exercise for the
59  *	   sender at the other end.
60  *
61  *	3. Since the NetBSD kernel currently uses a run-to-completion
62  *	   scheduling model, kttcp provides a benchmark model where
63  *	   preemption of the benchmark program is not an issue.
64  */
65 
66 #include <sys/cdefs.h>
67 __KERNEL_RCSID(0, "$NetBSD: kttcp.c,v 1.17 2005/12/11 12:20:53 christos Exp $");
68 
69 #include <sys/param.h>
70 #include <sys/types.h>
71 #include <sys/ioctl.h>
72 #include <sys/file.h>
73 #include <sys/filedesc.h>
74 #include <sys/conf.h>
75 #include <sys/systm.h>
76 #include <sys/protosw.h>
77 #include <sys/proc.h>
78 #include <sys/resourcevar.h>
79 #include <sys/signal.h>
80 #include <sys/socketvar.h>
81 #include <sys/socket.h>
82 #include <sys/mbuf.h>
83 #include <sys/sa.h>
84 #include <sys/mount.h>
85 #include <sys/syscallargs.h>
86 
87 #include <dev/kttcpio.h>
88 
89 static int kttcp_send(struct lwp *l, struct kttcp_io_args *);
90 static int kttcp_recv(struct lwp *l, struct kttcp_io_args *);
91 static int kttcp_sosend(struct socket *, unsigned long long,
92 			unsigned long long *, struct lwp *, int);
93 static int kttcp_soreceive(struct socket *, unsigned long long,
94 			   unsigned long long *, struct lwp *, int *);
95 
96 void	kttcpattach(int);
97 
98 dev_type_ioctl(kttcpioctl);
99 
/*
 * Character device switch for /dev/kttcp.  All entry points are the
 * standard null/no-op stubs except ioctl, which routes to kttcpioctl();
 * the benchmark is driven entirely through ioctl commands.
 */
const struct cdevsw kttcp_cdevsw = {
	nullopen, nullclose, noread, nowrite, kttcpioctl,
	nostop, notty, nopoll, nommap, nokqfilter,
};
104 
/*
 * Pseudo-device attach hook, called with the configured instance count.
 * kttcp keeps no per-instance state, so there is nothing to initialize.
 */
void
kttcpattach(int count)
{
	/* Nothing to do. */
}
110 
111 int
112 kttcpioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct lwp *l)
113 {
114 	int error;
115 
116 	if ((flag & FWRITE) == 0)
117 		return EPERM;
118 
119 	switch (cmd) {
120 	case KTTCP_IO_SEND:
121 		error = kttcp_send(l, (struct kttcp_io_args *) data);
122 		break;
123 
124 	case KTTCP_IO_RECV:
125 		error = kttcp_recv(l, (struct kttcp_io_args *) data);
126 		break;
127 
128 	default:
129 		return EINVAL;
130 	}
131 
132 	return error;
133 }
134 
135 static int
136 kttcp_send(struct lwp *l, struct kttcp_io_args *kio)
137 {
138 	struct file *fp;
139 	int error;
140 	struct timeval t0, t1;
141 	unsigned long long len, done;
142 
143 	if (kio->kio_totalsize >= KTTCP_MAX_XMIT)
144 		return EINVAL;
145 
146 	fp = fd_getfile(l->l_proc->p_fd, kio->kio_socket);
147 	if (fp == NULL)
148 		return EBADF;
149 	FILE_USE(fp);
150 	if (fp->f_type != DTYPE_SOCKET) {
151 		FILE_UNUSE(fp, l);
152 		return EFTYPE;
153 	}
154 
155 	len = kio->kio_totalsize;
156 	microtime(&t0);
157 	do {
158 		error = kttcp_sosend((struct socket *)fp->f_data, len,
159 		    &done, l, 0);
160 		len -= done;
161 	} while (error == 0 && len > 0);
162 
163 	FILE_UNUSE(fp, l);
164 
165 	microtime(&t1);
166 	if (error != 0)
167 		return error;
168 	timersub(&t1, &t0, &kio->kio_elapsed);
169 
170 	kio->kio_bytesdone = kio->kio_totalsize - len;
171 
172 	return 0;
173 }
174 
175 static int
176 kttcp_recv(struct lwp *l, struct kttcp_io_args *kio)
177 {
178 	struct file *fp;
179 	int error;
180 	struct timeval t0, t1;
181 	unsigned long long len, done;
182 
183 	if (kio->kio_totalsize > KTTCP_MAX_XMIT)
184 		return EINVAL;
185 
186 	fp = fd_getfile(l->l_proc->p_fd, kio->kio_socket);
187 	if (fp == NULL)
188 		return EBADF;
189 	FILE_USE(fp);
190 	if (fp->f_type != DTYPE_SOCKET) {
191 		FILE_UNUSE(fp, l);
192 		return EBADF;
193 	}
194 	len = kio->kio_totalsize;
195 	microtime(&t0);
196 	do {
197 		error = kttcp_soreceive((struct socket *)fp->f_data,
198 		    len, &done, l, NULL);
199 		len -= done;
200 	} while (error == 0 && len > 0 && done > 0);
201 
202 	FILE_UNUSE(fp, l);
203 
204 	microtime(&t1);
205 	if (error == EPIPE)
206 		error = 0;
207 	if (error != 0)
208 		return error;
209 	timersub(&t1, &t0, &kio->kio_elapsed);
210 
211 	kio->kio_bytesdone = kio->kio_totalsize - len;
212 
213 	return 0;
214 }
215 
216 #define SBLOCKWAIT(f)   (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
217 
218 /*
219  * Slightly changed version of sosend()
220  */
221 static int
222 kttcp_sosend(struct socket *so, unsigned long long slen,
223 	     unsigned long long *done, struct lwp *l, int flags)
224 {
225 	struct mbuf **mp, *m, *top;
226 	long space, len, mlen;
227 	int error, s, dontroute, atomic;
228 	long long resid;
229 
230 	atomic = sosendallatonce(so);
231 	resid = slen;
232 	top = NULL;
233 	/*
234 	 * In theory resid should be unsigned.
235 	 * However, space must be signed, as it might be less than 0
236 	 * if we over-committed, and we must use a signed comparison
237 	 * of space and resid.  On the other hand, a negative resid
238 	 * causes us to loop sending 0-length segments to the protocol.
239 	 */
240 	if (resid < 0) {
241 		error = EINVAL;
242 		goto out;
243 	}
244 	dontroute =
245 	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
246 	    (so->so_proto->pr_flags & PR_ATOMIC);
247 	/* WRS XXX - are we doing per-lwp or per-proc stats? */
248 	l->l_proc->p_stats->p_ru.ru_msgsnd++;
249 #define	snderr(errno)	{ error = errno; splx(s); goto release; }
250 
251  restart:
252 	if ((error = sblock(&so->so_snd, SBLOCKWAIT(flags))) != 0)
253 		goto out;
254 	do {
255 		s = splsoftnet();
256 		if (so->so_state & SS_CANTSENDMORE)
257 			snderr(EPIPE);
258 		if (so->so_error) {
259 			error = so->so_error;
260 			so->so_error = 0;
261 			splx(s);
262 			goto release;
263 		}
264 		if ((so->so_state & SS_ISCONNECTED) == 0) {
265 			if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
266 				if ((so->so_state & SS_ISCONFIRMING) == 0)
267 					snderr(ENOTCONN);
268 			} else
269 				snderr(EDESTADDRREQ);
270 		}
271 		space = sbspace(&so->so_snd);
272 		if (flags & MSG_OOB)
273 			space += 1024;
274 		if ((atomic && resid > so->so_snd.sb_hiwat))
275 			snderr(EMSGSIZE);
276 		if (space < resid && (atomic || space < so->so_snd.sb_lowat)) {
277 			if (so->so_state & SS_NBIO)
278 				snderr(EWOULDBLOCK);
279 			SBLASTRECORDCHK(&so->so_rcv,
280 			    "kttcp_soreceive sbwait 1");
281 			SBLASTMBUFCHK(&so->so_rcv,
282 			    "kttcp_soreceive sbwait 1");
283 			sbunlock(&so->so_snd);
284 			error = sbwait(&so->so_snd);
285 			splx(s);
286 			if (error)
287 				goto out;
288 			goto restart;
289 		}
290 		splx(s);
291 		mp = &top;
292 		do {
293 			do {
294 				if (top == 0) {
295 					m = m_gethdr(M_WAIT, MT_DATA);
296 					mlen = MHLEN;
297 					m->m_pkthdr.len = 0;
298 					m->m_pkthdr.rcvif = NULL;
299 				} else {
300 					m = m_get(M_WAIT, MT_DATA);
301 					mlen = MLEN;
302 				}
303 				if (resid >= MINCLSIZE && space >= MCLBYTES) {
304 					m_clget(m, M_WAIT);
305 					if ((m->m_flags & M_EXT) == 0)
306 						goto nopages;
307 					mlen = MCLBYTES;
308 #ifdef	MAPPED_MBUFS
309 					len = lmin(MCLBYTES, resid);
310 #else
311 					if (atomic && top == 0) {
312 						len = lmin(MCLBYTES - max_hdr,
313 						    resid);
314 						m->m_data += max_hdr;
315 					} else
316 						len = lmin(MCLBYTES, resid);
317 #endif
318 					space -= len;
319 				} else {
320 nopages:
321 					len = lmin(lmin(mlen, resid), space);
322 					space -= len;
323 					/*
324 					 * For datagram protocols, leave room
325 					 * for protocol headers in first mbuf.
326 					 */
327 					if (atomic && top == 0 && len < mlen)
328 						MH_ALIGN(m, len);
329 				}
330 				resid -= len;
331 				m->m_len = len;
332 				*mp = m;
333 				top->m_pkthdr.len += len;
334 				if (error)
335 					goto release;
336 				mp = &m->m_next;
337 				if (resid <= 0) {
338 					if (flags & MSG_EOR)
339 						top->m_flags |= M_EOR;
340 					break;
341 				}
342 			} while (space > 0 && atomic);
343 
344 			s = splsoftnet();
345 
346 			if (so->so_state & SS_CANTSENDMORE)
347 				snderr(EPIPE);
348 
349 			if (dontroute)
350 				so->so_options |= SO_DONTROUTE;
351 			if (resid > 0)
352 				so->so_state |= SS_MORETOCOME;
353 			error = (*so->so_proto->pr_usrreq)(so,
354 			    (flags & MSG_OOB) ? PRU_SENDOOB : PRU_SEND,
355 			    top, NULL, NULL, l);
356 			if (dontroute)
357 				so->so_options &= ~SO_DONTROUTE;
358 			if (resid > 0)
359 				so->so_state &= ~SS_MORETOCOME;
360 			splx(s);
361 
362 			top = 0;
363 			mp = &top;
364 			if (error)
365 				goto release;
366 		} while (resid && space > 0);
367 	} while (resid);
368 
369  release:
370 	sbunlock(&so->so_snd);
371  out:
372 	if (top)
373 		m_freem(top);
374 	*done = slen - resid;
375 #if 0
376 	printf("sosend: error %d slen %llu resid %lld\n", error, slen, resid);
377 #endif
378 	return (error);
379 }
380 
/*
 * Slightly changed version of soreceive(): receive up to slen bytes
 * from the socket and throw them away (mp stays NULL, so data mbufs are
 * freed rather than copied anywhere).
 *
 * On return, *done is the number of bytes consumed; *flagsp, if given,
 * accumulates MSG_* result flags.  Note kttcp_recv() always passes
 * flagsp == NULL, so flags is 0 here (no MSG_OOB / MSG_PEEK /
 * MSG_WAITALL paths are taken in practice).
 */
static int
kttcp_soreceive(struct socket *so, unsigned long long slen,
		unsigned long long *done, struct lwp *l, int *flagsp)
{
	struct mbuf *m, **mp;
	int flags, len, error, s, offset, moff, type;
	long long orig_resid, resid;
	const struct protosw *pr;
	struct mbuf *nextrecord;

	pr = so->so_proto;
	mp = NULL;		/* never collect mbufs: data is discarded */
	type = 0;
	resid = orig_resid = slen;
	if (flagsp)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB) {
		/* Out-of-band data: fetch via PRU_RCVOOB and discard. */
		m = m_get(M_WAIT, MT_DATA);
		error = (*pr->pr_usrreq)(so, PRU_RCVOOB, m,
		    (struct mbuf *)(long)(flags & MSG_PEEK), NULL, NULL);
		if (error)
			goto bad;
		do {
			resid -= min(resid, m->m_len);
			m = m_free(m);
		} while (resid && error == 0 && m);
 bad:
		if (m)
			m_freem(m);
		return (error);
	}
	if (mp)
		*mp = NULL;
	if (so->so_state & SS_ISCONFIRMING && resid)
		(*pr->pr_usrreq)(so, PRU_RCVD, NULL, NULL, NULL, NULL);

 restart:
	/* Serialize receivers on the receive buffer. */
	if ((error = sblock(&so->so_rcv, SBLOCKWAIT(flags))) != 0)
		return (error);
	s = splsoftnet();

	m = so->so_rcv.sb_mb;
	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark,
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat), or
	 *   3. MSG_DONTWAIT is not set.
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
	 */
	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < resid) &&
	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	    ((flags & MSG_WAITALL) && resid <= so->so_rcv.sb_hiwat)) &&
	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
#ifdef DIAGNOSTIC
		if (m == NULL && so->so_rcv.sb_cc)
			panic("receive 1");
#endif
		if (so->so_error) {
			/* Deliver queued data before reporting the error. */
			if (m)
				goto dontblock;
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
			if (m)
				goto dontblock;
			else
				goto release;
		}
		/* OOB data or a record end forces immediate processing. */
		for (; m; m = m->m_next)
			if (m->m_type == MT_OOBDATA  || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (resid == 0)
			goto release;
		if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT)) {
			error = EWOULDBLOCK;
			goto release;
		}
		/* Wait for data, then start over. */
		sbunlock(&so->so_rcv);
		error = sbwait(&so->so_rcv);
		splx(s);
		if (error)
			return (error);
		goto restart;
	}
 dontblock:
	/*
	 * On entry here, m points to the first record of the socket buffer.
	 * While we process the initial mbufs containing address and control
	 * info, we save a copy of m->m_nextpkt into nextrecord.
	 */
#ifdef notyet /* XXXX */
	if (uio->uio_lwp)
		uio->uio_lwp->l_proc->p_stats->p_ru.ru_msgrcv++;
#endif
	KASSERT(m == so->so_rcv.sb_mb);
	SBLASTRECORDCHK(&so->so_rcv, "kttcp_soreceive 1");
	SBLASTMBUFCHK(&so->so_rcv, "kttcp_soreceive 1");
	nextrecord = m->m_nextpkt;
	if (pr->pr_flags & PR_ADDR) {
		/* Skip (or free) the leading source-address mbuf. */
#ifdef DIAGNOSTIC
		if (m->m_type != MT_SONAME)
			panic("receive 1a");
#endif
		orig_resid = 0;
		if (flags & MSG_PEEK) {
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			MFREE(m, so->so_rcv.sb_mb);
			m = so->so_rcv.sb_mb;
		}
	}
	/* Skip (or free) any control mbufs; kttcp has no use for them. */
	while (m && m->m_type == MT_CONTROL && error == 0) {
		if (flags & MSG_PEEK) {
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			MFREE(m, so->so_rcv.sb_mb);
			m = so->so_rcv.sb_mb;
		}
	}

	/*
	 * If m is non-NULL, we have some data to read.  From now on,
	 * make sure to keep sb_lastrecord consistent when working on
	 * the last packet on the chain (nextrecord == NULL) and we
	 * change m->m_nextpkt.
	 */
	if (m) {
		if ((flags & MSG_PEEK) == 0) {
			m->m_nextpkt = nextrecord;
			/*
			 * If nextrecord == NULL (this is a single chain),
			 * then sb_lastrecord may not be valid here if m
			 * was changed earlier.
			 */
			if (nextrecord == NULL) {
				KASSERT(so->so_rcv.sb_mb == m);
				so->so_rcv.sb_lastrecord = m;
			}
		}
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
	} else {
		if ((flags & MSG_PEEK) == 0) {
			KASSERT(so->so_rcv.sb_mb == m);
			so->so_rcv.sb_mb = nextrecord;
			SB_EMPTY_FIXUP(&so->so_rcv);
		}
	}
	SBLASTRECORDCHK(&so->so_rcv, "kttcp_soreceive 2");
	SBLASTMBUFCHK(&so->so_rcv, "kttcp_soreceive 2");

	moff = 0;
	offset = 0;
	/* Main consumption loop: walk data mbufs, accounting and freeing. */
	while (m && resid > 0 && error == 0) {
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
				break;
		} else if (type == MT_OOBDATA)
			break;
#ifdef DIAGNOSTIC
		else if (m->m_type != MT_DATA && m->m_type != MT_HEADER)
			panic("receive 3");
#endif
		so->so_state &= ~SS_RCVATMARK;
		len = resid;
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * Sockbuf must be consistent here (points to current mbuf,
		 * it points to next record) when we drop priority;
		 * we must note any additions to the sockbuf when we
		 * block interrupts again.
		 *
		 * (In kttcp mp is always NULL and there is no uio: the
		 * data is simply freed below.)
		 */
		resid -= len;
		if (len == m->m_len - moff) {
			/* Consumed the whole mbuf. */
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(&so->so_rcv, m);
				if (mp) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = NULL;
				} else {
					MFREE(m, so->so_rcv.sb_mb);
					m = so->so_rcv.sb_mb;
				}
				/*
				 * If m != NULL, we also know that
				 * so->so_rcv.sb_mb != NULL.
				 */
				KASSERT(so->so_rcv.sb_mb == m);
				if (m) {
					m->m_nextpkt = nextrecord;
					if (nextrecord == NULL)
						so->so_rcv.sb_lastrecord = m;
				} else {
					so->so_rcv.sb_mb = nextrecord;
					SB_EMPTY_FIXUP(&so->so_rcv);
				}
				SBLASTRECORDCHK(&so->so_rcv,
				    "kttcp_soreceive 3");
				SBLASTMBUFCHK(&so->so_rcv,
				    "kttcp_soreceive 3");
			}
		} else {
			/* Partial mbuf: trim the consumed prefix in place. */
			if (flags & MSG_PEEK)
				moff += len;
			else {
				if (mp)
					*mp = m_copym(m, 0, len, M_WAIT);
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
			}
		}
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_state |= SS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket),
		 * we must not quit until "uio->uio_resid == 0" or an error
		 * termination.  If a signal/timeout occurs, return
		 * with a short count but without error.
		 * Keep sockbuf locked against other readers.
		 *
		 * (Unreachable in kttcp: kttcp_recv() passes flagsp == NULL,
		 * so MSG_WAITALL is never set.)
		 */
		while (flags & MSG_WAITALL && m == NULL && resid > 0 &&
		    !sosendallatonce(so) && !nextrecord) {
			if (so->so_error || so->so_state & SS_CANTRCVMORE)
				break;
			/*
			 * If we are peeking and the socket receive buffer is
			 * full, stop since we can't get more data to peek at.
			 */
			if ((flags & MSG_PEEK) && sbspace(&so->so_rcv) <= 0)
				break;
			/*
			 * If we've drained the socket buffer, tell the
			 * protocol in case it needs to do something to
			 * get it filled again.
			 */
			if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb)
				(*pr->pr_usrreq)(so, PRU_RCVD, NULL,
				    (struct mbuf *)(long)flags, NULL, NULL);
			SBLASTRECORDCHK(&so->so_rcv,
			    "kttcp_soreceive sbwait 2");
			SBLASTMBUFCHK(&so->so_rcv,
			    "kttcp_soreceive sbwait 2");
			error = sbwait(&so->so_rcv);
			if (error) {
				/*
				 * NOTE(review): returns 0 per the "short
				 * count but without error" contract above,
				 * but does so without updating *done —
				 * harmless while this path is unreachable,
				 * but confirm before passing flagsp with
				 * MSG_WAITALL.
				 */
				sbunlock(&so->so_rcv);
				splx(s);
				return (0);
			}
			if ((m = so->so_rcv.sb_mb) != NULL)
				nextrecord = m->m_nextpkt;
		}
	}

	/* Atomic protocols: drop any unread remainder of the record. */
	if (m && pr->pr_flags & PR_ATOMIC) {
		flags |= MSG_TRUNC;
		if ((flags & MSG_PEEK) == 0)
			(void) sbdroprecord(&so->so_rcv);
	}
	if ((flags & MSG_PEEK) == 0) {
		if (m == NULL) {
			/*
			 * First part is an SB_EMPTY_FIXUP().  Second part
			 * makes sure sb_lastrecord is up-to-date if
			 * there is still data in the socket buffer.
			 */
			so->so_rcv.sb_mb = nextrecord;
			if (so->so_rcv.sb_mb == NULL) {
				so->so_rcv.sb_mbtail = NULL;
				so->so_rcv.sb_lastrecord = NULL;
			} else if (nextrecord->m_nextpkt == NULL)
				so->so_rcv.sb_lastrecord = nextrecord;
		}
		SBLASTRECORDCHK(&so->so_rcv, "kttcp_soreceive 4");
		SBLASTMBUFCHK(&so->so_rcv, "kttcp_soreceive 4");
		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
			(*pr->pr_usrreq)(so, PRU_RCVD, NULL,
			    (struct mbuf *)(long)flags, NULL, NULL);
	}
	/* Nothing consumed and no terminating condition: try again. */
	if (orig_resid == resid && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
		sbunlock(&so->so_rcv);
		splx(s);
		goto restart;
	}

	if (flagsp)
		*flagsp |= flags;
 release:
	sbunlock(&so->so_rcv);
	splx(s);
	*done = slen - resid;
#if 0
	printf("soreceive: error %d slen %llu resid %lld\n", error, slen, resid);
#endif
	return (error);
}
724