/*	$NetBSD: kttcp.c,v 1.20 2006/10/12 01:30:51 christos Exp $	*/

/*
 * Copyright (c) 2002 Wasabi Systems, Inc.
 * All rights reserved.
 *
 * Written by Frank van der Linden and Jason R. Thorpe for
 * Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed for the NetBSD Project by
 *	Wasabi Systems, Inc.
 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
 *    or promote products derived from this software without specific prior
 *    written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * kttcp.c --
 *
 *	This module provides kernel support for testing network
 *	throughput from the perspective of the kernel.  It is
 *	similar in spirit to the classic ttcp network benchmark
 *	program, the main difference being that with kttcp, the
 *	kernel is the source and sink of the data.
 *
 *	Testing like this is useful for a few reasons:
 *
 *	1. This allows us to know what kind of performance we can
 *	   expect from network applications that run in the kernel
 *	   space, such as the NFS server or the NFS client.  These
 *	   applications don't have to move the data to/from userspace,
 *	   and so benchmark programs which run in userspace don't
 *	   give us an accurate model.
 *
 *	2. Since data received is just thrown away, the receiver
 *	   is very fast.  This can provide better exercise for the
 *	   sender at the other end.
 *
 *	3. Since the NetBSD kernel currently uses a run-to-completion
 *	   scheduling model, kttcp provides a benchmark model where
 *	   preemption of the benchmark program is not an issue.
 */

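/*
 * Illustrative userland usage (a sketch only, not part of the driver):
 * the caller creates and connects the socket itself and then hands the
 * descriptor to the kernel through an ioctl on the kttcp device.  The
 * device node name "/dev/kttcp" and the transfer size below are
 * assumptions made for the example.
 *
 *	#include <sys/types.h>
 *	#include <sys/ioctl.h>
 *	#include <sys/socket.h>
 *	#include <dev/kttcpio.h>
 *	#include <err.h>
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *
 *	struct kttcp_io_args kio;
 *	int kfd, s;
 *
 *	kfd = open("/dev/kttcp", O_RDWR);	// must be open for writing
 *	if (kfd == -1)
 *		err(1, "open");
 *	s = socket(AF_INET, SOCK_STREAM, 0);
 *	// ... connect s to the peer running the receiving side ...
 *
 *	memset(&kio, 0, sizeof(kio));
 *	kio.kio_socket = s;			// connected socket descriptor
 *	kio.kio_totalsize = 64ULL << 20;	// bytes to transfer
 *	if (ioctl(kfd, KTTCP_IO_SEND, &kio) == -1)	// or KTTCP_IO_RECV
 *		err(1, "KTTCP_IO_SEND");
 *	printf("%llu bytes in %lld.%06ld seconds\n",
 *	    (unsigned long long)kio.kio_bytesdone,
 *	    (long long)kio.kio_elapsed.tv_sec,
 *	    (long)kio.kio_elapsed.tv_usec);
 */
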
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kttcp.c,v 1.20 2006/10/12 01:30:51 christos Exp $");

#include <sys/param.h>
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/conf.h>
#include <sys/systm.h>
#include <sys/protosw.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/signal.h>
#include <sys/socketvar.h>
#include <sys/socket.h>
#include <sys/mbuf.h>
#include <sys/sa.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>

#include <dev/kttcpio.h>

static int kttcp_send(struct lwp *l, struct kttcp_io_args *);
static int kttcp_recv(struct lwp *l, struct kttcp_io_args *);
static int kttcp_sosend(struct socket *, unsigned long long,
			unsigned long long *, struct lwp *, int);
static int kttcp_soreceive(struct socket *, unsigned long long,
			   unsigned long long *, struct lwp *, int *);

void	kttcpattach(int);

dev_type_ioctl(kttcpioctl);

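/*
 * The device is driven entirely through ioctl; open and close are
 * no-ops and read/write/poll/mmap are not supported.
 */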
const struct cdevsw kttcp_cdevsw = {
	nullopen, nullclose, noread, nowrite, kttcpioctl,
	nostop, notty, nopoll, nommap, nokqfilter, D_OTHER
};

void
kttcpattach(int count __unused)
{
	/* Do nothing. */
}

int
kttcpioctl(dev_t dev __unused, u_long cmd, caddr_t data, int flag,
    struct lwp *l)
{
	int error;

	if ((flag & FWRITE) == 0)
		return EPERM;

	switch (cmd) {
	case KTTCP_IO_SEND:
		error = kttcp_send(l, (struct kttcp_io_args *) data);
		break;

	case KTTCP_IO_RECV:
		error = kttcp_recv(l, (struct kttcp_io_args *) data);
		break;

	default:
		return EINVAL;
	}

	return error;
}

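/*
 * KTTCP_IO_SEND handler: look up the socket descriptor passed in by
 * the user, then call kttcp_sosend() repeatedly until kio_totalsize
 * bytes have been sent or an error occurs, timing the transfer with
 * microtime().
 */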
static int
kttcp_send(struct lwp *l, struct kttcp_io_args *kio)
{
	struct file *fp;
	int error;
	struct timeval t0, t1;
	unsigned long long len, done;

	if (kio->kio_totalsize >= KTTCP_MAX_XMIT)
		return EINVAL;

	fp = fd_getfile(l->l_proc->p_fd, kio->kio_socket);
	if (fp == NULL)
		return EBADF;
	FILE_USE(fp);
	if (fp->f_type != DTYPE_SOCKET) {
		FILE_UNUSE(fp, l);
		return EFTYPE;
	}

	len = kio->kio_totalsize;
	microtime(&t0);
	do {
		error = kttcp_sosend((struct socket *)fp->f_data, len,
		    &done, l, 0);
		len -= done;
	} while (error == 0 && len > 0);

	FILE_UNUSE(fp, l);

	microtime(&t1);
	if (error != 0)
		return error;
	timersub(&t1, &t0, &kio->kio_elapsed);

	kio->kio_bytesdone = kio->kio_totalsize - len;

	return 0;
}

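/*
 * KTTCP_IO_RECV handler: as above, but sink data with kttcp_soreceive()
 * until kio_totalsize bytes have arrived, the connection is closed
 * (done == 0, or EPIPE, which is treated as normal termination), or an
 * error occurs.
 */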
static int
kttcp_recv(struct lwp *l, struct kttcp_io_args *kio)
{
	struct file *fp;
	int error;
	struct timeval t0, t1;
	unsigned long long len, done;

	done = 0;	/* XXX gcc */

	if (kio->kio_totalsize > KTTCP_MAX_XMIT)
		return EINVAL;

	fp = fd_getfile(l->l_proc->p_fd, kio->kio_socket);
	if (fp == NULL)
		return EBADF;
	FILE_USE(fp);
	if (fp->f_type != DTYPE_SOCKET) {
		FILE_UNUSE(fp, l);
		return EBADF;
	}
	len = kio->kio_totalsize;
	microtime(&t0);
	do {
		error = kttcp_soreceive((struct socket *)fp->f_data,
		    len, &done, l, NULL);
		len -= done;
	} while (error == 0 && len > 0 && done > 0);

	FILE_UNUSE(fp, l);

	microtime(&t1);
	if (error == EPIPE)
		error = 0;
	if (error != 0)
		return error;
	timersub(&t1, &t0, &kio->kio_elapsed);

	kio->kio_bytesdone = kio->kio_totalsize - len;

	return 0;
}

#define SBLOCKWAIT(f)   (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)

/*
 * Slightly changed version of sosend(): no data is copied in from
 * userspace; mbufs are allocated and handed to the protocol with
 * whatever contents they happen to have.
 */
static int
kttcp_sosend(struct socket *so, unsigned long long slen,
	     unsigned long long *done, struct lwp *l, int flags)
{
	struct mbuf **mp, *m, *top;
	long space, len, mlen;
	int error, s, dontroute, atomic;
	long long resid;

	atomic = sosendallatonce(so);
	resid = slen;
	top = NULL;
	/*
	 * In theory resid should be unsigned.
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid.  On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
	 */
	if (resid < 0) {
		error = EINVAL;
		goto out;
	}
	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	/* WRS XXX - are we doing per-lwp or per-proc stats? */
	l->l_proc->p_stats->p_ru.ru_msgsnd++;
#define	snderr(errno)	{ error = errno; splx(s); goto release; }

 restart:
	if ((error = sblock(&so->so_snd, SBLOCKWAIT(flags))) != 0)
		goto out;
	do {
		s = splsoftnet();
		if (so->so_state & SS_CANTSENDMORE)
			snderr(EPIPE);
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			splx(s);
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
				if ((so->so_state & SS_ISCONFIRMING) == 0)
					snderr(ENOTCONN);
			} else
				snderr(EDESTADDRREQ);
		}
		space = sbspace(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if ((atomic && resid > so->so_snd.sb_hiwat))
			snderr(EMSGSIZE);
		if (space < resid && (atomic || space < so->so_snd.sb_lowat)) {
			if (so->so_state & SS_NBIO)
				snderr(EWOULDBLOCK);
			/* Send path: check the send buffer, not so_rcv. */
			SBLASTRECORDCHK(&so->so_snd,
			    "kttcp_sosend sbwait 1");
			SBLASTMBUFCHK(&so->so_snd,
			    "kttcp_sosend sbwait 1");
			sbunlock(&so->so_snd);
			error = sbwait(&so->so_snd);
			splx(s);
			if (error)
				goto out;
			goto restart;
		}
		splx(s);
		mp = &top;
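		/*
		 * Build a chain of mbufs to hand to the protocol.  Unlike
		 * sosend(), no data is copied in from userspace; the mbufs
		 * are sent with whatever they happen to contain.
		 */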
		do {
			do {
				if (top == 0) {
					m = m_gethdr(M_WAIT, MT_DATA);
					mlen = MHLEN;
					m->m_pkthdr.len = 0;
					m->m_pkthdr.rcvif = NULL;
				} else {
					m = m_get(M_WAIT, MT_DATA);
					mlen = MLEN;
				}
				if (resid >= MINCLSIZE && space >= MCLBYTES) {
					m_clget(m, M_WAIT);
					if ((m->m_flags & M_EXT) == 0)
						goto nopages;
					mlen = MCLBYTES;
#ifdef	MAPPED_MBUFS
					len = lmin(MCLBYTES, resid);
#else
					if (atomic && top == 0) {
						len = lmin(MCLBYTES - max_hdr,
						    resid);
						m->m_data += max_hdr;
					} else
						len = lmin(MCLBYTES, resid);
#endif
					space -= len;
				} else {
nopages:
					len = lmin(lmin(mlen, resid), space);
					space -= len;
					/*
					 * For datagram protocols, leave room
					 * for protocol headers in first mbuf.
					 */
					if (atomic && top == 0 && len < mlen)
						MH_ALIGN(m, len);
				}
				resid -= len;
				m->m_len = len;
				*mp = m;
				top->m_pkthdr.len += len;
				if (error)
					goto release;
				mp = &m->m_next;
				if (resid <= 0) {
					if (flags & MSG_EOR)
						top->m_flags |= M_EOR;
					break;
				}
			} while (space > 0 && atomic);

			s = splsoftnet();

			if (so->so_state & SS_CANTSENDMORE)
				snderr(EPIPE);

			if (dontroute)
				so->so_options |= SO_DONTROUTE;
			if (resid > 0)
				so->so_state |= SS_MORETOCOME;
			error = (*so->so_proto->pr_usrreq)(so,
			    (flags & MSG_OOB) ? PRU_SENDOOB : PRU_SEND,
			    top, NULL, NULL, l);
			if (dontroute)
				so->so_options &= ~SO_DONTROUTE;
			if (resid > 0)
				so->so_state &= ~SS_MORETOCOME;
			splx(s);

			top = 0;
			mp = &top;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

 release:
	sbunlock(&so->so_snd);
 out:
	if (top)
		m_freem(top);
	*done = slen - resid;
#if 0
	printf("sosend: error %d slen %llu resid %lld\n", error, slen, resid);
#endif
	return (error);
}

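/*
 * Slightly changed version of soreceive(): incoming data is counted
 * and discarded rather than copied out to a uio.
 */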
static int
kttcp_soreceive(struct socket *so, unsigned long long slen,
    unsigned long long *done, struct lwp *l __unused, int *flagsp)
{
	struct mbuf *m, **mp;
	int flags, len, error, s, offset, moff, type;
	long long orig_resid, resid;
	const struct protosw *pr;
	struct mbuf *nextrecord;

	pr = so->so_proto;
	mp = NULL;
	type = 0;
	resid = orig_resid = slen;
	if (flagsp)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB) {
		m = m_get(M_WAIT, MT_DATA);
		error = (*pr->pr_usrreq)(so, PRU_RCVOOB, m,
		    (struct mbuf *)(long)(flags & MSG_PEEK), NULL, NULL);
		if (error)
			goto bad;
		do {
			resid -= min(resid, m->m_len);
			m = m_free(m);
		} while (resid && error == 0 && m);
 bad:
		if (m)
			m_freem(m);
		return (error);
	}
	if (mp)
		*mp = NULL;
	if (so->so_state & SS_ISCONFIRMING && resid)
		(*pr->pr_usrreq)(so, PRU_RCVD, NULL, NULL, NULL, NULL);

 restart:
	if ((error = sblock(&so->so_rcv, SBLOCKWAIT(flags))) != 0)
		return (error);
	s = splsoftnet();

	m = so->so_rcv.sb_mb;
	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark,
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat), or
	 *   3. MSG_DONTWAIT is not set.
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
	 */
	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < resid) &&
	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	    ((flags & MSG_WAITALL) && resid <= so->so_rcv.sb_hiwat)) &&
	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
#ifdef DIAGNOSTIC
		if (m == NULL && so->so_rcv.sb_cc)
			panic("receive 1");
#endif
		if (so->so_error) {
			if (m)
				goto dontblock;
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
			if (m)
				goto dontblock;
			else
				goto release;
		}
		for (; m; m = m->m_next)
			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (resid == 0)
			goto release;
		if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT)) {
			error = EWOULDBLOCK;
			goto release;
		}
		sbunlock(&so->so_rcv);
		error = sbwait(&so->so_rcv);
		splx(s);
		if (error)
			return (error);
		goto restart;
	}
 dontblock:
	/*
	 * On entry here, m points to the first record of the socket buffer.
	 * While we process the initial mbufs containing address and control
	 * info, we save a copy of m->m_nextpkt into nextrecord.
	 */
#ifdef notyet /* XXXX */
	if (uio->uio_lwp)
		uio->uio_lwp->l_proc->p_stats->p_ru.ru_msgrcv++;
#endif
	KASSERT(m == so->so_rcv.sb_mb);
	SBLASTRECORDCHK(&so->so_rcv, "kttcp_soreceive 1");
	SBLASTMBUFCHK(&so->so_rcv, "kttcp_soreceive 1");
	nextrecord = m->m_nextpkt;
	if (pr->pr_flags & PR_ADDR) {
#ifdef DIAGNOSTIC
		if (m->m_type != MT_SONAME)
			panic("receive 1a");
#endif
		orig_resid = 0;
		if (flags & MSG_PEEK) {
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			MFREE(m, so->so_rcv.sb_mb);
			m = so->so_rcv.sb_mb;
		}
	}
	while (m && m->m_type == MT_CONTROL && error == 0) {
		if (flags & MSG_PEEK) {
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			MFREE(m, so->so_rcv.sb_mb);
			m = so->so_rcv.sb_mb;
		}
	}

	/*
	 * If m is non-NULL, we have some data to read.  From now on,
	 * make sure to keep sb_lastrecord consistent when working on
	 * the last packet on the chain (nextrecord == NULL) and we
	 * change m->m_nextpkt.
	 */
	if (m) {
		if ((flags & MSG_PEEK) == 0) {
			m->m_nextpkt = nextrecord;
			/*
			 * If nextrecord == NULL (this is a single chain),
			 * then sb_lastrecord may not be valid here if m
			 * was changed earlier.
			 */
			if (nextrecord == NULL) {
				KASSERT(so->so_rcv.sb_mb == m);
				so->so_rcv.sb_lastrecord = m;
			}
		}
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
	} else {
		if ((flags & MSG_PEEK) == 0) {
			KASSERT(so->so_rcv.sb_mb == m);
			so->so_rcv.sb_mb = nextrecord;
			SB_EMPTY_FIXUP(&so->so_rcv);
		}
	}
	SBLASTRECORDCHK(&so->so_rcv, "kttcp_soreceive 2");
	SBLASTMBUFCHK(&so->so_rcv, "kttcp_soreceive 2");

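	/*
	 * Main receive loop: walk the current record, counting the data
	 * and (when not peeking) freeing it as we go, waiting for the
	 * protocol to refill the buffer when MSG_WAITALL asks for it.
	 */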
	moff = 0;
	offset = 0;
	while (m && resid > 0 && error == 0) {
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
				break;
		} else if (type == MT_OOBDATA)
			break;
#ifdef DIAGNOSTIC
		else if (m->m_type != MT_DATA && m->m_type != MT_HEADER)
			panic("receive 3");
#endif
		so->so_state &= ~SS_RCVATMARK;
		len = resid;
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * In the stock soreceive() the mbufs would either be passed
		 * back via *mp or copied out through a uio; here the data is
		 * simply accounted for and freed.  The sockbuf must be
		 * consistent (sb_mb points to the current mbuf, which points
		 * to the next record) when we drop priority; we must note
		 * any additions to the sockbuf when we block interrupts
		 * again.
		 */
		resid -= len;
		if (len == m->m_len - moff) {
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(&so->so_rcv, m);
				if (mp) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = NULL;
				} else {
					MFREE(m, so->so_rcv.sb_mb);
					m = so->so_rcv.sb_mb;
				}
				/*
				 * If m != NULL, we also know that
				 * so->so_rcv.sb_mb != NULL.
				 */
				KASSERT(so->so_rcv.sb_mb == m);
				if (m) {
					m->m_nextpkt = nextrecord;
					if (nextrecord == NULL)
						so->so_rcv.sb_lastrecord = m;
				} else {
					so->so_rcv.sb_mb = nextrecord;
					SB_EMPTY_FIXUP(&so->so_rcv);
				}
				SBLASTRECORDCHK(&so->so_rcv,
				    "kttcp_soreceive 3");
				SBLASTMBUFCHK(&so->so_rcv,
				    "kttcp_soreceive 3");
			}
		} else {
			if (flags & MSG_PEEK)
				moff += len;
			else {
				if (mp)
					*mp = m_copym(m, 0, len, M_WAIT);
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
			}
		}
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_state |= SS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for a non-atomic socket),
		 * we must not quit until resid == 0 or an error terminates
		 * the transfer.  If a signal/timeout occurs, return with a
		 * short count but without error.
		 * Keep the sockbuf locked against other readers.
		 */
		while (flags & MSG_WAITALL && m == NULL && resid > 0 &&
		    !sosendallatonce(so) && !nextrecord) {
			if (so->so_error || so->so_state & SS_CANTRCVMORE)
				break;
			/*
			 * If we are peeking and the socket receive buffer is
			 * full, stop since we can't get more data to peek at.
			 */
			if ((flags & MSG_PEEK) && sbspace(&so->so_rcv) <= 0)
				break;
			/*
			 * If we've drained the socket buffer, tell the
			 * protocol in case it needs to do something to
			 * get it filled again.
			 */
			if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb)
				(*pr->pr_usrreq)(so, PRU_RCVD, NULL,
				    (struct mbuf *)(long)flags, NULL, NULL);
			SBLASTRECORDCHK(&so->so_rcv,
			    "kttcp_soreceive sbwait 2");
			SBLASTMBUFCHK(&so->so_rcv,
			    "kttcp_soreceive sbwait 2");
			error = sbwait(&so->so_rcv);
			if (error) {
				sbunlock(&so->so_rcv);
				splx(s);
				return (0);
			}
			if ((m = so->so_rcv.sb_mb) != NULL)
				nextrecord = m->m_nextpkt;
		}
	}

	if (m && pr->pr_flags & PR_ATOMIC) {
		flags |= MSG_TRUNC;
		if ((flags & MSG_PEEK) == 0)
			(void) sbdroprecord(&so->so_rcv);
	}
	if ((flags & MSG_PEEK) == 0) {
		if (m == NULL) {
			/*
			 * First part is an SB_EMPTY_FIXUP().  Second part
			 * makes sure sb_lastrecord is up-to-date if
			 * there is still data in the socket buffer.
			 */
			so->so_rcv.sb_mb = nextrecord;
			if (so->so_rcv.sb_mb == NULL) {
				so->so_rcv.sb_mbtail = NULL;
				so->so_rcv.sb_lastrecord = NULL;
			} else if (nextrecord->m_nextpkt == NULL)
				so->so_rcv.sb_lastrecord = nextrecord;
		}
		SBLASTRECORDCHK(&so->so_rcv, "kttcp_soreceive 4");
		SBLASTMBUFCHK(&so->so_rcv, "kttcp_soreceive 4");
		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
			(*pr->pr_usrreq)(so, PRU_RCVD, NULL,
			    (struct mbuf *)(long)flags, NULL, NULL);
	}
	if (orig_resid == resid && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
		sbunlock(&so->so_rcv);
		splx(s);
		goto restart;
	}

	if (flagsp)
		*flagsp |= flags;
 release:
	sbunlock(&so->so_rcv);
	splx(s);
	*done = slen - resid;
#if 0
	printf("soreceive: error %d slen %llu resid %lld\n", error, slen, resid);
#endif
	return (error);
}