xref: /netbsd-src/sys/kern/sys_pipe.c (revision 1ca5c1b28139779176bd5c13ad7c5f25c0bcd5f8)
1 /*	$NetBSD: sys_pipe.c,v 1.21 2001/12/18 08:49:40 chs Exp $	*/
2 
3 /*
4  * Copyright (c) 1996 John S. Dyson
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice immediately at the beginning of the file, without modification,
12  *    this list of conditions, and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Absolutely no warranty of function or purpose is made by the author
17  *    John S. Dyson.
18  * 4. Modifications may be freely made to this file if the above conditions
19  *    are met.
20  *
21  * $FreeBSD: src/sys/kern/sys_pipe.c,v 1.82 2001/06/15 20:45:01 jlemon Exp $
22  */
23 
24 /*
25  * This file contains a high-performance replacement for the socket-based
26  * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
27  * all features of sockets, but does do everything that pipes normally
28  * do.
29  *
30  * Adaption for NetBSD UVM, including uvm_loan() based direct write, was
31  * written by Jaromir Dolecek.
32  */
33 
34 /*
35  * This code has two modes of operation, a small write mode and a large
36  * write mode.  The small write mode acts like conventional pipes with
37  * a kernel buffer.  If the buffer is less than PIPE_MINDIRECT, then the
38  * "normal" pipe buffering is done.  If the buffer is between PIPE_MINDIRECT
39  * and PIPE_SIZE in size, it is fully mapped into the kernel (on FreeBSD,
40  * those pages are also wired), and the receiving process can copy it directly
41  * from the pages in the sending process.
42  *
43  * If the sending process receives a signal, it is possible that it will
44  * go away, and certainly its address space can change, because control
45  * is returned back to the user-mode side.  In that case, the pipe code
46  * arranges to copy the buffer supplied by the user process on FreeBSD, to
47  * a pageable kernel buffer, and the receiving process will grab the data
48  * from the pageable kernel buffer.  Since signals don't happen all that often,
49  * the copy operation is normally eliminated.
50  * For NetBSD, the pages are mapped read-only, COW for kernel by uvm_loan(),
51  * so no explicit handling needs to be done; everything is handled by the
52  * standard VM facilities.
53  *
54  * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
55  * happen for small transfers so that the system will not spend all of
56  * its time context switching.  PIPE_SIZE is constrained by the
57  * amount of kernel virtual memory.
58  */
59 
60 #include <sys/cdefs.h>
61 __KERNEL_RCSID(0, "$NetBSD: sys_pipe.c,v 1.21 2001/12/18 08:49:40 chs Exp $");
62 
63 #include <sys/param.h>
64 #include <sys/systm.h>
65 #include <sys/proc.h>
66 #include <sys/fcntl.h>
67 #include <sys/file.h>
68 #include <sys/filedesc.h>
69 #include <sys/filio.h>
70 #include <sys/ttycom.h>
71 #include <sys/stat.h>
72 #include <sys/poll.h>
73 #include <sys/signalvar.h>
74 #include <sys/vnode.h>
75 #include <sys/uio.h>
76 #include <sys/lock.h>
77 #ifdef __FreeBSD__
78 #include <sys/mutex.h>
79 #include <sys/selinfo.h>
80 #include <sys/sysproto.h>
81 #elif defined(__NetBSD__)
82 #include <sys/select.h>
83 #include <sys/malloc.h>
84 #include <sys/mount.h>
85 #include <sys/syscallargs.h>
86 #include <uvm/uvm.h>
87 #include <sys/sysctl.h>
88 #include <sys/kernel.h>
89 #endif /* NetBSD, FreeBSD */
90 
91 #include <sys/pipe.h>
92 
93 #ifdef __NetBSD__
/*
 * Stamp *tvp with the kernel's `time' variable.  microtime(9) is avoided
 * because it is slow.  The read of time(9) is deliberately not guarded
 * with splclock(9): we do not need to be that sure the access is atomic.
 */
#define	vfs_timestamp(tvp)	(*(tvp) = time)
100 #endif
101 
102 /*
103  * Use this define if you want to disable *fancy* VM things.  Expect an
104  * approx 30% decrease in transfer rate.  This could be useful for
105  * OpenBSD.
106  */
107 /* #define PIPE_NODIRECT */
108 
109 /*
110  * interfaces to the outside world
111  */
#ifdef __FreeBSD__
/*
 * File operation entry points (FreeBSD signatures).  They are installed
 * in `pipeops' below.
 */
static int	pipe_read __P((struct file *fp, struct uio *uio,
		    struct ucred *cred, int flags, struct proc *p));
static int	pipe_write __P((struct file *fp, struct uio *uio,
		    struct ucred *cred, int flags, struct proc *p));
static int	pipe_ioctl __P((struct file *fp, u_long cmd, caddr_t data,
		    struct proc *p));
static int	pipe_poll __P((struct file *fp, int events,
		    struct ucred *cred, struct proc *p));
static int	pipe_kqfilter __P((struct file *fp, struct knote *kn));
static int	pipe_stat __P((struct file *fp, struct stat *sb,
		    struct proc *p));
static int	pipe_close __P((struct file *fp, struct proc *p));

static struct fileops pipeops = {
	pipe_read,
	pipe_write,
	pipe_ioctl,
	pipe_poll,
	pipe_kqfilter,
	pipe_stat,
	pipe_close
};

/*
 * kqueue filter callbacks; the read and write filters share the detach
 * routine.
 */
static void	filt_pipedetach(struct knote *kn);
static int	filt_piperead(struct knote *kn, long hint);
static int	filt_pipewrite(struct knote *kn, long hint);

static struct filterops pipe_rfiltops = {
	1, NULL, filt_pipedetach, filt_piperead
};
static struct filterops pipe_wfiltops = {
	1, NULL, filt_pipedetach, filt_pipewrite
};
#endif /* FreeBSD */
138 
#ifdef __NetBSD__
/*
 * File operation entry points (NetBSD signatures).  They are installed
 * in `pipeops' below.
 */
static int	pipe_read __P((struct file *fp, off_t *offset,
		    struct uio *uio, struct ucred *cred, int flags));
static int	pipe_write __P((struct file *fp, off_t *offset,
		    struct uio *uio, struct ucred *cred, int flags));
static int	pipe_ioctl __P((struct file *fp, u_long cmd, caddr_t data,
		    struct proc *p));
static int	pipe_fcntl __P((struct file *fp, u_int com, caddr_t data,
		    struct proc *p));
static int	pipe_poll __P((struct file *fp, int events, struct proc *p));
static int	pipe_stat __P((struct file *fp, struct stat *sb,
		    struct proc *p));
static int	pipe_close __P((struct file *fp, struct proc *p));

static struct fileops pipeops = {
	pipe_read,
	pipe_write,
	pipe_ioctl,
	pipe_fcntl,
	pipe_poll,
	pipe_stat,
	pipe_close
};
#endif /* NetBSD */
155 
/*
 * Default pipe buffer size(s).  These can be fairly large now because
 * pipe space is pageable.  The pipe code tries to maintain locality of
 * reference for performance reasons, so small amounts of outstanding I/O
 * will not wipe the cache.
 */
#define	MINPIPESIZE	(PIPE_SIZE / 3)
#define	MAXPIPESIZE	(2 * PIPE_SIZE / 3)

/*
 * Maximum amount of kva for pipes.  This is only a soft limit; it exists
 * so that large systems do not exhaust kernel virtual address space.
 */
#define	MAXPIPEKVA	(8 * 1024 * 1024)
static int maxpipekva = MAXPIPEKVA;

/*
 * Limit for direct transfers.  The amount of kva used by pipes in
 * general cannot, of course, be limited this way.
 */
#define	LIMITPIPEKVA	(16 * 1024 * 1024)
static int limitpipekva = LIMITPIPEKVA;

/*
 * Limit on the number of "big" pipes.
 */
#define	LIMITBIGPIPES	32
static int maxbigpipes = LIMITBIGPIPES;
static int nbigpipe = 0;

/*
 * Amount of KVA consumed by pipe buffers.
 */
static int amountpipekva = 0;
190 
191 static void pipeclose __P((struct pipe *));
192 static void pipe_free_kmem __P((struct pipe *));
193 static int pipe_create __P((struct pipe **, int));
194 static __inline int pipelock __P((struct pipe *, int));
195 static __inline void pipeunlock __P((struct pipe *));
196 static __inline void pipeselwakeup __P((struct pipe *, struct pipe *));
197 static int pipespace __P((struct pipe *, int));
198 
199 #ifdef __FreeBSD__
200 #ifndef PIPE_NODIRECT
201 static int pipe_build_write_buffer __P((struct pipe *wpipe, struct uio *uio));
202 static void pipe_destroy_write_buffer __P((struct pipe *wpipe));
203 static int pipe_direct_write __P((struct pipe *wpipe, struct uio *uio));
204 static void pipe_clone_write_buffer __P((struct pipe *wpipe));
205 #endif
206 
207 static vm_zone_t pipe_zone;
208 #endif /* FreeBSD */
209 
210 #ifdef __NetBSD__
211 #ifndef PIPE_NODIRECT
212 static int pipe_direct_write __P((struct pipe *, struct uio *));
213 static int pipe_loan_alloc __P((struct pipe *, int));
214 static void pipe_loan_free __P((struct pipe *));
215 #endif /* PIPE_NODIRECT */
216 
217 static struct pool pipe_pool;
218 #endif /* NetBSD */
219 
220 /*
221  * The pipe system call for the DTYPE_PIPE type of pipes
222  */
223 
224 /* ARGSUSED */
225 #ifdef __FreeBSD__
226 int
227 pipe(p, uap)
228 	struct proc *p;
229 	struct pipe_args /* {
230 		int	dummy;
231 	} */ *uap;
232 #elif defined(__NetBSD__)
233 int
234 sys_pipe(p, v, retval)
235 	struct proc *p;
236 	void *v;
237 	register_t *retval;
238 #endif
239 {
240 	struct file *rf, *wf;
241 	struct pipe *rpipe, *wpipe;
242 	int fd, error;
243 
244 #ifdef __FreeBSD__
245 	if (pipe_zone == NULL)
246 		pipe_zone = zinit("PIPE", sizeof(struct pipe), 0, 0, 4);
247 
248 	rpipe = wpipe = NULL;
249 	if (pipe_create(&rpipe, 1) || pipe_create(&wpipe, 1)) {
250 		pipeclose(rpipe);
251 		pipeclose(wpipe);
252 		return (ENFILE);
253 	}
254 
255 	error = falloc(p, &rf, &fd);
256 	if (error) {
257 		pipeclose(rpipe);
258 		pipeclose(wpipe);
259 		return (error);
260 	}
261 	fhold(rf);
262 	p->p_retval[0] = fd;
263 
264 	/*
265 	 * Warning: once we've gotten past allocation of the fd for the
266 	 * read-side, we can only drop the read side via fdrop() in order
267 	 * to avoid races against processes which manage to dup() the read
268 	 * side while we are blocked trying to allocate the write side.
269 	 */
270 	rf->f_flag = FREAD | FWRITE;
271 	rf->f_type = DTYPE_PIPE;
272 	rf->f_data = (caddr_t)rpipe;
273 	rf->f_ops = &pipeops;
274 	error = falloc(p, &wf, &fd);
275 	if (error) {
276 		struct filedesc *fdp = p->p_fd;
277 
278 		if (fdp->fd_ofiles[p->p_retval[0]] == rf) {
279 			fdp->fd_ofiles[p->p_retval[0]] = NULL;
280 			fdrop(rf, p);
281 		}
282 		fdrop(rf, p);
283 		/* rpipe has been closed by fdrop(). */
284 		pipeclose(wpipe);
285 		return (error);
286 	}
287 	wf->f_flag = FREAD | FWRITE;
288 	wf->f_type = DTYPE_PIPE;
289 	wf->f_data = (caddr_t)wpipe;
290 	wf->f_ops = &pipeops;
291 	p->p_retval[1] = fd;
292 
293 	rpipe->pipe_peer = wpipe;
294 	wpipe->pipe_peer = rpipe;
295 	fdrop(rf, p);
296 #endif /* FreeBSD */
297 
298 #ifdef __NetBSD__
299 	rpipe = wpipe = NULL;
300 	if (pipe_create(&rpipe, 1) || pipe_create(&wpipe, 0)) {
301 		pipeclose(rpipe);
302 		pipeclose(wpipe);
303 		return (ENFILE);
304 	}
305 
306 	/*
307 	 * Note: the file structure returned from falloc() is marked
308 	 * as 'larval' initially. Unless we mark it as 'mature' by
309 	 * FILE_SET_MATURE(), any attempt to do anything with it would
310 	 * return EBADF, including e.g. dup(2) or close(2). This avoids
311 	 * file descriptor races if we block in the second falloc().
312 	 */
313 
314 	error = falloc(p, &rf, &fd);
315 	if (error)
316 		goto free2;
317 	retval[0] = fd;
318 	rf->f_flag = FREAD;
319 	rf->f_type = DTYPE_PIPE;
320 	rf->f_data = (caddr_t)rpipe;
321 	rf->f_ops = &pipeops;
322 
323 	error = falloc(p, &wf, &fd);
324 	if (error)
325 		goto free3;
326 	retval[1] = fd;
327 	wf->f_flag = FWRITE;
328 	wf->f_type = DTYPE_PIPE;
329 	wf->f_data = (caddr_t)wpipe;
330 	wf->f_ops = &pipeops;
331 
332 	rpipe->pipe_peer = wpipe;
333 	wpipe->pipe_peer = rpipe;
334 
335 	FILE_SET_MATURE(rf);
336 	FILE_SET_MATURE(wf);
337 	FILE_UNUSE(rf, p);
338 	FILE_UNUSE(wf, p);
339 	return (0);
340 free3:
341 	FILE_UNUSE(rf, p);
342 	ffree(rf);
343 	fdremove(p->p_fd, retval[0]);
344 free2:
345 	pipeclose(wpipe);
346 	pipeclose(rpipe);
347 #endif /* NetBSD */
348 
349 	return (error);
350 }
351 
352 /*
353  * Allocate kva for pipe circular buffer, the space is pageable
354  * This routine will 'realloc' the size of a pipe safely, if it fails
355  * it will retain the old buffer.
356  * If it fails it will return ENOMEM.
357  */
358 static int
359 pipespace(cpipe, size)
360 	struct pipe *cpipe;
361 	int size;
362 {
363 	caddr_t buffer;
364 #ifdef __FreeBSD__
365 	struct vm_object *object;
366 	int npages, error;
367 
368 	npages = round_page(size)/PAGE_SIZE;
369 	/*
370 	 * Create an object, I don't like the idea of paging to/from
371 	 * kernel_object.
372 	 */
373 	mtx_lock(&vm_mtx);
374 	object = vm_object_allocate(OBJT_DEFAULT, npages);
375 	buffer = (caddr_t) vm_map_min(kernel_map);
376 
377 	/*
378 	 * Insert the object into the kernel map, and allocate kva for it.
379 	 * The map entry is, by default, pageable.
380 	 */
381 	error = vm_map_find(kernel_map, object, 0,
382 		(vm_offset_t *) &buffer, size, 1,
383 		VM_PROT_ALL, VM_PROT_ALL, 0);
384 
385 	if (error != KERN_SUCCESS) {
386 		vm_object_deallocate(object);
387 		mtx_unlock(&vm_mtx);
388 		return (ENOMEM);
389 	}
390 #endif /* FreeBSD */
391 
392 #ifdef __NetBSD__
393 	/*
394 	 * Allocate pageable virtual address space. Physical memory is allocated
395 	 * on demand.
396 	 */
397 	buffer = (caddr_t) uvm_km_valloc(kernel_map, round_page(size));
398 	if (buffer == NULL)
399 		return (ENOMEM);
400 #endif /* NetBSD */
401 
402 	/* free old resources if we're resizing */
403 	pipe_free_kmem(cpipe);
404 #ifdef __FreeBSD__
405 	mtx_unlock(&vm_mtx);
406 	cpipe->pipe_buffer.object = object;
407 #endif
408 	cpipe->pipe_buffer.buffer = buffer;
409 	cpipe->pipe_buffer.size = size;
410 	cpipe->pipe_buffer.in = 0;
411 	cpipe->pipe_buffer.out = 0;
412 	cpipe->pipe_buffer.cnt = 0;
413 	amountpipekva += cpipe->pipe_buffer.size;
414 	return (0);
415 }
416 
417 /*
418  * initialize and allocate VM and memory for pipe
419  */
420 static int
421 pipe_create(cpipep, allockva)
422 	struct pipe **cpipep;
423 	int allockva;
424 {
425 	struct pipe *cpipe;
426 	int error;
427 
428 #ifdef __FreeBSD__
429 	*cpipep = zalloc(pipe_zone);
430 #endif
431 #ifdef __NetBSD__
432 	*cpipep = pool_get(&pipe_pool, M_WAITOK);
433 #endif
434 	if (*cpipep == NULL)
435 		return (ENOMEM);
436 
437 	cpipe = *cpipep;
438 
439 	/* Initialize */
440 	memset(cpipe, 0, sizeof(*cpipe));
441 	cpipe->pipe_state = PIPE_SIGNALR;
442 
443 	if (allockva && (error = pipespace(cpipe, PIPE_SIZE)))
444 		return (error);
445 
446 	vfs_timestamp(&cpipe->pipe_ctime);
447 	cpipe->pipe_atime = cpipe->pipe_ctime;
448 	cpipe->pipe_mtime = cpipe->pipe_ctime;
449 #ifdef __NetBSD__
450 	cpipe->pipe_pgid = NO_PID;
451 	lockinit(&cpipe->pipe_lock, PRIBIO | PCATCH, "pipelk", 0, 0);
452 #endif
453 
454 	return (0);
455 }
456 
457 
/*
 * Lock a pipe for I/O, blocking other access.
 *
 * With `catch' non-zero the wait may be interrupted and the resulting
 * error is returned to the caller.  With `catch' zero, FreeBSD sleeps
 * without PCATCH and NetBSD retries the lock request for as long as it
 * fails with EINTR or ERESTART.
 *
 * Returns 0 once the lock is held, otherwise the error from the sleep or
 * lock primitive.
 */
static __inline int
pipelock(cpipe, catch)
	struct pipe *cpipe;
	int catch;
{
	int error;
#ifdef __FreeBSD__
	int prio;

	prio = catch ? (PRIBIO | PCATCH) : PRIBIO;
	for (;;) {
		if ((cpipe->pipe_state & PIPE_LOCK) == 0)
			break;
		/* Ask the current holder to wake us on unlock. */
		cpipe->pipe_state |= PIPE_LWANT;
		error = tsleep(cpipe, prio, "pipelk", 0);
		if (error != 0)
			return (error);
	}
	cpipe->pipe_state |= PIPE_LOCK;
	return (0);
#endif

#ifdef __NetBSD__
	for (;;) {
		error = lockmgr(&cpipe->pipe_lock, LK_EXCLUSIVE, NULL);
		/* Only an uncatchable request retries after interruption. */
		if (catch || (error != EINTR && error != ERESTART))
			return (error);
	}
#endif
}
487 
/*
 * Release the pipe I/O lock taken by pipelock().  On FreeBSD any thread
 * that recorded its interest with PIPE_LWANT is woken up.
 */
static __inline void
pipeunlock(cpipe)
	struct pipe *cpipe;
{
#ifdef __FreeBSD__
	int waiters;

	waiters = cpipe->pipe_state & PIPE_LWANT;
	cpipe->pipe_state &= ~(PIPE_LOCK | PIPE_LWANT);
	if (waiters)
		wakeup(cpipe);
#endif

#ifdef __NetBSD__
	(void)lockmgr(&cpipe->pipe_lock, LK_RELEASE, NULL);
#endif
}
507 
508 /*
509  * Select/poll wakup. This also sends SIGIO to peer connected to
510  * 'sigpipe' side of pipe.
511  */
512 static __inline void
513 pipeselwakeup(selp, sigp)
514 	struct pipe *selp, *sigp;
515 {
516 	if (selp->pipe_state & PIPE_SEL) {
517 		selp->pipe_state &= ~PIPE_SEL;
518 		selwakeup(&selp->pipe_sel);
519 	}
520 #ifdef __FreeBSD__
521 	if (sigp && (sigp->pipe_state & PIPE_ASYNC) && sigp->pipe_sigio)
522 		pgsigio(sigp->pipe_sigio, SIGIO, 0);
523 	KNOTE(&selp->pipe_sel.si_note, 0);
524 #endif
525 
526 #ifdef __NetBSD__
527 	if (sigp && (sigp->pipe_state & PIPE_ASYNC)
528 	    && sigp->pipe_pgid != NO_PID){
529 		struct proc *p;
530 
531 		if (sigp->pipe_pgid < 0)
532 			gsignal(-sigp->pipe_pgid, SIGIO);
533 		else if (sigp->pipe_pgid > 0 && (p = pfind(sigp->pipe_pgid)) != 0)
534 			psignal(p, SIGIO);
535 	}
536 #endif /* NetBSD */
537 }
538 
539 /* ARGSUSED */
540 #ifdef __FreeBSD__
541 static int
542 pipe_read(fp, uio, cred, flags, p)
543 	struct file *fp;
544 	struct uio *uio;
545 	struct ucred *cred;
546 	int flags;
547 	struct proc *p;
548 #elif defined(__NetBSD__)
549 static int
550 pipe_read(fp, offset, uio, cred, flags)
551 	struct file *fp;
552 	off_t *offset;
553 	struct uio *uio;
554 	struct ucred *cred;
555 	int flags;
556 #endif
557 {
558 	struct pipe *rpipe = (struct pipe *) fp->f_data;
559 	int error;
560 	size_t nread = 0;
561 	size_t size;
562 	size_t ocnt;
563 
564 	++rpipe->pipe_busy;
565 	error = pipelock(rpipe, 1);
566 	if (error)
567 		goto unlocked_error;
568 
569 	ocnt = rpipe->pipe_buffer.cnt;
570 
571 	while (uio->uio_resid) {
572 		/*
573 		 * normal pipe buffer receive
574 		 */
575 		if (rpipe->pipe_buffer.cnt > 0) {
576 			size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
577 			if (size > rpipe->pipe_buffer.cnt)
578 				size = rpipe->pipe_buffer.cnt;
579 			if (size > uio->uio_resid)
580 				size = uio->uio_resid;
581 
582 			error = uiomove(&rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
583 					size, uio);
584 			if (error)
585 				break;
586 
587 			rpipe->pipe_buffer.out += size;
588 			if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
589 				rpipe->pipe_buffer.out = 0;
590 
591 			rpipe->pipe_buffer.cnt -= size;
592 
593 			/*
594 			 * If there is no more to read in the pipe, reset
595 			 * its pointers to the beginning.  This improves
596 			 * cache hit stats.
597 			 */
598 			if (rpipe->pipe_buffer.cnt == 0) {
599 				rpipe->pipe_buffer.in = 0;
600 				rpipe->pipe_buffer.out = 0;
601 			}
602 			nread += size;
603 #ifndef PIPE_NODIRECT
604 		/*
605 		 * Direct copy, bypassing a kernel buffer.
606 		 */
607 		} else if ((size = rpipe->pipe_map.cnt) &&
608 			   (rpipe->pipe_state & PIPE_DIRECTW)) {
609 			caddr_t	va;
610 			if (size > uio->uio_resid)
611 				size = uio->uio_resid;
612 
613 			va = (caddr_t) rpipe->pipe_map.kva +
614 			    rpipe->pipe_map.pos;
615 			error = uiomove(va, size, uio);
616 			if (error)
617 				break;
618 			nread += size;
619 			rpipe->pipe_map.pos += size;
620 			rpipe->pipe_map.cnt -= size;
621 			if (rpipe->pipe_map.cnt == 0) {
622 				rpipe->pipe_state &= ~PIPE_DIRECTW;
623 				wakeup(rpipe);
624 			}
625 #endif
626 		} else {
627 			/*
628 			 * detect EOF condition
629 			 * read returns 0 on EOF, no need to set error
630 			 */
631 			if (rpipe->pipe_state & PIPE_EOF)
632 				break;
633 
634 			/*
635 			 * If the "write-side" has been blocked, wake it up now.
636 			 */
637 			if (rpipe->pipe_state & PIPE_WANTW) {
638 				rpipe->pipe_state &= ~PIPE_WANTW;
639 				wakeup(rpipe);
640 			}
641 
642 			/*
643 			 * Break if some data was read.
644 			 */
645 			if (nread > 0)
646 				break;
647 
648 			/*
649 			 * don't block on non-blocking I/O
650 			 */
651 			if (fp->f_flag & FNONBLOCK) {
652 				error = EAGAIN;
653 				break;
654 			}
655 
656 			/*
657 			 * Unlock the pipe buffer for our remaining processing.
658 			 * We will either break out with an error or we will
659 			 * sleep and relock to loop.
660 			 */
661 			pipeunlock(rpipe);
662 
663 			/*
664 			 * We want to read more, wake up select/poll.
665 			 */
666 			pipeselwakeup(rpipe, rpipe->pipe_peer);
667 
668 			rpipe->pipe_state |= PIPE_WANTR;
669 			error = tsleep(rpipe, PRIBIO | PCATCH, "piperd", 0);
670 			if (error != 0 || (error = pipelock(rpipe, 1)))
671 				goto unlocked_error;
672 		}
673 	}
674 	pipeunlock(rpipe);
675 
676 	if (error == 0)
677 		vfs_timestamp(&rpipe->pipe_atime);
678 unlocked_error:
679 	--rpipe->pipe_busy;
680 
681 	/*
682 	 * PIPE_WANTCLOSE processing only makes sense if pipe_busy is 0.
683 	 */
684 	if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANTCLOSE)) {
685 		rpipe->pipe_state &= ~(PIPE_WANTCLOSE|PIPE_WANTW);
686 		wakeup(rpipe);
687 	} else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) {
688 		/*
689 		 * Handle write blocking hysteresis.
690 		 */
691 		if (rpipe->pipe_state & PIPE_WANTW) {
692 			rpipe->pipe_state &= ~PIPE_WANTW;
693 			wakeup(rpipe);
694 		}
695 	}
696 
697 	/*
698 	 * If anything was read off the buffer, signal to the writer it's
699 	 * possible to write more data. Also send signal if we are here for the
700 	 * first time after last write.
701 	 */
702 	if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF
703 	    && (ocnt != rpipe->pipe_buffer.cnt || (rpipe->pipe_state & PIPE_SIGNALR))) {
704 		pipeselwakeup(rpipe, rpipe->pipe_peer);
705 		rpipe->pipe_state &= ~PIPE_SIGNALR;
706 	}
707 
708 	return (error);
709 }
710 
711 #ifdef __FreeBSD__
712 #ifndef PIPE_NODIRECT
713 /*
714  * Map the sending processes' buffer into kernel space and wire it.
715  * This is similar to a physical write operation.
716  */
717 static int
718 pipe_build_write_buffer(wpipe, uio)
719 	struct pipe *wpipe;
720 	struct uio *uio;
721 {
722 	size_t size;
723 	int i;
724 	vm_offset_t addr, endaddr, paddr;
725 
726 	size = uio->uio_iov->iov_len;
727 	if (size > wpipe->pipe_buffer.size)
728 		size = wpipe->pipe_buffer.size;
729 
730 	endaddr = round_page((vm_offset_t)uio->uio_iov->iov_base + size);
731 	mtx_lock(&vm_mtx);
732 	addr = trunc_page((vm_offset_t)uio->uio_iov->iov_base);
733 	for (i = 0; addr < endaddr; addr += PAGE_SIZE, i++) {
734 		vm_page_t m;
735 
736 		if (vm_fault_quick((caddr_t)addr, VM_PROT_READ) < 0 ||
737 		    (paddr = pmap_kextract(addr)) == 0) {
738 			int j;
739 
740 			for (j = 0; j < i; j++)
741 				vm_page_unwire(wpipe->pipe_map.ms[j], 1);
742 			mtx_unlock(&vm_mtx);
743 			return (EFAULT);
744 		}
745 
746 		m = PHYS_TO_VM_PAGE(paddr);
747 		vm_page_wire(m);
748 		wpipe->pipe_map.ms[i] = m;
749 	}
750 
751 /*
752  * set up the control block
753  */
754 	wpipe->pipe_map.npages = i;
755 	wpipe->pipe_map.pos =
756 	    ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK;
757 	wpipe->pipe_map.cnt = size;
758 
759 /*
760  * and map the buffer
761  */
762 	if (wpipe->pipe_map.kva == 0) {
763 		/*
764 		 * We need to allocate space for an extra page because the
765 		 * address range might (will) span pages at times.
766 		 */
767 		wpipe->pipe_map.kva = kmem_alloc_pageable(kernel_map,
768 			wpipe->pipe_buffer.size + PAGE_SIZE);
769 		amountpipekva += wpipe->pipe_buffer.size + PAGE_SIZE;
770 	}
771 	pmap_qenter(wpipe->pipe_map.kva, wpipe->pipe_map.ms,
772 		wpipe->pipe_map.npages);
773 
774 	mtx_unlock(&vm_mtx);
775 /*
776  * and update the uio data
777  */
778 
779 	uio->uio_iov->iov_len -= size;
780 	uio->uio_iov->iov_base += size;
781 	if (uio->uio_iov->iov_len == 0)
782 		uio->uio_iov++;
783 	uio->uio_resid -= size;
784 	uio->uio_offset += size;
785 	return (0);
786 }
787 
788 /*
789  * unmap and unwire the process buffer
790  */
791 static void
792 pipe_destroy_write_buffer(wpipe)
793 	struct pipe *wpipe;
794 {
795 	int i;
796 
797 	mtx_lock(&vm_mtx);
798 	if (wpipe->pipe_map.kva) {
799 		pmap_qremove(wpipe->pipe_map.kva, wpipe->pipe_map.npages);
800 
801 		if (amountpipekva > maxpipekva) {
802 			vm_offset_t kva = wpipe->pipe_map.kva;
803 			wpipe->pipe_map.kva = 0;
804 			kmem_free(kernel_map, kva,
805 				wpipe->pipe_buffer.size + PAGE_SIZE);
806 			amountpipekva -= wpipe->pipe_buffer.size + PAGE_SIZE;
807 		}
808 	}
809 	for (i = 0; i < wpipe->pipe_map.npages; i++)
810 		vm_page_unwire(wpipe->pipe_map.ms[i], 1);
811 	mtx_unlock(&vm_mtx);
812 }
813 
814 /*
815  * In the case of a signal, the writing process might go away.  This
816  * code copies the data into the circular buffer so that the source
817  * pages can be freed without loss of data.
818  */
819 static void
820 pipe_clone_write_buffer(wpipe)
821 	struct pipe *wpipe;
822 {
823 	int size;
824 	int pos;
825 
826 	size = wpipe->pipe_map.cnt;
827 	pos = wpipe->pipe_map.pos;
828 	memcpy((caddr_t) wpipe->pipe_buffer.buffer,
829 	    (caddr_t) wpipe->pipe_map.kva + pos, size);
830 
831 	wpipe->pipe_buffer.in = size;
832 	wpipe->pipe_buffer.out = 0;
833 	wpipe->pipe_buffer.cnt = size;
834 	wpipe->pipe_state &= ~PIPE_DIRECTW;
835 
836 	pipe_destroy_write_buffer(wpipe);
837 }
838 
839 /*
840  * This implements the pipe buffer write mechanism.  Note that only
841  * a direct write OR a normal pipe write can be pending at any given time.
842  * If there are any characters in the pipe buffer, the direct write will
843  * be deferred until the receiving process grabs all of the bytes from
844  * the pipe buffer.  Then the direct mapping write is set-up.
845  */
846 static int
847 pipe_direct_write(wpipe, uio)
848 	struct pipe *wpipe;
849 	struct uio *uio;
850 {
851 	int error;
852 
853 retry:
854 	while (wpipe->pipe_state & PIPE_DIRECTW) {
855 		if (wpipe->pipe_state & PIPE_WANTR) {
856 			wpipe->pipe_state &= ~PIPE_WANTR;
857 			wakeup(wpipe);
858 		}
859 		wpipe->pipe_state |= PIPE_WANTW;
860 		error = tsleep(wpipe, PRIBIO | PCATCH, "pipdww", 0);
861 		if (error)
862 			goto error1;
863 		if (wpipe->pipe_state & PIPE_EOF) {
864 			error = EPIPE;
865 			goto error1;
866 		}
867 	}
868 	wpipe->pipe_map.cnt = 0;	/* transfer not ready yet */
869 	if (wpipe->pipe_buffer.cnt > 0) {
870 		if (wpipe->pipe_state & PIPE_WANTR) {
871 			wpipe->pipe_state &= ~PIPE_WANTR;
872 			wakeup(wpipe);
873 		}
874 
875 		wpipe->pipe_state |= PIPE_WANTW;
876 		error = tsleep(wpipe, PRIBIO | PCATCH, "pipdwc", 0);
877 		if (error)
878 			goto error1;
879 		if (wpipe->pipe_state & PIPE_EOF) {
880 			error = EPIPE;
881 			goto error1;
882 		}
883 		goto retry;
884 	}
885 
886 	wpipe->pipe_state |= PIPE_DIRECTW;
887 
888 	error = pipe_build_write_buffer(wpipe, uio);
889 	if (error) {
890 		wpipe->pipe_state &= ~PIPE_DIRECTW;
891 		goto error1;
892 	}
893 
894 	error = 0;
895 	while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
896 		if (wpipe->pipe_state & PIPE_EOF) {
897 			pipelock(wpipe, 0);
898 			pipe_destroy_write_buffer(wpipe);
899 			pipeunlock(wpipe);
900 			pipeselwakeup(wpipe, wpipe);
901 			error = EPIPE;
902 			goto error1;
903 		}
904 		if (wpipe->pipe_state & PIPE_WANTR) {
905 			wpipe->pipe_state &= ~PIPE_WANTR;
906 			wakeup(wpipe);
907 		}
908 		pipeselwakeup(wpipe, wpipe);
909 		error = tsleep(wpipe, PRIBIO | PCATCH, "pipdwt", 0);
910 	}
911 
912 	pipelock(wpipe,0);
913 	if (wpipe->pipe_state & PIPE_DIRECTW) {
914 		/*
915 		 * this bit of trickery substitutes a kernel buffer for
916 		 * the process that might be going away.
917 		 */
918 		pipe_clone_write_buffer(wpipe);
919 	} else {
920 		pipe_destroy_write_buffer(wpipe);
921 	}
922 	pipeunlock(wpipe);
923 	return (error);
924 
925 error1:
926 	wakeup(wpipe);
927 	return (error);
928 }
929 #endif /* !PIPE_NODIRECT */
930 #endif /* FreeBSD */
931 
932 #ifdef __NetBSD__
933 #ifndef PIPE_NODIRECT
934 /*
935  * Allocate structure for loan transfer.
936  */
937 static int
938 pipe_loan_alloc(wpipe, npages)
939 	struct pipe *wpipe;
940 	int npages;
941 {
942 	vsize_t len;
943 
944 	len = (vsize_t)npages << PAGE_SHIFT;
945 	wpipe->pipe_map.kva = uvm_km_valloc_wait(kernel_map, len);
946 	if (wpipe->pipe_map.kva == NULL)
947 		return (ENOMEM);
948 
949 	amountpipekva += len;
950 	wpipe->pipe_map.npages = npages;
951 	wpipe->pipe_map.pgs = malloc(npages * sizeof(struct vm_page *), M_PIPE,
952 	    M_WAITOK);
953 	return (0);
954 }
955 
956 /*
957  * Free resources allocated for loan transfer.
958  */
959 static void
960 pipe_loan_free(wpipe)
961 	struct pipe *wpipe;
962 {
963 	vsize_t len;
964 
965 	len = (vsize_t)wpipe->pipe_map.npages << PAGE_SHIFT;
966 	uvm_km_free(kernel_map, wpipe->pipe_map.kva, len);
967 	wpipe->pipe_map.kva = NULL;
968 	amountpipekva -= len;
969 	free(wpipe->pipe_map.pgs, M_PIPE);
970 	wpipe->pipe_map.pgs = NULL;
971 }
972 
973 /*
974  * NetBSD direct write, using uvm_loan() mechanism.
975  * This implements the pipe buffer write mechanism.  Note that only
976  * a direct write OR a normal pipe write can be pending at any given time.
977  * If there are any characters in the pipe buffer, the direct write will
978  * be deferred until the receiving process grabs all of the bytes from
979  * the pipe buffer.  Then the direct mapping write is set-up.
980  */
981 static int
982 pipe_direct_write(wpipe, uio)
983 	struct pipe *wpipe;
984 	struct uio *uio;
985 {
986 	int error, npages, j;
987 	struct vm_page **pgs;
988 	vaddr_t bbase, kva, base, bend;
989 	vsize_t blen, bcnt;
990 	voff_t bpos;
991 
992 retry:
993 	while (wpipe->pipe_state & PIPE_DIRECTW) {
994 		if (wpipe->pipe_state & PIPE_WANTR) {
995 			wpipe->pipe_state &= ~PIPE_WANTR;
996 			wakeup(wpipe);
997 		}
998 		wpipe->pipe_state |= PIPE_WANTW;
999 		error = tsleep(wpipe, PRIBIO | PCATCH, "pipdww", 0);
1000 		if (error)
1001 			goto error;
1002 		if (wpipe->pipe_state & PIPE_EOF) {
1003 			error = EPIPE;
1004 			goto error;
1005 		}
1006 	}
1007 	wpipe->pipe_map.cnt = 0;	/* transfer not ready yet */
1008 	if (wpipe->pipe_buffer.cnt > 0) {
1009 		if (wpipe->pipe_state & PIPE_WANTR) {
1010 			wpipe->pipe_state &= ~PIPE_WANTR;
1011 			wakeup(wpipe);
1012 		}
1013 
1014 		wpipe->pipe_state |= PIPE_WANTW;
1015 		error = tsleep(wpipe, PRIBIO | PCATCH, "pipdwc", 0);
1016 		if (error)
1017 			goto error;
1018 		if (wpipe->pipe_state & PIPE_EOF) {
1019 			error = EPIPE;
1020 			goto error;
1021 		}
1022 		goto retry;
1023 	}
1024 
1025 	/*
1026 	 * Handle first PIPE_CHUNK_SIZE bytes of buffer. Deal with buffers
1027 	 * not aligned to PAGE_SIZE.
1028 	 */
1029 	bbase = (vaddr_t)uio->uio_iov->iov_base;
1030 	base = trunc_page(bbase);
1031 	bend = round_page(bbase + uio->uio_iov->iov_len);
1032 	blen = bend - base;
1033 	bpos = bbase - base;
1034 
1035 	if (blen > PIPE_DIRECT_CHUNK) {
1036 		blen = PIPE_DIRECT_CHUNK;
1037 		bend = base + blen;
1038 		bcnt = PIPE_DIRECT_CHUNK - bpos;
1039 	} else {
1040 		bcnt = uio->uio_iov->iov_len;
1041 	}
1042 	npages = blen >> PAGE_SHIFT;
1043 
1044 	wpipe->pipe_map.pos = bpos;
1045 	wpipe->pipe_map.cnt = bcnt;
1046 
1047 	/*
1048 	 * Free the old kva if we need more pages than we have
1049 	 * allocated.
1050 	 */
1051 	if (wpipe->pipe_map.kva && npages > wpipe->pipe_map.npages)
1052 		pipe_loan_free(wpipe);
1053 
1054 	/* Allocate new kva. */
1055 	if (wpipe->pipe_map.kva == NULL) {
1056 		error = pipe_loan_alloc(wpipe, npages);
1057 		if (error) {
1058 			goto error;
1059 		}
1060 	}
1061 
1062 	/* Loan the write buffer memory from writer process */
1063 	pgs = wpipe->pipe_map.pgs;
1064 	error = uvm_loan(&uio->uio_procp->p_vmspace->vm_map, base, blen,
1065 	    pgs, UVM_LOAN_TOPAGE);
1066 	if (error) {
1067 		pgs = NULL;
1068 		goto cleanup;
1069 	}
1070 
1071 	/* Enter the loaned pages to kva */
1072 	kva = wpipe->pipe_map.kva;
1073 	for (j = 0; j < npages; j++, kva += PAGE_SIZE) {
1074 		pmap_kenter_pa(kva, VM_PAGE_TO_PHYS(pgs[j]), VM_PROT_READ);
1075 	}
1076 	pmap_update(pmap_kernel());
1077 
1078 	wpipe->pipe_state |= PIPE_DIRECTW;
1079 	while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
1080 		if (wpipe->pipe_state & PIPE_EOF) {
1081 			error = EPIPE;
1082 			break;
1083 		}
1084 		if (wpipe->pipe_state & PIPE_WANTR) {
1085 			wpipe->pipe_state &= ~PIPE_WANTR;
1086 			wakeup(wpipe);
1087 		}
1088 		pipeselwakeup(wpipe, wpipe);
1089 		error = tsleep(wpipe, PRIBIO | PCATCH, "pipdwt", 0);
1090 	}
1091 
1092 	if (error)
1093 		wpipe->pipe_state &= ~PIPE_DIRECTW;
1094 
1095 cleanup:
1096 	pipelock(wpipe, 0);
1097 	if (pgs != NULL) {
1098 		pmap_kremove(wpipe->pipe_map.kva, blen);
1099 		uvm_unloan(pgs, npages, UVM_LOAN_TOPAGE);
1100 	}
1101 	if (error || amountpipekva > maxpipekva)
1102 		pipe_loan_free(wpipe);
1103 	pipeunlock(wpipe);
1104 
1105 	if (error) {
1106 		pipeselwakeup(wpipe, wpipe);
1107 
1108 		/*
1109 		 * If nothing was read from what we offered, return error
1110 		 * straight on. Otherwise update uio resid first. Caller
1111 		 * will deal with the error condition, returning short
1112 		 * write, error, or restarting the write(2) as appropriate.
1113 		 */
1114 		if (wpipe->pipe_map.cnt == bcnt) {
1115 error:
1116 			wakeup(wpipe);
1117 			return (error);
1118 		}
1119 
1120 		bcnt -= wpipe->pipe_map.cnt;
1121 	}
1122 
1123 	uio->uio_resid -= bcnt;
1124 	/* uio_offset not updated, not set/used for write(2) */
1125 	uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + bcnt;
1126 	uio->uio_iov->iov_len -= bcnt;
1127 	if (uio->uio_iov->iov_len == 0) {
1128 		uio->uio_iov++;
1129 		uio->uio_iovcnt--;
1130 	}
1131 
1132 	return (error);
1133 }
1134 #endif /* !PIPE_NODIRECT */
1135 #endif /* NetBSD */
1136 
1137 #ifdef __FreeBSD__
1138 static int
1139 pipe_write(fp, uio, cred, flags, p)
1140 	struct file *fp;
1141 	off_t *offset;
1142 	struct uio *uio;
1143 	struct ucred *cred;
1144 	int flags;
1145 	struct proc *p;
1146 #elif defined(__NetBSD__)
1147 static int
1148 pipe_write(fp, offset, uio, cred, flags)
1149 	struct file *fp;
1150 	off_t *offset;
1151 	struct uio *uio;
1152 	struct ucred *cred;
1153 	int flags;
1154 #endif
1155 {
1156 	int error = 0;
1157 	struct pipe *wpipe, *rpipe;
1158 
1159 	rpipe = (struct pipe *) fp->f_data;
1160 	wpipe = rpipe->pipe_peer;
1161 
1162 	/*
1163 	 * detect loss of pipe read side, issue SIGPIPE if lost.
1164 	 */
1165 	if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF))
1166 		return (EPIPE);
1167 
1168 	++wpipe->pipe_busy;
1169 
1170 	/*
1171 	 * If it is advantageous to resize the pipe buffer, do
1172 	 * so.
1173 	 */
1174 	if ((uio->uio_resid > PIPE_SIZE) &&
1175 		(nbigpipe < maxbigpipes) &&
1176 #ifndef PIPE_NODIRECT
1177 		(wpipe->pipe_state & PIPE_DIRECTW) == 0 &&
1178 #endif
1179 		(wpipe->pipe_buffer.size <= PIPE_SIZE) &&
1180 		(wpipe->pipe_buffer.cnt == 0)) {
1181 
1182 		if ((error = pipelock(wpipe,1)) == 0) {
1183 			if (pipespace(wpipe, BIG_PIPE_SIZE) == 0)
1184 				nbigpipe++;
1185 			pipeunlock(wpipe);
1186 		} else {
1187 			/*
1188 			 * If an error occurred, unbusy and return, waking up
1189 			 * any waiting readers.
1190 			 */
1191 			--wpipe->pipe_busy;
1192 			if (wpipe->pipe_busy == 0
1193 			    && (wpipe->pipe_state & PIPE_WANTCLOSE)) {
1194 				wpipe->pipe_state &=
1195 				    ~(PIPE_WANTCLOSE | PIPE_WANTR);
1196 				wakeup(wpipe);
1197 			}
1198 
1199 			return (error);
1200 		}
1201 	}
1202 
1203 #ifdef __FreeBSD__
1204 	KASSERT(wpipe->pipe_buffer.buffer != NULL, ("pipe buffer gone"));
1205 #endif
1206 
1207 	while (uio->uio_resid) {
1208 		int space;
1209 
1210 #ifndef PIPE_NODIRECT
1211 		/*
1212 		 * If the transfer is large, we can gain performance if
1213 		 * we do process-to-process copies directly.
1214 		 * If the write is non-blocking, we don't use the
1215 		 * direct write mechanism.
1216 		 *
1217 		 * The direct write mechanism will detect the reader going
1218 		 * away on us.
1219 		 */
1220 		if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) &&
1221 		    (fp->f_flag & FNONBLOCK) == 0 &&
1222 		    (wpipe->pipe_map.kva || (amountpipekva < limitpipekva))) {
1223 			error = pipe_direct_write(wpipe, uio);
1224 
1225 			/*
1226 			 * Break out if error occured, unless it's ENOMEM.
1227 			 * ENOMEM means we failed to allocate some resources
1228 			 * for direct write, so we just fallback to ordinary
1229 			 * write. If the direct write was successful,
1230 			 * process rest of data via ordinary write.
1231 			 */
1232 			if (!error)
1233 				continue;
1234 
1235 			if (error != ENOMEM)
1236 				break;
1237 		}
1238 #endif /* PIPE_NODIRECT */
1239 
1240 		/*
1241 		 * Pipe buffered writes cannot be coincidental with
1242 		 * direct writes.  We wait until the currently executing
1243 		 * direct write is completed before we start filling the
1244 		 * pipe buffer.  We break out if a signal occurs or the
1245 		 * reader goes away.
1246 		 */
1247 	retrywrite:
1248 		while (wpipe->pipe_state & PIPE_DIRECTW) {
1249 			if (wpipe->pipe_state & PIPE_WANTR) {
1250 				wpipe->pipe_state &= ~PIPE_WANTR;
1251 				wakeup(wpipe);
1252 			}
1253 			error = tsleep(wpipe, PRIBIO | PCATCH, "pipbww", 0);
1254 			if (wpipe->pipe_state & PIPE_EOF)
1255 				break;
1256 			if (error)
1257 				break;
1258 		}
1259 		if (wpipe->pipe_state & PIPE_EOF) {
1260 			error = EPIPE;
1261 			break;
1262 		}
1263 
1264 		space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
1265 
1266 		/* Writes of size <= PIPE_BUF must be atomic. */
1267 		if ((space < uio->uio_resid) && (uio->uio_resid <= PIPE_BUF))
1268 			space = 0;
1269 
1270 		if (space > 0) {
1271 			int size;	/* Transfer size */
1272 			int segsize;	/* first segment to transfer */
1273 
1274 			if ((error = pipelock(wpipe,1)) != 0)
1275 				break;
1276 
1277 			/*
1278 			 * It is possible for a direct write to
1279 			 * slip in on us... handle it here...
1280 			 */
1281 			if (wpipe->pipe_state & PIPE_DIRECTW) {
1282 				pipeunlock(wpipe);
1283 				goto retrywrite;
1284 			}
1285 			/*
1286 			 * If a process blocked in uiomove, our
1287 			 * value for space might be bad.
1288 			 *
1289 			 * XXX will we be ok if the reader has gone
1290 			 * away here?
1291 			 */
1292 			if (space > wpipe->pipe_buffer.size -
1293 				    wpipe->pipe_buffer.cnt) {
1294 				pipeunlock(wpipe);
1295 				goto retrywrite;
1296 			}
1297 
1298 			/*
1299 			 * Transfer size is minimum of uio transfer
1300 			 * and free space in pipe buffer.
1301 			 */
1302 			if (space > uio->uio_resid)
1303 				size = uio->uio_resid;
1304 			else
1305 				size = space;
1306 			/*
1307 			 * First segment to transfer is minimum of
1308 			 * transfer size and contiguous space in
1309 			 * pipe buffer.  If first segment to transfer
1310 			 * is less than the transfer size, we've got
1311 			 * a wraparound in the buffer.
1312 			 */
1313 			segsize = wpipe->pipe_buffer.size -
1314 				wpipe->pipe_buffer.in;
1315 			if (segsize > size)
1316 				segsize = size;
1317 
1318 			/* Transfer first segment */
1319 
1320 			error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
1321 						segsize, uio);
1322 
1323 			if (error == 0 && segsize < size) {
1324 				/*
1325 				 * Transfer remaining part now, to
1326 				 * support atomic writes.  Wraparound
1327 				 * happened.
1328 				 */
1329 #ifdef DEBUG
1330 				if (wpipe->pipe_buffer.in + segsize !=
1331 				    wpipe->pipe_buffer.size)
1332 					panic("Expected pipe buffer wraparound disappeared");
1333 #endif
1334 
1335 				error = uiomove(&wpipe->pipe_buffer.buffer[0],
1336 						size - segsize, uio);
1337 			}
1338 			if (error == 0) {
1339 				wpipe->pipe_buffer.in += size;
1340 				if (wpipe->pipe_buffer.in >=
1341 				    wpipe->pipe_buffer.size) {
1342 #ifdef DEBUG
1343 					if (wpipe->pipe_buffer.in != size - segsize + wpipe->pipe_buffer.size)
1344 						panic("Expected wraparound bad");
1345 #endif
1346 					wpipe->pipe_buffer.in = size - segsize;
1347 				}
1348 
1349 				wpipe->pipe_buffer.cnt += size;
1350 #ifdef DEBUG
1351 				if (wpipe->pipe_buffer.cnt > wpipe->pipe_buffer.size)
1352 					panic("Pipe buffer overflow");
1353 #endif
1354 			}
1355 			pipeunlock(wpipe);
1356 			if (error)
1357 				break;
1358 		} else {
1359 			/*
1360 			 * If the "read-side" has been blocked, wake it up now.
1361 			 */
1362 			if (wpipe->pipe_state & PIPE_WANTR) {
1363 				wpipe->pipe_state &= ~PIPE_WANTR;
1364 				wakeup(wpipe);
1365 			}
1366 
1367 			/*
1368 			 * don't block on non-blocking I/O
1369 			 */
1370 			if (fp->f_flag & FNONBLOCK) {
1371 				error = EAGAIN;
1372 				break;
1373 			}
1374 
1375 			/*
1376 			 * We have no more space and have something to offer,
1377 			 * wake up select/poll.
1378 			 */
1379 			pipeselwakeup(wpipe, wpipe);
1380 
1381 			wpipe->pipe_state |= PIPE_WANTW;
1382 			error = tsleep(wpipe, PRIBIO | PCATCH, "pipewr", 0);
1383 			if (error != 0)
1384 				break;
1385 			/*
1386 			 * If read side wants to go away, we just issue a signal
1387 			 * to ourselves.
1388 			 */
1389 			if (wpipe->pipe_state & PIPE_EOF) {
1390 				error = EPIPE;
1391 				break;
1392 			}
1393 		}
1394 	}
1395 
1396 	--wpipe->pipe_busy;
1397 	if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANTCLOSE)) {
1398 		wpipe->pipe_state &= ~(PIPE_WANTCLOSE | PIPE_WANTR);
1399 		wakeup(wpipe);
1400 	} else if (wpipe->pipe_buffer.cnt > 0) {
1401 		/*
1402 		 * If we have put any characters in the buffer, we wake up
1403 		 * the reader.
1404 		 */
1405 		if (wpipe->pipe_state & PIPE_WANTR) {
1406 			wpipe->pipe_state &= ~PIPE_WANTR;
1407 			wakeup(wpipe);
1408 		}
1409 	}
1410 
1411 	/*
1412 	 * Don't return EPIPE if I/O was successful
1413 	 */
1414 	if ((error == EPIPE) && (wpipe->pipe_buffer.cnt == 0)
1415 	    && (uio->uio_resid == 0))
1416 		error = 0;
1417 
1418 	if (error == 0)
1419 		vfs_timestamp(&wpipe->pipe_mtime);
1420 
1421 	/*
1422 	 * We have something to offer, wake up select/poll.
1423 	 * wpipe->pipe_map.cnt is always 0 in this point (direct write
1424 	 * is only done synchronously), so check only wpipe->pipe_buffer.cnt
1425 	 */
1426 	if (wpipe->pipe_buffer.cnt)
1427 		pipeselwakeup(wpipe, wpipe);
1428 
1429 	/*
1430 	 * Arrange for next read(2) to do a signal.
1431 	 */
1432 	wpipe->pipe_state |= PIPE_SIGNALR;
1433 
1434 	return (error);
1435 }
1436 
1437 /*
1438  * we implement a very minimal set of ioctls for compatibility with sockets.
1439  */
1440 int
1441 pipe_ioctl(fp, cmd, data, p)
1442 	struct file *fp;
1443 	u_long cmd;
1444 	caddr_t data;
1445 	struct proc *p;
1446 {
1447 	struct pipe *mpipe = (struct pipe *)fp->f_data;
1448 
1449 	switch (cmd) {
1450 
1451 	case FIONBIO:
1452 		return (0);
1453 
1454 	case FIOASYNC:
1455 		if (*(int *)data) {
1456 			mpipe->pipe_state |= PIPE_ASYNC;
1457 		} else {
1458 			mpipe->pipe_state &= ~PIPE_ASYNC;
1459 		}
1460 		return (0);
1461 
1462 	case FIONREAD:
1463 #ifndef PIPE_NODIRECT
1464 		if (mpipe->pipe_state & PIPE_DIRECTW)
1465 			*(int *)data = mpipe->pipe_map.cnt;
1466 		else
1467 #endif
1468 			*(int *)data = mpipe->pipe_buffer.cnt;
1469 		return (0);
1470 
1471 #ifdef __FreeBSD__
1472 	case FIOSETOWN:
1473 		return (fsetown(*(int *)data, &mpipe->pipe_sigio));
1474 
1475 	case FIOGETOWN:
1476 		*(int *)data = fgetown(mpipe->pipe_sigio);
1477 		return (0);
1478 
1479 	/* This is deprecated, FIOSETOWN should be used instead. */
1480 	case TIOCSPGRP:
1481 		return (fsetown(-(*(int *)data), &mpipe->pipe_sigio));
1482 
1483 	/* This is deprecated, FIOGETOWN should be used instead. */
1484 	case TIOCGPGRP:
1485 		*(int *)data = -fgetown(mpipe->pipe_sigio);
1486 		return (0);
1487 #endif /* FreeBSD */
1488 #ifdef __NetBSD__
1489 	case TIOCSPGRP:
1490 		mpipe->pipe_pgid = *(int *)data;
1491 		return (0);
1492 
1493 	case TIOCGPGRP:
1494 		*(int *)data = mpipe->pipe_pgid;
1495 		return (0);
1496 #endif /* NetBSD */
1497 
1498 	}
1499 	return (ENOTTY);
1500 }
1501 
1502 int
1503 pipe_poll(fp, events, p)
1504 	struct file *fp;
1505 	int events;
1506 	struct proc *p;
1507 {
1508 	struct pipe *rpipe = (struct pipe *)fp->f_data;
1509 	struct pipe *wpipe;
1510 	int revents = 0;
1511 
1512 	wpipe = rpipe->pipe_peer;
1513 	if (events & (POLLIN | POLLRDNORM))
1514 		if ((rpipe->pipe_buffer.cnt > 0) ||
1515 #ifndef PIPE_NODIRECT
1516 		    (rpipe->pipe_state & PIPE_DIRECTW) ||
1517 #endif
1518 		    (rpipe->pipe_state & PIPE_EOF))
1519 			revents |= events & (POLLIN | POLLRDNORM);
1520 
1521 	if (events & (POLLOUT | POLLWRNORM))
1522 		if (wpipe == NULL || (wpipe->pipe_state & PIPE_EOF)
1523 		    || (
1524 #ifndef PIPE_NODIRECT
1525 		     ((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
1526 #endif
1527 		     (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF))
1528 			revents |= events & (POLLOUT | POLLWRNORM);
1529 
1530 	if ((rpipe->pipe_state & PIPE_EOF) ||
1531 	    (wpipe == NULL) ||
1532 	    (wpipe->pipe_state & PIPE_EOF))
1533 		revents |= POLLHUP;
1534 
1535 	if (revents == 0) {
1536 		if (events & (POLLIN | POLLRDNORM)) {
1537 			selrecord(p, &rpipe->pipe_sel);
1538 			rpipe->pipe_state |= PIPE_SEL;
1539 		}
1540 
1541 		if (events & (POLLOUT | POLLWRNORM)) {
1542 			selrecord(p, &wpipe->pipe_sel);
1543 			wpipe->pipe_state |= PIPE_SEL;
1544 		}
1545 	}
1546 
1547 	return (revents);
1548 }
1549 
1550 static int
1551 pipe_stat(fp, ub, p)
1552 	struct file *fp;
1553 	struct stat *ub;
1554 	struct proc *p;
1555 {
1556 	struct pipe *pipe = (struct pipe *)fp->f_data;
1557 
1558 	memset((caddr_t)ub, 0, sizeof(*ub));
1559 	ub->st_mode = S_IFIFO;
1560 	ub->st_blksize = pipe->pipe_buffer.size;
1561 	ub->st_size = pipe->pipe_buffer.cnt;
1562 	ub->st_blocks = (ub->st_size) ? 1 : 0;
1563 #ifdef __FreeBSD__
1564 	ub->st_atimespec = pipe->pipe_atime;
1565 	ub->st_mtimespec = pipe->pipe_mtime;
1566 	ub->st_ctimespec = pipe->pipe_ctime;
1567 #endif /* FreeBSD */
1568 #ifdef __NetBSD__
1569 	TIMEVAL_TO_TIMESPEC(&pipe->pipe_atime, &ub->st_atimespec)
1570 	TIMEVAL_TO_TIMESPEC(&pipe->pipe_mtime, &ub->st_mtimespec);
1571 	TIMEVAL_TO_TIMESPEC(&pipe->pipe_ctime, &ub->st_ctimespec);
1572 #endif /* NetBSD */
1573 	ub->st_uid = fp->f_cred->cr_uid;
1574 	ub->st_gid = fp->f_cred->cr_gid;
1575 	/*
1576 	 * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen.
1577 	 * XXX (st_dev, st_ino) should be unique.
1578 	 */
1579 	return (0);
1580 }
1581 
1582 /* ARGSUSED */
1583 static int
1584 pipe_close(fp, p)
1585 	struct file *fp;
1586 	struct proc *p;
1587 {
1588 	struct pipe *cpipe = (struct pipe *)fp->f_data;
1589 
1590 #ifdef __FreeBSD__
1591 	fp->f_ops = &badfileops;
1592 	funsetown(cpipe->pipe_sigio);
1593 #endif
1594 	fp->f_data = NULL;
1595 	pipeclose(cpipe);
1596 	return (0);
1597 }
1598 
1599 static void
1600 pipe_free_kmem(cpipe)
1601 	struct pipe *cpipe;
1602 {
1603 
1604 #ifdef __FreeBSD__
1605 	mtx_assert(&vm_mtx, MA_OWNED);
1606 #endif
1607 	if (cpipe->pipe_buffer.buffer != NULL) {
1608 		if (cpipe->pipe_buffer.size > PIPE_SIZE)
1609 			--nbigpipe;
1610 		amountpipekva -= cpipe->pipe_buffer.size;
1611 #ifdef __FreeBSD__
1612 		kmem_free(kernel_map,
1613 			(vm_offset_t)cpipe->pipe_buffer.buffer,
1614 			cpipe->pipe_buffer.size);
1615 #elif defined(__NetBSD__)
1616 		uvm_km_free(kernel_map,
1617 			(vaddr_t)cpipe->pipe_buffer.buffer,
1618 			cpipe->pipe_buffer.size);
1619 #endif /* NetBSD */
1620 		cpipe->pipe_buffer.buffer = NULL;
1621 	}
1622 #ifndef PIPE_NODIRECT
1623 	if (cpipe->pipe_map.kva != NULL) {
1624 #ifdef __FreeBSD__
1625 		amountpipekva -= cpipe->pipe_buffer.size + PAGE_SIZE;
1626 		kmem_free(kernel_map,
1627 			cpipe->pipe_map.kva,
1628 			cpipe->pipe_buffer.size + PAGE_SIZE);
1629 #elif defined(__NetBSD__)
1630 		pipe_loan_free(cpipe);
1631 #endif /* NetBSD */
1632 		cpipe->pipe_map.cnt = 0;
1633 		cpipe->pipe_map.kva = NULL;
1634 		cpipe->pipe_map.pos = 0;
1635 		cpipe->pipe_map.npages = 0;
1636 	}
1637 #endif /* !PIPE_NODIRECT */
1638 }
1639 
1640 /*
1641  * shutdown the pipe
1642  */
1643 static void
1644 pipeclose(cpipe)
1645 	struct pipe *cpipe;
1646 {
1647 	struct pipe *ppipe;
1648 
1649 	if (!cpipe)
1650 		return;
1651 
1652 	pipeselwakeup(cpipe, cpipe);
1653 
1654 	/*
1655 	 * If the other side is blocked, wake it up saying that
1656 	 * we want to close it down.
1657 	 */
1658 	while (cpipe->pipe_busy) {
1659 		wakeup(cpipe);
1660 		cpipe->pipe_state |= PIPE_WANTCLOSE | PIPE_EOF;
1661 		tsleep(cpipe, PRIBIO, "pipecl", 0);
1662 	}
1663 
1664 	/*
1665 	 * Disconnect from peer
1666 	 */
1667 	if ((ppipe = cpipe->pipe_peer) != NULL) {
1668 		pipeselwakeup(ppipe, ppipe);
1669 
1670 		ppipe->pipe_state |= PIPE_EOF;
1671 		wakeup(ppipe);
1672 		ppipe->pipe_peer = NULL;
1673 	}
1674 
1675 	/*
1676 	 * free resources
1677 	 */
1678 #ifdef __FreeBSD__
1679 	mtx_lock(&vm_mtx);
1680 	pipe_free_kmem(cpipe);
1681 	/* XXX: erm, doesn't zalloc already have its own locks and
1682 	 * not need the giant vm lock?
1683 	 */
1684 	zfree(pipe_zone, cpipe);
1685 	mtx_unlock(&vm_mtx);
1686 #endif /* FreeBSD */
1687 
1688 #ifdef __NetBSD__
1689 	pipe_free_kmem(cpipe);
1690 	(void) lockmgr(&cpipe->pipe_lock, LK_DRAIN, NULL);
1691 	pool_put(&pipe_pool, cpipe);
1692 #endif
1693 }
1694 
1695 #ifdef __FreeBSD__
1696 /*ARGSUSED*/
1697 static int
1698 pipe_kqfilter(struct file *fp, struct knote *kn)
1699 {
1700 	struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data;
1701 
1702 	switch (kn->kn_filter) {
1703 	case EVFILT_READ:
1704 		kn->kn_fop = &pipe_rfiltops;
1705 		break;
1706 	case EVFILT_WRITE:
1707 		kn->kn_fop = &pipe_wfiltops;
1708 		cpipe = cpipe->pipe_peer;
1709 		break;
1710 	default:
1711 		return (1);
1712 	}
1713 	kn->kn_hook = (caddr_t)cpipe;
1714 	SLIST_INSERT_HEAD(&cpipe->pipe_sel.si_note, kn, kn_selnext);
1715 	return (0);
1716 }
1717 
1718 static void
1719 filt_pipedetach(struct knote *kn)
1720 {
1721 	struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data;
1722 
1723 	SLIST_REMOVE(&cpipe->pipe_sel.si_note, kn, knote, kn_selnext);
1724 }
1725 
1726 /*ARGSUSED*/
1727 static int
1728 filt_piperead(struct knote *kn, long hint)
1729 {
1730 	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
1731 	struct pipe *wpipe = rpipe->pipe_peer;
1732 
1733 	kn->kn_data = rpipe->pipe_buffer.cnt;
1734 	if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW))
1735 		kn->kn_data = rpipe->pipe_map.cnt;
1736 
1737 	if ((rpipe->pipe_state & PIPE_EOF) ||
1738 	    (wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
1739 		kn->kn_flags |= EV_EOF;
1740 		return (1);
1741 	}
1742 	return (kn->kn_data > 0);
1743 }
1744 
1745 /*ARGSUSED*/
1746 static int
1747 filt_pipewrite(struct knote *kn, long hint)
1748 {
1749 	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
1750 	struct pipe *wpipe = rpipe->pipe_peer;
1751 
1752 	if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
1753 		kn->kn_data = 0;
1754 		kn->kn_flags |= EV_EOF;
1755 		return (1);
1756 	}
1757 	kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
1758 	if (wpipe->pipe_state & PIPE_DIRECTW)
1759 		kn->kn_data = 0;
1760 
1761 	return (kn->kn_data >= PIPE_BUF);
1762 }
1763 #endif /* FreeBSD */
1764 
1765 #ifdef __NetBSD__
1766 static int
1767 pipe_fcntl(fp, cmd, data, p)
1768 	struct file *fp;
1769 	u_int cmd;
1770 	caddr_t data;
1771 	struct proc *p;
1772 {
1773 	if (cmd == F_SETFL)
1774 		return (0);
1775 	else
1776 		return (EOPNOTSUPP);
1777 }
1778 
1779 /*
1780  * Handle pipe sysctls.
1781  */
1782 int
1783 sysctl_dopipe(name, namelen, oldp, oldlenp, newp, newlen)
1784 	int *name;
1785 	u_int namelen;
1786 	void *oldp;
1787 	size_t *oldlenp;
1788 	void *newp;
1789 	size_t newlen;
1790 {
1791 	/* All sysctl names at this level are terminal. */
1792 	if (namelen != 1)
1793 		return (ENOTDIR);		/* overloaded */
1794 
1795 	switch (name[0]) {
1796 	case KERN_PIPE_MAXKVASZ:
1797 		return (sysctl_int(oldp, oldlenp, newp, newlen, &maxpipekva));
1798 	case KERN_PIPE_LIMITKVA:
1799 		return (sysctl_int(oldp, oldlenp, newp, newlen, &limitpipekva));
1800 	case KERN_PIPE_MAXBIGPIPES:
1801 		return (sysctl_int(oldp, oldlenp, newp, newlen, &maxbigpipes));
1802 	case KERN_PIPE_NBIGPIPES:
1803 		return (sysctl_rdint(oldp, oldlenp, newp, nbigpipe));
1804 	case KERN_PIPE_KVASIZE:
1805 		return (sysctl_rdint(oldp, oldlenp, newp, amountpipekva));
1806 	default:
1807 		return (EOPNOTSUPP);
1808 	}
1809 	/* NOTREACHED */
1810 }
1811 
1812 /*
1813  * Initialize pipe structs.
1814  */
1815 void
1816 pipe_init(void)
1817 {
1818 	pool_init(&pipe_pool, sizeof(struct pipe), 0, 0, 0, "pipepl",
1819 		0, NULL, NULL, M_PIPE);
1820 }
1821 
1822 #endif /* __NetBSD __ */
1823