/*	$NetBSD: sys_pipe.c,v 1.11 2001/07/26 14:14:28 jdolecek Exp $	*/

/*
 * Copyright (c) 1996 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 * 4. Modifications may be freely made to this file if the above conditions
 *    are met.
 *
 * $FreeBSD: src/sys/kern/sys_pipe.c,v 1.82 2001/06/15 20:45:01 jlemon Exp $
 */

/*
 * This file contains a high-performance replacement for the socket-based
 * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
 * all features of sockets, but does do everything that pipes normally
 * do.
 *
 * Adaptation for NetBSD UVM, including uvm_loan() based direct write, was
 * written by Jaromir Dolecek.
 */

/*
 * This code has two modes of operation: a small write mode and a large
 * write mode.  The small write mode acts like conventional pipes with
 * a kernel buffer.  If the buffer is less than PIPE_MINDIRECT, then the
 * "normal" pipe buffering is done.  If the buffer is between PIPE_MINDIRECT
 * and PIPE_SIZE in size, it is fully mapped into the kernel (on FreeBSD,
 * those pages are also wired), and the receiving process can copy it
 * directly from the pages in the sending process.
 *
 * If the sending process receives a signal, it is possible that it will
 * go away, and certainly its address space can change, because control
 * is returned back to the user-mode side.  In that case, the pipe code
 * on FreeBSD arranges to copy the buffer supplied by the user process
 * to a pageable kernel buffer, and the receiving process will grab the
 * data from the pageable kernel buffer.  Since signals don't happen all
 * that often, the copy operation is normally eliminated.
 * On NetBSD, the pages are mapped read-only, copy-on-write for the kernel,
 * by uvm_loan(), so no explicit handling needs to be done; everything is
 * handled by the standard VM facilities.
 *
 * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
 * happen for small transfers so that the system will not spend all of
 * its time context switching.  PIPE_SIZE is constrained by the
 * amount of kernel virtual memory.
 */
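
/*
 * Illustrative userland sketch (not part of this file): per the comment
 * above, the choice between the buffered and the direct path is made per
 * write(2), based on the transfer size, and is transparent to the
 * application.  Writes of at least PIPE_MINDIRECT bytes may take the
 * direct path; both calls below look identical to the caller.
 */
#if 0
#include <unistd.h>

static void
example(int wfd, const char *small, size_t slen,
    const char *big, size_t blen)
{
	(void)write(wfd, small, slen);	/* small: copied via kernel buffer */
	(void)write(wfd, big, blen);	/* large: may be mapped/loaned directly */
}
#endif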

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/ttycom.h>
#include <sys/stat.h>
#include <sys/poll.h>
#include <sys/signalvar.h>
#include <sys/vnode.h>
#include <sys/uio.h>
#include <sys/lock.h>
#ifdef __FreeBSD__
#include <sys/mutex.h>
#include <sys/selinfo.h>
#include <sys/sysproto.h>
#elif defined(__NetBSD__)
#include <sys/select.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <uvm/uvm.h>
#include <sys/sysctl.h>
#endif /* NetBSD, FreeBSD */

#include <sys/pipe.h>

#ifdef __NetBSD__
#define vfs_timestamp(tv)	microtime(tv)
#endif

/*
 * Use this define if you want to disable *fancy* VM things.  Expect an
 * approx 30% decrease in transfer rate.  This could be useful for
 * OpenBSD.
 */
/* #define PIPE_NODIRECT */

/*
 * interfaces to the outside world
 */
#ifdef __FreeBSD__
static int pipe_read __P((struct file *fp, struct uio *uio,
		struct ucred *cred, int flags, struct proc *p));
static int pipe_write __P((struct file *fp, struct uio *uio,
		struct ucred *cred, int flags, struct proc *p));
static int pipe_close __P((struct file *fp, struct proc *p));
static int pipe_poll __P((struct file *fp, int events, struct ucred *cred,
		struct proc *p));
static int pipe_kqfilter __P((struct file *fp, struct knote *kn));
static int pipe_stat __P((struct file *fp, struct stat *sb, struct proc *p));
static int pipe_ioctl __P((struct file *fp, u_long cmd, caddr_t data,
		struct proc *p));

static struct fileops pipeops = {
	pipe_read, pipe_write, pipe_ioctl, pipe_poll, pipe_kqfilter,
	pipe_stat, pipe_close
};

static void	filt_pipedetach(struct knote *kn);
static int	filt_piperead(struct knote *kn, long hint);
static int	filt_pipewrite(struct knote *kn, long hint);

static struct filterops pipe_rfiltops =
	{ 1, NULL, filt_pipedetach, filt_piperead };
static struct filterops pipe_wfiltops =
	{ 1, NULL, filt_pipedetach, filt_pipewrite };
#endif /* FreeBSD */

#ifdef __NetBSD__
static int pipe_read __P((struct file *fp, off_t *offset, struct uio *uio,
		struct ucred *cred, int flags));
static int pipe_write __P((struct file *fp, off_t *offset, struct uio *uio,
		struct ucred *cred, int flags));
static int pipe_close __P((struct file *fp, struct proc *p));
static int pipe_poll __P((struct file *fp, int events, struct proc *p));
static int pipe_fcntl __P((struct file *fp, u_int com, caddr_t data,
		struct proc *p));
static int pipe_stat __P((struct file *fp, struct stat *sb, struct proc *p));
static int pipe_ioctl __P((struct file *fp, u_long cmd, caddr_t data,
		struct proc *p));

static struct fileops pipeops =
    { pipe_read, pipe_write, pipe_ioctl, pipe_fcntl, pipe_poll,
      pipe_stat, pipe_close };
#endif /* NetBSD */

/*
 * Default pipe buffer size(s); this can be fairly large now because pipe
 * space is pageable.  The pipe code will try to maintain locality of
 * reference for performance reasons, so small amounts of outstanding I/O
 * will not wipe the cache.
 */
#define MINPIPESIZE (PIPE_SIZE/3)
#define MAXPIPESIZE (2*PIPE_SIZE/3)

/*
 * Maximum amount of kva for pipes -- this is a soft limit, but
 * is there so that on large systems, we don't exhaust it.
 */
#define MAXPIPEKVA (8*1024*1024)
static int maxpipekva = MAXPIPEKVA;

/*
 * Limit for direct transfers; we cannot, of course, limit
 * the amount of kva for pipes in general, though.
 */
#define LIMITPIPEKVA (16*1024*1024)
static int limitpipekva = LIMITPIPEKVA;

/*
 * Limit the number of "big" pipes
 */
#define LIMITBIGPIPES  32
static int maxbigpipes = LIMITBIGPIPES;
static int nbigpipe = 0;

/*
 * Amount of KVA consumed by pipe buffers.
 */
static int amountpipekva = 0;

static void pipeclose __P((struct pipe *cpipe));
static void pipe_free_kmem __P((struct pipe *cpipe));
static int pipe_create __P((struct pipe **cpipep, int allockva));
static __inline int pipelock __P((struct pipe *cpipe, int catch));
static __inline void pipeunlock __P((struct pipe *cpipe));
static __inline void pipeselwakeup __P((struct pipe *selp,
			struct pipe *sigp));
static int pipespace __P((struct pipe *cpipe, int size));

#ifdef __FreeBSD__
#ifndef PIPE_NODIRECT
static int pipe_build_write_buffer __P((struct pipe *wpipe, struct uio *uio));
static void pipe_destroy_write_buffer __P((struct pipe *wpipe));
static int pipe_direct_write __P((struct pipe *wpipe, struct uio *uio));
static void pipe_clone_write_buffer __P((struct pipe *wpipe));
#endif

static vm_zone_t pipe_zone;
#endif /* FreeBSD */

#ifdef __NetBSD__
#ifndef PIPE_NODIRECT
static __inline int pipe_direct_write __P((struct pipe *wpipe,
		struct uio *uio));
static __inline int pipe_loan_alloc __P((struct pipe *wpipe, int npages,
						vsize_t blen));
static void pipe_loan_free __P((struct pipe *wpipe));
#endif /* PIPE_NODIRECT */

static struct pool pipe_pool;
#endif /* NetBSD */

/*
 * The pipe system call for the DTYPE_PIPE type of pipes
 */

/* ARGSUSED */
#ifdef __FreeBSD__
int
pipe(p, uap)
	struct proc *p;
	struct pipe_args /* {
		int	dummy;
	} */ *uap;
#elif defined(__NetBSD__)
int
sys_pipe(p, v, retval)
	struct proc *p;
	void *v;
	register_t *retval;
#endif
{
	struct file *rf, *wf;
	struct pipe *rpipe, *wpipe;
	int fd, error;

#ifdef __FreeBSD__
	if (pipe_zone == NULL)
		pipe_zone = zinit("PIPE", sizeof(struct pipe), 0, 0, 4);

	rpipe = wpipe = NULL;
	if (pipe_create(&rpipe, 1) || pipe_create(&wpipe, 1)) {
		pipeclose(rpipe);
		pipeclose(wpipe);
		return (ENFILE);
	}

	error = falloc(p, &rf, &fd);
	if (error) {
		pipeclose(rpipe);
		pipeclose(wpipe);
		return (error);
	}
	fhold(rf);
	p->p_retval[0] = fd;

	/*
	 * Warning: once we've gotten past allocation of the fd for the
	 * read-side, we can only drop the read side via fdrop() in order
	 * to avoid races against processes which manage to dup() the read
	 * side while we are blocked trying to allocate the write side.
	 */
	rf->f_flag = FREAD | FWRITE;
	rf->f_type = DTYPE_PIPE;
	rf->f_data = (caddr_t)rpipe;
	rf->f_ops = &pipeops;
	error = falloc(p, &wf, &fd);
	if (error) {
		struct filedesc *fdp = p->p_fd;

		if (fdp->fd_ofiles[p->p_retval[0]] == rf) {
			fdp->fd_ofiles[p->p_retval[0]] = NULL;
			fdrop(rf, p);
		}
		fdrop(rf, p);
		/* rpipe has been closed by fdrop(). */
		pipeclose(wpipe);
		return (error);
	}
	wf->f_flag = FREAD | FWRITE;
	wf->f_type = DTYPE_PIPE;
	wf->f_data = (caddr_t)wpipe;
	wf->f_ops = &pipeops;
	p->p_retval[1] = fd;

	rpipe->pipe_peer = wpipe;
	wpipe->pipe_peer = rpipe;
	fdrop(rf, p);
#endif /* FreeBSD */

#ifdef __NetBSD__
	rpipe = wpipe = NULL;
	if (pipe_create(&rpipe, 1) || pipe_create(&wpipe, 0)) {
		pipeclose(rpipe);
		pipeclose(wpipe);
		return (ENFILE);
	}

	/*
	 * Note: the file structure returned from falloc() is marked
	 * as 'larval' initially. Unless we mark it as 'mature' by
	 * FILE_SET_MATURE(), any attempt to do anything with it would
	 * return EBADF, including e.g. dup(2) or close(2). This avoids
	 * file descriptor races if we block in the second falloc().
	 */

	error = falloc(p, &rf, &fd);
	if (error)
		goto free2;
	retval[0] = fd;
	rf->f_flag = FREAD;
	rf->f_type = DTYPE_PIPE;
	rf->f_data = (caddr_t)rpipe;
	rf->f_ops = &pipeops;

	error = falloc(p, &wf, &fd);
	if (error)
		goto free3;
	retval[1] = fd;
	wf->f_flag = FWRITE;
	wf->f_type = DTYPE_PIPE;
	wf->f_data = (caddr_t)wpipe;
	wf->f_ops = &pipeops;

	rpipe->pipe_peer = wpipe;
	wpipe->pipe_peer = rpipe;

	FILE_SET_MATURE(rf);
	FILE_SET_MATURE(wf);
	FILE_UNUSE(rf, p);
	FILE_UNUSE(wf, p);
	return (0);
free3:
	FILE_UNUSE(rf, p);
	ffree(rf);
	fdremove(p->p_fd, retval[0]);
free2:
	pipeclose(wpipe);
	pipeclose(rpipe);
#endif /* NetBSD */

	return (error);
}
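
/*
 * Illustrative userland sketch (not part of this file): typical use of
 * the system call implemented above -- create a pipe, then move data
 * from the write end to the read end.
 */
#if 0
#include <unistd.h>

static int
example(void)
{
	int fds[2];
	char buf[5];

	if (pipe(fds) == -1)		/* fds[0]: read end, fds[1]: write end */
		return (-1);
	(void)write(fds[1], "hello", 5);
	(void)read(fds[0], buf, 5);	/* reads back "hello" */
	(void)close(fds[0]);
	(void)close(fds[1]);
	return (0);
}
#endif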

/*
 * Allocate kva for the pipe's circular buffer; the space is pageable.
 * This routine will 'realloc' the size of a pipe safely: if it fails,
 * it retains the old buffer and returns ENOMEM.
 */
static int
pipespace(cpipe, size)
	struct pipe *cpipe;
	int size;
{
	caddr_t buffer;
#ifdef __FreeBSD__
	struct vm_object *object;
	int npages, error;

	npages = round_page(size)/PAGE_SIZE;
	/*
	 * Create an object, I don't like the idea of paging to/from
	 * kernel_object.
	 */
	mtx_lock(&vm_mtx);
	object = vm_object_allocate(OBJT_DEFAULT, npages);
	buffer = (caddr_t) vm_map_min(kernel_map);

	/*
	 * Insert the object into the kernel map, and allocate kva for it.
	 * The map entry is, by default, pageable.
	 */
	error = vm_map_find(kernel_map, object, 0,
		(vm_offset_t *) &buffer, size, 1,
		VM_PROT_ALL, VM_PROT_ALL, 0);

	if (error != KERN_SUCCESS) {
		vm_object_deallocate(object);
		mtx_unlock(&vm_mtx);
		return (ENOMEM);
	}
#endif /* FreeBSD */

#ifdef __NetBSD__
	/*
	 * Allocate pageable virtual address space.  Physical memory
	 * is allocated on demand.
	 */
	buffer = (caddr_t) uvm_km_valloc(kernel_map, round_page(size));
	if (buffer == NULL)
		return (ENOMEM);
#endif /* NetBSD */

	/* free old resources if we're resizing */
	pipe_free_kmem(cpipe);
#ifdef __FreeBSD__
	mtx_unlock(&vm_mtx);
	cpipe->pipe_buffer.object = object;
#endif
	cpipe->pipe_buffer.buffer = buffer;
	cpipe->pipe_buffer.size = size;
	cpipe->pipe_buffer.in = 0;
	cpipe->pipe_buffer.out = 0;
	cpipe->pipe_buffer.cnt = 0;
	amountpipekva += cpipe->pipe_buffer.size;
	return (0);
}

/*
 * initialize and allocate VM and memory for pipe
 */
static int
pipe_create(cpipep, allockva)
	struct pipe **cpipep;
	int allockva;
{
	struct pipe *cpipe;
	int error;

#ifdef __FreeBSD__
	*cpipep = zalloc(pipe_zone);
#endif
#ifdef __NetBSD__
	*cpipep = pool_get(&pipe_pool, M_WAITOK);
#endif
	if (*cpipep == NULL)
		return (ENOMEM);

	cpipe = *cpipep;

	/* Initialize */
	memset(cpipe, 0, sizeof(*cpipe));
	cpipe->pipe_state = PIPE_SIGNALR;

	if (allockva && (error = pipespace(cpipe, PIPE_SIZE)))
		return (error);

	vfs_timestamp(&cpipe->pipe_ctime);
	cpipe->pipe_atime = cpipe->pipe_ctime;
	cpipe->pipe_mtime = cpipe->pipe_ctime;
#ifdef __NetBSD__
	cpipe->pipe_pgid = NO_PID;
	lockinit(&cpipe->pipe_lock, PRIBIO | PCATCH, "pipelk", 0, 0);
#endif

	return (0);
}


/*
 * lock a pipe for I/O, blocking other access
 */
static __inline int
pipelock(cpipe, catch)
	struct pipe *cpipe;
	int catch;
{
	int error;

#ifdef __FreeBSD__
	while (cpipe->pipe_state & PIPE_LOCK) {
		cpipe->pipe_state |= PIPE_LWANT;
		error = tsleep(cpipe, catch ? (PRIBIO | PCATCH) : PRIBIO,
		    "pipelk", 0);
		if (error != 0)
			return (error);
	}
	cpipe->pipe_state |= PIPE_LOCK;
	return (0);
#endif

#ifdef __NetBSD__
	do {
		error = lockmgr(&cpipe->pipe_lock, LK_EXCLUSIVE, NULL);
	} while (!catch && (error == EINTR || error == ERESTART));
	return (error);
#endif
}

/*
 * unlock a pipe I/O lock
 */
static __inline void
pipeunlock(cpipe)
	struct pipe *cpipe;
{
#ifdef __FreeBSD__
	cpipe->pipe_state &= ~PIPE_LOCK;
	if (cpipe->pipe_state & PIPE_LWANT) {
		cpipe->pipe_state &= ~PIPE_LWANT;
		wakeup(cpipe);
	}
#endif

#ifdef __NetBSD__
	lockmgr(&cpipe->pipe_lock, LK_RELEASE, NULL);
#endif
}

/*
 * Select/poll wakeup.  This also sends SIGIO to the process or process
 * group registered on the 'sigp' side of the pipe.
 */
static __inline void
pipeselwakeup(selp, sigp)
	struct pipe *selp, *sigp;
{
	if (selp->pipe_state & PIPE_SEL) {
		selp->pipe_state &= ~PIPE_SEL;
		selwakeup(&selp->pipe_sel);
	}
#ifdef __FreeBSD__
	if (sigp && (sigp->pipe_state & PIPE_ASYNC) && sigp->pipe_sigio)
		pgsigio(sigp->pipe_sigio, SIGIO, 0);
	KNOTE(&selp->pipe_sel.si_note, 0);
#endif

#ifdef __NetBSD__
	if (sigp && (sigp->pipe_state & PIPE_ASYNC)
	    && sigp->pipe_pgid != NO_PID) {
		struct proc *p;

		if (sigp->pipe_pgid < 0)
			gsignal(-sigp->pipe_pgid, SIGIO);
		else if (sigp->pipe_pgid > 0 &&
		    (p = pfind(sigp->pipe_pgid)) != NULL)
			psignal(p, SIGIO);
	}
#endif /* NetBSD */
}

/* ARGSUSED */
#ifdef __FreeBSD__
static int
pipe_read(fp, uio, cred, flags, p)
	struct file *fp;
	struct uio *uio;
	struct ucred *cred;
	int flags;
	struct proc *p;
#elif defined(__NetBSD__)
static int
pipe_read(fp, offset, uio, cred, flags)
	struct file *fp;
	off_t *offset;
	struct uio *uio;
	struct ucred *cred;
	int flags;
#endif
{
	struct pipe *rpipe = (struct pipe *) fp->f_data;
	int error;
	size_t nread = 0;
	size_t size;
	size_t ocnt;

	++rpipe->pipe_busy;
	error = pipelock(rpipe, 1);
	if (error)
		goto unlocked_error;

	ocnt = rpipe->pipe_buffer.cnt;

	while (uio->uio_resid) {
		/*
		 * normal pipe buffer receive
		 */
		if (rpipe->pipe_buffer.cnt > 0) {
			size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
			if (size > rpipe->pipe_buffer.cnt)
				size = rpipe->pipe_buffer.cnt;
			if (size > uio->uio_resid)
				size = uio->uio_resid;

			error = uiomove(&rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
					size, uio);
			if (error)
				break;

			rpipe->pipe_buffer.out += size;
			if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
				rpipe->pipe_buffer.out = 0;

			rpipe->pipe_buffer.cnt -= size;

			/*
			 * If there is no more to read in the pipe, reset
			 * its pointers to the beginning.  This improves
			 * cache hit stats.
			 */
			if (rpipe->pipe_buffer.cnt == 0) {
				rpipe->pipe_buffer.in = 0;
				rpipe->pipe_buffer.out = 0;
			}
			nread += size;
#ifndef PIPE_NODIRECT
		/*
		 * Direct copy, bypassing a kernel buffer.
		 */
		} else if ((size = rpipe->pipe_map.cnt) &&
			   (rpipe->pipe_state & PIPE_DIRECTW)) {
			caddr_t	va;

			if (size > uio->uio_resid)
				size = uio->uio_resid;

			va = (caddr_t) rpipe->pipe_map.kva +
			    rpipe->pipe_map.pos;
			error = uiomove(va, size, uio);
			if (error)
				break;
			nread += size;
			rpipe->pipe_map.pos += size;
			rpipe->pipe_map.cnt -= size;
			if (rpipe->pipe_map.cnt == 0) {
				rpipe->pipe_state &= ~PIPE_DIRECTW;
				wakeup(rpipe);
			}
#endif
		} else {
			/*
			 * detect EOF condition
			 * read returns 0 on EOF, no need to set error
			 */
			if (rpipe->pipe_state & PIPE_EOF)
				break;

			/*
			 * If the "write-side" has been blocked, wake it up now.
			 */
			if (rpipe->pipe_state & PIPE_WANTW) {
				rpipe->pipe_state &= ~PIPE_WANTW;
				wakeup(rpipe);
			}

			/*
			 * Break if some data was read.
			 */
			if (nread > 0)
				break;

			/*
			 * don't block on non-blocking I/O
			 */
			if (fp->f_flag & FNONBLOCK) {
				error = EAGAIN;
				break;
			}

			/*
			 * Unlock the pipe buffer for our remaining processing.
			 * We will either break out with an error or we will
			 * sleep and relock to loop.
			 */
			pipeunlock(rpipe);

			/*
			 * We want to read more, wake up select/poll.
			 */
			pipeselwakeup(rpipe, rpipe->pipe_peer);

			rpipe->pipe_state |= PIPE_WANTR;
			error = tsleep(rpipe, PRIBIO | PCATCH, "piperd", 0);
			if (error != 0 || (error = pipelock(rpipe, 1)))
				goto unlocked_error;
		}
	}
	pipeunlock(rpipe);

	if (error == 0)
		vfs_timestamp(&rpipe->pipe_atime);
unlocked_error:
	--rpipe->pipe_busy;

	/*
	 * PIPE_WANTCLOSE processing only makes sense if pipe_busy is 0.
	 */
	if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANTCLOSE)) {
		rpipe->pipe_state &= ~(PIPE_WANTCLOSE|PIPE_WANTW);
		wakeup(rpipe);
	} else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) {
		/*
		 * Handle write blocking hysteresis.
		 */
		if (rpipe->pipe_state & PIPE_WANTW) {
			rpipe->pipe_state &= ~PIPE_WANTW;
			wakeup(rpipe);
		}
	}

	/*
	 * If anything was read off the buffer, signal to the writer that
	 * it's possible to write more data.  Also send a signal if we are
	 * here for the first time after the last write.
	 */
	if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF
	    && (ocnt != rpipe->pipe_buffer.cnt ||
		(rpipe->pipe_state & PIPE_SIGNALR))) {
		pipeselwakeup(rpipe, rpipe->pipe_peer);
		rpipe->pipe_state &= ~PIPE_SIGNALR;
	}

	return (error);
}
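
/*
 * Illustrative userland sketch (not part of this file): the read-side
 * semantics implemented above.  A read on an empty pipe whose write end
 * has been closed returns 0 (EOF); with O_NONBLOCK set and the write end
 * still open, it fails with EAGAIN instead of blocking.
 */
#if 0
#include <fcntl.h>
#include <unistd.h>

static void
example(void)
{
	int fds[2];
	char c;
	ssize_t n;

	(void)pipe(fds);
	(void)fcntl(fds[0], F_SETFL, O_NONBLOCK);
	n = read(fds[0], &c, 1);	/* -1, errno == EAGAIN: empty pipe */
	(void)close(fds[1]);
	n = read(fds[0], &c, 1);	/* 0: EOF, write end is gone */
	(void)close(fds[0]);
}
#endif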

#ifdef __FreeBSD__
#ifndef PIPE_NODIRECT
/*
 * Map the sending process's buffer into kernel space and wire it.
 * This is similar to a physical write operation.
 */
static int
pipe_build_write_buffer(wpipe, uio)
	struct pipe *wpipe;
	struct uio *uio;
{
	size_t size;
	int i;
	vm_offset_t addr, endaddr, paddr;

	size = uio->uio_iov->iov_len;
	if (size > wpipe->pipe_buffer.size)
		size = wpipe->pipe_buffer.size;

	endaddr = round_page((vm_offset_t)uio->uio_iov->iov_base + size);
	mtx_lock(&vm_mtx);
	addr = trunc_page((vm_offset_t)uio->uio_iov->iov_base);
	for (i = 0; addr < endaddr; addr += PAGE_SIZE, i++) {
		vm_page_t m;

		if (vm_fault_quick((caddr_t)addr, VM_PROT_READ) < 0 ||
		    (paddr = pmap_kextract(addr)) == 0) {
			int j;

			for (j = 0; j < i; j++)
				vm_page_unwire(wpipe->pipe_map.ms[j], 1);
			mtx_unlock(&vm_mtx);
			return (EFAULT);
		}

		m = PHYS_TO_VM_PAGE(paddr);
		vm_page_wire(m);
		wpipe->pipe_map.ms[i] = m;
	}

	/*
	 * set up the control block
	 */
	wpipe->pipe_map.npages = i;
	wpipe->pipe_map.pos =
	    ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK;
	wpipe->pipe_map.cnt = size;

	/*
	 * and map the buffer
	 */
	if (wpipe->pipe_map.kva == 0) {
		/*
		 * We need to allocate space for an extra page because the
		 * address range might (will) span pages at times.
		 */
		wpipe->pipe_map.kva = kmem_alloc_pageable(kernel_map,
			wpipe->pipe_buffer.size + PAGE_SIZE);
		amountpipekva += wpipe->pipe_buffer.size + PAGE_SIZE;
	}
	pmap_qenter(wpipe->pipe_map.kva, wpipe->pipe_map.ms,
		wpipe->pipe_map.npages);

	mtx_unlock(&vm_mtx);

	/*
	 * and update the uio data
	 */
	uio->uio_iov->iov_len -= size;
	uio->uio_iov->iov_base += size;
	if (uio->uio_iov->iov_len == 0)
		uio->uio_iov++;
	uio->uio_resid -= size;
	uio->uio_offset += size;
	return (0);
}

/*
 * unmap and unwire the process buffer
 */
static void
pipe_destroy_write_buffer(wpipe)
	struct pipe *wpipe;
{
	int i;

	mtx_lock(&vm_mtx);
	if (wpipe->pipe_map.kva) {
		pmap_qremove(wpipe->pipe_map.kva, wpipe->pipe_map.npages);

		if (amountpipekva > maxpipekva) {
			vm_offset_t kva = wpipe->pipe_map.kva;

			wpipe->pipe_map.kva = 0;
			kmem_free(kernel_map, kva,
				wpipe->pipe_buffer.size + PAGE_SIZE);
			amountpipekva -= wpipe->pipe_buffer.size + PAGE_SIZE;
		}
	}
	for (i = 0; i < wpipe->pipe_map.npages; i++)
		vm_page_unwire(wpipe->pipe_map.ms[i], 1);
	mtx_unlock(&vm_mtx);
}

/*
 * In the case of a signal, the writing process might go away.  This
 * code copies the data into the circular buffer so that the source
 * pages can be freed without loss of data.
 */
static void
pipe_clone_write_buffer(wpipe)
	struct pipe *wpipe;
{
	int size;
	int pos;

	size = wpipe->pipe_map.cnt;
	pos = wpipe->pipe_map.pos;
	memcpy((caddr_t) wpipe->pipe_buffer.buffer,
	    (caddr_t) wpipe->pipe_map.kva + pos, size);

	wpipe->pipe_buffer.in = size;
	wpipe->pipe_buffer.out = 0;
	wpipe->pipe_buffer.cnt = size;
	wpipe->pipe_state &= ~PIPE_DIRECTW;

	pipe_destroy_write_buffer(wpipe);
}

/*
 * This implements the pipe buffer write mechanism.  Note that only
 * a direct write OR a normal pipe write can be pending at any given time.
 * If there are any characters in the pipe buffer, the direct write will
 * be deferred until the receiving process grabs all of the bytes from
 * the pipe buffer.  Then the direct mapping write is set up.
 */
static int
pipe_direct_write(wpipe, uio)
	struct pipe *wpipe;
	struct uio *uio;
{
	int error;

retry:
	while (wpipe->pipe_state & PIPE_DIRECTW) {
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
		wpipe->pipe_state |= PIPE_WANTW;
		error = tsleep(wpipe, PRIBIO | PCATCH, "pipdww", 0);
		if (error)
			goto error1;
		if (wpipe->pipe_state & PIPE_EOF) {
			error = EPIPE;
			goto error1;
		}
	}
	wpipe->pipe_map.cnt = 0;	/* transfer not ready yet */
	if (wpipe->pipe_buffer.cnt > 0) {
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}

		wpipe->pipe_state |= PIPE_WANTW;
		error = tsleep(wpipe, PRIBIO | PCATCH, "pipdwc", 0);
		if (error)
			goto error1;
		if (wpipe->pipe_state & PIPE_EOF) {
			error = EPIPE;
			goto error1;
		}
		goto retry;
	}

	wpipe->pipe_state |= PIPE_DIRECTW;

	error = pipe_build_write_buffer(wpipe, uio);
	if (error) {
		wpipe->pipe_state &= ~PIPE_DIRECTW;
		goto error1;
	}

	error = 0;
	while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
		if (wpipe->pipe_state & PIPE_EOF) {
			pipelock(wpipe, 0);
			pipe_destroy_write_buffer(wpipe);
			pipeunlock(wpipe);
			pipeselwakeup(wpipe, wpipe);
			error = EPIPE;
			goto error1;
		}
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
		pipeselwakeup(wpipe, wpipe);
		error = tsleep(wpipe, PRIBIO | PCATCH, "pipdwt", 0);
	}

	pipelock(wpipe, 0);
	if (wpipe->pipe_state & PIPE_DIRECTW) {
		/*
		 * this bit of trickery substitutes a kernel buffer for
		 * the process that might be going away.
		 */
		pipe_clone_write_buffer(wpipe);
	} else {
		pipe_destroy_write_buffer(wpipe);
	}
	pipeunlock(wpipe);
	return (error);

error1:
	wakeup(wpipe);
	return (error);
}
#endif /* !PIPE_NODIRECT */
#endif /* FreeBSD */

#ifdef __NetBSD__
#ifndef PIPE_NODIRECT
/*
 * Allocate structure for loan transfer.
 */
static __inline int
pipe_loan_alloc(wpipe, npages, blen)
	struct pipe *wpipe;
	int npages;
	vsize_t blen;
{
	wpipe->pipe_map.kva = uvm_km_valloc_wait(kernel_map, blen);
	if (wpipe->pipe_map.kva == NULL)
		return (ENOMEM);

	amountpipekva += blen;
	wpipe->pipe_map.npages = npages;
	wpipe->pipe_map.ms = (struct vm_page **) malloc(
		npages * sizeof(struct vm_page *), M_PIPE, M_WAITOK);

	return (0);
}

/*
 * Free resources allocated for loan transfer.
 */
static void
pipe_loan_free(wpipe)
	struct pipe *wpipe;
{
	uvm_km_free(kernel_map, wpipe->pipe_map.kva,
			wpipe->pipe_map.npages * PAGE_SIZE);
	wpipe->pipe_map.kva = NULL;
	amountpipekva -= wpipe->pipe_map.npages * PAGE_SIZE;
	free(wpipe->pipe_map.ms, M_PIPE);
	wpipe->pipe_map.ms = NULL;
}

/*
 * NetBSD direct write, using uvm_loan() mechanism.
 * This implements the pipe buffer write mechanism.  Note that only
 * a direct write OR a normal pipe write can be pending at any given time.
 * If there are any characters in the pipe buffer, the direct write will
 * be deferred until the receiving process grabs all of the bytes from
 * the pipe buffer.  Then the direct mapping write is set up.
 */
static __inline int
pipe_direct_write(wpipe, uio)
	struct pipe *wpipe;
	struct uio *uio;
{
	int error, npages, j;
	struct vm_page **res = NULL;
	vaddr_t bbase, kva, base, bend;
	vsize_t blen, bcnt;
	voff_t bpos;

retry:
	while (wpipe->pipe_state & PIPE_DIRECTW) {
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
		wpipe->pipe_state |= PIPE_WANTW;
		error = tsleep(wpipe, PRIBIO | PCATCH, "pipdww", 0);
		if (error)
			goto error;
		if (wpipe->pipe_state & PIPE_EOF) {
			error = EPIPE;
			goto error;
		}
	}
	wpipe->pipe_map.cnt = 0;	/* transfer not ready yet */
	if (wpipe->pipe_buffer.cnt > 0) {
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}

		wpipe->pipe_state |= PIPE_WANTW;
		error = tsleep(wpipe, PRIBIO | PCATCH, "pipdwc", 0);
		if (error)
			goto error;
		if (wpipe->pipe_state & PIPE_EOF) {
			error = EPIPE;
			goto error;
		}
		goto retry;
	}

	/*
	 * Handle first iovec, at most the first PIPE_DIRECT_CHUNK bytes.
	 * The caller is expected to deal with the resulting short write.
	 *
	 * Note: need to deal with buffers not aligned to PAGE_SIZE.
	 */
	bbase = (vaddr_t)uio->uio_iov[0].iov_base;
	base = trunc_page(bbase);
	bend = round_page(bbase + uio->uio_iov[0].iov_len);
	blen = bend - base;
	bpos = bbase - base;

	if (blen > PIPE_DIRECT_CHUNK) {
		blen = PIPE_DIRECT_CHUNK;
		bend = base + blen;
		bcnt = PIPE_DIRECT_CHUNK - bpos;
	} else
		bcnt = uio->uio_iov[0].iov_len;

	npages = blen / PAGE_SIZE;

	wpipe->pipe_map.pos = bpos;
	wpipe->pipe_map.cnt = bcnt;

	/*
	 * Free the old kva if we need more pages than we have
	 * allocated.
	 */
	if (wpipe->pipe_map.kva && npages > wpipe->pipe_map.npages)
		pipe_loan_free(wpipe);

	/* Allocate new kva. */
	if (!wpipe->pipe_map.kva
	    && (error = pipe_loan_alloc(wpipe, npages, blen)))
		goto error;

	/* Loan the write buffer memory from writer process */
	error = uvm_loan(&uio->uio_procp->p_vmspace->vm_map, base, blen,
	    (void **) wpipe->pipe_map.ms, UVM_LOAN_TOPAGE);
	if (error)
		goto cleanup;
	res = wpipe->pipe_map.ms;

	/* Enter the loaned pages to kva */
	kva = wpipe->pipe_map.kva;
	for (j = 0; j < npages; j++, kva += PAGE_SIZE)
		pmap_enter(pmap_kernel(), kva, res[j]->phys_addr,
			VM_PROT_READ, 0);

	wpipe->pipe_state |= PIPE_DIRECTW;
	error = 0;
	while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
		if (wpipe->pipe_state & PIPE_EOF) {
			error = EPIPE;
			break;
		}
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
		pipeselwakeup(wpipe, wpipe);
		error = tsleep(wpipe, PRIBIO | PCATCH, "pipdwt", 0);
	}

	if (error)
		wpipe->pipe_state &= ~PIPE_DIRECTW;

    cleanup:
	pipelock(wpipe, 0);
	if (error || amountpipekva > maxpipekva)
		pipe_loan_free(wpipe);
	else if (res)
		uvm_unloanpage(res, npages);
	pipeunlock(wpipe);

	if (error == EPIPE) {
		pipeselwakeup(wpipe, wpipe);

		/*
		 * If anything was read from what we offered, return success
		 * and short write.  We return EOF on the next write(2).
		 */
		if (wpipe->pipe_map.cnt < bcnt) {
			bcnt -= wpipe->pipe_map.cnt;
			error = 0;
		}
	}

	if (error) {
   error:
		wakeup(wpipe);
		return (error);
	}

	uio->uio_resid -= bcnt;
	/* uio_offset not updated, not set/used for write(2) */

	return (0);
}
#endif /* !PIPE_NODIRECT */
#endif /* NetBSD */
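
/*
 * Illustrative userland sketch (not part of this file): the "caller"
 * expected to handle the short write above is pipe_write(), which loops
 * over the request.  At the write(2) level a short count can still be
 * seen, e.g. when a signal arrives after part of the data has been
 * transferred, so portable callers loop:
 */
#if 0
#include <sys/types.h>
#include <unistd.h>

static ssize_t
write_all(int fd, const char *buf, size_t len)
{
	size_t off = 0;
	ssize_t n;

	while (off < len) {
		n = write(fd, buf + off, len - off);
		if (n == -1)
			return (-1);	/* caller inspects errno */
		off += n;
	}
	return (off);
}
#endif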

#ifdef __FreeBSD__
static int
pipe_write(fp, uio, cred, flags, p)
	struct file *fp;
	struct uio *uio;
	struct ucred *cred;
	int flags;
	struct proc *p;
#elif defined(__NetBSD__)
static int
pipe_write(fp, offset, uio, cred, flags)
	struct file *fp;
	off_t *offset;
	struct uio *uio;
	struct ucred *cred;
	int flags;
#endif
{
	int error = 0;
	int orig_resid;
	struct pipe *wpipe, *rpipe;

	rpipe = (struct pipe *) fp->f_data;
	wpipe = rpipe->pipe_peer;

	/*
	 * detect loss of pipe read side, issue SIGPIPE if lost.
	 */
	if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF))
		return (EPIPE);

	++wpipe->pipe_busy;

	/*
	 * If it is advantageous to resize the pipe buffer, do so.
	 */
	if ((uio->uio_resid > PIPE_SIZE) &&
		(nbigpipe < maxbigpipes) &&
#ifndef PIPE_NODIRECT
		(wpipe->pipe_state & PIPE_DIRECTW) == 0 &&
#endif
		(wpipe->pipe_buffer.size <= PIPE_SIZE) &&
		(wpipe->pipe_buffer.cnt == 0)) {

		if ((error = pipelock(wpipe, 1)) == 0) {
			if (pipespace(wpipe, BIG_PIPE_SIZE) == 0)
				nbigpipe++;
			pipeunlock(wpipe);
		} else {
			/*
			 * If an error occurred, unbusy and return, waking up
			 * any waiting readers.
			 */
			--wpipe->pipe_busy;
			if (wpipe->pipe_busy == 0
			    && (wpipe->pipe_state & PIPE_WANTCLOSE)) {
				wpipe->pipe_state &=
				    ~(PIPE_WANTCLOSE | PIPE_WANTR);
				wakeup(wpipe);
			}

			return (error);
		}
	}

#ifdef __FreeBSD__
	KASSERT(wpipe->pipe_buffer.buffer != NULL, ("pipe buffer gone"));
#endif

	orig_resid = uio->uio_resid;
	while (uio->uio_resid) {
		int space;

#ifndef PIPE_NODIRECT
		/*
		 * If the transfer is large, we can gain performance if
		 * we do process-to-process copies directly.
		 * If the write is non-blocking, we don't use the
		 * direct write mechanism.
		 *
		 * The direct write mechanism will detect the reader going
		 * away on us.
		 */
		if ((uio->uio_iov[0].iov_len >= PIPE_MINDIRECT) &&
		    (uio->uio_resid == orig_resid) &&
		    (fp->f_flag & FNONBLOCK) == 0 &&
		    (wpipe->pipe_map.kva || (amountpipekva < limitpipekva))) {
			error = pipe_direct_write(wpipe, uio);

			/*
			 * We either errored, wrote the whole buffer, or
			 * wrote part of the buffer.  If the error is ENOMEM,
			 * we failed to allocate some resources for the
			 * direct write and fall back to the ordinary write.
			 * Otherwise, break out now.
			 */
			if (error != ENOMEM)
				break;
		}
#endif /* PIPE_NODIRECT */

		/*
		 * Pipe buffered writes cannot be coincident with
		 * direct writes.  We wait until the currently executing
		 * direct write is completed before we start filling the
		 * pipe buffer.  We break out if a signal occurs or the
		 * reader goes away.
		 */
	retrywrite:
		while (wpipe->pipe_state & PIPE_DIRECTW) {
			if (wpipe->pipe_state & PIPE_WANTR) {
				wpipe->pipe_state &= ~PIPE_WANTR;
				wakeup(wpipe);
			}
			error = tsleep(wpipe, PRIBIO | PCATCH, "pipbww", 0);
			if (wpipe->pipe_state & PIPE_EOF)
				break;
			if (error)
				break;
		}
		if (wpipe->pipe_state & PIPE_EOF) {
			error = EPIPE;
			break;
		}

		space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;

		/* Writes of size <= PIPE_BUF must be atomic. */
		if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF))
			space = 0;

		if (space > 0 && (wpipe->pipe_buffer.cnt < PIPE_SIZE)) {
			int size;	/* Transfer size */
			int segsize;	/* first segment to transfer */

			if ((error = pipelock(wpipe, 1)) != 0)
				break;

			/*
			 * It is possible for a direct write to
			 * slip in on us... handle it here...
			 */
			if (wpipe->pipe_state & PIPE_DIRECTW) {
				pipeunlock(wpipe);
				goto retrywrite;
			}
			/*
			 * If a process blocked in uiomove, our
			 * value for space might be bad.
			 *
			 * XXX will we be ok if the reader has gone
			 * away here?
			 */
			if (space > wpipe->pipe_buffer.size -
				    wpipe->pipe_buffer.cnt) {
				pipeunlock(wpipe);
				goto retrywrite;
			}

			/*
			 * Transfer size is minimum of uio transfer
			 * and free space in pipe buffer.
			 */
			if (space > uio->uio_resid)
				size = uio->uio_resid;
			else
				size = space;
			/*
			 * First segment to transfer is minimum of
			 * transfer size and contiguous space in
			 * pipe buffer.  If first segment to transfer
			 * is less than the transfer size, we've got
			 * a wraparound in the buffer.
			 */
			segsize = wpipe->pipe_buffer.size -
				wpipe->pipe_buffer.in;
			if (segsize > size)
				segsize = size;

			/* Transfer first segment */

			error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
						segsize, uio);

			if (error == 0 && segsize < size) {
				/*
				 * Transfer remaining part now, to
				 * support atomic writes.  Wraparound
				 * happened.
				 */
#ifdef DEBUG
				if (wpipe->pipe_buffer.in + segsize !=
				    wpipe->pipe_buffer.size)
					panic("Expected pipe buffer wraparound disappeared");
#endif

				error = uiomove(&wpipe->pipe_buffer.buffer[0],
						size - segsize, uio);
			}
			if (error == 0) {
				wpipe->pipe_buffer.in += size;
				if (wpipe->pipe_buffer.in >=
				    wpipe->pipe_buffer.size) {
#ifdef DEBUG
					if (wpipe->pipe_buffer.in !=
					    size - segsize +
					    wpipe->pipe_buffer.size)
						panic("Expected wraparound bad");
#endif
					wpipe->pipe_buffer.in = size - segsize;
				}

				wpipe->pipe_buffer.cnt += size;
#ifdef DEBUG
				if (wpipe->pipe_buffer.cnt >
				    wpipe->pipe_buffer.size)
					panic("Pipe buffer overflow");
#endif
			}
			pipeunlock(wpipe);
			if (error)
				break;
		} else {
			/*
			 * If the "read-side" has been blocked, wake it up now.
			 */
			if (wpipe->pipe_state & PIPE_WANTR) {
				wpipe->pipe_state &= ~PIPE_WANTR;
				wakeup(wpipe);
			}

			/*
			 * don't block on non-blocking I/O
			 */
			if (fp->f_flag & FNONBLOCK) {
				error = EAGAIN;
				break;
			}

			/*
			 * We have no more space and have something to offer,
			 * wake up select/poll.
			 */
			pipeselwakeup(wpipe, wpipe);

			wpipe->pipe_state |= PIPE_WANTW;
			error = tsleep(wpipe, PRIBIO | PCATCH, "pipewr", 0);
			if (error != 0)
				break;
			/*
			 * If the read side wants to go away, we just issue
			 * a signal to ourselves.
			 */
			if (wpipe->pipe_state & PIPE_EOF) {
				error = EPIPE;
				break;
			}
		}
	}

	--wpipe->pipe_busy;
	if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANTCLOSE)) {
		wpipe->pipe_state &= ~(PIPE_WANTCLOSE | PIPE_WANTR);
		wakeup(wpipe);
	} else if (wpipe->pipe_buffer.cnt > 0) {
		/*
		 * If we have put any characters in the buffer, we wake up
		 * the reader.
		 */
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
	}

	/*
	 * Don't return EPIPE if I/O was successful
	 */
	if ((error == EPIPE) && (wpipe->pipe_buffer.cnt == 0)
	    && (uio->uio_resid == 0))
		error = 0;

	if (error == 0)
		vfs_timestamp(&wpipe->pipe_mtime);

	/*
	 * We have something to offer, wake up select/poll.
	 * wpipe->pipe_map.cnt is always 0 at this point (direct write
	 * is only done synchronously), so check only pipe_buffer.cnt.
	 */
	if (wpipe->pipe_buffer.cnt)
		pipeselwakeup(wpipe, wpipe);

	/*
	 * Arrange for next read(2) to do a signal.
	 */
	wpipe->pipe_state |= PIPE_SIGNALR;

	return (error);
}
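
/*
 * Illustrative userland sketch (not part of this file): the atomicity
 * guarantee enforced above -- writes of PIPE_BUF bytes or less are never
 * interleaved with writes from other processes, which is what makes
 * multi-writer log pipes safe.
 */
#if 0
#include <limits.h>
#include <unistd.h>

static void
log_record(int wfd, const char *rec, size_t reclen)
{
	if (reclen <= PIPE_BUF)
		(void)write(wfd, rec, reclen);	/* delivered atomically */
}
#endif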

/*
 * We implement a very minimal set of ioctls for compatibility with sockets.
 */
static int
pipe_ioctl(fp, cmd, data, p)
	struct file *fp;
	u_long cmd;
	caddr_t data;
	struct proc *p;
{
	struct pipe *mpipe = (struct pipe *)fp->f_data;

	switch (cmd) {

	case FIONBIO:
		return (0);

	case FIOASYNC:
		if (*(int *)data) {
			mpipe->pipe_state |= PIPE_ASYNC;
		} else {
			mpipe->pipe_state &= ~PIPE_ASYNC;
		}
		return (0);

	case FIONREAD:
#ifndef PIPE_NODIRECT
		if (mpipe->pipe_state & PIPE_DIRECTW)
			*(int *)data = mpipe->pipe_map.cnt;
		else
#endif
			*(int *)data = mpipe->pipe_buffer.cnt;
		return (0);

#ifdef __FreeBSD__
	case FIOSETOWN:
		return (fsetown(*(int *)data, &mpipe->pipe_sigio));

	case FIOGETOWN:
		*(int *)data = fgetown(mpipe->pipe_sigio);
		return (0);

	/* This is deprecated, FIOSETOWN should be used instead. */
	case TIOCSPGRP:
		return (fsetown(-(*(int *)data), &mpipe->pipe_sigio));

	/* This is deprecated, FIOGETOWN should be used instead. */
	case TIOCGPGRP:
		*(int *)data = -fgetown(mpipe->pipe_sigio);
		return (0);
#endif /* FreeBSD */
#ifdef __NetBSD__
	case TIOCSPGRP:
		mpipe->pipe_pgid = *(int *)data;
		return (0);

	case TIOCGPGRP:
		*(int *)data = mpipe->pipe_pgid;
		return (0);
#endif /* NetBSD */

	}
	return (ENOTTY);
}
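
/*
 * Illustrative userland sketch (not part of this file): FIONREAD, as
 * handled above, reports how many bytes are currently ready to read.
 */
#if 0
#include <sys/ioctl.h>
#include <unistd.h>

static int
bytes_pending(int rfd)
{
	int nread;

	if (ioctl(rfd, FIONREAD, &nread) == -1)
		return (-1);
	return (nread);	/* bytes buffered (or offered directly) in the pipe */
}
#endif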

static int
pipe_poll(fp, events, p)
	struct file *fp;
	int events;
	struct proc *p;
{
	struct pipe *rpipe = (struct pipe *)fp->f_data;
	struct pipe *wpipe;
	int revents = 0;

	wpipe = rpipe->pipe_peer;
	if (events & (POLLIN | POLLRDNORM))
		if ((rpipe->pipe_buffer.cnt > 0) ||
#ifndef PIPE_NODIRECT
		    (rpipe->pipe_state & PIPE_DIRECTW) ||
#endif
		    (rpipe->pipe_state & PIPE_EOF))
			revents |= events & (POLLIN | POLLRDNORM);

	if (events & (POLLOUT | POLLWRNORM))
		if (wpipe == NULL || (wpipe->pipe_state & PIPE_EOF)
		    || (
#ifndef PIPE_NODIRECT
		     ((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
#endif
		     (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF))
			revents |= events & (POLLOUT | POLLWRNORM);

	if ((rpipe->pipe_state & PIPE_EOF) ||
	    (wpipe == NULL) ||
	    (wpipe->pipe_state & PIPE_EOF))
		revents |= POLLHUP;

	if (revents == 0) {
		if (events & (POLLIN | POLLRDNORM)) {
			selrecord(p, &rpipe->pipe_sel);
			rpipe->pipe_state |= PIPE_SEL;
		}

		if (events & (POLLOUT | POLLWRNORM)) {
			selrecord(p, &wpipe->pipe_sel);
			wpipe->pipe_state |= PIPE_SEL;
		}
	}

	return (revents);
}
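
/*
 * Illustrative userland sketch (not part of this file): how the revents
 * computed above look from poll(2) -- readable when data or EOF is
 * pending, POLLHUP once the peer side is gone.
 */
#if 0
#include <poll.h>

static int
wait_readable(int rfd, int timo_ms)
{
	struct pollfd pfd;

	pfd.fd = rfd;
	pfd.events = POLLIN;
	if (poll(&pfd, 1, timo_ms) <= 0)
		return (0);	/* timeout or error */
	return ((pfd.revents & (POLLIN | POLLHUP)) != 0);
}
#endif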

static int
pipe_stat(fp, ub, p)
	struct file *fp;
	struct stat *ub;
	struct proc *p;
{
	struct pipe *pipe = (struct pipe *)fp->f_data;

	memset((caddr_t)ub, 0, sizeof(*ub));
	ub->st_mode = S_IFIFO;
	ub->st_blksize = pipe->pipe_buffer.size;
	ub->st_size = pipe->pipe_buffer.cnt;
	ub->st_blocks = (ub->st_size) ? 1 : 0;
#ifdef __FreeBSD__
	ub->st_atimespec = pipe->pipe_atime;
	ub->st_mtimespec = pipe->pipe_mtime;
	ub->st_ctimespec = pipe->pipe_ctime;
#endif /* FreeBSD */
#ifdef __NetBSD__
	TIMEVAL_TO_TIMESPEC(&pipe->pipe_atime, &ub->st_atimespec);
	TIMEVAL_TO_TIMESPEC(&pipe->pipe_mtime, &ub->st_mtimespec);
	TIMEVAL_TO_TIMESPEC(&pipe->pipe_ctime, &ub->st_ctimespec);
#endif /* NetBSD */
	ub->st_uid = fp->f_cred->cr_uid;
	ub->st_gid = fp->f_cred->cr_gid;
	/*
	 * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen.
	 * XXX (st_dev, st_ino) should be unique.
	 */
	return (0);
}
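
/*
 * Illustrative userland sketch (not part of this file): fstat(2) on a
 * pipe descriptor reaches pipe_stat() above; st_size reflects the byte
 * count currently buffered in the pipe.
 */
#if 0
#include <sys/stat.h>

static long
buffered_bytes(int fd)
{
	struct stat st;

	if (fstat(fd, &st) == -1 || !S_ISFIFO(st.st_mode))
		return (-1);
	return ((long)st.st_size);
}
#endif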

/* ARGSUSED */
static int
pipe_close(fp, p)
	struct file *fp;
	struct proc *p;
{
	struct pipe *cpipe = (struct pipe *)fp->f_data;

#ifdef __FreeBSD__
	fp->f_ops = &badfileops;
	funsetown(cpipe->pipe_sigio);
#endif
	fp->f_data = NULL;
	pipeclose(cpipe);
	return (0);
}

static void
pipe_free_kmem(cpipe)
	struct pipe *cpipe;
{

#ifdef __FreeBSD__
	mtx_assert(&vm_mtx, MA_OWNED);
#endif
	if (cpipe->pipe_buffer.buffer != NULL) {
		if (cpipe->pipe_buffer.size > PIPE_SIZE)
			--nbigpipe;
		amountpipekva -= cpipe->pipe_buffer.size;
#ifdef __FreeBSD__
		kmem_free(kernel_map,
			(vm_offset_t)cpipe->pipe_buffer.buffer,
			cpipe->pipe_buffer.size);
#elif defined(__NetBSD__)
		uvm_km_free(kernel_map,
			(vaddr_t)cpipe->pipe_buffer.buffer,
			cpipe->pipe_buffer.size);
#endif /* NetBSD */

		cpipe->pipe_buffer.buffer = NULL;
	}
#ifndef PIPE_NODIRECT
	if (cpipe->pipe_map.kva != NULL) {
#ifdef __FreeBSD__
		amountpipekva -= cpipe->pipe_buffer.size + PAGE_SIZE;
		kmem_free(kernel_map,
			cpipe->pipe_map.kva,
			cpipe->pipe_buffer.size + PAGE_SIZE);
#elif defined(__NetBSD__)
		pipe_loan_free(cpipe);
#endif /* NetBSD */
		cpipe->pipe_map.cnt = 0;
		cpipe->pipe_map.kva = NULL;
		cpipe->pipe_map.pos = 0;
		cpipe->pipe_map.npages = 0;
	}
#endif /* !PIPE_NODIRECT */
}

/*
 * shutdown the pipe
 */
static void
pipeclose(cpipe)
	struct pipe *cpipe;
{
	struct pipe *ppipe;

	if (!cpipe)
		return;

	pipeselwakeup(cpipe, cpipe);

	/*
	 * If the other side is blocked, wake it up saying that
	 * we want to close it down.
	 */
	while (cpipe->pipe_busy) {
		wakeup(cpipe);
		cpipe->pipe_state |= PIPE_WANTCLOSE | PIPE_EOF;
		tsleep(cpipe, PRIBIO, "pipecl", 0);
	}

	/*
	 * Disconnect from peer
	 */
	if ((ppipe = cpipe->pipe_peer) != NULL) {
		pipeselwakeup(ppipe, ppipe);

		ppipe->pipe_state |= PIPE_EOF;
		wakeup(ppipe);
		ppipe->pipe_peer = NULL;
	}

	/*
	 * free resources
	 */
#ifdef __FreeBSD__
	mtx_lock(&vm_mtx);
	pipe_free_kmem(cpipe);
	/* XXX: erm, doesn't zalloc already have its own locks and
	 * not need the giant vm lock?
	 */
	zfree(pipe_zone, cpipe);
	mtx_unlock(&vm_mtx);
#endif /* FreeBSD */

#ifdef __NetBSD__
	pipe_free_kmem(cpipe);
	(void) lockmgr(&cpipe->pipe_lock, LK_DRAIN, NULL);
	pool_put(&pipe_pool, cpipe);
#endif
}

#ifdef __FreeBSD__
/*ARGSUSED*/
static int
pipe_kqfilter(struct file *fp, struct knote *kn)
{
	struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data;

	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_fop = &pipe_rfiltops;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &pipe_wfiltops;
		cpipe = cpipe->pipe_peer;
		break;
	default:
		return (1);
	}
	kn->kn_hook = (caddr_t)cpipe;

	SLIST_INSERT_HEAD(&cpipe->pipe_sel.si_note, kn, kn_selnext);
	return (0);
}

static void
filt_pipedetach(struct knote *kn)
{
	struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data;

	SLIST_REMOVE(&cpipe->pipe_sel.si_note, kn, knote, kn_selnext);
}

/*ARGSUSED*/
static int
filt_piperead(struct knote *kn, long hint)
{
	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
	struct pipe *wpipe = rpipe->pipe_peer;

	kn->kn_data = rpipe->pipe_buffer.cnt;
	if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW))
		kn->kn_data = rpipe->pipe_map.cnt;

	if ((rpipe->pipe_state & PIPE_EOF) ||
	    (wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
		kn->kn_flags |= EV_EOF;
		return (1);
	}
	return (kn->kn_data > 0);
}

/*ARGSUSED*/
static int
filt_pipewrite(struct knote *kn, long hint)
{
	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
	struct pipe *wpipe = rpipe->pipe_peer;

	if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
		kn->kn_data = 0;
		kn->kn_flags |= EV_EOF;
		return (1);
	}
	kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
	if (wpipe->pipe_state & PIPE_DIRECTW)
		kn->kn_data = 0;

	return (kn->kn_data >= PIPE_BUF);
}
#endif /* FreeBSD */

#ifdef __NetBSD__
static int
pipe_fcntl(fp, cmd, data, p)
	struct file *fp;
	u_int cmd;
	caddr_t data;
	struct proc *p;
{
	if (cmd == F_SETFL)
		return (0);
	else
		return (EOPNOTSUPP);
}

/*
 * Handle pipe sysctls.
 */
int
sysctl_dopipe(name, namelen, oldp, oldlenp, newp, newlen)
	int *name;
	u_int namelen;
	void *oldp;
	size_t *oldlenp;
	void *newp;
	size_t newlen;
{
	/* All sysctl names at this level are terminal. */
	if (namelen != 1)
		return (ENOTDIR);		/* overloaded */

	switch (name[0]) {
	case KERN_PIPE_MAXKVASZ:
		return (sysctl_int(oldp, oldlenp, newp, newlen, &maxpipekva));
	case KERN_PIPE_LIMITKVA:
		return (sysctl_int(oldp, oldlenp, newp, newlen, &limitpipekva));
	case KERN_PIPE_MAXBIGPIPES:
		return (sysctl_int(oldp, oldlenp, newp, newlen, &maxbigpipes));
	case KERN_PIPE_NBIGPIPES:
		return (sysctl_rdint(oldp, oldlenp, newp, nbigpipe));
	case KERN_PIPE_KVASIZE:
		return (sysctl_rdint(oldp, oldlenp, newp, amountpipekva));
	default:
		return (EOPNOTSUPP);
	}
	/* NOTREACHED */
}
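
/*
 * Illustrative userland sketch (not part of this file): reading one of
 * the variables handled above through sysctl(3).  The placement of this
 * node under CTL_KERN/KERN_PIPE is an assumption made here for the
 * example; the second-level names are the KERN_PIPE_* values used in
 * the switch above.
 */
#if 0
#include <sys/param.h>
#include <sys/sysctl.h>

static int
get_maxpipekva(int *valp)
{
	int mib[3] = { CTL_KERN, KERN_PIPE, KERN_PIPE_MAXKVASZ };
	size_t len = sizeof(*valp);

	return (sysctl(mib, 3, valp, &len, NULL, 0));
}
#endif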

/*
 * Initialize pipe structs.
 */
void
pipe_init(void)
{
	pool_init(&pipe_pool, sizeof(struct pipe), 0, 0, 0, "pipepl",
		0, NULL, NULL, M_PIPE);
}

#endif /* __NetBSD__ */
1807