xref: /netbsd-src/sys/kern/sys_pipe.c (revision 17306b8fd0952c7489f93f0230818481e5a1e2c9)
1 /*	$NetBSD: sys_pipe.c,v 1.4 2001/06/21 18:59:51 jdolecek Exp $	*/
2 
3 /*
4  * Copyright (c) 1996 John S. Dyson
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice immediately at the beginning of the file, without modification,
12  *    this list of conditions, and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Absolutely no warranty of function or purpose is made by the author
17  *    John S. Dyson.
18  * 4. Modifications may be freely made to this file if the above conditions
19  *    are met.
20  *
21  * $FreeBSD: src/sys/kern/sys_pipe.c,v 1.82 2001/06/15 20:45:01 jlemon Exp $
22  */
23 
24 /*
25  * This file contains a high-performance replacement for the socket-based
26  * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
27  * all features of sockets, but does do everything that pipes normally
28  * do.
29  *
30  * Adaptation for NetBSD UVM, including uvm_loan() based direct write, was
31  * written by Jaromir Dolecek.
32  */
33 
34 /*
35  * This code has two modes of operation, a small write mode and a large
36  * write mode.  The small write mode acts like conventional pipes with
37  * a kernel buffer.  If the buffer is less than PIPE_MINDIRECT, then the
38  * "normal" pipe buffering is done.  If the buffer is between PIPE_MINDIRECT
39  * and PIPE_SIZE in size, it is fully mapped into the kernel (on FreeBSD,
40  * those pages are also wired), and the receiving process can copy it directly
41  * from the pages in the sending process.
42  *
43  * If the sending process receives a signal, it is possible that it will
44  * go away, and certainly its address space can change, because control
45  * is returned back to the user-mode side.  In that case, the pipe code
46  * arranges to copy the buffer supplied by the user process on FreeBSD, to
47  * a pageable kernel buffer, and the receiving process will grab the data
48  * from the pageable kernel buffer.  Since signals don't happen all that often,
49  * the copy operation is normally eliminated.
50  * For NetBSD, the pages are mapped read-only, COW for kernel by uvm_loan(),
51  * so no explicit handling needs to be done; everything is handled by standard VM
52  * facilities.
53  *
54  * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
55  * happen for small transfers so that the system will not spend all of
56  * its time context switching.  PIPE_SIZE is constrained by the
57  * amount of kernel virtual memory.
58  */
59 
60 #include <sys/param.h>
61 #include <sys/systm.h>
62 #include <sys/proc.h>
63 #include <sys/fcntl.h>
64 #include <sys/file.h>
65 #include <sys/filedesc.h>
66 #include <sys/filio.h>
67 #include <sys/ttycom.h>
68 #include <sys/stat.h>
69 #include <sys/poll.h>
70 #include <sys/signalvar.h>
71 #include <sys/vnode.h>
72 #include <sys/uio.h>
73 #include <sys/lock.h>
74 #ifdef __FreeBSD__
75 #include <sys/mutex.h>
76 #include <sys/selinfo.h>
77 #include <sys/sysproto.h>
78 #elif defined(__NetBSD__)
79 #include <sys/select.h>
80 #include <sys/malloc.h>
81 #include <sys/mount.h>
82 #include <sys/syscallargs.h>
83 #include <uvm/uvm.h>
84 #include <sys/sysctl.h>
85 #endif /* NetBSD, FreeBSD */
86 
87 #include <sys/pipe.h>
88 
89 #ifdef __NetBSD__
90 #define vfs_timestamp(tv) 	microtime(tv)
91 #endif
92 
93 /*
94  * Use this define if you want to disable *fancy* VM things.  Expect an
95  * approx 30% decrease in transfer rate.  This could be useful for
96  * OpenBSD.
97  */
98 /* #define PIPE_NODIRECT */
99 
100 /*
101  * interfaces to the outside world
102  */
103 #ifdef __FreeBSD__
104 static int pipe_read __P((struct file *fp, struct uio *uio,
105 		struct ucred *cred, int flags, struct proc *p));
106 static int pipe_write __P((struct file *fp, struct uio *uio,
107 		struct ucred *cred, int flags, struct proc *p));
108 static int pipe_close __P((struct file *fp, struct proc *p));
109 static int pipe_poll __P((struct file *fp, int events, struct ucred *cred,
110 		struct proc *p));
111 static int pipe_kqfilter __P((struct file *fp, struct knote *kn));
112 static int pipe_stat __P((struct file *fp, struct stat *sb, struct proc *p));
113 static int pipe_ioctl __P((struct file *fp, u_long cmd, caddr_t data, struct proc *p));
114 
115 static struct fileops pipeops = {
116 	pipe_read, pipe_write, pipe_ioctl, pipe_poll, pipe_kqfilter,
117 	pipe_stat, pipe_close
118 };
119 
120 static void	filt_pipedetach(struct knote *kn);
121 static int	filt_piperead(struct knote *kn, long hint);
122 static int	filt_pipewrite(struct knote *kn, long hint);
123 
124 static struct filterops pipe_rfiltops =
125 	{ 1, NULL, filt_pipedetach, filt_piperead };
126 static struct filterops pipe_wfiltops =
127 	{ 1, NULL, filt_pipedetach, filt_pipewrite };
128 #endif /* FreeBSD */
129 
130 #ifdef __NetBSD__
131 static int pipe_read __P((struct file *fp, off_t *offset, struct uio *uio,
132 		struct ucred *cred, int flags));
133 static int pipe_write __P((struct file *fp, off_t *offset, struct uio *uio,
134 		struct ucred *cred, int flags));
135 static int pipe_close __P((struct file *fp, struct proc *p));
136 static int pipe_poll __P((struct file *fp, int events, struct proc *p));
137 static int pipe_fcntl __P((struct file *fp, u_int com, caddr_t data,
138 		struct proc *p));
139 static int pipe_stat __P((struct file *fp, struct stat *sb, struct proc *p));
140 static int pipe_ioctl __P((struct file *fp, u_long cmd, caddr_t data, struct proc *p));
141 
142 static struct fileops pipeops =
143     { pipe_read, pipe_write, pipe_ioctl, pipe_fcntl, pipe_poll,
144       pipe_stat, pipe_close };
145 #endif /* NetBSD */
146 
147 /*
148  * Default pipe buffer size(s), this can be kind-of large now because pipe
149  * space is pageable.  The pipe code will try to maintain locality of
150  * reference for performance reasons, so small amounts of outstanding I/O
151  * will not wipe the cache.
152  */
153 #define MINPIPESIZE (PIPE_SIZE/3)
154 #define MAXPIPESIZE (2*PIPE_SIZE/3)
155 
156 /*
157  * Maximum amount of kva for pipes -- this is kind-of a soft limit, but
158  * is there so that on large systems, we don't exhaust it.
159  */
160 #define MAXPIPEKVA (8*1024*1024)
161 static int maxpipekva = MAXPIPEKVA;
162 
163 /*
164  * Limit for direct transfers, we cannot, of course limit
165  * the amount of kva for pipes in general though.
166  */
167 #define LIMITPIPEKVA (16*1024*1024)
168 static int limitpipekva = LIMITPIPEKVA;
169 
170 /*
171  * Limit the number of "big" pipes
172  */
173 #define LIMITBIGPIPES  32
174 static int maxbigpipes = LIMITBIGPIPES;
175 static int nbigpipe = 0;
176 
177 /*
178  * Amount of KVA consumed by pipe buffers.
179  */
180 static int amountpipekva = 0;
181 
182 static void pipeclose __P((struct pipe *cpipe));
183 static void pipe_free_kmem __P((struct pipe *cpipe));
184 static int pipe_create __P((struct pipe **cpipep));
185 static __inline int pipelock __P((struct pipe *cpipe, int catch));
186 static __inline void pipeunlock __P((struct pipe *cpipe));
187 static __inline void pipeselwakeup __P((struct pipe *selp,
188 			struct pipe *sigp));
189 static int pipespace __P((struct pipe *cpipe, int size));
190 
191 #ifdef __FreeBSD__
192 #ifndef PIPE_NODIRECT
193 static int pipe_build_write_buffer __P((struct pipe *wpipe, struct uio *uio));
194 static void pipe_destroy_write_buffer __P((struct pipe *wpipe));
195 static int pipe_direct_write __P((struct pipe *wpipe, struct uio *uio));
196 static void pipe_clone_write_buffer __P((struct pipe *wpipe));
197 #endif
198 
199 static vm_zone_t pipe_zone;
200 #endif /* FreeBSD */
201 
202 #ifdef __NetBSD__
203 #ifndef PIPE_NODIRECT
204 static __inline int pipe_direct_write __P((struct pipe *wpipe, struct uio *uio));
205 static __inline int pipe_loan_alloc __P((struct pipe *wpipe, int npages,
206 						vsize_t blen));
207 static void pipe_loan_free __P((struct pipe *wpipe));
208 #endif /* PIPE_NODIRECT */
209 
210 static struct pool pipe_pool;
211 #endif /* NetBSD */
212 
213 /*
214  * The pipe system call for the DTYPE_PIPE type of pipes
215  */
216 
217 /* ARGSUSED */
218 #ifdef __FreeBSD__
219 int
220 pipe(p, uap)
221 	struct proc *p;
222 	struct pipe_args /* {
223 		int	dummy;
224 	} */ *uap;
225 #elif defined(__NetBSD__)
226 int
227 sys_pipe(p, v, retval)
228 	struct proc *p;
229 	void *v;
230 	register_t *retval;
231 #endif
232 {
233 	struct filedesc *fdp = p->p_fd;
234 	struct file *rf, *wf;
235 	struct pipe *rpipe, *wpipe;
236 	int fd, error;
237 
238 #ifdef __FreeBSD__
239 	if (pipe_zone == NULL)
240 		pipe_zone = zinit("PIPE", sizeof(struct pipe), 0, 0, 4);
241 #endif
242 
243 	rpipe = wpipe = NULL;
244 	if (pipe_create(&rpipe) || pipe_create(&wpipe)) {
245 		pipeclose(rpipe);
246 		pipeclose(wpipe);
247 		return (ENFILE);
248 	}
249 
250 #ifdef __FreeBSD__
251 	error = falloc(p, &rf, &fd);
252 	if (error) {
253 		pipeclose(rpipe);
254 		pipeclose(wpipe);
255 		return (error);
256 	}
257 	fhold(rf);
258 	p->p_retval[0] = fd;
259 
260 	/*
261 	 * Warning: once we've gotten past allocation of the fd for the
262 	 * read-side, we can only drop the read side via fdrop() in order
263 	 * to avoid races against processes which manage to dup() the read
264 	 * side while we are blocked trying to allocate the write side.
265 	 */
266 	rf->f_flag = FREAD | FWRITE;
267 	rf->f_type = DTYPE_PIPE;
268 	rf->f_data = (caddr_t)rpipe;
269 	rf->f_ops = &pipeops;
270 	error = falloc(p, &wf, &fd);
271 	if (error) {
272 		if (fdp->fd_ofiles[p->p_retval[0]] == rf) {
273 			fdp->fd_ofiles[p->p_retval[0]] = NULL;
274 			fdrop(rf, p);
275 		}
276 		fdrop(rf, p);
277 		/* rpipe has been closed by fdrop(). */
278 		pipeclose(wpipe);
279 		return (error);
280 	}
281 	wf->f_flag = FREAD | FWRITE;
282 	wf->f_type = DTYPE_PIPE;
283 	wf->f_data = (caddr_t)wpipe;
284 	wf->f_ops = &pipeops;
285 	p->p_retval[1] = fd;
286 
287 	rpipe->pipe_peer = wpipe;
288 	wpipe->pipe_peer = rpipe;
289 	fdrop(rf, p);
290 #endif /* FreeBSD */
291 
292 #ifdef __NetBSD__
293 	/*
294 	 * Note: the file structure returned from falloc() is marked
295 	 * as 'larval' initially. Unless we mark it as 'mature' by
296 	 * FILE_SET_MATURE(), any attempt to do anything with it would
297 	 * return EBADF, including e.g. dup(2) or close(2). This avoids
298 	 * file descriptor races if we block in the second falloc().
299 	 */
300 
301 	error = falloc(p, &rf, &fd);
302 	if (error)
303 		goto free2;
304 	retval[0] = fd;
305 	rf->f_flag = FREAD;
306 	rf->f_type = DTYPE_PIPE;
307 	rf->f_data = (caddr_t)rpipe;
308 	rf->f_ops = &pipeops;
309 
310 	error = falloc(p, &wf, &fd);
311 	if (error)
312 		goto free3;
313 	retval[1] = fd;
314 	wf->f_flag = FWRITE;
315 	wf->f_type = DTYPE_PIPE;
316 	wf->f_data = (caddr_t)wpipe;
317 	wf->f_ops = &pipeops;
318 
319 	rpipe->pipe_peer = wpipe;
320 	wpipe->pipe_peer = rpipe;
321 
322 	FILE_SET_MATURE(rf);
323 	FILE_SET_MATURE(wf);
324 	FILE_UNUSE(rf, p);
325 	FILE_UNUSE(wf, p);
326 	return (0);
327 free3:
328 	FILE_UNUSE(rf, p);
329 	ffree(rf);
330 	fdremove(fdp, retval[0]);
331 free2:
332 	pipeclose(wpipe);
333 	pipeclose(rpipe);
334 #endif /* NetBSD */
335 
336 	return (error);
337 }
338 
339 /*
340  * Allocate kva for pipe circular buffer, the space is pageable
341  * This routine will 'realloc' the size of a pipe safely, if it fails
342  * it will retain the old buffer.
343  * If it fails it will return ENOMEM.
344  */
345 static int
346 pipespace(cpipe, size)
347 	struct pipe *cpipe;
348 	int size;
349 {
350 	caddr_t buffer;
351 #ifdef __FreeBSD__
352 	struct vm_object *object;
353 	int npages, error;
354 
355 	npages = round_page(size)/PAGE_SIZE;
356 	/*
357 	 * Create an object, I don't like the idea of paging to/from
358 	 * kernel_object.
359 	 */
360 	mtx_lock(&vm_mtx);
361 	object = vm_object_allocate(OBJT_DEFAULT, npages);
362 	buffer = (caddr_t) vm_map_min(kernel_map);
363 
364 	/*
365 	 * Insert the object into the kernel map, and allocate kva for it.
366 	 * The map entry is, by default, pageable.
367 	 */
368 	error = vm_map_find(kernel_map, object, 0,
369 		(vm_offset_t *) &buffer, size, 1,
370 		VM_PROT_ALL, VM_PROT_ALL, 0);
371 
372 	if (error != KERN_SUCCESS) {
373 		vm_object_deallocate(object);
374 		mtx_unlock(&vm_mtx);
375 		return (ENOMEM);
376 	}
377 #endif /* FreeBSD */
378 
379 #ifdef __NetBSD__
380 	/*
381 	 * Allocate pageable virtual address space. Physical memory is allocated
382 	 * on demand.
383 	 */
384 	buffer = (caddr_t) uvm_km_valloc(kernel_map, round_page(size));
385 	if (buffer == NULL)
386 		return (ENOMEM);
387 #endif /* NetBSD */
388 
389 	/* free old resources if we're resizing */
390 	pipe_free_kmem(cpipe);
391 #ifdef __FreeBSD__
392 	mtx_unlock(&vm_mtx);
393 	cpipe->pipe_buffer.object = object;
394 #endif
395 	cpipe->pipe_buffer.buffer = buffer;
396 	cpipe->pipe_buffer.size = size;
397 	cpipe->pipe_buffer.in = 0;
398 	cpipe->pipe_buffer.out = 0;
399 	cpipe->pipe_buffer.cnt = 0;
400 	amountpipekva += cpipe->pipe_buffer.size;
401 	return (0);
402 }
403 
404 /*
405  * initialize and allocate VM and memory for pipe
406  */
407 static int
408 pipe_create(cpipep)
409 	struct pipe **cpipep;
410 {
411 	struct pipe *cpipe;
412 	int error;
413 
414 #ifdef __FreeBSD__
415 	*cpipep = zalloc(pipe_zone);
416 #endif
417 #ifdef __NetBSD__
418 	*cpipep = pool_get(&pipe_pool, M_WAITOK);
419 #endif
420 	if (*cpipep == NULL)
421 		return (ENOMEM);
422 
423 	cpipe = *cpipep;
424 
425 #ifdef __FreeBSD__
426 	/* so pipespace()->pipe_free_kmem() doesn't follow junk pointer */
427 	cpipe->pipe_buffer.object = NULL;
428 #endif /* FreeBSD */
429 	/*
430 	 * protect so pipeclose() doesn't follow a junk pointer
431 	 * if pipespace() fails.
432 	 */
433 	cpipe->pipe_buffer.buffer = NULL;
434 	bzero(&cpipe->pipe_sel, sizeof(cpipe->pipe_sel));
435 	cpipe->pipe_state = PIPE_SIGNALR;
436 	cpipe->pipe_peer = NULL;
437 	cpipe->pipe_busy = 0;
438 
439 #ifndef PIPE_NODIRECT
440 	/*
441 	 * pipe data structure initializations to support direct pipe I/O
442 	 */
443 	cpipe->pipe_map.cnt = 0;
444 	cpipe->pipe_map.kva = NULL;
445 	cpipe->pipe_map.pos = 0;
446 	cpipe->pipe_map.npages = 0;
447 #ifdef __NetBSD__
448 	cpipe->pipe_map.ms = NULL;
449 #endif
450 #endif /* !PIPE_NODIRECT */
451 
452 	if ((error = pipespace(cpipe, PIPE_SIZE)))
453 		return (error);
454 
455 	vfs_timestamp(&cpipe->pipe_ctime);
456 	cpipe->pipe_atime = cpipe->pipe_ctime;
457 	cpipe->pipe_mtime = cpipe->pipe_ctime;
458 #ifdef __NetBSD__
459 	cpipe->pipe_pgid = NO_PID;
460 	lockinit(&cpipe->pipe_lock, PRIBIO | PCATCH, "pipelk", 0, 0);
461 #endif
462 
463 	return (0);
464 }
465 
466 
467 /*
468  * lock a pipe for I/O, blocking other access
469  */
470 static __inline int
471 pipelock(cpipe, catch)
472 	struct pipe *cpipe;
473 	int catch;
474 {
475 	int error;
476 
477 #ifdef __FreeBSD__
478 	while (cpipe->pipe_state & PIPE_LOCK) {
479 		cpipe->pipe_state |= PIPE_LWANT;
480 		error = tsleep(cpipe, catch ? (PRIBIO | PCATCH) : PRIBIO,
481 		    "pipelk", 0);
482 		if (error != 0)
483 			return (error);
484 	}
485 	cpipe->pipe_state |= PIPE_LOCK;
486 	return (0);
487 #endif
488 
489 #ifdef __NetBSD__
490 	do {
491 		error = lockmgr(&cpipe->pipe_lock, LK_EXCLUSIVE, NULL);
492 	} while (!catch && (error == EINTR || error == ERESTART));
493 	return (error);
494 #endif
495 }
496 
497 /*
498  * unlock a pipe I/O lock
499  */
500 static __inline void
501 pipeunlock(cpipe)
502 	struct pipe *cpipe;
503 {
504 #ifdef __FreeBSD__
505 	cpipe->pipe_state &= ~PIPE_LOCK;
506 	if (cpipe->pipe_state & PIPE_LWANT) {
507 		cpipe->pipe_state &= ~PIPE_LWANT;
508 		wakeup(cpipe);
509 	}
510 #endif
511 
512 #ifdef __NetBSD__
513 	lockmgr(&cpipe->pipe_lock, LK_RELEASE, NULL);
514 #endif
515 }
516 
517 /*
518  * Select/poll wakup. This also sends SIGIO to peer connected to
519  * 'sigpipe' side of pipe.
520  */
521 static __inline void
522 pipeselwakeup(selp, sigp)
523 	struct pipe *selp, *sigp;
524 {
525 	if (selp->pipe_state & PIPE_SEL) {
526 		selp->pipe_state &= ~PIPE_SEL;
527 		selwakeup(&selp->pipe_sel);
528 	}
529 #ifdef __FreeBSD__
530 	if (sigp && (sigp->pipe_state & PIPE_ASYNC) && sigp->pipe_sigio)
531 		pgsigio(sigp->pipe_sigio, SIGIO, 0);
532 	KNOTE(&selp->pipe_sel.si_note, 0);
533 #endif
534 
535 #ifdef __NetBSD__
536 	if (sigp && (sigp->pipe_state & PIPE_ASYNC)
537 	    && sigp->pipe_pgid != NO_PID){
538 		struct proc *p;
539 
540 		if (sigp->pipe_pgid < 0)
541 			gsignal(-sigp->pipe_pgid, SIGIO);
542 		else if (sigp->pipe_pgid > 0 && (p = pfind(sigp->pipe_pgid)) != 0)
543 			psignal(p, SIGIO);
544 	}
545 #endif /* NetBSD */
546 }
547 
548 /* ARGSUSED */
549 #ifdef __FreeBSD__
550 static int
551 pipe_read(fp, uio, cred, flags, p)
552 	struct file *fp;
553 	struct uio *uio;
554 	struct ucred *cred;
555 	int flags;
556 	struct proc *p;
557 #elif defined(__NetBSD__)
558 static int
559 pipe_read(fp, offset, uio, cred, flags)
560 	struct file *fp;
561 	off_t *offset;
562 	struct uio *uio;
563 	struct ucred *cred;
564 	int flags;
565 #endif
566 {
567 	struct pipe *rpipe = (struct pipe *) fp->f_data;
568 	int error;
569 	size_t nread = 0;
570 	size_t size;
571 	size_t ocnt;
572 
573 	++rpipe->pipe_busy;
574 	error = pipelock(rpipe, 1);
575 	if (error)
576 		goto unlocked_error;
577 
578 	ocnt = rpipe->pipe_buffer.cnt;
579 
580 	while (uio->uio_resid) {
581 		/*
582 		 * normal pipe buffer receive
583 		 */
584 		if (rpipe->pipe_buffer.cnt > 0) {
585 			size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
586 			if (size > rpipe->pipe_buffer.cnt)
587 				size = rpipe->pipe_buffer.cnt;
588 			if (size > uio->uio_resid)
589 				size = uio->uio_resid;
590 
591 			error = uiomove(&rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
592 					size, uio);
593 			if (error)
594 				break;
595 
596 			rpipe->pipe_buffer.out += size;
597 			if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
598 				rpipe->pipe_buffer.out = 0;
599 
600 			rpipe->pipe_buffer.cnt -= size;
601 
602 			/*
603 			 * If there is no more to read in the pipe, reset
604 			 * its pointers to the beginning.  This improves
605 			 * cache hit stats.
606 			 */
607 			if (rpipe->pipe_buffer.cnt == 0) {
608 				rpipe->pipe_buffer.in = 0;
609 				rpipe->pipe_buffer.out = 0;
610 			}
611 			nread += size;
612 #ifndef PIPE_NODIRECT
613 		/*
614 		 * Direct copy, bypassing a kernel buffer.
615 		 */
616 		} else if ((size = rpipe->pipe_map.cnt) &&
617 			   (rpipe->pipe_state & PIPE_DIRECTW)) {
618 			caddr_t	va;
619 			if (size > uio->uio_resid)
620 				size = uio->uio_resid;
621 
622 			va = (caddr_t) rpipe->pipe_map.kva +
623 			    rpipe->pipe_map.pos;
624 			error = uiomove(va, size, uio);
625 			if (error)
626 				break;
627 			nread += size;
628 			rpipe->pipe_map.pos += size;
629 			rpipe->pipe_map.cnt -= size;
630 			if (rpipe->pipe_map.cnt == 0) {
631 				rpipe->pipe_state &= ~PIPE_DIRECTW;
632 				wakeup(rpipe);
633 #ifdef __NetBSD__
634 				if (uio->uio_resid > 0 &&
635 				    (rpipe->pipe_state & PIPE_MOREW))
636 					goto waitformore;
637 #endif /* NetBSD */
638 			}
639 #endif
640 		} else {
641 			/*
642 			 * detect EOF condition
643 			 * read returns 0 on EOF, no need to set error
644 			 */
645 			if (rpipe->pipe_state & PIPE_EOF)
646 				break;
647 
648 			/*
649 			 * If the "write-side" has been blocked, wake it up now.
650 			 */
651 			if (rpipe->pipe_state & PIPE_WANTW) {
652 				rpipe->pipe_state &= ~PIPE_WANTW;
653 				wakeup(rpipe);
654 			}
655 
656 			/*
657 			 * Break if some data was read.
658 			 */
659 			if (nread > 0)
660 				break;
661 
662 			/*
663 			 * don't block on non-blocking I/O
664 			 */
665 			if (fp->f_flag & FNONBLOCK) {
666 				error = EAGAIN;
667 				break;
668 			}
669 
670 #if defined(__NetBSD__) && !defined(PIPE_NODIRECT)
671 		waitformore:
672 #endif
673 			/*
674 			 * Unlock the pipe buffer for our remaining processing.
675 			 * We will either break out with an error or we will
676 			 * sleep and relock to loop.
677 			 */
678 			pipeunlock(rpipe);
679 
680 			/*
681 			 * We want to read more, wake up select/poll.
682 			 */
683 			pipeselwakeup(rpipe, rpipe->pipe_peer);
684 
685 			rpipe->pipe_state |= PIPE_WANTR;
686 			error = tsleep(rpipe, PRIBIO | PCATCH, "piperd", 0);
687 			if (error != 0 || (error = pipelock(rpipe, 1)))
688 				goto unlocked_error;
689 		}
690 	}
691 	pipeunlock(rpipe);
692 
693 	if (error == 0)
694 		vfs_timestamp(&rpipe->pipe_atime);
695 unlocked_error:
696 	--rpipe->pipe_busy;
697 
698 	/*
699 	 * PIPE_WANTCLOSE processing only makes sense if pipe_busy is 0.
700 	 */
701 	if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANTCLOSE)) {
702 		rpipe->pipe_state &= ~(PIPE_WANTCLOSE|PIPE_WANTW);
703 		wakeup(rpipe);
704 	} else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) {
705 		/*
706 		 * Handle write blocking hysteresis.
707 		 */
708 		if (rpipe->pipe_state & PIPE_WANTW) {
709 			rpipe->pipe_state &= ~PIPE_WANTW;
710 			wakeup(rpipe);
711 		}
712 	}
713 
714 	/*
715 	 * If anything was read off the buffer, signal to the writer it's
716 	 * possible to write more data. Also send signal if we are here for the
717 	 * first time after last write.
718 	 */
719 	if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF
720 	    && (ocnt != rpipe->pipe_buffer.cnt || (rpipe->pipe_state & PIPE_SIGNALR))) {
721 		pipeselwakeup(rpipe, rpipe->pipe_peer);
722 		rpipe->pipe_state &= ~PIPE_SIGNALR;
723 	}
724 
725 	return (error);
726 }
727 
728 #ifdef __FreeBSD__
729 #ifndef PIPE_NODIRECT
730 /*
731  * Map the sending processes' buffer into kernel space and wire it.
732  * This is similar to a physical write operation.
733  */
734 static int
735 pipe_build_write_buffer(wpipe, uio)
736 	struct pipe *wpipe;
737 	struct uio *uio;
738 {
739 	size_t size;
740 	int i;
741 	vm_offset_t addr, endaddr, paddr;
742 
743 	size = uio->uio_iov->iov_len;
744 	if (size > wpipe->pipe_buffer.size)
745 		size = wpipe->pipe_buffer.size;
746 
747 	endaddr = round_page((vm_offset_t)uio->uio_iov->iov_base + size);
748 	mtx_lock(&vm_mtx);
749 	addr = trunc_page((vm_offset_t)uio->uio_iov->iov_base);
750 	for (i = 0; addr < endaddr; addr += PAGE_SIZE, i++) {
751 		vm_page_t m;
752 
753 		if (vm_fault_quick((caddr_t)addr, VM_PROT_READ) < 0 ||
754 		    (paddr = pmap_kextract(addr)) == 0) {
755 			int j;
756 
757 			for (j = 0; j < i; j++)
758 				vm_page_unwire(wpipe->pipe_map.ms[j], 1);
759 			mtx_unlock(&vm_mtx);
760 			return (EFAULT);
761 		}
762 
763 		m = PHYS_TO_VM_PAGE(paddr);
764 		vm_page_wire(m);
765 		wpipe->pipe_map.ms[i] = m;
766 	}
767 
768 /*
769  * set up the control block
770  */
771 	wpipe->pipe_map.npages = i;
772 	wpipe->pipe_map.pos =
773 	    ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK;
774 	wpipe->pipe_map.cnt = size;
775 
776 /*
777  * and map the buffer
778  */
779 	if (wpipe->pipe_map.kva == 0) {
780 		/*
781 		 * We need to allocate space for an extra page because the
782 		 * address range might (will) span pages at times.
783 		 */
784 		wpipe->pipe_map.kva = kmem_alloc_pageable(kernel_map,
785 			wpipe->pipe_buffer.size + PAGE_SIZE);
786 		amountpipekva += wpipe->pipe_buffer.size + PAGE_SIZE;
787 	}
788 	pmap_qenter(wpipe->pipe_map.kva, wpipe->pipe_map.ms,
789 		wpipe->pipe_map.npages);
790 
791 	mtx_unlock(&vm_mtx);
792 /*
793  * and update the uio data
794  */
795 
796 	uio->uio_iov->iov_len -= size;
797 	uio->uio_iov->iov_base += size;
798 	if (uio->uio_iov->iov_len == 0)
799 		uio->uio_iov++;
800 	uio->uio_resid -= size;
801 	uio->uio_offset += size;
802 	return (0);
803 }
804 
805 /*
806  * unmap and unwire the process buffer
807  */
808 static void
809 pipe_destroy_write_buffer(wpipe)
810 	struct pipe *wpipe;
811 {
812 	int i;
813 
814 	mtx_lock(&vm_mtx);
815 	if (wpipe->pipe_map.kva) {
816 		pmap_qremove(wpipe->pipe_map.kva, wpipe->pipe_map.npages);
817 
818 		if (amountpipekva > maxpipekva) {
819 			vm_offset_t kva = wpipe->pipe_map.kva;
820 			wpipe->pipe_map.kva = 0;
821 			kmem_free(kernel_map, kva,
822 				wpipe->pipe_buffer.size + PAGE_SIZE);
823 			amountpipekva -= wpipe->pipe_buffer.size + PAGE_SIZE;
824 		}
825 	}
826 	for (i = 0; i < wpipe->pipe_map.npages; i++)
827 		vm_page_unwire(wpipe->pipe_map.ms[i], 1);
828 	mtx_unlock(&vm_mtx);
829 }
830 
831 /*
832  * In the case of a signal, the writing process might go away.  This
833  * code copies the data into the circular buffer so that the source
834  * pages can be freed without loss of data.
835  */
836 static void
837 pipe_clone_write_buffer(wpipe)
838 	struct pipe *wpipe;
839 {
840 	int size;
841 	int pos;
842 
843 	size = wpipe->pipe_map.cnt;
844 	pos = wpipe->pipe_map.pos;
845 	bcopy((caddr_t) wpipe->pipe_map.kva + pos,
846 	    (caddr_t) wpipe->pipe_buffer.buffer, size);
847 
848 	wpipe->pipe_buffer.in = size;
849 	wpipe->pipe_buffer.out = 0;
850 	wpipe->pipe_buffer.cnt = size;
851 	wpipe->pipe_state &= ~PIPE_DIRECTW;
852 
853 	pipe_destroy_write_buffer(wpipe);
854 }
855 
856 /*
857  * This implements the pipe buffer write mechanism.  Note that only
858  * a direct write OR a normal pipe write can be pending at any given time.
859  * If there are any characters in the pipe buffer, the direct write will
860  * be deferred until the receiving process grabs all of the bytes from
861  * the pipe buffer.  Then the direct mapping write is set-up.
862  */
863 static int
864 pipe_direct_write(wpipe, uio)
865 	struct pipe *wpipe;
866 	struct uio *uio;
867 {
868 	int error;
869 
870 retry:
871 	while (wpipe->pipe_state & PIPE_DIRECTW) {
872 		if (wpipe->pipe_state & PIPE_WANTR) {
873 			wpipe->pipe_state &= ~PIPE_WANTR;
874 			wakeup(wpipe);
875 		}
876 		wpipe->pipe_state |= PIPE_WANTW;
877 		error = tsleep(wpipe, PRIBIO | PCATCH, "pipdww", 0);
878 		if (error)
879 			goto error1;
880 		if (wpipe->pipe_state & PIPE_EOF) {
881 			error = EPIPE;
882 			goto error1;
883 		}
884 	}
885 	wpipe->pipe_map.cnt = 0;	/* transfer not ready yet */
886 	if (wpipe->pipe_buffer.cnt > 0) {
887 		if (wpipe->pipe_state & PIPE_WANTR) {
888 			wpipe->pipe_state &= ~PIPE_WANTR;
889 			wakeup(wpipe);
890 		}
891 
892 		wpipe->pipe_state |= PIPE_WANTW;
893 		error = tsleep(wpipe, PRIBIO | PCATCH, "pipdwc", 0);
894 		if (error)
895 			goto error1;
896 		if (wpipe->pipe_state & PIPE_EOF) {
897 			error = EPIPE;
898 			goto error1;
899 		}
900 		goto retry;
901 	}
902 
903 	wpipe->pipe_state |= PIPE_DIRECTW;
904 
905 	error = pipe_build_write_buffer(wpipe, uio);
906 	if (error) {
907 		wpipe->pipe_state &= ~PIPE_DIRECTW;
908 		goto error1;
909 	}
910 
911 	error = 0;
912 	while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
913 		if (wpipe->pipe_state & PIPE_EOF) {
914 			pipelock(wpipe, 0);
915 			pipe_destroy_write_buffer(wpipe);
916 			pipeunlock(wpipe);
917 			pipeselwakeup(wpipe, wpipe);
918 			error = EPIPE;
919 			goto error1;
920 		}
921 		if (wpipe->pipe_state & PIPE_WANTR) {
922 			wpipe->pipe_state &= ~PIPE_WANTR;
923 			wakeup(wpipe);
924 		}
925 		pipeselwakeup(wpipe, wpipe);
926 		error = tsleep(wpipe, PRIBIO | PCATCH, "pipdwt", 0);
927 	}
928 
929 	pipelock(wpipe,0);
930 	if (wpipe->pipe_state & PIPE_DIRECTW) {
931 		/*
932 		 * this bit of trickery substitutes a kernel buffer for
933 		 * the process that might be going away.
934 		 */
935 		pipe_clone_write_buffer(wpipe);
936 	} else {
937 		pipe_destroy_write_buffer(wpipe);
938 	}
939 	pipeunlock(wpipe);
940 	return (error);
941 
942 error1:
943 	wakeup(wpipe);
944 	return (error);
945 }
946 #endif /* !PIPE_NODIRECT */
947 #endif /* FreeBSD */
948 
949 #ifdef __NetBSD__
950 #ifndef PIPE_NODIRECT
951 /*
952  * Allocate structure for loan transfer.
953  */
954 static __inline int
955 pipe_loan_alloc(wpipe, npages, blen)
956 	struct pipe *wpipe;
957 	int npages;
958 	vsize_t blen;
959 {
960 	wpipe->pipe_map.kva = uvm_km_valloc(kernel_map, blen);
961 	if (wpipe->pipe_map.kva == NULL)
962 		return (ENOMEM);
963 
964 	amountpipekva += blen;
965 	wpipe->pipe_map.npages = npages;
966 	wpipe->pipe_map.ms = (struct vm_page **) malloc(
967 		npages * sizeof(struct vm_page *), M_PIPE, M_WAITOK);
968 
969 	return (0);
970 }
971 
972 /*
973  * Free resources allocated for loan transfer.
974  */
975 static void
976 pipe_loan_free(wpipe)
977 	struct pipe *wpipe;
978 {
979 	uvm_km_free(kernel_map, wpipe->pipe_map.kva,
980 			wpipe->pipe_map.npages * PAGE_SIZE);
981 	wpipe->pipe_map.kva = NULL;
982 	amountpipekva -= wpipe->pipe_map.npages * PAGE_SIZE;
983 	free(wpipe->pipe_map.ms, M_PIPE);
984 	wpipe->pipe_map.ms = NULL;
985 }
986 
987 /*
988  * NetBSD direct write, using uvm_loan() mechanism.
989  * This implements the pipe buffer write mechanism.  Note that only
990  * a direct write OR a normal pipe write can be pending at any given time.
991  * If there are any characters in the pipe buffer, the direct write will
992  * be deferred until the receiving process grabs all of the bytes from
993  * the pipe buffer.  Then the direct mapping write is set-up.
994  */
995 static __inline int
996 pipe_direct_write(wpipe, uio)
997 	struct pipe *wpipe;
998 	struct uio *uio;
999 {
1000 	int error, i, npages, j;
1001 	struct vm_page **res;
1002 	vaddr_t bbase, kva, base, bend;
1003 	vsize_t blen, bcnt;
1004 	voff_t boff, bpos;
1005 	struct vm_map *wmap = &uio->uio_procp->p_vmspace->vm_map;
1006 retry:
1007 	while (wpipe->pipe_state & PIPE_DIRECTW) {
1008 		if (wpipe->pipe_state & PIPE_WANTR) {
1009 			wpipe->pipe_state &= ~PIPE_WANTR;
1010 			wakeup(wpipe);
1011 		}
1012 		wpipe->pipe_state |= PIPE_WANTW;
1013 		error = tsleep(wpipe, PRIBIO | PCATCH, "pipdww", 0);
1014 		if (error)
1015 			goto error1;
1016 		if (wpipe->pipe_state & PIPE_EOF) {
1017 			error = EPIPE;
1018 			goto error1;
1019 		}
1020 	}
1021 	wpipe->pipe_map.cnt = 0;	/* transfer not ready yet */
1022 	if (wpipe->pipe_buffer.cnt > 0) {
1023 		if ( wpipe->pipe_state & PIPE_WANTR) {
1024 			wpipe->pipe_state &= ~PIPE_WANTR;
1025 			wakeup(wpipe);
1026 		}
1027 
1028 		wpipe->pipe_state |= PIPE_WANTW;
1029 		error = tsleep(wpipe, PRIBIO | PCATCH, "pipdwc", 0);
1030 		if (error)
1031 			goto error1;
1032 		if (wpipe->pipe_state & PIPE_EOF) {
1033 			error = EPIPE;
1034 			goto error1;
1035 		}
1036 		goto retry;
1037 	}
1038 
1039 	/*
1040 	 * For each iovec:
1041 	 * 1. Loan the pages to kernel.
1042 	 * 2. Set up pipe structures.
1043 	 * 3. Wait until consumer reads it all or exits.
1044 	 */
1045 	boff = 0;
1046 	for(i=0; i < uio->uio_iovcnt; ) {
1047 		/*
1048 		 * Note: need to handle buffers not aligned to PAGE_SIZE.
1049 		 */
1050 		bbase = (vaddr_t)uio->uio_iov[i].iov_base;
1051 		base = trunc_page(bbase + boff);
1052 		bend = round_page(bbase + uio->uio_iov[i].iov_len);
1053 		blen = bend - base;
1054 
1055 		if (boff == 0)
1056 			bpos = bbase % PAGE_SIZE;
1057 		else
1058 			bpos = 0;
1059 
1060 		if (blen > PIPE_DIRECT_CHUNK) {
1061 			blen = PIPE_DIRECT_CHUNK;
1062 			boff += PIPE_DIRECT_CHUNK;
1063 			bend = base + blen;
1064 			bcnt = PIPE_DIRECT_CHUNK - bpos;
1065 			wpipe->pipe_state |= PIPE_MOREW;
1066 		} else {
1067 			if (boff == 0)
1068 				bcnt = uio->uio_iov[i].iov_len;
1069 			else
1070 				bcnt = ((bbase % PAGE_SIZE) +
1071 				    uio->uio_iov[i].iov_len) %PIPE_DIRECT_CHUNK;
1072 			boff = 0;
1073 			i++;
1074 			wpipe->pipe_state &= ~PIPE_MOREW;
1075 		}
1076 
1077 		npages = blen / PAGE_SIZE;
1078 
1079 		/*
1080 		 * Free the old kva if we need more pages than we have
1081 		 * allocated.
1082 		 */
1083 		if (wpipe->pipe_map.kva
1084 		    && npages > wpipe->pipe_map.npages)
1085 			pipe_loan_free(wpipe);
1086 
1087 		/* Allocate new kva. */
1088 		if (!wpipe->pipe_map.kva) {
1089 			if ((error = pipe_loan_alloc(wpipe,
1090 					npages, blen)))
1091 				goto error;
1092 		}
1093 
1094 		/* Loan the write buffer memory from writer process */
1095 		res = wpipe->pipe_map.ms;
1096 		error = uvm_loan(wmap, base, blen,
1097 				(void **) res, UVM_LOAN_TOPAGE);
1098 		if (error)
1099 			goto cleanup;
1100 
1101 		/* Enter the loaned pages to kva */
1102 		kva = wpipe->pipe_map.kva;
1103 		for(j=0; j < npages; j++, kva += PAGE_SIZE)
1104 			pmap_enter(pmap_kernel(), kva, res[j]->phys_addr,
1105 				VM_PROT_READ, 0);
1106 
1107 		wpipe->pipe_map.pos = bpos;
1108 		wpipe->pipe_map.cnt = bcnt;
1109 		wpipe->pipe_state |= PIPE_DIRECTW;
1110 
1111 		error = 0;
1112 		while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
1113 			if (wpipe->pipe_state & PIPE_EOF) {
1114 				error = EPIPE;
1115 				break;
1116 			}
1117 			if (wpipe->pipe_state & PIPE_WANTR) {
1118 				wpipe->pipe_state &= ~PIPE_WANTR;
1119 				wakeup(wpipe);
1120 			}
1121 			pipeselwakeup(wpipe, wpipe);
1122 			error = tsleep(wpipe, PRIBIO | PCATCH, "pipdwt", 0);
1123 		}
1124 
1125 	cleanup:
1126 		pipelock(wpipe,0);
1127 		if (amountpipekva > maxpipekva)
1128 			pipe_loan_free(wpipe);
1129 		uvm_unloanpage(res, npages);
1130 		pipeunlock(wpipe);
1131 		if (error) {
1132 	error:
1133 			/* XXX update uio ? */
1134 			if (error == EPIPE)
1135 				pipeselwakeup(wpipe, wpipe);
1136 
1137 			wpipe->pipe_state &= ~PIPE_MOREW;
1138 			goto error1;
1139 		}
1140 
1141 		uio->uio_offset += bcnt;
1142 		uio->uio_resid  -= bcnt;
1143 
1144 	} /* for */
1145 
1146 	return (error);
1147 
1148 error1:
1149 	wakeup(wpipe);
1150 	return (error);
1151 }
1152 #endif /* !PIPE_NODIRECT */
1153 #endif /* NetBSD */
1154 
1155 #ifdef __FreeBSD__
1156 static int
1157 pipe_write(fp, uio, cred, flags, p)
1158 	struct file *fp;
1159 	off_t *offset;
1160 	struct uio *uio;
1161 	struct ucred *cred;
1162 	int flags;
1163 	struct proc *p;
1164 #elif defined(__NetBSD__)
1165 static int
1166 pipe_write(fp, offset, uio, cred, flags)
1167 	struct file *fp;
1168 	off_t *offset;
1169 	struct uio *uio;
1170 	struct ucred *cred;
1171 	int flags;
1172 #endif
1173 {
1174 	int error = 0;
1175 	int orig_resid;
1176 	struct pipe *wpipe, *rpipe;
1177 
1178 	rpipe = (struct pipe *) fp->f_data;
1179 	wpipe = rpipe->pipe_peer;
1180 
1181 	/*
1182 	 * detect loss of pipe read side, issue SIGPIPE if lost.
1183 	 */
1184 	if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF))
1185 		return (EPIPE);
1186 
1187 	++wpipe->pipe_busy;
1188 
1189 	/*
1190 	 * If it is advantageous to resize the pipe buffer, do
1191 	 * so.
1192 	 */
1193 	if ((uio->uio_resid > PIPE_SIZE) &&
1194 		(nbigpipe < maxbigpipes) &&
1195 #ifndef PIPE_NODIRECT
1196 		(wpipe->pipe_state & PIPE_DIRECTW) == 0 &&
1197 #endif
1198 		(wpipe->pipe_buffer.size <= PIPE_SIZE) &&
1199 		(wpipe->pipe_buffer.cnt == 0)) {
1200 
1201 		if ((error = pipelock(wpipe,1)) == 0) {
1202 			if (pipespace(wpipe, BIG_PIPE_SIZE) == 0)
1203 				nbigpipe++;
1204 			pipeunlock(wpipe);
1205 		} else {
1206 			/*
1207 			 * If an error occured unbusy and return, waking up any
1208 			 * pending readers.
1209 			 */
1210 			--wpipe->pipe_busy;
1211 			if (wpipe->pipe_busy == 0
1212 			    && (wpipe->pipe_state & PIPE_WANTCLOSE)) {
1213 				wpipe->pipe_state &=
1214 				    ~(PIPE_WANTCLOSE | PIPE_WANTR);
1215 				wakeup(wpipe);
1216 			}
1217 
1218 			return (error);
1219 		}
1220 	}
1221 
1222 #ifdef __FreeBSD__
1223 	KASSERT(wpipe->pipe_buffer.buffer != NULL, ("pipe buffer gone"));
1224 #endif
1225 
1226 	orig_resid = uio->uio_resid;
1227 	while (uio->uio_resid) {
1228 		int space;
1229 
1230 #ifndef PIPE_NODIRECT
1231 		/*
1232 		 * If the transfer is large, we can gain performance if
1233 		 * we do process-to-process copies directly.
1234 		 * If the write is non-blocking, we don't use the
1235 		 * direct write mechanism.
1236 		 *
1237 		 * The direct write mechanism will detect the reader going
1238 		 * away on us.
1239 		 */
1240 		if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) &&
1241 		    (fp->f_flag & FNONBLOCK) == 0 &&
1242 		    (wpipe->pipe_map.kva || (amountpipekva < limitpipekva))) {
1243 			error = pipe_direct_write(wpipe, uio);
1244 			if (error)
1245 				break;
1246 			continue;
1247 		}
1248 #endif /* PIPE_NODIRECT */
1249 
1250 		/*
1251 		 * Pipe buffered writes cannot be coincidental with
1252 		 * direct writes.  We wait until the currently executing
1253 		 * direct write is completed before we start filling the
1254 		 * pipe buffer.  We break out if a signal occurs or the
1255 		 * reader goes away.
1256 		 */
1257 	retrywrite:
1258 		while (wpipe->pipe_state & PIPE_DIRECTW) {
1259 			if (wpipe->pipe_state & PIPE_WANTR) {
1260 				wpipe->pipe_state &= ~PIPE_WANTR;
1261 				wakeup(wpipe);
1262 			}
1263 			error = tsleep(wpipe, PRIBIO | PCATCH, "pipbww", 0);
1264 			if (wpipe->pipe_state & PIPE_EOF)
1265 				break;
1266 			if (error)
1267 				break;
1268 		}
1269 		if (wpipe->pipe_state & PIPE_EOF) {
1270 			error = EPIPE;
1271 			break;
1272 		}
1273 
1274 		space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
1275 
1276 		/* Writes of size <= PIPE_BUF must be atomic. */
1277 		if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF))
1278 			space = 0;
1279 
1280 		if (space > 0 && (wpipe->pipe_buffer.cnt < PIPE_SIZE)) {
1281 			int size;	/* Transfer size */
1282 			int segsize;	/* first segment to transfer */
1283 
1284 			if ((error = pipelock(wpipe,1)) != 0)
1285 				break;
1286 
1287 			/*
1288 			 * It is possible for a direct write to
1289 			 * slip in on us... handle it here...
1290 			 */
1291 			if (wpipe->pipe_state & PIPE_DIRECTW) {
1292 				pipeunlock(wpipe);
1293 				goto retrywrite;
1294 			}
1295 			/*
1296 			 * If a process blocked in uiomove, our
1297 			 * value for space might be bad.
1298 			 *
1299 			 * XXX will we be ok if the reader has gone
1300 			 * away here?
1301 			 */
1302 			if (space > wpipe->pipe_buffer.size -
1303 				    wpipe->pipe_buffer.cnt) {
1304 				pipeunlock(wpipe);
1305 				goto retrywrite;
1306 			}
1307 
1308 			/*
1309 			 * Transfer size is minimum of uio transfer
1310 			 * and free space in pipe buffer.
1311 			 */
1312 			if (space > uio->uio_resid)
1313 				size = uio->uio_resid;
1314 			else
1315 				size = space;
1316 			/*
1317 			 * First segment to transfer is minimum of
1318 			 * transfer size and contiguous space in
1319 			 * pipe buffer.  If first segment to transfer
1320 			 * is less than the transfer size, we've got
1321 			 * a wraparound in the buffer.
1322 			 */
1323 			segsize = wpipe->pipe_buffer.size -
1324 				wpipe->pipe_buffer.in;
1325 			if (segsize > size)
1326 				segsize = size;
1327 
1328 			/* Transfer first segment */
1329 
1330 			error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
1331 						segsize, uio);
1332 
1333 			if (error == 0 && segsize < size) {
1334 				/*
1335 				 * Transfer remaining part now, to
1336 				 * support atomic writes.  Wraparound
1337 				 * happened.
1338 				 */
1339 #ifdef DEBUG
1340 				if (wpipe->pipe_buffer.in + segsize !=
1341 				    wpipe->pipe_buffer.size)
1342 					panic("Expected pipe buffer wraparound disappeared");
1343 #endif
1344 
1345 				error = uiomove(&wpipe->pipe_buffer.buffer[0],
1346 						size - segsize, uio);
1347 			}
1348 			if (error == 0) {
1349 				wpipe->pipe_buffer.in += size;
1350 				if (wpipe->pipe_buffer.in >=
1351 				    wpipe->pipe_buffer.size) {
1352 #ifdef DEBUG
1353 					if (wpipe->pipe_buffer.in != size - segsize + wpipe->pipe_buffer.size)
1354 						panic("Expected wraparound bad");
1355 #endif
1356 					wpipe->pipe_buffer.in = size - segsize;
1357 				}
1358 
1359 				wpipe->pipe_buffer.cnt += size;
1360 #ifdef DEBUG
1361 				if (wpipe->pipe_buffer.cnt > wpipe->pipe_buffer.size)
1362 					panic("Pipe buffer overflow");
1363 #endif
1364 
1365 			}
1366 			pipeunlock(wpipe);
1367 			if (error)
1368 				break;
1369 
1370 		} else {
1371 			/*
1372 			 * If the "read-side" has been blocked, wake it up now.
1373 			 */
1374 			if (wpipe->pipe_state & PIPE_WANTR) {
1375 				wpipe->pipe_state &= ~PIPE_WANTR;
1376 				wakeup(wpipe);
1377 			}
1378 
1379 			/*
1380 			 * don't block on non-blocking I/O
1381 			 */
1382 			if (fp->f_flag & FNONBLOCK) {
1383 				error = EAGAIN;
1384 				break;
1385 			}
1386 
1387 			/*
1388 			 * We have no more space and have something to offer,
1389 			 * wake up select/poll.
1390 			 */
1391 			pipeselwakeup(wpipe, wpipe);
1392 
1393 			wpipe->pipe_state |= PIPE_WANTW;
1394 			error = tsleep(wpipe, PRIBIO | PCATCH, "pipewr", 0);
1395 			if (error != 0)
1396 				break;
1397 			/*
1398 			 * If read side wants to go away, we just issue a signal
1399 			 * to ourselves.
1400 			 */
1401 			if (wpipe->pipe_state & PIPE_EOF) {
1402 				error = EPIPE;
1403 				break;
1404 			}
1405 		}
1406 	}
1407 
1408 	--wpipe->pipe_busy;
1409 	if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANTCLOSE)) {
1410 		wpipe->pipe_state &= ~(PIPE_WANTCLOSE | PIPE_WANTR);
1411 		wakeup(wpipe);
1412 	} else if (wpipe->pipe_buffer.cnt > 0) {
1413 		/*
1414 		 * If we have put any characters in the buffer, we wake up
1415 		 * the reader.
1416 		 */
1417 		if (wpipe->pipe_state & PIPE_WANTR) {
1418 			wpipe->pipe_state &= ~PIPE_WANTR;
1419 			wakeup(wpipe);
1420 		}
1421 	}
1422 
1423 	/*
1424 	 * Don't return EPIPE if I/O was successful
1425 	 */
1426 	if ((error == EPIPE) && (wpipe->pipe_buffer.cnt == 0)
1427 	    && (uio->uio_resid == 0))
1428 		error = 0;
1429 
1430 	if (error == 0)
1431 		vfs_timestamp(&wpipe->pipe_mtime);
1432 
1433 	/*
1434 	 * We have something to offer, wake up select/poll.
1435 	 * wpipe->pipe_map.cnt is always 0 in this point (direct write
1436 	 * is only done synchronously), so check wpipe->only pipe_buffer.cnt
1437 	 */
1438 	if (wpipe->pipe_buffer.cnt)
1439 		pipeselwakeup(wpipe, wpipe);
1440 
1441 	/*
1442 	 * Arrange for next read(2) to do a signal.
1443 	 */
1444 	wpipe->pipe_state |= PIPE_SIGNALR;
1445 
1446 	return (error);
1447 }
1448 
1449 /*
1450  * we implement a very minimal set of ioctls for compatibility with sockets.
1451  */
1452 int
1453 pipe_ioctl(fp, cmd, data, p)
1454 	struct file *fp;
1455 	u_long cmd;
1456 	caddr_t data;
1457 	struct proc *p;
1458 {
1459 	struct pipe *mpipe = (struct pipe *)fp->f_data;
1460 
1461 	switch (cmd) {
1462 
1463 	case FIONBIO:
1464 		return (0);
1465 
1466 	case FIOASYNC:
1467 		if (*(int *)data) {
1468 			mpipe->pipe_state |= PIPE_ASYNC;
1469 		} else {
1470 			mpipe->pipe_state &= ~PIPE_ASYNC;
1471 		}
1472 		return (0);
1473 
1474 	case FIONREAD:
1475 #ifndef PIPE_NODIRECT
1476 		if (mpipe->pipe_state & PIPE_DIRECTW)
1477 			*(int *)data = mpipe->pipe_map.cnt;
1478 		else
1479 #endif
1480 			*(int *)data = mpipe->pipe_buffer.cnt;
1481 		return (0);
1482 
1483 #ifdef __FreeBSD__
1484 	case FIOSETOWN:
1485 		return (fsetown(*(int *)data, &mpipe->pipe_sigio));
1486 
1487 	case FIOGETOWN:
1488 		*(int *)data = fgetown(mpipe->pipe_sigio);
1489 		return (0);
1490 
1491 	/* This is deprecated, FIOSETOWN should be used instead. */
1492 	case TIOCSPGRP:
1493 		return (fsetown(-(*(int *)data), &mpipe->pipe_sigio));
1494 
1495 	/* This is deprecated, FIOGETOWN should be used instead. */
1496 	case TIOCGPGRP:
1497 		*(int *)data = -fgetown(mpipe->pipe_sigio);
1498 		return (0);
1499 #endif /* FreeBSD */
1500 #ifdef __NetBSD__
1501 	case TIOCSPGRP:
1502 		mpipe->pipe_pgid = *(int *)data;
1503 		return (0);
1504 
1505 	case TIOCGPGRP:
1506 		*(int *)data = mpipe->pipe_pgid;
1507 		return (0);
1508 #endif /* NetBSD */
1509 
1510 	}
1511 	return (ENOTTY);
1512 }
1513 
/*
 * Poll a pipe: report readability/writability/hangup for the events
 * the caller asked about, and record the selector if nothing is ready.
 */
int
pipe_poll(fp, events, p)
	struct file *fp;
	int events;
	struct proc *p;
{
	struct pipe *rpipe = (struct pipe *)fp->f_data;
	struct pipe *wpipe;
	int revents = 0;

	wpipe = rpipe->pipe_peer;
	/* Readable: buffered data, an in-progress direct write, or EOF. */
	if (events & (POLLIN | POLLRDNORM))
		if ((rpipe->pipe_buffer.cnt > 0) ||
#ifndef PIPE_NODIRECT
		    (rpipe->pipe_state & PIPE_DIRECTW) ||
#endif
		    (rpipe->pipe_state & PIPE_EOF))
			revents |= events & (POLLIN | POLLRDNORM);

	/*
	 * Writable: peer gone (write would fail immediately), or no
	 * direct write in progress and at least PIPE_BUF bytes free.
	 */
	if (events & (POLLOUT | POLLWRNORM))
		if (wpipe == NULL || (wpipe->pipe_state & PIPE_EOF)
		    || (
#ifndef PIPE_NODIRECT
		     ((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
#endif
		     (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF))
			revents |= events & (POLLOUT | POLLWRNORM);

	if ((rpipe->pipe_state & PIPE_EOF) ||
	    (wpipe == NULL) ||
	    (wpipe->pipe_state & PIPE_EOF))
		revents |= POLLHUP;

	/*
	 * Nothing ready: record this selector so it gets woken on a
	 * state change.  Note wpipe cannot be NULL in the POLLOUT case
	 * below: a NULL peer already set POLLHUP, making revents != 0.
	 */
	if (revents == 0) {
		if (events & (POLLIN | POLLRDNORM)) {
			selrecord(p, &rpipe->pipe_sel);
			rpipe->pipe_state |= PIPE_SEL;
		}

		if (events & (POLLOUT | POLLWRNORM)) {
			selrecord(p, &wpipe->pipe_sel);
			wpipe->pipe_state |= PIPE_SEL;
		}
	}

	return (revents);
}
1561 
1562 static int
1563 pipe_stat(fp, ub, p)
1564 	struct file *fp;
1565 	struct stat *ub;
1566 	struct proc *p;
1567 {
1568 	struct pipe *pipe = (struct pipe *)fp->f_data;
1569 
1570 	bzero((caddr_t)ub, sizeof(*ub));
1571 	ub->st_mode = S_IFIFO;
1572 	ub->st_blksize = pipe->pipe_buffer.size;
1573 	ub->st_size = pipe->pipe_buffer.cnt;
1574 	ub->st_blocks = (ub->st_size) ? 1 : 0;
1575 #ifdef __FreeBSD__
1576 	ub->st_atimespec = pipe->pipe_atime;
1577 	ub->st_mtimespec = pipe->pipe_mtime;
1578 	ub->st_ctimespec = pipe->pipe_ctime;
1579 #endif /* FreeBSD */
1580 #ifdef __NetBSD__
1581 	TIMEVAL_TO_TIMESPEC(&pipe->pipe_atime, &ub->st_atimespec)
1582 	TIMEVAL_TO_TIMESPEC(&pipe->pipe_mtime, &ub->st_mtimespec);
1583 	TIMEVAL_TO_TIMESPEC(&pipe->pipe_ctime, &ub->st_ctimespec);
1584 #endif /* NetBSD */
1585 	ub->st_uid = fp->f_cred->cr_uid;
1586 	ub->st_gid = fp->f_cred->cr_gid;
1587 	/*
1588 	 * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen.
1589 	 * XXX (st_dev, st_ino) should be unique.
1590 	 */
1591 	return (0);
1592 }
1593 
1594 /* ARGSUSED */
1595 static int
1596 pipe_close(fp, p)
1597 	struct file *fp;
1598 	struct proc *p;
1599 {
1600 	struct pipe *cpipe = (struct pipe *)fp->f_data;
1601 
1602 #ifdef __FreeBSD__
1603 	fp->f_ops = &badfileops;
1604 	funsetown(cpipe->pipe_sigio);
1605 #endif
1606 	fp->f_data = NULL;
1607 	pipeclose(cpipe);
1608 	return (0);
1609 }
1610 
/*
 * Release the kernel-side memory of a pipe: the regular kernel buffer
 * and (unless PIPE_NODIRECT) the direct-write mapping, adjusting the
 * global accounting counters (nbigpipe, amountpipekva) as needed.
 */
static void
pipe_free_kmem(cpipe)
	struct pipe *cpipe;
{

#ifdef __FreeBSD__
	mtx_assert(&vm_mtx, MA_OWNED);
#endif
	/* Free the conventional pipe buffer, if one was allocated. */
	if (cpipe->pipe_buffer.buffer != NULL) {
		if (cpipe->pipe_buffer.size > PIPE_SIZE)
			--nbigpipe;
		amountpipekva -= cpipe->pipe_buffer.size;
#ifdef __FreeBSD__
		kmem_free(kernel_map,
			(vm_offset_t)cpipe->pipe_buffer.buffer,
			cpipe->pipe_buffer.size);
#elif defined(__NetBSD__)
		uvm_km_free(kernel_map,
			(vaddr_t)cpipe->pipe_buffer.buffer,
			cpipe->pipe_buffer.size);
#endif /* NetBSD */

		cpipe->pipe_buffer.buffer = NULL;
	}
#ifndef PIPE_NODIRECT
	/* Tear down the direct-write kva mapping and reset its state. */
	if (cpipe->pipe_map.kva != NULL) {
#ifdef __FreeBSD__
		amountpipekva -= cpipe->pipe_buffer.size + PAGE_SIZE;
		kmem_free(kernel_map,
			cpipe->pipe_map.kva,
			cpipe->pipe_buffer.size + PAGE_SIZE);
#elif defined(__NetBSD__)
		pipe_loan_free(cpipe);
#endif /* NetBSD */
		cpipe->pipe_map.cnt = 0;
		cpipe->pipe_map.kva = NULL;
		cpipe->pipe_map.pos = 0;
		cpipe->pipe_map.npages = 0;
	}
#endif /* !PIPE_NODIRECT */
}
1652 
1653 /*
1654  * shutdown the pipe
1655  */
1656 static void
1657 pipeclose(cpipe)
1658 	struct pipe *cpipe;
1659 {
1660 	struct pipe *ppipe;
1661 
1662 	if (!cpipe)
1663 		return;
1664 
1665 	pipeselwakeup(cpipe, cpipe);
1666 
1667 	/*
1668 	 * If the other side is blocked, wake it up saying that
1669 	 * we want to close it down.
1670 	 */
1671 	while (cpipe->pipe_busy) {
1672 		wakeup(cpipe);
1673 		cpipe->pipe_state |= PIPE_WANTCLOSE | PIPE_EOF;
1674 		tsleep(cpipe, PRIBIO, "pipecl", 0);
1675 	}
1676 
1677 	/*
1678 	 * Disconnect from peer
1679 	 */
1680 	if ((ppipe = cpipe->pipe_peer) != NULL) {
1681 		pipeselwakeup(ppipe, ppipe);
1682 
1683 		ppipe->pipe_state |= PIPE_EOF;
1684 		wakeup(ppipe);
1685 		ppipe->pipe_peer = NULL;
1686 	}
1687 
1688 	/*
1689 	 * free resources
1690 	 */
1691 #ifdef _FreeBSD__
1692 	mtx_lock(&vm_mtx);
1693 	pipe_free_kmem(cpipe);
1694 	/* XXX: erm, doesn't zalloc already have its own locks and
1695 	 * not need the giant vm lock?
1696 	 */
1697 	zfree(pipe_zone, cpipe);
1698 	mtx_unlock(&vm_mtx);
1699 #endif /* FreeBSD */
1700 
1701 #ifdef __NetBSD__
1702 	pipe_free_kmem(cpipe);
1703 	(void) lockmgr(&cpipe->pipe_lock, LK_DRAIN, NULL);
1704 	pool_put(&pipe_pool, cpipe);
1705 #endif
1706 }
1707 
1708 #ifdef __FreeBSD__
1709 /*ARGSUSED*/
1710 static int
1711 pipe_kqfilter(struct file *fp, struct knote *kn)
1712 {
1713 	struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data;
1714 
1715 	switch (kn->kn_filter) {
1716 	case EVFILT_READ:
1717 		kn->kn_fop = &pipe_rfiltops;
1718 		break;
1719 	case EVFILT_WRITE:
1720 		kn->kn_fop = &pipe_wfiltops;
1721 		cpipe = cpipe->pipe_peer;
1722 		break;
1723 	default:
1724 		return (1);
1725 	}
1726 	kn->kn_hook = (caddr_t)cpipe;
1727 
1728 	SLIST_INSERT_HEAD(&cpipe->pipe_sel.si_note, kn, kn_selnext);
1729 	return (0);
1730 }
1731 
1732 static void
1733 filt_pipedetach(struct knote *kn)
1734 {
1735 	struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data;
1736 
1737 	SLIST_REMOVE(&cpipe->pipe_sel.si_note, kn, knote, kn_selnext);
1738 }
1739 
1740 /*ARGSUSED*/
1741 static int
1742 filt_piperead(struct knote *kn, long hint)
1743 {
1744 	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
1745 	struct pipe *wpipe = rpipe->pipe_peer;
1746 
1747 	kn->kn_data = rpipe->pipe_buffer.cnt;
1748 	if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW))
1749 		kn->kn_data = rpipe->pipe_map.cnt;
1750 
1751 	if ((rpipe->pipe_state & PIPE_EOF) ||
1752 	    (wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
1753 		kn->kn_flags |= EV_EOF;
1754 		return (1);
1755 	}
1756 	return (kn->kn_data > 0);
1757 }
1758 
1759 /*ARGSUSED*/
1760 static int
1761 filt_pipewrite(struct knote *kn, long hint)
1762 {
1763 	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
1764 	struct pipe *wpipe = rpipe->pipe_peer;
1765 
1766 	if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
1767 		kn->kn_data = 0;
1768 		kn->kn_flags |= EV_EOF;
1769 		return (1);
1770 	}
1771 	kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
1772 	if (wpipe->pipe_state & PIPE_DIRECTW)
1773 		kn->kn_data = 0;
1774 
1775 	return (kn->kn_data >= PIPE_BUF);
1776 }
1777 #endif /* FreeBSD */
1778 
1779 #ifdef __NetBSD__
1780 static int
1781 pipe_fcntl(fp, cmd, data, p)
1782 	struct file *fp;
1783 	u_int cmd;
1784 	caddr_t data;
1785 	struct proc *p;
1786 {
1787 	if (cmd == F_SETFL)
1788 		return (0);
1789 	else
1790 		return (EOPNOTSUPP);
1791 }
1792 
1793 /*
1794  * Handle pipe sysctls.
1795  */
1796 int
1797 sysctl_dopipe(name, namelen, oldp, oldlenp, newp, newlen)
1798 	int *name;
1799 	u_int namelen;
1800 	void *oldp;
1801 	size_t *oldlenp;
1802 	void *newp;
1803 	size_t newlen;
1804 {
1805 	/* All sysctl names at this level are terminal. */
1806 	if (namelen != 1)
1807 		return (ENOTDIR);		/* overloaded */
1808 
1809 	switch (name[0]) {
1810 	case KERN_PIPE_MAXKVASZ:
1811 		return (sysctl_int(oldp, oldlenp, newp, newlen, &maxpipekva));
1812 	case KERN_PIPE_LIMITKVA:
1813 		return (sysctl_int(oldp, oldlenp, newp, newlen, &limitpipekva));
1814 	case KERN_PIPE_MAXBIGPIPES:
1815 		return (sysctl_int(oldp, oldlenp, newp, newlen, &maxbigpipes));
1816 	case KERN_PIPE_NBIGPIPES:
1817 		return (sysctl_rdint(oldp, oldlenp, newp, nbigpipe));
1818 	case KERN_PIPE_KVASIZE:
1819 		return (sysctl_rdint(oldp, oldlenp, newp, amountpipekva));
1820 	default:
1821 		return (EOPNOTSUPP);
1822 	}
1823 	/* NOTREACHED */
1824 }
1825 
1826 /*
1827  * Initialize pipe structs.
1828  */
void
pipe_init(void)
{
	/* Backing pool for struct pipe allocations, charged to M_PIPE. */
	pool_init(&pipe_pool, sizeof(struct pipe), 0, 0, 0, "pipepl",
		0, NULL, NULL, M_PIPE);
}
1835 
1836 #endif /* __NetBSD __ */
1837