1 /*	$NetBSD: sys_pipe.c,v 1.20 2001/12/11 18:15:09 jdolecek Exp $	*/
2 
3 /*
4  * Copyright (c) 1996 John S. Dyson
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice immediately at the beginning of the file, without modification,
12  *    this list of conditions, and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Absolutely no warranty of function or purpose is made by the author
17  *    John S. Dyson.
18  * 4. Modifications may be freely made to this file if the above conditions
19  *    are met.
20  *
21  * $FreeBSD: src/sys/kern/sys_pipe.c,v 1.82 2001/06/15 20:45:01 jlemon Exp $
22  */
23 
24 /*
25  * This file contains a high-performance replacement for the socket-based
26  * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
27  * all features of sockets, but does do everything that pipes normally
28  * do.
29  *
30  * Adaptation for NetBSD UVM, including uvm_loan() based direct write, was
31  * written by Jaromir Dolecek.
32  */
33 
34 /*
35  * This code has two modes of operation, a small write mode and a large
36  * write mode.  The small write mode acts like conventional pipes with
37  * a kernel buffer.  If a write is smaller than PIPE_MINDIRECT, the
38  * "normal" pipe buffering is done.  If the write is between PIPE_MINDIRECT
39  * and PIPE_SIZE bytes, the source buffer is fully mapped into the kernel
40  * (on FreeBSD, those pages are also wired), and the receiving process
41  * copies the data directly from the pages of the sending process.
42  *
43  * If the sending process receives a signal, it is possible that it will
44  * go away, and its address space can certainly change, because control
45  * is returned to the user-mode side.  In that case, FreeBSD's pipe code
46  * arranges to copy the buffer supplied by the user process to a pageable
47  * kernel buffer, and the receiving process will grab the data from there.
48  * Since signals don't happen all that often, the copy operation is
49  * normally eliminated.
50  * On NetBSD, the pages are loaned read-only and copy-on-write to the
51  * kernel by uvm_loan(), so no explicit handling needs to be done; all of
52  * it is handled by the standard VM facilities.
53  *
54  * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
55  * happen for small transfers so that the system will not spend all of
56  * its time context switching.  PIPE_SIZE is constrained by the
57  * amount of kernel virtual memory.
58  */
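/*
 * An illustrative userland sketch (not part of the kernel build; the
 * write sizes below are assumptions chosen only to contrast the two
 * modes): both paths are completely transparent to applications.
 *
 *	#include <unistd.h>
 *
 *	int
 *	main(void)
 *	{
 *		int fds[2];
 *		char small[512];		small write: buffered path
 *		static char big[64 * 1024];	large write: direct-path candidate
 *
 *		if (pipe(fds) == -1)
 *			return (1);
 *		(void) write(fds[1], small, sizeof(small));
 *		(void) write(fds[1], big, sizeof(big));
 *		return (0);
 *	}
 *
 * Note that the second write(2) blocks until a reader drains the pipe;
 * a real program would have a reader on fds[0].
 */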
59 
60 #include <sys/cdefs.h>
61 __KERNEL_RCSID(0, "$NetBSD: sys_pipe.c,v 1.20 2001/12/11 18:15:09 jdolecek Exp $");
62 
63 #include <sys/param.h>
64 #include <sys/systm.h>
65 #include <sys/proc.h>
66 #include <sys/fcntl.h>
67 #include <sys/file.h>
68 #include <sys/filedesc.h>
69 #include <sys/filio.h>
70 #include <sys/ttycom.h>
71 #include <sys/stat.h>
72 #include <sys/poll.h>
73 #include <sys/signalvar.h>
74 #include <sys/vnode.h>
75 #include <sys/uio.h>
76 #include <sys/lock.h>
77 #ifdef __FreeBSD__
78 #include <sys/mutex.h>
79 #include <sys/selinfo.h>
80 #include <sys/sysproto.h>
81 #elif defined(__NetBSD__)
82 #include <sys/select.h>
83 #include <sys/malloc.h>
84 #include <sys/mount.h>
85 #include <sys/syscallargs.h>
86 #include <uvm/uvm.h>
87 #include <sys/sysctl.h>
88 #include <sys/kernel.h>
89 #endif /* NetBSD, FreeBSD */
90 
91 #include <sys/pipe.h>
92 
93 #ifdef __NetBSD__
94 /*
95  * Avoid microtime(9), it's slow. We don't guard the read from time(9)
96  * with splclock(9) since we don't actually need to be THAT sure the access
97  * is atomic.
98  */
99 #define vfs_timestamp(tv)	(*(tv) = time)
100 #endif
101 
102 /*
103  * Use this define if you want to disable *fancy* VM things.  Expect an
104  * approx 30% decrease in transfer rate.  This could be useful for
105  * OpenBSD.
106  */
107 /* #define PIPE_NODIRECT */
108 
109 /*
110  * interfaces to the outside world
111  */
112 #ifdef __FreeBSD__
113 static int pipe_read __P((struct file *fp, struct uio *uio,
114 		struct ucred *cred, int flags, struct proc *p));
115 static int pipe_write __P((struct file *fp, struct uio *uio,
116 		struct ucred *cred, int flags, struct proc *p));
117 static int pipe_close __P((struct file *fp, struct proc *p));
118 static int pipe_poll __P((struct file *fp, int events, struct ucred *cred,
119 		struct proc *p));
120 static int pipe_kqfilter __P((struct file *fp, struct knote *kn));
121 static int pipe_stat __P((struct file *fp, struct stat *sb, struct proc *p));
122 static int pipe_ioctl __P((struct file *fp, u_long cmd, caddr_t data, struct proc *p));
123 
124 static struct fileops pipeops = {
125 	pipe_read, pipe_write, pipe_ioctl, pipe_poll, pipe_kqfilter,
126 	pipe_stat, pipe_close
127 };
128 
129 static void	filt_pipedetach(struct knote *kn);
130 static int	filt_piperead(struct knote *kn, long hint);
131 static int	filt_pipewrite(struct knote *kn, long hint);
132 
133 static struct filterops pipe_rfiltops =
134 	{ 1, NULL, filt_pipedetach, filt_piperead };
135 static struct filterops pipe_wfiltops =
136 	{ 1, NULL, filt_pipedetach, filt_pipewrite };
137 #endif /* FreeBSD */
138 
139 #ifdef __NetBSD__
140 static int pipe_read __P((struct file *fp, off_t *offset, struct uio *uio,
141 		struct ucred *cred, int flags));
142 static int pipe_write __P((struct file *fp, off_t *offset, struct uio *uio,
143 		struct ucred *cred, int flags));
144 static int pipe_close __P((struct file *fp, struct proc *p));
145 static int pipe_poll __P((struct file *fp, int events, struct proc *p));
146 static int pipe_fcntl __P((struct file *fp, u_int com, caddr_t data,
147 		struct proc *p));
148 static int pipe_stat __P((struct file *fp, struct stat *sb, struct proc *p));
149 static int pipe_ioctl __P((struct file *fp, u_long cmd, caddr_t data, struct proc *p));
150 
151 static struct fileops pipeops =
152     { pipe_read, pipe_write, pipe_ioctl, pipe_fcntl, pipe_poll,
153       pipe_stat, pipe_close };
154 #endif /* NetBSD */
155 
156 /*
157  * Default pipe buffer size(s); these can be fairly large now because pipe
158  * space is pageable.  The pipe code will try to maintain locality of
159  * reference for performance reasons, so small amounts of outstanding I/O
160  * will not wipe the cache.
161  */
162 #define MINPIPESIZE (PIPE_SIZE/3)
163 #define MAXPIPESIZE (2*PIPE_SIZE/3)
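/*
 * For the usual PIPE_SIZE of 16384 bytes (see sys/pipe.h for the
 * authoritative value), these evaluate to 5461 and 10922 bytes.
 */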
164 
165 /*
166  * Maximum amount of kva for pipes -- this is more-or-less a soft limit,
167  * there so that we don't exhaust kernel virtual memory on large systems.
168  */
169 #define MAXPIPEKVA (8*1024*1024)
170 static int maxpipekva = MAXPIPEKVA;
171 
172 /*
173  * Limit for direct transfers; we cannot, of course, limit
174  * the amount of kva for pipes in general.
175  */
176 #define LIMITPIPEKVA (16*1024*1024)
177 static int limitpipekva = LIMITPIPEKVA;
178 
179 /*
180  * Limit the number of "big" pipes
181  */
182 #define LIMITBIGPIPES  32
183 static int maxbigpipes = LIMITBIGPIPES;
184 static int nbigpipe = 0;
185 
186 /*
187  * Amount of KVA consumed by pipe buffers.
188  */
189 static int amountpipekva = 0;
190 
191 static void pipeclose __P((struct pipe *));
192 static void pipe_free_kmem __P((struct pipe *));
193 static int pipe_create __P((struct pipe **, int));
194 static __inline int pipelock __P((struct pipe *, int));
195 static __inline void pipeunlock __P((struct pipe *));
196 static __inline void pipeselwakeup __P((struct pipe *, struct pipe *));
197 static int pipespace __P((struct pipe *, int));
198 
199 #ifdef __FreeBSD__
200 #ifndef PIPE_NODIRECT
201 static int pipe_build_write_buffer __P((struct pipe *wpipe, struct uio *uio));
202 static void pipe_destroy_write_buffer __P((struct pipe *wpipe));
203 static int pipe_direct_write __P((struct pipe *wpipe, struct uio *uio));
204 static void pipe_clone_write_buffer __P((struct pipe *wpipe));
205 #endif
206 
207 static vm_zone_t pipe_zone;
208 #endif /* FreeBSD */
209 
210 #ifdef __NetBSD__
211 #ifndef PIPE_NODIRECT
212 static int pipe_direct_write __P((struct pipe *, struct uio *));
213 static int pipe_loan_alloc __P((struct pipe *, int));
214 static void pipe_loan_free __P((struct pipe *));
215 #endif /* PIPE_NODIRECT */
216 
217 static struct pool pipe_pool;
218 #endif /* NetBSD */
219 
220 /*
221  * The pipe system call for the DTYPE_PIPE type of pipes
222  */
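/*
 * The userland view, as a minimal sketch (illustrative only, not part
 * of the kernel build): pipe(2) returns two descriptors, fd[0] open
 * for reading and fd[1] open for writing.
 *
 *	#include <string.h>
 *	#include <unistd.h>
 *
 *	int
 *	main(void)
 *	{
 *		int fd[2];
 *		char buf[6];
 *
 *		if (pipe(fd) == -1)
 *			return (1);
 *		(void) write(fd[1], "hello", 6);
 *		(void) read(fd[0], buf, sizeof(buf));
 *		return (strcmp(buf, "hello") != 0);
 *	}
 */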
223 
224 /* ARGSUSED */
225 #ifdef __FreeBSD__
226 int
227 pipe(p, uap)
228 	struct proc *p;
229 	struct pipe_args /* {
230 		int	dummy;
231 	} */ *uap;
232 #elif defined(__NetBSD__)
233 int
234 sys_pipe(p, v, retval)
235 	struct proc *p;
236 	void *v;
237 	register_t *retval;
238 #endif
239 {
240 	struct file *rf, *wf;
241 	struct pipe *rpipe, *wpipe;
242 	int fd, error;
243 
244 #ifdef __FreeBSD__
245 	if (pipe_zone == NULL)
246 		pipe_zone = zinit("PIPE", sizeof(struct pipe), 0, 0, 4);
247 
248 	rpipe = wpipe = NULL;
249 	if (pipe_create(&rpipe, 1) || pipe_create(&wpipe, 1)) {
250 		pipeclose(rpipe);
251 		pipeclose(wpipe);
252 		return (ENFILE);
253 	}
254 
255 	error = falloc(p, &rf, &fd);
256 	if (error) {
257 		pipeclose(rpipe);
258 		pipeclose(wpipe);
259 		return (error);
260 	}
261 	fhold(rf);
262 	p->p_retval[0] = fd;
263 
264 	/*
265 	 * Warning: once we've gotten past allocation of the fd for the
266 	 * read-side, we can only drop the read side via fdrop() in order
267 	 * to avoid races against processes which manage to dup() the read
268 	 * side while we are blocked trying to allocate the write side.
269 	 */
270 	rf->f_flag = FREAD | FWRITE;
271 	rf->f_type = DTYPE_PIPE;
272 	rf->f_data = (caddr_t)rpipe;
273 	rf->f_ops = &pipeops;
274 	error = falloc(p, &wf, &fd);
275 	if (error) {
276 		struct filedesc *fdp = p->p_fd;
277 
278 		if (fdp->fd_ofiles[p->p_retval[0]] == rf) {
279 			fdp->fd_ofiles[p->p_retval[0]] = NULL;
280 			fdrop(rf, p);
281 		}
282 		fdrop(rf, p);
283 		/* rpipe has been closed by fdrop(). */
284 		pipeclose(wpipe);
285 		return (error);
286 	}
287 	wf->f_flag = FREAD | FWRITE;
288 	wf->f_type = DTYPE_PIPE;
289 	wf->f_data = (caddr_t)wpipe;
290 	wf->f_ops = &pipeops;
291 	p->p_retval[1] = fd;
292 
293 	rpipe->pipe_peer = wpipe;
294 	wpipe->pipe_peer = rpipe;
295 	fdrop(rf, p);
296 #endif /* FreeBSD */
297 
298 #ifdef __NetBSD__
299 	rpipe = wpipe = NULL;
300 	if (pipe_create(&rpipe, 1) || pipe_create(&wpipe, 0)) {
301 		pipeclose(rpipe);
302 		pipeclose(wpipe);
303 		return (ENFILE);
304 	}
305 
306 	/*
307 	 * Note: the file structure returned from falloc() is marked
308 	 * as 'larval' initially. Unless we mark it as 'mature' by
309 	 * FILE_SET_MATURE(), any attempt to do anything with it would
310 	 * return EBADF, including e.g. dup(2) or close(2). This avoids
311 	 * file descriptor races if we block in the second falloc().
312 	 */
313 
314 	error = falloc(p, &rf, &fd);
315 	if (error)
316 		goto free2;
317 	retval[0] = fd;
318 	rf->f_flag = FREAD;
319 	rf->f_type = DTYPE_PIPE;
320 	rf->f_data = (caddr_t)rpipe;
321 	rf->f_ops = &pipeops;
322 
323 	error = falloc(p, &wf, &fd);
324 	if (error)
325 		goto free3;
326 	retval[1] = fd;
327 	wf->f_flag = FWRITE;
328 	wf->f_type = DTYPE_PIPE;
329 	wf->f_data = (caddr_t)wpipe;
330 	wf->f_ops = &pipeops;
331 
332 	rpipe->pipe_peer = wpipe;
333 	wpipe->pipe_peer = rpipe;
334 
335 	FILE_SET_MATURE(rf);
336 	FILE_SET_MATURE(wf);
337 	FILE_UNUSE(rf, p);
338 	FILE_UNUSE(wf, p);
339 	return (0);
340 free3:
341 	FILE_UNUSE(rf, p);
342 	ffree(rf);
343 	fdremove(p->p_fd, retval[0]);
344 free2:
345 	pipeclose(wpipe);
346 	pipeclose(rpipe);
347 #endif /* NetBSD */
348 
349 	return (error);
350 }
351 
352 /*
353  * Allocate kva for the pipe circular buffer; the space is pageable.
354  * This routine will 'realloc' the size of a pipe safely: if the
355  * allocation fails, it will retain the old buffer and return
356  * ENOMEM.
357  */
358 static int
359 pipespace(cpipe, size)
360 	struct pipe *cpipe;
361 	int size;
362 {
363 	caddr_t buffer;
364 #ifdef __FreeBSD__
365 	struct vm_object *object;
366 	int npages, error;
367 
368 	npages = round_page(size)/PAGE_SIZE;
369 	/*
370 	 * Create an object, I don't like the idea of paging to/from
371 	 * kernel_object.
372 	 */
373 	mtx_lock(&vm_mtx);
374 	object = vm_object_allocate(OBJT_DEFAULT, npages);
375 	buffer = (caddr_t) vm_map_min(kernel_map);
376 
377 	/*
378 	 * Insert the object into the kernel map, and allocate kva for it.
379 	 * The map entry is, by default, pageable.
380 	 */
381 	error = vm_map_find(kernel_map, object, 0,
382 		(vm_offset_t *) &buffer, size, 1,
383 		VM_PROT_ALL, VM_PROT_ALL, 0);
384 
385 	if (error != KERN_SUCCESS) {
386 		vm_object_deallocate(object);
387 		mtx_unlock(&vm_mtx);
388 		return (ENOMEM);
389 	}
390 #endif /* FreeBSD */
391 
392 #ifdef __NetBSD__
393 	/*
394 	 * Allocate pageable virtual address space. Physical memory is allocated
395 	 * on demand.
396 	 */
397 	buffer = (caddr_t) uvm_km_valloc(kernel_map, round_page(size));
398 	if (buffer == NULL)
399 		return (ENOMEM);
400 #endif /* NetBSD */
401 
402 	/* free old resources if we're resizing */
403 	pipe_free_kmem(cpipe);
404 #ifdef __FreeBSD__
405 	mtx_unlock(&vm_mtx);
406 	cpipe->pipe_buffer.object = object;
407 #endif
408 	cpipe->pipe_buffer.buffer = buffer;
409 	cpipe->pipe_buffer.size = size;
410 	cpipe->pipe_buffer.in = 0;
411 	cpipe->pipe_buffer.out = 0;
412 	cpipe->pipe_buffer.cnt = 0;
413 	amountpipekva += cpipe->pipe_buffer.size;
414 	return (0);
415 }
416 
417 /*
418  * initialize and allocate VM and memory for pipe
419  */
420 static int
421 pipe_create(cpipep, allockva)
422 	struct pipe **cpipep;
423 	int allockva;
424 {
425 	struct pipe *cpipe;
426 	int error;
427 
428 #ifdef __FreeBSD__
429 	*cpipep = zalloc(pipe_zone);
430 #endif
431 #ifdef __NetBSD__
432 	*cpipep = pool_get(&pipe_pool, PR_WAITOK);
433 #endif
434 	if (*cpipep == NULL)
435 		return (ENOMEM);
436 
437 	cpipe = *cpipep;
438 
439 	/* Initialize */
440 	memset(cpipe, 0, sizeof(*cpipe));
441 	cpipe->pipe_state = PIPE_SIGNALR;
442 
443 	if (allockva && (error = pipespace(cpipe, PIPE_SIZE)))
444 		return (error);
445 
446 	vfs_timestamp(&cpipe->pipe_ctime);
447 	cpipe->pipe_atime = cpipe->pipe_ctime;
448 	cpipe->pipe_mtime = cpipe->pipe_ctime;
449 #ifdef __NetBSD__
450 	cpipe->pipe_pgid = NO_PID;
451 	lockinit(&cpipe->pipe_lock, PRIBIO | PCATCH, "pipelk", 0, 0);
452 #endif
453 
454 	return (0);
455 }
456 
457 
458 /*
459  * lock a pipe for I/O, blocking other access
460  */
461 static __inline int
462 pipelock(cpipe, catch)
463 	struct pipe *cpipe;
464 	int catch;
465 {
466 	int error;
467 
468 #ifdef __FreeBSD__
469 	while (cpipe->pipe_state & PIPE_LOCK) {
470 		cpipe->pipe_state |= PIPE_LWANT;
471 		error = tsleep(cpipe, catch ? (PRIBIO | PCATCH) : PRIBIO,
472 		    "pipelk", 0);
473 		if (error != 0)
474 			return (error);
475 	}
476 	cpipe->pipe_state |= PIPE_LOCK;
477 	return (0);
478 #endif
479 
480 #ifdef __NetBSD__
481 	do {
482 		error = lockmgr(&cpipe->pipe_lock, LK_EXCLUSIVE, NULL);
483 	} while (!catch && (error == EINTR || error == ERESTART));
484 	return (error);
485 #endif
486 }
487 
488 /*
489  * unlock a pipe I/O lock
490  */
491 static __inline void
492 pipeunlock(cpipe)
493 	struct pipe *cpipe;
494 {
495 #ifdef __FreeBSD__
496 	cpipe->pipe_state &= ~PIPE_LOCK;
497 	if (cpipe->pipe_state & PIPE_LWANT) {
498 		cpipe->pipe_state &= ~PIPE_LWANT;
499 		wakeup(cpipe);
500 	}
501 #endif
502 
503 #ifdef __NetBSD__
504 	lockmgr(&cpipe->pipe_lock, LK_RELEASE, NULL);
505 #endif
506 }
507 
508 /*
509  * Select/poll wakeup.  This also sends SIGIO to the peer connected
510  * to the 'sigp' side of the pipe.
511  */
512 static __inline void
513 pipeselwakeup(selp, sigp)
514 	struct pipe *selp, *sigp;
515 {
516 	if (selp->pipe_state & PIPE_SEL) {
517 		selp->pipe_state &= ~PIPE_SEL;
518 		selwakeup(&selp->pipe_sel);
519 	}
520 #ifdef __FreeBSD__
521 	if (sigp && (sigp->pipe_state & PIPE_ASYNC) && sigp->pipe_sigio)
522 		pgsigio(sigp->pipe_sigio, SIGIO, 0);
523 	KNOTE(&selp->pipe_sel.si_note, 0);
524 #endif
525 
526 #ifdef __NetBSD__
527 	if (sigp && (sigp->pipe_state & PIPE_ASYNC)
528 	    && sigp->pipe_pgid != NO_PID){
529 		struct proc *p;
530 
531 		if (sigp->pipe_pgid < 0)
532 			gsignal(-sigp->pipe_pgid, SIGIO);
533 		else if (sigp->pipe_pgid > 0 && (p = pfind(sigp->pipe_pgid)) != NULL)
534 			psignal(p, SIGIO);
535 	}
536 #endif /* NetBSD */
537 }
538 
539 /* ARGSUSED */
540 #ifdef __FreeBSD__
541 static int
542 pipe_read(fp, uio, cred, flags, p)
543 	struct file *fp;
544 	struct uio *uio;
545 	struct ucred *cred;
546 	int flags;
547 	struct proc *p;
548 #elif defined(__NetBSD__)
549 static int
550 pipe_read(fp, offset, uio, cred, flags)
551 	struct file *fp;
552 	off_t *offset;
553 	struct uio *uio;
554 	struct ucred *cred;
555 	int flags;
556 #endif
557 {
558 	struct pipe *rpipe = (struct pipe *) fp->f_data;
559 	int error;
560 	size_t nread = 0;
561 	size_t size;
562 	size_t ocnt;
563 
564 	++rpipe->pipe_busy;
565 	error = pipelock(rpipe, 1);
566 	if (error)
567 		goto unlocked_error;
568 
569 	ocnt = rpipe->pipe_buffer.cnt;
570 
571 	while (uio->uio_resid) {
572 		/*
573 		 * normal pipe buffer receive
574 		 */
575 		if (rpipe->pipe_buffer.cnt > 0) {
576 			size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
577 			if (size > rpipe->pipe_buffer.cnt)
578 				size = rpipe->pipe_buffer.cnt;
579 			if (size > uio->uio_resid)
580 				size = uio->uio_resid;
581 
582 			error = uiomove(&rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
583 					size, uio);
584 			if (error)
585 				break;
586 
587 			rpipe->pipe_buffer.out += size;
588 			if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
589 				rpipe->pipe_buffer.out = 0;
590 
591 			rpipe->pipe_buffer.cnt -= size;
592 
593 			/*
594 			 * If there is no more to read in the pipe, reset
595 			 * its pointers to the beginning.  This improves
596 			 * cache hit stats.
597 			 */
598 			if (rpipe->pipe_buffer.cnt == 0) {
599 				rpipe->pipe_buffer.in = 0;
600 				rpipe->pipe_buffer.out = 0;
601 			}
602 			nread += size;
603 #ifndef PIPE_NODIRECT
604 		/*
605 		 * Direct copy, bypassing a kernel buffer.
606 		 */
607 		} else if ((size = rpipe->pipe_map.cnt) &&
608 			   (rpipe->pipe_state & PIPE_DIRECTW)) {
609 			caddr_t	va;
610 			if (size > uio->uio_resid)
611 				size = uio->uio_resid;
612 
613 			va = (caddr_t) rpipe->pipe_map.kva +
614 			    rpipe->pipe_map.pos;
615 			error = uiomove(va, size, uio);
616 			if (error)
617 				break;
618 			nread += size;
619 			rpipe->pipe_map.pos += size;
620 			rpipe->pipe_map.cnt -= size;
621 			if (rpipe->pipe_map.cnt == 0) {
622 				rpipe->pipe_state &= ~PIPE_DIRECTW;
623 				wakeup(rpipe);
624 			}
625 #endif
626 		} else {
627 			/*
628 			 * detect EOF condition
629 			 * read returns 0 on EOF, no need to set error
630 			 */
631 			if (rpipe->pipe_state & PIPE_EOF)
632 				break;
633 
634 			/*
635 			 * If the "write-side" has been blocked, wake it up now.
636 			 */
637 			if (rpipe->pipe_state & PIPE_WANTW) {
638 				rpipe->pipe_state &= ~PIPE_WANTW;
639 				wakeup(rpipe);
640 			}
641 
642 			/*
643 			 * Break if some data was read.
644 			 */
645 			if (nread > 0)
646 				break;
647 
648 			/*
649 			 * don't block on non-blocking I/O
650 			 */
651 			if (fp->f_flag & FNONBLOCK) {
652 				error = EAGAIN;
653 				break;
654 			}
655 
656 			/*
657 			 * Unlock the pipe buffer for our remaining processing.
658 			 * We will either break out with an error or we will
659 			 * sleep and relock to loop.
660 			 */
661 			pipeunlock(rpipe);
662 
663 			/*
664 			 * We want to read more, wake up select/poll.
665 			 */
666 			pipeselwakeup(rpipe, rpipe->pipe_peer);
667 
668 			rpipe->pipe_state |= PIPE_WANTR;
669 			error = tsleep(rpipe, PRIBIO | PCATCH, "piperd", 0);
670 			if (error != 0 || (error = pipelock(rpipe, 1)))
671 				goto unlocked_error;
672 		}
673 	}
674 	pipeunlock(rpipe);
675 
676 	if (error == 0)
677 		vfs_timestamp(&rpipe->pipe_atime);
678 unlocked_error:
679 	--rpipe->pipe_busy;
680 
681 	/*
682 	 * PIPE_WANTCLOSE processing only makes sense if pipe_busy is 0.
683 	 */
684 	if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANTCLOSE)) {
685 		rpipe->pipe_state &= ~(PIPE_WANTCLOSE|PIPE_WANTW);
686 		wakeup(rpipe);
687 	} else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) {
688 		/*
689 		 * Handle write blocking hysteresis.
690 		 */
691 		if (rpipe->pipe_state & PIPE_WANTW) {
692 			rpipe->pipe_state &= ~PIPE_WANTW;
693 			wakeup(rpipe);
694 		}
695 	}
696 
697 	/*
698 	 * If anything was read off the buffer, signal the writer that it's
699 	 * now possible to write more data.  Also send the signal if this is
700 	 * the first read after the last write.
701 	 */
702 	if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF
703 	    && (ocnt != rpipe->pipe_buffer.cnt || (rpipe->pipe_state & PIPE_SIGNALR))) {
704 		pipeselwakeup(rpipe, rpipe->pipe_peer);
705 		rpipe->pipe_state &= ~PIPE_SIGNALR;
706 	}
707 
708 	return (error);
709 }
710 
711 #ifdef __FreeBSD__
712 #ifndef PIPE_NODIRECT
713 /*
714  * Map the sending process's buffer into kernel space and wire it.
715  * This is similar to a physical write operation.
716  */
717 static int
718 pipe_build_write_buffer(wpipe, uio)
719 	struct pipe *wpipe;
720 	struct uio *uio;
721 {
722 	size_t size;
723 	int i;
724 	vm_offset_t addr, endaddr, paddr;
725 
726 	size = uio->uio_iov->iov_len;
727 	if (size > wpipe->pipe_buffer.size)
728 		size = wpipe->pipe_buffer.size;
729 
730 	endaddr = round_page((vm_offset_t)uio->uio_iov->iov_base + size);
731 	mtx_lock(&vm_mtx);
732 	addr = trunc_page((vm_offset_t)uio->uio_iov->iov_base);
733 	for (i = 0; addr < endaddr; addr += PAGE_SIZE, i++) {
734 		vm_page_t m;
735 
736 		if (vm_fault_quick((caddr_t)addr, VM_PROT_READ) < 0 ||
737 		    (paddr = pmap_kextract(addr)) == 0) {
738 			int j;
739 
740 			for (j = 0; j < i; j++)
741 				vm_page_unwire(wpipe->pipe_map.ms[j], 1);
742 			mtx_unlock(&vm_mtx);
743 			return (EFAULT);
744 		}
745 
746 		m = PHYS_TO_VM_PAGE(paddr);
747 		vm_page_wire(m);
748 		wpipe->pipe_map.ms[i] = m;
749 	}
750 
751 /*
752  * set up the control block
753  */
754 	wpipe->pipe_map.npages = i;
755 	wpipe->pipe_map.pos =
756 	    ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK;
757 	wpipe->pipe_map.cnt = size;
758 
759 /*
760  * and map the buffer
761  */
762 	if (wpipe->pipe_map.kva == 0) {
763 		/*
764 		 * We need to allocate space for an extra page because the
765 		 * address range might (will) span pages at times.
766 		 */
767 		wpipe->pipe_map.kva = kmem_alloc_pageable(kernel_map,
768 			wpipe->pipe_buffer.size + PAGE_SIZE);
769 		amountpipekva += wpipe->pipe_buffer.size + PAGE_SIZE;
770 	}
771 	pmap_qenter(wpipe->pipe_map.kva, wpipe->pipe_map.ms,
772 		wpipe->pipe_map.npages);
773 
774 	mtx_unlock(&vm_mtx);
775 /*
776  * and update the uio data
777  */
778 
779 	uio->uio_iov->iov_len -= size;
780 	uio->uio_iov->iov_base += size;
781 	if (uio->uio_iov->iov_len == 0)
782 		uio->uio_iov++;
783 	uio->uio_resid -= size;
784 	uio->uio_offset += size;
785 	return (0);
786 }
787 
788 /*
789  * unmap and unwire the process buffer
790  */
791 static void
792 pipe_destroy_write_buffer(wpipe)
793 	struct pipe *wpipe;
794 {
795 	int i;
796 
797 	mtx_lock(&vm_mtx);
798 	if (wpipe->pipe_map.kva) {
799 		pmap_qremove(wpipe->pipe_map.kva, wpipe->pipe_map.npages);
800 
801 		if (amountpipekva > maxpipekva) {
802 			vm_offset_t kva = wpipe->pipe_map.kva;
803 			wpipe->pipe_map.kva = 0;
804 			kmem_free(kernel_map, kva,
805 				wpipe->pipe_buffer.size + PAGE_SIZE);
806 			amountpipekva -= wpipe->pipe_buffer.size + PAGE_SIZE;
807 		}
808 	}
809 	for (i = 0; i < wpipe->pipe_map.npages; i++)
810 		vm_page_unwire(wpipe->pipe_map.ms[i], 1);
811 	mtx_unlock(&vm_mtx);
812 }
813 
814 /*
815  * In the case of a signal, the writing process might go away.  This
816  * code copies the data into the circular buffer so that the source
817  * pages can be freed without loss of data.
818  */
819 static void
820 pipe_clone_write_buffer(wpipe)
821 	struct pipe *wpipe;
822 {
823 	int size;
824 	int pos;
825 
826 	size = wpipe->pipe_map.cnt;
827 	pos = wpipe->pipe_map.pos;
828 	memcpy((caddr_t) wpipe->pipe_buffer.buffer,
829 	    (caddr_t) wpipe->pipe_map.kva + pos, size);
830 
831 	wpipe->pipe_buffer.in = size;
832 	wpipe->pipe_buffer.out = 0;
833 	wpipe->pipe_buffer.cnt = size;
834 	wpipe->pipe_state &= ~PIPE_DIRECTW;
835 
836 	pipe_destroy_write_buffer(wpipe);
837 }
838 
839 /*
840  * This implements the pipe buffer write mechanism.  Note that only
841  * a direct write OR a normal pipe write can be pending at any given time.
842  * If there are any characters in the pipe buffer, the direct write will
843  * be deferred until the receiving process grabs all of the bytes from
844  * the pipe buffer.  Then the direct mapping write is set-up.
845  */
846 static int
847 pipe_direct_write(wpipe, uio)
848 	struct pipe *wpipe;
849 	struct uio *uio;
850 {
851 	int error;
852 
853 retry:
854 	while (wpipe->pipe_state & PIPE_DIRECTW) {
855 		if (wpipe->pipe_state & PIPE_WANTR) {
856 			wpipe->pipe_state &= ~PIPE_WANTR;
857 			wakeup(wpipe);
858 		}
859 		wpipe->pipe_state |= PIPE_WANTW;
860 		error = tsleep(wpipe, PRIBIO | PCATCH, "pipdww", 0);
861 		if (error)
862 			goto error1;
863 		if (wpipe->pipe_state & PIPE_EOF) {
864 			error = EPIPE;
865 			goto error1;
866 		}
867 	}
868 	wpipe->pipe_map.cnt = 0;	/* transfer not ready yet */
869 	if (wpipe->pipe_buffer.cnt > 0) {
870 		if (wpipe->pipe_state & PIPE_WANTR) {
871 			wpipe->pipe_state &= ~PIPE_WANTR;
872 			wakeup(wpipe);
873 		}
874 
875 		wpipe->pipe_state |= PIPE_WANTW;
876 		error = tsleep(wpipe, PRIBIO | PCATCH, "pipdwc", 0);
877 		if (error)
878 			goto error1;
879 		if (wpipe->pipe_state & PIPE_EOF) {
880 			error = EPIPE;
881 			goto error1;
882 		}
883 		goto retry;
884 	}
885 
886 	wpipe->pipe_state |= PIPE_DIRECTW;
887 
888 	error = pipe_build_write_buffer(wpipe, uio);
889 	if (error) {
890 		wpipe->pipe_state &= ~PIPE_DIRECTW;
891 		goto error1;
892 	}
893 
894 	error = 0;
895 	while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
896 		if (wpipe->pipe_state & PIPE_EOF) {
897 			pipelock(wpipe, 0);
898 			pipe_destroy_write_buffer(wpipe);
899 			pipeunlock(wpipe);
900 			pipeselwakeup(wpipe, wpipe);
901 			error = EPIPE;
902 			goto error1;
903 		}
904 		if (wpipe->pipe_state & PIPE_WANTR) {
905 			wpipe->pipe_state &= ~PIPE_WANTR;
906 			wakeup(wpipe);
907 		}
908 		pipeselwakeup(wpipe, wpipe);
909 		error = tsleep(wpipe, PRIBIO | PCATCH, "pipdwt", 0);
910 	}
911 
912 	pipelock(wpipe,0);
913 	if (wpipe->pipe_state & PIPE_DIRECTW) {
914 		/*
915 		 * this bit of trickery substitutes a kernel buffer for
916 		 * the process that might be going away.
917 		 */
918 		pipe_clone_write_buffer(wpipe);
919 	} else {
920 		pipe_destroy_write_buffer(wpipe);
921 	}
922 	pipeunlock(wpipe);
923 	return (error);
924 
925 error1:
926 	wakeup(wpipe);
927 	return (error);
928 }
929 #endif /* !PIPE_NODIRECT */
930 #endif /* FreeBSD */
931 
932 #ifdef __NetBSD__
933 #ifndef PIPE_NODIRECT
934 /*
935  * Allocate structure for loan transfer.
936  */
937 static int
938 pipe_loan_alloc(wpipe, npages)
939 	struct pipe *wpipe;
940 	int npages;
941 {
942 	vsize_t len;
943 
944 	len = (vsize_t)npages << PAGE_SHIFT;
945 	wpipe->pipe_map.kva = uvm_km_valloc_wait(kernel_map, len);
946 	if (wpipe->pipe_map.kva == 0)
947 		return (ENOMEM);
948 
949 	amountpipekva += len;
950 	wpipe->pipe_map.npages = npages;
951 	wpipe->pipe_map.pgs = malloc(npages * sizeof(struct vm_page *), M_PIPE,
952 	    M_WAITOK);
953 	return (0);
954 }
955 
956 /*
957  * Free resources allocated for loan transfer.
958  */
959 static void
960 pipe_loan_free(wpipe)
961 	struct pipe *wpipe;
962 {
963 	vsize_t len;
964 
965 	len = (vsize_t)wpipe->pipe_map.npages << PAGE_SHIFT;
966 	pmap_kremove(wpipe->pipe_map.kva, len);
967 	uvm_km_free(kernel_map, wpipe->pipe_map.kva, len);
968 	wpipe->pipe_map.kva = 0;
969 	amountpipekva -= len;
970 	free(wpipe->pipe_map.pgs, M_PIPE);
971 	wpipe->pipe_map.pgs = NULL;
972 }
973 
974 /*
975  * NetBSD direct write, using uvm_loan() mechanism.
976  * This implements the pipe buffer write mechanism.  Note that only
977  * a direct write OR a normal pipe write can be pending at any given time.
978  * If there are any characters in the pipe buffer, the direct write will
979  * be deferred until the receiving process grabs all of the bytes from
980  * the pipe buffer.  Then the direct mapping write is set-up.
981  */
982 static int
983 pipe_direct_write(wpipe, uio)
984 	struct pipe *wpipe;
985 	struct uio *uio;
986 {
987 	int error, npages, j;
988 	struct vm_page **pgs;
989 	vaddr_t bbase, kva, base, bend;
990 	vsize_t blen, bcnt;
991 	voff_t bpos;
992 
993 retry:
994 	while (wpipe->pipe_state & PIPE_DIRECTW) {
995 		if (wpipe->pipe_state & PIPE_WANTR) {
996 			wpipe->pipe_state &= ~PIPE_WANTR;
997 			wakeup(wpipe);
998 		}
999 		wpipe->pipe_state |= PIPE_WANTW;
1000 		error = tsleep(wpipe, PRIBIO | PCATCH, "pipdww", 0);
1001 		if (error)
1002 			goto error;
1003 		if (wpipe->pipe_state & PIPE_EOF) {
1004 			error = EPIPE;
1005 			goto error;
1006 		}
1007 	}
1008 	wpipe->pipe_map.cnt = 0;	/* transfer not ready yet */
1009 	if (wpipe->pipe_buffer.cnt > 0) {
1010 		if (wpipe->pipe_state & PIPE_WANTR) {
1011 			wpipe->pipe_state &= ~PIPE_WANTR;
1012 			wakeup(wpipe);
1013 		}
1014 
1015 		wpipe->pipe_state |= PIPE_WANTW;
1016 		error = tsleep(wpipe, PRIBIO | PCATCH, "pipdwc", 0);
1017 		if (error)
1018 			goto error;
1019 		if (wpipe->pipe_state & PIPE_EOF) {
1020 			error = EPIPE;
1021 			goto error;
1022 		}
1023 		goto retry;
1024 	}
1025 
1026 	/*
1027 	 * Handle the first PIPE_DIRECT_CHUNK bytes of buffer. Deal with buffers
1028 	 * not aligned to PAGE_SIZE.
1029 	 */
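	/*
	 * Worked example, assuming 4 KB pages: for iov_base == 0x1234 and
	 * iov_len == 0x5000, base == 0x1000, bend == 0x7000, blen == 0x6000
	 * (six pages) and bpos == 0x234; as long as blen does not exceed
	 * PIPE_DIRECT_CHUNK, bcnt is the full 0x5000 bytes.
	 */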
1030 	bbase = (vaddr_t)uio->uio_iov->iov_base;
1031 	base = trunc_page(bbase);
1032 	bend = round_page(bbase + uio->uio_iov->iov_len);
1033 	blen = bend - base;
1034 	bpos = bbase - base;
1035 
1036 	if (blen > PIPE_DIRECT_CHUNK) {
1037 		blen = PIPE_DIRECT_CHUNK;
1038 		bend = base + blen;
1039 		bcnt = PIPE_DIRECT_CHUNK - bpos;
1040 	} else {
1041 		bcnt = uio->uio_iov->iov_len;
1042 	}
1043 	npages = blen >> PAGE_SHIFT;
1044 
1045 	wpipe->pipe_map.pos = bpos;
1046 	wpipe->pipe_map.cnt = bcnt;
1047 
1048 	/*
1049 	 * Free the old kva if we need more pages than we have
1050 	 * allocated.
1051 	 */
1052 	if (wpipe->pipe_map.kva && npages > wpipe->pipe_map.npages)
1053 		pipe_loan_free(wpipe);
1054 
1055 	/* Allocate new kva. */
1056 	if (wpipe->pipe_map.kva == 0) {
1057 		error = pipe_loan_alloc(wpipe, npages);
1058 		if (error) {
1059 			goto error;
1060 		}
1061 	}
1062 
1063 	/* Loan the write buffer memory from writer process */
1064 	pgs = wpipe->pipe_map.pgs;
1065 	error = uvm_loan(&uio->uio_procp->p_vmspace->vm_map, base, blen,
1066 	    pgs, UVM_LOAN_TOPAGE);
1067 	if (error) {
1068 		pgs = NULL;
1069 		goto cleanup;
1070 	}
1071 
1072 	/* Enter the loaned pages to kva */
1073 	kva = wpipe->pipe_map.kva;
1074 	for (j = 0; j < npages; j++, kva += PAGE_SIZE) {
1075 		pmap_kenter_pa(kva, VM_PAGE_TO_PHYS(pgs[j]), VM_PROT_READ);
1076 	}
1077 	pmap_update(pmap_kernel());
1078 
1079 	wpipe->pipe_state |= PIPE_DIRECTW;
1080 	while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
1081 		if (wpipe->pipe_state & PIPE_EOF) {
1082 			error = EPIPE;
1083 			break;
1084 		}
1085 		if (wpipe->pipe_state & PIPE_WANTR) {
1086 			wpipe->pipe_state &= ~PIPE_WANTR;
1087 			wakeup(wpipe);
1088 		}
1089 		pipeselwakeup(wpipe, wpipe);
1090 		error = tsleep(wpipe, PRIBIO | PCATCH, "pipdwt", 0);
1091 	}
1092 
1093 	if (error)
1094 		wpipe->pipe_state &= ~PIPE_DIRECTW;
1095 
1096 cleanup:
1097 	pipelock(wpipe, 0);
1098 	if (pgs != NULL)
1099 		uvm_unloan(pgs, npages, UVM_LOAN_TOPAGE);
1100 	if (error || amountpipekva > maxpipekva)
1101 		pipe_loan_free(wpipe);
1102 	pipeunlock(wpipe);
1103 
1104 	if (error) {
1105 		pipeselwakeup(wpipe, wpipe);
1106 
1107 		/*
1108 		 * If nothing was read from what we offered, return the error
1109 		 * straight away.  Otherwise update the uio residual first; the
1110 		 * caller will deal with the error condition, returning a short
1111 		 * write, an error, or restarting the write(2) as appropriate.
1112 		 */
1113 		if (wpipe->pipe_map.cnt == bcnt) {
1114 error:
1115 			wakeup(wpipe);
1116 			return (error);
1117 		}
1118 
1119 		bcnt -= wpipe->pipe_map.cnt;
1120 	}
1121 
1122 	uio->uio_resid -= bcnt;
1123 	/* uio_offset not updated, not set/used for write(2) */
1124 	uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + bcnt;
1125 	uio->uio_iov->iov_len -= bcnt;
1126 	if (uio->uio_iov->iov_len == 0) {
1127 		uio->uio_iov++;
1128 		uio->uio_iovcnt--;
1129 	}
1130 
1131 	return (error);
1132 }
1133 #endif /* !PIPE_NODIRECT */
1134 #endif /* NetBSD */
1135 
1136 #ifdef __FreeBSD__
1137 static int
1138 pipe_write(fp, uio, cred, flags, p)
1139 	struct file *fp;
1141 	struct uio *uio;
1142 	struct ucred *cred;
1143 	int flags;
1144 	struct proc *p;
1145 #elif defined(__NetBSD__)
1146 static int
1147 pipe_write(fp, offset, uio, cred, flags)
1148 	struct file *fp;
1149 	off_t *offset;
1150 	struct uio *uio;
1151 	struct ucred *cred;
1152 	int flags;
1153 #endif
1154 {
1155 	int error = 0;
1156 	struct pipe *wpipe, *rpipe;
1157 
1158 	rpipe = (struct pipe *) fp->f_data;
1159 	wpipe = rpipe->pipe_peer;
1160 
1161 	/*
1162 	 * detect loss of pipe read side, issue SIGPIPE if lost.
1163 	 */
1164 	if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF))
1165 		return (EPIPE);
1166 
1167 	++wpipe->pipe_busy;
1168 
1169 	/*
1170 	 * If it is advantageous to resize the pipe buffer, do
1171 	 * so.
1172 	 */
1173 	if ((uio->uio_resid > PIPE_SIZE) &&
1174 		(nbigpipe < maxbigpipes) &&
1175 #ifndef PIPE_NODIRECT
1176 		(wpipe->pipe_state & PIPE_DIRECTW) == 0 &&
1177 #endif
1178 		(wpipe->pipe_buffer.size <= PIPE_SIZE) &&
1179 		(wpipe->pipe_buffer.cnt == 0)) {
1180 
1181 		if ((error = pipelock(wpipe,1)) == 0) {
1182 			if (pipespace(wpipe, BIG_PIPE_SIZE) == 0)
1183 				nbigpipe++;
1184 			pipeunlock(wpipe);
1185 		} else {
1186 			/*
1187 			 * If an error occurred, unbusy and return, waking up
1188 			 * any waiting readers.
1189 			 */
1190 			--wpipe->pipe_busy;
1191 			if (wpipe->pipe_busy == 0
1192 			    && (wpipe->pipe_state & PIPE_WANTCLOSE)) {
1193 				wpipe->pipe_state &=
1194 				    ~(PIPE_WANTCLOSE | PIPE_WANTR);
1195 				wakeup(wpipe);
1196 			}
1197 
1198 			return (error);
1199 		}
1200 	}
1201 
1202 #ifdef __FreeBSD__
1203 	KASSERT(wpipe->pipe_buffer.buffer != NULL, ("pipe buffer gone"));
1204 #endif
1205 
1206 	while (uio->uio_resid) {
1207 		int space;
1208 
1209 #ifndef PIPE_NODIRECT
1210 		/*
1211 		 * If the transfer is large, we can gain performance if
1212 		 * we do process-to-process copies directly.
1213 		 * If the write is non-blocking, we don't use the
1214 		 * direct write mechanism.
1215 		 *
1216 		 * The direct write mechanism will detect the reader going
1217 		 * away on us.
1218 		 */
1219 		if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) &&
1220 		    (fp->f_flag & FNONBLOCK) == 0 &&
1221 		    (wpipe->pipe_map.kva || (amountpipekva < limitpipekva))) {
1222 			error = pipe_direct_write(wpipe, uio);
1223 
1224 			/*
1225 			 * Break out if an error occurred, unless it's ENOMEM.
1226 			 * ENOMEM means we failed to allocate some resources
1227 			 * for the direct write, so we just fall back to the
1228 			 * ordinary write. If the direct write was successful,
1229 			 * process the rest of the data via ordinary write.
1230 			 */
1231 			if (!error)
1232 				continue;
1233 
1234 			if (error != ENOMEM)
1235 				break;
1236 		}
1237 #endif /* PIPE_NODIRECT */
1238 
1239 		/*
1240 		 * Pipe buffered writes cannot be coincident with
1241 		 * direct writes.  We wait until the currently executing
1242 		 * direct write is completed before we start filling the
1243 		 * pipe buffer.  We break out if a signal occurs or the
1244 		 * reader goes away.
1245 		 */
1246 	retrywrite:
1247 		while (wpipe->pipe_state & PIPE_DIRECTW) {
1248 			if (wpipe->pipe_state & PIPE_WANTR) {
1249 				wpipe->pipe_state &= ~PIPE_WANTR;
1250 				wakeup(wpipe);
1251 			}
1252 			error = tsleep(wpipe, PRIBIO | PCATCH, "pipbww", 0);
1253 			if (wpipe->pipe_state & PIPE_EOF)
1254 				break;
1255 			if (error)
1256 				break;
1257 		}
1258 		if (wpipe->pipe_state & PIPE_EOF) {
1259 			error = EPIPE;
1260 			break;
1261 		}
1262 
1263 		space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
1264 
1265 		/* Writes of size <= PIPE_BUF must be atomic. */
1266 		if ((space < uio->uio_resid) && (uio->uio_resid <= PIPE_BUF))
1267 			space = 0;
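		/*
		 * E.g. if only 100 bytes are free and the caller wants 512
		 * (with PIPE_BUF at its historic minimum of 512), we pretend
		 * the buffer is full and sleep below, rather than splitting
		 * the write and losing its atomicity.
		 */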
1268 
1269 		if (space > 0) {
1270 			int size;	/* Transfer size */
1271 			int segsize;	/* first segment to transfer */
1272 
1273 			if ((error = pipelock(wpipe,1)) != 0)
1274 				break;
1275 
1276 			/*
1277 			 * It is possible for a direct write to
1278 			 * slip in on us... handle it here...
1279 			 */
1280 			if (wpipe->pipe_state & PIPE_DIRECTW) {
1281 				pipeunlock(wpipe);
1282 				goto retrywrite;
1283 			}
1284 			/*
1285 			 * If a process blocked in uiomove, our
1286 			 * value for space might be bad.
1287 			 *
1288 			 * XXX will we be ok if the reader has gone
1289 			 * away here?
1290 			 */
1291 			if (space > wpipe->pipe_buffer.size -
1292 				    wpipe->pipe_buffer.cnt) {
1293 				pipeunlock(wpipe);
1294 				goto retrywrite;
1295 			}
1296 
1297 			/*
1298 			 * Transfer size is minimum of uio transfer
1299 			 * and free space in pipe buffer.
1300 			 */
1301 			if (space > uio->uio_resid)
1302 				size = uio->uio_resid;
1303 			else
1304 				size = space;
1305 			/*
1306 			 * First segment to transfer is minimum of
1307 			 * transfer size and contiguous space in
1308 			 * pipe buffer.  If first segment to transfer
1309 			 * is less than the transfer size, we've got
1310 			 * a wraparound in the buffer.
1311 			 */
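			/*
			 * Worked example: with size == 300,
			 * pipe_buffer.size == 16384 and pipe_buffer.in ==
			 * 16300, segsize is 84 and the remaining 216 bytes
			 * are copied to the start of the buffer by the
			 * second uiomove() below.
			 */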
1312 			segsize = wpipe->pipe_buffer.size -
1313 				wpipe->pipe_buffer.in;
1314 			if (segsize > size)
1315 				segsize = size;
1316 
1317 			/* Transfer first segment */
1318 
1319 			error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
1320 						segsize, uio);
1321 
1322 			if (error == 0 && segsize < size) {
1323 				/*
1324 				 * Transfer remaining part now, to
1325 				 * support atomic writes.  Wraparound
1326 				 * happened.
1327 				 */
1328 #ifdef DEBUG
1329 				if (wpipe->pipe_buffer.in + segsize !=
1330 				    wpipe->pipe_buffer.size)
1331 					panic("Expected pipe buffer wraparound disappeared");
1332 #endif
1333 
1334 				error = uiomove(&wpipe->pipe_buffer.buffer[0],
1335 						size - segsize, uio);
1336 			}
1337 			if (error == 0) {
1338 				wpipe->pipe_buffer.in += size;
1339 				if (wpipe->pipe_buffer.in >=
1340 				    wpipe->pipe_buffer.size) {
1341 #ifdef DEBUG
1342 					if (wpipe->pipe_buffer.in != size - segsize + wpipe->pipe_buffer.size)
1343 						panic("Expected wraparound bad");
1344 #endif
1345 					wpipe->pipe_buffer.in = size - segsize;
1346 				}
1347 
1348 				wpipe->pipe_buffer.cnt += size;
1349 #ifdef DEBUG
1350 				if (wpipe->pipe_buffer.cnt > wpipe->pipe_buffer.size)
1351 					panic("Pipe buffer overflow");
1352 #endif
1353 			}
1354 			pipeunlock(wpipe);
1355 			if (error)
1356 				break;
1357 		} else {
1358 			/*
1359 			 * If the "read-side" has been blocked, wake it up now.
1360 			 */
1361 			if (wpipe->pipe_state & PIPE_WANTR) {
1362 				wpipe->pipe_state &= ~PIPE_WANTR;
1363 				wakeup(wpipe);
1364 			}
1365 
1366 			/*
1367 			 * don't block on non-blocking I/O
1368 			 */
1369 			if (fp->f_flag & FNONBLOCK) {
1370 				error = EAGAIN;
1371 				break;
1372 			}
1373 
1374 			/*
1375 			 * We have no more space and have something to offer,
1376 			 * wake up select/poll.
1377 			 */
1378 			pipeselwakeup(wpipe, wpipe);
1379 
1380 			wpipe->pipe_state |= PIPE_WANTW;
1381 			error = tsleep(wpipe, PRIBIO | PCATCH, "pipewr", 0);
1382 			if (error != 0)
1383 				break;
1384 			/*
1385 			 * If the read side wants to go away, return EPIPE;
1386 			 * the caller turns that into a SIGPIPE for us.
1387 			 */
1388 			if (wpipe->pipe_state & PIPE_EOF) {
1389 				error = EPIPE;
1390 				break;
1391 			}
1392 		}
1393 	}
1394 
1395 	--wpipe->pipe_busy;
1396 	if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANTCLOSE)) {
1397 		wpipe->pipe_state &= ~(PIPE_WANTCLOSE | PIPE_WANTR);
1398 		wakeup(wpipe);
1399 	} else if (wpipe->pipe_buffer.cnt > 0) {
1400 		/*
1401 		 * If we have put any characters in the buffer, we wake up
1402 		 * the reader.
1403 		 */
1404 		if (wpipe->pipe_state & PIPE_WANTR) {
1405 			wpipe->pipe_state &= ~PIPE_WANTR;
1406 			wakeup(wpipe);
1407 		}
1408 	}
1409 
1410 	/*
1411 	 * Don't return EPIPE if I/O was successful
1412 	 */
1413 	if ((error == EPIPE) && (wpipe->pipe_buffer.cnt == 0)
1414 	    && (uio->uio_resid == 0))
1415 		error = 0;
1416 
1417 	if (error == 0)
1418 		vfs_timestamp(&wpipe->pipe_mtime);
1419 
1420 	/*
1421 	 * We have something to offer, wake up select/poll.
1422 	 * wpipe->pipe_map.cnt is always 0 at this point (direct write
1423 	 * is only done synchronously), so check only wpipe->pipe_buffer.cnt.
1424 	 */
1425 	if (wpipe->pipe_buffer.cnt)
1426 		pipeselwakeup(wpipe, wpipe);
1427 
1428 	/*
1429 	 * Arrange for next read(2) to do a signal.
1430 	 */
1431 	wpipe->pipe_state |= PIPE_SIGNALR;
1432 
1433 	return (error);
1434 }
1435 
1436 /*
1437  * we implement a very minimal set of ioctls for compatibility with sockets.
1438  */
1439 int
1440 pipe_ioctl(fp, cmd, data, p)
1441 	struct file *fp;
1442 	u_long cmd;
1443 	caddr_t data;
1444 	struct proc *p;
1445 {
1446 	struct pipe *mpipe = (struct pipe *)fp->f_data;
1447 
1448 	switch (cmd) {
1449 
1450 	case FIONBIO:
1451 		return (0);
1452 
1453 	case FIOASYNC:
1454 		if (*(int *)data) {
1455 			mpipe->pipe_state |= PIPE_ASYNC;
1456 		} else {
1457 			mpipe->pipe_state &= ~PIPE_ASYNC;
1458 		}
1459 		return (0);
1460 
1461 	case FIONREAD:
1462 #ifndef PIPE_NODIRECT
1463 		if (mpipe->pipe_state & PIPE_DIRECTW)
1464 			*(int *)data = mpipe->pipe_map.cnt;
1465 		else
1466 #endif
1467 			*(int *)data = mpipe->pipe_buffer.cnt;
1468 		return (0);
1469 
1470 #ifdef __FreeBSD__
1471 	case FIOSETOWN:
1472 		return (fsetown(*(int *)data, &mpipe->pipe_sigio));
1473 
1474 	case FIOGETOWN:
1475 		*(int *)data = fgetown(mpipe->pipe_sigio);
1476 		return (0);
1477 
1478 	/* This is deprecated, FIOSETOWN should be used instead. */
1479 	case TIOCSPGRP:
1480 		return (fsetown(-(*(int *)data), &mpipe->pipe_sigio));
1481 
1482 	/* This is deprecated, FIOGETOWN should be used instead. */
1483 	case TIOCGPGRP:
1484 		*(int *)data = -fgetown(mpipe->pipe_sigio);
1485 		return (0);
1486 #endif /* FreeBSD */
1487 #ifdef __NetBSD__
1488 	case TIOCSPGRP:
1489 		mpipe->pipe_pgid = *(int *)data;
1490 		return (0);
1491 
1492 	case TIOCGPGRP:
1493 		*(int *)data = mpipe->pipe_pgid;
1494 		return (0);
1495 #endif /* NetBSD */
1496 
1497 	}
1498 	return (ENOTTY);
1499 }
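/*
 * Userland sketch of the ioctls above (illustrative only; fd[] as
 * returned by pipe(2)): query the bytes ready for reading with
 * FIONREAD, and ask for SIGIO on activity by enabling FIOASYNC and
 * registering a pid with TIOCSPGRP.
 *
 *	#include <sys/ioctl.h>
 *	#include <unistd.h>
 *
 *	int on = 1, nread, pgid = getpid();
 *
 *	(void) ioctl(fd[0], FIOASYNC, &on);
 *	(void) ioctl(fd[0], TIOCSPGRP, &pgid);
 *	(void) ioctl(fd[0], FIONREAD, &nread);
 */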
1500 
1501 int
1502 pipe_poll(fp, events, p)
1503 	struct file *fp;
1504 	int events;
1505 	struct proc *p;
1506 {
1507 	struct pipe *rpipe = (struct pipe *)fp->f_data;
1508 	struct pipe *wpipe;
1509 	int revents = 0;
1510 
1511 	wpipe = rpipe->pipe_peer;
1512 	if (events & (POLLIN | POLLRDNORM))
1513 		if ((rpipe->pipe_buffer.cnt > 0) ||
1514 #ifndef PIPE_NODIRECT
1515 		    (rpipe->pipe_state & PIPE_DIRECTW) ||
1516 #endif
1517 		    (rpipe->pipe_state & PIPE_EOF))
1518 			revents |= events & (POLLIN | POLLRDNORM);
1519 
1520 	if (events & (POLLOUT | POLLWRNORM))
1521 		if (wpipe == NULL || (wpipe->pipe_state & PIPE_EOF)
1522 		    || (
1523 #ifndef PIPE_NODIRECT
1524 		     ((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
1525 #endif
1526 		     (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF))
1527 			revents |= events & (POLLOUT | POLLWRNORM);
1528 
1529 	if ((rpipe->pipe_state & PIPE_EOF) ||
1530 	    (wpipe == NULL) ||
1531 	    (wpipe->pipe_state & PIPE_EOF))
1532 		revents |= POLLHUP;
1533 
1534 	if (revents == 0) {
1535 		if (events & (POLLIN | POLLRDNORM)) {
1536 			selrecord(p, &rpipe->pipe_sel);
1537 			rpipe->pipe_state |= PIPE_SEL;
1538 		}
1539 
1540 		if (events & (POLLOUT | POLLWRNORM)) {
1541 			selrecord(p, &wpipe->pipe_sel);
1542 			wpipe->pipe_state |= PIPE_SEL;
1543 		}
1544 	}
1545 
1546 	return (revents);
1547 }
1548 
1549 static int
1550 pipe_stat(fp, ub, p)
1551 	struct file *fp;
1552 	struct stat *ub;
1553 	struct proc *p;
1554 {
1555 	struct pipe *pipe = (struct pipe *)fp->f_data;
1556 
1557 	memset((caddr_t)ub, 0, sizeof(*ub));
1558 	ub->st_mode = S_IFIFO;
1559 	ub->st_blksize = pipe->pipe_buffer.size;
1560 	ub->st_size = pipe->pipe_buffer.cnt;
1561 	ub->st_blocks = (ub->st_size) ? 1 : 0;
1562 #ifdef __FreeBSD__
1563 	ub->st_atimespec = pipe->pipe_atime;
1564 	ub->st_mtimespec = pipe->pipe_mtime;
1565 	ub->st_ctimespec = pipe->pipe_ctime;
1566 #endif /* FreeBSD */
1567 #ifdef __NetBSD__
1568 	TIMEVAL_TO_TIMESPEC(&pipe->pipe_atime, &ub->st_atimespec);
1569 	TIMEVAL_TO_TIMESPEC(&pipe->pipe_mtime, &ub->st_mtimespec);
1570 	TIMEVAL_TO_TIMESPEC(&pipe->pipe_ctime, &ub->st_ctimespec);
1571 #endif /* NetBSD */
1572 	ub->st_uid = fp->f_cred->cr_uid;
1573 	ub->st_gid = fp->f_cred->cr_gid;
1574 	/*
1575 	 * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen.
1576 	 * XXX (st_dev, st_ino) should be unique.
1577 	 */
1578 	return (0);
1579 }
1580 
1581 /* ARGSUSED */
1582 static int
1583 pipe_close(fp, p)
1584 	struct file *fp;
1585 	struct proc *p;
1586 {
1587 	struct pipe *cpipe = (struct pipe *)fp->f_data;
1588 
1589 #ifdef __FreeBSD__
1590 	fp->f_ops = &badfileops;
1591 	funsetown(cpipe->pipe_sigio);
1592 #endif
1593 	fp->f_data = NULL;
1594 	pipeclose(cpipe);
1595 	return (0);
1596 }
1597 
1598 static void
1599 pipe_free_kmem(cpipe)
1600 	struct pipe *cpipe;
1601 {
1602 
1603 #ifdef __FreeBSD__
1604 	mtx_assert(&vm_mtx, MA_OWNED);
1605 #endif
1606 	if (cpipe->pipe_buffer.buffer != NULL) {
1607 		if (cpipe->pipe_buffer.size > PIPE_SIZE)
1608 			--nbigpipe;
1609 		amountpipekva -= cpipe->pipe_buffer.size;
1610 #ifdef __FreeBSD__
1611 		kmem_free(kernel_map,
1612 			(vm_offset_t)cpipe->pipe_buffer.buffer,
1613 			cpipe->pipe_buffer.size);
1614 #elif defined(__NetBSD__)
1615 		uvm_km_free(kernel_map,
1616 			(vaddr_t)cpipe->pipe_buffer.buffer,
1617 			cpipe->pipe_buffer.size);
1618 #endif /* NetBSD */
1619 		cpipe->pipe_buffer.buffer = NULL;
1620 	}
1621 #ifndef PIPE_NODIRECT
1622 	if (cpipe->pipe_map.kva != 0) {
1623 #ifdef __FreeBSD__
1624 		amountpipekva -= cpipe->pipe_buffer.size + PAGE_SIZE;
1625 		kmem_free(kernel_map,
1626 			cpipe->pipe_map.kva,
1627 			cpipe->pipe_buffer.size + PAGE_SIZE);
1628 #elif defined(__NetBSD__)
1629 		pipe_loan_free(cpipe);
1630 #endif /* NetBSD */
1631 		cpipe->pipe_map.cnt = 0;
1632 		cpipe->pipe_map.kva = 0;
1633 		cpipe->pipe_map.pos = 0;
1634 		cpipe->pipe_map.npages = 0;
1635 	}
1636 #endif /* !PIPE_NODIRECT */
1637 }
1638 
1639 /*
1640  * shutdown the pipe
1641  */
1642 static void
1643 pipeclose(cpipe)
1644 	struct pipe *cpipe;
1645 {
1646 	struct pipe *ppipe;
1647 
1648 	if (!cpipe)
1649 		return;
1650 
1651 	pipeselwakeup(cpipe, cpipe);
1652 
1653 	/*
1654 	 * If the other side is blocked, wake it up saying that
1655 	 * we want to close it down.
1656 	 */
1657 	while (cpipe->pipe_busy) {
1658 		wakeup(cpipe);
1659 		cpipe->pipe_state |= PIPE_WANTCLOSE | PIPE_EOF;
1660 		tsleep(cpipe, PRIBIO, "pipecl", 0);
1661 	}
1662 
1663 	/*
1664 	 * Disconnect from peer
1665 	 */
1666 	if ((ppipe = cpipe->pipe_peer) != NULL) {
1667 		pipeselwakeup(ppipe, ppipe);
1668 
1669 		ppipe->pipe_state |= PIPE_EOF;
1670 		wakeup(ppipe);
1671 		ppipe->pipe_peer = NULL;
1672 	}
1673 
1674 	/*
1675 	 * free resources
1676 	 */
1677 #ifdef __FreeBSD__
1678 	mtx_lock(&vm_mtx);
1679 	pipe_free_kmem(cpipe);
1680 	/* XXX: erm, doesn't zalloc already have its own locks and
1681 	 * not need the giant vm lock?
1682 	 */
1683 	zfree(pipe_zone, cpipe);
1684 	mtx_unlock(&vm_mtx);
1685 #endif /* FreeBSD */
1686 
1687 #ifdef __NetBSD__
1688 	pipe_free_kmem(cpipe);
1689 	(void) lockmgr(&cpipe->pipe_lock, LK_DRAIN, NULL);
1690 	pool_put(&pipe_pool, cpipe);
1691 #endif
1692 }
1693 
1694 #ifdef __FreeBSD__
1695 /*ARGSUSED*/
1696 static int
1697 pipe_kqfilter(struct file *fp, struct knote *kn)
1698 {
1699 	struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data;
1700 
1701 	switch (kn->kn_filter) {
1702 	case EVFILT_READ:
1703 		kn->kn_fop = &pipe_rfiltops;
1704 		break;
1705 	case EVFILT_WRITE:
1706 		kn->kn_fop = &pipe_wfiltops;
1707 		cpipe = cpipe->pipe_peer;
1708 		break;
1709 	default:
1710 		return (1);
1711 	}
1712 	kn->kn_hook = (caddr_t)cpipe;
1713 	SLIST_INSERT_HEAD(&cpipe->pipe_sel.si_note, kn, kn_selnext);
1714 	return (0);
1715 }
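/*
 * Userland sketch (illustrative only; fd[] as returned by pipe(2)):
 * watching the read side of a pipe with kqueue.  The kn_data reported
 * for EVFILT_READ is the byte count computed in filt_piperead() below.
 *
 *	#include <sys/event.h>
 *
 *	struct kevent kev;
 *	int kq = kqueue();
 *
 *	EV_SET(&kev, fd[0], EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	(void) kevent(kq, &kev, 1, NULL, 0, NULL);	register
 *	(void) kevent(kq, NULL, 0, &kev, 1, NULL);	wait for data
 */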
1716 
1717 static void
1718 filt_pipedetach(struct knote *kn)
1719 {
1720 	struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data;
1721 
1722 	SLIST_REMOVE(&cpipe->pipe_sel.si_note, kn, knote, kn_selnext);
1723 }
1724 
1725 /*ARGSUSED*/
1726 static int
1727 filt_piperead(struct knote *kn, long hint)
1728 {
1729 	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
1730 	struct pipe *wpipe = rpipe->pipe_peer;
1731 
1732 	kn->kn_data = rpipe->pipe_buffer.cnt;
1733 	if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW))
1734 		kn->kn_data = rpipe->pipe_map.cnt;
1735 
1736 	if ((rpipe->pipe_state & PIPE_EOF) ||
1737 	    (wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
1738 		kn->kn_flags |= EV_EOF;
1739 		return (1);
1740 	}
1741 	return (kn->kn_data > 0);
1742 }
1743 
1744 /*ARGSUSED*/
1745 static int
1746 filt_pipewrite(struct knote *kn, long hint)
1747 {
1748 	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
1749 	struct pipe *wpipe = rpipe->pipe_peer;
1750 
1751 	if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
1752 		kn->kn_data = 0;
1753 		kn->kn_flags |= EV_EOF;
1754 		return (1);
1755 	}
1756 	kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
1757 	if (wpipe->pipe_state & PIPE_DIRECTW)
1758 		kn->kn_data = 0;
1759 
1760 	return (kn->kn_data >= PIPE_BUF);
1761 }
1762 #endif /* FreeBSD */
1763 
1764 #ifdef __NetBSD__
1765 static int
1766 pipe_fcntl(fp, cmd, data, p)
1767 	struct file *fp;
1768 	u_int cmd;
1769 	caddr_t data;
1770 	struct proc *p;
1771 {
1772 	if (cmd == F_SETFL)
1773 		return (0);
1774 	else
1775 		return (EOPNOTSUPP);
1776 }
1777 
1778 /*
1779  * Handle pipe sysctls.
1780  */
1781 int
1782 sysctl_dopipe(name, namelen, oldp, oldlenp, newp, newlen)
1783 	int *name;
1784 	u_int namelen;
1785 	void *oldp;
1786 	size_t *oldlenp;
1787 	void *newp;
1788 	size_t newlen;
1789 {
1790 	/* All sysctl names at this level are terminal. */
1791 	if (namelen != 1)
1792 		return (ENOTDIR);		/* overloaded */
1793 
1794 	switch (name[0]) {
1795 	case KERN_PIPE_MAXKVASZ:
1796 		return (sysctl_int(oldp, oldlenp, newp, newlen, &maxpipekva));
1797 	case KERN_PIPE_LIMITKVA:
1798 		return (sysctl_int(oldp, oldlenp, newp, newlen, &limitpipekva));
1799 	case KERN_PIPE_MAXBIGPIPES:
1800 		return (sysctl_int(oldp, oldlenp, newp, newlen, &maxbigpipes));
1801 	case KERN_PIPE_NBIGPIPES:
1802 		return (sysctl_rdint(oldp, oldlenp, newp, nbigpipe));
1803 	case KERN_PIPE_KVASIZE:
1804 		return (sysctl_rdint(oldp, oldlenp, newp, amountpipekva));
1805 	default:
1806 		return (EOPNOTSUPP);
1807 	}
1808 	/* NOTREACHED */
1809 }
1810 
1811 /*
1812  * Initialize pipe structs.
1813  */
1814 void
1815 pipe_init(void)
1816 {
1817 	pool_init(&pipe_pool, sizeof(struct pipe), 0, 0, 0, "pipepl",
1818 		0, NULL, NULL, M_PIPE);
1819 }
1820 
1821 #endif /* __NetBSD__ */
1822