/*	$OpenBSD: sys_pipe.c,v 1.133 2021/12/13 14:56:55 visa Exp $	*/

/*
 * Copyright (c) 1996 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 * 4. Modifications may be freely made to this file if the above conditions
 *    are met.
 */

/*
 * This file contains a high-performance replacement for the socket-based
 * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
 * all features of sockets, but it does everything that pipes normally
 * do.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/pool.h>
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <sys/signalvar.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <sys/event.h>
#include <sys/lock.h>
#include <sys/poll.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#include <uvm/uvm_extern.h>

#include <sys/pipe.h>

struct pipe_pair {
	struct pipe pp_wpipe;
	struct pipe pp_rpipe;
	struct rwlock pp_lock;
};

/*
 * interfaces to the outside world
 */
int	pipe_read(struct file *, struct uio *, int);
int	pipe_write(struct file *, struct uio *, int);
int	pipe_close(struct file *, struct proc *);
int	pipe_poll(struct file *, int events, struct proc *);
int	pipe_kqfilter(struct file *fp, struct knote *kn);
int	pipe_ioctl(struct file *, u_long, caddr_t, struct proc *);
int	pipe_stat(struct file *fp, struct stat *ub, struct proc *p);

static const struct fileops pipeops = {
	.fo_read	= pipe_read,
	.fo_write	= pipe_write,
	.fo_ioctl	= pipe_ioctl,
	.fo_poll	= pipe_poll,
	.fo_kqfilter	= pipe_kqfilter,
	.fo_stat	= pipe_stat,
	.fo_close	= pipe_close
};

void	filt_pipedetach(struct knote *kn);
int	filt_piperead(struct knote *kn, long hint);
int	filt_pipereadmodify(struct kevent *kev, struct knote *kn);
int	filt_pipereadprocess(struct knote *kn, struct kevent *kev);
int	filt_piperead_common(struct knote *kn, struct pipe *rpipe);
int	filt_pipewrite(struct knote *kn, long hint);
int	filt_pipewritemodify(struct kevent *kev, struct knote *kn);
int	filt_pipewriteprocess(struct knote *kn, struct kevent *kev);
int	filt_pipewrite_common(struct knote *kn, struct pipe *rpipe);
int	filt_pipeexcept(struct knote *kn, long hint);
int	filt_pipeexceptmodify(struct kevent *kev, struct knote *kn);
int	filt_pipeexceptprocess(struct knote *kn, struct kevent *kev);
int	filt_pipeexcept_common(struct knote *kn, struct pipe *rpipe);

const struct filterops pipe_rfiltops = {
	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach	= NULL,
	.f_detach	= filt_pipedetach,
	.f_event	= filt_piperead,
	.f_modify	= filt_pipereadmodify,
	.f_process	= filt_pipereadprocess,
};

const struct filterops pipe_wfiltops = {
	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach	= NULL,
	.f_detach	= filt_pipedetach,
	.f_event	= filt_pipewrite,
	.f_modify	= filt_pipewritemodify,
	.f_process	= filt_pipewriteprocess,
};

const struct filterops pipe_efiltops = {
	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach	= NULL,
	.f_detach	= filt_pipedetach,
	.f_event	= filt_pipeexcept,
	.f_modify	= filt_pipeexceptmodify,
	.f_process	= filt_pipeexceptprocess,
};

/*
 * Default pipe buffer size(s).  This can be fairly large now because pipe
 * space is pageable.  The pipe code will try to maintain locality of
 * reference for performance reasons, so small amounts of outstanding I/O
 * will not wipe the cache.
 */
#define MINPIPESIZE (PIPE_SIZE/3)
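
/*
 * Descriptive note: pipe_read() uses MINPIPESIZE for write-blocking
 * hysteresis; a sleeping writer is only woken once the buffer has
 * drained below this watermark.
 */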

/*
 * Limit the number of "big" pipes
 */
#define LIMITBIGPIPES	32
unsigned int nbigpipe;
static unsigned int amountpipekva;

struct pool pipe_pair_pool;

int	dopipe(struct proc *, int *, int);
void	pipeselwakeup(struct pipe *);

int	pipe_create(struct pipe *);
void	pipe_destroy(struct pipe *);
int	pipe_rundown(struct pipe *);
struct pipe *pipe_peer(struct pipe *);
int	pipe_buffer_realloc(struct pipe *, u_int);
void	pipe_buffer_free(struct pipe *);

int	pipe_iolock(struct pipe *);
void	pipe_iounlock(struct pipe *);
int	pipe_iosleep(struct pipe *, const char *);

struct pipe_pair *pipe_pair_create(void);
void	pipe_pair_destroy(struct pipe_pair *);

/*
 * The pipe system call for the DTYPE_PIPE type of pipes
 */

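/*
 * Userland view of this interface (illustrative sketch only, not part of
 * the kernel): pipe2(2) is pipe(2) plus a flags argument, which
 * sys_pipe2() below restricts to O_CLOEXEC and O_NONBLOCK (FNONBLOCK).
 * Any other flag bit yields EINVAL.
 *
 *	int fds[2];
 *	if (pipe2(fds, O_CLOEXEC | O_NONBLOCK) == -1)
 *		err(1, "pipe2");
 *	write(fds[1], "x", 1);	// fds[0] is the read end, fds[1] the write end
 */
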
int
sys_pipe(struct proc *p, void *v, register_t *retval)
{
	struct sys_pipe_args /* {
		syscallarg(int *) fdp;
	} */ *uap = v;

	return (dopipe(p, SCARG(uap, fdp), 0));
}

int
sys_pipe2(struct proc *p, void *v, register_t *retval)
{
	struct sys_pipe2_args /* {
		syscallarg(int *) fdp;
		syscallarg(int) flags;
	} */ *uap = v;

	if (SCARG(uap, flags) & ~(O_CLOEXEC | FNONBLOCK))
		return (EINVAL);

	return (dopipe(p, SCARG(uap, fdp), SCARG(uap, flags)));
}

int
dopipe(struct proc *p, int *ufds, int flags)
{
	struct filedesc *fdp = p->p_fd;
	struct file *rf, *wf;
	struct pipe_pair *pp;
	struct pipe *rpipe, *wpipe = NULL;
	int fds[2], cloexec, error;

	cloexec = (flags & O_CLOEXEC) ? UF_EXCLOSE : 0;

	pp = pipe_pair_create();
	if (pp == NULL)
		return (ENOMEM);
	wpipe = &pp->pp_wpipe;
	rpipe = &pp->pp_rpipe;

	fdplock(fdp);

	error = falloc(p, &rf, &fds[0]);
	if (error != 0)
		goto free2;
	rf->f_flag = FREAD | FWRITE | (flags & FNONBLOCK);
	rf->f_type = DTYPE_PIPE;
	rf->f_data = rpipe;
	rf->f_ops = &pipeops;

	error = falloc(p, &wf, &fds[1]);
	if (error != 0)
		goto free3;
	wf->f_flag = FREAD | FWRITE | (flags & FNONBLOCK);
	wf->f_type = DTYPE_PIPE;
	wf->f_data = wpipe;
	wf->f_ops = &pipeops;

	fdinsert(fdp, fds[0], cloexec, rf);
	fdinsert(fdp, fds[1], cloexec, wf);

	error = copyout(fds, ufds, sizeof(fds));
	if (error == 0) {
		fdpunlock(fdp);
#ifdef KTRACE
		if (KTRPOINT(p, KTR_STRUCT))
			ktrfds(p, fds, 2);
#endif
	} else {
		/* fdrelease() unlocks fdp. */
		fdrelease(p, fds[0]);
		fdplock(fdp);
		fdrelease(p, fds[1]);
	}

	FRELE(rf, p);
	FRELE(wf, p);
	return (error);

free3:
	fdremove(fdp, fds[0]);
	closef(rf, p);
	rpipe = NULL;
free2:
	fdpunlock(fdp);
	pipe_destroy(wpipe);
	pipe_destroy(rpipe);
	return (error);
}

/*
 * Allocate kva for the pipe circular buffer; the space is pageable.
 * This routine will 'realloc' the size of a pipe safely: if allocation
 * fails, it retains the old buffer and returns ENOMEM.
 */
int
pipe_buffer_realloc(struct pipe *cpipe, u_int size)
{
	caddr_t buffer;

	/* buffer uninitialized or pipe locked */
	KASSERT((cpipe->pipe_buffer.buffer == NULL) ||
	    (cpipe->pipe_state & PIPE_LOCK));

	/* buffer should be empty */
	KASSERT(cpipe->pipe_buffer.cnt == 0);

	KERNEL_LOCK();
	buffer = km_alloc(size, &kv_any, &kp_pageable, &kd_waitok);
	KERNEL_UNLOCK();
	if (buffer == NULL)
		return (ENOMEM);

	/* free old resources if we are resizing */
	pipe_buffer_free(cpipe);

	cpipe->pipe_buffer.buffer = buffer;
	cpipe->pipe_buffer.size = size;
	cpipe->pipe_buffer.in = 0;
	cpipe->pipe_buffer.out = 0;

	atomic_add_int(&amountpipekva, cpipe->pipe_buffer.size);

	return (0);
}

/*
 * Initialize and allocate VM and memory for the pipe.
 */
int
pipe_create(struct pipe *cpipe)
{
	int error;

	error = pipe_buffer_realloc(cpipe, PIPE_SIZE);
	if (error != 0)
		return (error);

	sigio_init(&cpipe->pipe_sigio);

	getnanotime(&cpipe->pipe_ctime);
	cpipe->pipe_atime = cpipe->pipe_ctime;
	cpipe->pipe_mtime = cpipe->pipe_ctime;

	return (0);
}

struct pipe *
pipe_peer(struct pipe *cpipe)
{
	struct pipe *peer;

	rw_assert_anylock(cpipe->pipe_lock);

	peer = cpipe->pipe_peer;
	if (peer == NULL || (peer->pipe_state & PIPE_EOF))
		return (NULL);
	return (peer);
}

/*
 * Lock a pipe for exclusive I/O access.
 */
int
pipe_iolock(struct pipe *cpipe)
{
	int error;

	rw_assert_wrlock(cpipe->pipe_lock);

	while (cpipe->pipe_state & PIPE_LOCK) {
		cpipe->pipe_state |= PIPE_LWANT;
		error = rwsleep_nsec(cpipe, cpipe->pipe_lock, PRIBIO | PCATCH,
		    "pipeiolk", INFSLP);
		if (error)
			return (error);
	}
	cpipe->pipe_state |= PIPE_LOCK;
	return (0);
}

/*
 * Unlock a pipe I/O lock.
 */
void
pipe_iounlock(struct pipe *cpipe)
{
	rw_assert_wrlock(cpipe->pipe_lock);
	KASSERT(cpipe->pipe_state & PIPE_LOCK);

	cpipe->pipe_state &= ~PIPE_LOCK;
	if (cpipe->pipe_state & PIPE_LWANT) {
		cpipe->pipe_state &= ~PIPE_LWANT;
		wakeup(cpipe);
	}
}

/*
 * Unlock the pipe I/O lock and go to sleep.  Returns 0 on success, with the
 * I/O lock relocked.  Otherwise, if a signal was caught, returns non-zero
 * with the I/O lock left unlocked.
 *
 * Any caller must obtain a reference to the pipe by incrementing `pipe_busy'
 * before calling this function, in order to ensure that the pipe is not
 * destroyed while sleeping.
 */
int
pipe_iosleep(struct pipe *cpipe, const char *wmesg)
{
	int error;

	pipe_iounlock(cpipe);
	error = rwsleep_nsec(cpipe, cpipe->pipe_lock, PRIBIO | PCATCH, wmesg,
	    INFSLP);
	if (error)
		return (error);
	return (pipe_iolock(cpipe));
}
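
/*
 * Illustrative sketch of the locking protocol followed by the I/O paths
 * below (comment only; compare pipe_read() and pipe_write()):
 *
 *	rw_enter_write(cpipe->pipe_lock);
 *	++cpipe->pipe_busy;		// hold a reference across sleeps
 *	error = pipe_iolock(cpipe);
 *	while (work remains && error == 0)
 *		error = pipe_iosleep(cpipe, "wmesg");
 *	if (error == 0)
 *		pipe_iounlock(cpipe);	// iosleep failure leaves it unlocked
 *	--cpipe->pipe_busy;
 *	pipe_rundown(cpipe);		// wake pipe_destroy() if it is waiting
 *	rw_exit_write(cpipe->pipe_lock);
 */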

void
pipeselwakeup(struct pipe *cpipe)
{
	rw_assert_wrlock(cpipe->pipe_lock);

	if (cpipe->pipe_state & PIPE_SEL) {
		cpipe->pipe_state &= ~PIPE_SEL;
		selwakeup(&cpipe->pipe_sel);
	} else {
		KNOTE(&cpipe->pipe_sel.si_note, 0);
	}

	if (cpipe->pipe_state & PIPE_ASYNC)
		pgsigio(&cpipe->pipe_sigio, SIGIO, 0);
}

int
pipe_read(struct file *fp, struct uio *uio, int fflags)
{
	struct pipe *rpipe = fp->f_data;
	size_t nread = 0, size;
	int error;

	rw_enter_write(rpipe->pipe_lock);
	++rpipe->pipe_busy;
	error = pipe_iolock(rpipe);
	if (error) {
		--rpipe->pipe_busy;
		pipe_rundown(rpipe);
		rw_exit_write(rpipe->pipe_lock);
		return (error);
	}

	while (uio->uio_resid) {
		/* Normal pipe buffer receive. */
		if (rpipe->pipe_buffer.cnt > 0) {
			size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
			if (size > rpipe->pipe_buffer.cnt)
				size = rpipe->pipe_buffer.cnt;
			if (size > uio->uio_resid)
				size = uio->uio_resid;
			rw_exit_write(rpipe->pipe_lock);
			error = uiomove(&rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
					size, uio);
			rw_enter_write(rpipe->pipe_lock);
			if (error) {
				break;
			}
			rpipe->pipe_buffer.out += size;
			if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
				rpipe->pipe_buffer.out = 0;

			rpipe->pipe_buffer.cnt -= size;
			/*
			 * If there is no more to read in the pipe, reset
			 * its pointers to the beginning.  This improves
			 * cache hit stats.
			 */
			if (rpipe->pipe_buffer.cnt == 0) {
				rpipe->pipe_buffer.in = 0;
				rpipe->pipe_buffer.out = 0;
			}
			nread += size;
		} else {
			/*
			 * detect EOF condition
			 * read returns 0 on EOF, no need to set error
			 */
			if (rpipe->pipe_state & PIPE_EOF)
				break;

			/* If the "write-side" has been blocked, wake it up. */
			if (rpipe->pipe_state & PIPE_WANTW) {
				rpipe->pipe_state &= ~PIPE_WANTW;
				wakeup(rpipe);
			}

			/* Break if some data was read. */
			if (nread > 0)
				break;

			/* Handle non-blocking mode operation. */
			if (fp->f_flag & FNONBLOCK) {
				error = EAGAIN;
				break;
			}

			/* Wait for more data. */
			rpipe->pipe_state |= PIPE_WANTR;
			error = pipe_iosleep(rpipe, "piperd");
			if (error)
				goto unlocked_error;
		}
	}
	pipe_iounlock(rpipe);

	if (error == 0)
		getnanotime(&rpipe->pipe_atime);
unlocked_error:
	--rpipe->pipe_busy;

	if (pipe_rundown(rpipe) == 0 && rpipe->pipe_buffer.cnt < MINPIPESIZE) {
		/* Handle write blocking hysteresis. */
		if (rpipe->pipe_state & PIPE_WANTW) {
			rpipe->pipe_state &= ~PIPE_WANTW;
			wakeup(rpipe);
		}
	}

	if (rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt >= PIPE_BUF)
		pipeselwakeup(rpipe);

	rw_exit_write(rpipe->pipe_lock);
	return (error);
}

int
pipe_write(struct file *fp, struct uio *uio, int fflags)
{
	struct pipe *rpipe = fp->f_data, *wpipe;
	struct rwlock *lock = rpipe->pipe_lock;
	size_t orig_resid;
	int error;

	rw_enter_write(lock);
	wpipe = pipe_peer(rpipe);

	/* Detect loss of pipe read side, issue SIGPIPE if lost. */
	if (wpipe == NULL) {
		rw_exit_write(lock);
		return (EPIPE);
	}

	++wpipe->pipe_busy;
	error = pipe_iolock(wpipe);
	if (error) {
		--wpipe->pipe_busy;
		pipe_rundown(wpipe);
		rw_exit_write(lock);
		return (error);
	}

	/* If it is advantageous to resize the pipe buffer, do so. */
	if (uio->uio_resid > PIPE_SIZE &&
	    wpipe->pipe_buffer.size <= PIPE_SIZE &&
	    wpipe->pipe_buffer.cnt == 0) {
		unsigned int npipe;

		npipe = atomic_inc_int_nv(&nbigpipe);
		if (npipe > LIMITBIGPIPES ||
		    pipe_buffer_realloc(wpipe, BIG_PIPE_SIZE) != 0)
			atomic_dec_int(&nbigpipe);
	}

	orig_resid = uio->uio_resid;

	while (uio->uio_resid) {
		size_t space;

		if (wpipe->pipe_state & PIPE_EOF) {
			error = EPIPE;
			break;
		}

		space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;

		/* Writes of size <= PIPE_BUF must be atomic. */
		if (space < uio->uio_resid && orig_resid <= PIPE_BUF)
			space = 0;

		if (space > 0) {
			size_t size;	/* Transfer size */
			size_t segsize;	/* first segment to transfer */

			/*
			 * Transfer size is minimum of uio transfer
			 * and free space in pipe buffer.
			 */
			if (space > uio->uio_resid)
				size = uio->uio_resid;
			else
				size = space;
			/*
			 * First segment to transfer is minimum of
			 * transfer size and contiguous space in
			 * pipe buffer.  If first segment to transfer
			 * is less than the transfer size, we've got
			 * a wraparound in the buffer.
			 */
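			/*
			 * Buffer layout sketch (descriptive diagram only):
			 * data sits between `out' and `in'; `segsize' is
			 * the contiguous tail available at `in'.  A write
			 * larger than the tail wraps to offset 0:
			 *
			 *	0         out        in           buf end
			 *	|  free   |   data   |   segsize   |
			 *	^ remainder of a wrapping write resumes here
			 */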
			segsize = wpipe->pipe_buffer.size -
				wpipe->pipe_buffer.in;
			if (segsize > size)
				segsize = size;

			/* Transfer first segment */

			rw_exit_write(lock);
			error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
					segsize, uio);
			rw_enter_write(lock);

			if (error == 0 && segsize < size) {
				/*
				 * Transfer remaining part now, to
				 * support atomic writes.  Wraparound
				 * happened.
				 */
#ifdef DIAGNOSTIC
				if (wpipe->pipe_buffer.in + segsize !=
				    wpipe->pipe_buffer.size)
					panic("Expected pipe buffer wraparound disappeared");
#endif

				rw_exit_write(lock);
				error = uiomove(&wpipe->pipe_buffer.buffer[0],
						size - segsize, uio);
				rw_enter_write(lock);
			}
			if (error == 0) {
				wpipe->pipe_buffer.in += size;
				if (wpipe->pipe_buffer.in >=
				    wpipe->pipe_buffer.size) {
#ifdef DIAGNOSTIC
					if (wpipe->pipe_buffer.in != size - segsize + wpipe->pipe_buffer.size)
						panic("Expected wraparound bad");
#endif
					wpipe->pipe_buffer.in = size - segsize;
				}

				wpipe->pipe_buffer.cnt += size;
#ifdef DIAGNOSTIC
				if (wpipe->pipe_buffer.cnt > wpipe->pipe_buffer.size)
					panic("Pipe buffer overflow");
#endif
			}
			if (error)
				break;
		} else {
			/* If the "read-side" has been blocked, wake it up. */
			if (wpipe->pipe_state & PIPE_WANTR) {
				wpipe->pipe_state &= ~PIPE_WANTR;
				wakeup(wpipe);
			}

			/* Don't block on non-blocking I/O. */
			if (fp->f_flag & FNONBLOCK) {
				error = EAGAIN;
				break;
			}

			/*
			 * We have no more space and have something to offer;
			 * wake up select/poll.
			 */
			pipeselwakeup(wpipe);

			wpipe->pipe_state |= PIPE_WANTW;
			error = pipe_iosleep(wpipe, "pipewr");
			if (error)
				goto unlocked_error;

			/*
			 * If read side wants to go away, we just issue a
			 * signal to ourselves.
			 */
			if (wpipe->pipe_state & PIPE_EOF) {
				error = EPIPE;
				break;
			}
		}
	}
	pipe_iounlock(wpipe);

unlocked_error:
	--wpipe->pipe_busy;

	if (pipe_rundown(wpipe) == 0 && wpipe->pipe_buffer.cnt > 0) {
		/*
		 * If we have put any characters in the buffer, we wake up
		 * the reader.
		 */
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
	}

	/* Don't return EPIPE if I/O was successful. */
	if (wpipe->pipe_buffer.cnt == 0 &&
	    uio->uio_resid == 0 &&
	    error == EPIPE) {
		error = 0;
	}

	if (error == 0)
		getnanotime(&wpipe->pipe_mtime);
	/* We have something to offer, wake up select/poll. */
	if (wpipe->pipe_buffer.cnt)
		pipeselwakeup(wpipe);

	rw_exit_write(lock);
	return (error);
}

/*
 * We implement a minimal set of ioctls for compatibility with sockets.
 */
int
pipe_ioctl(struct file *fp, u_long cmd, caddr_t data, struct proc *p)
{
	struct pipe *mpipe = fp->f_data;
	int error = 0;

	switch (cmd) {

	case FIONBIO:
		break;

	case FIOASYNC:
		rw_enter_write(mpipe->pipe_lock);
		if (*(int *)data) {
			mpipe->pipe_state |= PIPE_ASYNC;
		} else {
			mpipe->pipe_state &= ~PIPE_ASYNC;
		}
		rw_exit_write(mpipe->pipe_lock);
		break;

	case FIONREAD:
		rw_enter_read(mpipe->pipe_lock);
		*(int *)data = mpipe->pipe_buffer.cnt;
		rw_exit_read(mpipe->pipe_lock);
		break;

	case FIOSETOWN:
	case SIOCSPGRP:
	case TIOCSPGRP:
		error = sigio_setown(&mpipe->pipe_sigio, cmd, data);
		break;

	case FIOGETOWN:
	case SIOCGPGRP:
	case TIOCGPGRP:
		sigio_getown(&mpipe->pipe_sigio, cmd, data);
		break;

	default:
		error = ENOTTY;
	}

	return (error);
}
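
/*
 * Illustrative userland use of the ioctls above (sketch only, assuming a
 * descriptor pair fds[] from pipe(2)): FIONREAD reports how many bytes
 * are currently buffered for reading.
 *
 *	int n;
 *	if (ioctl(fds[0], FIONREAD, &n) == 0)
 *		printf("%d bytes ready\n", n);
 */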

int
pipe_poll(struct file *fp, int events, struct proc *p)
{
	struct pipe *rpipe = fp->f_data, *wpipe;
	struct rwlock *lock = rpipe->pipe_lock;
	int revents = 0;

	rw_enter_write(lock);
	wpipe = pipe_peer(rpipe);

	if (events & (POLLIN | POLLRDNORM)) {
		if (rpipe->pipe_buffer.cnt > 0 ||
		    (rpipe->pipe_state & PIPE_EOF))
			revents |= events & (POLLIN | POLLRDNORM);
	}

	/* NOTE: POLLHUP and POLLOUT/POLLWRNORM are mutually exclusive */
	if ((rpipe->pipe_state & PIPE_EOF) || wpipe == NULL)
		revents |= POLLHUP;
	else if (events & (POLLOUT | POLLWRNORM)) {
		if (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt >= PIPE_BUF)
			revents |= events & (POLLOUT | POLLWRNORM);
	}

	if (revents == 0) {
		if (events & (POLLIN | POLLRDNORM)) {
			selrecord(p, &rpipe->pipe_sel);
			rpipe->pipe_state |= PIPE_SEL;
		}
		if (events & (POLLOUT | POLLWRNORM)) {
			selrecord(p, &wpipe->pipe_sel);
			wpipe->pipe_state |= PIPE_SEL;
		}
	}

	rw_exit_write(lock);

	return (revents);
}

int
pipe_stat(struct file *fp, struct stat *ub, struct proc *p)
{
	struct pipe *pipe = fp->f_data;

	memset(ub, 0, sizeof(*ub));

	rw_enter_read(pipe->pipe_lock);
	ub->st_mode = S_IFIFO;
	ub->st_blksize = pipe->pipe_buffer.size;
	ub->st_size = pipe->pipe_buffer.cnt;
	ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize;
	ub->st_atim.tv_sec  = pipe->pipe_atime.tv_sec;
	ub->st_atim.tv_nsec = pipe->pipe_atime.tv_nsec;
	ub->st_mtim.tv_sec  = pipe->pipe_mtime.tv_sec;
	ub->st_mtim.tv_nsec = pipe->pipe_mtime.tv_nsec;
	ub->st_ctim.tv_sec  = pipe->pipe_ctime.tv_sec;
	ub->st_ctim.tv_nsec = pipe->pipe_ctime.tv_nsec;
	ub->st_uid = fp->f_cred->cr_uid;
	ub->st_gid = fp->f_cred->cr_gid;
	rw_exit_read(pipe->pipe_lock);
	/*
	 * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen.
	 * XXX (st_dev, st_ino) should be unique.
	 */
	return (0);
}

int
pipe_close(struct file *fp, struct proc *p)
{
	struct pipe *cpipe = fp->f_data;

	fp->f_ops = NULL;
	fp->f_data = NULL;
	pipe_destroy(cpipe);
	return (0);
}

/*
 * Free kva for pipe circular buffer.
 * No pipe lock check, as this is only called from pipe_buffer_realloc()
 * and pipe_destroy().
 */
void
pipe_buffer_free(struct pipe *cpipe)
{
	u_int size;

	if (cpipe->pipe_buffer.buffer == NULL)
		return;

	size = cpipe->pipe_buffer.size;

	KERNEL_LOCK();
	km_free(cpipe->pipe_buffer.buffer, size, &kv_any, &kp_pageable);
	KERNEL_UNLOCK();

	cpipe->pipe_buffer.buffer = NULL;

	atomic_sub_int(&amountpipekva, size);
	if (size > PIPE_SIZE)
		atomic_dec_int(&nbigpipe);
}

/*
 * Shut down the pipe and free its resources.
 */
void
pipe_destroy(struct pipe *cpipe)
{
	struct pipe *ppipe;

	if (cpipe == NULL)
		return;

	rw_enter_write(cpipe->pipe_lock);

	pipeselwakeup(cpipe);
	sigio_free(&cpipe->pipe_sigio);

	/*
	 * If the other side is blocked, wake it up saying that
	 * we want to close it down.
	 */
	cpipe->pipe_state |= PIPE_EOF;
	while (cpipe->pipe_busy) {
		wakeup(cpipe);
		cpipe->pipe_state |= PIPE_WANTD;
		rwsleep_nsec(cpipe, cpipe->pipe_lock, PRIBIO, "pipecl", INFSLP);
	}

	/* Disconnect from peer. */
	if ((ppipe = cpipe->pipe_peer) != NULL) {
		pipeselwakeup(ppipe);

		ppipe->pipe_state |= PIPE_EOF;
		wakeup(ppipe);
		ppipe->pipe_peer = NULL;
	}

	pipe_buffer_free(cpipe);

	rw_exit_write(cpipe->pipe_lock);

	if (ppipe == NULL)
		pipe_pair_destroy(cpipe->pipe_pair);
}

/*
 * Returns non-zero if a rundown is currently ongoing.
 */
int
pipe_rundown(struct pipe *cpipe)
{
	rw_assert_wrlock(cpipe->pipe_lock);

	if (cpipe->pipe_busy > 0 || (cpipe->pipe_state & PIPE_WANTD) == 0)
		return (0);

	/* Only wakeup pipe_destroy() once the pipe is no longer busy. */
	cpipe->pipe_state &= ~(PIPE_WANTD | PIPE_WANTR | PIPE_WANTW);
	wakeup(cpipe);
	return (1);
}

int
pipe_kqfilter(struct file *fp, struct knote *kn)
{
	struct pipe *rpipe = kn->kn_fp->f_data, *wpipe;
	struct rwlock *lock = rpipe->pipe_lock;
	int error = 0;

	rw_enter_write(lock);
	wpipe = pipe_peer(rpipe);

	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_fop = &pipe_rfiltops;
		kn->kn_hook = rpipe;
		klist_insert_locked(&rpipe->pipe_sel.si_note, kn);
		break;
	case EVFILT_WRITE:
		if (wpipe == NULL) {
			/* other end of pipe has been closed */
			error = EPIPE;
			break;
		}
		kn->kn_fop = &pipe_wfiltops;
		kn->kn_hook = wpipe;
		klist_insert_locked(&wpipe->pipe_sel.si_note, kn);
		break;
	case EVFILT_EXCEPT:
		if (kn->kn_flags & __EV_SELECT) {
			/* Prevent triggering exceptfds. */
			error = EPERM;
			break;
		}
		if ((kn->kn_flags & __EV_POLL) == 0) {
			/* Disallow usage through kevent(2). */
			error = EINVAL;
			break;
		}
		kn->kn_fop = &pipe_efiltops;
		kn->kn_hook = rpipe;
		klist_insert_locked(&rpipe->pipe_sel.si_note, kn);
		break;
	default:
		error = EINVAL;
	}

	rw_exit_write(lock);

	return (error);
}
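
/*
 * Illustrative kevent(2) registration against the filters above (userland
 * sketch only, assuming a descriptor pair fds[] from pipe(2)): EVFILT_READ
 * fires when data is buffered or on EOF; EVFILT_WRITE fires once at least
 * PIPE_BUF bytes are free.
 *
 *	struct kevent kev;
 *	int kq = kqueue();
 *	EV_SET(&kev, fds[0], EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 */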

void
filt_pipedetach(struct knote *kn)
{
	struct pipe *cpipe = kn->kn_hook;

	klist_remove(&cpipe->pipe_sel.si_note, kn);
}

int
filt_piperead_common(struct knote *kn, struct pipe *rpipe)
{
	struct pipe *wpipe;

	rw_assert_wrlock(rpipe->pipe_lock);

	wpipe = pipe_peer(rpipe);

	kn->kn_data = rpipe->pipe_buffer.cnt;

	if ((rpipe->pipe_state & PIPE_EOF) || wpipe == NULL) {
		kn->kn_flags |= EV_EOF;
		if (kn->kn_flags & __EV_POLL)
			kn->kn_flags |= __EV_HUP;
		return (1);
	}

	return (kn->kn_data > 0);
}

int
filt_piperead(struct knote *kn, long hint)
{
	struct pipe *rpipe = kn->kn_fp->f_data;

	return (filt_piperead_common(kn, rpipe));
}

int
filt_pipereadmodify(struct kevent *kev, struct knote *kn)
{
	struct pipe *rpipe = kn->kn_fp->f_data;
	int active;

	rw_enter_write(rpipe->pipe_lock);
	knote_modify(kev, kn);
	active = filt_piperead_common(kn, rpipe);
	rw_exit_write(rpipe->pipe_lock);

	return (active);
}

int
filt_pipereadprocess(struct knote *kn, struct kevent *kev)
{
	struct pipe *rpipe = kn->kn_fp->f_data;
	int active;

	rw_enter_write(rpipe->pipe_lock);
	if (kev != NULL && (kn->kn_flags & EV_ONESHOT))
		active = 1;
	else
		active = filt_piperead_common(kn, rpipe);
	if (active)
		knote_submit(kn, kev);
	rw_exit_write(rpipe->pipe_lock);

	return (active);
}

int
filt_pipewrite_common(struct knote *kn, struct pipe *rpipe)
{
	struct pipe *wpipe;

	rw_assert_wrlock(rpipe->pipe_lock);

	wpipe = pipe_peer(rpipe);

	if (wpipe == NULL) {
		kn->kn_data = 0;
		kn->kn_flags |= EV_EOF;
		if (kn->kn_flags & __EV_POLL)
			kn->kn_flags |= __EV_HUP;
		return (1);
	}
	kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;

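	/* Writable once at least PIPE_BUF bytes are free, as in pipe_poll(). */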
	return (kn->kn_data >= PIPE_BUF);
}

int
filt_pipewrite(struct knote *kn, long hint)
{
	struct pipe *rpipe = kn->kn_fp->f_data;

	return (filt_pipewrite_common(kn, rpipe));
}

int
filt_pipewritemodify(struct kevent *kev, struct knote *kn)
{
	struct pipe *rpipe = kn->kn_fp->f_data;
	int active;

	rw_enter_write(rpipe->pipe_lock);
	knote_modify(kev, kn);
	active = filt_pipewrite_common(kn, rpipe);
	rw_exit_write(rpipe->pipe_lock);

	return (active);
}

int
filt_pipewriteprocess(struct knote *kn, struct kevent *kev)
{
	struct pipe *rpipe = kn->kn_fp->f_data;
	int active;

	rw_enter_write(rpipe->pipe_lock);
	if (kev != NULL && (kn->kn_flags & EV_ONESHOT))
		active = 1;
	else
		active = filt_pipewrite_common(kn, rpipe);
	if (active)
		knote_submit(kn, kev);
	rw_exit_write(rpipe->pipe_lock);

	return (active);
}

int
filt_pipeexcept_common(struct knote *kn, struct pipe *rpipe)
{
	struct pipe *wpipe;
	int active = 0;

	rw_assert_wrlock(rpipe->pipe_lock);

	wpipe = pipe_peer(rpipe);

	if (kn->kn_flags & __EV_POLL) {
		if ((rpipe->pipe_state & PIPE_EOF) || wpipe == NULL) {
			kn->kn_flags |= __EV_HUP;
			active = 1;
		}
	}

	return (active);
}

int
filt_pipeexcept(struct knote *kn, long hint)
{
	struct pipe *rpipe = kn->kn_fp->f_data;

	return (filt_pipeexcept_common(kn, rpipe));
}

int
filt_pipeexceptmodify(struct kevent *kev, struct knote *kn)
{
	struct pipe *rpipe = kn->kn_fp->f_data;
	int active;

	rw_enter_write(rpipe->pipe_lock);
	knote_modify(kev, kn);
	active = filt_pipeexcept_common(kn, rpipe);
	rw_exit_write(rpipe->pipe_lock);

	return (active);
}

int
filt_pipeexceptprocess(struct knote *kn, struct kevent *kev)
{
	struct pipe *rpipe = kn->kn_fp->f_data;
	int active;

	rw_enter_write(rpipe->pipe_lock);
	if (kev != NULL && (kn->kn_flags & EV_ONESHOT))
		active = 1;
	else
		active = filt_pipeexcept_common(kn, rpipe);
	if (active)
		knote_submit(kn, kev);
	rw_exit_write(rpipe->pipe_lock);

	return (active);
}

void
pipe_init(void)
{
	pool_init(&pipe_pair_pool, sizeof(struct pipe_pair), 0, IPL_MPFLOOR,
	    PR_WAITOK, "pipepl", NULL);
}

struct pipe_pair *
pipe_pair_create(void)
{
	struct pipe_pair *pp;

	pp = pool_get(&pipe_pair_pool, PR_WAITOK | PR_ZERO);
	pp->pp_wpipe.pipe_pair = pp;
	pp->pp_rpipe.pipe_pair = pp;
	pp->pp_wpipe.pipe_peer = &pp->pp_rpipe;
	pp->pp_rpipe.pipe_peer = &pp->pp_wpipe;
	/*
	 * A single lock is shared by both ends of the pair, giving
	 * exclusive access to the pipe pair as a whole.
	 */
	rw_init(&pp->pp_lock, "pipelk");
	pp->pp_wpipe.pipe_lock = &pp->pp_lock;
	pp->pp_rpipe.pipe_lock = &pp->pp_lock;

	klist_init_rwlock(&pp->pp_wpipe.pipe_sel.si_note, &pp->pp_lock);
	klist_init_rwlock(&pp->pp_rpipe.pipe_sel.si_note, &pp->pp_lock);

	if (pipe_create(&pp->pp_wpipe) || pipe_create(&pp->pp_rpipe))
		goto err;
	return (pp);
err:
	pipe_destroy(&pp->pp_wpipe);
	pipe_destroy(&pp->pp_rpipe);
	return (NULL);
}

void
pipe_pair_destroy(struct pipe_pair *pp)
{
	klist_free(&pp->pp_wpipe.pipe_sel.si_note);
	klist_free(&pp->pp_rpipe.pipe_sel.si_note);
	pool_put(&pipe_pair_pool, pp);
}
1182