/*	$OpenBSD: sys_pipe.c,v 1.138 2022/05/09 14:49:55 visa Exp $	*/

/*
 * Copyright (c) 1996 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 * 4. Modifications may be freely made to this file if the above conditions
 *    are met.
 */

/*
 * This file contains a high-performance replacement for the socket-based
 * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
 * all features of sockets, but does everything that pipes normally do.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/pool.h>
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <sys/signalvar.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <sys/event.h>
#include <sys/lock.h>
#include <sys/poll.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#include <uvm/uvm_extern.h>

#include <sys/pipe.h>

struct pipe_pair {
	struct pipe pp_wpipe;
	struct pipe pp_rpipe;
	struct rwlock pp_lock;
};

/*
 * interfaces to the outside world
 */
int	pipe_read(struct file *, struct uio *, int);
int	pipe_write(struct file *, struct uio *, int);
int	pipe_close(struct file *, struct proc *);
int	pipe_poll(struct file *, int events, struct proc *);
int	pipe_kqfilter(struct file *fp, struct knote *kn);
int	pipe_ioctl(struct file *, u_long, caddr_t, struct proc *);
int	pipe_stat(struct file *fp, struct stat *ub, struct proc *p);

static const struct fileops pipeops = {
	.fo_read	= pipe_read,
	.fo_write	= pipe_write,
	.fo_ioctl	= pipe_ioctl,
	.fo_poll	= pipe_poll,
	.fo_kqfilter	= pipe_kqfilter,
	.fo_stat	= pipe_stat,
	.fo_close	= pipe_close
};

void	filt_pipedetach(struct knote *kn);
int	filt_piperead(struct knote *kn, long hint);
int	filt_pipewrite(struct knote *kn, long hint);
int	filt_pipeexcept(struct knote *kn, long hint);
int	filt_pipemodify(struct kevent *kev, struct knote *kn);
int	filt_pipeprocess(struct knote *kn, struct kevent *kev);

const struct filterops pipe_rfiltops = {
	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach	= NULL,
	.f_detach	= filt_pipedetach,
	.f_event	= filt_piperead,
	.f_modify	= filt_pipemodify,
	.f_process	= filt_pipeprocess,
};

const struct filterops pipe_wfiltops = {
	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach	= NULL,
	.f_detach	= filt_pipedetach,
	.f_event	= filt_pipewrite,
	.f_modify	= filt_pipemodify,
	.f_process	= filt_pipeprocess,
};

const struct filterops pipe_efiltops = {
	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach	= NULL,
	.f_detach	= filt_pipedetach,
	.f_event	= filt_pipeexcept,
	.f_modify	= filt_pipemodify,
	.f_process	= filt_pipeprocess,
};

/*
 * Default pipe buffer size(s); this can be fairly large now because pipe
 * space is pageable.  The pipe code will try to maintain locality of
 * reference for performance reasons, so small amounts of outstanding I/O
 * will not wipe the cache.
 */
#define MINPIPESIZE (PIPE_SIZE/3)

/*
 * Limit the number of "big" pipes
 */
#define LIMITBIGPIPES	32
unsigned int nbigpipe;
static unsigned int amountpipekva;

struct pool pipe_pair_pool;

int	dopipe(struct proc *, int *, int);
void	pipeselwakeup(struct pipe *);

int	pipe_create(struct pipe *);
void	pipe_destroy(struct pipe *);
int	pipe_rundown(struct pipe *);
struct pipe *pipe_peer(struct pipe *);
int	pipe_buffer_realloc(struct pipe *, u_int);
void	pipe_buffer_free(struct pipe *);

int	pipe_iolock(struct pipe *);
void	pipe_iounlock(struct pipe *);
int	pipe_iosleep(struct pipe *, const char *);

struct pipe_pair *pipe_pair_create(void);
void	pipe_pair_destroy(struct pipe_pair *);

/*
 * The pipe system calls for the DTYPE_PIPE type of pipes
 */

int
sys_pipe(struct proc *p, void *v, register_t *retval)
{
	struct sys_pipe_args /* {
		syscallarg(int *) fdp;
	} */ *uap = v;

	return (dopipe(p, SCARG(uap, fdp), 0));
}

int
sys_pipe2(struct proc *p, void *v, register_t *retval)
{
	struct sys_pipe2_args /* {
		syscallarg(int *) fdp;
		syscallarg(int) flags;
	} */ *uap = v;

	if (SCARG(uap, flags) & ~(O_CLOEXEC | FNONBLOCK))
		return (EINVAL);

	return (dopipe(p, SCARG(uap, fdp), SCARG(uap, flags)));
}
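
/*
 * These entry points back pipe(2) and pipe2(2).  A minimal userland usage
 * sketch, not part of this file (`pfd' is a hypothetical name):
 *
 *	int pfd[2];
 *
 *	if (pipe2(pfd, O_CLOEXEC | O_NONBLOCK) == -1)
 *		err(1, "pipe2");
 *
 * pfd[0] is the read end and pfd[1] the write end.  FNONBLOCK, checked
 * above, is the in-kernel spelling of O_NONBLOCK.
 */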

int
dopipe(struct proc *p, int *ufds, int flags)
{
	struct filedesc *fdp = p->p_fd;
	struct file *rf, *wf;
	struct pipe_pair *pp;
	struct pipe *rpipe, *wpipe = NULL;
	int fds[2], cloexec, error;

	cloexec = (flags & O_CLOEXEC) ? UF_EXCLOSE : 0;

	pp = pipe_pair_create();
	if (pp == NULL)
		return (ENOMEM);
	wpipe = &pp->pp_wpipe;
	rpipe = &pp->pp_rpipe;

	fdplock(fdp);

	error = falloc(p, &rf, &fds[0]);
	if (error != 0)
		goto free2;
	rf->f_flag = FREAD | FWRITE | (flags & FNONBLOCK);
	rf->f_type = DTYPE_PIPE;
	rf->f_data = rpipe;
	rf->f_ops = &pipeops;

	error = falloc(p, &wf, &fds[1]);
	if (error != 0)
		goto free3;
	wf->f_flag = FREAD | FWRITE | (flags & FNONBLOCK);
	wf->f_type = DTYPE_PIPE;
	wf->f_data = wpipe;
	wf->f_ops = &pipeops;

	fdinsert(fdp, fds[0], cloexec, rf);
	fdinsert(fdp, fds[1], cloexec, wf);

	error = copyout(fds, ufds, sizeof(fds));
	if (error == 0) {
		fdpunlock(fdp);
#ifdef KTRACE
		if (KTRPOINT(p, KTR_STRUCT))
			ktrfds(p, fds, 2);
#endif
	} else {
		/* fdrelease() unlocks fdp. */
		fdrelease(p, fds[0]);
		fdplock(fdp);
		fdrelease(p, fds[1]);
	}

	FRELE(rf, p);
	FRELE(wf, p);
	return (error);

free3:
	fdremove(fdp, fds[0]);
	closef(rf, p);
	rpipe = NULL;
free2:
	fdpunlock(fdp);
	pipe_destroy(wpipe);
	pipe_destroy(rpipe);
	return (error);
}

/*
 * Allocate kva for the pipe circular buffer; the space is pageable.
 * This routine 'reallocs' the size of a pipe safely: if allocation
 * fails, it retains the old buffer and returns ENOMEM.
 */
int
pipe_buffer_realloc(struct pipe *cpipe, u_int size)
{
	caddr_t buffer;

	/* buffer uninitialized or pipe locked */
	KASSERT((cpipe->pipe_buffer.buffer == NULL) ||
	    (cpipe->pipe_state & PIPE_LOCK));

	/* buffer should be empty */
	KASSERT(cpipe->pipe_buffer.cnt == 0);

	KERNEL_LOCK();
	buffer = km_alloc(size, &kv_any, &kp_pageable, &kd_waitok);
	KERNEL_UNLOCK();
	if (buffer == NULL)
		return (ENOMEM);

	/* free old resources if we are resizing */
	pipe_buffer_free(cpipe);

	cpipe->pipe_buffer.buffer = buffer;
	cpipe->pipe_buffer.size = size;
	cpipe->pipe_buffer.in = 0;
	cpipe->pipe_buffer.out = 0;

	atomic_add_int(&amountpipekva, cpipe->pipe_buffer.size);

	return (0);
}
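
/*
 * The buffer allocated above is used as a ring: `in' is where the next
 * byte written goes, `out' is where the next byte read comes from, and
 * `cnt' is the number of buffered bytes.  A sketch of the invariants
 * maintained by pipe_read() and pipe_write() below:
 *
 *	0 <= in, out < size
 *	0 <= cnt <= size
 *	contiguous writable space at `in'  = size - in
 *	total free space                   = size - cnt
 *
 * Both indices wrap back to 0 when they reach `size'.
 */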

/*
 * initialize and allocate VM and memory for pipe
 */
int
pipe_create(struct pipe *cpipe)
{
	int error;

	error = pipe_buffer_realloc(cpipe, PIPE_SIZE);
	if (error != 0)
		return (error);

	sigio_init(&cpipe->pipe_sigio);

	getnanotime(&cpipe->pipe_ctime);
	cpipe->pipe_atime = cpipe->pipe_ctime;
	cpipe->pipe_mtime = cpipe->pipe_ctime;

	return (0);
}

struct pipe *
pipe_peer(struct pipe *cpipe)
{
	struct pipe *peer;

	rw_assert_anylock(cpipe->pipe_lock);

	peer = cpipe->pipe_peer;
	if (peer == NULL || (peer->pipe_state & PIPE_EOF))
		return (NULL);
	return (peer);
}

/*
 * Lock a pipe for exclusive I/O access.
 */
int
pipe_iolock(struct pipe *cpipe)
{
	int error;

	rw_assert_wrlock(cpipe->pipe_lock);

	while (cpipe->pipe_state & PIPE_LOCK) {
		cpipe->pipe_state |= PIPE_LWANT;
		error = rwsleep_nsec(cpipe, cpipe->pipe_lock, PRIBIO | PCATCH,
		    "pipeiolk", INFSLP);
		if (error)
			return (error);
	}
	cpipe->pipe_state |= PIPE_LOCK;
	return (0);
}

/*
 * Unlock a pipe I/O lock.
 */
void
pipe_iounlock(struct pipe *cpipe)
{
	rw_assert_wrlock(cpipe->pipe_lock);
	KASSERT(cpipe->pipe_state & PIPE_LOCK);

	cpipe->pipe_state &= ~PIPE_LOCK;
	if (cpipe->pipe_state & PIPE_LWANT) {
		cpipe->pipe_state &= ~PIPE_LWANT;
		wakeup(cpipe);
	}
}

/*
 * Unlock the pipe I/O lock and go to sleep.  Returns 0 on success, with the
 * I/O lock relocked.  Otherwise, if a signal was caught, returns non-zero
 * and leaves the I/O lock unlocked.
 *
 * Any caller must obtain a reference to the pipe by incrementing `pipe_busy'
 * before calling this function in order to ensure that the pipe is not
 * destroyed while sleeping.
 */
int
pipe_iosleep(struct pipe *cpipe, const char *wmesg)
{
	int error;

	pipe_iounlock(cpipe);
	error = rwsleep_nsec(cpipe, cpipe->pipe_lock, PRIBIO | PCATCH, wmesg,
	    INFSLP);
	if (error)
		return (error);
	return (pipe_iolock(cpipe));
}
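
/*
 * A minimal sketch of the caller protocol around the I/O lock, as used by
 * pipe_read() and pipe_write() below (error handling omitted):
 *
 *	rw_enter_write(cpipe->pipe_lock);
 *	++cpipe->pipe_busy;
 *	error = pipe_iolock(cpipe);
 *	...
 *	error = pipe_iosleep(cpipe, "wmesg");
 *	...
 *	pipe_iounlock(cpipe);
 *	--cpipe->pipe_busy;
 *	pipe_rundown(cpipe);
 *	rw_exit_write(cpipe->pipe_lock);
 *
 * The `pipe_busy' increment is the rundown reference described above; the
 * final pipe_rundown() wakes pipe_destroy() once the pipe is idle.
 */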

void
pipeselwakeup(struct pipe *cpipe)
{
	rw_assert_wrlock(cpipe->pipe_lock);

	if (cpipe->pipe_state & PIPE_SEL) {
		cpipe->pipe_state &= ~PIPE_SEL;
		selwakeup(&cpipe->pipe_sel);
	} else {
		KNOTE(&cpipe->pipe_sel.si_note, 0);
	}

	if (cpipe->pipe_state & PIPE_ASYNC)
		pgsigio(&cpipe->pipe_sigio, SIGIO, 0);
}

int
pipe_read(struct file *fp, struct uio *uio, int fflags)
{
	struct pipe *rpipe = fp->f_data;
	size_t nread = 0, size;
	int error;

	rw_enter_write(rpipe->pipe_lock);
	++rpipe->pipe_busy;
	error = pipe_iolock(rpipe);
	if (error) {
		--rpipe->pipe_busy;
		pipe_rundown(rpipe);
		rw_exit_write(rpipe->pipe_lock);
		return (error);
	}

	while (uio->uio_resid) {
		/* Normal pipe buffer receive. */
		if (rpipe->pipe_buffer.cnt > 0) {
			size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
			if (size > rpipe->pipe_buffer.cnt)
				size = rpipe->pipe_buffer.cnt;
			if (size > uio->uio_resid)
				size = uio->uio_resid;
			rw_exit_write(rpipe->pipe_lock);
			error = uiomove(&rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
					size, uio);
			rw_enter_write(rpipe->pipe_lock);
			if (error) {
				break;
			}
			rpipe->pipe_buffer.out += size;
			if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
				rpipe->pipe_buffer.out = 0;

			rpipe->pipe_buffer.cnt -= size;
			/*
			 * If there is no more to read in the pipe, reset
			 * its pointers to the beginning.  This improves
			 * cache hit stats.
			 */
			if (rpipe->pipe_buffer.cnt == 0) {
				rpipe->pipe_buffer.in = 0;
				rpipe->pipe_buffer.out = 0;
			}
			nread += size;
		} else {
			/*
			 * detect EOF condition
			 * read returns 0 on EOF, no need to set error
			 */
			if (rpipe->pipe_state & PIPE_EOF)
				break;

			/* If the "write-side" has been blocked, wake it up. */
			if (rpipe->pipe_state & PIPE_WANTW) {
				rpipe->pipe_state &= ~PIPE_WANTW;
				wakeup(rpipe);
			}

			/* Break if some data was read. */
			if (nread > 0)
				break;

			/* Handle non-blocking mode operation. */
			if (fp->f_flag & FNONBLOCK) {
				error = EAGAIN;
				break;
			}

			/* Wait for more data. */
			rpipe->pipe_state |= PIPE_WANTR;
			error = pipe_iosleep(rpipe, "piperd");
			if (error)
				goto unlocked_error;
		}
	}
	pipe_iounlock(rpipe);

	if (error == 0)
		getnanotime(&rpipe->pipe_atime);
unlocked_error:
	--rpipe->pipe_busy;

	if (pipe_rundown(rpipe) == 0 && rpipe->pipe_buffer.cnt < MINPIPESIZE) {
		/* Handle write blocking hysteresis. */
		if (rpipe->pipe_state & PIPE_WANTW) {
			rpipe->pipe_state &= ~PIPE_WANTW;
			wakeup(rpipe);
		}
	}

	if (rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt >= PIPE_BUF)
		pipeselwakeup(rpipe);

	rw_exit_write(rpipe->pipe_lock);
	return (error);
}

int
pipe_write(struct file *fp, struct uio *uio, int fflags)
{
	struct pipe *rpipe = fp->f_data, *wpipe;
	struct rwlock *lock = rpipe->pipe_lock;
	size_t orig_resid;
	int error;

	rw_enter_write(lock);
	wpipe = pipe_peer(rpipe);

	/*
	 * Detect loss of pipe read side; returning EPIPE lets the caller
	 * issue SIGPIPE.
	 */
	if (wpipe == NULL) {
		rw_exit_write(lock);
		return (EPIPE);
	}

	++wpipe->pipe_busy;
	error = pipe_iolock(wpipe);
	if (error) {
		--wpipe->pipe_busy;
		pipe_rundown(wpipe);
		rw_exit_write(lock);
		return (error);
	}

	/* If it is advantageous to resize the pipe buffer, do so. */
	if (uio->uio_resid > PIPE_SIZE &&
	    wpipe->pipe_buffer.size <= PIPE_SIZE &&
	    wpipe->pipe_buffer.cnt == 0) {
		unsigned int npipe;

		npipe = atomic_inc_int_nv(&nbigpipe);
		if (npipe > LIMITBIGPIPES ||
		    pipe_buffer_realloc(wpipe, BIG_PIPE_SIZE) != 0)
			atomic_dec_int(&nbigpipe);
	}

	orig_resid = uio->uio_resid;

	while (uio->uio_resid) {
		size_t space;

		if (wpipe->pipe_state & PIPE_EOF) {
			error = EPIPE;
			break;
		}

		space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;

		/* Writes of size <= PIPE_BUF must be atomic. */
		if (space < uio->uio_resid && orig_resid <= PIPE_BUF)
			space = 0;

		if (space > 0) {
			size_t size;	/* Transfer size */
			size_t segsize;	/* first segment to transfer */

			/*
			 * Transfer size is minimum of uio transfer
			 * and free space in pipe buffer.
			 */
			if (space > uio->uio_resid)
				size = uio->uio_resid;
			else
				size = space;
			/*
			 * First segment to transfer is minimum of
			 * transfer size and contiguous space in
			 * pipe buffer.  If first segment to transfer
			 * is less than the transfer size, we've got
			 * a wraparound in the buffer.
			 */
			segsize = wpipe->pipe_buffer.size -
				wpipe->pipe_buffer.in;
			if (segsize > size)
				segsize = size;

			/* Transfer first segment */

			rw_exit_write(lock);
			error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
					segsize, uio);
			rw_enter_write(lock);

			if (error == 0 && segsize < size) {
				/*
				 * Transfer remaining part now, to
				 * support atomic writes.  Wraparound
				 * happened.
				 */
#ifdef DIAGNOSTIC
				if (wpipe->pipe_buffer.in + segsize !=
				    wpipe->pipe_buffer.size)
					panic("Expected pipe buffer wraparound disappeared");
#endif

				rw_exit_write(lock);
				error = uiomove(&wpipe->pipe_buffer.buffer[0],
						size - segsize, uio);
				rw_enter_write(lock);
			}
			if (error == 0) {
				wpipe->pipe_buffer.in += size;
				if (wpipe->pipe_buffer.in >=
				    wpipe->pipe_buffer.size) {
#ifdef DIAGNOSTIC
					if (wpipe->pipe_buffer.in != size - segsize + wpipe->pipe_buffer.size)
						panic("Expected wraparound bad");
#endif
					wpipe->pipe_buffer.in = size - segsize;
				}

				wpipe->pipe_buffer.cnt += size;
#ifdef DIAGNOSTIC
				if (wpipe->pipe_buffer.cnt > wpipe->pipe_buffer.size)
					panic("Pipe buffer overflow");
#endif
			}
			if (error)
				break;
		} else {
			/* If the "read-side" has been blocked, wake it up. */
			if (wpipe->pipe_state & PIPE_WANTR) {
				wpipe->pipe_state &= ~PIPE_WANTR;
				wakeup(wpipe);
			}

			/* Don't block on non-blocking I/O. */
			if (fp->f_flag & FNONBLOCK) {
				error = EAGAIN;
				break;
			}

			/*
			 * We have no more space and have something to offer,
			 * wake up select/poll.
			 */
			pipeselwakeup(wpipe);

			wpipe->pipe_state |= PIPE_WANTW;
			error = pipe_iosleep(wpipe, "pipewr");
			if (error)
				goto unlocked_error;

			/*
			 * If read side wants to go away, we just issue a
			 * signal to ourselves.
			 */
			if (wpipe->pipe_state & PIPE_EOF) {
				error = EPIPE;
				break;
			}
		}
	}
	pipe_iounlock(wpipe);

unlocked_error:
	--wpipe->pipe_busy;

	if (pipe_rundown(wpipe) == 0 && wpipe->pipe_buffer.cnt > 0) {
		/*
		 * If we have put any characters in the buffer, we wake up
		 * the reader.
		 */
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
	}

	/* Don't return EPIPE if I/O was successful. */
	if (wpipe->pipe_buffer.cnt == 0 &&
	    uio->uio_resid == 0 &&
	    error == EPIPE) {
		error = 0;
	}

	if (error == 0)
		getnanotime(&wpipe->pipe_mtime);
	/* We have something to offer, wake up select/poll. */
	if (wpipe->pipe_buffer.cnt)
		pipeselwakeup(wpipe);

	rw_exit_write(lock);
	return (error);
}
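
/*
 * The `space = 0' clamp above is what gives write(2) on a pipe its POSIX
 * atomicity guarantee.  A hedged userland sketch of what that buys
 * (hypothetical `pfd' descriptor, not part of this file):
 *
 *	char record[PIPE_BUF];
 *	ssize_t n;
 *
 *	n = write(pfd[1], record, sizeof(record));
 *
 * A write of at most PIPE_BUF bytes is either copied into the buffer in
 * one piece or the writer sleeps until it can be; it is never interleaved
 * with data from other writers.  Larger writes may be split.
 */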

/*
 * we implement a very minimal set of ioctls for compatibility with sockets.
 */
int
pipe_ioctl(struct file *fp, u_long cmd, caddr_t data, struct proc *p)
{
	struct pipe *mpipe = fp->f_data;
	int error = 0;

	switch (cmd) {

	case FIONBIO:
		break;

	case FIOASYNC:
		rw_enter_write(mpipe->pipe_lock);
		if (*(int *)data) {
			mpipe->pipe_state |= PIPE_ASYNC;
		} else {
			mpipe->pipe_state &= ~PIPE_ASYNC;
		}
		rw_exit_write(mpipe->pipe_lock);
		break;

	case FIONREAD:
		rw_enter_read(mpipe->pipe_lock);
		*(int *)data = mpipe->pipe_buffer.cnt;
		rw_exit_read(mpipe->pipe_lock);
		break;

	case FIOSETOWN:
	case SIOCSPGRP:
	case TIOCSPGRP:
		error = sigio_setown(&mpipe->pipe_sigio, cmd, data);
		break;

	case FIOGETOWN:
	case SIOCGPGRP:
	case TIOCGPGRP:
		sigio_getown(&mpipe->pipe_sigio, cmd, data);
		break;

	default:
		error = ENOTTY;
	}

	return (error);
}
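
/*
 * For example, FIONREAD lets a reader size its next read(2).  A sketch
 * (hypothetical `pfd' descriptor):
 *
 *	int nbytes;
 *
 *	if (ioctl(pfd[0], FIONREAD, &nbytes) == 0)
 *		... nbytes is the count of unread bytes in the pipe ...
 */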

int
pipe_poll(struct file *fp, int events, struct proc *p)
{
	struct pipe *rpipe = fp->f_data, *wpipe;
	struct rwlock *lock = rpipe->pipe_lock;
	int revents = 0;

	rw_enter_write(lock);
	wpipe = pipe_peer(rpipe);

	if (events & (POLLIN | POLLRDNORM)) {
		if (rpipe->pipe_buffer.cnt > 0 ||
		    (rpipe->pipe_state & PIPE_EOF))
			revents |= events & (POLLIN | POLLRDNORM);
	}

	/* NOTE: POLLHUP and POLLOUT/POLLWRNORM are mutually exclusive */
	if ((rpipe->pipe_state & PIPE_EOF) || wpipe == NULL)
		revents |= POLLHUP;
	else if (events & (POLLOUT | POLLWRNORM)) {
		if (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt >= PIPE_BUF)
			revents |= events & (POLLOUT | POLLWRNORM);
	}

	if (revents == 0) {
		if (events & (POLLIN | POLLRDNORM)) {
			selrecord(p, &rpipe->pipe_sel);
			rpipe->pipe_state |= PIPE_SEL;
		}
		if (events & (POLLOUT | POLLWRNORM)) {
			selrecord(p, &wpipe->pipe_sel);
			wpipe->pipe_state |= PIPE_SEL;
		}
	}

	rw_exit_write(lock);

	return (revents);
}

int
pipe_stat(struct file *fp, struct stat *ub, struct proc *p)
{
	struct pipe *pipe = fp->f_data;

	memset(ub, 0, sizeof(*ub));

	rw_enter_read(pipe->pipe_lock);
	ub->st_mode = S_IFIFO;
	ub->st_blksize = pipe->pipe_buffer.size;
	ub->st_size = pipe->pipe_buffer.cnt;
	ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize;
	ub->st_atim.tv_sec  = pipe->pipe_atime.tv_sec;
	ub->st_atim.tv_nsec = pipe->pipe_atime.tv_nsec;
	ub->st_mtim.tv_sec  = pipe->pipe_mtime.tv_sec;
	ub->st_mtim.tv_nsec = pipe->pipe_mtime.tv_nsec;
	ub->st_ctim.tv_sec  = pipe->pipe_ctime.tv_sec;
	ub->st_ctim.tv_nsec = pipe->pipe_ctime.tv_nsec;
	ub->st_uid = fp->f_cred->cr_uid;
	ub->st_gid = fp->f_cred->cr_gid;
	rw_exit_read(pipe->pipe_lock);
	/*
	 * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen.
	 * XXX (st_dev, st_ino) should be unique.
	 */
	return (0);
}

int
pipe_close(struct file *fp, struct proc *p)
{
	struct pipe *cpipe = fp->f_data;

	fp->f_ops = NULL;
	fp->f_data = NULL;
	pipe_destroy(cpipe);
	return (0);
}

/*
 * Free kva for pipe circular buffer.
 * No pipe lock check as only called from pipe_buffer_realloc() and
 * pipe_destroy().
 */
void
pipe_buffer_free(struct pipe *cpipe)
{
	u_int size;

	if (cpipe->pipe_buffer.buffer == NULL)
		return;

	size = cpipe->pipe_buffer.size;

	KERNEL_LOCK();
	km_free(cpipe->pipe_buffer.buffer, size, &kv_any, &kp_pageable);
	KERNEL_UNLOCK();

	cpipe->pipe_buffer.buffer = NULL;

	atomic_sub_int(&amountpipekva, size);
	if (size > PIPE_SIZE)
		atomic_dec_int(&nbigpipe);
}

/*
 * Shut down the pipe and free its resources.
 */
void
pipe_destroy(struct pipe *cpipe)
{
	struct pipe *ppipe;

	if (cpipe == NULL)
		return;

	rw_enter_write(cpipe->pipe_lock);

	pipeselwakeup(cpipe);
	sigio_free(&cpipe->pipe_sigio);

	/*
	 * If the other side is blocked, wake it up saying that
	 * we want to close it down.
	 */
	cpipe->pipe_state |= PIPE_EOF;
	while (cpipe->pipe_busy) {
		wakeup(cpipe);
		cpipe->pipe_state |= PIPE_WANTD;
		rwsleep_nsec(cpipe, cpipe->pipe_lock, PRIBIO, "pipecl", INFSLP);
	}

	/* Disconnect from peer. */
	if ((ppipe = cpipe->pipe_peer) != NULL) {
		pipeselwakeup(ppipe);

		ppipe->pipe_state |= PIPE_EOF;
		wakeup(ppipe);
		ppipe->pipe_peer = NULL;
	}

	pipe_buffer_free(cpipe);

	rw_exit_write(cpipe->pipe_lock);

	if (ppipe == NULL)
		pipe_pair_destroy(cpipe->pipe_pair);
}

/*
 * Returns non-zero if a rundown is currently ongoing.
 */
int
pipe_rundown(struct pipe *cpipe)
{
	rw_assert_wrlock(cpipe->pipe_lock);

	if (cpipe->pipe_busy > 0 || (cpipe->pipe_state & PIPE_WANTD) == 0)
		return (0);

	/* Only wakeup pipe_destroy() once the pipe is no longer busy. */
	cpipe->pipe_state &= ~(PIPE_WANTD | PIPE_WANTR | PIPE_WANTW);
	wakeup(cpipe);
	return (1);
}

int
pipe_kqfilter(struct file *fp, struct knote *kn)
{
	struct pipe *rpipe = kn->kn_fp->f_data, *wpipe;
	struct rwlock *lock = rpipe->pipe_lock;
	int error = 0;

	rw_enter_write(lock);
	wpipe = pipe_peer(rpipe);

	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_fop = &pipe_rfiltops;
		kn->kn_hook = rpipe;
		klist_insert_locked(&rpipe->pipe_sel.si_note, kn);
		break;
	case EVFILT_WRITE:
		if (wpipe == NULL) {
			/* other end of pipe has been closed */
			error = EPIPE;
			break;
		}
		kn->kn_fop = &pipe_wfiltops;
		kn->kn_hook = wpipe;
		klist_insert_locked(&wpipe->pipe_sel.si_note, kn);
		break;
	case EVFILT_EXCEPT:
		if (kn->kn_flags & __EV_SELECT) {
			/* Prevent triggering exceptfds. */
			error = EPERM;
			break;
		}
		if ((kn->kn_flags & __EV_POLL) == 0) {
			/* Disallow usage through kevent(2). */
			error = EINVAL;
			break;
		}
		kn->kn_fop = &pipe_efiltops;
		kn->kn_hook = rpipe;
		klist_insert_locked(&rpipe->pipe_sel.si_note, kn);
		break;
	default:
		error = EINVAL;
	}

	rw_exit_write(lock);

	return (error);
}
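
/*
 * A userland sketch of attaching the read filter above with kevent(2)
 * (hypothetical `kq' and `pfd' names):
 *
 *	struct kevent kev;
 *
 *	EV_SET(&kev, pfd[0], EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
 *		err(1, "kevent");
 *
 * EVFILT_EXCEPT is reserved for the kernel's poll(2)/select(2) emulation
 * and is rejected above when requested directly through kevent(2).
 */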

void
filt_pipedetach(struct knote *kn)
{
	struct pipe *cpipe = kn->kn_hook;

	klist_remove(&cpipe->pipe_sel.si_note, kn);
}

int
filt_piperead(struct knote *kn, long hint)
{
	struct pipe *rpipe = kn->kn_fp->f_data, *wpipe;

	rw_assert_wrlock(rpipe->pipe_lock);

	wpipe = pipe_peer(rpipe);

	kn->kn_data = rpipe->pipe_buffer.cnt;

	if ((rpipe->pipe_state & PIPE_EOF) || wpipe == NULL) {
		kn->kn_flags |= EV_EOF;
		if (kn->kn_flags & __EV_POLL)
			kn->kn_flags |= __EV_HUP;
		return (1);
	}

	return (kn->kn_data > 0);
}

int
filt_pipewrite(struct knote *kn, long hint)
{
	struct pipe *rpipe = kn->kn_fp->f_data, *wpipe;

	rw_assert_wrlock(rpipe->pipe_lock);

	wpipe = pipe_peer(rpipe);

	if (wpipe == NULL) {
		kn->kn_data = 0;
		kn->kn_flags |= EV_EOF;
		if (kn->kn_flags & __EV_POLL)
			kn->kn_flags |= __EV_HUP;
		return (1);
	}
	kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;

	return (kn->kn_data >= PIPE_BUF);
}

int
filt_pipeexcept(struct knote *kn, long hint)
{
	struct pipe *rpipe = kn->kn_fp->f_data, *wpipe;
	int active = 0;

	rw_assert_wrlock(rpipe->pipe_lock);

	wpipe = pipe_peer(rpipe);

	if (kn->kn_flags & __EV_POLL) {
		if ((rpipe->pipe_state & PIPE_EOF) || wpipe == NULL) {
			kn->kn_flags |= __EV_HUP;
			active = 1;
		}
	}

	return (active);
}

int
filt_pipemodify(struct kevent *kev, struct knote *kn)
{
	struct pipe *rpipe = kn->kn_fp->f_data;
	int active;

	rw_enter_write(rpipe->pipe_lock);
	active = knote_modify(kev, kn);
	rw_exit_write(rpipe->pipe_lock);

	return (active);
}

int
filt_pipeprocess(struct knote *kn, struct kevent *kev)
{
	struct pipe *rpipe = kn->kn_fp->f_data;
	int active;

	rw_enter_write(rpipe->pipe_lock);
	active = knote_process(kn, kev);
	rw_exit_write(rpipe->pipe_lock);

	return (active);
}

void
pipe_init(void)
{
	pool_init(&pipe_pair_pool, sizeof(struct pipe_pair), 0, IPL_MPFLOOR,
	    PR_WAITOK, "pipepl", NULL);
}

struct pipe_pair *
pipe_pair_create(void)
{
	struct pipe_pair *pp;

	pp = pool_get(&pipe_pair_pool, PR_WAITOK | PR_ZERO);
	pp->pp_wpipe.pipe_pair = pp;
	pp->pp_rpipe.pipe_pair = pp;
	pp->pp_wpipe.pipe_peer = &pp->pp_rpipe;
	pp->pp_rpipe.pipe_peer = &pp->pp_wpipe;
	/*
	 * One lock is used per pipe pair in order to obtain exclusive access to
	 * the pipe pair.
	 */
	rw_init(&pp->pp_lock, "pipelk");
	pp->pp_wpipe.pipe_lock = &pp->pp_lock;
	pp->pp_rpipe.pipe_lock = &pp->pp_lock;

	klist_init_rwlock(&pp->pp_wpipe.pipe_sel.si_note, &pp->pp_lock);
	klist_init_rwlock(&pp->pp_rpipe.pipe_sel.si_note, &pp->pp_lock);

	if (pipe_create(&pp->pp_wpipe) || pipe_create(&pp->pp_rpipe))
		goto err;
	return (pp);
err:
	pipe_destroy(&pp->pp_wpipe);
	pipe_destroy(&pp->pp_rpipe);
	return (NULL);
}

void
pipe_pair_destroy(struct pipe_pair *pp)
{
	klist_free(&pp->pp_wpipe.pipe_sel.si_note);
	klist_free(&pp->pp_rpipe.pipe_sel.si_note);
	pool_put(&pipe_pair_pool, pp);
}
1081