/*	$OpenBSD: sys_pipe.c,v 1.118 2020/02/20 16:56:52 visa Exp $	*/

/*
 * Copyright (c) 1996 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 * 4. Modifications may be freely made to this file if the above conditions
 *    are met.
 */

/*
 * This file contains a high-performance replacement for the socket-based
 * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
 * all features of sockets, but does do everything that pipes normally
 * do.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/pool.h>
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <sys/signalvar.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <sys/event.h>
#include <sys/lock.h>
#include <sys/poll.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#include <uvm/uvm_extern.h>

#include <sys/pipe.h>

/*
 * interfaces to the outside world
 */
int	pipe_read(struct file *, struct uio *, int);
int	pipe_write(struct file *, struct uio *, int);
int	pipe_close(struct file *, struct proc *);
int	pipe_poll(struct file *, int events, struct proc *);
int	pipe_kqfilter(struct file *fp, struct knote *kn);
int	pipe_ioctl(struct file *, u_long, caddr_t, struct proc *);
int	pipe_stat(struct file *fp, struct stat *ub, struct proc *p);

static const struct fileops pipeops = {
	.fo_read	= pipe_read,
	.fo_write	= pipe_write,
	.fo_ioctl	= pipe_ioctl,
	.fo_poll	= pipe_poll,
	.fo_kqfilter	= pipe_kqfilter,
	.fo_stat	= pipe_stat,
	.fo_close	= pipe_close
};

void	filt_pipedetach(struct knote *kn);
int	filt_piperead(struct knote *kn, long hint);
int	filt_pipewrite(struct knote *kn, long hint);

const struct filterops pipe_rfiltops = {
	.f_flags	= FILTEROP_ISFD,
	.f_attach	= NULL,
	.f_detach	= filt_pipedetach,
	.f_event	= filt_piperead,
};

const struct filterops pipe_wfiltops = {
	.f_flags	= FILTEROP_ISFD,
	.f_attach	= NULL,
	.f_detach	= filt_pipedetach,
	.f_event	= filt_pipewrite,
};
/*
 * Default pipe buffer size(s); these can be fairly large now because pipe
 * space is pageable.  The pipe code will try to maintain locality of
 * reference for performance reasons, so small amounts of outstanding I/O
 * will not wipe the cache.
 */
#define MINPIPESIZE (PIPE_SIZE/3)

/*
 * Limit the number of "big" pipes
 */
#define LIMITBIGPIPES	32
unsigned int nbigpipe;
static unsigned int amountpipekva;

struct pool pipe_pool;
struct pool pipe_lock_pool;

int	dopipe(struct proc *, int *, int);
void	pipeselwakeup(struct pipe *);

struct pipe *pipe_create(void);
void	pipe_destroy(struct pipe *);
int	pipe_rundown(struct pipe *);
struct pipe *pipe_peer(struct pipe *);
int	pipe_buffer_realloc(struct pipe *, u_int);
void	pipe_buffer_free(struct pipe *);

int	pipe_iolock(struct pipe *);
void	pipe_iounlock(struct pipe *);
int	pipe_iosleep(struct pipe *, const char *);

/*
 * The pipe system call for the DTYPE_PIPE type of pipes
 */

int
sys_pipe(struct proc *p, void *v, register_t *retval)
{
	struct sys_pipe_args /* {
		syscallarg(int *) fdp;
	} */ *uap = v;

	return (dopipe(p, SCARG(uap, fdp), 0));
}

int
sys_pipe2(struct proc *p, void *v, register_t *retval)
{
	struct sys_pipe2_args /* {
		syscallarg(int *) fdp;
		syscallarg(int) flags;
	} */ *uap = v;

	if (SCARG(uap, flags) & ~(O_CLOEXEC | FNONBLOCK))
		return (EINVAL);

	return (dopipe(p, SCARG(uap, fdp), SCARG(uap, flags)));
}
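
/*
 * Example (illustrative, userland view): sys_pipe2() above restricts the
 * flags to O_CLOEXEC and O_NONBLOCK (FNONBLOCK); anything else fails with
 * EINVAL.  A minimal sketch of a caller:
 *
 *	#include <err.h>
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int fds[2];
 *	if (pipe2(fds, O_CLOEXEC | O_NONBLOCK) == -1)
 *		err(1, "pipe2");
 *	... fds[0] is the read end, fds[1] the write end ...
 */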

int
dopipe(struct proc *p, int *ufds, int flags)
{
	struct filedesc *fdp = p->p_fd;
	struct file *rf, *wf;
	struct pipe *rpipe, *wpipe = NULL;
	struct rwlock *lock;
	int fds[2], cloexec, error;

	cloexec = (flags & O_CLOEXEC) ? UF_EXCLOSE : 0;

	if ((rpipe = pipe_create()) == NULL) {
		error = ENOMEM;
		goto free1;
	}

	/*
	 * One lock is shared by both ends of a pipe pair in order to obtain
	 * exclusive access to the pair.
	 */
	lock = pool_get(&pipe_lock_pool, PR_WAITOK);
	rw_init(lock, "pipelk");
	rpipe->pipe_lock = lock;

	if ((wpipe = pipe_create()) == NULL) {
		error = ENOMEM;
		goto free1;
	}
	wpipe->pipe_lock = lock;

	rpipe->pipe_peer = wpipe;
	wpipe->pipe_peer = rpipe;

	fdplock(fdp);

	error = falloc(p, &rf, &fds[0]);
	if (error != 0)
		goto free2;
	rf->f_flag = FREAD | FWRITE | (flags & FNONBLOCK);
	rf->f_type = DTYPE_PIPE;
	rf->f_data = rpipe;
	rf->f_ops = &pipeops;

	error = falloc(p, &wf, &fds[1]);
	if (error != 0)
		goto free3;
	wf->f_flag = FREAD | FWRITE | (flags & FNONBLOCK);
	wf->f_type = DTYPE_PIPE;
	wf->f_data = wpipe;
	wf->f_ops = &pipeops;

	fdinsert(fdp, fds[0], cloexec, rf);
	fdinsert(fdp, fds[1], cloexec, wf);

	error = copyout(fds, ufds, sizeof(fds));
	if (error == 0) {
		fdpunlock(fdp);
#ifdef KTRACE
		if (KTRPOINT(p, KTR_STRUCT))
			ktrfds(p, fds, 2);
#endif
	} else {
		/* fdrelease() unlocks fdp. */
		fdrelease(p, fds[0]);
		fdplock(fdp);
		fdrelease(p, fds[1]);
	}

	FRELE(rf, p);
	FRELE(wf, p);
	return (error);

free3:
	fdremove(fdp, fds[0]);
	closef(rf, p);
	rpipe = NULL;
free2:
	fdpunlock(fdp);
free1:
	pipe_destroy(wpipe);
	pipe_destroy(rpipe);
	return (error);
}

/*
 * Allocate kva for the pipe circular buffer; the space is pageable.
 * This routine will 'realloc' the size of a pipe safely: if allocation
 * fails, the old buffer is retained and ENOMEM is returned.
 */
int
pipe_buffer_realloc(struct pipe *cpipe, u_int size)
{
	caddr_t buffer;

	/* buffer uninitialized or pipe locked */
	KASSERT((cpipe->pipe_buffer.buffer == NULL) ||
	    (cpipe->pipe_state & PIPE_LOCK));

	/* buffer should be empty */
	KASSERT(cpipe->pipe_buffer.cnt == 0);

	KERNEL_LOCK();
	buffer = km_alloc(size, &kv_any, &kp_pageable, &kd_waitok);
	KERNEL_UNLOCK();
	if (buffer == NULL)
		return (ENOMEM);

	/* free old resources if we are resizing */
	pipe_buffer_free(cpipe);

	cpipe->pipe_buffer.buffer = buffer;
	cpipe->pipe_buffer.size = size;
	cpipe->pipe_buffer.in = 0;
	cpipe->pipe_buffer.out = 0;

	atomic_add_int(&amountpipekva, cpipe->pipe_buffer.size);

	return (0);
}

/*
 * initialize and allocate VM and memory for pipe
 */
struct pipe *
pipe_create(void)
{
	struct pipe *cpipe;
	int error;

	cpipe = pool_get(&pipe_pool, PR_WAITOK | PR_ZERO);

	error = pipe_buffer_realloc(cpipe, PIPE_SIZE);
	if (error != 0) {
		pool_put(&pipe_pool, cpipe);
		return (NULL);
	}

	sigio_init(&cpipe->pipe_sigio);

	getnanotime(&cpipe->pipe_ctime);
	cpipe->pipe_atime = cpipe->pipe_ctime;
	cpipe->pipe_mtime = cpipe->pipe_ctime;

	return (cpipe);
}

struct pipe *
pipe_peer(struct pipe *cpipe)
{
	struct pipe *peer;

	rw_assert_anylock(cpipe->pipe_lock);

	peer = cpipe->pipe_peer;
	if (peer == NULL || (peer->pipe_state & PIPE_EOF))
		return (NULL);
	return (peer);
}

/*
 * Lock a pipe for exclusive I/O access.
 */
int
pipe_iolock(struct pipe *cpipe)
{
	int error;

	rw_assert_wrlock(cpipe->pipe_lock);

	while (cpipe->pipe_state & PIPE_LOCK) {
		cpipe->pipe_state |= PIPE_LWANT;
		error = rwsleep_nsec(cpipe, cpipe->pipe_lock, PRIBIO | PCATCH,
		    "pipeiolk", INFSLP);
		if (error)
			return (error);
	}
	cpipe->pipe_state |= PIPE_LOCK;
	return (0);
}

/*
 * Unlock a pipe I/O lock.
 */
void
pipe_iounlock(struct pipe *cpipe)
{
	rw_assert_wrlock(cpipe->pipe_lock);
	KASSERT(cpipe->pipe_state & PIPE_LOCK);

	cpipe->pipe_state &= ~PIPE_LOCK;
	if (cpipe->pipe_state & PIPE_LWANT) {
		cpipe->pipe_state &= ~PIPE_LWANT;
		wakeup(cpipe);
	}
}

/*
 * Unlock the pipe I/O lock and go to sleep. On success, returns 0 with the
 * I/O lock relocked. Otherwise, if a signal was caught, returns non-zero
 * with the I/O lock left unlocked.
 *
 * Any caller must obtain a reference to the pipe by incrementing `pipe_busy'
 * before calling this function, in order to ensure that the pipe is not
 * destroyed while sleeping.
 */
int
pipe_iosleep(struct pipe *cpipe, const char *wmesg)
{
	int error;

	pipe_iounlock(cpipe);
	error = rwsleep_nsec(cpipe, cpipe->pipe_lock, PRIBIO | PCATCH, wmesg,
	    INFSLP);
	if (error)
		return (error);
	return (pipe_iolock(cpipe));
}
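
/*
 * The canonical caller pattern, as followed by pipe_read() and pipe_write()
 * below (a sketch of this file's own convention, not a separate interface):
 *
 *	rw_enter_write(cpipe->pipe_lock);
 *	++cpipe->pipe_busy;		(reference, see the comment above)
 *	error = pipe_iolock(cpipe);
 *	... transfer data, possibly calling pipe_iosleep() ...
 *	pipe_iounlock(cpipe);
 *	--cpipe->pipe_busy;
 *	pipe_rundown(cpipe);		(lets pipe_destroy() proceed)
 *	rw_exit_write(cpipe->pipe_lock);
 */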

void
pipeselwakeup(struct pipe *cpipe)
{
	rw_assert_wrlock(cpipe->pipe_lock);

	if (cpipe->pipe_state & PIPE_SEL) {
		cpipe->pipe_state &= ~PIPE_SEL;
		selwakeup(&cpipe->pipe_sel);
	} else {
		KERNEL_LOCK();
		KNOTE(&cpipe->pipe_sel.si_note, NOTE_SUBMIT);
		KERNEL_UNLOCK();
	}

	if (cpipe->pipe_state & PIPE_ASYNC)
		pgsigio(&cpipe->pipe_sigio, SIGIO, 0);
}

int
pipe_read(struct file *fp, struct uio *uio, int fflags)
{
	struct pipe *rpipe = fp->f_data;
	size_t nread = 0, size;
	int error;

	rw_enter_write(rpipe->pipe_lock);
	++rpipe->pipe_busy;
	error = pipe_iolock(rpipe);
	if (error) {
		--rpipe->pipe_busy;
		pipe_rundown(rpipe);
		rw_exit_write(rpipe->pipe_lock);
		return (error);
	}

	while (uio->uio_resid) {
		/* Normal pipe buffer receive. */
		if (rpipe->pipe_buffer.cnt > 0) {
			size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
			if (size > rpipe->pipe_buffer.cnt)
				size = rpipe->pipe_buffer.cnt;
			if (size > uio->uio_resid)
				size = uio->uio_resid;
			rw_exit_write(rpipe->pipe_lock);
			error = uiomove(&rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
					size, uio);
			rw_enter_write(rpipe->pipe_lock);
			if (error) {
				break;
			}
			rpipe->pipe_buffer.out += size;
			if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
				rpipe->pipe_buffer.out = 0;

			rpipe->pipe_buffer.cnt -= size;
			/*
			 * If there is no more to read in the pipe, reset
			 * its pointers to the beginning.  This improves
			 * cache hit stats.
			 */
			if (rpipe->pipe_buffer.cnt == 0) {
				rpipe->pipe_buffer.in = 0;
				rpipe->pipe_buffer.out = 0;
			}
			nread += size;
		} else {
			/*
			 * detect EOF condition
			 * read returns 0 on EOF, no need to set error
			 */
			if (rpipe->pipe_state & PIPE_EOF)
				break;

			/* If the "write-side" has been blocked, wake it up. */
			if (rpipe->pipe_state & PIPE_WANTW) {
				rpipe->pipe_state &= ~PIPE_WANTW;
				wakeup(rpipe);
			}

			/* Break if some data was read. */
			if (nread > 0)
				break;

			/* Handle non-blocking mode operation. */
			if (fp->f_flag & FNONBLOCK) {
				error = EAGAIN;
				break;
			}

			/* Wait for more data. */
			rpipe->pipe_state |= PIPE_WANTR;
			error = pipe_iosleep(rpipe, "piperd");
			if (error)
				goto unlocked_error;
		}
	}
	pipe_iounlock(rpipe);

	if (error == 0)
		getnanotime(&rpipe->pipe_atime);
unlocked_error:
	--rpipe->pipe_busy;

	if (pipe_rundown(rpipe) == 0 && rpipe->pipe_buffer.cnt < MINPIPESIZE) {
		/* Handle write blocking hysteresis. */
		if (rpipe->pipe_state & PIPE_WANTW) {
			rpipe->pipe_state &= ~PIPE_WANTW;
			wakeup(rpipe);
		}
	}

	if (rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt >= PIPE_BUF)
		pipeselwakeup(rpipe);

	rw_exit_write(rpipe->pipe_lock);
	return (error);
}

int
pipe_write(struct file *fp, struct uio *uio, int fflags)
{
	struct pipe *rpipe = fp->f_data, *wpipe;
	struct rwlock *lock = rpipe->pipe_lock;
	size_t orig_resid;
	int error;

	rw_enter_write(lock);
	wpipe = pipe_peer(rpipe);

	/* Detect loss of pipe read side, issue SIGPIPE if lost. */
	if (wpipe == NULL) {
		rw_exit_write(lock);
		return (EPIPE);
	}

	++wpipe->pipe_busy;
	error = pipe_iolock(wpipe);
	if (error) {
		--wpipe->pipe_busy;
		pipe_rundown(wpipe);
		rw_exit_write(lock);
		return (error);
	}

	/* If it is advantageous to resize the pipe buffer, do so. */
	if (uio->uio_resid > PIPE_SIZE &&
	    wpipe->pipe_buffer.size <= PIPE_SIZE &&
	    wpipe->pipe_buffer.cnt == 0) {
		unsigned int npipe;

		npipe = atomic_inc_int_nv(&nbigpipe);
		if (npipe > LIMITBIGPIPES ||
		    pipe_buffer_realloc(wpipe, BIG_PIPE_SIZE) != 0)
			atomic_dec_int(&nbigpipe);
	}

	orig_resid = uio->uio_resid;

	while (uio->uio_resid) {
		size_t space;

		if (wpipe->pipe_state & PIPE_EOF) {
			error = EPIPE;
			break;
		}

		space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;

		/* Writes of size <= PIPE_BUF must be atomic. */
		if (space < uio->uio_resid && orig_resid <= PIPE_BUF)
			space = 0;

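		/*
		 * Illustration (no extra logic): when space was zeroed
		 * above, i.e. a write of at most PIPE_BUF bytes into a
		 * buffer too full to take it whole, the writer falls
		 * through to the sleep path below instead of splitting
		 * the copy.  E.g. an 8-byte write with only 4 bytes free
		 * sleeps until all 8 fit, so it cannot interleave with
		 * another writer's data.
		 */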
		if (space > 0) {
			size_t size;	/* Transfer size */
			size_t segsize;	/* first segment to transfer */

			/*
			 * Transfer size is minimum of uio transfer
			 * and free space in pipe buffer.
			 */
			if (space > uio->uio_resid)
				size = uio->uio_resid;
			else
				size = space;
			/*
			 * First segment to transfer is minimum of
			 * transfer size and contiguous space in
			 * pipe buffer.  If first segment to transfer
			 * is less than the transfer size, we've got
			 * a wraparound in the buffer.
			 */
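			/*
			 * Worked example: with a 16384-byte buffer,
			 * in = 16000 and size = 1000, segsize becomes
			 * 384; those bytes fill the tail of the buffer
			 * and the remaining 616 wrap around to offset 0
			 * in the second uiomove() below.
			 */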
			segsize = wpipe->pipe_buffer.size -
				wpipe->pipe_buffer.in;
			if (segsize > size)
				segsize = size;

			/* Transfer first segment */

			rw_exit_write(lock);
			error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
					segsize, uio);
			rw_enter_write(lock);

			if (error == 0 && segsize < size) {
				/*
				 * Transfer remaining part now, to
				 * support atomic writes.  Wraparound
				 * happened.
				 */
#ifdef DIAGNOSTIC
				if (wpipe->pipe_buffer.in + segsize !=
				    wpipe->pipe_buffer.size)
					panic("Expected pipe buffer wraparound disappeared");
#endif

				rw_exit_write(lock);
				error = uiomove(&wpipe->pipe_buffer.buffer[0],
						size - segsize, uio);
				rw_enter_write(lock);
			}
			if (error == 0) {
				wpipe->pipe_buffer.in += size;
				if (wpipe->pipe_buffer.in >=
				    wpipe->pipe_buffer.size) {
#ifdef DIAGNOSTIC
					if (wpipe->pipe_buffer.in != size - segsize + wpipe->pipe_buffer.size)
						panic("Expected wraparound bad");
#endif
					wpipe->pipe_buffer.in = size - segsize;
				}

				wpipe->pipe_buffer.cnt += size;
#ifdef DIAGNOSTIC
				if (wpipe->pipe_buffer.cnt > wpipe->pipe_buffer.size)
					panic("Pipe buffer overflow");
#endif
			}
			if (error)
				break;
		} else {
			/* If the "read-side" has been blocked, wake it up. */
			if (wpipe->pipe_state & PIPE_WANTR) {
				wpipe->pipe_state &= ~PIPE_WANTR;
				wakeup(wpipe);
			}

			/* Don't block on non-blocking I/O. */
			if (fp->f_flag & FNONBLOCK) {
				error = EAGAIN;
				break;
			}

			/*
			 * We have no more space and have something to offer,
			 * wake up select/poll.
			 */
			pipeselwakeup(wpipe);

			wpipe->pipe_state |= PIPE_WANTW;
			error = pipe_iosleep(wpipe, "pipewr");
			if (error)
				goto unlocked_error;

			/*
			 * If read side wants to go away, we just issue a
			 * signal to ourselves.
			 */
			if (wpipe->pipe_state & PIPE_EOF) {
				error = EPIPE;
				break;
			}
		}
	}
	pipe_iounlock(wpipe);

unlocked_error:
	--wpipe->pipe_busy;

	if (pipe_rundown(wpipe) == 0 && wpipe->pipe_buffer.cnt > 0) {
		/*
		 * If we have put any characters in the buffer, we wake up
		 * the reader.
		 */
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
	}

	/* Don't return EPIPE if I/O was successful. */
	if (wpipe->pipe_buffer.cnt == 0 &&
	    uio->uio_resid == 0 &&
	    error == EPIPE) {
		error = 0;
	}

	if (error == 0)
		getnanotime(&wpipe->pipe_mtime);
	/* We have something to offer, wake up select/poll. */
	if (wpipe->pipe_buffer.cnt)
		pipeselwakeup(wpipe);

	rw_exit_write(lock);
	return (error);
}

/*
 * we implement a very minimal set of ioctls for compatibility with sockets.
 */
int
pipe_ioctl(struct file *fp, u_long cmd, caddr_t data, struct proc *p)
{
	struct pipe *mpipe = fp->f_data;
	int error = 0;

	switch (cmd) {

	case FIONBIO:
		break;

	case FIOASYNC:
		rw_enter_write(mpipe->pipe_lock);
		if (*(int *)data) {
			mpipe->pipe_state |= PIPE_ASYNC;
		} else {
			mpipe->pipe_state &= ~PIPE_ASYNC;
		}
		rw_exit_write(mpipe->pipe_lock);
		break;

	case FIONREAD:
		rw_enter_read(mpipe->pipe_lock);
		*(int *)data = mpipe->pipe_buffer.cnt;
		rw_exit_read(mpipe->pipe_lock);
		break;

	case FIOSETOWN:
	case SIOCSPGRP:
	case TIOCSPGRP:
		error = sigio_setown(&mpipe->pipe_sigio, cmd, data);
		break;

	case FIOGETOWN:
	case SIOCGPGRP:
	case TIOCGPGRP:
		sigio_getown(&mpipe->pipe_sigio, cmd, data);
		break;

	default:
		error = ENOTTY;
	}

	return (error);
}
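
/*
 * Example (illustrative, userland view): FIONREAD reports the number of
 * bytes currently buffered for reading, i.e. pipe_buffer.cnt above.
 *
 *	#include <sys/ioctl.h>
 *	#include <stdio.h>
 *
 *	int nread;
 *	if (ioctl(fds[0], FIONREAD, &nread) == 0)
 *		printf("%d bytes ready\n", nread);
 */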

int
pipe_poll(struct file *fp, int events, struct proc *p)
{
	struct pipe *rpipe = fp->f_data, *wpipe;
	struct rwlock *lock = rpipe->pipe_lock;
	int revents = 0;

	rw_enter_write(lock);
	wpipe = pipe_peer(rpipe);

	if (events & (POLLIN | POLLRDNORM)) {
		if (rpipe->pipe_buffer.cnt > 0 ||
		    (rpipe->pipe_state & PIPE_EOF))
			revents |= events & (POLLIN | POLLRDNORM);
	}

	/* NOTE: POLLHUP and POLLOUT/POLLWRNORM are mutually exclusive */
	if ((rpipe->pipe_state & PIPE_EOF) || wpipe == NULL)
		revents |= POLLHUP;
	else if (events & (POLLOUT | POLLWRNORM)) {
		if (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt >= PIPE_BUF)
			revents |= events & (POLLOUT | POLLWRNORM);
	}

	if (revents == 0) {
		if (events & (POLLIN | POLLRDNORM)) {
			selrecord(p, &rpipe->pipe_sel);
			rpipe->pipe_state |= PIPE_SEL;
		}
		if (events & (POLLOUT | POLLWRNORM)) {
			selrecord(p, &wpipe->pipe_sel);
			wpipe->pipe_state |= PIPE_SEL;
		}
	}

	rw_exit_write(lock);

	return (revents);
}
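
/*
 * Example (illustrative, userland view): per the checks above, the read
 * end polls readable as soon as any data is buffered (or on EOF), while
 * the write end polls writable only once at least PIPE_BUF bytes are free.
 *
 *	#include <poll.h>
 *	#include <unistd.h>
 *
 *	char buf[512];
 *	ssize_t n;
 *	struct pollfd pfd = { .fd = fds[0], .events = POLLIN };
 *	if (poll(&pfd, 1, INFTIM) > 0 && (pfd.revents & POLLIN))
 *		n = read(fds[0], buf, sizeof(buf));
 */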

int
pipe_stat(struct file *fp, struct stat *ub, struct proc *p)
{
	struct pipe *pipe = fp->f_data;

	memset(ub, 0, sizeof(*ub));

	rw_enter_read(pipe->pipe_lock);
	ub->st_mode = S_IFIFO;
	ub->st_blksize = pipe->pipe_buffer.size;
	ub->st_size = pipe->pipe_buffer.cnt;
	ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize;
	ub->st_atim.tv_sec  = pipe->pipe_atime.tv_sec;
	ub->st_atim.tv_nsec = pipe->pipe_atime.tv_nsec;
	ub->st_mtim.tv_sec  = pipe->pipe_mtime.tv_sec;
	ub->st_mtim.tv_nsec = pipe->pipe_mtime.tv_nsec;
	ub->st_ctim.tv_sec  = pipe->pipe_ctime.tv_sec;
	ub->st_ctim.tv_nsec = pipe->pipe_ctime.tv_nsec;
	ub->st_uid = fp->f_cred->cr_uid;
	ub->st_gid = fp->f_cred->cr_gid;
	rw_exit_read(pipe->pipe_lock);
	/*
	 * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen.
	 * XXX (st_dev, st_ino) should be unique.
	 */
	return (0);
}

int
pipe_close(struct file *fp, struct proc *p)
{
	struct pipe *cpipe = fp->f_data;

	fp->f_ops = NULL;
	fp->f_data = NULL;
	pipe_destroy(cpipe);
	return (0);
}

/*
 * Free kva for pipe circular buffer.
 * No pipe lock check as only called from pipe_buffer_realloc() and
 * pipe_destroy().
 */
void
pipe_buffer_free(struct pipe *cpipe)
{
	u_int size;

	if (cpipe->pipe_buffer.buffer == NULL)
		return;

	size = cpipe->pipe_buffer.size;

	KERNEL_LOCK();
	km_free(cpipe->pipe_buffer.buffer, size, &kv_any, &kp_pageable);
	KERNEL_UNLOCK();

	cpipe->pipe_buffer.buffer = NULL;

	atomic_sub_int(&amountpipekva, size);
	if (size > PIPE_SIZE)
		atomic_dec_int(&nbigpipe);
}

/*
 * Shut down the pipe and free its resources.
 */
void
pipe_destroy(struct pipe *cpipe)
{
	struct pipe *ppipe;
	struct rwlock *lock = NULL;

	if (cpipe == NULL)
		return;

	rw_enter_write(cpipe->pipe_lock);

	pipeselwakeup(cpipe);
	sigio_free(&cpipe->pipe_sigio);

	/*
	 * If the other side is blocked, wake it up saying that
	 * we want to close it down.
	 */
	cpipe->pipe_state |= PIPE_EOF;
	while (cpipe->pipe_busy) {
		wakeup(cpipe);
		cpipe->pipe_state |= PIPE_WANTD;
		rwsleep_nsec(cpipe, cpipe->pipe_lock, PRIBIO, "pipecl", INFSLP);
	}

	/* Disconnect from peer. */
	if ((ppipe = cpipe->pipe_peer) != NULL) {
		pipeselwakeup(ppipe);

		ppipe->pipe_state |= PIPE_EOF;
		wakeup(ppipe);
		ppipe->pipe_peer = NULL;
	} else {
		/*
		 * Peer already gone. This is the last reference to the pipe
		 * lock, so it must be freed below.
		 */
		lock = cpipe->pipe_lock;
	}

	rw_exit_write(cpipe->pipe_lock);

	pipe_buffer_free(cpipe);
	if (lock != NULL)
		pool_put(&pipe_lock_pool, lock);
	pool_put(&pipe_pool, cpipe);
}

/*
 * Returns non-zero if a rundown is currently ongoing.
 */
int
pipe_rundown(struct pipe *cpipe)
{
	rw_assert_wrlock(cpipe->pipe_lock);

	if (cpipe->pipe_busy > 0 || (cpipe->pipe_state & PIPE_WANTD) == 0)
		return (0);

	/* Only wakeup pipe_destroy() once the pipe is no longer busy. */
	cpipe->pipe_state &= ~(PIPE_WANTD | PIPE_WANTR | PIPE_WANTW);
	wakeup(cpipe);
	return (1);
}

int
pipe_kqfilter(struct file *fp, struct knote *kn)
{
	struct pipe *rpipe = kn->kn_fp->f_data, *wpipe;
	struct rwlock *lock = rpipe->pipe_lock;
	int error = 0;

	rw_enter_write(lock);
	wpipe = pipe_peer(rpipe);

	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_fop = &pipe_rfiltops;
		SLIST_INSERT_HEAD(&rpipe->pipe_sel.si_note, kn, kn_selnext);
		break;
	case EVFILT_WRITE:
		if (wpipe == NULL) {
			/* other end of pipe has been closed */
			error = EPIPE;
			break;
		}
		kn->kn_fop = &pipe_wfiltops;
		SLIST_INSERT_HEAD(&wpipe->pipe_sel.si_note, kn, kn_selnext);
		break;
	default:
		error = EINVAL;
	}

	rw_exit_write(lock);

	return (error);
}
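
/*
 * Example (illustrative, userland view): registering for EVFILT_READ on
 * the read end; filt_piperead() below reports the event once data is
 * buffered or the write side is gone.
 *
 *	#include <sys/event.h>
 *
 *	struct kevent ev;
 *	int kq = kqueue();
 *	EV_SET(&ev, fds[0], EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	kevent(kq, &ev, 1, NULL, 0, NULL);
 */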

void
filt_pipedetach(struct knote *kn)
{
	struct pipe *rpipe = kn->kn_fp->f_data, *wpipe;
	struct rwlock *lock = rpipe->pipe_lock;

	rw_enter_write(lock);
	wpipe = pipe_peer(rpipe);

	switch (kn->kn_filter) {
	case EVFILT_READ:
		SLIST_REMOVE(&rpipe->pipe_sel.si_note, kn, knote, kn_selnext);
		break;
	case EVFILT_WRITE:
		if (wpipe == NULL)
			break;
		SLIST_REMOVE(&wpipe->pipe_sel.si_note, kn, knote, kn_selnext);
		break;
	}

	rw_exit_write(lock);
}

int
filt_piperead(struct knote *kn, long hint)
{
	struct pipe *rpipe = kn->kn_fp->f_data, *wpipe;
	struct rwlock *lock = rpipe->pipe_lock;

	if ((hint & NOTE_SUBMIT) == 0)
		rw_enter_read(lock);
	wpipe = pipe_peer(rpipe);

	kn->kn_data = rpipe->pipe_buffer.cnt;

	if ((rpipe->pipe_state & PIPE_EOF) || wpipe == NULL) {
		if ((hint & NOTE_SUBMIT) == 0)
			rw_exit_read(lock);
		kn->kn_flags |= EV_EOF;
		return (1);
	}

	if ((hint & NOTE_SUBMIT) == 0)
		rw_exit_read(lock);

	return (kn->kn_data > 0);
}

int
filt_pipewrite(struct knote *kn, long hint)
{
	struct pipe *rpipe = kn->kn_fp->f_data, *wpipe;
	struct rwlock *lock = rpipe->pipe_lock;

	if ((hint & NOTE_SUBMIT) == 0)
		rw_enter_read(lock);
	wpipe = pipe_peer(rpipe);

	if (wpipe == NULL) {
		if ((hint & NOTE_SUBMIT) == 0)
			rw_exit_read(lock);
		kn->kn_data = 0;
		kn->kn_flags |= EV_EOF;
		return (1);
	}
	kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;

	if ((hint & NOTE_SUBMIT) == 0)
		rw_exit_read(lock);

	return (kn->kn_data >= PIPE_BUF);
}

void
pipe_init(void)
{
	pool_init(&pipe_pool, sizeof(struct pipe), 0, IPL_MPFLOOR, PR_WAITOK,
	    "pipepl", NULL);
	pool_init(&pipe_lock_pool, sizeof(struct rwlock), 0, IPL_MPFLOOR,
	    PR_WAITOK, "pipelkpl", NULL);
}