/*	$OpenBSD: sys_pipe.c,v 1.126 2020/12/30 17:02:32 visa Exp $	*/

/*
 * Copyright (c) 1996 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 * 4. Modifications may be freely made to this file if the above conditions
 *    are met.
 */

/*
 * This file contains a high-performance replacement for the socket-based
 * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
 * all features of sockets, but does do everything that pipes normally
 * do.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/pool.h>
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <sys/signalvar.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <sys/event.h>
#include <sys/lock.h>
#include <sys/poll.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#include <uvm/uvm_extern.h>

#include <sys/pipe.h>

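/*
 * Both ends of a pipe are allocated together as a single pipe pair,
 * sharing one rwlock that serializes access to both ends.
 */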
struct pipe_pair {
	struct pipe pp_wpipe;
	struct pipe pp_rpipe;
	struct rwlock pp_lock;
};

/*
 * interfaces to the outside world
 */
int	pipe_read(struct file *, struct uio *, int);
int	pipe_write(struct file *, struct uio *, int);
int	pipe_close(struct file *, struct proc *);
int	pipe_poll(struct file *, int events, struct proc *);
int	pipe_kqfilter(struct file *fp, struct knote *kn);
int	pipe_ioctl(struct file *, u_long, caddr_t, struct proc *);
int	pipe_stat(struct file *fp, struct stat *ub, struct proc *p);

static const struct fileops pipeops = {
	.fo_read	= pipe_read,
	.fo_write	= pipe_write,
	.fo_ioctl	= pipe_ioctl,
	.fo_poll	= pipe_poll,
	.fo_kqfilter	= pipe_kqfilter,
	.fo_stat	= pipe_stat,
	.fo_close	= pipe_close
};

void	filt_pipedetach(struct knote *kn);
int	filt_piperead(struct knote *kn, long hint);
int	filt_pipewrite(struct knote *kn, long hint);

const struct filterops pipe_rfiltops = {
	.f_flags	= FILTEROP_ISFD,
	.f_attach	= NULL,
	.f_detach	= filt_pipedetach,
	.f_event	= filt_piperead,
};

const struct filterops pipe_wfiltops = {
	.f_flags	= FILTEROP_ISFD,
	.f_attach	= NULL,
	.f_detach	= filt_pipedetach,
	.f_event	= filt_pipewrite,
};

/*
 * Default pipe buffer size(s); this can be kind-of large now because pipe
 * space is pageable.  The pipe code will try to maintain locality of
 * reference for performance reasons, so small amounts of outstanding I/O
 * will not wipe the cache.
 */
#define MINPIPESIZE (PIPE_SIZE/3)

/*
 * Limit the number of "big" pipes
 */
#define LIMITBIGPIPES	32
unsigned int nbigpipe;
static unsigned int amountpipekva;

struct pool pipe_pair_pool;

int	dopipe(struct proc *, int *, int);
void	pipeselwakeup(struct pipe *);

int	pipe_create(struct pipe *);
void	pipe_destroy(struct pipe *);
int	pipe_rundown(struct pipe *);
struct pipe *pipe_peer(struct pipe *);
int	pipe_buffer_realloc(struct pipe *, u_int);
void	pipe_buffer_free(struct pipe *);

int	pipe_iolock(struct pipe *);
void	pipe_iounlock(struct pipe *);
int	pipe_iosleep(struct pipe *, const char *);

struct pipe_pair *pipe_pair_create(void);
void	pipe_pair_destroy(struct pipe_pair *);

/*
 * The pipe system call for the DTYPE_PIPE type of pipes
 */

int
sys_pipe(struct proc *p, void *v, register_t *retval)
{
	struct sys_pipe_args /* {
		syscallarg(int *) fdp;
	} */ *uap = v;

	return (dopipe(p, SCARG(uap, fdp), 0));
}

int
sys_pipe2(struct proc *p, void *v, register_t *retval)
{
	struct sys_pipe2_args /* {
		syscallarg(int *) fdp;
		syscallarg(int) flags;
	} */ *uap = v;

	if (SCARG(uap, flags) & ~(O_CLOEXEC | FNONBLOCK))
		return (EINVAL);

	return (dopipe(p, SCARG(uap, fdp), SCARG(uap, flags)));
}

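/*
 * Common backend for pipe(2) and pipe2(2): allocate a pipe pair, wrap
 * each end in a file, install both in the file descriptor table and
 * copy the two descriptors out to userland.
 */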
int
dopipe(struct proc *p, int *ufds, int flags)
{
	struct filedesc *fdp = p->p_fd;
	struct file *rf, *wf;
	struct pipe_pair *pp;
	struct pipe *rpipe, *wpipe = NULL;
	int fds[2], cloexec, error;

	cloexec = (flags & O_CLOEXEC) ? UF_EXCLOSE : 0;

	pp = pipe_pair_create();
	if (pp == NULL)
		return (ENOMEM);
	wpipe = &pp->pp_wpipe;
	rpipe = &pp->pp_rpipe;

	fdplock(fdp);

	error = falloc(p, &rf, &fds[0]);
	if (error != 0)
		goto free2;
	rf->f_flag = FREAD | FWRITE | (flags & FNONBLOCK);
	rf->f_type = DTYPE_PIPE;
	rf->f_data = rpipe;
	rf->f_ops = &pipeops;

	error = falloc(p, &wf, &fds[1]);
	if (error != 0)
		goto free3;
	wf->f_flag = FREAD | FWRITE | (flags & FNONBLOCK);
	wf->f_type = DTYPE_PIPE;
	wf->f_data = wpipe;
	wf->f_ops = &pipeops;

	fdinsert(fdp, fds[0], cloexec, rf);
	fdinsert(fdp, fds[1], cloexec, wf);

	error = copyout(fds, ufds, sizeof(fds));
	if (error == 0) {
		fdpunlock(fdp);
#ifdef KTRACE
		if (KTRPOINT(p, KTR_STRUCT))
			ktrfds(p, fds, 2);
#endif
	} else {
		/* fdrelease() unlocks fdp. */
		fdrelease(p, fds[0]);
		fdplock(fdp);
		fdrelease(p, fds[1]);
	}

	FRELE(rf, p);
	FRELE(wf, p);
	return (error);

free3:
	fdremove(fdp, fds[0]);
	closef(rf, p);
	rpipe = NULL;
free2:
	fdpunlock(fdp);
	pipe_destroy(wpipe);
	pipe_destroy(rpipe);
	return (error);
}

/*
 * Allocate kva for the pipe circular buffer; the space is pageable.
 * This routine will 'realloc' the size of a pipe buffer safely: if the
 * allocation fails, the old buffer is retained and ENOMEM is returned.
 */
int
pipe_buffer_realloc(struct pipe *cpipe, u_int size)
{
	caddr_t buffer;

	/* buffer uninitialized or pipe locked */
	KASSERT((cpipe->pipe_buffer.buffer == NULL) ||
	    (cpipe->pipe_state & PIPE_LOCK));

	/* buffer should be empty */
	KASSERT(cpipe->pipe_buffer.cnt == 0);

	KERNEL_LOCK();
	buffer = km_alloc(size, &kv_any, &kp_pageable, &kd_waitok);
	KERNEL_UNLOCK();
	if (buffer == NULL)
		return (ENOMEM);

	/* free old resources if we are resizing */
	pipe_buffer_free(cpipe);

	cpipe->pipe_buffer.buffer = buffer;
	cpipe->pipe_buffer.size = size;
	cpipe->pipe_buffer.in = 0;
	cpipe->pipe_buffer.out = 0;

	atomic_add_int(&amountpipekva, cpipe->pipe_buffer.size);

	return (0);
}

/*
 * initialize and allocate VM and memory for pipe
 */
int
pipe_create(struct pipe *cpipe)
{
	int error;

	error = pipe_buffer_realloc(cpipe, PIPE_SIZE);
	if (error != 0)
		return (error);

	sigio_init(&cpipe->pipe_sigio);

	getnanotime(&cpipe->pipe_ctime);
	cpipe->pipe_atime = cpipe->pipe_ctime;
	cpipe->pipe_mtime = cpipe->pipe_ctime;

	return (0);
}

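/*
 * Return the peer of this pipe end, or NULL if the peer is missing or
 * already closing down.  The pipe lock must be held.
 */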
struct pipe *
pipe_peer(struct pipe *cpipe)
{
	struct pipe *peer;

	rw_assert_anylock(cpipe->pipe_lock);

	peer = cpipe->pipe_peer;
	if (peer == NULL || (peer->pipe_state & PIPE_EOF))
		return (NULL);
	return (peer);
}

/*
 * Lock a pipe for exclusive I/O access.
 */
int
pipe_iolock(struct pipe *cpipe)
{
	int error;

	rw_assert_wrlock(cpipe->pipe_lock);

	while (cpipe->pipe_state & PIPE_LOCK) {
		cpipe->pipe_state |= PIPE_LWANT;
		error = rwsleep_nsec(cpipe, cpipe->pipe_lock, PRIBIO | PCATCH,
		    "pipeiolk", INFSLP);
		if (error)
			return (error);
	}
	cpipe->pipe_state |= PIPE_LOCK;
	return (0);
}

/*
 * Unlock a pipe I/O lock.
 */
void
pipe_iounlock(struct pipe *cpipe)
{
	rw_assert_wrlock(cpipe->pipe_lock);
	KASSERT(cpipe->pipe_state & PIPE_LOCK);

	cpipe->pipe_state &= ~PIPE_LOCK;
	if (cpipe->pipe_state & PIPE_LWANT) {
		cpipe->pipe_state &= ~PIPE_LWANT;
		wakeup(cpipe);
	}
}

/*
 * Unlock the pipe I/O lock and go to sleep.  Returns 0 on success, with the
 * I/O lock reacquired.  If a signal was caught, returns non-zero, with the
 * I/O lock released.
 *
 * Any caller must obtain a reference to the pipe by incrementing `pipe_busy'
 * before calling this function in order to ensure that the same pipe is not
 * destroyed while sleeping.
 */
int
pipe_iosleep(struct pipe *cpipe, const char *wmesg)
{
	int error;

	pipe_iounlock(cpipe);
	error = rwsleep_nsec(cpipe, cpipe->pipe_lock, PRIBIO | PCATCH, wmesg,
	    INFSLP);
	if (error)
		return (error);
	return (pipe_iolock(cpipe));
}

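/*
 * Notify select/poll waiters and kqueue filters attached to this pipe
 * end, and post SIGIO if async I/O has been requested.
 */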
void
pipeselwakeup(struct pipe *cpipe)
{
	rw_assert_wrlock(cpipe->pipe_lock);

	if (cpipe->pipe_state & PIPE_SEL) {
		cpipe->pipe_state &= ~PIPE_SEL;
		selwakeup(&cpipe->pipe_sel);
	} else {
		KERNEL_LOCK();
		KNOTE(&cpipe->pipe_sel.si_note, NOTE_SUBMIT);
		KERNEL_UNLOCK();
	}

	if (cpipe->pipe_state & PIPE_ASYNC)
		pgsigio(&cpipe->pipe_sigio, SIGIO, 0);
}

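/*
 * Copy data from the pipe buffer to userland.  The pipe lock is dropped
 * around uiomove() since the copy may fault; the I/O lock keeps other
 * readers and writers away from the buffer in the meantime.
 */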
int
pipe_read(struct file *fp, struct uio *uio, int fflags)
{
	struct pipe *rpipe = fp->f_data;
	size_t nread = 0, size;
	int error;

	rw_enter_write(rpipe->pipe_lock);
	++rpipe->pipe_busy;
	error = pipe_iolock(rpipe);
	if (error) {
		--rpipe->pipe_busy;
		pipe_rundown(rpipe);
		rw_exit_write(rpipe->pipe_lock);
		return (error);
	}

	while (uio->uio_resid) {
		/* Normal pipe buffer receive. */
		if (rpipe->pipe_buffer.cnt > 0) {
			size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
			if (size > rpipe->pipe_buffer.cnt)
				size = rpipe->pipe_buffer.cnt;
			if (size > uio->uio_resid)
				size = uio->uio_resid;
			rw_exit_write(rpipe->pipe_lock);
			error = uiomove(&rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
					size, uio);
			rw_enter_write(rpipe->pipe_lock);
			if (error) {
				break;
			}
			rpipe->pipe_buffer.out += size;
			if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
				rpipe->pipe_buffer.out = 0;

			rpipe->pipe_buffer.cnt -= size;
			/*
			 * If there is no more to read in the pipe, reset
			 * its pointers to the beginning.  This improves
			 * cache hit stats.
			 */
			if (rpipe->pipe_buffer.cnt == 0) {
				rpipe->pipe_buffer.in = 0;
				rpipe->pipe_buffer.out = 0;
			}
			nread += size;
		} else {
			/*
			 * detect EOF condition
			 * read returns 0 on EOF, no need to set error
			 */
			if (rpipe->pipe_state & PIPE_EOF)
				break;

			/* If the "write-side" has been blocked, wake it up. */
			if (rpipe->pipe_state & PIPE_WANTW) {
				rpipe->pipe_state &= ~PIPE_WANTW;
				wakeup(rpipe);
			}

			/* Break if some data was read. */
			if (nread > 0)
				break;

			/* Handle non-blocking mode operation. */
			if (fp->f_flag & FNONBLOCK) {
				error = EAGAIN;
				break;
			}

			/* Wait for more data. */
			rpipe->pipe_state |= PIPE_WANTR;
			error = pipe_iosleep(rpipe, "piperd");
			if (error)
				goto unlocked_error;
		}
	}
	pipe_iounlock(rpipe);

	if (error == 0)
		getnanotime(&rpipe->pipe_atime);
unlocked_error:
	--rpipe->pipe_busy;

	if (pipe_rundown(rpipe) == 0 && rpipe->pipe_buffer.cnt < MINPIPESIZE) {
		/* Handle write blocking hysteresis. */
		if (rpipe->pipe_state & PIPE_WANTW) {
			rpipe->pipe_state &= ~PIPE_WANTW;
			wakeup(rpipe);
		}
	}

	if (rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt >= PIPE_BUF)
		pipeselwakeup(rpipe);

	rw_exit_write(rpipe->pipe_lock);
	return (error);
}

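/*
 * Copy data from userland into the pipe buffer.  Writes of PIPE_BUF
 * bytes or less are atomic with respect to other writers.
 */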
int
pipe_write(struct file *fp, struct uio *uio, int fflags)
{
	struct pipe *rpipe = fp->f_data, *wpipe;
	struct rwlock *lock = rpipe->pipe_lock;
	size_t orig_resid;
	int error;

	rw_enter_write(lock);
	wpipe = pipe_peer(rpipe);

	/* Detect loss of pipe read side; EPIPE makes the caller raise SIGPIPE. */
	if (wpipe == NULL) {
		rw_exit_write(lock);
		return (EPIPE);
	}

	++wpipe->pipe_busy;
	error = pipe_iolock(wpipe);
	if (error) {
		--wpipe->pipe_busy;
		pipe_rundown(wpipe);
		rw_exit_write(lock);
		return (error);
	}

	/* If it is advantageous to resize the pipe buffer, do so. */
	if (uio->uio_resid > PIPE_SIZE &&
	    wpipe->pipe_buffer.size <= PIPE_SIZE &&
	    wpipe->pipe_buffer.cnt == 0) {
		unsigned int npipe;

		npipe = atomic_inc_int_nv(&nbigpipe);
		if (npipe > LIMITBIGPIPES ||
		    pipe_buffer_realloc(wpipe, BIG_PIPE_SIZE) != 0)
			atomic_dec_int(&nbigpipe);
	}

	orig_resid = uio->uio_resid;

	while (uio->uio_resid) {
		size_t space;

		if (wpipe->pipe_state & PIPE_EOF) {
			error = EPIPE;
			break;
		}

		space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;

		/* Writes of size <= PIPE_BUF must be atomic. */
		if (space < uio->uio_resid && orig_resid <= PIPE_BUF)
			space = 0;

		if (space > 0) {
			size_t size;	/* Transfer size */
			size_t segsize;	/* first segment to transfer */

			/*
			 * Transfer size is minimum of uio transfer
			 * and free space in pipe buffer.
			 */
			if (space > uio->uio_resid)
				size = uio->uio_resid;
			else
				size = space;
			/*
			 * First segment to transfer is minimum of
			 * transfer size and contiguous space in
			 * pipe buffer.  If first segment to transfer
			 * is less than the transfer size, we've got
			 * a wraparound in the buffer.
			 */
			segsize = wpipe->pipe_buffer.size -
				wpipe->pipe_buffer.in;
			if (segsize > size)
				segsize = size;

			/* Transfer first segment */

			rw_exit_write(lock);
			error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
					segsize, uio);
			rw_enter_write(lock);

			if (error == 0 && segsize < size) {
				/*
				 * Transfer remaining part now, to
				 * support atomic writes.  Wraparound
				 * happened.
				 */
#ifdef DIAGNOSTIC
				if (wpipe->pipe_buffer.in + segsize !=
				    wpipe->pipe_buffer.size)
					panic("Expected pipe buffer wraparound disappeared");
#endif

				rw_exit_write(lock);
				error = uiomove(&wpipe->pipe_buffer.buffer[0],
						size - segsize, uio);
				rw_enter_write(lock);
			}
			if (error == 0) {
				wpipe->pipe_buffer.in += size;
				if (wpipe->pipe_buffer.in >=
				    wpipe->pipe_buffer.size) {
#ifdef DIAGNOSTIC
					if (wpipe->pipe_buffer.in !=
					    size - segsize +
					    wpipe->pipe_buffer.size)
						panic("Expected wraparound bad");
#endif
					wpipe->pipe_buffer.in = size - segsize;
				}

				wpipe->pipe_buffer.cnt += size;
#ifdef DIAGNOSTIC
				if (wpipe->pipe_buffer.cnt >
				    wpipe->pipe_buffer.size)
					panic("Pipe buffer overflow");
#endif
			}
			if (error)
				break;
		} else {
			/* If the "read-side" has been blocked, wake it up. */
			if (wpipe->pipe_state & PIPE_WANTR) {
				wpipe->pipe_state &= ~PIPE_WANTR;
				wakeup(wpipe);
			}

			/* Don't block on non-blocking I/O. */
			if (fp->f_flag & FNONBLOCK) {
				error = EAGAIN;
				break;
			}

			/*
			 * We have no more space and have something to offer,
			 * wake up select/poll.
			 */
			pipeselwakeup(wpipe);

			wpipe->pipe_state |= PIPE_WANTW;
			error = pipe_iosleep(wpipe, "pipewr");
			if (error)
				goto unlocked_error;

			/*
			 * If read side wants to go away, we just issue a
			 * signal to ourselves.
			 */
			if (wpipe->pipe_state & PIPE_EOF) {
				error = EPIPE;
				break;
			}
		}
	}
	pipe_iounlock(wpipe);

unlocked_error:
	--wpipe->pipe_busy;

	if (pipe_rundown(wpipe) == 0 && wpipe->pipe_buffer.cnt > 0) {
		/*
		 * If we have put any characters in the buffer, we wake up
		 * the reader.
		 */
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
	}

	/* Don't return EPIPE if I/O was successful. */
	if (wpipe->pipe_buffer.cnt == 0 &&
	    uio->uio_resid == 0 &&
	    error == EPIPE) {
		error = 0;
	}

	if (error == 0)
		getnanotime(&wpipe->pipe_mtime);
	/* We have something to offer, wake up select/poll. */
	if (wpipe->pipe_buffer.cnt)
		pipeselwakeup(wpipe);

	rw_exit_write(lock);
	return (error);
}

/*
 * we implement a very minimal set of ioctls for compatibility with sockets.
 */
int
pipe_ioctl(struct file *fp, u_long cmd, caddr_t data, struct proc *p)
{
	struct pipe *mpipe = fp->f_data;
	int error = 0;

	switch (cmd) {

	case FIONBIO:
		break;

	case FIOASYNC:
		rw_enter_write(mpipe->pipe_lock);
		if (*(int *)data) {
			mpipe->pipe_state |= PIPE_ASYNC;
		} else {
			mpipe->pipe_state &= ~PIPE_ASYNC;
		}
		rw_exit_write(mpipe->pipe_lock);
		break;

	case FIONREAD:
		rw_enter_read(mpipe->pipe_lock);
		*(int *)data = mpipe->pipe_buffer.cnt;
		rw_exit_read(mpipe->pipe_lock);
		break;

	case FIOSETOWN:
	case SIOCSPGRP:
	case TIOCSPGRP:
		error = sigio_setown(&mpipe->pipe_sigio, cmd, data);
		break;

	case FIOGETOWN:
	case SIOCGPGRP:
	case TIOCGPGRP:
		sigio_getown(&mpipe->pipe_sigio, cmd, data);
		break;

	default:
		error = ENOTTY;
	}

	return (error);
}

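/*
 * poll(2) backend: report readability of this end and writability of the
 * peer, and record the selector if no requested event is pending yet.
 */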
int
pipe_poll(struct file *fp, int events, struct proc *p)
{
	struct pipe *rpipe = fp->f_data, *wpipe;
	struct rwlock *lock = rpipe->pipe_lock;
	int revents = 0;

	rw_enter_write(lock);
	wpipe = pipe_peer(rpipe);

	if (events & (POLLIN | POLLRDNORM)) {
		if (rpipe->pipe_buffer.cnt > 0 ||
		    (rpipe->pipe_state & PIPE_EOF))
			revents |= events & (POLLIN | POLLRDNORM);
	}

	/* NOTE: POLLHUP and POLLOUT/POLLWRNORM are mutually exclusive */
	if ((rpipe->pipe_state & PIPE_EOF) || wpipe == NULL)
		revents |= POLLHUP;
	else if (events & (POLLOUT | POLLWRNORM)) {
		if (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt >= PIPE_BUF)
			revents |= events & (POLLOUT | POLLWRNORM);
	}

	if (revents == 0) {
		if (events & (POLLIN | POLLRDNORM)) {
			selrecord(p, &rpipe->pipe_sel);
			rpipe->pipe_state |= PIPE_SEL;
		}
		if (events & (POLLOUT | POLLWRNORM)) {
			selrecord(p, &wpipe->pipe_sel);
			wpipe->pipe_state |= PIPE_SEL;
		}
	}

	rw_exit_write(lock);

	return (revents);
}

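/*
 * Fill in a stat buffer for the pipe.  Pipes stat as FIFOs, with the
 * buffer size as block size and the queued byte count as file size.
 */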
int
pipe_stat(struct file *fp, struct stat *ub, struct proc *p)
{
	struct pipe *pipe = fp->f_data;

	memset(ub, 0, sizeof(*ub));

	rw_enter_read(pipe->pipe_lock);
	ub->st_mode = S_IFIFO;
	ub->st_blksize = pipe->pipe_buffer.size;
	ub->st_size = pipe->pipe_buffer.cnt;
	ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize;
	ub->st_atim.tv_sec  = pipe->pipe_atime.tv_sec;
	ub->st_atim.tv_nsec = pipe->pipe_atime.tv_nsec;
	ub->st_mtim.tv_sec  = pipe->pipe_mtime.tv_sec;
	ub->st_mtim.tv_nsec = pipe->pipe_mtime.tv_nsec;
	ub->st_ctim.tv_sec  = pipe->pipe_ctime.tv_sec;
	ub->st_ctim.tv_nsec = pipe->pipe_ctime.tv_nsec;
	ub->st_uid = fp->f_cred->cr_uid;
	ub->st_gid = fp->f_cred->cr_gid;
	rw_exit_read(pipe->pipe_lock);
	/*
	 * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen.
	 * XXX (st_dev, st_ino) should be unique.
	 */
	return (0);
}

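/*
 * Close one end of the pipe; the actual teardown happens in
 * pipe_destroy().
 */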
int
pipe_close(struct file *fp, struct proc *p)
{
	struct pipe *cpipe = fp->f_data;

	fp->f_ops = NULL;
	fp->f_data = NULL;
	pipe_destroy(cpipe);
	return (0);
}

/*
 * Free kva for pipe circular buffer.
 * No pipe lock check as only called from pipe_buffer_realloc() and
 * pipe_destroy().
 */
void
pipe_buffer_free(struct pipe *cpipe)
{
	u_int size;

	if (cpipe->pipe_buffer.buffer == NULL)
		return;

	size = cpipe->pipe_buffer.size;

	KERNEL_LOCK();
	km_free(cpipe->pipe_buffer.buffer, size, &kv_any, &kp_pageable);
	KERNEL_UNLOCK();

	cpipe->pipe_buffer.buffer = NULL;

	atomic_sub_int(&amountpipekva, size);
	if (size > PIPE_SIZE)
		atomic_dec_int(&nbigpipe);
}

/*
 * shut down the pipe and free its resources.
 */
void
pipe_destroy(struct pipe *cpipe)
{
	struct pipe *ppipe;

	if (cpipe == NULL)
		return;

	rw_enter_write(cpipe->pipe_lock);

	pipeselwakeup(cpipe);
	sigio_free(&cpipe->pipe_sigio);

	/*
	 * If the other side is blocked, wake it up saying that
	 * we want to close it down.
	 */
	cpipe->pipe_state |= PIPE_EOF;
	while (cpipe->pipe_busy) {
		wakeup(cpipe);
		cpipe->pipe_state |= PIPE_WANTD;
		rwsleep_nsec(cpipe, cpipe->pipe_lock, PRIBIO, "pipecl", INFSLP);
	}

	/* Disconnect from peer. */
	if ((ppipe = cpipe->pipe_peer) != NULL) {
		pipeselwakeup(ppipe);

		ppipe->pipe_state |= PIPE_EOF;
		wakeup(ppipe);
		ppipe->pipe_peer = NULL;
	}

	pipe_buffer_free(cpipe);

	rw_exit_write(cpipe->pipe_lock);

	if (ppipe == NULL)
		pipe_pair_destroy(cpipe->pipe_pair);
}

/*
 * Returns non-zero if a rundown is currently ongoing.
 */
int
pipe_rundown(struct pipe *cpipe)
{
	rw_assert_wrlock(cpipe->pipe_lock);

	if (cpipe->pipe_busy > 0 || (cpipe->pipe_state & PIPE_WANTD) == 0)
		return (0);

	/* Only wakeup pipe_destroy() once the pipe is no longer busy. */
	cpipe->pipe_state &= ~(PIPE_WANTD | PIPE_WANTR | PIPE_WANTW);
	wakeup(cpipe);
	return (1);
}

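/*
 * Attach a kqueue read or write filter to the appropriate pipe end.
 */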
int
pipe_kqfilter(struct file *fp, struct knote *kn)
{
	struct pipe *rpipe = kn->kn_fp->f_data, *wpipe;
	struct rwlock *lock = rpipe->pipe_lock;
	int error = 0;

	rw_enter_write(lock);
	wpipe = pipe_peer(rpipe);

	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_fop = &pipe_rfiltops;
		kn->kn_hook = rpipe;
		klist_insert_locked(&rpipe->pipe_sel.si_note, kn);
		break;
	case EVFILT_WRITE:
		if (wpipe == NULL) {
			/* other end of pipe has been closed */
			error = EPIPE;
			break;
		}
		kn->kn_fop = &pipe_wfiltops;
		kn->kn_hook = wpipe;
		klist_insert_locked(&wpipe->pipe_sel.si_note, kn);
		break;
	default:
		error = EINVAL;
	}

	rw_exit_write(lock);

	return (error);
}

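/*
 * Detach a knote from the pipe end it was attached to.
 */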
void
filt_pipedetach(struct knote *kn)
{
	struct pipe *cpipe = kn->kn_hook;

	klist_remove(&cpipe->pipe_sel.si_note, kn);
}

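/*
 * Read filter: ready when data is buffered, EOF once the write side is
 * gone.  A NOTE_SUBMIT hint means pipeselwakeup() already holds the
 * pipe lock.
 */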
int
filt_piperead(struct knote *kn, long hint)
{
	struct pipe *rpipe = kn->kn_fp->f_data, *wpipe;
	struct rwlock *lock = rpipe->pipe_lock;

	if ((hint & NOTE_SUBMIT) == 0)
		rw_enter_read(lock);
	wpipe = pipe_peer(rpipe);

	kn->kn_data = rpipe->pipe_buffer.cnt;

	if ((rpipe->pipe_state & PIPE_EOF) || wpipe == NULL) {
		if ((hint & NOTE_SUBMIT) == 0)
			rw_exit_read(lock);
		kn->kn_flags |= EV_EOF;
		if (kn->kn_flags & __EV_POLL)
			kn->kn_flags |= __EV_HUP;
		return (1);
	}

	if ((hint & NOTE_SUBMIT) == 0)
		rw_exit_read(lock);

	return (kn->kn_data > 0);
}

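/*
 * Write filter: ready when at least PIPE_BUF bytes of buffer space are
 * free, EOF once the read side is gone.
 */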
int
filt_pipewrite(struct knote *kn, long hint)
{
	struct pipe *rpipe = kn->kn_fp->f_data, *wpipe;
	struct rwlock *lock = rpipe->pipe_lock;

	if ((hint & NOTE_SUBMIT) == 0)
		rw_enter_read(lock);
	wpipe = pipe_peer(rpipe);

	if (wpipe == NULL) {
		if ((hint & NOTE_SUBMIT) == 0)
			rw_exit_read(lock);
		kn->kn_data = 0;
		kn->kn_flags |= EV_EOF;
		if (kn->kn_flags & __EV_POLL)
			kn->kn_flags |= __EV_HUP;
		return (1);
	}
	kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;

	if ((hint & NOTE_SUBMIT) == 0)
		rw_exit_read(lock);

	return (kn->kn_data >= PIPE_BUF);
}

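/*
 * Set up the pool backing pipe pair allocations.
 */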
void
pipe_init(void)
{
	pool_init(&pipe_pair_pool, sizeof(struct pipe_pair), 0, IPL_MPFLOOR,
	    PR_WAITOK, "pipepl", NULL);
}

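/*
 * Allocate and initialize both ends of a pipe, linking the peers
 * together and pointing both at the shared pair lock.
 */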
struct pipe_pair *
pipe_pair_create(void)
{
	struct pipe_pair *pp;

	pp = pool_get(&pipe_pair_pool, PR_WAITOK | PR_ZERO);
	pp->pp_wpipe.pipe_pair = pp;
	pp->pp_rpipe.pipe_pair = pp;
	pp->pp_wpipe.pipe_peer = &pp->pp_rpipe;
	pp->pp_rpipe.pipe_peer = &pp->pp_wpipe;
	/*
	 * One lock is used per pipe pair in order to obtain exclusive access to
	 * the pipe pair.
	 */
	rw_init(&pp->pp_lock, "pipelk");
	pp->pp_wpipe.pipe_lock = &pp->pp_lock;
	pp->pp_rpipe.pipe_lock = &pp->pp_lock;

	klist_init_rwlock(&pp->pp_wpipe.pipe_sel.si_note, &pp->pp_lock);
	klist_init_rwlock(&pp->pp_rpipe.pipe_sel.si_note, &pp->pp_lock);

	if (pipe_create(&pp->pp_wpipe) || pipe_create(&pp->pp_rpipe))
		goto err;
	return (pp);
err:
	pipe_destroy(&pp->pp_wpipe);
	pipe_destroy(&pp->pp_rpipe);
	return (NULL);
}

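/*
 * Release the pair once both ends have been shut down.
 */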
void
pipe_pair_destroy(struct pipe_pair *pp)
{
	klist_free(&pp->pp_wpipe.pipe_sel.si_note);
	klist_free(&pp->pp_rpipe.pipe_sel.si_note);
	pool_put(&pipe_pair_pool, pp);
}