/*	$OpenBSD: sys_pipe.c,v 1.123 2020/06/29 18:23:18 anton Exp $	*/

/*
 * Copyright (c) 1996 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 * 4. Modifications may be freely made to this file if the above conditions
 *    are met.
 */

/*
 * This file contains a high-performance replacement for the socket-based
 * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
 * all features of sockets, but does do everything that pipes normally
 * do.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/pool.h>
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <sys/signalvar.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <sys/event.h>
#include <sys/lock.h>
#include <sys/poll.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#include <uvm/uvm_extern.h>

#include <sys/pipe.h>

struct pipe_pair {
	struct pipe pp_wpipe;
	struct pipe pp_rpipe;
	struct rwlock pp_lock;
};

/*
 * interfaces to the outside world
 */
int	pipe_read(struct file *, struct uio *, int);
int	pipe_write(struct file *, struct uio *, int);
int	pipe_close(struct file *, struct proc *);
int	pipe_poll(struct file *, int events, struct proc *);
int	pipe_kqfilter(struct file *fp, struct knote *kn);
int	pipe_ioctl(struct file *, u_long, caddr_t, struct proc *);
int	pipe_stat(struct file *fp, struct stat *ub, struct proc *p);

static const struct fileops pipeops = {
	.fo_read	= pipe_read,
	.fo_write	= pipe_write,
	.fo_ioctl	= pipe_ioctl,
	.fo_poll	= pipe_poll,
	.fo_kqfilter	= pipe_kqfilter,
	.fo_stat	= pipe_stat,
	.fo_close	= pipe_close
};

void	filt_pipedetach(struct knote *kn);
int	filt_piperead(struct knote *kn, long hint);
int	filt_pipewrite(struct knote *kn, long hint);

const struct filterops pipe_rfiltops = {
	.f_flags	= FILTEROP_ISFD,
	.f_attach	= NULL,
	.f_detach	= filt_pipedetach,
	.f_event	= filt_piperead,
};

const struct filterops pipe_wfiltops = {
	.f_flags	= FILTEROP_ISFD,
	.f_attach	= NULL,
	.f_detach	= filt_pipedetach,
	.f_event	= filt_pipewrite,
};

/*
 * Default pipe buffer size(s); these can be fairly large now because
 * pipe space is pageable.  The pipe code will try to maintain locality
 * of reference for performance reasons, so small amounts of outstanding
 * I/O will not wipe the cache.
 */
#define MINPIPESIZE (PIPE_SIZE/3)

/*
 * Limit the number of "big" pipes
 */
#define LIMITBIGPIPES	32
unsigned int nbigpipe;
static unsigned int amountpipekva;

struct pool pipe_pair_pool;

int	dopipe(struct proc *, int *, int);
void	pipeselwakeup(struct pipe *);

int	pipe_create(struct pipe *);
void	pipe_destroy(struct pipe *);
int	pipe_rundown(struct pipe *);
struct pipe *pipe_peer(struct pipe *);
int	pipe_buffer_realloc(struct pipe *, u_int);
void	pipe_buffer_free(struct pipe *);

int	pipe_iolock(struct pipe *);
void	pipe_iounlock(struct pipe *);
int	pipe_iosleep(struct pipe *, const char *);

struct pipe_pair *pipe_pair_create(void);

/*
 * The pipe(2) and pipe2(2) system calls for the DTYPE_PIPE type of pipes.
 */

int
sys_pipe(struct proc *p, void *v, register_t *retval)
{
	struct sys_pipe_args /* {
		syscallarg(int *) fdp;
	} */ *uap = v;

	return (dopipe(p, SCARG(uap, fdp), 0));
}

int
sys_pipe2(struct proc *p, void *v, register_t *retval)
{
	struct sys_pipe2_args /* {
		syscallarg(int *) fdp;
		syscallarg(int) flags;
	} */ *uap = v;

	if (SCARG(uap, flags) & ~(O_CLOEXEC | FNONBLOCK))
		return (EINVAL);

	return (dopipe(p, SCARG(uap, fdp), SCARG(uap, flags)));
}
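
/*
 * Userland sketch (illustration only, not kernel code): the flag mask
 * checked above corresponds to what pipe2(2) accepts, e.g.
 *
 *	int fds[2];
 *	if (pipe2(fds, O_CLOEXEC | O_NONBLOCK) == -1)
 *		err(1, "pipe2");
 */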

int
dopipe(struct proc *p, int *ufds, int flags)
{
	struct filedesc *fdp = p->p_fd;
	struct file *rf, *wf;
	struct pipe_pair *pp;
	struct pipe *rpipe, *wpipe = NULL;
	int fds[2], cloexec, error;

	cloexec = (flags & O_CLOEXEC) ? UF_EXCLOSE : 0;

	pp = pipe_pair_create();
	if (pp == NULL)
		return (ENOMEM);
	wpipe = &pp->pp_wpipe;
	rpipe = &pp->pp_rpipe;

	fdplock(fdp);

	error = falloc(p, &rf, &fds[0]);
	if (error != 0)
		goto free2;
	rf->f_flag = FREAD | FWRITE | (flags & FNONBLOCK);
	rf->f_type = DTYPE_PIPE;
	rf->f_data = rpipe;
	rf->f_ops = &pipeops;

	error = falloc(p, &wf, &fds[1]);
	if (error != 0)
		goto free3;
	wf->f_flag = FREAD | FWRITE | (flags & FNONBLOCK);
	wf->f_type = DTYPE_PIPE;
	wf->f_data = wpipe;
	wf->f_ops = &pipeops;

	fdinsert(fdp, fds[0], cloexec, rf);
	fdinsert(fdp, fds[1], cloexec, wf);

	error = copyout(fds, ufds, sizeof(fds));
	if (error == 0) {
		fdpunlock(fdp);
#ifdef KTRACE
		if (KTRPOINT(p, KTR_STRUCT))
			ktrfds(p, fds, 2);
#endif
	} else {
		/* fdrelease() unlocks fdp. */
		fdrelease(p, fds[0]);
		fdplock(fdp);
		fdrelease(p, fds[1]);
	}

	FRELE(rf, p);
	FRELE(wf, p);
	return (error);

free3:
	fdremove(fdp, fds[0]);
	closef(rf, p);
	rpipe = NULL;
free2:
	fdpunlock(fdp);
	pipe_destroy(wpipe);
	pipe_destroy(rpipe);
	return (error);
}

/*
 * Allocate kva for the pipe circular buffer; the space is pageable.
 * This routine will 'realloc' the size of a pipe safely: if it fails,
 * it retains the old buffer and returns ENOMEM.
 */
int
pipe_buffer_realloc(struct pipe *cpipe, u_int size)
{
	caddr_t buffer;

	/* buffer uninitialized or pipe locked */
	KASSERT((cpipe->pipe_buffer.buffer == NULL) ||
	    (cpipe->pipe_state & PIPE_LOCK));

	/* buffer should be empty */
	KASSERT(cpipe->pipe_buffer.cnt == 0);

	KERNEL_LOCK();
	buffer = km_alloc(size, &kv_any, &kp_pageable, &kd_waitok);
	KERNEL_UNLOCK();
	if (buffer == NULL)
		return (ENOMEM);

	/* free old resources if we are resizing */
	pipe_buffer_free(cpipe);

	cpipe->pipe_buffer.buffer = buffer;
	cpipe->pipe_buffer.size = size;
	cpipe->pipe_buffer.in = 0;
	cpipe->pipe_buffer.out = 0;

	atomic_add_int(&amountpipekva, cpipe->pipe_buffer.size);

	return (0);
}

/*
 * initialize and allocate VM and memory for pipe
 */
int
pipe_create(struct pipe *cpipe)
{
	int error;

	error = pipe_buffer_realloc(cpipe, PIPE_SIZE);
	if (error != 0)
		return (error);

	sigio_init(&cpipe->pipe_sigio);

	getnanotime(&cpipe->pipe_ctime);
	cpipe->pipe_atime = cpipe->pipe_ctime;
	cpipe->pipe_mtime = cpipe->pipe_ctime;

	return (0);
}

struct pipe *
pipe_peer(struct pipe *cpipe)
{
	struct pipe *peer;

	rw_assert_anylock(cpipe->pipe_lock);

	peer = cpipe->pipe_peer;
	if (peer == NULL || (peer->pipe_state & PIPE_EOF))
		return (NULL);
	return (peer);
}

/*
 * Lock a pipe for exclusive I/O access.
 */
int
pipe_iolock(struct pipe *cpipe)
{
	int error;

	rw_assert_wrlock(cpipe->pipe_lock);

	while (cpipe->pipe_state & PIPE_LOCK) {
		cpipe->pipe_state |= PIPE_LWANT;
		error = rwsleep_nsec(cpipe, cpipe->pipe_lock, PRIBIO | PCATCH,
		    "pipeiolk", INFSLP);
		if (error)
			return (error);
	}
	cpipe->pipe_state |= PIPE_LOCK;
	return (0);
}

/*
 * Unlock a pipe I/O lock.
 */
void
pipe_iounlock(struct pipe *cpipe)
{
	rw_assert_wrlock(cpipe->pipe_lock);
	KASSERT(cpipe->pipe_state & PIPE_LOCK);

	cpipe->pipe_state &= ~PIPE_LOCK;
	if (cpipe->pipe_state & PIPE_LWANT) {
		cpipe->pipe_state &= ~PIPE_LWANT;
		wakeup(cpipe);
	}
}

/*
 * Unlock the pipe I/O lock and go to sleep.  Returns 0 on success, with
 * the I/O lock relocked.  If a signal was caught, returns non-zero with
 * the I/O lock not held.
 *
 * Any caller must obtain a reference to the pipe by incrementing `pipe_busy'
 * before calling this function in order to ensure that the same pipe is not
 * destroyed while sleeping.
 */
int
pipe_iosleep(struct pipe *cpipe, const char *wmesg)
{
	int error;

	pipe_iounlock(cpipe);
	error = rwsleep_nsec(cpipe, cpipe->pipe_lock, PRIBIO | PCATCH, wmesg,
	    INFSLP);
	if (error)
		return (error);
	return (pipe_iolock(cpipe));
}
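
/*
 * Canonical caller pattern (a sketch mirroring pipe_read() below, not a
 * new API): the pipe lock is held, `pipe_busy' has been incremented, and
 * a failed sleep leaves the I/O lock unheld:
 *
 *	rw_enter_write(cpipe->pipe_lock);
 *	++cpipe->pipe_busy;
 *	error = pipe_iolock(cpipe);
 *	...
 *	error = pipe_iosleep(cpipe, "piperd");
 *	if (error)
 *		goto unlocked_error;	(I/O lock already dropped)
 */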

void
pipeselwakeup(struct pipe *cpipe)
{
	rw_assert_wrlock(cpipe->pipe_lock);

	if (cpipe->pipe_state & PIPE_SEL) {
		cpipe->pipe_state &= ~PIPE_SEL;
		selwakeup(&cpipe->pipe_sel);
	} else {
		KERNEL_LOCK();
		KNOTE(&cpipe->pipe_sel.si_note, NOTE_SUBMIT);
		KERNEL_UNLOCK();
	}

	if (cpipe->pipe_state & PIPE_ASYNC)
		pgsigio(&cpipe->pipe_sigio, SIGIO, 0);
}

int
pipe_read(struct file *fp, struct uio *uio, int fflags)
{
	struct pipe *rpipe = fp->f_data;
	size_t nread = 0, size;
	int error;

	rw_enter_write(rpipe->pipe_lock);
	++rpipe->pipe_busy;
	error = pipe_iolock(rpipe);
	if (error) {
		--rpipe->pipe_busy;
		pipe_rundown(rpipe);
		rw_exit_write(rpipe->pipe_lock);
		return (error);
	}

	while (uio->uio_resid) {
		/* Normal pipe buffer receive. */
		if (rpipe->pipe_buffer.cnt > 0) {
			size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
			if (size > rpipe->pipe_buffer.cnt)
				size = rpipe->pipe_buffer.cnt;
			if (size > uio->uio_resid)
				size = uio->uio_resid;
			rw_exit_write(rpipe->pipe_lock);
			error = uiomove(&rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
					size, uio);
			rw_enter_write(rpipe->pipe_lock);
			if (error) {
				break;
			}
			rpipe->pipe_buffer.out += size;
			if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
				rpipe->pipe_buffer.out = 0;

			rpipe->pipe_buffer.cnt -= size;
			/*
			 * If there is no more to read in the pipe, reset
			 * its pointers to the beginning.  This improves
			 * cache hit stats.
			 */
			if (rpipe->pipe_buffer.cnt == 0) {
				rpipe->pipe_buffer.in = 0;
				rpipe->pipe_buffer.out = 0;
			}
			nread += size;
		} else {
			/*
			 * detect EOF condition
			 * read returns 0 on EOF, no need to set error
			 */
			if (rpipe->pipe_state & PIPE_EOF)
				break;

			/* If the "write-side" has been blocked, wake it up. */
			if (rpipe->pipe_state & PIPE_WANTW) {
				rpipe->pipe_state &= ~PIPE_WANTW;
				wakeup(rpipe);
			}

			/* Break if some data was read. */
			if (nread > 0)
				break;

			/* Handle non-blocking mode operation. */
			if (fp->f_flag & FNONBLOCK) {
				error = EAGAIN;
				break;
			}

			/* Wait for more data. */
			rpipe->pipe_state |= PIPE_WANTR;
			error = pipe_iosleep(rpipe, "piperd");
			if (error)
				goto unlocked_error;
		}
	}
	pipe_iounlock(rpipe);

	if (error == 0)
		getnanotime(&rpipe->pipe_atime);
unlocked_error:
	--rpipe->pipe_busy;

	if (pipe_rundown(rpipe) == 0 && rpipe->pipe_buffer.cnt < MINPIPESIZE) {
		/* Handle write blocking hysteresis. */
		if (rpipe->pipe_state & PIPE_WANTW) {
			rpipe->pipe_state &= ~PIPE_WANTW;
			wakeup(rpipe);
		}
	}

	if (rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt >= PIPE_BUF)
		pipeselwakeup(rpipe);

	rw_exit_write(rpipe->pipe_lock);
	return (error);
}

int
pipe_write(struct file *fp, struct uio *uio, int fflags)
{
	struct pipe *rpipe = fp->f_data, *wpipe;
	struct rwlock *lock = rpipe->pipe_lock;
	size_t orig_resid;
	int error;

	rw_enter_write(lock);
	wpipe = pipe_peer(rpipe);

	/* Detect loss of pipe read side; return EPIPE (the caller posts SIGPIPE). */
	if (wpipe == NULL) {
		rw_exit_write(lock);
		return (EPIPE);
	}

	++wpipe->pipe_busy;
	error = pipe_iolock(wpipe);
	if (error) {
		--wpipe->pipe_busy;
		pipe_rundown(wpipe);
		rw_exit_write(lock);
		return (error);
	}
	/* If it is advantageous to resize the pipe buffer, do so. */
	if (uio->uio_resid > PIPE_SIZE &&
	    wpipe->pipe_buffer.size <= PIPE_SIZE &&
	    wpipe->pipe_buffer.cnt == 0) {
		unsigned int npipe;

		npipe = atomic_inc_int_nv(&nbigpipe);
		if (npipe > LIMITBIGPIPES ||
		    pipe_buffer_realloc(wpipe, BIG_PIPE_SIZE) != 0)
			atomic_dec_int(&nbigpipe);
	}

	orig_resid = uio->uio_resid;

	while (uio->uio_resid) {
		size_t space;

		if (wpipe->pipe_state & PIPE_EOF) {
			error = EPIPE;
			break;
		}

		space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;

		/* Writes of size <= PIPE_BUF must be atomic. */
		if (space < uio->uio_resid && orig_resid <= PIPE_BUF)
			space = 0;

		if (space > 0) {
			size_t size;	/* Transfer size */
			size_t segsize;	/* first segment to transfer */

			/*
			 * Transfer size is minimum of uio transfer
			 * and free space in pipe buffer.
			 */
			if (space > uio->uio_resid)
				size = uio->uio_resid;
			else
				size = space;
			/*
			 * First segment to transfer is minimum of
			 * transfer size and contiguous space in
			 * pipe buffer.  If first segment to transfer
			 * is less than the transfer size, we've got
			 * a wraparound in the buffer.
			 */
			segsize = wpipe->pipe_buffer.size -
				wpipe->pipe_buffer.in;
			if (segsize > size)
				segsize = size;

			/* Transfer first segment */

			rw_exit_write(lock);
			error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
					segsize, uio);
			rw_enter_write(lock);

			if (error == 0 && segsize < size) {
				/*
				 * Transfer remaining part now, to
				 * support atomic writes.  Wraparound
				 * happened.
				 */
#ifdef DIAGNOSTIC
				if (wpipe->pipe_buffer.in + segsize !=
				    wpipe->pipe_buffer.size)
					panic("Expected pipe buffer wraparound disappeared");
#endif

				rw_exit_write(lock);
				error = uiomove(&wpipe->pipe_buffer.buffer[0],
						size - segsize, uio);
				rw_enter_write(lock);
			}
			if (error == 0) {
				wpipe->pipe_buffer.in += size;
				if (wpipe->pipe_buffer.in >=
				    wpipe->pipe_buffer.size) {
#ifdef DIAGNOSTIC
					if (wpipe->pipe_buffer.in != size - segsize + wpipe->pipe_buffer.size)
						panic("Expected wraparound bad");
#endif
					wpipe->pipe_buffer.in = size - segsize;
				}

				wpipe->pipe_buffer.cnt += size;
#ifdef DIAGNOSTIC
				if (wpipe->pipe_buffer.cnt > wpipe->pipe_buffer.size)
					panic("Pipe buffer overflow");
#endif
			}
			if (error)
				break;
		} else {
			/* If the "read-side" has been blocked, wake it up. */
			if (wpipe->pipe_state & PIPE_WANTR) {
				wpipe->pipe_state &= ~PIPE_WANTR;
				wakeup(wpipe);
			}

			/* Don't block on non-blocking I/O. */
			if (fp->f_flag & FNONBLOCK) {
				error = EAGAIN;
				break;
			}

			/*
			 * We have no more space and have something to offer,
			 * wake up select/poll.
			 */
			pipeselwakeup(wpipe);

			wpipe->pipe_state |= PIPE_WANTW;
			error = pipe_iosleep(wpipe, "pipewr");
			if (error)
				goto unlocked_error;

			/*
			 * If read side wants to go away, we just issue a
			 * signal to ourselves.
			 */
			if (wpipe->pipe_state & PIPE_EOF) {
				error = EPIPE;
				break;
			}
		}
	}
	pipe_iounlock(wpipe);

unlocked_error:
	--wpipe->pipe_busy;

	if (pipe_rundown(wpipe) == 0 && wpipe->pipe_buffer.cnt > 0) {
		/*
		 * If we have put any characters in the buffer, we wake up
		 * the reader.
		 */
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
	}

	/* Don't return EPIPE if I/O was successful. */
	if (wpipe->pipe_buffer.cnt == 0 &&
	    uio->uio_resid == 0 &&
	    error == EPIPE) {
		error = 0;
	}

	if (error == 0)
		getnanotime(&wpipe->pipe_mtime);
	/* We have something to offer, wake up select/poll. */
	if (wpipe->pipe_buffer.cnt)
		pipeselwakeup(wpipe);

	rw_exit_write(lock);
	return (error);
}
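
/*
 * Userland sketch (illustration only, not kernel code): the PIPE_BUF
 * rule enforced above means records no larger than PIPE_BUF written by
 * concurrent writers are delivered whole, never interleaved mid-record:
 *
 *	char rec[PIPE_BUF];
 *	...
 *	nw = write(wfd, rec, sizeof(rec));	all-or-nothing per record
 */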

/*
 * we implement a very minimal set of ioctls for compatibility with sockets.
 */
int
pipe_ioctl(struct file *fp, u_long cmd, caddr_t data, struct proc *p)
{
	struct pipe *mpipe = fp->f_data;
	int error = 0;

	switch (cmd) {

	case FIONBIO:
		break;

	case FIOASYNC:
		rw_enter_write(mpipe->pipe_lock);
		if (*(int *)data) {
			mpipe->pipe_state |= PIPE_ASYNC;
		} else {
			mpipe->pipe_state &= ~PIPE_ASYNC;
		}
		rw_exit_write(mpipe->pipe_lock);
		break;

	case FIONREAD:
		rw_enter_read(mpipe->pipe_lock);
		*(int *)data = mpipe->pipe_buffer.cnt;
		rw_exit_read(mpipe->pipe_lock);
		break;

	case FIOSETOWN:
	case SIOCSPGRP:
	case TIOCSPGRP:
		error = sigio_setown(&mpipe->pipe_sigio, cmd, data);
		break;

	case FIOGETOWN:
	case SIOCGPGRP:
	case TIOCGPGRP:
		sigio_getown(&mpipe->pipe_sigio, cmd, data);
		break;

	default:
		error = ENOTTY;
	}

	return (error);
}

int
pipe_poll(struct file *fp, int events, struct proc *p)
{
	struct pipe *rpipe = fp->f_data, *wpipe;
	struct rwlock *lock = rpipe->pipe_lock;
	int revents = 0;

	rw_enter_write(lock);
	wpipe = pipe_peer(rpipe);

	if (events & (POLLIN | POLLRDNORM)) {
		if (rpipe->pipe_buffer.cnt > 0 ||
		    (rpipe->pipe_state & PIPE_EOF))
			revents |= events & (POLLIN | POLLRDNORM);
	}

	/* NOTE: POLLHUP and POLLOUT/POLLWRNORM are mutually exclusive */
	if ((rpipe->pipe_state & PIPE_EOF) || wpipe == NULL)
		revents |= POLLHUP;
	else if (events & (POLLOUT | POLLWRNORM)) {
		if (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt >= PIPE_BUF)
			revents |= events & (POLLOUT | POLLWRNORM);
	}

	if (revents == 0) {
		if (events & (POLLIN | POLLRDNORM)) {
			selrecord(p, &rpipe->pipe_sel);
			rpipe->pipe_state |= PIPE_SEL;
		}
		if (events & (POLLOUT | POLLWRNORM)) {
			selrecord(p, &wpipe->pipe_sel);
			wpipe->pipe_state |= PIPE_SEL;
		}
	}

	rw_exit_write(lock);

	return (revents);
}
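
/*
 * Userland sketch (illustration only, not kernel code): per the NOTE
 * above, polling the write end after the read end has gone away yields
 * POLLHUP rather than POLLOUT:
 *
 *	struct pollfd pfd = { .fd = wfd, .events = POLLOUT };
 *	if (poll(&pfd, 1, INFTIM) == 1 && (pfd.revents & POLLHUP))
 *		... read side is gone ...
 */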

int
pipe_stat(struct file *fp, struct stat *ub, struct proc *p)
{
	struct pipe *pipe = fp->f_data;

	memset(ub, 0, sizeof(*ub));

	rw_enter_read(pipe->pipe_lock);
	ub->st_mode = S_IFIFO;
	ub->st_blksize = pipe->pipe_buffer.size;
	ub->st_size = pipe->pipe_buffer.cnt;
	ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize;
	ub->st_atim.tv_sec  = pipe->pipe_atime.tv_sec;
	ub->st_atim.tv_nsec = pipe->pipe_atime.tv_nsec;
	ub->st_mtim.tv_sec  = pipe->pipe_mtime.tv_sec;
	ub->st_mtim.tv_nsec = pipe->pipe_mtime.tv_nsec;
	ub->st_ctim.tv_sec  = pipe->pipe_ctime.tv_sec;
	ub->st_ctim.tv_nsec = pipe->pipe_ctime.tv_nsec;
	ub->st_uid = fp->f_cred->cr_uid;
	ub->st_gid = fp->f_cred->cr_gid;
	rw_exit_read(pipe->pipe_lock);
	/*
	 * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen.
	 * XXX (st_dev, st_ino) should be unique.
	 */
	return (0);
}

int
pipe_close(struct file *fp, struct proc *p)
{
	struct pipe *cpipe = fp->f_data;

	fp->f_ops = NULL;
	fp->f_data = NULL;
	pipe_destroy(cpipe);
	return (0);
}

/*
 * Free kva for pipe circular buffer.
 * No pipe lock check as only called from pipe_buffer_realloc() and
 * pipe_destroy().
 */
void
pipe_buffer_free(struct pipe *cpipe)
{
	u_int size;

	if (cpipe->pipe_buffer.buffer == NULL)
		return;

	size = cpipe->pipe_buffer.size;

	KERNEL_LOCK();
	km_free(cpipe->pipe_buffer.buffer, size, &kv_any, &kp_pageable);
	KERNEL_UNLOCK();

	cpipe->pipe_buffer.buffer = NULL;

	atomic_sub_int(&amountpipekva, size);
	if (size > PIPE_SIZE)
		atomic_dec_int(&nbigpipe);
}

/*
 * Shut down the pipe and free its resources.
 */
void
pipe_destroy(struct pipe *cpipe)
{
	struct pipe *ppipe;

	if (cpipe == NULL)
		return;

	rw_enter_write(cpipe->pipe_lock);

	pipeselwakeup(cpipe);
	sigio_free(&cpipe->pipe_sigio);

	/*
	 * If the other side is blocked, wake it up saying that
	 * we want to close it down.
	 */
	cpipe->pipe_state |= PIPE_EOF;
	while (cpipe->pipe_busy) {
		wakeup(cpipe);
		cpipe->pipe_state |= PIPE_WANTD;
		rwsleep_nsec(cpipe, cpipe->pipe_lock, PRIBIO, "pipecl", INFSLP);
	}

	/* Disconnect from peer. */
	if ((ppipe = cpipe->pipe_peer) != NULL) {
		pipeselwakeup(ppipe);

		ppipe->pipe_state |= PIPE_EOF;
		wakeup(ppipe);
		ppipe->pipe_peer = NULL;
	}

	pipe_buffer_free(cpipe);

	rw_exit_write(cpipe->pipe_lock);

	if (ppipe == NULL)
		pool_put(&pipe_pair_pool, cpipe->pipe_pair);
}

/*
 * Returns non-zero if a rundown is currently ongoing.
 */
int
pipe_rundown(struct pipe *cpipe)
{
	rw_assert_wrlock(cpipe->pipe_lock);

	if (cpipe->pipe_busy > 0 || (cpipe->pipe_state & PIPE_WANTD) == 0)
		return (0);

	/* Only wakeup pipe_destroy() once the pipe is no longer busy. */
	cpipe->pipe_state &= ~(PIPE_WANTD | PIPE_WANTR | PIPE_WANTW);
	wakeup(cpipe);
	return (1);
}
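
/*
 * Sketch of the busy/rundown handshake (derived from pipe_read() and
 * pipe_destroy() above; illustration only):
 *
 *	I/O side				destroy side
 *	++cpipe->pipe_busy;
 *	... copy or sleep ...			pipe_state |= PIPE_WANTD;
 *						rwsleep_nsec(cpipe, ...);
 *	--cpipe->pipe_busy;
 *	pipe_rundown(cpipe);	--wakeup-->	resumes once busy drops to 0
 */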

int
pipe_kqfilter(struct file *fp, struct knote *kn)
{
	struct pipe *rpipe = kn->kn_fp->f_data, *wpipe;
	struct rwlock *lock = rpipe->pipe_lock;
	int error = 0;

	rw_enter_write(lock);
	wpipe = pipe_peer(rpipe);

	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_fop = &pipe_rfiltops;
		klist_insert(&rpipe->pipe_sel.si_note, kn);
		break;
	case EVFILT_WRITE:
		if (wpipe == NULL) {
			/* other end of pipe has been closed */
			error = EPIPE;
			break;
		}
		kn->kn_fop = &pipe_wfiltops;
		klist_insert(&wpipe->pipe_sel.si_note, kn);
		break;
	default:
		error = EINVAL;
	}

	rw_exit_write(lock);

	return (error);
}

void
filt_pipedetach(struct knote *kn)
{
	struct pipe *rpipe = kn->kn_fp->f_data, *wpipe;
	struct rwlock *lock = rpipe->pipe_lock;

	rw_enter_write(lock);
	wpipe = pipe_peer(rpipe);

	switch (kn->kn_filter) {
	case EVFILT_READ:
		klist_remove(&rpipe->pipe_sel.si_note, kn);
		break;
	case EVFILT_WRITE:
		if (wpipe == NULL)
			break;
		klist_remove(&wpipe->pipe_sel.si_note, kn);
		break;
	}

	rw_exit_write(lock);
}

int
filt_piperead(struct knote *kn, long hint)
{
	struct pipe *rpipe = kn->kn_fp->f_data, *wpipe;
	struct rwlock *lock = rpipe->pipe_lock;

	if ((hint & NOTE_SUBMIT) == 0)
		rw_enter_read(lock);
	wpipe = pipe_peer(rpipe);

	kn->kn_data = rpipe->pipe_buffer.cnt;

	if ((rpipe->pipe_state & PIPE_EOF) || wpipe == NULL) {
		if ((hint & NOTE_SUBMIT) == 0)
			rw_exit_read(lock);
		kn->kn_flags |= EV_EOF;
		if (kn->kn_flags & __EV_POLL)
			kn->kn_flags |= __EV_HUP;
		return (1);
	}

	if ((hint & NOTE_SUBMIT) == 0)
		rw_exit_read(lock);

	return (kn->kn_data > 0);
}

int
filt_pipewrite(struct knote *kn, long hint)
{
	struct pipe *rpipe = kn->kn_fp->f_data, *wpipe;
	struct rwlock *lock = rpipe->pipe_lock;

	if ((hint & NOTE_SUBMIT) == 0)
		rw_enter_read(lock);
	wpipe = pipe_peer(rpipe);

	if (wpipe == NULL) {
		if ((hint & NOTE_SUBMIT) == 0)
			rw_exit_read(lock);
		kn->kn_data = 0;
		kn->kn_flags |= EV_EOF;
		if (kn->kn_flags & __EV_POLL)
			kn->kn_flags |= __EV_HUP;
		return (1);
	}
	kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;

	if ((hint & NOTE_SUBMIT) == 0)
		rw_exit_read(lock);

	return (kn->kn_data >= PIPE_BUF);
}

void
pipe_init(void)
{
	pool_init(&pipe_pair_pool, sizeof(struct pipe_pair), 0, IPL_MPFLOOR,
	    PR_WAITOK, "pipepl", NULL);
}

struct pipe_pair *
pipe_pair_create(void)
{
	struct pipe_pair *pp;

	pp = pool_get(&pipe_pair_pool, PR_WAITOK | PR_ZERO);
	pp->pp_wpipe.pipe_pair = pp;
	pp->pp_rpipe.pipe_pair = pp;
	pp->pp_wpipe.pipe_peer = &pp->pp_rpipe;
	pp->pp_rpipe.pipe_peer = &pp->pp_wpipe;
	/*
	 * One lock is used per pipe pair in order to obtain exclusive access to
	 * the pipe pair.
	 */
	rw_init(&pp->pp_lock, "pipelk");
	pp->pp_wpipe.pipe_lock = &pp->pp_lock;
	pp->pp_rpipe.pipe_lock = &pp->pp_lock;

	if (pipe_create(&pp->pp_wpipe) || pipe_create(&pp->pp_rpipe))
		goto err;
	return (pp);
err:
	pipe_destroy(&pp->pp_wpipe);
	pipe_destroy(&pp->pp_rpipe);
	return (NULL);
}
1019