/*	$OpenBSD: sys_pipe.c,v 1.88 2019/06/22 06:48:25 semarie Exp $	*/

/*
 * Copyright (c) 1996 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 * 4. Modifications may be freely made to this file if the above conditions
 *    are met.
 */

/*
 * This file contains a high-performance replacement for the socket-based
 * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
 * all features of sockets, but does do everything that pipes normally
 * do.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/pool.h>
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <sys/signalvar.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <sys/event.h>
#include <sys/lock.h>
#include <sys/poll.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#include <uvm/uvm_extern.h>

#include <sys/pipe.h>

/*
 * interfaces to the outside world
 */
int	pipe_read(struct file *, struct uio *, int);
int	pipe_write(struct file *, struct uio *, int);
int	pipe_close(struct file *, struct proc *);
int	pipe_poll(struct file *, int events, struct proc *);
int	pipe_kqfilter(struct file *fp, struct knote *kn);
int	pipe_ioctl(struct file *, u_long, caddr_t, struct proc *);
int	pipe_stat(struct file *fp, struct stat *ub, struct proc *p);

static struct fileops pipeops = {
	.fo_read	= pipe_read,
	.fo_write	= pipe_write,
	.fo_ioctl	= pipe_ioctl,
	.fo_poll	= pipe_poll,
	.fo_kqfilter	= pipe_kqfilter,
	.fo_stat	= pipe_stat,
	.fo_close	= pipe_close
};

void	filt_pipedetach(struct knote *kn);
int	filt_piperead(struct knote *kn, long hint);
int	filt_pipewrite(struct knote *kn, long hint);

struct filterops pipe_rfiltops =
	{ 1, NULL, filt_pipedetach, filt_piperead };
struct filterops pipe_wfiltops =
	{ 1, NULL, filt_pipedetach, filt_pipewrite };

/*
 * Default pipe buffer size(s); these can be fairly large now because pipe
 * space is pageable.  The pipe code will try to maintain locality of
 * reference for performance reasons, so small amounts of outstanding I/O
 * will not wipe the cache.
 */
#define MINPIPESIZE (PIPE_SIZE/3)

/*
 * Limit the number of "big" pipes
 */
#define LIMITBIGPIPES	32
unsigned int nbigpipe;
static unsigned int amountpipekva;

struct pool pipe_pool;

int	dopipe(struct proc *, int *, int);
void	pipeclose(struct pipe *);
void	pipe_free_kmem(struct pipe *);
int	pipe_create(struct pipe *);
int	pipelock(struct pipe *);
void	pipeunlock(struct pipe *);
void	pipeselwakeup(struct pipe *);
int	pipespace(struct pipe *, u_int);

/*
 * The pipe system call for the DTYPE_PIPE type of pipes
 */

int
sys_pipe(struct proc *p, void *v, register_t *retval)
{
	struct sys_pipe_args /* {
		syscallarg(int *) fdp;
	} */ *uap = v;

	return (dopipe(p, SCARG(uap, fdp), 0));
}

int
sys_pipe2(struct proc *p, void *v, register_t *retval)
{
	struct sys_pipe2_args /* {
		syscallarg(int *) fdp;
		syscallarg(int) flags;
	} */ *uap = v;

	if (SCARG(uap, flags) & ~(O_CLOEXEC | FNONBLOCK))
		return (EINVAL);

	return (dopipe(p, SCARG(uap, fdp), SCARG(uap, flags)));
}
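
/*
 * Illustrative userland use of the calls above (a sketch, not part of
 * this file; assumes a normal program with err(3) available).  O_NONBLOCK
 * is the userland spelling of FNONBLOCK checked in sys_pipe2().
 *
 *	int fds[2];
 *
 *	if (pipe2(fds, O_CLOEXEC | O_NONBLOCK) == -1)
 *		err(1, "pipe2");
 */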

int
dopipe(struct proc *p, int *ufds, int flags)
{
	struct filedesc *fdp = p->p_fd;
	struct file *rf, *wf;
	struct pipe *rpipe, *wpipe = NULL;
	int fds[2], cloexec, error;

	cloexec = (flags & O_CLOEXEC) ? UF_EXCLOSE : 0;

	rpipe = pool_get(&pipe_pool, PR_WAITOK);
	error = pipe_create(rpipe);
	if (error != 0)
		goto free1;
	wpipe = pool_get(&pipe_pool, PR_WAITOK);
	error = pipe_create(wpipe);
	if (error != 0)
		goto free1;

	fdplock(fdp);

	error = falloc(p, &rf, &fds[0]);
	if (error != 0)
		goto free2;
	rf->f_flag = FREAD | FWRITE | (flags & FNONBLOCK);
	rf->f_type = DTYPE_PIPE;
	rf->f_data = rpipe;
	rf->f_ops = &pipeops;

	error = falloc(p, &wf, &fds[1]);
	if (error != 0)
		goto free3;
	wf->f_flag = FREAD | FWRITE | (flags & FNONBLOCK);
	wf->f_type = DTYPE_PIPE;
	wf->f_data = wpipe;
	wf->f_ops = &pipeops;

	rpipe->pipe_peer = wpipe;
	wpipe->pipe_peer = rpipe;
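	/*
	 * Note that both file entries were opened FREAD|FWRITE above and
	 * each end writes into its peer's buffer while reading from its
	 * own, so the pair is full-duplex.
	 */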

	fdinsert(fdp, fds[0], cloexec, rf);
	fdinsert(fdp, fds[1], cloexec, wf);

	error = copyout(fds, ufds, sizeof(fds));
	if (error != 0) {
		fdrelease(p, fds[0]);
		fdrelease(p, fds[1]);
	}
#ifdef KTRACE
	else if (KTRPOINT(p, KTR_STRUCT))
		ktrfds(p, fds, 2);
#endif
	fdpunlock(fdp);

	FRELE(rf, p);
	FRELE(wf, p);
	return (error);

free3:
	fdremove(fdp, fds[0]);
	closef(rf, p);
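	/*
	 * closef() dropped rf's reference and with it released rpipe
	 * via pipe_close(), so clear the local pointer: pipeclose()
	 * below tolerates NULL and must not free rpipe a second time.
	 */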
	rpipe = NULL;
free2:
	fdpunlock(fdp);
free1:
	pipeclose(wpipe);
	pipeclose(rpipe);
	return (error);
}

/*
 * Allocate kva for the pipe circular buffer; the space is pageable.
 * This routine will 'realloc' the size of a pipe safely: if allocation
 * fails it retains the old buffer and returns ENOMEM.
 */
int
pipespace(struct pipe *cpipe, u_int size)
{
	caddr_t buffer;

	KERNEL_LOCK();
	buffer = km_alloc(size, &kv_any, &kp_pageable, &kd_waitok);
	KERNEL_UNLOCK();
	if (buffer == NULL) {
		return (ENOMEM);
	}

	/* free old resources if we are resizing */
	pipe_free_kmem(cpipe);
	cpipe->pipe_buffer.buffer = buffer;
	cpipe->pipe_buffer.size = size;
	cpipe->pipe_buffer.in = 0;
	cpipe->pipe_buffer.out = 0;
	cpipe->pipe_buffer.cnt = 0;

	atomic_add_int(&amountpipekva, cpipe->pipe_buffer.size);

	return (0);
}

/*
 * initialize and allocate VM and memory for pipe
 */
int
pipe_create(struct pipe *cpipe)
{
	int error;

	/* so pipe_free_kmem() doesn't follow junk pointer */
	cpipe->pipe_buffer.buffer = NULL;
	/*
	 * protect so pipeclose() doesn't follow a junk pointer
	 * if pipespace() fails.
	 */
	memset(&cpipe->pipe_sel, 0, sizeof(cpipe->pipe_sel));
	cpipe->pipe_state = 0;
	cpipe->pipe_peer = NULL;
	cpipe->pipe_busy = 0;
	sigio_init(&cpipe->pipe_sigio);

	error = pipespace(cpipe, PIPE_SIZE);
	if (error != 0)
		return (error);

	getnanotime(&cpipe->pipe_ctime);
	cpipe->pipe_atime = cpipe->pipe_ctime;
	cpipe->pipe_mtime = cpipe->pipe_ctime;

	return (0);
}


/*
 * lock a pipe for I/O, blocking other access
 */
int
pipelock(struct pipe *cpipe)
{
	int error;
	while (cpipe->pipe_state & PIPE_LOCK) {
		cpipe->pipe_state |= PIPE_LWANT;
		if ((error = tsleep(cpipe, PRIBIO|PCATCH, "pipelk", 0)))
			return error;
	}
	cpipe->pipe_state |= PIPE_LOCK;
	return 0;
}

/*
 * unlock a pipe I/O lock
 */
void
pipeunlock(struct pipe *cpipe)
{
	cpipe->pipe_state &= ~PIPE_LOCK;
	if (cpipe->pipe_state & PIPE_LWANT) {
		cpipe->pipe_state &= ~PIPE_LWANT;
		wakeup(cpipe);
	}
}
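
/*
 * Typical usage of the pair above in this file (a sketch): take the I/O
 * lock around any buffer manipulation and drop it again before sleeping
 * or returning.
 *
 *	if ((error = pipelock(cpipe)) == 0) {
 *		... touch cpipe->pipe_buffer ...
 *		pipeunlock(cpipe);
 *	}
 */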
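/*
 * Wake up anyone interested in the pipe's state: a blocked select/poll
 * sleeper if one registered itself (PIPE_SEL), queued knotes otherwise,
 * and the SIGIO recipients if FIOASYNC put the pipe in async mode.
 */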
void
pipeselwakeup(struct pipe *cpipe)
{
	if (cpipe->pipe_state & PIPE_SEL) {
		cpipe->pipe_state &= ~PIPE_SEL;
		selwakeup(&cpipe->pipe_sel);
	} else
		KNOTE(&cpipe->pipe_sel.si_note, 0);
	if (cpipe->pipe_state & PIPE_ASYNC)
		pgsigio(&cpipe->pipe_sigio, SIGIO, 0);
}

int
pipe_read(struct file *fp, struct uio *uio, int fflags)
{
	struct pipe *rpipe = fp->f_data;
	int error;
	size_t size, nread = 0;

	KERNEL_LOCK();

	error = pipelock(rpipe);
	if (error)
		goto done;

	++rpipe->pipe_busy;

	while (uio->uio_resid) {
		/*
		 * normal pipe buffer receive
		 */
		if (rpipe->pipe_buffer.cnt > 0) {
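			/*
			 * Copy at most the contiguous run between 'out'
			 * and the end of the buffer; a read that wraps
			 * around finishes on the next loop iteration.
			 */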
			size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
			if (size > rpipe->pipe_buffer.cnt)
				size = rpipe->pipe_buffer.cnt;
			if (size > uio->uio_resid)
				size = uio->uio_resid;
			error = uiomove(&rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
					size, uio);
			if (error) {
				break;
			}
			rpipe->pipe_buffer.out += size;
			if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
				rpipe->pipe_buffer.out = 0;

			rpipe->pipe_buffer.cnt -= size;
			/*
			 * If there is no more to read in the pipe, reset
			 * its pointers to the beginning.  This improves
			 * cache hit stats.
			 */
			if (rpipe->pipe_buffer.cnt == 0) {
				rpipe->pipe_buffer.in = 0;
				rpipe->pipe_buffer.out = 0;
			}
			nread += size;
		} else {
			/*
			 * detect EOF condition
			 * read returns 0 on EOF, no need to set error
			 */
			if (rpipe->pipe_state & PIPE_EOF)
				break;

			/*
			 * If the "write-side" has been blocked, wake it up now.
			 */
			if (rpipe->pipe_state & PIPE_WANTW) {
				rpipe->pipe_state &= ~PIPE_WANTW;
				wakeup(rpipe);
			}

			/*
			 * Break if some data was read.
			 */
			if (nread > 0)
				break;

			/*
			 * Unlock the pipe buffer for our remaining processing.
			 * We will either break out with an error or we will
			 * sleep and relock to loop.
			 */
			pipeunlock(rpipe);

			/*
			 * Handle non-blocking mode operation or
			 * wait for more data.
			 */
			if (fp->f_flag & FNONBLOCK) {
				error = EAGAIN;
			} else {
				rpipe->pipe_state |= PIPE_WANTR;
				if ((error = tsleep(rpipe, PRIBIO|PCATCH, "piperd", 0)) == 0)
					error = pipelock(rpipe);
			}
			if (error)
				goto unlocked_error;
		}
	}
	pipeunlock(rpipe);

	if (error == 0)
		getnanotime(&rpipe->pipe_atime);
unlocked_error:
	--rpipe->pipe_busy;

	/*
	 * PIPE_WANT processing only makes sense if pipe_busy is 0.
	 */
	if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) {
		rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW);
		wakeup(rpipe);
	} else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) {
		/*
		 * Handle write blocking hysteresis.
		 */
		if (rpipe->pipe_state & PIPE_WANTW) {
			rpipe->pipe_state &= ~PIPE_WANTW;
			wakeup(rpipe);
		}
	}

	if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF)
		pipeselwakeup(rpipe);

done:
	KERNEL_UNLOCK();
	return (error);
}

int
pipe_write(struct file *fp, struct uio *uio, int fflags)
{
	int error = 0;
	size_t orig_resid;
	struct pipe *wpipe, *rpipe;

	KERNEL_LOCK();

	rpipe = fp->f_data;
	wpipe = rpipe->pipe_peer;

	/*
	 * Detect loss of the pipe read side; return EPIPE if it is gone
	 * (the write path turns that into SIGPIPE).
	 */
	if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
		error = EPIPE;
		goto done;
	}
	++wpipe->pipe_busy;

	/*
	 * If it is advantageous to resize the pipe buffer, do so: a
	 * large write into an empty default-size buffer gets a
	 * BIG_PIPE_SIZE buffer instead.
	 */
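	/*
	 * The big-pipe slot is reserved optimistically: nbigpipe is
	 * bumped first and the increment backed out if the limit was
	 * exceeded or the resize failed.
	 */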
	if ((uio->uio_resid > PIPE_SIZE) &&
	    (wpipe->pipe_buffer.size <= PIPE_SIZE) &&
	    (wpipe->pipe_buffer.cnt == 0)) {
		unsigned int npipe;

		npipe = atomic_inc_int_nv(&nbigpipe);
		if ((npipe <= LIMITBIGPIPES) &&
		    (error = pipelock(wpipe)) == 0) {
			if (pipespace(wpipe, BIG_PIPE_SIZE) != 0)
				atomic_dec_int(&nbigpipe);
			pipeunlock(wpipe);
		} else
			atomic_dec_int(&nbigpipe);
	}

	/*
	 * If an early error occurred, unbusy and return, waking up any
	 * pending readers.
	 */
	if (error) {
		--wpipe->pipe_busy;
		if ((wpipe->pipe_busy == 0) &&
		    (wpipe->pipe_state & PIPE_WANT)) {
			wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
			wakeup(wpipe);
		}
		goto done;
	}

	orig_resid = uio->uio_resid;

	while (uio->uio_resid) {
		size_t space;

retrywrite:
		if (wpipe->pipe_state & PIPE_EOF) {
			error = EPIPE;
			break;
		}

		space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;

		/* Writes of size <= PIPE_BUF must be atomic. */
		if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF))
			space = 0;
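		/*
		 * With space forced to 0, a small write that does not
		 * fit falls into the sleep path below until the whole
		 * request fits at once; this is what makes writes of
		 * <= PIPE_BUF atomic.
		 */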

		if (space > 0) {
			if ((error = pipelock(wpipe)) == 0) {
				size_t size;	/* Transfer size */
				size_t segsize;	/* first segment to transfer */

				/*
				 * If a process blocked in uiomove, our
				 * value for space might be bad.
				 *
				 * XXX will we be ok if the reader has gone
				 * away here?
				 */
				if (space > wpipe->pipe_buffer.size -
				    wpipe->pipe_buffer.cnt) {
					pipeunlock(wpipe);
					goto retrywrite;
				}

				/*
				 * Transfer size is minimum of uio transfer
				 * and free space in pipe buffer.
				 */
				if (space > uio->uio_resid)
					size = uio->uio_resid;
				else
					size = space;
				/*
				 * First segment to transfer is minimum of
				 * transfer size and contiguous space in
				 * pipe buffer.  If first segment to transfer
				 * is less than the transfer size, we've got
				 * a wraparound in the buffer.
				 */
				segsize = wpipe->pipe_buffer.size -
					wpipe->pipe_buffer.in;
				if (segsize > size)
					segsize = size;
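				/*
				 * Illustrative numbers: with a 16384-byte
				 * buffer, in = 16000 and size = 1000 give
				 * segsize = 384; the remaining 616 bytes
				 * wrap to offset 0 in the second uiomove()
				 * below.
				 */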

				/* Transfer first segment */

				error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
						segsize, uio);

				if (error == 0 && segsize < size) {
					/*
					 * Transfer remaining part now, to
					 * support atomic writes.  Wraparound
					 * happened.
					 */
#ifdef DIAGNOSTIC
					if (wpipe->pipe_buffer.in + segsize !=
					    wpipe->pipe_buffer.size)
						panic("Expected pipe buffer wraparound disappeared");
#endif

					error = uiomove(&wpipe->pipe_buffer.buffer[0],
							size - segsize, uio);
				}
				if (error == 0) {
					wpipe->pipe_buffer.in += size;
					if (wpipe->pipe_buffer.in >=
					    wpipe->pipe_buffer.size) {
#ifdef DIAGNOSTIC
						if (wpipe->pipe_buffer.in != size - segsize + wpipe->pipe_buffer.size)
							panic("Expected wraparound bad");
#endif
						wpipe->pipe_buffer.in = size - segsize;
					}

					wpipe->pipe_buffer.cnt += size;
#ifdef DIAGNOSTIC
					if (wpipe->pipe_buffer.cnt > wpipe->pipe_buffer.size)
						panic("Pipe buffer overflow");
#endif
				}
				pipeunlock(wpipe);
			}
			if (error)
				break;
		} else {
			/*
			 * If the "read-side" has been blocked, wake it up now.
			 */
			if (wpipe->pipe_state & PIPE_WANTR) {
				wpipe->pipe_state &= ~PIPE_WANTR;
				wakeup(wpipe);
			}

			/*
			 * don't block on non-blocking I/O
			 */
			if (fp->f_flag & FNONBLOCK) {
				error = EAGAIN;
				break;
			}

			/*
			 * We have no more space but something to offer;
			 * wake up select/poll.
			 */
			pipeselwakeup(wpipe);

			wpipe->pipe_state |= PIPE_WANTW;
			error = tsleep(wpipe, (PRIBIO + 1)|PCATCH,
			    "pipewr", 0);
			if (error)
				break;
			/*
			 * If read side wants to go away, we just issue a
			 * signal to ourselves.
			 */
			if (wpipe->pipe_state & PIPE_EOF) {
				error = EPIPE;
				break;
			}
		}
	}

	--wpipe->pipe_busy;

	if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) {
		wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
		wakeup(wpipe);
	} else if (wpipe->pipe_buffer.cnt > 0) {
		/*
		 * If we have put any characters in the buffer, we wake up
		 * the reader.
		 */
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
	}

	/*
	 * Don't return EPIPE if I/O was successful
	 */
	if ((wpipe->pipe_buffer.cnt == 0) &&
	    (uio->uio_resid == 0) &&
	    (error == EPIPE)) {
		error = 0;
	}

	if (error == 0)
		getnanotime(&wpipe->pipe_mtime);
	/*
	 * We have something to offer, wake up select/poll.
	 */
	if (wpipe->pipe_buffer.cnt)
		pipeselwakeup(wpipe);

done:
	KERNEL_UNLOCK();
	return (error);
}

/*
 * we implement a very minimal set of ioctls for compatibility with sockets.
 */
int
pipe_ioctl(struct file *fp, u_long cmd, caddr_t data, struct proc *p)
{
	struct pipe *mpipe = fp->f_data;

	switch (cmd) {

	case FIONBIO:
		return (0);

	case FIOASYNC:
		if (*(int *)data) {
			mpipe->pipe_state |= PIPE_ASYNC;
		} else {
			mpipe->pipe_state &= ~PIPE_ASYNC;
		}
		return (0);

	case FIONREAD:
		*(int *)data = mpipe->pipe_buffer.cnt;
		return (0);

	case TIOCSPGRP:
		/* FALLTHROUGH */
	case SIOCSPGRP:
		return (sigio_setown(&mpipe->pipe_sigio, *(int *)data));

	case SIOCGPGRP:
		*(int *)data = sigio_getown(&mpipe->pipe_sigio);
		return (0);

	case TIOCGPGRP:
		*(int *)data = -sigio_getown(&mpipe->pipe_sigio);
		return (0);

	}
	return (ENOTTY);
}

int
pipe_poll(struct file *fp, int events, struct proc *p)
{
	struct pipe *rpipe = fp->f_data;
	struct pipe *wpipe;
	int revents = 0;

	wpipe = rpipe->pipe_peer;
	if (events & (POLLIN | POLLRDNORM)) {
		if ((rpipe->pipe_buffer.cnt > 0) ||
		    (rpipe->pipe_state & PIPE_EOF))
			revents |= events & (POLLIN | POLLRDNORM);
	}

	/* NOTE: POLLHUP and POLLOUT/POLLWRNORM are mutually exclusive */
	if ((rpipe->pipe_state & PIPE_EOF) ||
	    (wpipe == NULL) ||
	    (wpipe->pipe_state & PIPE_EOF))
		revents |= POLLHUP;
	else if (events & (POLLOUT | POLLWRNORM)) {
		if ((wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF)
			revents |= events & (POLLOUT | POLLWRNORM);
	}

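	/*
	 * Nothing is ready yet: record this thread as a selector and set
	 * PIPE_SEL so that pipeselwakeup() knows to call selwakeup().
	 */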
	if (revents == 0) {
		if (events & (POLLIN | POLLRDNORM)) {
			selrecord(p, &rpipe->pipe_sel);
			rpipe->pipe_state |= PIPE_SEL;
		}
		if (events & (POLLOUT | POLLWRNORM)) {
			selrecord(p, &wpipe->pipe_sel);
			wpipe->pipe_state |= PIPE_SEL;
		}
	}
	return (revents);
}

int
pipe_stat(struct file *fp, struct stat *ub, struct proc *p)
{
	struct pipe *pipe = fp->f_data;

	memset(ub, 0, sizeof(*ub));
	ub->st_mode = S_IFIFO;
	ub->st_blksize = pipe->pipe_buffer.size;
	ub->st_size = pipe->pipe_buffer.cnt;
	ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize;
	ub->st_atim.tv_sec  = pipe->pipe_atime.tv_sec;
	ub->st_atim.tv_nsec = pipe->pipe_atime.tv_nsec;
	ub->st_mtim.tv_sec  = pipe->pipe_mtime.tv_sec;
	ub->st_mtim.tv_nsec = pipe->pipe_mtime.tv_nsec;
	ub->st_ctim.tv_sec  = pipe->pipe_ctime.tv_sec;
	ub->st_ctim.tv_nsec = pipe->pipe_ctime.tv_nsec;
	ub->st_uid = fp->f_cred->cr_uid;
	ub->st_gid = fp->f_cred->cr_gid;
	/*
	 * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen.
	 * XXX (st_dev, st_ino) should be unique.
	 */
	return (0);
}

int
pipe_close(struct file *fp, struct proc *p)
{
	struct pipe *cpipe = fp->f_data;

	fp->f_ops = NULL;
	fp->f_data = NULL;
	KERNEL_LOCK();
	pipeclose(cpipe);
	KERNEL_UNLOCK();
	return (0);
}

void
pipe_free_kmem(struct pipe *cpipe)
{
	u_int size = cpipe->pipe_buffer.size;

	if (cpipe->pipe_buffer.buffer != NULL) {
		KERNEL_LOCK();
		km_free(cpipe->pipe_buffer.buffer, size, &kv_any, &kp_pageable);
		KERNEL_UNLOCK();
		atomic_sub_int(&amountpipekva, size);
		cpipe->pipe_buffer.buffer = NULL;
		if (size > PIPE_SIZE)
			atomic_dec_int(&nbigpipe);
	}
}

/*
 * shut down the pipe
 */
void
pipeclose(struct pipe *cpipe)
{
	struct pipe *ppipe;
	if (cpipe) {
		pipeselwakeup(cpipe);
		sigio_free(&cpipe->pipe_sigio);

		/*
		 * If the other side is blocked, wake it up saying that
		 * we want to close it down.
		 */
		cpipe->pipe_state |= PIPE_EOF;
		while (cpipe->pipe_busy) {
			wakeup(cpipe);
			cpipe->pipe_state |= PIPE_WANT;
			tsleep(cpipe, PRIBIO, "pipecl", 0);
		}

		/*
		 * Disconnect from peer
		 */
		if ((ppipe = cpipe->pipe_peer) != NULL) {
			pipeselwakeup(ppipe);

			ppipe->pipe_state |= PIPE_EOF;
			wakeup(ppipe);
			ppipe->pipe_peer = NULL;
		}

		/*
		 * free resources
		 */
		pipe_free_kmem(cpipe);
		pool_put(&pipe_pool, cpipe);
	}
}

int
pipe_kqfilter(struct file *fp, struct knote *kn)
{
	struct pipe *rpipe = kn->kn_fp->f_data;
	struct pipe *wpipe = rpipe->pipe_peer;

	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_fop = &pipe_rfiltops;
		SLIST_INSERT_HEAD(&rpipe->pipe_sel.si_note, kn, kn_selnext);
		break;
	case EVFILT_WRITE:
		if (wpipe == NULL) {
			/* other end of pipe has been closed */
			return (EPIPE);
		}
		kn->kn_fop = &pipe_wfiltops;
		SLIST_INSERT_HEAD(&wpipe->pipe_sel.si_note, kn, kn_selnext);
		break;
	default:
		return (EINVAL);
	}

	return (0);
}
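
/*
 * Illustrative userland registration that ends up in pipe_kqfilter()
 * above (a sketch, not kernel code; fds[] as returned by pipe(2)):
 *
 *	struct kevent kev;
 *	int kq = kqueue();
 *
 *	EV_SET(&kev, fds[0], EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
 *		err(1, "kevent");
 */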

void
filt_pipedetach(struct knote *kn)
{
	struct pipe *rpipe = kn->kn_fp->f_data;
	struct pipe *wpipe = rpipe->pipe_peer;

	switch (kn->kn_filter) {
	case EVFILT_READ:
		SLIST_REMOVE(&rpipe->pipe_sel.si_note, kn, knote, kn_selnext);
		break;
	case EVFILT_WRITE:
		if (wpipe == NULL)
			return;
		SLIST_REMOVE(&wpipe->pipe_sel.si_note, kn, knote, kn_selnext);
		break;
	}
}

int
filt_piperead(struct knote *kn, long hint)
{
	struct pipe *rpipe = kn->kn_fp->f_data;
	struct pipe *wpipe = rpipe->pipe_peer;

	kn->kn_data = rpipe->pipe_buffer.cnt;

	if ((rpipe->pipe_state & PIPE_EOF) ||
	    (wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
		kn->kn_flags |= EV_EOF;
		return (1);
	}
	return (kn->kn_data > 0);
}

int
filt_pipewrite(struct knote *kn, long hint)
{
	struct pipe *rpipe = kn->kn_fp->f_data;
	struct pipe *wpipe = rpipe->pipe_peer;

	if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
		kn->kn_data = 0;
		kn->kn_flags |= EV_EOF;
		return (1);
	}
	kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;

	return (kn->kn_data >= PIPE_BUF);
}

void
pipe_init(void)
{
	pool_init(&pipe_pool, sizeof(struct pipe), 0, IPL_MPFLOOR, PR_WAITOK,
	    "pipepl", NULL);
}