/*	$NetBSD: sys_pipe.c,v 1.88 2007/12/05 17:19:58 pooka Exp $	*/

/*-
 * Copyright (c) 2003, 2007 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Paul Kranenburg, and by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *        This product includes software developed by the NetBSD
 *        Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1996 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 * 4. Modifications may be freely made to this file if the above conditions
 *    are met.
 *
 * $FreeBSD: src/sys/kern/sys_pipe.c,v 1.95 2002/03/09 22:06:31 alfred Exp $
 */

/*
 * This file contains a high-performance replacement for the socket-based
 * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
 * all features of sockets, but does do everything that pipes normally
 * do.
 *
 * Adaptation for NetBSD UVM, including the uvm_loan() based direct write,
 * was written by Jaromir Dolecek.
 */

/*
 * This code has two modes of operation: a small write mode and a large
 * write mode.  The small write mode acts like conventional pipes with
 * a kernel buffer.  If the buffer is less than PIPE_MINDIRECT, then the
 * "normal" pipe buffering is done.  If the buffer is between PIPE_MINDIRECT
 * and PIPE_SIZE in size, it is mapped read-only into the kernel address
 * space using the UVM page loan facility, from where the receiving process
 * can copy the data directly from the pages in the sending process.
 *
 * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
 * happen for small transfers so that the system will not spend all of
 * its time context switching.  PIPE_SIZE is constrained by the
 * amount of kernel virtual memory.
 */
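
/*
 * To make the split concrete, here is a condensed sketch of the choice
 * pipe_write() makes further below.  This is illustration only: the real
 * test also checks kva availability (amountpipekva vs. limitpipekva) and
 * falls back to the buffered path when the loan fails with ENOMEM.
 *
 *	if (uio->uio_iov->iov_len >= PIPE_MINDIRECT &&
 *	    (fp->f_flag & FNONBLOCK) == 0)
 *		error = pipe_direct_write(fp, wpipe, uio);  -- page loan
 *	else
 *		error = uiomove(...);           -- copy via kernel buffer
 */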

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sys_pipe.c,v 1.88 2007/12/05 17:19:58 pooka Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/kernel.h>
#include <sys/ttycom.h>
#include <sys/stat.h>
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/signalvar.h>
#include <sys/vnode.h>
#include <sys/uio.h>
#include <sys/lock.h>
#include <sys/select.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <uvm/uvm.h>
#include <sys/sysctl.h>
#include <sys/kauth.h>

#include <sys/pipe.h>

/*
 * Use this define if you want to disable *fancy* VM things.  Expect an
 * approx 30% decrease in transfer rate.
 */
/* #define PIPE_NODIRECT */

/*
 * Interfaces to the outside world.
 */
static int	pipe_read(struct file *fp, off_t *offset, struct uio *uio,
		    kauth_cred_t cred, int flags);
static int	pipe_write(struct file *fp, off_t *offset, struct uio *uio,
		    kauth_cred_t cred, int flags);
static int	pipe_close(struct file *fp, struct lwp *l);
static int	pipe_poll(struct file *fp, int events, struct lwp *l);
static int	pipe_kqfilter(struct file *fp, struct knote *kn);
static int	pipe_stat(struct file *fp, struct stat *sb, struct lwp *l);
static int	pipe_ioctl(struct file *fp, u_long cmd, void *data,
		    struct lwp *l);

static const struct fileops pipeops = {
	pipe_read, pipe_write, pipe_ioctl, fnullop_fcntl, pipe_poll,
	pipe_stat, pipe_close, pipe_kqfilter
};

/*
 * Default pipe buffer size(s); this can be kind-of large now because pipe
 * space is pageable.  The pipe code will try to maintain locality of
 * reference for performance reasons, so small amounts of outstanding I/O
 * will not wipe the cache.
 */
#define MINPIPESIZE (PIPE_SIZE/3)
#define MAXPIPESIZE (2*PIPE_SIZE/3)

/*
 * Maximum amount of kva for pipes -- this is kind-of a soft limit, but
 * is there so that on large systems, we don't exhaust it.
 */
#define MAXPIPEKVA (8*1024*1024)
static int maxpipekva = MAXPIPEKVA;

/*
 * Limit for direct transfers; we cannot, of course, limit
 * the amount of kva for pipes in general, though.
 */
#define LIMITPIPEKVA (16*1024*1024)
static int limitpipekva = LIMITPIPEKVA;

/*
 * Limit the number of "big" pipes.
 */
#define LIMITBIGPIPES	32
static int maxbigpipes = LIMITBIGPIPES;
static int nbigpipe = 0;

/*
 * Amount of KVA consumed by pipe buffers.
 */
static int amountpipekva = 0;

MALLOC_DEFINE(M_PIPE, "pipe", "Pipe structures");

static void	pipeclose(struct file *fp, struct pipe *pipe);
static void	pipe_free_kmem(struct pipe *pipe);
static int	pipe_create(struct pipe **pipep, int allockva);
static int	pipelock(struct pipe *pipe, int catch);
static inline void pipeunlock(struct pipe *pipe);
static void	pipeselwakeup(struct pipe *pipe, struct pipe *sigp, int code);
#ifndef PIPE_NODIRECT
static int	pipe_direct_write(struct file *fp, struct pipe *wpipe,
		    struct uio *uio);
#endif
static int	pipespace(struct pipe *pipe, int size);

#ifndef PIPE_NODIRECT
static int	pipe_loan_alloc(struct pipe *, int);
static void	pipe_loan_free(struct pipe *);
#endif /* PIPE_NODIRECT */

static krwlock_t pipe_peer_lock;
static pool_cache_t pipe_cache;

void
pipe_init(void)
{

	pipe_cache = pool_cache_init(sizeof(struct pipe), 0, 0, 0, "pipepl",
	    NULL, IPL_NONE, NULL, NULL, NULL);
	KASSERT(pipe_cache != NULL);
	rw_init(&pipe_peer_lock);
}

/*
 * The pipe system call for the DTYPE_PIPE type of pipes.
 */

/* ARGSUSED */
int
sys_pipe(struct lwp *l, void *v, register_t *retval)
{
	struct file *rf, *wf;
	struct pipe *rpipe, *wpipe;
	int fd, error;

	rpipe = wpipe = NULL;
	if (pipe_create(&rpipe, 1) || pipe_create(&wpipe, 0)) {
		pipeclose(NULL, rpipe);
		pipeclose(NULL, wpipe);
		return (ENFILE);
	}

	/*
	 * Note: the file structure returned from falloc() is marked
	 * as 'larval' initially.  Unless we mark it as 'mature' by
	 * FILE_SET_MATURE(), any attempt to do anything with it would
	 * return EBADF, including e.g. dup(2) or close(2).  This avoids
	 * file descriptor races if we block in the second falloc().
	 */

	error = falloc(l, &rf, &fd);
	if (error)
		goto free2;
	retval[0] = fd;
	rf->f_flag = FREAD;
	rf->f_type = DTYPE_PIPE;
	rf->f_data = (void *)rpipe;
	rf->f_ops = &pipeops;

	error = falloc(l, &wf, &fd);
	if (error)
		goto free3;
	retval[1] = fd;
	wf->f_flag = FWRITE;
	wf->f_type = DTYPE_PIPE;
	wf->f_data = (void *)wpipe;
	wf->f_ops = &pipeops;

	rpipe->pipe_peer = wpipe;
	wpipe->pipe_peer = rpipe;

	FILE_SET_MATURE(rf);
	FILE_SET_MATURE(wf);
	FILE_UNUSE(rf, l);
	FILE_UNUSE(wf, l);
	return (0);
free3:
	FILE_UNUSE(rf, l);
	ffree(rf);
	fdremove(l->l_proc->p_fd, retval[0]);
free2:
	pipeclose(NULL, wpipe);
	pipeclose(NULL, rpipe);

	return (error);
}
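
/*
 * From userland, the two descriptors created above behave as usual.
 * A minimal, illustration-only consumer of this syscall:
 *
 *	int fds[2];
 *	char buf[64];
 *
 *	if (pipe(fds) == -1)
 *		err(1, "pipe");
 *	write(fds[1], "hello", 5);		-- fds[1] is the write side
 *	read(fds[0], buf, sizeof(buf));		-- returns the 5 bytes
 */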

/*
 * Allocate kva for the pipe circular buffer; the space is pageable.
 * This routine will 'realloc' the size of a pipe safely: if allocation
 * fails, it retains the old buffer and returns ENOMEM.
 */
static int
pipespace(struct pipe *pipe, int size)
{
	void *buffer;

	/*
	 * Allocate pageable virtual address space.  Physical memory is
	 * allocated on demand.
	 */
	buffer = (void *) uvm_km_alloc(kernel_map, round_page(size), 0,
	    UVM_KMF_PAGEABLE);
	if (buffer == NULL)
		return (ENOMEM);

	/* Free old resources if we're resizing. */
	pipe_free_kmem(pipe);
	pipe->pipe_buffer.buffer = buffer;
	pipe->pipe_buffer.size = size;
	pipe->pipe_buffer.in = 0;
	pipe->pipe_buffer.out = 0;
	pipe->pipe_buffer.cnt = 0;
	amountpipekva += pipe->pipe_buffer.size;
	return (0);
}

/*
 * Initialize and allocate VM and memory for pipe.
 */
static int
pipe_create(struct pipe **pipep, int allockva)
{
	struct pipe *pipe;
	int error;

	pipe = *pipep = pool_cache_get(pipe_cache, PR_WAITOK);

	/* Initialize */
	memset(pipe, 0, sizeof(struct pipe));
	pipe->pipe_state = PIPE_SIGNALR;

	getmicrotime(&pipe->pipe_ctime);
	pipe->pipe_atime = pipe->pipe_ctime;
	pipe->pipe_mtime = pipe->pipe_ctime;
	mutex_init(&pipe->pipe_lock, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&pipe->pipe_cv, "pipe");
	cv_init(&pipe->pipe_lkcv, "pipelk");
	selinit(&pipe->pipe_sel);

	if (allockva && (error = pipespace(pipe, PIPE_SIZE)))
		return (error);

	return (0);
}


/*
 * Lock a pipe for I/O, blocking other access.
 * Called with the pipe mutex held.
 * Returns with the pipe mutex released on success.
 */
static int
pipelock(struct pipe *pipe, int catch)
{
	int error;

	KASSERT(mutex_owned(&pipe->pipe_lock));

	while (pipe->pipe_state & PIPE_LOCKFL) {
		pipe->pipe_state |= PIPE_LWANT;
		if (catch) {
			error = cv_wait_sig(&pipe->pipe_lkcv,
			    &pipe->pipe_lock);
			if (error != 0)
				return error;
		} else
			cv_wait(&pipe->pipe_lkcv, &pipe->pipe_lock);
	}

	pipe->pipe_state |= PIPE_LOCKFL;
	mutex_exit(&pipe->pipe_lock);

	return 0;
}

/*
 * Unlock a pipe I/O lock.
 */
static inline void
pipeunlock(struct pipe *pipe)
{

	KASSERT(pipe->pipe_state & PIPE_LOCKFL);

	pipe->pipe_state &= ~PIPE_LOCKFL;
	if (pipe->pipe_state & PIPE_LWANT) {
		pipe->pipe_state &= ~PIPE_LWANT;
		cv_broadcast(&pipe->pipe_lkcv);
	}
}
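
/*
 * The long-term lock and the mutex are used together in a fixed pattern
 * throughout this file.  A minimal sketch of that pattern, with error
 * handling omitted (pipe_read() below is the canonical instance):
 *
 *	mutex_enter(&pipe->pipe_lock);
 *	error = pipelock(pipe, 1);	-- returns with the mutex released
 *	... operate on pipe_buffer without holding the mutex ...
 *	mutex_enter(&pipe->pipe_lock);
 *	pipeunlock(pipe);		-- caller still holds the mutex
 *	mutex_exit(&pipe->pipe_lock);
 */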

/*
 * Select/poll wakeup.  This also sends SIGIO to the peer connected to
 * the 'sigpipe' side of the pipe.
 */
static void
pipeselwakeup(struct pipe *selp, struct pipe *sigp, int code)
{
	int band;

	selnotify(&selp->pipe_sel, NOTE_SUBMIT);

	if (sigp == NULL || (sigp->pipe_state & PIPE_ASYNC) == 0)
		return;

	switch (code) {
	case POLL_IN:
		band = POLLIN|POLLRDNORM;
		break;
	case POLL_OUT:
		band = POLLOUT|POLLWRNORM;
		break;
	case POLL_HUP:
		band = POLLHUP;
		break;
#if POLL_HUP != POLL_ERR
	case POLL_ERR:
		band = POLLERR;
		break;
#endif
	default:
		band = 0;
#ifdef DIAGNOSTIC
		printf("bad siginfo code %d in pipe notification.\n", code);
#endif
		break;
	}

	fownsignal(sigp->pipe_pgid, SIGIO, code, band, selp);
}

/* ARGSUSED */
static int
pipe_read(struct file *fp, off_t *offset, struct uio *uio, kauth_cred_t cred,
    int flags)
{
	struct pipe *rpipe = (struct pipe *) fp->f_data;
	struct pipebuf *bp = &rpipe->pipe_buffer;
	int error;
	size_t nread = 0;
	size_t size;
	size_t ocnt;

	mutex_enter(&rpipe->pipe_lock);
	++rpipe->pipe_busy;
	ocnt = bp->cnt;

again:
	error = pipelock(rpipe, 1);
	if (error)
		goto unlocked_error;

	while (uio->uio_resid) {
		/*
		 * Normal pipe buffer receive.
		 */
		if (bp->cnt > 0) {
			size = bp->size - bp->out;
			if (size > bp->cnt)
				size = bp->cnt;
			if (size > uio->uio_resid)
				size = uio->uio_resid;

			error = uiomove((char *)bp->buffer + bp->out, size,
			    uio);
			if (error)
				break;

			bp->out += size;
			if (bp->out >= bp->size)
				bp->out = 0;

			bp->cnt -= size;

			/*
			 * If there is no more to read in the pipe, reset
			 * its pointers to the beginning.  This improves
			 * cache hit stats.
			 */
			if (bp->cnt == 0) {
				bp->in = 0;
				bp->out = 0;
			}
			nread += size;
			continue;
		}

		/* Lock to see up-to-date value of pipe_state. */
		mutex_enter(&rpipe->pipe_lock);

#ifndef PIPE_NODIRECT
		if ((rpipe->pipe_state & PIPE_DIRECTR) != 0) {
			/*
			 * Direct copy, bypassing a kernel buffer.
			 */
			void *va;

			KASSERT(rpipe->pipe_state & PIPE_DIRECTW);
			mutex_exit(&rpipe->pipe_lock);

			size = rpipe->pipe_map.cnt;
			if (size > uio->uio_resid)
				size = uio->uio_resid;

			va = (char *)rpipe->pipe_map.kva + rpipe->pipe_map.pos;
			error = uiomove(va, size, uio);
			if (error)
				break;
			nread += size;
			rpipe->pipe_map.pos += size;
			rpipe->pipe_map.cnt -= size;
			if (rpipe->pipe_map.cnt == 0) {
				mutex_enter(&rpipe->pipe_lock);
				rpipe->pipe_state &= ~PIPE_DIRECTR;
				cv_broadcast(&rpipe->pipe_cv);
				mutex_exit(&rpipe->pipe_lock);
			}
			continue;
		}
#endif
		/*
		 * Break if some data was read.
		 */
		if (nread > 0) {
			mutex_exit(&rpipe->pipe_lock);
			break;
		}

		/*
		 * Detect EOF condition.
		 * Read returns 0 on EOF, no need to set error.
		 */
		if (rpipe->pipe_state & PIPE_EOF) {
			mutex_exit(&rpipe->pipe_lock);
			break;
		}

		/*
		 * Don't block on non-blocking I/O.
		 */
		if (fp->f_flag & FNONBLOCK) {
			mutex_exit(&rpipe->pipe_lock);
			error = EAGAIN;
			break;
		}

		/*
		 * Unlock the pipe buffer for our remaining processing.
		 * We will either break out with an error or we will
		 * sleep and relock to loop.
		 */
		pipeunlock(rpipe);

		/*
		 * Re-check to see if more direct writes are pending.
		 */
		if ((rpipe->pipe_state & PIPE_DIRECTR) != 0)
			goto again;

		/*
		 * We want to read more, wake up select/poll.
		 */
		pipeselwakeup(rpipe, rpipe->pipe_peer, POLL_IN);

		/*
		 * If the "write-side" is blocked, wake it up now.
		 */
		if (rpipe->pipe_state & PIPE_WANTW) {
			rpipe->pipe_state &= ~PIPE_WANTW;
			cv_broadcast(&rpipe->pipe_cv);
		}

		/* Now wait until the pipe is filled. */
		rpipe->pipe_state |= PIPE_WANTR;
		error = cv_wait_sig(&rpipe->pipe_cv, &rpipe->pipe_lock);
		if (error != 0)
			goto unlocked_error;
		goto again;
	}

	if (error == 0)
		getmicrotime(&rpipe->pipe_atime);

	mutex_enter(&rpipe->pipe_lock);
	pipeunlock(rpipe);

unlocked_error:
	--rpipe->pipe_busy;

	/*
	 * PIPE_WANTCLOSE processing only makes sense if pipe_busy is 0.
	 */
	if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANTCLOSE)) {
		rpipe->pipe_state &= ~(PIPE_WANTCLOSE|PIPE_WANTW);
		cv_broadcast(&rpipe->pipe_cv);
	} else if (bp->cnt < MINPIPESIZE) {
		/*
		 * Handle write blocking hysteresis.
		 */
		if (rpipe->pipe_state & PIPE_WANTW) {
			rpipe->pipe_state &= ~PIPE_WANTW;
			cv_broadcast(&rpipe->pipe_cv);
		}
	}

	/*
	 * If anything was read off the buffer, signal to the writer that
	 * it's possible to write more data.  Also send a signal if we are
	 * here for the first time after the last write.
	 */
	if ((bp->size - bp->cnt) >= PIPE_BUF
	    && (ocnt != bp->cnt || (rpipe->pipe_state & PIPE_SIGNALR))) {
		pipeselwakeup(rpipe, rpipe->pipe_peer, POLL_OUT);
		rpipe->pipe_state &= ~PIPE_SIGNALR;
	}

	mutex_exit(&rpipe->pipe_lock);
	return (error);
}
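
/*
 * The buffered path above is a plain ring buffer.  A worked example of
 * the consume step, assuming (hypothetically) size = 8, in = 2, out = 6,
 * cnt = 4 and a large uio_resid:
 *
 *	pass 1: size = bp->size - bp->out = 2, so uiomove() copies the
 *	        2 bytes at offsets 6..7; out wraps from 8 to 0, cnt = 2.
 *	pass 2: size = 8, clamped to cnt = 2; uiomove() copies the 2 bytes
 *	        at offsets 0..1; cnt = 0, so in and out are reset to 0.
 */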

#ifndef PIPE_NODIRECT
/*
 * Allocate structure for loan transfer.
 */
static int
pipe_loan_alloc(struct pipe *wpipe, int npages)
{
	vsize_t len;

	len = (vsize_t)npages << PAGE_SHIFT;
	wpipe->pipe_map.kva = uvm_km_alloc(kernel_map, len, 0,
	    UVM_KMF_VAONLY | UVM_KMF_WAITVA);
	if (wpipe->pipe_map.kva == 0)
		return (ENOMEM);

	amountpipekva += len;
	wpipe->pipe_map.npages = npages;
	wpipe->pipe_map.pgs = malloc(npages * sizeof(struct vm_page *), M_PIPE,
	    M_WAITOK);
	return (0);
}

/*
 * Free resources allocated for loan transfer.
 */
static void
pipe_loan_free(struct pipe *wpipe)
{
	vsize_t len;

	len = (vsize_t)wpipe->pipe_map.npages << PAGE_SHIFT;
	uvm_km_free(kernel_map, wpipe->pipe_map.kva, len, UVM_KMF_VAONLY);
	wpipe->pipe_map.kva = 0;
	amountpipekva -= len;
	free(wpipe->pipe_map.pgs, M_PIPE);
	wpipe->pipe_map.pgs = NULL;
}
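
/*
 * Sizing example: on a machine with 4 KB pages (PAGE_SHIFT == 12), a
 * 16-page loan gives len = 16 << 12 = 65536 bytes of reserved kva, plus
 * a pgs array of 16 vm_page pointers; only the kva is charged against
 * amountpipekva.
 */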

/*
 * NetBSD direct write, using the uvm_loan() mechanism.
 * This implements the pipe buffer write mechanism.  Note that only
 * a direct write OR a normal pipe write can be pending at any given time.
 * If there are any characters in the pipe buffer, the direct write will
 * be deferred until the receiving process grabs all of the bytes from
 * the pipe buffer.  Then the direct mapping write is set up.
 *
 * Called with the long-term pipe lock held.
 */
static int
pipe_direct_write(struct file *fp, struct pipe *wpipe, struct uio *uio)
{
	int error, npages, j;
	struct vm_page **pgs;
	vaddr_t bbase, kva, base, bend;
	vsize_t blen, bcnt;
	voff_t bpos;

	KASSERT(wpipe->pipe_map.cnt == 0);

	/*
	 * Handle the first PIPE_DIRECT_CHUNK bytes of the buffer.  Deal
	 * with buffers not aligned to PAGE_SIZE.
	 */
	bbase = (vaddr_t)uio->uio_iov->iov_base;
	base = trunc_page(bbase);
	bend = round_page(bbase + uio->uio_iov->iov_len);
	blen = bend - base;
	bpos = bbase - base;

	if (blen > PIPE_DIRECT_CHUNK) {
		blen = PIPE_DIRECT_CHUNK;
		bend = base + blen;
		bcnt = PIPE_DIRECT_CHUNK - bpos;
	} else {
		bcnt = uio->uio_iov->iov_len;
	}
	npages = blen >> PAGE_SHIFT;

	/*
	 * Free the old kva if we need more pages than we have
	 * allocated.
	 */
	if (wpipe->pipe_map.kva != 0 && npages > wpipe->pipe_map.npages)
		pipe_loan_free(wpipe);

	/* Allocate new kva. */
	if (wpipe->pipe_map.kva == 0) {
		error = pipe_loan_alloc(wpipe, npages);
		if (error)
			return (error);
	}

	/* Loan the write buffer memory from writer process */
	pgs = wpipe->pipe_map.pgs;
	error = uvm_loan(&uio->uio_vmspace->vm_map, base, blen,
	    pgs, UVM_LOAN_TOPAGE);
	if (error) {
		pipe_loan_free(wpipe);
		/* ENOMEM makes the caller fall back to ordinary write */
		return (ENOMEM);
	}

	/* Enter the loaned pages to kva */
	kva = wpipe->pipe_map.kva;
	for (j = 0; j < npages; j++, kva += PAGE_SIZE) {
		pmap_kenter_pa(kva, VM_PAGE_TO_PHYS(pgs[j]), VM_PROT_READ);
	}
	pmap_update(pmap_kernel());

	/* Now we can put the pipe in direct write mode */
	wpipe->pipe_map.pos = bpos;
	wpipe->pipe_map.cnt = bcnt;

	/*
	 * But before we can let someone do a direct read, we
	 * have to wait until the pipe is drained.  Release the
	 * pipe lock while we wait.
	 */
	mutex_enter(&wpipe->pipe_lock);
	wpipe->pipe_state |= PIPE_DIRECTW;
	pipeunlock(wpipe);

	while (error == 0 && wpipe->pipe_buffer.cnt > 0) {
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			cv_broadcast(&wpipe->pipe_cv);
		}

		wpipe->pipe_state |= PIPE_WANTW;
		error = cv_wait_sig(&wpipe->pipe_cv, &wpipe->pipe_lock);
		if (error == 0 && wpipe->pipe_state & PIPE_EOF)
			error = EPIPE;
	}

	/* Pipe is drained; the next read will come off the direct buffer. */
	wpipe->pipe_state |= PIPE_DIRECTR;

	/* Wait until the reader is done */
	while (error == 0 && (wpipe->pipe_state & PIPE_DIRECTR)) {
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			cv_broadcast(&wpipe->pipe_cv);
		}
		pipeselwakeup(wpipe, wpipe, POLL_IN);
		error = cv_wait_sig(&wpipe->pipe_cv, &wpipe->pipe_lock);
		if (error == 0 && wpipe->pipe_state & PIPE_EOF)
			error = EPIPE;
	}

	/* Take pipe out of direct write mode */
	wpipe->pipe_state &= ~(PIPE_DIRECTW | PIPE_DIRECTR);

	/* Acquire the pipe lock and cleanup */
	(void)pipelock(wpipe, 0);

	if (pgs != NULL) {
		pmap_kremove(wpipe->pipe_map.kva, blen);
		uvm_unloan(pgs, npages, UVM_LOAN_TOPAGE);
	}
	if (error || amountpipekva > maxpipekva)
		pipe_loan_free(wpipe);

	if (error) {
		pipeselwakeup(wpipe, wpipe, POLL_ERR);

		/*
		 * If nothing was read from what we offered, return the
		 * error straight away.  Otherwise update uio resid first.
		 * The caller will deal with the error condition, returning
		 * a short write, the error, or restarting the write(2) as
		 * appropriate.
		 */
		if (wpipe->pipe_map.cnt == bcnt) {
			wpipe->pipe_map.cnt = 0;
			cv_broadcast(&wpipe->pipe_cv);
			return (error);
		}

		bcnt -= wpipe->pipe_map.cnt;
	}

	uio->uio_resid -= bcnt;
	/* uio_offset not updated, not set/used for write(2) */
	uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + bcnt;
	uio->uio_iov->iov_len -= bcnt;
	if (uio->uio_iov->iov_len == 0) {
		uio->uio_iov++;
		uio->uio_iovcnt--;
	}

	wpipe->pipe_map.cnt = 0;
	return (error);
}
#endif /* !PIPE_NODIRECT */
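
/*
 * The alignment arithmetic at the top of pipe_direct_write(), worked
 * through on hypothetical numbers with 4 KB pages and a transfer well
 * below PIPE_DIRECT_CHUNK: a write of 10000 bytes at user address
 * 0x20123 gives
 *
 *	base = trunc_page(0x20123)          = 0x20000
 *	bend = round_page(0x20123 + 0x2710) = 0x23000
 *	blen = bend - base                  = 0x3000  (npages = 3)
 *	bpos = bbase - base                 = 0x123
 *	bcnt = iov_len                      = 10000
 *
 * i.e. three whole pages are loaned, and the reader copies bcnt bytes
 * starting bpos bytes into the mapping.
 */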

static int
pipe_write(struct file *fp, off_t *offset, struct uio *uio, kauth_cred_t cred,
    int flags)
{
	struct pipe *wpipe, *rpipe;
	struct pipebuf *bp;
	int error;

	/* We want to write to our peer */
	rpipe = (struct pipe *) fp->f_data;

retry:
	error = 0;
	mutex_enter(&rpipe->pipe_lock);
	wpipe = rpipe->pipe_peer;

	/*
	 * Detect loss of pipe read side, issue SIGPIPE if lost.
	 */
	if (wpipe == NULL)
		error = EPIPE;
	else if (mutex_tryenter(&wpipe->pipe_lock) == 0) {
		/* Deal with race for peer */
		mutex_exit(&rpipe->pipe_lock);
		/* XXX Might be about to deadlock w/kernel_lock. */
		yield();
		goto retry;
	} else if ((wpipe->pipe_state & PIPE_EOF) != 0) {
		mutex_exit(&wpipe->pipe_lock);
		error = EPIPE;
	}

	mutex_exit(&rpipe->pipe_lock);
	if (error != 0)
		return (error);

	++wpipe->pipe_busy;

	/* Acquire the long-term pipe lock */
	if ((error = pipelock(wpipe, 1)) != 0) {
		--wpipe->pipe_busy;
		if (wpipe->pipe_busy == 0
		    && (wpipe->pipe_state & PIPE_WANTCLOSE)) {
			wpipe->pipe_state &= ~(PIPE_WANTCLOSE | PIPE_WANTR);
			cv_broadcast(&wpipe->pipe_cv);
		}
		mutex_exit(&wpipe->pipe_lock);
		return (error);
	}

	bp = &wpipe->pipe_buffer;

	/*
	 * If it is advantageous to resize the pipe buffer, do so.
	 */
	if ((uio->uio_resid > PIPE_SIZE) &&
	    (nbigpipe < maxbigpipes) &&
#ifndef PIPE_NODIRECT
	    (wpipe->pipe_state & PIPE_DIRECTW) == 0 &&
#endif
	    (bp->size <= PIPE_SIZE) && (bp->cnt == 0)) {

		if (pipespace(wpipe, BIG_PIPE_SIZE) == 0)
			nbigpipe++;
	}

	while (uio->uio_resid) {
		size_t space;

#ifndef PIPE_NODIRECT
		/*
		 * Pipe buffered writes cannot coexist with direct writes.
		 * Also, only one direct write can be in progress at any
		 * one time.  We wait until the currently executing direct
		 * write is completed before continuing.
		 *
		 * We break out if a signal occurs or the reader goes away.
		 */
		while (error == 0 && wpipe->pipe_state & PIPE_DIRECTW) {
			mutex_enter(&wpipe->pipe_lock);
			if (wpipe->pipe_state & PIPE_WANTR) {
				wpipe->pipe_state &= ~PIPE_WANTR;
				cv_broadcast(&wpipe->pipe_cv);
			}
			pipeunlock(wpipe);
			error = cv_wait_sig(&wpipe->pipe_cv,
			    &wpipe->pipe_lock);

			(void)pipelock(wpipe, 0);
			if (wpipe->pipe_state & PIPE_EOF)
				error = EPIPE;
		}
		if (error)
			break;

		/*
		 * If the transfer is large, we can gain performance if
		 * we do process-to-process copies directly.
		 * If the write is non-blocking, we don't use the
		 * direct write mechanism.
		 *
		 * The direct write mechanism will detect the reader going
		 * away on us.
		 */
		if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) &&
		    (fp->f_flag & FNONBLOCK) == 0 &&
		    (wpipe->pipe_map.kva || (amountpipekva < limitpipekva))) {
			error = pipe_direct_write(fp, wpipe, uio);

			/*
			 * Break out if an error occurred, unless it's
			 * ENOMEM.  ENOMEM means we failed to allocate
			 * some resources for the direct write, so we just
			 * fall back to an ordinary write.  If the direct
			 * write was successful, process the rest of the
			 * data via an ordinary write.
			 */
			if (error == 0)
				continue;

			if (error != ENOMEM)
				break;
		}
#endif /* PIPE_NODIRECT */

		space = bp->size - bp->cnt;

		/* Writes of size <= PIPE_BUF must be atomic. */
		if ((space < uio->uio_resid) && (uio->uio_resid <= PIPE_BUF))
			space = 0;

		if (space > 0) {
			int size;	/* Transfer size */
			int segsize;	/* first segment to transfer */

			/*
			 * Transfer size is minimum of uio transfer
			 * and free space in pipe buffer.
			 */
			if (space > uio->uio_resid)
				size = uio->uio_resid;
			else
				size = space;
			/*
			 * First segment to transfer is minimum of
			 * transfer size and contiguous space in
			 * pipe buffer.  If first segment to transfer
			 * is less than the transfer size, we've got
			 * a wraparound in the buffer.
			 */
			segsize = bp->size - bp->in;
			if (segsize > size)
				segsize = size;

			/* Transfer first segment */
			error = uiomove((char *)bp->buffer + bp->in, segsize,
			    uio);

			if (error == 0 && segsize < size) {
				/*
				 * Transfer remaining part now, to
				 * support atomic writes.  Wraparound
				 * happened.
				 */
#ifdef DEBUG
				if (bp->in + segsize != bp->size)
					panic("Expected pipe buffer wraparound disappeared");
#endif

				error = uiomove(bp->buffer,
				    size - segsize, uio);
			}
			if (error)
				break;

			bp->in += size;
			if (bp->in >= bp->size) {
#ifdef DEBUG
				if (bp->in != size - segsize + bp->size)
					panic("Expected wraparound bad");
#endif
				bp->in = size - segsize;
			}

			bp->cnt += size;
#ifdef DEBUG
			if (bp->cnt > bp->size)
				panic("Pipe buffer overflow");
#endif
		} else {
			/*
			 * If the "read-side" has been blocked, wake it up
			 * now.
			 */
			mutex_enter(&wpipe->pipe_lock);
			if (wpipe->pipe_state & PIPE_WANTR) {
				wpipe->pipe_state &= ~PIPE_WANTR;
				cv_broadcast(&wpipe->pipe_cv);
			}
			mutex_exit(&wpipe->pipe_lock);

			/*
			 * Don't block on non-blocking I/O.
			 */
			if (fp->f_flag & FNONBLOCK) {
				error = EAGAIN;
				break;
			}

			/*
			 * We have no more space and have something to offer,
			 * wake up select/poll.
			 */
			if (bp->cnt)
				pipeselwakeup(wpipe, wpipe, POLL_OUT);

			mutex_enter(&wpipe->pipe_lock);
			pipeunlock(wpipe);
			wpipe->pipe_state |= PIPE_WANTW;
			error = cv_wait_sig(&wpipe->pipe_cv,
			    &wpipe->pipe_lock);
			(void)pipelock(wpipe, 0);
			if (error != 0)
				break;
			/*
			 * If the read side wants to go away, we just issue
			 * a signal to ourselves.
			 */
			if (wpipe->pipe_state & PIPE_EOF) {
				error = EPIPE;
				break;
			}
		}
	}

	mutex_enter(&wpipe->pipe_lock);
	--wpipe->pipe_busy;
	if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANTCLOSE)) {
		wpipe->pipe_state &= ~(PIPE_WANTCLOSE | PIPE_WANTR);
		cv_broadcast(&wpipe->pipe_cv);
	} else if (bp->cnt > 0) {
		/*
		 * If we have put any characters in the buffer, we wake up
		 * the reader.
		 */
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			cv_broadcast(&wpipe->pipe_cv);
		}
	}

	/*
	 * Don't return EPIPE if I/O was successful.
	 */
	if (error == EPIPE && bp->cnt == 0 && uio->uio_resid == 0)
		error = 0;

	if (error == 0)
		getmicrotime(&wpipe->pipe_mtime);

	/*
	 * We have something to offer, wake up select/poll.
	 * wpipe->pipe_map.cnt is always 0 at this point (direct write
	 * is only done synchronously), so check only wpipe->pipe_buffer.cnt.
	 */
	if (bp->cnt)
		pipeselwakeup(wpipe, wpipe, POLL_OUT);

	/*
	 * Arrange for next read(2) to do a signal.
	 */
	wpipe->pipe_state |= PIPE_SIGNALR;

	pipeunlock(wpipe);
	mutex_exit(&wpipe->pipe_lock);
	return (error);
}
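
/*
 * A hypothetical userland illustration of the two rules enforced above
 * (PIPE_BUF atomicity and FNONBLOCK), assuming the pipe has already been
 * filled to within PIPE_BUF bytes of capacity:
 *
 *	char chunk[PIPE_BUF];
 *
 *	fcntl(fds[1], F_SETFL, O_NONBLOCK);
 *	if (write(fds[1], chunk, sizeof(chunk)) == -1 && errno == EAGAIN)
 *		;	-- the atomic write cannot fit, so nothing at all
 *			-- was written (space was forced to 0 above)
 */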

/*
 * We implement a very minimal set of ioctls for compatibility with sockets.
 */
static int
pipe_ioctl(struct file *fp, u_long cmd, void *data, struct lwp *l)
{
	struct pipe *pipe = (struct pipe *)fp->f_data;
	struct proc *p = l->l_proc;

	switch (cmd) {

	case FIONBIO:
		return (0);

	case FIOASYNC:
		mutex_enter(&pipe->pipe_lock);
		if (*(int *)data) {
			pipe->pipe_state |= PIPE_ASYNC;
		} else {
			pipe->pipe_state &= ~PIPE_ASYNC;
		}
		mutex_exit(&pipe->pipe_lock);
		return (0);

	case FIONREAD:
		mutex_enter(&pipe->pipe_lock);
#ifndef PIPE_NODIRECT
		if (pipe->pipe_state & PIPE_DIRECTW)
			*(int *)data = pipe->pipe_map.cnt;
		else
#endif
			*(int *)data = pipe->pipe_buffer.cnt;
		mutex_exit(&pipe->pipe_lock);
		return (0);

	case FIONWRITE:
		/* Look at other side */
		rw_enter(&pipe_peer_lock, RW_READER);
		pipe = pipe->pipe_peer;
		mutex_enter(&pipe->pipe_lock);
#ifndef PIPE_NODIRECT
		if (pipe->pipe_state & PIPE_DIRECTW)
			*(int *)data = pipe->pipe_map.cnt;
		else
#endif
			*(int *)data = pipe->pipe_buffer.cnt;
		mutex_exit(&pipe->pipe_lock);
		rw_exit(&pipe_peer_lock);
		return (0);

	case FIONSPACE:
		/* Look at other side */
		rw_enter(&pipe_peer_lock, RW_READER);
		pipe = pipe->pipe_peer;
		mutex_enter(&pipe->pipe_lock);
#ifndef PIPE_NODIRECT
		/*
		 * If we're in direct-mode, we don't really have a
		 * send queue, and any other write will block.  Thus
		 * zero seems like the best answer.
		 */
		if (pipe->pipe_state & PIPE_DIRECTW)
			*(int *)data = 0;
		else
#endif
			*(int *)data = pipe->pipe_buffer.size -
			    pipe->pipe_buffer.cnt;
		mutex_exit(&pipe->pipe_lock);
		rw_exit(&pipe_peer_lock);
		return (0);

	case TIOCSPGRP:
	case FIOSETOWN:
		return fsetown(p, &pipe->pipe_pgid, cmd, data);

	case TIOCGPGRP:
	case FIOGETOWN:
		return fgetown(p, pipe->pipe_pgid, cmd, data);

	}
	return (EPASSTHROUGH);
}
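
/*
 * For reference, a minimal userland use of FIONREAD on the read end of
 * a pipe (illustration only):
 *
 *	int n;
 *
 *	if (ioctl(fds[0], FIONREAD, &n) == 0)
 *		printf("%d bytes buffered\n", n);
 */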

static int
pipe_poll(struct file *fp, int events, struct lwp *l)
{
	struct pipe *rpipe = (struct pipe *)fp->f_data;
	struct pipe *wpipe;
	int eof = 0;
	int revents = 0;

retry:
	mutex_enter(&rpipe->pipe_lock);
	wpipe = rpipe->pipe_peer;
	if (wpipe != NULL && mutex_tryenter(&wpipe->pipe_lock) == 0) {
		/* Deal with race for peer */
		mutex_exit(&rpipe->pipe_lock);
		/* XXX Might be about to deadlock w/kernel_lock. */
		yield();
		goto retry;
	}

	if (events & (POLLIN | POLLRDNORM))
		if ((rpipe->pipe_buffer.cnt > 0) ||
#ifndef PIPE_NODIRECT
		    (rpipe->pipe_state & PIPE_DIRECTR) ||
#endif
		    (rpipe->pipe_state & PIPE_EOF))
			revents |= events & (POLLIN | POLLRDNORM);

	eof |= (rpipe->pipe_state & PIPE_EOF);
	mutex_exit(&rpipe->pipe_lock);

	if (wpipe == NULL)
		revents |= events & (POLLOUT | POLLWRNORM);
	else {
		if (events & (POLLOUT | POLLWRNORM))
			if ((wpipe->pipe_state & PIPE_EOF) || (
#ifndef PIPE_NODIRECT
			    (wpipe->pipe_state & PIPE_DIRECTW) == 0 &&
#endif
			    (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt)
			    >= PIPE_BUF))
				revents |= events & (POLLOUT | POLLWRNORM);

		eof |= (wpipe->pipe_state & PIPE_EOF);
		mutex_exit(&wpipe->pipe_lock);
	}

	if (wpipe == NULL || eof)
		revents |= POLLHUP;

	if (revents == 0) {
		if (events & (POLLIN | POLLRDNORM))
			selrecord(l, &rpipe->pipe_sel);

		if (events & (POLLOUT | POLLWRNORM))
			selrecord(l, &wpipe->pipe_sel);
	}

	return (revents);
}

static int
pipe_stat(struct file *fp, struct stat *ub, struct lwp *l)
{
	struct pipe *pipe = (struct pipe *)fp->f_data;

	rw_enter(&pipe_peer_lock, RW_READER);

	memset((void *)ub, 0, sizeof(*ub));
	ub->st_mode = S_IFIFO | S_IRUSR | S_IWUSR;
	ub->st_blksize = pipe->pipe_buffer.size;
	if (ub->st_blksize == 0 && pipe->pipe_peer)
		ub->st_blksize = pipe->pipe_peer->pipe_buffer.size;
	ub->st_size = pipe->pipe_buffer.cnt;
	ub->st_blocks = (ub->st_size) ? 1 : 0;
	TIMEVAL_TO_TIMESPEC(&pipe->pipe_atime, &ub->st_atimespec);
	TIMEVAL_TO_TIMESPEC(&pipe->pipe_mtime, &ub->st_mtimespec);
	TIMEVAL_TO_TIMESPEC(&pipe->pipe_ctime, &ub->st_ctimespec);
	ub->st_uid = kauth_cred_geteuid(fp->f_cred);
	ub->st_gid = kauth_cred_getegid(fp->f_cred);

	rw_exit(&pipe_peer_lock);

	/*
	 * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen.
	 * XXX (st_dev, st_ino) should be unique.
	 */
	return (0);
}
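
/*
 * Observable behaviour of pipe_poll() from userland, as a hypothetical
 * sketch: once the writer closes its end, a poll on the read descriptor
 * returns POLLHUP (and, per the EOF test above, POLLIN as well, so a
 * subsequent read(2) can observe EOF).
 *
 *	struct pollfd pfd = { .fd = fds[0], .events = POLLIN };
 *
 *	close(fds[1]);
 *	poll(&pfd, 1, -1);	-- pfd.revents has POLLHUP | POLLIN set
 */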

/* ARGSUSED */
static int
pipe_close(struct file *fp, struct lwp *l)
{
	struct pipe *pipe = (struct pipe *)fp->f_data;

	fp->f_data = NULL;
	pipeclose(fp, pipe);
	return (0);
}

static void
pipe_free_kmem(struct pipe *pipe)
{

	if (pipe->pipe_buffer.buffer != NULL) {
		if (pipe->pipe_buffer.size > PIPE_SIZE)
			--nbigpipe;
		amountpipekva -= pipe->pipe_buffer.size;
		uvm_km_free(kernel_map,
		    (vaddr_t)pipe->pipe_buffer.buffer,
		    pipe->pipe_buffer.size, UVM_KMF_PAGEABLE);
		pipe->pipe_buffer.buffer = NULL;
	}
#ifndef PIPE_NODIRECT
	if (pipe->pipe_map.kva != 0) {
		pipe_loan_free(pipe);
		pipe->pipe_map.cnt = 0;
		pipe->pipe_map.kva = 0;
		pipe->pipe_map.pos = 0;
		pipe->pipe_map.npages = 0;
	}
#endif /* !PIPE_NODIRECT */
}

/*
 * Shut down the pipe.
 */
static void
pipeclose(struct file *fp, struct pipe *pipe)
{
	struct pipe *ppipe;

	if (pipe == NULL)
		return;

retry:
	rw_enter(&pipe_peer_lock, RW_WRITER);
	mutex_enter(&pipe->pipe_lock);

	pipeselwakeup(pipe, pipe, POLL_HUP);

	/*
	 * If the other side is blocked, wake it up saying that
	 * we want to close it down.
	 */
	pipe->pipe_state |= PIPE_EOF;
	if (pipe->pipe_busy) {
		rw_exit(&pipe_peer_lock);
		while (pipe->pipe_busy) {
			cv_broadcast(&pipe->pipe_cv);
			pipe->pipe_state |= PIPE_WANTCLOSE;
			cv_wait_sig(&pipe->pipe_cv, &pipe->pipe_lock);
		}
		if (!rw_tryenter(&pipe_peer_lock, RW_READER)) {
			mutex_exit(&pipe->pipe_lock);
			/* XXX Might be about to deadlock w/kernel_lock. */
			yield();
			goto retry;
		}
	}

	/*
	 * Disconnect from peer.
	 */
	if ((ppipe = pipe->pipe_peer) != NULL) {
		/* Deal with race for peer */
		if (mutex_tryenter(&ppipe->pipe_lock) == 0) {
			mutex_exit(&pipe->pipe_lock);
			rw_exit(&pipe_peer_lock);
			/* XXX Might be about to deadlock w/kernel_lock. */
			yield();
			goto retry;
		}
		pipeselwakeup(ppipe, ppipe, POLL_HUP);

		ppipe->pipe_state |= PIPE_EOF;
		cv_broadcast(&ppipe->pipe_cv);
		ppipe->pipe_peer = NULL;
		mutex_exit(&ppipe->pipe_lock);
	}

	KASSERT((pipe->pipe_state & PIPE_LOCKFL) == 0);

	mutex_exit(&pipe->pipe_lock);
	rw_exit(&pipe_peer_lock);

	/*
	 * Free resources.
	 */
	pipe_free_kmem(pipe);
	mutex_destroy(&pipe->pipe_lock);
	cv_destroy(&pipe->pipe_cv);
	cv_destroy(&pipe->pipe_lkcv);
	seldestroy(&pipe->pipe_sel);
	pool_cache_put(pipe_cache, pipe);
}

static void
filt_pipedetach(struct knote *kn)
{
	struct pipe *pipe = (struct pipe *)kn->kn_fp->f_data;

	rw_enter(&pipe_peer_lock, RW_READER);

	switch (kn->kn_filter) {
	case EVFILT_WRITE:
		/* Need the peer structure, not our own. */
		pipe = pipe->pipe_peer;

		/* If the reader end is already closed, just return. */
		if (pipe == NULL) {
			rw_exit(&pipe_peer_lock);
			return;
		}

		break;
	default:
		/* Nothing to do. */
		break;
	}

#ifdef DIAGNOSTIC
	if (kn->kn_hook != pipe)
		panic("filt_pipedetach: inconsistent knote");
#endif

	mutex_enter(&pipe->pipe_lock);
	SLIST_REMOVE(&pipe->pipe_sel.sel_klist, kn, knote, kn_selnext);
	mutex_exit(&pipe->pipe_lock);
	rw_exit(&pipe_peer_lock);
}

/*ARGSUSED*/
static int
filt_piperead(struct knote *kn, long hint)
{
	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
	struct pipe *wpipe;

	if ((hint & NOTE_SUBMIT) == 0) {
		rw_enter(&pipe_peer_lock, RW_READER);
		mutex_enter(&rpipe->pipe_lock);
	}
	wpipe = rpipe->pipe_peer;
	kn->kn_data = rpipe->pipe_buffer.cnt;

	if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW))
		kn->kn_data = rpipe->pipe_map.cnt;

	if ((rpipe->pipe_state & PIPE_EOF) ||
	    (wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
		kn->kn_flags |= EV_EOF;
		if ((hint & NOTE_SUBMIT) == 0) {
			mutex_exit(&rpipe->pipe_lock);
			rw_exit(&pipe_peer_lock);
		}
		return (1);
	}

	if ((hint & NOTE_SUBMIT) == 0) {
		mutex_exit(&rpipe->pipe_lock);
		rw_exit(&pipe_peer_lock);
	}
	return (kn->kn_data > 0);
}

/*ARGSUSED*/
static int
filt_pipewrite(struct knote *kn, long hint)
{
	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
	struct pipe *wpipe;

	if ((hint & NOTE_SUBMIT) == 0) {
		rw_enter(&pipe_peer_lock, RW_READER);
		mutex_enter(&rpipe->pipe_lock);
	}
	wpipe = rpipe->pipe_peer;

	if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
		kn->kn_data = 0;
		kn->kn_flags |= EV_EOF;
		if ((hint & NOTE_SUBMIT) == 0) {
			mutex_exit(&rpipe->pipe_lock);
			rw_exit(&pipe_peer_lock);
		}
		return (1);
	}
	kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
	if (wpipe->pipe_state & PIPE_DIRECTW)
		kn->kn_data = 0;

	if ((hint & NOTE_SUBMIT) == 0) {
		mutex_exit(&rpipe->pipe_lock);
		rw_exit(&pipe_peer_lock);
	}
	return (kn->kn_data >= PIPE_BUF);
}
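
/*
 * A hypothetical userland registration matching the filters above:
 *
 *	struct kevent ev;
 *	int kq = kqueue();
 *
 *	EV_SET(&ev, fds[0], EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	kevent(kq, &ev, 1, NULL, 0, NULL);	-- attach via pipe_kqfilter()
 *	kevent(kq, NULL, 0, &ev, 1, NULL);	-- ev.data is the readable
 *						-- byte count (kn_data above)
 */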

static const struct filterops pipe_rfiltops =
	{ 1, NULL, filt_pipedetach, filt_piperead };
static const struct filterops pipe_wfiltops =
	{ 1, NULL, filt_pipedetach, filt_pipewrite };

/*ARGSUSED*/
static int
pipe_kqfilter(struct file *fp, struct knote *kn)
{
	struct pipe *pipe;

	rw_enter(&pipe_peer_lock, RW_READER);
	pipe = (struct pipe *)kn->kn_fp->f_data;

	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_fop = &pipe_rfiltops;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &pipe_wfiltops;
		pipe = pipe->pipe_peer;
		if (pipe == NULL) {
			/* Other end of pipe has been closed. */
			rw_exit(&pipe_peer_lock);
			return (EBADF);
		}
		break;
	default:
		rw_exit(&pipe_peer_lock);
		return (EINVAL);
	}

	kn->kn_hook = pipe;
	mutex_enter(&pipe->pipe_lock);
	SLIST_INSERT_HEAD(&pipe->pipe_sel.sel_klist, kn, kn_selnext);
	mutex_exit(&pipe->pipe_lock);
	rw_exit(&pipe_peer_lock);

	return (0);
}

/*
 * Handle pipe sysctls.
 */
SYSCTL_SETUP(sysctl_kern_pipe_setup, "sysctl kern.pipe subtree setup")
{

	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_NODE, "kern", NULL,
		       NULL, 0, NULL, 0,
		       CTL_KERN, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_NODE, "pipe",
		       SYSCTL_DESCR("Pipe settings"),
		       NULL, 0, NULL, 0,
		       CTL_KERN, KERN_PIPE, CTL_EOL);

	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
		       CTLTYPE_INT, "maxkvasz",
		       SYSCTL_DESCR("Maximum amount of kernel memory to be "
				    "used for pipes"),
		       NULL, 0, &maxpipekva, 0,
		       CTL_KERN, KERN_PIPE, KERN_PIPE_MAXKVASZ, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
		       CTLTYPE_INT, "maxloankvasz",
		       SYSCTL_DESCR("Limit for direct transfers via page loan"),
		       NULL, 0, &limitpipekva, 0,
		       CTL_KERN, KERN_PIPE, KERN_PIPE_LIMITKVA, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
		       CTLTYPE_INT, "maxbigpipes",
		       SYSCTL_DESCR("Maximum number of \"big\" pipes"),
		       NULL, 0, &maxbigpipes, 0,
		       CTL_KERN, KERN_PIPE, KERN_PIPE_MAXBIGPIPES, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_INT, "nbigpipes",
		       SYSCTL_DESCR("Number of \"big\" pipes"),
		       NULL, 0, &nbigpipe, 0,
		       CTL_KERN, KERN_PIPE, KERN_PIPE_NBIGPIPES, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_INT, "kvasize",
		       SYSCTL_DESCR("Amount of kernel memory consumed by pipe "
				    "buffers"),
		       NULL, 0, &amountpipekva, 0,
		       CTL_KERN, KERN_PIPE, KERN_PIPE_KVASIZE, CTL_EOL);
}
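
/*
 * These tunables are visible from userland; a hypothetical reader using
 * sysctlbyname(3):
 *
 *	int val;
 *	size_t len = sizeof(val);
 *
 *	if (sysctlbyname("kern.pipe.kvasize", &val, &len, NULL, 0) == 0)
 *		printf("pipe kva in use: %d bytes\n", val);
 */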