/*	$NetBSD: sys_pipe.c,v 1.140 2014/09/05 09:20:59 matt Exp $	*/

/*-
 * Copyright (c) 2003, 2007, 2008, 2009 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Paul Kranenburg, and by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1996 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 * 4. Modifications may be freely made to this file if the above conditions
 *    are met.
 */

/*
 * This file contains a high-performance replacement for the socket-based
 * pipes scheme originally used.  It does not support all features of
 * sockets, but does do everything that pipes normally do.
 *
 * This code has two modes of operation, a small write mode and a large
 * write mode.  The small write mode acts like conventional pipes with
 * a kernel buffer.  If the buffer is less than PIPE_MINDIRECT, then the
 * "normal" pipe buffering is done.  If the buffer is between PIPE_MINDIRECT
 * and PIPE_SIZE in size it is mapped read-only into the kernel address space
 * using the UVM page loan facility from where the receiving process can copy
 * the data directly from the pages in the sending process.
 *
 * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
 * happen for small transfers so that the system will not spend all of
 * its time context switching.  PIPE_SIZE is constrained by the
 * amount of kernel virtual memory.
 */
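
/*
 * Illustrative sketch (userland view): both modes are transparent to
 * applications; the kernel picks the path per write(2) based on the
 * transfer size.
 *
 *	int fds[2];
 *	char big[64 * 1024];
 *	pipe(fds);
 *	write(fds[1], "hi", 2);			-- always buffered
 *	write(fds[1], big, sizeof(big));	-- may use the loan path
 *
 * (The direct path is currently compiled out; see PIPE_NODIRECT below.)
 */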

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sys_pipe.c,v 1.140 2014/09/05 09:20:59 matt Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/kernel.h>
#include <sys/ttycom.h>
#include <sys/stat.h>
#include <sys/poll.h>
#include <sys/signalvar.h>
#include <sys/vnode.h>
#include <sys/uio.h>
#include <sys/select.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/kauth.h>
#include <sys/atomic.h>
#include <sys/pipe.h>

#include <uvm/uvm_extern.h>

/*
 * Use this to disable direct I/O and decrease the code size:
 * #define PIPE_NODIRECT
 */

/* XXX Disabled for now; rare hangs switching between direct/buffered */
#define PIPE_NODIRECT

static int	pipe_read(file_t *, off_t *, struct uio *, kauth_cred_t, int);
static int	pipe_write(file_t *, off_t *, struct uio *, kauth_cred_t, int);
static int	pipe_close(file_t *);
static int	pipe_poll(file_t *, int);
static int	pipe_kqfilter(file_t *, struct knote *);
static int	pipe_stat(file_t *, struct stat *);
static int	pipe_ioctl(file_t *, u_long, void *);
static void	pipe_restart(file_t *);

static const struct fileops pipeops = {
	.fo_read = pipe_read,
	.fo_write = pipe_write,
	.fo_ioctl = pipe_ioctl,
	.fo_fcntl = fnullop_fcntl,
	.fo_poll = pipe_poll,
	.fo_stat = pipe_stat,
	.fo_close = pipe_close,
	.fo_kqfilter = pipe_kqfilter,
	.fo_restart = pipe_restart,
};

/*
 * Default pipe buffer size(s), this can be kind-of large now because pipe
 * space is pageable.  The pipe code will try to maintain locality of
 * reference for performance reasons, so small amounts of outstanding I/O
 * will not wipe the cache.
 */
#define MINPIPESIZE	(PIPE_SIZE / 3)
#define MAXPIPESIZE	(2 * PIPE_SIZE / 3)

/*
 * Maximum amount of kva for pipes -- this is kind-of a soft limit, but
 * is there so that on large systems, we don't exhaust it.
 */
#define MAXPIPEKVA	(8 * 1024 * 1024)
static u_int	maxpipekva = MAXPIPEKVA;

/*
 * Limit for direct transfers; we cannot, of course, limit
 * the amount of kva for pipes in general though.
 */
#define LIMITPIPEKVA	(16 * 1024 * 1024)
static u_int	limitpipekva = LIMITPIPEKVA;

/*
 * Limit the number of "big" pipes.
 */
#define LIMITBIGPIPES	32
static u_int	maxbigpipes = LIMITBIGPIPES;
static u_int	nbigpipe = 0;

/*
 * Amount of KVA consumed by pipe buffers.
 */
static u_int	amountpipekva = 0;

static void	pipeclose(struct pipe *);
static void	pipe_free_kmem(struct pipe *);
static int	pipe_create(struct pipe **, pool_cache_t);
static int	pipelock(struct pipe *, bool);
static inline void pipeunlock(struct pipe *);
static void	pipeselwakeup(struct pipe *, struct pipe *, int);
#ifndef PIPE_NODIRECT
static int	pipe_direct_write(file_t *, struct pipe *, struct uio *);
#endif
static int	pipespace(struct pipe *, int);
static int	pipe_ctor(void *, void *, int);
static void	pipe_dtor(void *, void *);

#ifndef PIPE_NODIRECT
static int	pipe_loan_alloc(struct pipe *, int);
static void	pipe_loan_free(struct pipe *);
#endif /* PIPE_NODIRECT */

static pool_cache_t	pipe_wr_cache;
static pool_cache_t	pipe_rd_cache;

void
pipe_init(void)
{

	/* Writer side is not automatically allocated KVA. */
	pipe_wr_cache = pool_cache_init(sizeof(struct pipe), 0, 0, 0, "pipewr",
	    NULL, IPL_NONE, pipe_ctor, pipe_dtor, NULL);
	KASSERT(pipe_wr_cache != NULL);

	/* Reader side gets preallocated KVA. */
	pipe_rd_cache = pool_cache_init(sizeof(struct pipe), 0, 0, 0, "piperd",
	    NULL, IPL_NONE, pipe_ctor, pipe_dtor, (void *)1);
	KASSERT(pipe_rd_cache != NULL);
}

static int
pipe_ctor(void *arg, void *obj, int flags)
{
	struct pipe *pipe;
	vaddr_t va;

	pipe = obj;

	memset(pipe, 0, sizeof(struct pipe));
	if (arg != NULL) {
		/* Preallocate space. */
		va = uvm_km_alloc(kernel_map, PIPE_SIZE, 0,
		    UVM_KMF_PAGEABLE | UVM_KMF_WAITVA);
		KASSERT(va != 0);
		pipe->pipe_kmem = va;
		atomic_add_int(&amountpipekva, PIPE_SIZE);
	}
	cv_init(&pipe->pipe_rcv, "pipe_rd");
	cv_init(&pipe->pipe_wcv, "pipe_wr");
	cv_init(&pipe->pipe_draincv, "pipe_drn");
	cv_init(&pipe->pipe_lkcv, "pipe_lk");
	selinit(&pipe->pipe_sel);
	pipe->pipe_state = PIPE_SIGNALR;

	return 0;
}

static void
pipe_dtor(void *arg, void *obj)
{
	struct pipe *pipe;

	pipe = obj;

	cv_destroy(&pipe->pipe_rcv);
	cv_destroy(&pipe->pipe_wcv);
	cv_destroy(&pipe->pipe_draincv);
	cv_destroy(&pipe->pipe_lkcv);
	seldestroy(&pipe->pipe_sel);
	if (pipe->pipe_kmem != 0) {
		uvm_km_free(kernel_map, pipe->pipe_kmem, PIPE_SIZE,
		    UVM_KMF_PAGEABLE);
		atomic_add_int(&amountpipekva, -PIPE_SIZE);
	}
}

/*
 * The pipe system call for the DTYPE_PIPE type of pipes
 */
int
pipe1(struct lwp *l, register_t *retval, int flags)
{
	struct pipe *rpipe, *wpipe;
	file_t *rf, *wf;
	int fd, error;
	proc_t *p;

	if (flags & ~(O_CLOEXEC|O_NONBLOCK|O_NOSIGPIPE))
		return EINVAL;
	p = curproc;
	rpipe = wpipe = NULL;
	if ((error = pipe_create(&rpipe, pipe_rd_cache)) ||
	    (error = pipe_create(&wpipe, pipe_wr_cache))) {
		goto free2;
	}
	rpipe->pipe_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
	wpipe->pipe_lock = rpipe->pipe_lock;
	mutex_obj_hold(wpipe->pipe_lock);

	error = fd_allocfile(&rf, &fd);
	if (error)
		goto free2;
	retval[0] = fd;

	error = fd_allocfile(&wf, &fd);
	if (error)
		goto free3;
	retval[1] = fd;

	rf->f_flag = FREAD | flags;
	rf->f_type = DTYPE_PIPE;
	rf->f_pipe = rpipe;
	rf->f_ops = &pipeops;
	fd_set_exclose(l, (int)retval[0], (flags & O_CLOEXEC) != 0);

	wf->f_flag = FWRITE | flags;
	wf->f_type = DTYPE_PIPE;
	wf->f_pipe = wpipe;
	wf->f_ops = &pipeops;
	fd_set_exclose(l, (int)retval[1], (flags & O_CLOEXEC) != 0);

	rpipe->pipe_peer = wpipe;
	wpipe->pipe_peer = rpipe;

	fd_affix(p, rf, (int)retval[0]);
	fd_affix(p, wf, (int)retval[1]);
	return (0);
free3:
	fd_abort(p, rf, (int)retval[0]);
free2:
	pipeclose(wpipe);
	pipeclose(rpipe);

	return (error);
}
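
/*
 * Illustrative sketch: the flags argument above corresponds to what
 * userland passes as pipe2(2)'s second argument, e.g.
 *
 *	int fds[2];
 *	pipe2(fds, O_CLOEXEC | O_NONBLOCK);
 *
 * Plain pipe(2) corresponds to flags == 0.
 */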

/*
 * Allocate KVA for the pipe's circular buffer; the space is pageable.
 * This routine will 'realloc' the size of a pipe safely: if it fails,
 * it will retain the old buffer and return ENOMEM.
 */
static int
pipespace(struct pipe *pipe, int size)
{
	void *buffer;

	/*
	 * Allocate pageable virtual address space.  Physical memory is
	 * allocated on demand.
	 */
	if (size == PIPE_SIZE && pipe->pipe_kmem != 0) {
		buffer = (void *)pipe->pipe_kmem;
	} else {
		buffer = (void *)uvm_km_alloc(kernel_map, round_page(size),
		    0, UVM_KMF_PAGEABLE);
		if (buffer == NULL)
			return (ENOMEM);
		atomic_add_int(&amountpipekva, size);
	}

	/* Free old resources if we're resizing. */
	pipe_free_kmem(pipe);
	pipe->pipe_buffer.buffer = buffer;
	pipe->pipe_buffer.size = size;
	pipe->pipe_buffer.in = 0;
	pipe->pipe_buffer.out = 0;
	pipe->pipe_buffer.cnt = 0;
	return (0);
}

/*
 * Initialize and allocate VM and memory for pipe.
 */
static int
pipe_create(struct pipe **pipep, pool_cache_t cache)
{
	struct pipe *pipe;
	int error;

	pipe = pool_cache_get(cache, PR_WAITOK);
	KASSERT(pipe != NULL);
	*pipep = pipe;
	error = 0;
	getnanotime(&pipe->pipe_btime);
	pipe->pipe_atime = pipe->pipe_mtime = pipe->pipe_btime;
	pipe->pipe_lock = NULL;
	if (cache == pipe_rd_cache) {
		error = pipespace(pipe, PIPE_SIZE);
	} else {
		pipe->pipe_buffer.buffer = NULL;
		pipe->pipe_buffer.size = 0;
		pipe->pipe_buffer.in = 0;
		pipe->pipe_buffer.out = 0;
		pipe->pipe_buffer.cnt = 0;
	}
	return error;
}

/*
 * Lock a pipe for I/O, blocking other access.
 * Called with the pipe mutex held.
 */
static int
pipelock(struct pipe *pipe, bool catch_p)
{
	int error;

	KASSERT(mutex_owned(pipe->pipe_lock));

	while (pipe->pipe_state & PIPE_LOCKFL) {
		pipe->pipe_state |= PIPE_LWANT;
		if (catch_p) {
			error = cv_wait_sig(&pipe->pipe_lkcv, pipe->pipe_lock);
			if (error != 0)
				return error;
		} else
			cv_wait(&pipe->pipe_lkcv, pipe->pipe_lock);
	}

	pipe->pipe_state |= PIPE_LOCKFL;

	return 0;
}

/*
 * Unlock a pipe I/O lock.
 */
static inline void
pipeunlock(struct pipe *pipe)
{

	KASSERT(pipe->pipe_state & PIPE_LOCKFL);

	pipe->pipe_state &= ~PIPE_LOCKFL;
	if (pipe->pipe_state & PIPE_LWANT) {
		pipe->pipe_state &= ~PIPE_LWANT;
		cv_broadcast(&pipe->pipe_lkcv);
	}
}
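
/*
 * Sketch of the locking pattern used by the I/O paths below: the
 * short-term mutex (pipe_lock, shared by both ends of the pipe)
 * protects pipe state, while the long-term PIPE_LOCKFL lock
 * serializes whole I/O operations:
 *
 *	mutex_enter(pipe->pipe_lock);
 *	error = pipelock(pipe, true);	-- may sleep, interruptible
 *	... move data, dropping pipe_lock around uiomove() ...
 *	pipeunlock(pipe);
 *	mutex_exit(pipe->pipe_lock);
 */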

/*
 * Select/poll wakeup.  This also sends SIGIO to the peer connected to
 * the 'sigpipe' side of the pipe.
 */
static void
pipeselwakeup(struct pipe *selp, struct pipe *sigp, int code)
{
	int band;

	switch (code) {
	case POLL_IN:
		band = POLLIN|POLLRDNORM;
		break;
	case POLL_OUT:
		band = POLLOUT|POLLWRNORM;
		break;
	case POLL_HUP:
		band = POLLHUP;
		break;
	case POLL_ERR:
		band = POLLERR;
		break;
	default:
		band = 0;
#ifdef DIAGNOSTIC
		printf("bad siginfo code %d in pipe notification.\n", code);
#endif
		break;
	}

	selnotify(&selp->pipe_sel, band, NOTE_SUBMIT);

	if (sigp == NULL || (sigp->pipe_state & PIPE_ASYNC) == 0)
		return;

	fownsignal(sigp->pipe_pgid, SIGIO, code, band, selp);
}

static int
pipe_read(file_t *fp, off_t *offset, struct uio *uio, kauth_cred_t cred,
    int flags)
{
	struct pipe *rpipe = fp->f_pipe;
	struct pipebuf *bp = &rpipe->pipe_buffer;
	kmutex_t *lock = rpipe->pipe_lock;
	int error;
	size_t nread = 0;
	size_t size;
	size_t ocnt;
	unsigned int wakeup_state = 0;

	mutex_enter(lock);
	++rpipe->pipe_busy;
	ocnt = bp->cnt;

again:
	error = pipelock(rpipe, true);
	if (error)
		goto unlocked_error;

	while (uio->uio_resid) {
		/*
		 * Normal pipe buffer receive.
		 */
		if (bp->cnt > 0) {
			size = bp->size - bp->out;
			if (size > bp->cnt)
				size = bp->cnt;
			if (size > uio->uio_resid)
				size = uio->uio_resid;

			mutex_exit(lock);
			error = uiomove((char *)bp->buffer + bp->out, size, uio);
			mutex_enter(lock);
			if (error)
				break;

			bp->out += size;
			if (bp->out >= bp->size)
				bp->out = 0;

			bp->cnt -= size;

			/*
			 * If there is no more to read in the pipe, reset
			 * its pointers to the beginning.  This improves
			 * cache hit stats.
			 */
			if (bp->cnt == 0) {
				bp->in = 0;
				bp->out = 0;
			}
			nread += size;
			continue;
		}

#ifndef PIPE_NODIRECT
		if ((rpipe->pipe_state & PIPE_DIRECTR) != 0) {
			struct pipemapping * const rmap = &rpipe->pipe_map;
			/*
			 * Direct copy, bypassing a kernel buffer.
			 */
			void *va;
			u_int gen;

			KASSERT(rpipe->pipe_state & PIPE_DIRECTW);

			size = rmap->cnt;
			if (size > uio->uio_resid)
				size = uio->uio_resid;

			va = (char *)rmap->kva + rmap->pos;
			gen = rmap->egen;
			mutex_exit(lock);

			/*
			 * Consume emap and read the data from loaned pages.
			 */
			uvm_emap_consume(gen);
			error = uiomove(va, size, uio);

			mutex_enter(lock);
			if (error)
				break;
			nread += size;
			rmap->pos += size;
			rmap->cnt -= size;
			if (rmap->cnt == 0) {
				rpipe->pipe_state &= ~PIPE_DIRECTR;
				cv_broadcast(&rpipe->pipe_wcv);
			}
			continue;
		}
#endif
		/*
		 * Break if some data was read.
		 */
		if (nread > 0)
			break;

		/*
		 * Detect EOF condition.
		 * Read returns 0 on EOF, no need to set error.
		 */
		if (rpipe->pipe_state & PIPE_EOF)
			break;

		/*
		 * Don't block on non-blocking I/O.
		 */
		if (fp->f_flag & FNONBLOCK) {
			error = EAGAIN;
			break;
		}

		/*
		 * Unlock the pipe buffer for our remaining processing.
		 * We will either break out with an error or we will
		 * sleep and relock to loop.
		 */
		pipeunlock(rpipe);

		/*
		 * Re-check to see if more direct writes are pending.
		 */
		if ((rpipe->pipe_state & PIPE_DIRECTR) != 0)
			goto again;

#if 1	/* XXX (dsl) I'm sure these aren't needed here ... */
		/*
		 * We want to read more, wake up select/poll.
		 */
		pipeselwakeup(rpipe, rpipe->pipe_peer, POLL_OUT);

		/*
		 * If the "write-side" is blocked, wake it up now.
		 */
		cv_broadcast(&rpipe->pipe_wcv);
#endif

		if (wakeup_state & PIPE_RESTART) {
			error = ERESTART;
			goto unlocked_error;
		}

		/* Now wait until the pipe is filled */
		error = cv_wait_sig(&rpipe->pipe_rcv, lock);
		if (error != 0)
			goto unlocked_error;
		wakeup_state = rpipe->pipe_state;
		goto again;
	}

	if (error == 0)
		getnanotime(&rpipe->pipe_atime);
	pipeunlock(rpipe);

unlocked_error:
	--rpipe->pipe_busy;
	if (rpipe->pipe_busy == 0) {
		rpipe->pipe_state &= ~PIPE_RESTART;
		cv_broadcast(&rpipe->pipe_draincv);
	}
	if (bp->cnt < MINPIPESIZE) {
		cv_broadcast(&rpipe->pipe_wcv);
	}

	/*
	 * If anything was read off the buffer, signal to the writer that
	 * it's possible to write more data.  Also send a signal if we are
	 * here for the first time after the last write.
	 */
	if ((bp->size - bp->cnt) >= PIPE_BUF
	    && (ocnt != bp->cnt || (rpipe->pipe_state & PIPE_SIGNALR))) {
		pipeselwakeup(rpipe, rpipe->pipe_peer, POLL_OUT);
		rpipe->pipe_state &= ~PIPE_SIGNALR;
	}

	mutex_exit(lock);
	return (error);
}

#ifndef PIPE_NODIRECT
/*
 * Allocate structure for loan transfer.
 */
static int
pipe_loan_alloc(struct pipe *wpipe, int npages)
{
	struct pipemapping * const wmap = &wpipe->pipe_map;
	const vsize_t len = ptoa(npages);

	atomic_add_int(&amountpipekva, len);
	wmap->kva = uvm_km_alloc(kernel_map, len, 0,
	    UVM_KMF_COLORMATCH | UVM_KMF_VAONLY | UVM_KMF_WAITVA);
	if (wmap->kva == 0) {
		atomic_add_int(&amountpipekva, -len);
		return (ENOMEM);
	}

	wmap->npages = npages;
	wmap->pgs = kmem_alloc(npages * sizeof(struct vm_page *), KM_SLEEP);
	return (0);
}

/*
 * Free resources allocated for loan transfer.
 */
static void
pipe_loan_free(struct pipe *wpipe)
{
	struct pipemapping * const wmap = &wpipe->pipe_map;
	const vsize_t len = ptoa(wmap->npages);

	uvm_emap_remove(wmap->kva, len);	/* XXX */
	uvm_km_free(kernel_map, wmap->kva, len, UVM_KMF_VAONLY);
	wmap->kva = 0;
	atomic_add_int(&amountpipekva, -len);
	kmem_free(wmap->pgs, wmap->npages * sizeof(struct vm_page *));
	wmap->pgs = NULL;
#if 0
	wmap->npages = 0;
	wmap->pos = 0;
	wmap->cnt = 0;
#endif
}
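
/*
 * Loan lifecycle sketch: pipe_loan_alloc() above only reserves KVA and
 * the page array; pipe_direct_write() below loans the writer's pages
 * with uvm_loan() and maps them with uvm_emap_enter(), while
 * pipe_loan_free() tears both down.  The KVA is cached across writes
 * until a transfer needs more pages or KVA use exceeds maxpipekva.
 */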

/*
 * NetBSD direct write, using uvm_loan() mechanism.
 * This implements the pipe buffer write mechanism.  Note that only
 * a direct write OR a normal pipe write can be pending at any given time.
 * If there are any characters in the pipe buffer, the direct write will
 * be deferred until the receiving process grabs all of the bytes from
 * the pipe buffer.  Then the direct mapping write is set-up.
 *
 * Called with the long-term pipe lock held.
 */
static int
pipe_direct_write(file_t *fp, struct pipe *wpipe, struct uio *uio)
{
	struct pipemapping * const wmap = &wpipe->pipe_map;
	kmutex_t * const lock = wpipe->pipe_lock;
	struct vm_page **pgs;
	vaddr_t bbase, base, bend;
	vsize_t blen, bcnt;
	int error, npages;
	voff_t bpos;
	u_int starting_color;

	KASSERT(mutex_owned(wpipe->pipe_lock));
	KASSERT(wmap->cnt == 0);

	mutex_exit(lock);

	/*
	 * Handle first PIPE_DIRECT_CHUNK bytes of buffer.  Deal with buffers
	 * not aligned to PAGE_SIZE.
	 */
	bbase = (vaddr_t)uio->uio_iov->iov_base;
	base = trunc_page(bbase);
	bend = round_page(bbase + uio->uio_iov->iov_len);
	blen = bend - base;
	bpos = bbase - base;

	if (blen > PIPE_DIRECT_CHUNK) {
		blen = PIPE_DIRECT_CHUNK;
		bend = base + blen;
		bcnt = PIPE_DIRECT_CHUNK - bpos;
	} else {
		bcnt = uio->uio_iov->iov_len;
	}
	npages = atop(blen);
	starting_color = atop(base) & uvmexp.colormask;

	/*
	 * Free the old kva if we need more pages than we have
	 * allocated.
	 */
	if (wmap->kva != 0 && starting_color + npages > wmap->npages)
		pipe_loan_free(wpipe);

	/* Allocate new kva. */
	if (wmap->kva == 0) {
		error = pipe_loan_alloc(wpipe, starting_color + npages);
		if (error) {
			mutex_enter(lock);
			return (error);
		}
	}

	/* Loan the write buffer memory from writer process */
	pgs = wmap->pgs + starting_color;
	error = uvm_loan(&uio->uio_vmspace->vm_map, base, blen,
	    pgs, UVM_LOAN_TOPAGE);
	if (error) {
		pipe_loan_free(wpipe);
		mutex_enter(lock);
		/* ENOMEM makes the caller fall back to ordinary write. */
		return (ENOMEM);
	}

	/* Enter the loaned pages to KVA, produce new emap generation number. */
	uvm_emap_enter(wmap->kva + ptoa(starting_color), pgs, npages);
	wmap->egen = uvm_emap_produce();

	/* Now we can put the pipe in direct write mode */
	wmap->pos = bpos + ptoa(starting_color);
	wmap->cnt = bcnt;

	/*
	 * But before we can let someone do a direct read, we
	 * have to wait until the pipe is drained.  Release the
	 * pipe lock while we wait.
	 */
	mutex_enter(lock);
	wpipe->pipe_state |= PIPE_DIRECTW;
	pipeunlock(wpipe);

	while (error == 0 && wpipe->pipe_buffer.cnt > 0) {
		cv_broadcast(&wpipe->pipe_rcv);
		error = cv_wait_sig(&wpipe->pipe_wcv, lock);
		if (error == 0 && wpipe->pipe_state & PIPE_EOF)
			error = EPIPE;
	}

	/* Pipe is drained; the next read will come off the direct buffer. */
	wpipe->pipe_state |= PIPE_DIRECTR;

	/* Wait until the reader is done */
	while (error == 0 && (wpipe->pipe_state & PIPE_DIRECTR)) {
		cv_broadcast(&wpipe->pipe_rcv);
		pipeselwakeup(wpipe, wpipe, POLL_IN);
		error = cv_wait_sig(&wpipe->pipe_wcv, lock);
		if (error == 0 && wpipe->pipe_state & PIPE_EOF)
			error = EPIPE;
	}

	/* Take pipe out of direct write mode */
	wpipe->pipe_state &= ~(PIPE_DIRECTW | PIPE_DIRECTR);

	/* Acquire the pipe lock and cleanup */
	(void)pipelock(wpipe, false);
	mutex_exit(lock);

	if (pgs != NULL) {
		/* XXX: uvm_emap_remove */
		uvm_unloan(pgs, npages, UVM_LOAN_TOPAGE);
	}
	if (error || amountpipekva > maxpipekva)
		pipe_loan_free(wpipe);

	mutex_enter(lock);
	if (error) {
		pipeselwakeup(wpipe, wpipe, POLL_ERR);

		/*
		 * If nothing was read from what we offered, return the
		 * error straight away.  Otherwise update uio_resid first.
		 * The caller will deal with the error condition, returning
		 * a short write, the error, or restarting the write(2) as
		 * appropriate.
		 */
		if (wmap->cnt == bcnt) {
			wmap->cnt = 0;
			cv_broadcast(&wpipe->pipe_wcv);
			return (error);
		}

		bcnt -= wmap->cnt;
	}

	uio->uio_resid -= bcnt;
	/* uio_offset not updated, not set/used for write(2) */
	uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + bcnt;
	uio->uio_iov->iov_len -= bcnt;
	if (uio->uio_iov->iov_len == 0) {
		uio->uio_iov++;
		uio->uio_iovcnt--;
	}

	wmap->cnt = 0;
	return (error);
}
#endif /* !PIPE_NODIRECT */

static int
pipe_write(file_t *fp, off_t *offset, struct uio *uio, kauth_cred_t cred,
    int flags)
{
	struct pipe *wpipe, *rpipe;
	struct pipebuf *bp;
	kmutex_t *lock;
	int error;
	unsigned int wakeup_state = 0;

	/* We want to write to our peer */
	rpipe = fp->f_pipe;
	lock = rpipe->pipe_lock;
	error = 0;

	mutex_enter(lock);
	wpipe = rpipe->pipe_peer;

	/*
	 * Detect loss of pipe read side, issue SIGPIPE if lost.
	 */
	if (wpipe == NULL || (wpipe->pipe_state & PIPE_EOF) != 0) {
		mutex_exit(lock);
		return EPIPE;
	}
	++wpipe->pipe_busy;

	/* Acquire the long-term pipe lock */
	if ((error = pipelock(wpipe, true)) != 0) {
		--wpipe->pipe_busy;
		if (wpipe->pipe_busy == 0) {
			wpipe->pipe_state &= ~PIPE_RESTART;
			cv_broadcast(&wpipe->pipe_draincv);
		}
		mutex_exit(lock);
		return (error);
	}

	bp = &wpipe->pipe_buffer;

	/*
	 * If it is advantageous to resize the pipe buffer, do so.
	 */
	if ((uio->uio_resid > PIPE_SIZE) &&
	    (nbigpipe < maxbigpipes) &&
#ifndef PIPE_NODIRECT
	    (wpipe->pipe_state & PIPE_DIRECTW) == 0 &&
#endif
	    (bp->size <= PIPE_SIZE) && (bp->cnt == 0)) {

		if (pipespace(wpipe, BIG_PIPE_SIZE) == 0)
			atomic_inc_uint(&nbigpipe);
	}

	while (uio->uio_resid) {
		size_t space;

#ifndef PIPE_NODIRECT
		/*
		 * Pipe buffered writes cannot coincide with direct
		 * writes.  Also, only one direct write can be in
		 * progress at any one time.  We wait until the currently
		 * executing direct write is completed before continuing.
		 *
		 * We break out if a signal occurs or the reader goes away.
		 */
		while (error == 0 && wpipe->pipe_state & PIPE_DIRECTW) {
			cv_broadcast(&wpipe->pipe_rcv);
			pipeunlock(wpipe);
			error = cv_wait_sig(&wpipe->pipe_wcv, lock);
			(void)pipelock(wpipe, false);
			if (wpipe->pipe_state & PIPE_EOF)
				error = EPIPE;
		}
		if (error)
			break;

		/*
		 * If the transfer is large, we can gain performance if
		 * we do process-to-process copies directly.
		 * If the write is non-blocking, we don't use the
		 * direct write mechanism.
		 *
		 * The direct write mechanism will detect the reader going
		 * away on us.
		 */
		if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) &&
		    (fp->f_flag & FNONBLOCK) == 0 &&
		    (wpipe->pipe_map.kva || (amountpipekva < limitpipekva))) {
			error = pipe_direct_write(fp, wpipe, uio);

			/*
			 * Break out if an error occurred, unless it's
			 * ENOMEM.  ENOMEM means we failed to allocate
			 * some resources for direct write, so we just
			 * fall back to ordinary write.  If the direct
			 * write was successful, process the rest of the
			 * data via ordinary write.
			 */
			if (error == 0)
				continue;

			if (error != ENOMEM)
				break;
		}
#endif /* PIPE_NODIRECT */

		space = bp->size - bp->cnt;

		/* Writes of size <= PIPE_BUF must be atomic. */
		if ((space < uio->uio_resid) && (uio->uio_resid <= PIPE_BUF))
			space = 0;

		if (space > 0) {
			int size;	/* Transfer size */
			int segsize;	/* first segment to transfer */

			/*
			 * Transfer size is minimum of uio transfer
			 * and free space in pipe buffer.
			 */
			if (space > uio->uio_resid)
				size = uio->uio_resid;
			else
				size = space;
			/*
			 * First segment to transfer is minimum of
			 * transfer size and contiguous space in
			 * pipe buffer.  If first segment to transfer
			 * is less than the transfer size, we've got
			 * a wraparound in the buffer.
			 */
			segsize = bp->size - bp->in;
			if (segsize > size)
				segsize = size;

			/* Transfer first segment */
			mutex_exit(lock);
			error = uiomove((char *)bp->buffer + bp->in, segsize,
			    uio);

			if (error == 0 && segsize < size) {
				/*
				 * Transfer remaining part now, to
				 * support atomic writes.  Wraparound
				 * happened.
				 */
				KASSERT(bp->in + segsize == bp->size);
				error = uiomove(bp->buffer,
				    size - segsize, uio);
			}
			mutex_enter(lock);
			if (error)
				break;

			bp->in += size;
			if (bp->in >= bp->size) {
				KASSERT(bp->in == size - segsize + bp->size);
				bp->in = size - segsize;
			}

			bp->cnt += size;
			KASSERT(bp->cnt <= bp->size);
			wakeup_state = 0;
		} else {
			/*
			 * If the "read-side" has been blocked, wake it up now.
			 */
			cv_broadcast(&wpipe->pipe_rcv);

			/*
			 * Don't block on non-blocking I/O.
			 */
			if (fp->f_flag & FNONBLOCK) {
				error = EAGAIN;
				break;
			}

			/*
			 * We have no more space and have something to offer,
			 * wake up select/poll.
			 */
			if (bp->cnt)
				pipeselwakeup(wpipe, wpipe, POLL_IN);

			if (wakeup_state & PIPE_RESTART) {
				error = ERESTART;
				break;
			}

			pipeunlock(wpipe);
			error = cv_wait_sig(&wpipe->pipe_wcv, lock);
			(void)pipelock(wpipe, false);
			if (error != 0)
				break;
			/*
			 * If the read side wants to go away, we just
			 * issue a signal to ourselves.
			 */
			if (wpipe->pipe_state & PIPE_EOF) {
				error = EPIPE;
				break;
			}
			wakeup_state = wpipe->pipe_state;
		}
	}

	--wpipe->pipe_busy;
	if (wpipe->pipe_busy == 0) {
		wpipe->pipe_state &= ~PIPE_RESTART;
		cv_broadcast(&wpipe->pipe_draincv);
	}
	if (bp->cnt > 0) {
		cv_broadcast(&wpipe->pipe_rcv);
	}

	/*
	 * Don't return EPIPE if I/O was successful.
	 */
	if (error == EPIPE && bp->cnt == 0 && uio->uio_resid == 0)
		error = 0;

	if (error == 0)
		getnanotime(&wpipe->pipe_mtime);

	/*
	 * We have something to offer, wake up select/poll.
	 * wmap->cnt is always 0 at this point (direct write is only done
	 * synchronously), so check only wpipe->pipe_buffer.cnt.
	 */
	if (bp->cnt)
		pipeselwakeup(wpipe, wpipe, POLL_IN);

	/*
	 * Arrange for next read(2) to do a signal.
	 */
	wpipe->pipe_state |= PIPE_SIGNALR;

	pipeunlock(wpipe);
	mutex_exit(lock);
	return (error);
}
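
/*
 * Illustrative note: the PIPE_BUF handling above is what provides the
 * POSIX guarantee that writes of at most PIPE_BUF bytes are atomic:
 * with two concurrent writers each doing write(fd, msg, n) with
 * n <= PIPE_BUF, the messages are never interleaved.
 */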

/*
 * We implement a very minimal set of ioctls for compatibility with sockets.
 */
int
pipe_ioctl(file_t *fp, u_long cmd, void *data)
{
	struct pipe *pipe = fp->f_pipe;
	kmutex_t *lock = pipe->pipe_lock;

	switch (cmd) {

	case FIONBIO:
		return (0);

	case FIOASYNC:
		mutex_enter(lock);
		if (*(int *)data) {
			pipe->pipe_state |= PIPE_ASYNC;
		} else {
			pipe->pipe_state &= ~PIPE_ASYNC;
		}
		mutex_exit(lock);
		return (0);

	case FIONREAD:
		mutex_enter(lock);
#ifndef PIPE_NODIRECT
		if (pipe->pipe_state & PIPE_DIRECTW)
			*(int *)data = pipe->pipe_map.cnt;
		else
#endif
			*(int *)data = pipe->pipe_buffer.cnt;
		mutex_exit(lock);
		return (0);

	case FIONWRITE:
		/* Look at other side */
		pipe = pipe->pipe_peer;
		mutex_enter(lock);
#ifndef PIPE_NODIRECT
		if (pipe->pipe_state & PIPE_DIRECTW)
			*(int *)data = pipe->pipe_map.cnt;
		else
#endif
			*(int *)data = pipe->pipe_buffer.cnt;
		mutex_exit(lock);
		return (0);

	case FIONSPACE:
		/* Look at other side */
		pipe = pipe->pipe_peer;
		mutex_enter(lock);
#ifndef PIPE_NODIRECT
		/*
		 * If we're in direct-mode, we don't really have a
		 * send queue, and any other write will block.  Thus
		 * zero seems like the best answer.
		 */
		if (pipe->pipe_state & PIPE_DIRECTW)
			*(int *)data = 0;
		else
#endif
			*(int *)data = pipe->pipe_buffer.size -
			    pipe->pipe_buffer.cnt;
		mutex_exit(lock);
		return (0);

	case TIOCSPGRP:
	case FIOSETOWN:
		return fsetown(&pipe->pipe_pgid, cmd, data);

	case TIOCGPGRP:
	case FIOGETOWN:
		return fgetown(pipe->pipe_pgid, cmd, data);

	}
	return (EPASSTHROUGH);
}

int
pipe_poll(file_t *fp, int events)
{
	struct pipe *rpipe = fp->f_pipe;
	struct pipe *wpipe;
	int eof = 0;
	int revents = 0;

	mutex_enter(rpipe->pipe_lock);
	wpipe = rpipe->pipe_peer;

	if (events & (POLLIN | POLLRDNORM))
		if ((rpipe->pipe_buffer.cnt > 0) ||
#ifndef PIPE_NODIRECT
		    (rpipe->pipe_state & PIPE_DIRECTR) ||
#endif
		    (rpipe->pipe_state & PIPE_EOF))
			revents |= events & (POLLIN | POLLRDNORM);

	eof |= (rpipe->pipe_state & PIPE_EOF);

	if (wpipe == NULL)
		revents |= events & (POLLOUT | POLLWRNORM);
	else {
		if (events & (POLLOUT | POLLWRNORM))
			if ((wpipe->pipe_state & PIPE_EOF) || (
#ifndef PIPE_NODIRECT
			    (wpipe->pipe_state & PIPE_DIRECTW) == 0 &&
#endif
			    (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt)
			    >= PIPE_BUF))
				revents |= events & (POLLOUT | POLLWRNORM);

		eof |= (wpipe->pipe_state & PIPE_EOF);
	}

	if (wpipe == NULL || eof)
		revents |= POLLHUP;

	if (revents == 0) {
		if (events & (POLLIN | POLLRDNORM))
			selrecord(curlwp, &rpipe->pipe_sel);

		if (events & (POLLOUT | POLLWRNORM))
			selrecord(curlwp, &wpipe->pipe_sel);
	}
	mutex_exit(rpipe->pipe_lock);

	return (revents);
}

static int
pipe_stat(file_t *fp, struct stat *ub)
{
	struct pipe *pipe = fp->f_pipe;

	mutex_enter(pipe->pipe_lock);
	memset(ub, 0, sizeof(*ub));
	ub->st_mode = S_IFIFO | S_IRUSR | S_IWUSR;
	ub->st_blksize = pipe->pipe_buffer.size;
	if (ub->st_blksize == 0 && pipe->pipe_peer)
		ub->st_blksize = pipe->pipe_peer->pipe_buffer.size;
	ub->st_size = pipe->pipe_buffer.cnt;
	ub->st_blocks = (ub->st_size) ? 1 : 0;
	ub->st_atimespec = pipe->pipe_atime;
	ub->st_mtimespec = pipe->pipe_mtime;
	ub->st_ctimespec = ub->st_birthtimespec = pipe->pipe_btime;
	ub->st_uid = kauth_cred_geteuid(fp->f_cred);
	ub->st_gid = kauth_cred_getegid(fp->f_cred);

	/*
	 * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen.
	 * XXX (st_dev, st_ino) should be unique.
	 */
	mutex_exit(pipe->pipe_lock);
	return 0;
}

static int
pipe_close(file_t *fp)
{
	struct pipe *pipe = fp->f_pipe;

	fp->f_pipe = NULL;
	pipeclose(pipe);
	return (0);
}

static void
pipe_restart(file_t *fp)
{
	struct pipe *pipe = fp->f_pipe;

	/*
	 * Unblock blocked reads/writes in order to allow close() to complete.
	 * System calls return ERESTART so that the fd is revalidated.
	 * (Partial writes return the transfer length.)
	 */
	mutex_enter(pipe->pipe_lock);
	pipe->pipe_state |= PIPE_RESTART;
	/*
	 * Wake up both cvs.  Maybe we only need one, but there may be
	 * other paths where a wakeup is needed, and this saves deciding
	 * which!
	 */
	cv_broadcast(&pipe->pipe_rcv);
	cv_broadcast(&pipe->pipe_wcv);
	mutex_exit(pipe->pipe_lock);
}

static void
pipe_free_kmem(struct pipe *pipe)
{

	if (pipe->pipe_buffer.buffer != NULL) {
		if (pipe->pipe_buffer.size > PIPE_SIZE) {
			atomic_dec_uint(&nbigpipe);
		}
		if (pipe->pipe_buffer.buffer != (void *)pipe->pipe_kmem) {
			uvm_km_free(kernel_map,
			    (vaddr_t)pipe->pipe_buffer.buffer,
			    pipe->pipe_buffer.size, UVM_KMF_PAGEABLE);
			atomic_add_int(&amountpipekva,
			    -pipe->pipe_buffer.size);
		}
		pipe->pipe_buffer.buffer = NULL;
	}
#ifndef PIPE_NODIRECT
	if (pipe->pipe_map.kva != 0) {
		pipe_loan_free(pipe);
		pipe->pipe_map.cnt = 0;
		pipe->pipe_map.pos = 0;
		pipe->pipe_map.npages = 0;
	}
#endif /* !PIPE_NODIRECT */
}

/*
 * Shutdown the pipe.
 */
static void
pipeclose(struct pipe *pipe)
{
	kmutex_t *lock;
	struct pipe *ppipe;

	if (pipe == NULL)
		return;

	KASSERT(cv_is_valid(&pipe->pipe_rcv));
	KASSERT(cv_is_valid(&pipe->pipe_wcv));
	KASSERT(cv_is_valid(&pipe->pipe_draincv));
	KASSERT(cv_is_valid(&pipe->pipe_lkcv));

	lock = pipe->pipe_lock;
	if (lock == NULL)
		/* Must have failed during create */
		goto free_resources;

	mutex_enter(lock);
	pipeselwakeup(pipe, pipe, POLL_HUP);

	/*
	 * If the other side is blocked, wake it up saying that
	 * we want to close it down.
	 */
	pipe->pipe_state |= PIPE_EOF;
	if (pipe->pipe_busy) {
		while (pipe->pipe_busy) {
			cv_broadcast(&pipe->pipe_wcv);
			cv_wait_sig(&pipe->pipe_draincv, lock);
		}
	}

	/*
	 * Disconnect from peer.
	 */
	if ((ppipe = pipe->pipe_peer) != NULL) {
		pipeselwakeup(ppipe, ppipe, POLL_HUP);
		ppipe->pipe_state |= PIPE_EOF;
		cv_broadcast(&ppipe->pipe_rcv);
		ppipe->pipe_peer = NULL;
	}

	/*
	 * Any knote objects still left in the list are the ones
	 * attached by the peer.  Since no one will traverse this
	 * list, we just clear it.
	 */
	SLIST_INIT(&pipe->pipe_sel.sel_klist);

	KASSERT((pipe->pipe_state & PIPE_LOCKFL) == 0);
	mutex_exit(lock);
	mutex_obj_free(lock);

	/*
	 * Free resources.
	 */
free_resources:
	pipe->pipe_pgid = 0;
	pipe->pipe_state = PIPE_SIGNALR;
	pipe_free_kmem(pipe);
	if (pipe->pipe_kmem != 0) {
		pool_cache_put(pipe_rd_cache, pipe);
	} else {
		pool_cache_put(pipe_wr_cache, pipe);
	}
}

static void
filt_pipedetach(struct knote *kn)
{
	struct pipe *pipe;
	kmutex_t *lock;

	pipe = ((file_t *)kn->kn_obj)->f_pipe;
	lock = pipe->pipe_lock;

	mutex_enter(lock);

	switch (kn->kn_filter) {
	case EVFILT_WRITE:
		/* Need the peer structure, not our own. */
		pipe = pipe->pipe_peer;

		/* If reader end already closed, just return. */
		if (pipe == NULL) {
			mutex_exit(lock);
			return;
		}

		break;
	default:
		/* Nothing to do. */
		break;
	}

	KASSERT(kn->kn_hook == pipe);
	SLIST_REMOVE(&pipe->pipe_sel.sel_klist, kn, knote, kn_selnext);
	mutex_exit(lock);
}

static int
filt_piperead(struct knote *kn, long hint)
{
	struct pipe *rpipe = ((file_t *)kn->kn_obj)->f_pipe;
	struct pipe *wpipe;

	if ((hint & NOTE_SUBMIT) == 0) {
		mutex_enter(rpipe->pipe_lock);
	}
	wpipe = rpipe->pipe_peer;
	kn->kn_data = rpipe->pipe_buffer.cnt;

	if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW))
		kn->kn_data = rpipe->pipe_map.cnt;

	if ((rpipe->pipe_state & PIPE_EOF) ||
	    (wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
		kn->kn_flags |= EV_EOF;
		if ((hint & NOTE_SUBMIT) == 0) {
			mutex_exit(rpipe->pipe_lock);
		}
		return (1);
	}

	if ((hint & NOTE_SUBMIT) == 0) {
		mutex_exit(rpipe->pipe_lock);
	}
	return (kn->kn_data > 0);
}

static int
filt_pipewrite(struct knote *kn, long hint)
{
	struct pipe *rpipe = ((file_t *)kn->kn_obj)->f_pipe;
	struct pipe *wpipe;

	if ((hint & NOTE_SUBMIT) == 0) {
		mutex_enter(rpipe->pipe_lock);
	}
	wpipe = rpipe->pipe_peer;

	if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
		kn->kn_data = 0;
		kn->kn_flags |= EV_EOF;
		if ((hint & NOTE_SUBMIT) == 0) {
			mutex_exit(rpipe->pipe_lock);
		}
		return (1);
	}
	kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
	if (wpipe->pipe_state & PIPE_DIRECTW)
		kn->kn_data = 0;

	if ((hint & NOTE_SUBMIT) == 0) {
		mutex_exit(rpipe->pipe_lock);
	}
	return (kn->kn_data >= PIPE_BUF);
}

static const struct filterops pipe_rfiltops =
	{ 1, NULL, filt_pipedetach, filt_piperead };
static const struct filterops pipe_wfiltops =
	{ 1, NULL, filt_pipedetach, filt_pipewrite };

static int
pipe_kqfilter(file_t *fp, struct knote *kn)
{
	struct pipe *pipe;
	kmutex_t *lock;

	pipe = ((file_t *)kn->kn_obj)->f_pipe;
	lock = pipe->pipe_lock;

	mutex_enter(lock);

	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_fop = &pipe_rfiltops;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &pipe_wfiltops;
		pipe = pipe->pipe_peer;
		if (pipe == NULL) {
			/* Other end of pipe has been closed. */
			mutex_exit(lock);
			return (EBADF);
		}
		break;
	default:
		mutex_exit(lock);
		return (EINVAL);
	}

	kn->kn_hook = pipe;
	SLIST_INSERT_HEAD(&pipe->pipe_sel.sel_klist, kn, kn_selnext);
	mutex_exit(lock);

	return (0);
}
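
/*
 * Illustrative example: the knobs below appear under the kern.pipe
 * sysctl node, e.g. from userland:
 *
 *	sysctl kern.pipe.maxbigpipes
 *	sysctl -w kern.pipe.maxbigpipes=64
 */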

/*
 * Handle pipe sysctls.
 */
SYSCTL_SETUP(sysctl_kern_pipe_setup, "sysctl kern.pipe subtree setup")
{

	sysctl_createv(clog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_NODE, "pipe",
	    SYSCTL_DESCR("Pipe settings"),
	    NULL, 0, NULL, 0,
	    CTL_KERN, KERN_PIPE, CTL_EOL);

	sysctl_createv(clog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
	    CTLTYPE_INT, "maxkvasz",
	    SYSCTL_DESCR("Maximum amount of kernel memory to be "
		"used for pipes"),
	    NULL, 0, &maxpipekva, 0,
	    CTL_KERN, KERN_PIPE, KERN_PIPE_MAXKVASZ, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
	    CTLTYPE_INT, "maxloankvasz",
	    SYSCTL_DESCR("Limit for direct transfers via page loan"),
	    NULL, 0, &limitpipekva, 0,
	    CTL_KERN, KERN_PIPE, KERN_PIPE_LIMITKVA, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
	    CTLTYPE_INT, "maxbigpipes",
	    SYSCTL_DESCR("Maximum number of \"big\" pipes"),
	    NULL, 0, &maxbigpipes, 0,
	    CTL_KERN, KERN_PIPE, KERN_PIPE_MAXBIGPIPES, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_INT, "nbigpipes",
	    SYSCTL_DESCR("Number of \"big\" pipes"),
	    NULL, 0, &nbigpipe, 0,
	    CTL_KERN, KERN_PIPE, KERN_PIPE_NBIGPIPES, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_INT, "kvasize",
	    SYSCTL_DESCR("Amount of kernel memory consumed by pipe "
		"buffers"),
	    NULL, 0, &amountpipekva, 0,
	    CTL_KERN, KERN_PIPE, KERN_PIPE_KVASIZE, CTL_EOL);
}