1 /* $NetBSD: sys_pipe.c,v 1.98 2008/03/01 14:16:51 rmind Exp $ */ 2 3 /*- 4 * Copyright (c) 2003, 2007, 2008 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Paul Kranenburg, and by Andrew Doran. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. All advertising materials mentioning features or use of this software 19 * must display the following acknowledgement: 20 * This product includes software developed by the NetBSD 21 * Foundation, Inc. and its contributors. 22 * 4. Neither the name of The NetBSD Foundation nor the names of its 23 * contributors may be used to endorse or promote products derived 24 * from this software without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 27 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 28 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 29 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 30 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 31 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 32 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 33 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 34 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 35 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 36 * POSSIBILITY OF SUCH DAMAGE. 37 */ 38 39 /* 40 * Copyright (c) 1996 John S. Dyson 41 * All rights reserved. 42 * 43 * Redistribution and use in source and binary forms, with or without 44 * modification, are permitted provided that the following conditions 45 * are met: 46 * 1. Redistributions of source code must retain the above copyright 47 * notice immediately at the beginning of the file, without modification, 48 * this list of conditions, and the following disclaimer. 49 * 2. Redistributions in binary form must reproduce the above copyright 50 * notice, this list of conditions and the following disclaimer in the 51 * documentation and/or other materials provided with the distribution. 52 * 3. Absolutely no warranty of function or purpose is made by the author 53 * John S. Dyson. 54 * 4. Modifications may be freely made to this file if the above conditions 55 * are met. 56 * 57 * $FreeBSD: src/sys/kern/sys_pipe.c,v 1.95 2002/03/09 22:06:31 alfred Exp $ 58 */ 59 60 /* 61 * This file contains a high-performance replacement for the socket-based 62 * pipes scheme originally used in FreeBSD/4.4Lite. It does not support 63 * all features of sockets, but does do everything that pipes normally 64 * do. 65 * 66 * Adaption for NetBSD UVM, including uvm_loan() based direct write, was 67 * written by Jaromir Dolecek. 68 */ 69 70 /* 71 * This code has two modes of operation, a small write mode and a large 72 * write mode. The small write mode acts like conventional pipes with 73 * a kernel buffer. 
If the buffer is less than PIPE_MINDIRECT, then the
 * "normal" pipe buffering is done.  If the buffer is between PIPE_MINDIRECT
 * and PIPE_SIZE in size it is mapped read-only into the kernel address space
 * using the UVM page loan facility from where the receiving process can copy
 * the data directly from the pages in the sending process.
 *
 * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
 * happen for small transfers so that the system will not spend all of
 * its time context switching.  PIPE_SIZE is constrained by the
 * amount of kernel virtual memory.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sys_pipe.c,v 1.98 2008/03/01 14:16:51 rmind Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/kernel.h>
#include <sys/ttycom.h>
#include <sys/stat.h>
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/signalvar.h>
#include <sys/vnode.h>
#include <sys/uio.h>
#include <sys/select.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/kauth.h>
#include <sys/atomic.h>
#include <sys/pipe.h>

#include <uvm/uvm.h>

/*
 * Use this define if you want to disable *fancy* VM things.  Expect an
 * approx 30% decrease in transfer rate.
 */
/* #define PIPE_NODIRECT */

/*
 * Interfaces to the outside world.
 */
static int	pipe_read(struct file *fp, off_t *offset, struct uio *uio,
		    kauth_cred_t cred, int flags);
static int	pipe_write(struct file *fp, off_t *offset, struct uio *uio,
		    kauth_cred_t cred, int flags);
static int	pipe_close(struct file *fp, struct lwp *l);
static int	pipe_poll(struct file *fp, int events, struct lwp *l);
static int	pipe_kqfilter(struct file *fp, struct knote *kn);
static int	pipe_stat(struct file *fp, struct stat *sb, struct lwp *l);
static int	pipe_ioctl(struct file *fp, u_long cmd, void *data,
		    struct lwp *l);

static const struct fileops pipeops = {
	pipe_read, pipe_write, pipe_ioctl, fnullop_fcntl, pipe_poll,
	pipe_stat, pipe_close, pipe_kqfilter
};

/*
 * Single mutex shared between both ends of the pipe.
 */

struct pipe_mutex {
	kmutex_t	pm_mutex;
	u_int		pm_refcnt;
};

/*
 * Default pipe buffer size(s); this can be kind-of large now because pipe
 * space is pageable.  The pipe code will try to maintain locality of
 * reference for performance reasons, so small amounts of outstanding I/O
 * will not wipe the cache.
 */
#define MINPIPESIZE (PIPE_SIZE/3)
#define MAXPIPESIZE (2*PIPE_SIZE/3)

/*
 * Maximum amount of KVA for pipes -- this is kind-of a soft limit, but
 * is there so that on large systems, we don't exhaust it.
 */
#define MAXPIPEKVA (8*1024*1024)
static u_int maxpipekva = MAXPIPEKVA;

/*
 * Limit for direct transfers; we cannot, of course, limit
 * the amount of KVA for pipes in general, though.
 */
#define LIMITPIPEKVA (16*1024*1024)
static u_int limitpipekva = LIMITPIPEKVA;

/*
 * Limit the number of "big" pipes.
 */
#define LIMITBIGPIPES  32
static u_int maxbigpipes = LIMITBIGPIPES;
static u_int nbigpipe = 0;
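/*
 * Editorial note (illustrative, not from the original sources): assuming the
 * usual PIPE_SIZE of 16384 bytes from <sys/pipe.h>, the thresholds above work
 * out to MINPIPESIZE ~= 5461 bytes and MAXPIPESIZE ~= 10922 bytes.  A write
 * is only considered for the uvm_loan() based direct path when the current
 * iovec is at least PIPE_MINDIRECT bytes long, the descriptor is blocking,
 * and either loan KVA is already set up or amountpipekva is still below
 * limitpipekva (16 MB here); see pipe_write() below.
 */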
/*
 * Amount of KVA consumed by pipe buffers.
 */
static u_int amountpipekva = 0;

MALLOC_DEFINE(M_PIPE, "pipe", "Pipe structures");

static void	pipeclose(struct file *fp, struct pipe *pipe);
static void	pipe_free_kmem(struct pipe *pipe);
static int	pipe_create(struct pipe **pipep, int allockva, struct pipe_mutex *);
static int	pipelock(struct pipe *pipe, int catch);
static inline void pipeunlock(struct pipe *pipe);
static void	pipeselwakeup(struct pipe *pipe, struct pipe *sigp, int code);
#ifndef PIPE_NODIRECT
static int	pipe_direct_write(struct file *fp, struct pipe *wpipe,
		    struct uio *uio);
#endif
static int	pipespace(struct pipe *pipe, int size);

#ifndef PIPE_NODIRECT
static int	pipe_loan_alloc(struct pipe *, int);
static void	pipe_loan_free(struct pipe *);
#endif /* PIPE_NODIRECT */

static int	pipe_mutex_ctor(void *, void *, int);
static void	pipe_mutex_dtor(void *, void *);

static pool_cache_t	pipe_cache;
static pool_cache_t	pipe_mutex_cache;

void
pipe_init(void)
{
	size_t size;

	pipe_cache = pool_cache_init(sizeof(struct pipe), 0, 0, 0, "pipepl",
	    NULL, IPL_NONE, NULL, NULL, NULL);
	KASSERT(pipe_cache != NULL);

	/* Round the mutex wrapper up to a whole cache line. */
	size = (sizeof(struct pipe_mutex) + (CACHE_LINE_SIZE - 1)) &
	    ~(CACHE_LINE_SIZE - 1);
	pipe_mutex_cache = pool_cache_init(size, CACHE_LINE_SIZE,
	    0, 0, "pipemtxpl", NULL, IPL_NONE, pipe_mutex_ctor,
	    pipe_mutex_dtor, NULL);
	KASSERT(pipe_mutex_cache != NULL);
}

static int
pipe_mutex_ctor(void *arg, void *obj, int flag)
{
	struct pipe_mutex *pm = obj;

	mutex_init(&pm->pm_mutex, MUTEX_DEFAULT, IPL_NONE);
	pm->pm_refcnt = 0;

	return 0;
}

static void
pipe_mutex_dtor(void *arg, void *obj)
{
	struct pipe_mutex *pm = obj;

	KASSERT(pm->pm_refcnt == 0);

	mutex_destroy(&pm->pm_mutex);
}

/*
 * The pipe system call for the DTYPE_PIPE type of pipes.
 */

/* ARGSUSED */
int
sys_pipe(struct lwp *l, const void *v, register_t *retval)
{
	struct file *rf, *wf;
	struct pipe *rpipe, *wpipe;
	struct pipe_mutex *mutex;
	int fd, error;

	rpipe = wpipe = NULL;
	mutex = pool_cache_get(pipe_mutex_cache, PR_WAITOK);
	if (mutex == NULL)
		return (ENOMEM);
	if (pipe_create(&rpipe, 1, mutex) || pipe_create(&wpipe, 0, mutex)) {
		pipeclose(NULL, rpipe);
		pipeclose(NULL, wpipe);
		return (ENFILE);
	}

	/*
	 * Note: the file structure returned from falloc() is marked
	 * as 'larval' initially.  Unless we mark it as 'mature' by
	 * FILE_SET_MATURE(), any attempt to do anything with it would
	 * return EBADF, including e.g. dup(2) or close(2).  This avoids
	 * file descriptor races if we block in the second falloc().
	 */

	error = falloc(l, &rf, &fd);
	if (error)
		goto free2;
	retval[0] = fd;
	rf->f_flag = FREAD;
	rf->f_type = DTYPE_PIPE;
	rf->f_data = (void *)rpipe;
	rf->f_ops = &pipeops;

	error = falloc(l, &wf, &fd);
	if (error)
		goto free3;
	retval[1] = fd;
	wf->f_flag = FWRITE;
	wf->f_type = DTYPE_PIPE;
	wf->f_data = (void *)wpipe;
	wf->f_ops = &pipeops;

	rpipe->pipe_peer = wpipe;
	wpipe->pipe_peer = rpipe;

	FILE_SET_MATURE(rf);
	FILE_SET_MATURE(wf);
	FILE_UNUSE(rf, l);
	FILE_UNUSE(wf, l);
	return (0);
free3:
	FILE_UNUSE(rf, l);
	ffree(rf);
	fdremove(l->l_proc->p_fd, retval[0]);
free2:
	pipeclose(NULL, wpipe);
	pipeclose(NULL, rpipe);

	return (error);
}

/*
 * Allocate KVA for the pipe's circular buffer; the space is pageable.
 * This routine will 'realloc' the size of a pipe safely: if it fails,
 * it will retain the old buffer and return ENOMEM.
 */
static int
pipespace(struct pipe *pipe, int size)
{
	void *buffer;

	/*
	 * Allocate pageable virtual address space.  Physical memory is
	 * allocated on demand.
	 */
	buffer = (void *) uvm_km_alloc(kernel_map, round_page(size), 0,
	    UVM_KMF_PAGEABLE);
	if (buffer == NULL)
		return (ENOMEM);

	/* Free old resources if we're resizing. */
	pipe_free_kmem(pipe);
	pipe->pipe_buffer.buffer = buffer;
	pipe->pipe_buffer.size = size;
	pipe->pipe_buffer.in = 0;
	pipe->pipe_buffer.out = 0;
	pipe->pipe_buffer.cnt = 0;
	atomic_add_int(&amountpipekva, pipe->pipe_buffer.size);
	return (0);
}

/*
 * Initialize and allocate VM and memory for pipe.
 */
static int
pipe_create(struct pipe **pipep, int allockva, struct pipe_mutex *mutex)
{
	struct pipe *pipe;
	int error;

	pipe = *pipep = pool_cache_get(pipe_cache, PR_WAITOK);
	mutex->pm_refcnt++;

	/* Initialize */
	memset(pipe, 0, sizeof(struct pipe));
	pipe->pipe_state = PIPE_SIGNALR;

	getmicrotime(&pipe->pipe_ctime);
	pipe->pipe_atime = pipe->pipe_ctime;
	pipe->pipe_mtime = pipe->pipe_ctime;
	pipe->pipe_lock = &mutex->pm_mutex;
	cv_init(&pipe->pipe_rcv, "piperd");
	cv_init(&pipe->pipe_wcv, "pipewr");
	cv_init(&pipe->pipe_draincv, "pipedrain");
	cv_init(&pipe->pipe_lkcv, "pipelk");
	selinit(&pipe->pipe_sel);

	if (allockva && (error = pipespace(pipe, PIPE_SIZE)))
		return (error);

	return (0);
}

/*
 * Lock a pipe for exclusive I/O, blocking other accessors.
 * Called with the pipe mutex held; returns with the mutex still held
 * and the long-term (PIPE_LOCKFL) lock acquired on success.
 */
static int
pipelock(struct pipe *pipe, int catch)
{
	int error;

	KASSERT(mutex_owned(pipe->pipe_lock));

	while (pipe->pipe_state & PIPE_LOCKFL) {
		pipe->pipe_state |= PIPE_LWANT;
		if (catch) {
			error = cv_wait_sig(&pipe->pipe_lkcv, pipe->pipe_lock);
			if (error != 0)
				return error;
		} else
			cv_wait(&pipe->pipe_lkcv, pipe->pipe_lock);
	}

	pipe->pipe_state |= PIPE_LOCKFL;

	return 0;
}

/*
 * Unlock a pipe I/O lock.
 */
static inline void
pipeunlock(struct pipe *pipe)
{

	KASSERT(pipe->pipe_state & PIPE_LOCKFL);

	pipe->pipe_state &= ~PIPE_LOCKFL;
	if (pipe->pipe_state & PIPE_LWANT) {
		pipe->pipe_state &= ~PIPE_LWANT;
		cv_broadcast(&pipe->pipe_lkcv);
	}
}

/*
 * Select/poll wakeup.
This also sends SIGIO to peer connected to 421 * 'sigpipe' side of pipe. 422 */ 423 static void 424 pipeselwakeup(struct pipe *selp, struct pipe *sigp, int code) 425 { 426 int band; 427 428 switch (code) { 429 case POLL_IN: 430 band = POLLIN|POLLRDNORM; 431 break; 432 case POLL_OUT: 433 band = POLLOUT|POLLWRNORM; 434 break; 435 case POLL_HUP: 436 band = POLLHUP; 437 break; 438 #if POLL_HUP != POLL_ERR 439 case POLL_ERR: 440 band = POLLERR; 441 break; 442 #endif 443 default: 444 band = 0; 445 #ifdef DIAGNOSTIC 446 printf("bad siginfo code %d in pipe notification.\n", code); 447 #endif 448 break; 449 } 450 451 selnotify(&selp->pipe_sel, band, NOTE_SUBMIT); 452 453 if (sigp == NULL || (sigp->pipe_state & PIPE_ASYNC) == 0) 454 return; 455 456 fownsignal(sigp->pipe_pgid, SIGIO, code, band, selp); 457 } 458 459 /* ARGSUSED */ 460 static int 461 pipe_read(struct file *fp, off_t *offset, struct uio *uio, kauth_cred_t cred, 462 int flags) 463 { 464 struct pipe *rpipe = (struct pipe *) fp->f_data; 465 struct pipebuf *bp = &rpipe->pipe_buffer; 466 kmutex_t *lock = rpipe->pipe_lock; 467 int error; 468 size_t nread = 0; 469 size_t size; 470 size_t ocnt; 471 472 mutex_enter(lock); 473 ++rpipe->pipe_busy; 474 ocnt = bp->cnt; 475 476 again: 477 error = pipelock(rpipe, 1); 478 if (error) 479 goto unlocked_error; 480 481 while (uio->uio_resid) { 482 /* 483 * normal pipe buffer receive 484 */ 485 if (bp->cnt > 0) { 486 size = bp->size - bp->out; 487 if (size > bp->cnt) 488 size = bp->cnt; 489 if (size > uio->uio_resid) 490 size = uio->uio_resid; 491 492 mutex_exit(lock); 493 error = uiomove((char *)bp->buffer + bp->out, size, uio); 494 mutex_enter(lock); 495 if (error) 496 break; 497 498 bp->out += size; 499 if (bp->out >= bp->size) 500 bp->out = 0; 501 502 bp->cnt -= size; 503 504 /* 505 * If there is no more to read in the pipe, reset 506 * its pointers to the beginning. This improves 507 * cache hit stats. 508 */ 509 if (bp->cnt == 0) { 510 bp->in = 0; 511 bp->out = 0; 512 } 513 nread += size; 514 continue; 515 } 516 517 #ifndef PIPE_NODIRECT 518 if ((rpipe->pipe_state & PIPE_DIRECTR) != 0) { 519 /* 520 * Direct copy, bypassing a kernel buffer. 521 */ 522 void * va; 523 524 KASSERT(rpipe->pipe_state & PIPE_DIRECTW); 525 526 size = rpipe->pipe_map.cnt; 527 if (size > uio->uio_resid) 528 size = uio->uio_resid; 529 530 va = (char *)rpipe->pipe_map.kva + rpipe->pipe_map.pos; 531 mutex_exit(lock); 532 error = uiomove(va, size, uio); 533 mutex_enter(lock); 534 if (error) 535 break; 536 nread += size; 537 rpipe->pipe_map.pos += size; 538 rpipe->pipe_map.cnt -= size; 539 if (rpipe->pipe_map.cnt == 0) { 540 rpipe->pipe_state &= ~PIPE_DIRECTR; 541 cv_broadcast(&rpipe->pipe_wcv); 542 } 543 continue; 544 } 545 #endif 546 /* 547 * Break if some data was read. 548 */ 549 if (nread > 0) 550 break; 551 552 /* 553 * detect EOF condition 554 * read returns 0 on EOF, no need to set error 555 */ 556 if (rpipe->pipe_state & PIPE_EOF) 557 break; 558 559 /* 560 * don't block on non-blocking I/O 561 */ 562 if (fp->f_flag & FNONBLOCK) { 563 error = EAGAIN; 564 break; 565 } 566 567 /* 568 * Unlock the pipe buffer for our remaining processing. 569 * We will either break out with an error or we will 570 * sleep and relock to loop. 571 */ 572 pipeunlock(rpipe); 573 574 /* 575 * Re-check to see if more direct writes are pending. 576 */ 577 if ((rpipe->pipe_state & PIPE_DIRECTR) != 0) 578 goto again; 579 580 /* 581 * We want to read more, wake up select/poll. 
582 */ 583 pipeselwakeup(rpipe, rpipe->pipe_peer, POLL_IN); 584 585 /* 586 * If the "write-side" is blocked, wake it up now. 587 */ 588 cv_broadcast(&rpipe->pipe_wcv); 589 590 /* Now wait until the pipe is filled */ 591 error = cv_wait_sig(&rpipe->pipe_rcv, lock); 592 if (error != 0) 593 goto unlocked_error; 594 goto again; 595 } 596 597 if (error == 0) 598 getmicrotime(&rpipe->pipe_atime); 599 pipeunlock(rpipe); 600 601 unlocked_error: 602 --rpipe->pipe_busy; 603 if (rpipe->pipe_busy == 0) { 604 cv_broadcast(&rpipe->pipe_draincv); 605 } 606 if (bp->cnt < MINPIPESIZE) { 607 cv_broadcast(&rpipe->pipe_wcv); 608 } 609 610 /* 611 * If anything was read off the buffer, signal to the writer it's 612 * possible to write more data. Also send signal if we are here for the 613 * first time after last write. 614 */ 615 if ((bp->size - bp->cnt) >= PIPE_BUF 616 && (ocnt != bp->cnt || (rpipe->pipe_state & PIPE_SIGNALR))) { 617 pipeselwakeup(rpipe, rpipe->pipe_peer, POLL_OUT); 618 rpipe->pipe_state &= ~PIPE_SIGNALR; 619 } 620 621 mutex_exit(lock); 622 return (error); 623 } 624 625 #ifndef PIPE_NODIRECT 626 /* 627 * Allocate structure for loan transfer. 628 */ 629 static int 630 pipe_loan_alloc(struct pipe *wpipe, int npages) 631 { 632 vsize_t len; 633 634 len = (vsize_t)npages << PAGE_SHIFT; 635 atomic_add_int(&amountpipekva, len); 636 wpipe->pipe_map.kva = uvm_km_alloc(kernel_map, len, 0, 637 UVM_KMF_VAONLY | UVM_KMF_WAITVA); 638 if (wpipe->pipe_map.kva == 0) { 639 atomic_add_int(&amountpipekva, -len); 640 return (ENOMEM); 641 } 642 643 wpipe->pipe_map.npages = npages; 644 wpipe->pipe_map.pgs = malloc(npages * sizeof(struct vm_page *), M_PIPE, 645 M_WAITOK); 646 return (0); 647 } 648 649 /* 650 * Free resources allocated for loan transfer. 651 */ 652 static void 653 pipe_loan_free(struct pipe *wpipe) 654 { 655 vsize_t len; 656 657 len = (vsize_t)wpipe->pipe_map.npages << PAGE_SHIFT; 658 uvm_km_free(kernel_map, wpipe->pipe_map.kva, len, UVM_KMF_VAONLY); 659 wpipe->pipe_map.kva = 0; 660 atomic_add_int(&amountpipekva, -len); 661 free(wpipe->pipe_map.pgs, M_PIPE); 662 wpipe->pipe_map.pgs = NULL; 663 } 664 665 /* 666 * NetBSD direct write, using uvm_loan() mechanism. 667 * This implements the pipe buffer write mechanism. Note that only 668 * a direct write OR a normal pipe write can be pending at any given time. 669 * If there are any characters in the pipe buffer, the direct write will 670 * be deferred until the receiving process grabs all of the bytes from 671 * the pipe buffer. Then the direct mapping write is set-up. 672 * 673 * Called with the long-term pipe lock held. 674 */ 675 static int 676 pipe_direct_write(struct file *fp, struct pipe *wpipe, struct uio *uio) 677 { 678 int error, npages, j; 679 struct vm_page **pgs; 680 vaddr_t bbase, kva, base, bend; 681 vsize_t blen, bcnt; 682 voff_t bpos; 683 kmutex_t *lock = wpipe->pipe_lock; 684 685 KASSERT(mutex_owned(wpipe->pipe_lock)); 686 KASSERT(wpipe->pipe_map.cnt == 0); 687 688 mutex_exit(lock); 689 690 /* 691 * Handle first PIPE_CHUNK_SIZE bytes of buffer. Deal with buffers 692 * not aligned to PAGE_SIZE. 
693 */ 694 bbase = (vaddr_t)uio->uio_iov->iov_base; 695 base = trunc_page(bbase); 696 bend = round_page(bbase + uio->uio_iov->iov_len); 697 blen = bend - base; 698 bpos = bbase - base; 699 700 if (blen > PIPE_DIRECT_CHUNK) { 701 blen = PIPE_DIRECT_CHUNK; 702 bend = base + blen; 703 bcnt = PIPE_DIRECT_CHUNK - bpos; 704 } else { 705 bcnt = uio->uio_iov->iov_len; 706 } 707 npages = blen >> PAGE_SHIFT; 708 709 /* 710 * Free the old kva if we need more pages than we have 711 * allocated. 712 */ 713 if (wpipe->pipe_map.kva != 0 && npages > wpipe->pipe_map.npages) 714 pipe_loan_free(wpipe); 715 716 /* Allocate new kva. */ 717 if (wpipe->pipe_map.kva == 0) { 718 error = pipe_loan_alloc(wpipe, npages); 719 if (error) { 720 mutex_enter(lock); 721 return (error); 722 } 723 } 724 725 /* Loan the write buffer memory from writer process */ 726 pgs = wpipe->pipe_map.pgs; 727 error = uvm_loan(&uio->uio_vmspace->vm_map, base, blen, 728 pgs, UVM_LOAN_TOPAGE); 729 if (error) { 730 pipe_loan_free(wpipe); 731 mutex_enter(lock); 732 return (ENOMEM); /* so that caller fallback to ordinary write */ 733 } 734 735 /* Enter the loaned pages to kva */ 736 kva = wpipe->pipe_map.kva; 737 for (j = 0; j < npages; j++, kva += PAGE_SIZE) { 738 pmap_kenter_pa(kva, VM_PAGE_TO_PHYS(pgs[j]), VM_PROT_READ); 739 } 740 pmap_update(pmap_kernel()); 741 742 /* Now we can put the pipe in direct write mode */ 743 wpipe->pipe_map.pos = bpos; 744 wpipe->pipe_map.cnt = bcnt; 745 746 /* 747 * But before we can let someone do a direct read, we 748 * have to wait until the pipe is drained. Release the 749 * pipe lock while we wait. 750 */ 751 mutex_enter(lock); 752 wpipe->pipe_state |= PIPE_DIRECTW; 753 pipeunlock(wpipe); 754 755 while (error == 0 && wpipe->pipe_buffer.cnt > 0) { 756 cv_broadcast(&wpipe->pipe_rcv); 757 error = cv_wait_sig(&wpipe->pipe_wcv, lock); 758 if (error == 0 && wpipe->pipe_state & PIPE_EOF) 759 error = EPIPE; 760 } 761 762 /* Pipe is drained; next read will off the direct buffer */ 763 wpipe->pipe_state |= PIPE_DIRECTR; 764 765 /* Wait until the reader is done */ 766 while (error == 0 && (wpipe->pipe_state & PIPE_DIRECTR)) { 767 cv_broadcast(&wpipe->pipe_rcv); 768 pipeselwakeup(wpipe, wpipe, POLL_IN); 769 error = cv_wait_sig(&wpipe->pipe_wcv, lock); 770 if (error == 0 && wpipe->pipe_state & PIPE_EOF) 771 error = EPIPE; 772 } 773 774 /* Take pipe out of direct write mode */ 775 wpipe->pipe_state &= ~(PIPE_DIRECTW | PIPE_DIRECTR); 776 777 /* Acquire the pipe lock and cleanup */ 778 (void)pipelock(wpipe, 0); 779 mutex_exit(lock); 780 781 if (pgs != NULL) { 782 pmap_kremove(wpipe->pipe_map.kva, blen); 783 pmap_update(pmap_kernel()); 784 uvm_unloan(pgs, npages, UVM_LOAN_TOPAGE); 785 } 786 if (error || amountpipekva > maxpipekva) 787 pipe_loan_free(wpipe); 788 789 mutex_enter(lock); 790 if (error) { 791 pipeselwakeup(wpipe, wpipe, POLL_ERR); 792 793 /* 794 * If nothing was read from what we offered, return error 795 * straight on. Otherwise update uio resid first. Caller 796 * will deal with the error condition, returning short 797 * write, error, or restarting the write(2) as appropriate. 
798 */ 799 if (wpipe->pipe_map.cnt == bcnt) { 800 wpipe->pipe_map.cnt = 0; 801 cv_broadcast(&wpipe->pipe_wcv); 802 return (error); 803 } 804 805 bcnt -= wpipe->pipe_map.cnt; 806 } 807 808 uio->uio_resid -= bcnt; 809 /* uio_offset not updated, not set/used for write(2) */ 810 uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + bcnt; 811 uio->uio_iov->iov_len -= bcnt; 812 if (uio->uio_iov->iov_len == 0) { 813 uio->uio_iov++; 814 uio->uio_iovcnt--; 815 } 816 817 wpipe->pipe_map.cnt = 0; 818 return (error); 819 } 820 #endif /* !PIPE_NODIRECT */ 821 822 static int 823 pipe_write(struct file *fp, off_t *offset, struct uio *uio, kauth_cred_t cred, 824 int flags) 825 { 826 struct pipe *wpipe, *rpipe; 827 struct pipebuf *bp; 828 kmutex_t *lock; 829 int error; 830 831 /* We want to write to our peer */ 832 rpipe = (struct pipe *) fp->f_data; 833 lock = rpipe->pipe_lock; 834 error = 0; 835 836 mutex_enter(lock); 837 wpipe = rpipe->pipe_peer; 838 839 /* 840 * Detect loss of pipe read side, issue SIGPIPE if lost. 841 */ 842 if (wpipe == NULL || (wpipe->pipe_state & PIPE_EOF) != 0) { 843 mutex_exit(lock); 844 return EPIPE; 845 } 846 ++wpipe->pipe_busy; 847 848 /* Aquire the long-term pipe lock */ 849 if ((error = pipelock(wpipe, 1)) != 0) { 850 --wpipe->pipe_busy; 851 if (wpipe->pipe_busy == 0) { 852 cv_broadcast(&wpipe->pipe_draincv); 853 } 854 mutex_exit(lock); 855 return (error); 856 } 857 858 bp = &wpipe->pipe_buffer; 859 860 /* 861 * If it is advantageous to resize the pipe buffer, do so. 862 */ 863 if ((uio->uio_resid > PIPE_SIZE) && 864 (nbigpipe < maxbigpipes) && 865 #ifndef PIPE_NODIRECT 866 (wpipe->pipe_state & PIPE_DIRECTW) == 0 && 867 #endif 868 (bp->size <= PIPE_SIZE) && (bp->cnt == 0)) { 869 870 if (pipespace(wpipe, BIG_PIPE_SIZE) == 0) 871 atomic_inc_uint(&nbigpipe); 872 } 873 874 while (uio->uio_resid) { 875 size_t space; 876 877 #ifndef PIPE_NODIRECT 878 /* 879 * Pipe buffered writes cannot be coincidental with 880 * direct writes. Also, only one direct write can be 881 * in progress at any one time. We wait until the currently 882 * executing direct write is completed before continuing. 883 * 884 * We break out if a signal occurs or the reader goes away. 885 */ 886 while (error == 0 && wpipe->pipe_state & PIPE_DIRECTW) { 887 cv_broadcast(&wpipe->pipe_rcv); 888 pipeunlock(wpipe); 889 error = cv_wait_sig(&wpipe->pipe_wcv, lock); 890 (void)pipelock(wpipe, 0); 891 if (wpipe->pipe_state & PIPE_EOF) 892 error = EPIPE; 893 } 894 if (error) 895 break; 896 897 /* 898 * If the transfer is large, we can gain performance if 899 * we do process-to-process copies directly. 900 * If the write is non-blocking, we don't use the 901 * direct write mechanism. 902 * 903 * The direct write mechanism will detect the reader going 904 * away on us. 905 */ 906 if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) && 907 (fp->f_flag & FNONBLOCK) == 0 && 908 (wpipe->pipe_map.kva || (amountpipekva < limitpipekva))) { 909 error = pipe_direct_write(fp, wpipe, uio); 910 911 /* 912 * Break out if error occurred, unless it's ENOMEM. 913 * ENOMEM means we failed to allocate some resources 914 * for direct write, so we just fallback to ordinary 915 * write. If the direct write was successful, 916 * process rest of data via ordinary write. 917 */ 918 if (error == 0) 919 continue; 920 921 if (error != ENOMEM) 922 break; 923 } 924 #endif /* PIPE_NODIRECT */ 925 926 space = bp->size - bp->cnt; 927 928 /* Writes of size <= PIPE_BUF must be atomic. 
*/ 929 if ((space < uio->uio_resid) && (uio->uio_resid <= PIPE_BUF)) 930 space = 0; 931 932 if (space > 0) { 933 int size; /* Transfer size */ 934 int segsize; /* first segment to transfer */ 935 936 /* 937 * Transfer size is minimum of uio transfer 938 * and free space in pipe buffer. 939 */ 940 if (space > uio->uio_resid) 941 size = uio->uio_resid; 942 else 943 size = space; 944 /* 945 * First segment to transfer is minimum of 946 * transfer size and contiguous space in 947 * pipe buffer. If first segment to transfer 948 * is less than the transfer size, we've got 949 * a wraparound in the buffer. 950 */ 951 segsize = bp->size - bp->in; 952 if (segsize > size) 953 segsize = size; 954 955 /* Transfer first segment */ 956 mutex_exit(lock); 957 error = uiomove((char *)bp->buffer + bp->in, segsize, 958 uio); 959 960 if (error == 0 && segsize < size) { 961 /* 962 * Transfer remaining part now, to 963 * support atomic writes. Wraparound 964 * happened. 965 */ 966 #ifdef DEBUG 967 if (bp->in + segsize != bp->size) 968 panic("Expected pipe buffer wraparound disappeared"); 969 #endif 970 971 error = uiomove(bp->buffer, 972 size - segsize, uio); 973 } 974 mutex_enter(lock); 975 if (error) 976 break; 977 978 bp->in += size; 979 if (bp->in >= bp->size) { 980 #ifdef DEBUG 981 if (bp->in != size - segsize + bp->size) 982 panic("Expected wraparound bad"); 983 #endif 984 bp->in = size - segsize; 985 } 986 987 bp->cnt += size; 988 #ifdef DEBUG 989 if (bp->cnt > bp->size) 990 panic("Pipe buffer overflow"); 991 #endif 992 } else { 993 /* 994 * If the "read-side" has been blocked, wake it up now. 995 */ 996 cv_broadcast(&wpipe->pipe_rcv); 997 998 /* 999 * don't block on non-blocking I/O 1000 */ 1001 if (fp->f_flag & FNONBLOCK) { 1002 error = EAGAIN; 1003 break; 1004 } 1005 1006 /* 1007 * We have no more space and have something to offer, 1008 * wake up select/poll. 1009 */ 1010 if (bp->cnt) 1011 pipeselwakeup(wpipe, wpipe, POLL_OUT); 1012 1013 pipeunlock(wpipe); 1014 error = cv_wait_sig(&wpipe->pipe_wcv, lock); 1015 (void)pipelock(wpipe, 0); 1016 if (error != 0) 1017 break; 1018 /* 1019 * If read side wants to go away, we just issue a signal 1020 * to ourselves. 1021 */ 1022 if (wpipe->pipe_state & PIPE_EOF) { 1023 error = EPIPE; 1024 break; 1025 } 1026 } 1027 } 1028 1029 --wpipe->pipe_busy; 1030 if (wpipe->pipe_busy == 0) { 1031 cv_broadcast(&wpipe->pipe_draincv); 1032 } 1033 if (bp->cnt > 0) { 1034 cv_broadcast(&wpipe->pipe_rcv); 1035 } 1036 1037 /* 1038 * Don't return EPIPE if I/O was successful 1039 */ 1040 if (error == EPIPE && bp->cnt == 0 && uio->uio_resid == 0) 1041 error = 0; 1042 1043 if (error == 0) 1044 getmicrotime(&wpipe->pipe_mtime); 1045 1046 /* 1047 * We have something to offer, wake up select/poll. 1048 * wpipe->pipe_map.cnt is always 0 in this point (direct write 1049 * is only done synchronously), so check only wpipe->pipe_buffer.cnt 1050 */ 1051 if (bp->cnt) 1052 pipeselwakeup(wpipe, wpipe, POLL_OUT); 1053 1054 /* 1055 * Arrange for next read(2) to do a signal. 1056 */ 1057 wpipe->pipe_state |= PIPE_SIGNALR; 1058 1059 pipeunlock(wpipe); 1060 mutex_exit(lock); 1061 return (error); 1062 } 1063 1064 /* 1065 * we implement a very minimal set of ioctls for compatibility with sockets. 
 */
int
pipe_ioctl(struct file *fp, u_long cmd, void *data, struct lwp *l)
{
	struct pipe *pipe = (struct pipe *)fp->f_data;
	struct proc *p = l->l_proc;
	kmutex_t *lock = pipe->pipe_lock;

	switch (cmd) {

	case FIONBIO:
		return (0);

	case FIOASYNC:
		mutex_enter(lock);
		if (*(int *)data) {
			pipe->pipe_state |= PIPE_ASYNC;
		} else {
			pipe->pipe_state &= ~PIPE_ASYNC;
		}
		mutex_exit(lock);
		return (0);

	case FIONREAD:
		mutex_enter(lock);
#ifndef PIPE_NODIRECT
		if (pipe->pipe_state & PIPE_DIRECTW)
			*(int *)data = pipe->pipe_map.cnt;
		else
#endif
			*(int *)data = pipe->pipe_buffer.cnt;
		mutex_exit(lock);
		return (0);

	case FIONWRITE:
		/* Look at other side */
		mutex_enter(lock);
		pipe = pipe->pipe_peer;
		if (pipe == NULL) {
			/* Peer already closed; nothing is queued. */
			*(int *)data = 0;
			mutex_exit(lock);
			return (0);
		}
#ifndef PIPE_NODIRECT
		if (pipe->pipe_state & PIPE_DIRECTW)
			*(int *)data = pipe->pipe_map.cnt;
		else
#endif
			*(int *)data = pipe->pipe_buffer.cnt;
		mutex_exit(lock);
		return (0);

	case FIONSPACE:
		/* Look at other side */
		mutex_enter(lock);
		pipe = pipe->pipe_peer;
		if (pipe == NULL) {
			/* Peer already closed; no space to write into. */
			*(int *)data = 0;
			mutex_exit(lock);
			return (0);
		}
#ifndef PIPE_NODIRECT
		/*
		 * If we're in direct-mode, we don't really have a
		 * send queue, and any other write will block.  Thus
		 * zero seems like the best answer.
		 */
		if (pipe->pipe_state & PIPE_DIRECTW)
			*(int *)data = 0;
		else
#endif
			*(int *)data = pipe->pipe_buffer.size -
			    pipe->pipe_buffer.cnt;
		mutex_exit(lock);
		return (0);

	case TIOCSPGRP:
	case FIOSETOWN:
		return fsetown(p, &pipe->pipe_pgid, cmd, data);

	case TIOCGPGRP:
	case FIOGETOWN:
		return fgetown(p, pipe->pipe_pgid, cmd, data);

	}
	return (EPASSTHROUGH);
}

int
pipe_poll(struct file *fp, int events, struct lwp *l)
{
	struct pipe *rpipe = (struct pipe *)fp->f_data;
	struct pipe *wpipe;
	int eof = 0;
	int revents = 0;

	mutex_enter(rpipe->pipe_lock);
	wpipe = rpipe->pipe_peer;

	if (events & (POLLIN | POLLRDNORM))
		if ((rpipe->pipe_buffer.cnt > 0) ||
#ifndef PIPE_NODIRECT
		    (rpipe->pipe_state & PIPE_DIRECTR) ||
#endif
		    (rpipe->pipe_state & PIPE_EOF))
			revents |= events & (POLLIN | POLLRDNORM);

	eof |= (rpipe->pipe_state & PIPE_EOF);

	if (wpipe == NULL)
		revents |= events & (POLLOUT | POLLWRNORM);
	else {
		if (events & (POLLOUT | POLLWRNORM))
			if ((wpipe->pipe_state & PIPE_EOF) || (
#ifndef PIPE_NODIRECT
			    (wpipe->pipe_state & PIPE_DIRECTW) == 0 &&
#endif
			    (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF))
				revents |= events & (POLLOUT | POLLWRNORM);

		eof |= (wpipe->pipe_state & PIPE_EOF);
	}

	if (wpipe == NULL || eof)
		revents |= POLLHUP;

	if (revents == 0) {
		if (events & (POLLIN | POLLRDNORM))
			selrecord(l, &rpipe->pipe_sel);

		if (events & (POLLOUT | POLLWRNORM))
			selrecord(l, &wpipe->pipe_sel);
	}
	mutex_exit(rpipe->pipe_lock);

	return (revents);
}
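/*
 * Illustrative userland sketch (editorial addition, not compiled into the
 * kernel): fstat(2) on a pipe descriptor reports S_IFIFO and the number of
 * bytes currently buffered, which is what pipe_stat() below fills in.
 * The helper name pipe_backlog() is hypothetical.
 */
#if 0
#include <sys/types.h>
#include <sys/stat.h>

static off_t
pipe_backlog(int pfd)
{
	struct stat st;

	if (fstat(pfd, &st) == -1)
		return -1;
	/* st_size mirrors pipe_buffer.cnt; st_mode has S_IFIFO set. */
	return st.st_size;
}
#endif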
static int
pipe_stat(struct file *fp, struct stat *ub, struct lwp *l)
{
	struct pipe *pipe = (struct pipe *)fp->f_data;

	memset(ub, 0, sizeof(*ub));
	ub->st_mode = S_IFIFO | S_IRUSR | S_IWUSR;
	ub->st_blksize = pipe->pipe_buffer.size;
	if (ub->st_blksize == 0 && pipe->pipe_peer)
		ub->st_blksize = pipe->pipe_peer->pipe_buffer.size;
	ub->st_size = pipe->pipe_buffer.cnt;
	ub->st_blocks = (ub->st_size) ? 1 : 0;
	TIMEVAL_TO_TIMESPEC(&pipe->pipe_atime, &ub->st_atimespec);
	TIMEVAL_TO_TIMESPEC(&pipe->pipe_mtime, &ub->st_mtimespec);
	TIMEVAL_TO_TIMESPEC(&pipe->pipe_ctime, &ub->st_ctimespec);
	ub->st_uid = kauth_cred_geteuid(fp->f_cred);
	ub->st_gid = kauth_cred_getegid(fp->f_cred);

	/*
	 * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen.
	 * XXX (st_dev, st_ino) should be unique.
	 */
	return (0);
}

/* ARGSUSED */
static int
pipe_close(struct file *fp, struct lwp *l)
{
	struct pipe *pipe = (struct pipe *)fp->f_data;

	fp->f_data = NULL;
	pipeclose(fp, pipe);
	return (0);
}

static void
pipe_free_kmem(struct pipe *pipe)
{

	if (pipe->pipe_buffer.buffer != NULL) {
		if (pipe->pipe_buffer.size > PIPE_SIZE)
			atomic_dec_uint(&nbigpipe);
		uvm_km_free(kernel_map,
		    (vaddr_t)pipe->pipe_buffer.buffer,
		    pipe->pipe_buffer.size, UVM_KMF_PAGEABLE);
		atomic_add_int(&amountpipekva, -pipe->pipe_buffer.size);
		pipe->pipe_buffer.buffer = NULL;
	}
#ifndef PIPE_NODIRECT
	if (pipe->pipe_map.kva != 0) {
		pipe_loan_free(pipe);
		pipe->pipe_map.cnt = 0;
		pipe->pipe_map.kva = 0;
		pipe->pipe_map.pos = 0;
		pipe->pipe_map.npages = 0;
	}
#endif /* !PIPE_NODIRECT */
}

/*
 * Shut down one end of the pipe.
 */
static void
pipeclose(struct file *fp, struct pipe *pipe)
{
	struct pipe_mutex *mutex;
	kmutex_t *lock;
	struct pipe *ppipe;
	u_int refcnt;

	if (pipe == NULL)
		return;
	lock = pipe->pipe_lock;
	mutex_enter(lock);
	pipeselwakeup(pipe, pipe, POLL_HUP);

	/*
	 * If the other side is blocked, wake it up saying that
	 * we want to close it down.
	 */
	pipe->pipe_state |= PIPE_EOF;
	if (pipe->pipe_busy) {
		while (pipe->pipe_busy) {
			cv_broadcast(&pipe->pipe_wcv);
			cv_wait_sig(&pipe->pipe_draincv, lock);
		}
	}

	/*
	 * Disconnect from peer.
	 */
	if ((ppipe = pipe->pipe_peer) != NULL) {
		pipeselwakeup(ppipe, ppipe, POLL_HUP);
		ppipe->pipe_state |= PIPE_EOF;
		cv_broadcast(&ppipe->pipe_rcv);
		ppipe->pipe_peer = NULL;
	}

	KASSERT((pipe->pipe_state & PIPE_LOCKFL) == 0);

	mutex = (struct pipe_mutex *)lock;
	refcnt = --(mutex->pm_refcnt);
	KASSERT(refcnt == 0 || refcnt == 1);
	mutex_exit(lock);

	/*
	 * Free resources.
	 */
	pipe_free_kmem(pipe);
	cv_destroy(&pipe->pipe_rcv);
	cv_destroy(&pipe->pipe_wcv);
	cv_destroy(&pipe->pipe_draincv);
	cv_destroy(&pipe->pipe_lkcv);
	seldestroy(&pipe->pipe_sel);
	pool_cache_put(pipe_cache, pipe);
	if (refcnt == 0)
		pool_cache_put(pipe_mutex_cache, mutex);
}

static void
filt_pipedetach(struct knote *kn)
{
	struct pipe *pipe;
	kmutex_t *lock;

	pipe = (struct pipe *)kn->kn_fp->f_data;
	lock = pipe->pipe_lock;

	mutex_enter(lock);

	switch (kn->kn_filter) {
	case EVFILT_WRITE:
		/* Need the peer structure, not our own. */
		pipe = pipe->pipe_peer;

		/* If the reader end is already closed, just return. */
		if (pipe == NULL) {
			mutex_exit(lock);
			return;
		}

		break;
	default:
		/* Nothing to do. */
		break;
	}

#ifdef DIAGNOSTIC
	if (kn->kn_hook != pipe)
		panic("filt_pipedetach: inconsistent knote");
#endif

	SLIST_REMOVE(&pipe->pipe_sel.sel_klist, kn, knote, kn_selnext);
	mutex_exit(lock);
}

/*ARGSUSED*/
static int
filt_piperead(struct knote *kn, long hint)
{
	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
	struct pipe *wpipe;

	if ((hint & NOTE_SUBMIT) == 0) {
		mutex_enter(rpipe->pipe_lock);
	}
	wpipe = rpipe->pipe_peer;
	kn->kn_data = rpipe->pipe_buffer.cnt;

	if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW))
		kn->kn_data = rpipe->pipe_map.cnt;

	if ((rpipe->pipe_state & PIPE_EOF) ||
	    (wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
		kn->kn_flags |= EV_EOF;
		if ((hint & NOTE_SUBMIT) == 0) {
			mutex_exit(rpipe->pipe_lock);
		}
		return (1);
	}

	if ((hint & NOTE_SUBMIT) == 0) {
		mutex_exit(rpipe->pipe_lock);
	}
	return (kn->kn_data > 0);
}

/*ARGSUSED*/
static int
filt_pipewrite(struct knote *kn, long hint)
{
	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
	struct pipe *wpipe;

	if ((hint & NOTE_SUBMIT) == 0) {
		mutex_enter(rpipe->pipe_lock);
	}
	wpipe = rpipe->pipe_peer;

	if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
		kn->kn_data = 0;
		kn->kn_flags |= EV_EOF;
		if ((hint & NOTE_SUBMIT) == 0) {
			mutex_exit(rpipe->pipe_lock);
		}
		return (1);
	}
	kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
	if (wpipe->pipe_state & PIPE_DIRECTW)
		kn->kn_data = 0;

	if ((hint & NOTE_SUBMIT) == 0) {
		mutex_exit(rpipe->pipe_lock);
	}
	return (kn->kn_data >= PIPE_BUF);
}
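/*
 * Illustrative userland sketch (editorial addition, not compiled into the
 * kernel): how the EVFILT_READ filter implemented by filt_piperead() above
 * and registered through pipe_kqfilter() below is typically consumed.  The
 * helper name wait_for_pipe_data() is hypothetical; kevent(2) returns the
 * byte count computed in kn_data via the event's data field.
 */
#if 0
#include <sys/event.h>

static int
wait_for_pipe_data(int kq, int pfd)
{
	struct kevent kev;

	EV_SET(&kev, pfd, EVFILT_READ, EV_ADD, 0, 0, NULL);
	/* Register and wait for one event in a single kevent() call. */
	return kevent(kq, &kev, 1, &kev, 1, NULL);
}
#endif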
static const struct filterops pipe_rfiltops =
	{ 1, NULL, filt_pipedetach, filt_piperead };
static const struct filterops pipe_wfiltops =
	{ 1, NULL, filt_pipedetach, filt_pipewrite };

/*ARGSUSED*/
static int
pipe_kqfilter(struct file *fp, struct knote *kn)
{
	struct pipe *pipe;
	kmutex_t *lock;

	pipe = (struct pipe *)kn->kn_fp->f_data;
	lock = pipe->pipe_lock;

	mutex_enter(lock);

	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_fop = &pipe_rfiltops;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &pipe_wfiltops;
		pipe = pipe->pipe_peer;
		if (pipe == NULL) {
			/* Other end of pipe has been closed. */
			mutex_exit(lock);
			return (EBADF);
		}
		break;
	default:
		mutex_exit(lock);
		return (EINVAL);
	}

	kn->kn_hook = pipe;
	SLIST_INSERT_HEAD(&pipe->pipe_sel.sel_klist, kn, kn_selnext);
	mutex_exit(lock);

	return (0);
}

/*
 * Handle pipe sysctls.
 */
SYSCTL_SETUP(sysctl_kern_pipe_setup, "sysctl kern.pipe subtree setup")
{

	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_NODE, "kern", NULL,
		       NULL, 0, NULL, 0,
		       CTL_KERN, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_NODE, "pipe",
		       SYSCTL_DESCR("Pipe settings"),
		       NULL, 0, NULL, 0,
		       CTL_KERN, KERN_PIPE, CTL_EOL);

	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
		       CTLTYPE_INT, "maxkvasz",
		       SYSCTL_DESCR("Maximum amount of kernel memory to be "
				    "used for pipes"),
		       NULL, 0, &maxpipekva, 0,
		       CTL_KERN, KERN_PIPE, KERN_PIPE_MAXKVASZ, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
		       CTLTYPE_INT, "maxloankvasz",
		       SYSCTL_DESCR("Limit for direct transfers via page loan"),
		       NULL, 0, &limitpipekva, 0,
		       CTL_KERN, KERN_PIPE, KERN_PIPE_LIMITKVA, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
		       CTLTYPE_INT, "maxbigpipes",
		       SYSCTL_DESCR("Maximum number of \"big\" pipes"),
		       NULL, 0, &maxbigpipes, 0,
		       CTL_KERN, KERN_PIPE, KERN_PIPE_MAXBIGPIPES, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_INT, "nbigpipes",
		       SYSCTL_DESCR("Number of \"big\" pipes"),
		       NULL, 0, &nbigpipe, 0,
		       CTL_KERN, KERN_PIPE, KERN_PIPE_NBIGPIPES, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_INT, "kvasize",
		       SYSCTL_DESCR("Amount of kernel memory consumed by pipe "
				    "buffers"),
		       NULL, 0, &amountpipekva, 0,
		       CTL_KERN, KERN_PIPE, KERN_PIPE_KVASIZE, CTL_EOL);
}
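/*
 * Editorial usage note (illustrative, not from the original sources): the
 * knobs created above live under the kern.pipe sysctl node, so they can be
 * inspected or tuned from userland along these lines:
 *
 *	sysctl kern.pipe.kvasize
 *	sysctl -w kern.pipe.maxbigpipes=64
 *
 * The nodes created without CTLFLAG_READWRITE (nbigpipes, kvasize) only
 * report current usage.
 */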