/*	$NetBSD: sys_pipe.c,v 1.11 2001/07/26 14:14:28 jdolecek Exp $	*/

/*
 * Copyright (c) 1996 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 * 4. Modifications may be freely made to this file if the above conditions
 *    are met.
 *
 * $FreeBSD: src/sys/kern/sys_pipe.c,v 1.82 2001/06/15 20:45:01 jlemon Exp $
 */

/*
 * This file contains a high-performance replacement for the socket-based
 * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
 * all features of sockets, but does do everything that pipes normally
 * do.
 *
 * Adaptation for NetBSD UVM, including the uvm_loan() based direct write,
 * was written by Jaromir Dolecek.
 */

/*
 * This code has two modes of operation: a small write mode and a large
 * write mode.  The small write mode acts like conventional pipes with
 * a kernel buffer.  If the write is smaller than PIPE_MINDIRECT, the
 * "normal" pipe buffering is done.  If the write is between PIPE_MINDIRECT
 * and PIPE_SIZE in size, the writer's buffer is fully mapped into the
 * kernel (on FreeBSD, those pages are also wired), and the receiving
 * process can copy the data directly from the pages of the sending process.
 *
 * If the sending process receives a signal, it is possible that it will
 * go away, and certainly its address space can change, because control
 * is returned back to the user-mode side.  In that case, on FreeBSD the
 * pipe code arranges to copy the buffer supplied by the user process to
 * a pageable kernel buffer, and the receiving process will grab the data
 * from the pageable kernel buffer.  Since signals don't happen all that
 * often, the copy operation is normally eliminated.
 * On NetBSD, the pages are mapped read-only, copy-on-write for the kernel
 * by uvm_loan(), so no explicit handling needs to be done; everything is
 * handled by the standard VM facilities.
 *
 * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
 * happen for small transfers so that the system will not spend all of
 * its time context switching.  PIPE_SIZE is constrained by the
 * amount of kernel virtual memory.
 */
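
/*
 * Illustrative sketch of the mode selection (see pipe_write() below for
 * the authoritative conditions, which also check the pipe kva limits and
 * pending buffered data):
 *
 *	if (uio->uio_iov[0].iov_len >= PIPE_MINDIRECT &&
 *	    (fp->f_flag & FNONBLOCK) == 0)
 *		pipe_direct_write(wpipe, uio);	-- large: map/loan user pages
 *	else
 *		buffered write via uiomove() into wpipe->pipe_buffer
 */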

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/ttycom.h>
#include <sys/stat.h>
#include <sys/poll.h>
#include <sys/signalvar.h>
#include <sys/vnode.h>
#include <sys/uio.h>
#include <sys/lock.h>
#ifdef __FreeBSD__
#include <sys/mutex.h>
#include <sys/selinfo.h>
#include <sys/sysproto.h>
#elif defined(__NetBSD__)
#include <sys/select.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <uvm/uvm.h>
#include <sys/sysctl.h>
#endif /* NetBSD, FreeBSD */

#include <sys/pipe.h>

#ifdef __NetBSD__
#define vfs_timestamp(tv)	microtime(tv)
#endif

/*
 * Use this define if you want to disable *fancy* VM things.  Expect an
 * approx 30% decrease in transfer rate.  This could be useful for
 * OpenBSD.
 */
/* #define PIPE_NODIRECT */

/*
 * interfaces to the outside world
 */
#ifdef __FreeBSD__
static int pipe_read __P((struct file *fp, struct uio *uio,
		struct ucred *cred, int flags, struct proc *p));
static int pipe_write __P((struct file *fp, struct uio *uio,
		struct ucred *cred, int flags, struct proc *p));
static int pipe_close __P((struct file *fp, struct proc *p));
static int pipe_poll __P((struct file *fp, int events, struct ucred *cred,
		struct proc *p));
static int pipe_kqfilter __P((struct file *fp, struct knote *kn));
static int pipe_stat __P((struct file *fp, struct stat *sb, struct proc *p));
static int pipe_ioctl __P((struct file *fp, u_long cmd, caddr_t data,
		struct proc *p));

static struct fileops pipeops = {
	pipe_read, pipe_write, pipe_ioctl, pipe_poll, pipe_kqfilter,
	pipe_stat, pipe_close
};

static void	filt_pipedetach(struct knote *kn);
static int	filt_piperead(struct knote *kn, long hint);
static int	filt_pipewrite(struct knote *kn, long hint);

static struct filterops pipe_rfiltops =
	{ 1, NULL, filt_pipedetach, filt_piperead };
static struct filterops pipe_wfiltops =
	{ 1, NULL, filt_pipedetach, filt_pipewrite };
#endif /* FreeBSD */

#ifdef __NetBSD__
static int pipe_read __P((struct file *fp, off_t *offset, struct uio *uio,
		struct ucred *cred, int flags));
static int pipe_write __P((struct file *fp, off_t *offset, struct uio *uio,
		struct ucred *cred, int flags));
static int pipe_close __P((struct file *fp, struct proc *p));
static int pipe_poll __P((struct file *fp, int events, struct proc *p));
static int pipe_fcntl __P((struct file *fp, u_int com, caddr_t data,
		struct proc *p));
static int pipe_stat __P((struct file *fp, struct stat *sb, struct proc *p));
static int pipe_ioctl __P((struct file *fp, u_long cmd, caddr_t data,
		struct proc *p));

static struct fileops pipeops =
    { pipe_read, pipe_write, pipe_ioctl, pipe_fcntl, pipe_poll,
      pipe_stat, pipe_close };
#endif /* NetBSD */

/*
 * Default pipe buffer size(s); this can be kind-of large now because pipe
 * space is pageable.  The pipe code will try to maintain locality of
 * reference for performance reasons, so small amounts of outstanding I/O
 * will not wipe the cache.
 */
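/*
 * Illustrative numbers (assuming the usual PIPE_SIZE of 16384 bytes):
 * MINPIPESIZE is then 5461 and MAXPIPESIZE 10922.  pipe_read() uses
 * MINPIPESIZE as write-wakeup hysteresis: a blocked writer is only
 * woken once the reader has drained the buffer below that mark, so
 * wakeups are batched instead of occurring on every byte read.
 */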
#define MINPIPESIZE (PIPE_SIZE/3)
#define MAXPIPESIZE (2*PIPE_SIZE/3)

/*
 * Maximum amount of kva for pipes -- this is kind-of a soft limit, but
 * is there so that on large systems, we don't exhaust it.
 */
#define MAXPIPEKVA (8*1024*1024)
static int maxpipekva = MAXPIPEKVA;

/*
 * Limit for direct transfers; we cannot, of course, limit
 * the amount of kva for pipes in general though.
 */
#define LIMITPIPEKVA (16*1024*1024)
static int limitpipekva = LIMITPIPEKVA;

/*
 * Limit the number of "big" pipes
 */
#define LIMITBIGPIPES	32
static int maxbigpipes = LIMITBIGPIPES;
static int nbigpipe = 0;

/*
 * Amount of KVA consumed by pipe buffers.
 */
static int amountpipekva = 0;

static void pipeclose __P((struct pipe *cpipe));
static void pipe_free_kmem __P((struct pipe *cpipe));
static int pipe_create __P((struct pipe **cpipep, int allockva));
static __inline int pipelock __P((struct pipe *cpipe, int catch));
static __inline void pipeunlock __P((struct pipe *cpipe));
static __inline void pipeselwakeup __P((struct pipe *selp,
		struct pipe *sigp));
static int pipespace __P((struct pipe *cpipe, int size));

#ifdef __FreeBSD__
#ifndef PIPE_NODIRECT
static int pipe_build_write_buffer __P((struct pipe *wpipe, struct uio *uio));
static void pipe_destroy_write_buffer __P((struct pipe *wpipe));
static int pipe_direct_write __P((struct pipe *wpipe, struct uio *uio));
static void pipe_clone_write_buffer __P((struct pipe *wpipe));
#endif

static vm_zone_t pipe_zone;
#endif /* FreeBSD */

#ifdef __NetBSD__
#ifndef PIPE_NODIRECT
static __inline int pipe_direct_write __P((struct pipe *wpipe,
		struct uio *uio));
static __inline int pipe_loan_alloc __P((struct pipe *wpipe, int npages,
		vsize_t blen));
static void pipe_loan_free __P((struct pipe *wpipe));
#endif /* PIPE_NODIRECT */

static struct pool pipe_pool;
#endif /* NetBSD */

/*
 * The pipe system call for the DTYPE_PIPE type of pipes
 */

/* ARGSUSED */
#ifdef __FreeBSD__
int
pipe(p, uap)
	struct proc *p;
	struct pipe_args /* {
		int	dummy;
	} */ *uap;
#elif defined(__NetBSD__)
int
sys_pipe(p, v, retval)
	struct proc *p;
	void *v;
	register_t *retval;
#endif
{
	struct file *rf, *wf;
	struct pipe *rpipe, *wpipe;
	int fd, error;

#ifdef __FreeBSD__
	if (pipe_zone == NULL)
		pipe_zone = zinit("PIPE", sizeof(struct pipe), 0, 0, 4);

	rpipe = wpipe = NULL;
	if (pipe_create(&rpipe, 1) || pipe_create(&wpipe, 1)) {
		pipeclose(rpipe);
		pipeclose(wpipe);
		return (ENFILE);
	}

	error = falloc(p, &rf, &fd);
	if (error) {
		pipeclose(rpipe);
		pipeclose(wpipe);
		return (error);
	}
	fhold(rf);
	p->p_retval[0] = fd;

	/*
	 * Warning: once we've gotten past allocation of the fd for the
	 * read-side, we can only drop the read side via fdrop() in order
	 * to avoid races against processes which manage to dup() the read
	 * side while we are blocked trying to allocate the write side.
	 */
	rf->f_flag = FREAD | FWRITE;
	rf->f_type = DTYPE_PIPE;
	rf->f_data = (caddr_t)rpipe;
	rf->f_ops = &pipeops;
	error = falloc(p, &wf, &fd);
	if (error) {
		struct filedesc *fdp = p->p_fd;

		if (fdp->fd_ofiles[p->p_retval[0]] == rf) {
			fdp->fd_ofiles[p->p_retval[0]] = NULL;
			fdrop(rf, p);
		}
		fdrop(rf, p);
		/* rpipe has been closed by fdrop(). */
		pipeclose(wpipe);
		return (error);
	}
	wf->f_flag = FREAD | FWRITE;
	wf->f_type = DTYPE_PIPE;
	wf->f_data = (caddr_t)wpipe;
	wf->f_ops = &pipeops;
	p->p_retval[1] = fd;

	rpipe->pipe_peer = wpipe;
	wpipe->pipe_peer = rpipe;
	fdrop(rf, p);
#endif /* FreeBSD */

#ifdef __NetBSD__
	rpipe = wpipe = NULL;
	if (pipe_create(&rpipe, 1) || pipe_create(&wpipe, 0)) {
		pipeclose(rpipe);
		pipeclose(wpipe);
		return (ENFILE);
	}

	/*
	 * Note: the file structure returned from falloc() is marked
	 * as 'larval' initially.  Unless we mark it as 'mature' by
	 * FILE_SET_MATURE(), any attempt to do anything with it would
	 * return EBADF, including e.g. dup(2) or close(2).  This avoids
	 * file descriptor races if we block in the second falloc().
	 */

	error = falloc(p, &rf, &fd);
	if (error)
		goto free2;
	retval[0] = fd;
	rf->f_flag = FREAD;
	rf->f_type = DTYPE_PIPE;
	rf->f_data = (caddr_t)rpipe;
	rf->f_ops = &pipeops;

	error = falloc(p, &wf, &fd);
	if (error)
		goto free3;
	retval[1] = fd;
	wf->f_flag = FWRITE;
	wf->f_type = DTYPE_PIPE;
	wf->f_data = (caddr_t)wpipe;
	wf->f_ops = &pipeops;

	rpipe->pipe_peer = wpipe;
	wpipe->pipe_peer = rpipe;

	FILE_SET_MATURE(rf);
	FILE_SET_MATURE(wf);
	FILE_UNUSE(rf, p);
	FILE_UNUSE(wf, p);
	return (0);
free3:
	FILE_UNUSE(rf, p);
	ffree(rf);
	fdremove(p->p_fd, retval[0]);
free2:
	pipeclose(wpipe);
	pipeclose(rpipe);
#endif /* NetBSD */

	return (error);
}

/*
 * Allocate kva for pipe circular buffer; the space is pageable.
 * This routine will 'realloc' the size of a pipe safely: if it fails,
 * it retains the old buffer and returns ENOMEM.
 */
static int
pipespace(cpipe, size)
	struct pipe *cpipe;
	int size;
{
	caddr_t buffer;
#ifdef __FreeBSD__
	struct vm_object *object;
	int npages, error;

	npages = round_page(size)/PAGE_SIZE;
	/*
	 * Create an object, I don't like the idea of paging to/from
	 * kernel_object.
	 */
	mtx_lock(&vm_mtx);
	object = vm_object_allocate(OBJT_DEFAULT, npages);
	buffer = (caddr_t) vm_map_min(kernel_map);

	/*
	 * Insert the object into the kernel map, and allocate kva for it.
	 * The map entry is, by default, pageable.
	 */
	error = vm_map_find(kernel_map, object, 0,
		(vm_offset_t *) &buffer, size, 1,
		VM_PROT_ALL, VM_PROT_ALL, 0);

	if (error != KERN_SUCCESS) {
		vm_object_deallocate(object);
		mtx_unlock(&vm_mtx);
		return (ENOMEM);
	}
#endif /* FreeBSD */

#ifdef __NetBSD__
	/*
	 * Allocate pageable virtual address space.  Physical memory is
	 * allocated on demand.
	 */
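	/*
	 * Note (illustrative): because the mapping is pageable, an idle
	 * pipe's buffer costs only kva, not wired physical memory; its
	 * pages can be reclaimed until the buffer is touched again.
	 */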
	buffer = (caddr_t) uvm_km_valloc(kernel_map, round_page(size));
	if (buffer == NULL)
		return (ENOMEM);
#endif /* NetBSD */

	/* free old resources if we're resizing */
	pipe_free_kmem(cpipe);
#ifdef __FreeBSD__
	mtx_unlock(&vm_mtx);
	cpipe->pipe_buffer.object = object;
#endif
	cpipe->pipe_buffer.buffer = buffer;
	cpipe->pipe_buffer.size = size;
	cpipe->pipe_buffer.in = 0;
	cpipe->pipe_buffer.out = 0;
	cpipe->pipe_buffer.cnt = 0;
	amountpipekva += cpipe->pipe_buffer.size;
	return (0);
}

/*
 * initialize and allocate VM and memory for pipe
 */
static int
pipe_create(cpipep, allockva)
	struct pipe **cpipep;
	int allockva;
{
	struct pipe *cpipe;
	int error;

#ifdef __FreeBSD__
	*cpipep = zalloc(pipe_zone);
#endif
#ifdef __NetBSD__
	*cpipep = pool_get(&pipe_pool, M_WAITOK);
#endif
	if (*cpipep == NULL)
		return (ENOMEM);

	cpipe = *cpipep;

	/* Initialize */
	memset(cpipe, 0, sizeof(*cpipe));
	cpipe->pipe_state = PIPE_SIGNALR;

	if (allockva && (error = pipespace(cpipe, PIPE_SIZE)))
		return (error);

	vfs_timestamp(&cpipe->pipe_ctime);
	cpipe->pipe_atime = cpipe->pipe_ctime;
	cpipe->pipe_mtime = cpipe->pipe_ctime;
#ifdef __NetBSD__
	cpipe->pipe_pgid = NO_PID;
	lockinit(&cpipe->pipe_lock, PRIBIO | PCATCH, "pipelk", 0, 0);
#endif

	return (0);
}


/*
 * lock a pipe for I/O, blocking other access
 */
static __inline int
pipelock(cpipe, catch)
	struct pipe *cpipe;
	int catch;
{
	int error;

#ifdef __FreeBSD__
	while (cpipe->pipe_state & PIPE_LOCK) {
		cpipe->pipe_state |= PIPE_LWANT;
		error = tsleep(cpipe, catch ? (PRIBIO | PCATCH) : PRIBIO,
		    "pipelk", 0);
		if (error != 0)
			return (error);
	}
	cpipe->pipe_state |= PIPE_LOCK;
	return (0);
#endif

#ifdef __NetBSD__
	do {
		error = lockmgr(&cpipe->pipe_lock, LK_EXCLUSIVE, NULL);
	} while (!catch && (error == EINTR || error == ERESTART));
	return (error);
#endif
}

/*
 * unlock a pipe I/O lock
 */
static __inline void
pipeunlock(cpipe)
	struct pipe *cpipe;
{
#ifdef __FreeBSD__
	cpipe->pipe_state &= ~PIPE_LOCK;
	if (cpipe->pipe_state & PIPE_LWANT) {
		cpipe->pipe_state &= ~PIPE_LWANT;
		wakeup(cpipe);
	}
#endif

#ifdef __NetBSD__
	lockmgr(&cpipe->pipe_lock, LK_RELEASE, NULL);
#endif
}

/*
 * Select/poll wakeup.  This also sends SIGIO to peer connected to
 * 'sigpipe' side of pipe.
 */
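/*
 * Note (illustrative): on NetBSD, pipe_pgid stores the TIOCSPGRP value
 * directly; a negative value names a process group (SIGIO delivered via
 * gsignal()), a positive value a single process (via psignal()), and
 * NO_PID means nobody has asked for SIGIO.
 */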
static __inline void
pipeselwakeup(selp, sigp)
	struct pipe *selp, *sigp;
{
	if (selp->pipe_state & PIPE_SEL) {
		selp->pipe_state &= ~PIPE_SEL;
		selwakeup(&selp->pipe_sel);
	}
#ifdef __FreeBSD__
	if (sigp && (sigp->pipe_state & PIPE_ASYNC) && sigp->pipe_sigio)
		pgsigio(sigp->pipe_sigio, SIGIO, 0);
	KNOTE(&selp->pipe_sel.si_note, 0);
#endif

#ifdef __NetBSD__
	if (sigp && (sigp->pipe_state & PIPE_ASYNC)
	    && sigp->pipe_pgid != NO_PID) {
		struct proc *p;

		if (sigp->pipe_pgid < 0)
			gsignal(-sigp->pipe_pgid, SIGIO);
		else if (sigp->pipe_pgid > 0
		    && (p = pfind(sigp->pipe_pgid)) != 0)
			psignal(p, SIGIO);
	}
#endif /* NetBSD */
}

/* ARGSUSED */
#ifdef __FreeBSD__
static int
pipe_read(fp, uio, cred, flags, p)
	struct file *fp;
	struct uio *uio;
	struct ucred *cred;
	int flags;
	struct proc *p;
#elif defined(__NetBSD__)
static int
pipe_read(fp, offset, uio, cred, flags)
	struct file *fp;
	off_t *offset;
	struct uio *uio;
	struct ucred *cred;
	int flags;
#endif
{
	struct pipe *rpipe = (struct pipe *) fp->f_data;
	int error;
	size_t nread = 0;
	size_t size;
	size_t ocnt;

	++rpipe->pipe_busy;
	error = pipelock(rpipe, 1);
	if (error)
		goto unlocked_error;

	ocnt = rpipe->pipe_buffer.cnt;

	while (uio->uio_resid) {
		/*
		 * normal pipe buffer receive
		 */
		if (rpipe->pipe_buffer.cnt > 0) {
			size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
			if (size > rpipe->pipe_buffer.cnt)
				size = rpipe->pipe_buffer.cnt;
			if (size > uio->uio_resid)
				size = uio->uio_resid;

			error = uiomove(&rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
			    size, uio);
			if (error)
				break;

			rpipe->pipe_buffer.out += size;
			if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
				rpipe->pipe_buffer.out = 0;

			rpipe->pipe_buffer.cnt -= size;

			/*
			 * If there is no more to read in the pipe, reset
			 * its pointers to the beginning.  This improves
			 * cache hit stats.
			 */
			if (rpipe->pipe_buffer.cnt == 0) {
				rpipe->pipe_buffer.in = 0;
				rpipe->pipe_buffer.out = 0;
			}
			nread += size;
#ifndef PIPE_NODIRECT
		/*
		 * Direct copy, bypassing a kernel buffer.
		 */
		} else if ((size = rpipe->pipe_map.cnt) &&
		    (rpipe->pipe_state & PIPE_DIRECTW)) {
			caddr_t va;
			if (size > uio->uio_resid)
				size = uio->uio_resid;

			va = (caddr_t) rpipe->pipe_map.kva +
			    rpipe->pipe_map.pos;
			error = uiomove(va, size, uio);
			if (error)
				break;
			nread += size;
			rpipe->pipe_map.pos += size;
			rpipe->pipe_map.cnt -= size;
			if (rpipe->pipe_map.cnt == 0) {
				rpipe->pipe_state &= ~PIPE_DIRECTW;
				wakeup(rpipe);
			}
#endif
		} else {
			/*
			 * detect EOF condition
			 * read returns 0 on EOF, no need to set error
			 */
			if (rpipe->pipe_state & PIPE_EOF)
				break;

			/*
			 * If the "write-side" has been blocked, wake it up now.
			 */
			if (rpipe->pipe_state & PIPE_WANTW) {
				rpipe->pipe_state &= ~PIPE_WANTW;
				wakeup(rpipe);
			}

			/*
			 * Break if some data was read.
			 */
			if (nread > 0)
				break;

			/*
			 * don't block on non-blocking I/O
			 */
			if (fp->f_flag & FNONBLOCK) {
				error = EAGAIN;
				break;
			}

			/*
			 * Unlock the pipe buffer for our remaining processing.
			 * We will either break out with an error or we will
			 * sleep and relock to loop.
			 */
			pipeunlock(rpipe);

			/*
			 * We want to read more, wake up select/poll.
			 */
			pipeselwakeup(rpipe, rpipe->pipe_peer);

			rpipe->pipe_state |= PIPE_WANTR;
			error = tsleep(rpipe, PRIBIO | PCATCH, "piperd", 0);
			if (error != 0 || (error = pipelock(rpipe, 1)))
				goto unlocked_error;
		}
	}
	pipeunlock(rpipe);

	if (error == 0)
		vfs_timestamp(&rpipe->pipe_atime);
unlocked_error:
	--rpipe->pipe_busy;

	/*
	 * PIPE_WANTCLOSE processing only makes sense if pipe_busy is 0.
	 */
	if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANTCLOSE)) {
		rpipe->pipe_state &= ~(PIPE_WANTCLOSE|PIPE_WANTW);
		wakeup(rpipe);
	} else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) {
		/*
		 * Handle write blocking hysteresis.
		 */
		if (rpipe->pipe_state & PIPE_WANTW) {
			rpipe->pipe_state &= ~PIPE_WANTW;
			wakeup(rpipe);
		}
	}

	/*
	 * If anything was read off the buffer, signal the writer that it's
	 * now possible to write more data.  Also send a signal if we are
	 * here for the first time after the last write.
	 */
	if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF
	    && (ocnt != rpipe->pipe_buffer.cnt
		|| (rpipe->pipe_state & PIPE_SIGNALR))) {
		pipeselwakeup(rpipe, rpipe->pipe_peer);
		rpipe->pipe_state &= ~PIPE_SIGNALR;
	}

	return (error);
}

#ifdef __FreeBSD__
#ifndef PIPE_NODIRECT
/*
 * Map the sending process's buffer into kernel space and wire it.
 * This is similar to a physical write operation.
 */
static int
pipe_build_write_buffer(wpipe, uio)
	struct pipe *wpipe;
	struct uio *uio;
{
	size_t size;
	int i;
	vm_offset_t addr, endaddr, paddr;

	size = uio->uio_iov->iov_len;
	if (size > wpipe->pipe_buffer.size)
		size = wpipe->pipe_buffer.size;

	endaddr = round_page((vm_offset_t)uio->uio_iov->iov_base + size);
	mtx_lock(&vm_mtx);
	addr = trunc_page((vm_offset_t)uio->uio_iov->iov_base);
	for (i = 0; addr < endaddr; addr += PAGE_SIZE, i++) {
		vm_page_t m;

		if (vm_fault_quick((caddr_t)addr, VM_PROT_READ) < 0 ||
		    (paddr = pmap_kextract(addr)) == 0) {
			int j;

			for (j = 0; j < i; j++)
				vm_page_unwire(wpipe->pipe_map.ms[j], 1);
			mtx_unlock(&vm_mtx);
			return (EFAULT);
		}

		m = PHYS_TO_VM_PAGE(paddr);
		vm_page_wire(m);
		wpipe->pipe_map.ms[i] = m;
	}

	/*
	 * set up the control block
	 */
	wpipe->pipe_map.npages = i;
	wpipe->pipe_map.pos =
	    ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK;
	wpipe->pipe_map.cnt = size;

	/*
	 * and map the buffer
	 */
	if (wpipe->pipe_map.kva == 0) {
		/*
		 * We need to allocate space for an extra page because the
		 * address range might (will) span pages at times.
		 */
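		/*
		 * Worked example (assuming 4 KB pages): a PIPE_SIZE (16 KB)
		 * transfer whose base sits 0x100 bytes into a page touches
		 * five pages, one more than 16 KB / PAGE_SIZE -- hence the
		 * extra page in the allocation below.
		 */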
		wpipe->pipe_map.kva = kmem_alloc_pageable(kernel_map,
		    wpipe->pipe_buffer.size + PAGE_SIZE);
		amountpipekva += wpipe->pipe_buffer.size + PAGE_SIZE;
	}
	pmap_qenter(wpipe->pipe_map.kva, wpipe->pipe_map.ms,
	    wpipe->pipe_map.npages);

	mtx_unlock(&vm_mtx);
	/*
	 * and update the uio data
	 */

	uio->uio_iov->iov_len -= size;
	uio->uio_iov->iov_base += size;
	if (uio->uio_iov->iov_len == 0)
		uio->uio_iov++;
	uio->uio_resid -= size;
	uio->uio_offset += size;
	return (0);
}

/*
 * unmap and unwire the process buffer
 */
static void
pipe_destroy_write_buffer(wpipe)
	struct pipe *wpipe;
{
	int i;

	mtx_lock(&vm_mtx);
	if (wpipe->pipe_map.kva) {
		pmap_qremove(wpipe->pipe_map.kva, wpipe->pipe_map.npages);

		if (amountpipekva > maxpipekva) {
			vm_offset_t kva = wpipe->pipe_map.kva;
			wpipe->pipe_map.kva = 0;
			kmem_free(kernel_map, kva,
			    wpipe->pipe_buffer.size + PAGE_SIZE);
			amountpipekva -= wpipe->pipe_buffer.size + PAGE_SIZE;
		}
	}
	for (i = 0; i < wpipe->pipe_map.npages; i++)
		vm_page_unwire(wpipe->pipe_map.ms[i], 1);
	mtx_unlock(&vm_mtx);
}

/*
 * In the case of a signal, the writing process might go away.  This
 * code copies the data into the circular buffer so that the source
 * pages can be freed without loss of data.
 */
static void
pipe_clone_write_buffer(wpipe)
	struct pipe *wpipe;
{
	int size;
	int pos;

	size = wpipe->pipe_map.cnt;
	pos = wpipe->pipe_map.pos;
	memcpy((caddr_t) wpipe->pipe_buffer.buffer,
	    (caddr_t) wpipe->pipe_map.kva + pos, size);

	wpipe->pipe_buffer.in = size;
	wpipe->pipe_buffer.out = 0;
	wpipe->pipe_buffer.cnt = size;
	wpipe->pipe_state &= ~PIPE_DIRECTW;

	pipe_destroy_write_buffer(wpipe);
}

/*
 * This implements the pipe buffer write mechanism.  Note that only
 * a direct write OR a normal pipe write can be pending at any given time.
 * If there are any characters in the pipe buffer, the direct write will
 * be deferred until the receiving process grabs all of the bytes from
 * the pipe buffer.  Then the direct mapping write is set-up.
 */
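/*
 * Illustrative timeline of the direct-write handshake implemented below
 * (writer on the left, reader in pipe_read() on the right):
 *
 *	wait for old PIPE_DIRECTW / buffered data to drain
 *	build write buffer, set PIPE_DIRECTW
 *	sleep "pipdwt"  . . . . . . . .  reader copies from pipe_map,
 *	                                 clears PIPE_DIRECTW, wakeup()
 *	tear down (or clone on signal/EOF), return
 */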
static int
pipe_direct_write(wpipe, uio)
	struct pipe *wpipe;
	struct uio *uio;
{
	int error;

retry:
	while (wpipe->pipe_state & PIPE_DIRECTW) {
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
		wpipe->pipe_state |= PIPE_WANTW;
		error = tsleep(wpipe, PRIBIO | PCATCH, "pipdww", 0);
		if (error)
			goto error1;
		if (wpipe->pipe_state & PIPE_EOF) {
			error = EPIPE;
			goto error1;
		}
	}
	wpipe->pipe_map.cnt = 0;	/* transfer not ready yet */
	if (wpipe->pipe_buffer.cnt > 0) {
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}

		wpipe->pipe_state |= PIPE_WANTW;
		error = tsleep(wpipe, PRIBIO | PCATCH, "pipdwc", 0);
		if (error)
			goto error1;
		if (wpipe->pipe_state & PIPE_EOF) {
			error = EPIPE;
			goto error1;
		}
		goto retry;
	}

	wpipe->pipe_state |= PIPE_DIRECTW;

	error = pipe_build_write_buffer(wpipe, uio);
	if (error) {
		wpipe->pipe_state &= ~PIPE_DIRECTW;
		goto error1;
	}

	error = 0;
	while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
		if (wpipe->pipe_state & PIPE_EOF) {
			pipelock(wpipe, 0);
			pipe_destroy_write_buffer(wpipe);
			pipeunlock(wpipe);
			pipeselwakeup(wpipe, wpipe);
			error = EPIPE;
			goto error1;
		}
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
		pipeselwakeup(wpipe, wpipe);
		error = tsleep(wpipe, PRIBIO | PCATCH, "pipdwt", 0);
	}

	pipelock(wpipe, 0);
	if (wpipe->pipe_state & PIPE_DIRECTW) {
		/*
		 * this bit of trickery substitutes a kernel buffer for
		 * the process that might be going away.
		 */
		pipe_clone_write_buffer(wpipe);
	} else {
		pipe_destroy_write_buffer(wpipe);
	}
	pipeunlock(wpipe);
	return (error);

error1:
	wakeup(wpipe);
	return (error);
}
#endif /* !PIPE_NODIRECT */
#endif /* FreeBSD */

#ifdef __NetBSD__
#ifndef PIPE_NODIRECT
/*
 * Allocate structure for loan transfer.
 */
static __inline int
pipe_loan_alloc(wpipe, npages, blen)
	struct pipe *wpipe;
	int npages;
	vsize_t blen;
{
	wpipe->pipe_map.kva = uvm_km_valloc_wait(kernel_map, blen);
	if (wpipe->pipe_map.kva == NULL)
		return (ENOMEM);

	amountpipekva += blen;
	wpipe->pipe_map.npages = npages;
	wpipe->pipe_map.ms = (struct vm_page **) malloc(
	    npages * sizeof(struct vm_page *), M_PIPE, M_WAITOK);

	return (0);
}

/*
 * Free resources allocated for loan transfer.
 */
static void
pipe_loan_free(wpipe)
	struct pipe *wpipe;
{
	uvm_km_free(kernel_map, wpipe->pipe_map.kva,
	    wpipe->pipe_map.npages * PAGE_SIZE);
	wpipe->pipe_map.kva = NULL;
	amountpipekva -= wpipe->pipe_map.npages * PAGE_SIZE;
	free(wpipe->pipe_map.ms, M_PIPE);
	wpipe->pipe_map.ms = NULL;
}

/*
 * NetBSD direct write, using uvm_loan() mechanism.
 * This implements the pipe buffer write mechanism.  Note that only
 * a direct write OR a normal pipe write can be pending at any given time.
 * If there are any characters in the pipe buffer, the direct write will
 * be deferred until the receiving process grabs all of the bytes from
 * the pipe buffer.  Then the direct mapping write is set-up.
 */
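/*
 * Sketch of the loan lifecycle used below (NetBSD specific): the writer's
 * pages are loaned read-only with uvm_loan(..., UVM_LOAN_TOPAGE), entered
 * into pipe kva with pmap_enter(), read by the receiver via uiomove(), and
 * finally released with uvm_unloanpage() -- or the kva is torn down
 * entirely by pipe_loan_free() when over the kva limit or on error.
 */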
static __inline int
pipe_direct_write(wpipe, uio)
	struct pipe *wpipe;
	struct uio *uio;
{
	int error, npages, j;
	struct vm_page **res = NULL;
	vaddr_t bbase, kva, base, bend;
	vsize_t blen, bcnt;
	voff_t bpos;

retry:
	while (wpipe->pipe_state & PIPE_DIRECTW) {
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
		wpipe->pipe_state |= PIPE_WANTW;
		error = tsleep(wpipe, PRIBIO | PCATCH, "pipdww", 0);
		if (error)
			goto error;
		if (wpipe->pipe_state & PIPE_EOF) {
			error = EPIPE;
			goto error;
		}
	}
	wpipe->pipe_map.cnt = 0;	/* transfer not ready yet */
	if (wpipe->pipe_buffer.cnt > 0) {
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}

		wpipe->pipe_state |= PIPE_WANTW;
		error = tsleep(wpipe, PRIBIO | PCATCH, "pipdwc", 0);
		if (error)
			goto error;
		if (wpipe->pipe_state & PIPE_EOF) {
			error = EPIPE;
			goto error;
		}
		goto retry;
	}

	/*
	 * Handle first iovec, first PIPE_DIRECT_CHUNK bytes.  Expect caller
	 * to deal with short write.
	 *
	 * Note: need to deal with buffers not aligned to PAGE_SIZE.
	 */
	bbase = (vaddr_t)uio->uio_iov[0].iov_base;
	base = trunc_page(bbase);
	bend = round_page(bbase + uio->uio_iov[0].iov_len);
	blen = bend - base;
	bpos = bbase - base;

	if (blen > PIPE_DIRECT_CHUNK) {
		blen = PIPE_DIRECT_CHUNK;
		bend = base + blen;
		bcnt = PIPE_DIRECT_CHUNK - bpos;
	} else
		bcnt = uio->uio_iov[0].iov_len;

	npages = blen / PAGE_SIZE;

	wpipe->pipe_map.pos = bpos;
	wpipe->pipe_map.cnt = bcnt;

	/*
	 * Free the old kva if we need more pages than we have
	 * allocated.
	 */
	if (wpipe->pipe_map.kva && npages > wpipe->pipe_map.npages)
		pipe_loan_free(wpipe);

	/* Allocate new kva. */
	if (!wpipe->pipe_map.kva
	    && (error = pipe_loan_alloc(wpipe, npages, blen)))
		goto error;

	/* Loan the write buffer memory from writer process */
	error = uvm_loan(&uio->uio_procp->p_vmspace->vm_map, base, blen,
	    (void **) wpipe->pipe_map.ms, UVM_LOAN_TOPAGE);
	if (error)
		goto cleanup;
	res = wpipe->pipe_map.ms;

	/* Enter the loaned pages to kva */
	kva = wpipe->pipe_map.kva;
	for (j = 0; j < npages; j++, kva += PAGE_SIZE)
		pmap_enter(pmap_kernel(), kva, res[j]->phys_addr,
		    VM_PROT_READ, 0);

	wpipe->pipe_state |= PIPE_DIRECTW;
	error = 0;
	while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
		if (wpipe->pipe_state & PIPE_EOF) {
			error = EPIPE;
			break;
		}
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
		pipeselwakeup(wpipe, wpipe);
		error = tsleep(wpipe, PRIBIO | PCATCH, "pipdwt", 0);
	}

	if (error)
		wpipe->pipe_state &= ~PIPE_DIRECTW;

cleanup:
	pipelock(wpipe, 0);
	if (error || amountpipekva > maxpipekva)
		pipe_loan_free(wpipe);
	else if (res)
		uvm_unloanpage(res, npages);
	pipeunlock(wpipe);

	if (error == EPIPE) {
		pipeselwakeup(wpipe, wpipe);

		/*
		 * If anything was read from what we offered, return success
		 * as a short write.  We return EOF on the next write(2).
		 */
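		/*
		 * Worked example: if we offered bcnt = 8192 bytes and the
		 * reader consumed 5000 of them before closing its end,
		 * pipe_map.cnt is 3192 here, so we report a short write of
		 * 8192 - 3192 = 5000 bytes rather than EPIPE.
		 */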
		if (wpipe->pipe_map.cnt < bcnt) {
			bcnt -= wpipe->pipe_map.cnt;
			error = 0;
		}
	}

	if (error) {
error:
		wakeup(wpipe);
		return (error);
	}

	uio->uio_resid -= bcnt;
	/* uio_offset not updated, not set/used for write(2) */

	return (0);
}
#endif /* !PIPE_NODIRECT */
#endif /* NetBSD */

#ifdef __FreeBSD__
static int
pipe_write(fp, uio, cred, flags, p)
	struct file *fp;
	struct uio *uio;
	struct ucred *cred;
	int flags;
	struct proc *p;
#elif defined(__NetBSD__)
static int
pipe_write(fp, offset, uio, cred, flags)
	struct file *fp;
	off_t *offset;
	struct uio *uio;
	struct ucred *cred;
	int flags;
#endif
{
	int error = 0;
	int orig_resid;
	struct pipe *wpipe, *rpipe;

	rpipe = (struct pipe *) fp->f_data;
	wpipe = rpipe->pipe_peer;

	/*
	 * detect loss of pipe read side, issue SIGPIPE if lost.
	 */
	if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF))
		return (EPIPE);

	++wpipe->pipe_busy;

	/*
	 * If it is advantageous to resize the pipe buffer, do
	 * so.
	 */
	if ((uio->uio_resid > PIPE_SIZE) &&
	    (nbigpipe < maxbigpipes) &&
#ifndef PIPE_NODIRECT
	    (wpipe->pipe_state & PIPE_DIRECTW) == 0 &&
#endif
	    (wpipe->pipe_buffer.size <= PIPE_SIZE) &&
	    (wpipe->pipe_buffer.cnt == 0)) {

		if ((error = pipelock(wpipe, 1)) == 0) {
			if (pipespace(wpipe, BIG_PIPE_SIZE) == 0)
				nbigpipe++;
			pipeunlock(wpipe);
		} else {
			/*
			 * If an error occurred, unbusy and return, waking up
			 * any waiting readers.
			 */
			--wpipe->pipe_busy;
			if (wpipe->pipe_busy == 0
			    && (wpipe->pipe_state & PIPE_WANTCLOSE)) {
				wpipe->pipe_state &=
				    ~(PIPE_WANTCLOSE | PIPE_WANTR);
				wakeup(wpipe);
			}

			return (error);
		}
	}

#ifdef __FreeBSD__
	KASSERT(wpipe->pipe_buffer.buffer != NULL, ("pipe buffer gone"));
#endif

	orig_resid = uio->uio_resid;
	while (uio->uio_resid) {
		int space;

#ifndef PIPE_NODIRECT
		/*
		 * If the transfer is large, we can gain performance if
		 * we do process-to-process copies directly.
		 * If the write is non-blocking, we don't use the
		 * direct write mechanism.
		 *
		 * The direct write mechanism will detect the reader going
		 * away on us.
		 */
		if ((uio->uio_iov[0].iov_len >= PIPE_MINDIRECT) &&
		    (uio->uio_resid == orig_resid) &&
		    (fp->f_flag & FNONBLOCK) == 0 &&
		    (wpipe->pipe_map.kva || (amountpipekva < limitpipekva))) {
			error = pipe_direct_write(wpipe, uio);

			/*
			 * We either got an error, wrote the whole buffer, or
			 * wrote part of the buffer.  If the error is ENOMEM,
			 * we failed to allocate some resources for the
			 * direct write and fall back to an ordinary write.
			 * Otherwise, break out now.
			 */
			if (error != ENOMEM)
				break;
		}
#endif /* PIPE_NODIRECT */

		/*
		 * Pipe buffered writes cannot be coincident with
		 * direct writes.  We wait until the currently executing
		 * direct write is completed before we start filling the
		 * pipe buffer.  We break out if a signal occurs or the
		 * reader goes away.
		 */
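		/*
		 * Note on the protocol below (sketch): after sleeping in
		 * "pipbww" or in uiomove(), both PIPE_DIRECTW and the free
		 * space must be re-checked under pipelock(), since either
		 * may have changed while we slept -- hence the retrywrite
		 * label.
		 */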
	retrywrite:
		while (wpipe->pipe_state & PIPE_DIRECTW) {
			if (wpipe->pipe_state & PIPE_WANTR) {
				wpipe->pipe_state &= ~PIPE_WANTR;
				wakeup(wpipe);
			}
			error = tsleep(wpipe, PRIBIO | PCATCH, "pipbww", 0);
			if (wpipe->pipe_state & PIPE_EOF)
				break;
			if (error)
				break;
		}
		if (wpipe->pipe_state & PIPE_EOF) {
			error = EPIPE;
			break;
		}

		space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;

		/* Writes of size <= PIPE_BUF must be atomic. */
		if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF))
			space = 0;

		if (space > 0 && (wpipe->pipe_buffer.cnt < PIPE_SIZE)) {
			int size;	/* Transfer size */
			int segsize;	/* first segment to transfer */

			if ((error = pipelock(wpipe, 1)) != 0)
				break;

			/*
			 * It is possible for a direct write to
			 * slip in on us... handle it here...
			 */
			if (wpipe->pipe_state & PIPE_DIRECTW) {
				pipeunlock(wpipe);
				goto retrywrite;
			}
			/*
			 * If a process blocked in uiomove, our
			 * value for space might be bad.
			 *
			 * XXX will we be ok if the reader has gone
			 * away here?
			 */
			if (space > wpipe->pipe_buffer.size -
			    wpipe->pipe_buffer.cnt) {
				pipeunlock(wpipe);
				goto retrywrite;
			}

			/*
			 * Transfer size is minimum of uio transfer
			 * and free space in pipe buffer.
			 */
			if (space > uio->uio_resid)
				size = uio->uio_resid;
			else
				size = space;
			/*
			 * First segment to transfer is minimum of
			 * transfer size and contiguous space in
			 * pipe buffer.  If first segment to transfer
			 * is less than the transfer size, we've got
			 * a wraparound in the buffer.
			 */
			segsize = wpipe->pipe_buffer.size -
			    wpipe->pipe_buffer.in;
			if (segsize > size)
				segsize = size;

			/* Transfer first segment */

			error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
			    segsize, uio);

			if (error == 0 && segsize < size) {
				/*
				 * Transfer remaining part now, to
				 * support atomic writes.  Wraparound
				 * happened.
				 */
#ifdef DEBUG
				if (wpipe->pipe_buffer.in + segsize !=
				    wpipe->pipe_buffer.size)
					panic("Expected pipe buffer wraparound disappeared");
#endif

				error = uiomove(&wpipe->pipe_buffer.buffer[0],
				    size - segsize, uio);
			}
			if (error == 0) {
				wpipe->pipe_buffer.in += size;
				if (wpipe->pipe_buffer.in >=
				    wpipe->pipe_buffer.size) {
#ifdef DEBUG
					if (wpipe->pipe_buffer.in !=
					    size - segsize +
					    wpipe->pipe_buffer.size)
						panic("Expected wraparound bad");
#endif
					wpipe->pipe_buffer.in = size - segsize;
				}

				wpipe->pipe_buffer.cnt += size;
#ifdef DEBUG
				if (wpipe->pipe_buffer.cnt >
				    wpipe->pipe_buffer.size)
					panic("Pipe buffer overflow");
#endif

			}
			pipeunlock(wpipe);
			if (error)
				break;

		} else {
			/*
			 * If the "read-side" has been blocked, wake it up now.
			 */
			if (wpipe->pipe_state & PIPE_WANTR) {
				wpipe->pipe_state &= ~PIPE_WANTR;
				wakeup(wpipe);
			}

			/*
			 * don't block on non-blocking I/O
			 */
			if (fp->f_flag & FNONBLOCK) {
				error = EAGAIN;
				break;
			}

			/*
			 * We have no more space and have something to offer,
			 * wake up select/poll.
			 */
			pipeselwakeup(wpipe, wpipe);

			wpipe->pipe_state |= PIPE_WANTW;
			error = tsleep(wpipe, PRIBIO | PCATCH, "pipewr", 0);
			if (error != 0)
				break;
			/*
			 * If read side wants to go away, we just issue a
			 * signal to ourselves.
			 */
			if (wpipe->pipe_state & PIPE_EOF) {
				error = EPIPE;
				break;
			}
		}
	}

	--wpipe->pipe_busy;
	if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANTCLOSE)) {
		wpipe->pipe_state &= ~(PIPE_WANTCLOSE | PIPE_WANTR);
		wakeup(wpipe);
	} else if (wpipe->pipe_buffer.cnt > 0) {
		/*
		 * If we have put any characters in the buffer, we wake up
		 * the reader.
		 */
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
	}

	/*
	 * Don't return EPIPE if I/O was successful
	 */
	if ((error == EPIPE) && (wpipe->pipe_buffer.cnt == 0)
	    && (uio->uio_resid == 0))
		error = 0;

	if (error == 0)
		vfs_timestamp(&wpipe->pipe_mtime);

	/*
	 * We have something to offer, wake up select/poll.
	 * wpipe->pipe_map.cnt is always 0 at this point (direct writes are
	 * only done synchronously), so we only need to check pipe_buffer.cnt.
	 */
	if (wpipe->pipe_buffer.cnt)
		pipeselwakeup(wpipe, wpipe);

	/*
	 * Arrange for next read(2) to do a signal.
	 */
	wpipe->pipe_state |= PIPE_SIGNALR;

	return (error);
}

/*
 * we implement a very minimal set of ioctls for compatibility with sockets.
 */
int
pipe_ioctl(fp, cmd, data, p)
	struct file *fp;
	u_long cmd;
	caddr_t data;
	struct proc *p;
{
	struct pipe *mpipe = (struct pipe *)fp->f_data;

	switch (cmd) {

	case FIONBIO:
		return (0);

	case FIOASYNC:
		if (*(int *)data) {
			mpipe->pipe_state |= PIPE_ASYNC;
		} else {
			mpipe->pipe_state &= ~PIPE_ASYNC;
		}
		return (0);

	case FIONREAD:
#ifndef PIPE_NODIRECT
		if (mpipe->pipe_state & PIPE_DIRECTW)
			*(int *)data = mpipe->pipe_map.cnt;
		else
#endif
			*(int *)data = mpipe->pipe_buffer.cnt;
		return (0);

#ifdef __FreeBSD__
	case FIOSETOWN:
		return (fsetown(*(int *)data, &mpipe->pipe_sigio));

	case FIOGETOWN:
		*(int *)data = fgetown(mpipe->pipe_sigio);
		return (0);

	/* This is deprecated, FIOSETOWN should be used instead. */
	case TIOCSPGRP:
		return (fsetown(-(*(int *)data), &mpipe->pipe_sigio));

	/* This is deprecated, FIOGETOWN should be used instead. */
	case TIOCGPGRP:
		*(int *)data = -fgetown(mpipe->pipe_sigio);
		return (0);
#endif /* FreeBSD */
#ifdef __NetBSD__
	case TIOCSPGRP:
		mpipe->pipe_pgid = *(int *)data;
		return (0);

	case TIOCGPGRP:
		*(int *)data = mpipe->pipe_pgid;
		return (0);
#endif /* NetBSD */

	}
	return (ENOTTY);
}

int
pipe_poll(fp, events, p)
	struct file *fp;
	int events;
	struct proc *p;
{
	struct pipe *rpipe = (struct pipe *)fp->f_data;
	struct pipe *wpipe;
	int revents = 0;

	wpipe = rpipe->pipe_peer;
	if (events & (POLLIN | POLLRDNORM))
		if ((rpipe->pipe_buffer.cnt > 0) ||
#ifndef PIPE_NODIRECT
		    (rpipe->pipe_state & PIPE_DIRECTW) ||
#endif
		    (rpipe->pipe_state & PIPE_EOF))
			revents |= events & (POLLIN | POLLRDNORM);

	if (events & (POLLOUT | POLLWRNORM))
		if (wpipe == NULL || (wpipe->pipe_state & PIPE_EOF)
		    || (
#ifndef PIPE_NODIRECT
			((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
#endif
			(wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt)
			    >= PIPE_BUF))
			revents |= events & (POLLOUT | POLLWRNORM);

	if ((rpipe->pipe_state & PIPE_EOF) ||
	    (wpipe == NULL) ||
	    (wpipe->pipe_state & PIPE_EOF))
		revents |= POLLHUP;

	if (revents == 0) {
		if (events & (POLLIN | POLLRDNORM)) {
			selrecord(p, &rpipe->pipe_sel);
			rpipe->pipe_state |= PIPE_SEL;
		}

		if (events & (POLLOUT | POLLWRNORM)) {
			selrecord(p, &wpipe->pipe_sel);
			wpipe->pipe_state |= PIPE_SEL;
		}
	}

	return (revents);
}

static int
pipe_stat(fp, ub, p)
	struct file *fp;
	struct stat *ub;
	struct proc *p;
{
	struct pipe *pipe = (struct pipe *)fp->f_data;

	memset((caddr_t)ub, 0, sizeof(*ub));
	ub->st_mode = S_IFIFO;
	ub->st_blksize = pipe->pipe_buffer.size;
	ub->st_size = pipe->pipe_buffer.cnt;
	ub->st_blocks = (ub->st_size) ? 1 : 0;
#ifdef __FreeBSD__
	ub->st_atimespec = pipe->pipe_atime;
	ub->st_mtimespec = pipe->pipe_mtime;
	ub->st_ctimespec = pipe->pipe_ctime;
#endif /* FreeBSD */
#ifdef __NetBSD__
	TIMEVAL_TO_TIMESPEC(&pipe->pipe_atime, &ub->st_atimespec);
	TIMEVAL_TO_TIMESPEC(&pipe->pipe_mtime, &ub->st_mtimespec);
	TIMEVAL_TO_TIMESPEC(&pipe->pipe_ctime, &ub->st_ctimespec);
#endif /* NetBSD */
	ub->st_uid = fp->f_cred->cr_uid;
	ub->st_gid = fp->f_cred->cr_gid;
	/*
	 * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen.
	 * XXX (st_dev, st_ino) should be unique.
	 */
	return (0);
}

/* ARGSUSED */
static int
pipe_close(fp, p)
	struct file *fp;
	struct proc *p;
{
	struct pipe *cpipe = (struct pipe *)fp->f_data;

#ifdef __FreeBSD__
	fp->f_ops = &badfileops;
	funsetown(cpipe->pipe_sigio);
#endif
	fp->f_data = NULL;
	pipeclose(cpipe);
	return (0);
}

static void
pipe_free_kmem(cpipe)
	struct pipe *cpipe;
{

#ifdef __FreeBSD__
	mtx_assert(&vm_mtx, MA_OWNED);
#endif
	if (cpipe->pipe_buffer.buffer != NULL) {
		if (cpipe->pipe_buffer.size > PIPE_SIZE)
			--nbigpipe;
		amountpipekva -= cpipe->pipe_buffer.size;
#ifdef __FreeBSD__
		kmem_free(kernel_map,
		    (vm_offset_t)cpipe->pipe_buffer.buffer,
		    cpipe->pipe_buffer.size);
#elif defined(__NetBSD__)
		uvm_km_free(kernel_map,
		    (vaddr_t)cpipe->pipe_buffer.buffer,
		    cpipe->pipe_buffer.size);
#endif /* NetBSD */

		cpipe->pipe_buffer.buffer = NULL;
	}
#ifndef PIPE_NODIRECT
	if (cpipe->pipe_map.kva != NULL) {
#ifdef __FreeBSD__
		amountpipekva -= cpipe->pipe_buffer.size + PAGE_SIZE;
		kmem_free(kernel_map,
		    cpipe->pipe_map.kva,
		    cpipe->pipe_buffer.size + PAGE_SIZE);
#elif defined(__NetBSD__)
		pipe_loan_free(cpipe);
#endif /* NetBSD */
		cpipe->pipe_map.cnt = 0;
		cpipe->pipe_map.kva = NULL;
		cpipe->pipe_map.pos = 0;
		cpipe->pipe_map.npages = 0;
	}
#endif /* !PIPE_NODIRECT */
}

/*
 * shutdown the pipe
 */
static void
pipeclose(cpipe)
	struct pipe *cpipe;
{
	struct pipe *ppipe;

	if (!cpipe)
		return;

	pipeselwakeup(cpipe, cpipe);

	/*
	 * If the other side is blocked, wake it up saying that
	 * we want to close it down.
	 */
	while (cpipe->pipe_busy) {
		wakeup(cpipe);
		cpipe->pipe_state |= PIPE_WANTCLOSE | PIPE_EOF;
		tsleep(cpipe, PRIBIO, "pipecl", 0);
	}

	/*
	 * Disconnect from peer
	 */
	if ((ppipe = cpipe->pipe_peer) != NULL) {
		pipeselwakeup(ppipe, ppipe);

		ppipe->pipe_state |= PIPE_EOF;
		wakeup(ppipe);
		ppipe->pipe_peer = NULL;
	}

	/*
	 * free resources
	 */
#ifdef __FreeBSD__
	mtx_lock(&vm_mtx);
	pipe_free_kmem(cpipe);
	/*
	 * XXX: erm, doesn't zalloc already have its own locks and
	 * not need the giant vm lock?
	 */
	zfree(pipe_zone, cpipe);
	mtx_unlock(&vm_mtx);
#endif /* FreeBSD */

#ifdef __NetBSD__
	pipe_free_kmem(cpipe);
	(void) lockmgr(&cpipe->pipe_lock, LK_DRAIN, NULL);
	pool_put(&pipe_pool, cpipe);
#endif
}

#ifdef __FreeBSD__
/*ARGSUSED*/
static int
pipe_kqfilter(struct file *fp, struct knote *kn)
{
	struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data;

	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_fop = &pipe_rfiltops;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &pipe_wfiltops;
		cpipe = cpipe->pipe_peer;
		break;
	default:
		return (1);
	}
	kn->kn_hook = (caddr_t)cpipe;

	SLIST_INSERT_HEAD(&cpipe->pipe_sel.si_note, kn, kn_selnext);
	return (0);
}

static void
filt_pipedetach(struct knote *kn)
{
	struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data;

	SLIST_REMOVE(&cpipe->pipe_sel.si_note, kn, knote, kn_selnext);
}

/*ARGSUSED*/
static int
filt_piperead(struct knote *kn, long hint)
{
	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
	struct pipe *wpipe = rpipe->pipe_peer;

	kn->kn_data = rpipe->pipe_buffer.cnt;
	if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW))
		kn->kn_data = rpipe->pipe_map.cnt;

	if ((rpipe->pipe_state & PIPE_EOF) ||
	    (wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
		kn->kn_flags |= EV_EOF;
		return (1);
	}
	return (kn->kn_data > 0);
}

/*ARGSUSED*/
static int
filt_pipewrite(struct knote *kn, long hint)
{
	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
	struct pipe *wpipe = rpipe->pipe_peer;

	if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
		kn->kn_data = 0;
		kn->kn_flags |= EV_EOF;
		return (1);
	}
	kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
	if (wpipe->pipe_state & PIPE_DIRECTW)
		kn->kn_data = 0;

	return (kn->kn_data >= PIPE_BUF);
}
#endif /* FreeBSD */

#ifdef __NetBSD__
static int
pipe_fcntl(fp, cmd, data, p)
	struct file *fp;
	u_int cmd;
	caddr_t data;
	struct proc *p;
{
	if (cmd == F_SETFL)
		return (0);
	else
		return (EOPNOTSUPP);
}

/*
 * Handle pipe sysctls.
 */
int
sysctl_dopipe(name, namelen, oldp, oldlenp, newp, newlen)
	int *name;
	u_int namelen;
	void *oldp;
	size_t *oldlenp;
	void *newp;
	size_t newlen;
{
	/* All sysctl names at this level are terminal. */
	if (namelen != 1)
		return (ENOTDIR);		/* overloaded */

	switch (name[0]) {
	case KERN_PIPE_MAXKVASZ:
		return (sysctl_int(oldp, oldlenp, newp, newlen, &maxpipekva));
	case KERN_PIPE_LIMITKVA:
		return (sysctl_int(oldp, oldlenp, newp, newlen, &limitpipekva));
	case KERN_PIPE_MAXBIGPIPES:
		return (sysctl_int(oldp, oldlenp, newp, newlen, &maxbigpipes));
	case KERN_PIPE_NBIGPIPES:
		return (sysctl_rdint(oldp, oldlenp, newp, nbigpipe));
	case KERN_PIPE_KVASIZE:
		return (sysctl_rdint(oldp, oldlenp, newp, amountpipekva));
	default:
		return (EOPNOTSUPP);
	}
	/* NOTREACHED */
}

/*
 * Initialize pipe structs.
 */
void
pipe_init(void)
{
	pool_init(&pipe_pool, sizeof(struct pipe), 0, 0, 0, "pipepl",
	    0, NULL, NULL, M_PIPE);
}

#endif /* __NetBSD__ */
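
/*
 * Usage note (illustrative): the variables handled by sysctl_dopipe()
 * above are meant to be tuned at run time with sysctl(8).  The exact
 * MIB names depend on how the KERN_PIPE_* nodes are wired up; for
 * example, something like:
 *
 *	sysctl -w kern.pipe.maxbigpipes=64
 *
 * KERN_PIPE_NBIGPIPES and KERN_PIPE_KVASIZE are exported read-only.
 */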