/*
 * Copyright (c) 1996 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 * 4. Modifications may be freely made to this file if the above conditions
 *    are met.
 *
 * $FreeBSD: src/sys/kern/sys_pipe.c,v 1.60.2.13 2002/08/05 15:05:15 des Exp $
 * $DragonFly: src/sys/kern/sys_pipe.c,v 1.42 2006/09/11 20:25:01 dillon Exp $
 */

/*
 * This file contains a high-performance replacement for the socket-based
 * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
 * all features of sockets, but does do everything that pipes normally
 * do.
 */

/*
 * This code has two modes of operation, a small write mode and a large
 * write mode.  The small write mode acts like conventional pipes with
 * a kernel buffer.  If the buffer is less than PIPE_MINDIRECT, then the
 * "normal" pipe buffering is done.  If the buffer is between PIPE_MINDIRECT
 * and PIPE_SIZE in size, it is fully mapped and wired into the kernel, and
 * the receiving process can copy it directly from the pages in the sending
 * process.
 *
 * If the sending process receives a signal, it is possible that it will
 * go away, and certainly its address space can change, because control
 * is returned back to the user-mode side.  In that case, the pipe code
 * arranges to copy the buffer supplied by the user process, to a pageable
 * kernel buffer, and the receiving process will grab the data from the
 * pageable kernel buffer.  Since signals don't happen all that often,
 * the copy operation is normally eliminated.
 *
 * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
 * happen for small transfers so that the system will not spend all of
 * its time context switching.  PIPE_SIZE is constrained by the
 * amount of kernel virtual memory.
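 *
 * As a rough userland illustration (the selection logic lives in
 * pipe_write() below): a blocking write() whose segment is at least
 * PIPE_MINDIRECT bytes is a candidate for the direct mechanism when
 * kern.pipe.dwrite_enable is non-zero, while smaller or non-blocking
 * writes are simply copied into the kernel buffer:
 *
 *	int fds[2];
 *	char buf[65536];
 *
 *	pipe(fds);
 *	write(fds[1], buf, sizeof(buf));	<- may use the direct path
 *	write(fds[1], "x", 1);			<- always buffered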
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/ttycom.h>
#include <sys/stat.h>
#include <sys/poll.h>
#include <sys/select.h>
#include <sys/signalvar.h>
#include <sys/sysproto.h>
#include <sys/pipe.h>
#include <sys/vnode.h>
#include <sys/uio.h>
#include <sys/event.h>
#include <sys/globaldata.h>
#include <sys/module.h>
#include <sys/malloc.h>
#include <sys/sysctl.h>
#include <sys/socket.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/lock.h>
#include <vm/vm_object.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_zone.h>

#include <sys/file2.h>

#include <machine/cpufunc.h>

/*
 * interfaces to the outside world
 */
static int pipe_read (struct file *fp, struct uio *uio,
		struct ucred *cred, int flags);
static int pipe_write (struct file *fp, struct uio *uio,
		struct ucred *cred, int flags);
static int pipe_close (struct file *fp);
static int pipe_shutdown (struct file *fp, int how);
static int pipe_poll (struct file *fp, int events, struct ucred *cred);
static int pipe_kqfilter (struct file *fp, struct knote *kn);
static int pipe_stat (struct file *fp, struct stat *sb, struct ucred *cred);
static int pipe_ioctl (struct file *fp, u_long cmd, caddr_t data, struct ucred *cred);

static struct fileops pipeops = {
	.fo_read = pipe_read,
	.fo_write = pipe_write,
	.fo_ioctl = pipe_ioctl,
	.fo_poll = pipe_poll,
	.fo_kqfilter = pipe_kqfilter,
	.fo_stat = pipe_stat,
	.fo_close = pipe_close,
	.fo_shutdown = pipe_shutdown
};

static void filt_pipedetach(struct knote *kn);
static int filt_piperead(struct knote *kn, long hint);
static int filt_pipewrite(struct knote *kn, long hint);

static struct filterops pipe_rfiltops =
	{ 1, NULL, filt_pipedetach, filt_piperead };
static struct filterops pipe_wfiltops =
	{ 1, NULL, filt_pipedetach, filt_pipewrite };

MALLOC_DEFINE(M_PIPE, "pipe", "pipe structures");

/*
 * Default pipe buffer size(s), this can be kind-of large now because pipe
 * space is pageable.  The pipe code will try to maintain locality of
 * reference for performance reasons, so small amounts of outstanding I/O
 * will not wipe the cache.
 */
#define MINPIPESIZE (PIPE_SIZE/3)
#define MAXPIPESIZE (2*PIPE_SIZE/3)

/*
 * Maximum amount of kva for pipes -- this is kind-of a soft limit, but
 * is there so that on large systems, we don't exhaust it.
 */
#define MAXPIPEKVA (8*1024*1024)

/*
 * Limit for direct transfers; we cannot, of course, limit
 * the amount of kva for pipes in general though.
 */
#define LIMITPIPEKVA (16*1024*1024)

/*
 * Limit the number of "big" pipes
 */
#define LIMITBIGPIPES	32
#define PIPEQ_MAX_CACHE 16      /* per-cpu pipe structure cache */

static int pipe_maxbig = LIMITBIGPIPES;
static int pipe_maxcache = PIPEQ_MAX_CACHE;
static int pipe_nbig;
static int pipe_bcache_alloc;
static int pipe_bkmem_alloc;
static int pipe_dwrite_enable = 1;	/* 0:copy, 1:kmem/sfbuf 2:force */
static int pipe_dwrite_sfbuf = 1;	/* 0:kmem_map 1:sfbufs 2:sfbufs_dmap */
					/* 3:sfbuf_dmap w/ forced invlpg */

SYSCTL_NODE(_kern, OID_AUTO, pipe, CTLFLAG_RW, 0, "Pipe operation");
SYSCTL_INT(_kern_pipe, OID_AUTO, nbig,
        CTLFLAG_RD, &pipe_nbig, 0, "number of big pipes allocated");
SYSCTL_INT(_kern_pipe, OID_AUTO, maxcache,
        CTLFLAG_RW, &pipe_maxcache, 0, "max pipes cached per-cpu");
SYSCTL_INT(_kern_pipe, OID_AUTO, maxbig,
        CTLFLAG_RW, &pipe_maxbig, 0, "max number of big pipes");
SYSCTL_INT(_kern_pipe, OID_AUTO, dwrite_enable,
        CTLFLAG_RW, &pipe_dwrite_enable, 0, "1:enable/2:force direct writes");
SYSCTL_INT(_kern_pipe, OID_AUTO, dwrite_sfbuf,
        CTLFLAG_RW, &pipe_dwrite_sfbuf, 0,
	"(if dwrite_enable) 0:kmem 1:sfbuf 2:sfbuf_dmap 3:sfbuf_dmap_forceinvlpg");
#if !defined(NO_PIPE_SYSCTL_STATS)
SYSCTL_INT(_kern_pipe, OID_AUTO, bcache_alloc,
        CTLFLAG_RW, &pipe_bcache_alloc, 0, "pipe buffer from pcpu cache");
SYSCTL_INT(_kern_pipe, OID_AUTO, bkmem_alloc,
        CTLFLAG_RW, &pipe_bkmem_alloc, 0, "pipe buffer from kmem");
#endif

static void pipeclose (struct pipe *cpipe);
static void pipe_free_kmem (struct pipe *cpipe);
static int pipe_create (struct pipe **cpipep);
static __inline int pipelock (struct pipe *cpipe, int catch);
static __inline void pipeunlock (struct pipe *cpipe);
static __inline void pipeselwakeup (struct pipe *cpipe);
#ifndef PIPE_NODIRECT
static int pipe_build_write_buffer (struct pipe *wpipe, struct uio *uio);
static int pipe_direct_write (struct pipe *wpipe, struct uio *uio);
static void pipe_clone_write_buffer (struct pipe *wpipe);
#endif
static int pipespace (struct pipe *cpipe, int size);

/*
 * The pipe system call for the DTYPE_PIPE type of pipes
 *
 * pipe_args(int dummy)
 */

/* ARGSUSED */
int
sys_pipe(struct pipe_args *uap)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct file *rf, *wf;
	struct pipe *rpipe, *wpipe;
	int fd1, fd2, error;

	KKASSERT(p);

	rpipe = wpipe = NULL;
	if (pipe_create(&rpipe) || pipe_create(&wpipe)) {
		pipeclose(rpipe);
		pipeclose(wpipe);
		return (ENFILE);
	}

	rpipe->pipe_state |= PIPE_DIRECTOK;
	wpipe->pipe_state |= PIPE_DIRECTOK;

	/*
	 * Select the direct-map features to use for this pipe.  Since the
	 * sysctl's can change on the fly we record the settings when the
	 * pipe is created.
	 *
	 * Generally speaking the system will default to what we consider
	 * to be the best-balanced and most stable option.  Right now this
	 * is SFBUF1.  Modes 2 and 3 are considered experimental at the
	 * moment.
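	 *
	 * For example, setting the sysctl kern.pipe.dwrite_enable to 0
	 * leaves newly created pipes in plain PIPE_COPY mode, while a
	 * value of 2 makes pipe_write() attempt the direct path even for
	 * transfers smaller than PIPE_MINDIRECT; kern.pipe.dwrite_sfbuf
	 * selects the mapping strategy recorded in pipe_feature below.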
	 */
	wpipe->pipe_feature = PIPE_COPY;
	if (pipe_dwrite_enable) {
		switch(pipe_dwrite_sfbuf) {
		case 0:
			wpipe->pipe_feature = PIPE_KMEM;
			break;
		case 1:
			wpipe->pipe_feature = PIPE_SFBUF1;
			break;
		case 2:
		case 3:
			wpipe->pipe_feature = PIPE_SFBUF2;
			break;
		}
	}
	rpipe->pipe_feature = wpipe->pipe_feature;

	error = falloc(p, &rf, &fd1);
	if (error) {
		pipeclose(rpipe);
		pipeclose(wpipe);
		return (error);
	}
	uap->sysmsg_fds[0] = fd1;

	/*
	 * Warning: once we've gotten past allocation of the fd for the
	 * read-side, we can only drop the read side via fdrop() in order
	 * to avoid races against processes which manage to dup() the read
	 * side while we are blocked trying to allocate the write side.
	 */
	rf->f_type = DTYPE_PIPE;
	rf->f_flag = FREAD | FWRITE;
	rf->f_ops = &pipeops;
	rf->f_data = rpipe;
	error = falloc(p, &wf, &fd2);
	if (error) {
		fsetfd(p, NULL, fd1);
		fdrop(rf);
		/* rpipe has been closed by fdrop(). */
		pipeclose(wpipe);
		return (error);
	}
	wf->f_type = DTYPE_PIPE;
	wf->f_flag = FREAD | FWRITE;
	wf->f_ops = &pipeops;
	wf->f_data = wpipe;
	uap->sysmsg_fds[1] = fd2;

	rpipe->pipe_peer = wpipe;
	wpipe->pipe_peer = rpipe;

	fsetfd(p, rf, fd1);
	fsetfd(p, wf, fd2);
	fdrop(rf);
	fdrop(wf);

	return (0);
}

/*
 * Allocate kva for the pipe circular buffer; the space is pageable.
 * This routine will 'realloc' the size of a pipe safely: if it fails
 * to do so it retains the old buffer and returns ENOMEM.
 */
static int
pipespace(struct pipe *cpipe, int size)
{
	struct vm_object *object;
	caddr_t buffer;
	int npages, error;

	npages = round_page(size) / PAGE_SIZE;
	object = cpipe->pipe_buffer.object;

	/*
	 * [re]create the object if necessary and reserve space for it
	 * in the kernel_map.  The object and memory are pageable.  On
	 * success, free the old resources before assigning the new
	 * ones.
	 */
	if (object == NULL || object->size != npages) {
		object = vm_object_allocate(OBJT_DEFAULT, npages);
		buffer = (caddr_t) vm_map_min(kernel_map);

		error = vm_map_find(kernel_map, object, 0,
				    (vm_offset_t *)&buffer, size,
				    1,
				    VM_MAPTYPE_NORMAL,
				    VM_PROT_ALL, VM_PROT_ALL,
				    0);

		if (error != KERN_SUCCESS) {
			vm_object_deallocate(object);
			return (ENOMEM);
		}
		pipe_free_kmem(cpipe);
		cpipe->pipe_buffer.object = object;
		cpipe->pipe_buffer.buffer = buffer;
		cpipe->pipe_buffer.size = size;
		++pipe_bkmem_alloc;
	} else {
		++pipe_bcache_alloc;
	}
	cpipe->pipe_buffer.in = 0;
	cpipe->pipe_buffer.out = 0;
	cpipe->pipe_buffer.cnt = 0;
	return (0);
}

/*
 * Initialize and allocate VM and memory for pipe, pulling the pipe from
 * our per-cpu cache if possible.  For now make sure it is sized for the
 * smaller PIPE_SIZE default.
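 *
 * Pipes released by pipeclose() are parked on the per-cpu gd_pipeq list
 * (up to kern.pipe.maxcache entries) with their PIPE_SIZE buffer still
 * attached, so the common case here is a cheap cache hit that avoids
 * both the kmalloc() and a fresh kernel_map allocation in pipespace().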
 */
static int
pipe_create(struct pipe **cpipep)
{
	globaldata_t gd = mycpu;
	struct pipe *cpipe;
	int error;

	if ((cpipe = gd->gd_pipeq) != NULL) {
		gd->gd_pipeq = cpipe->pipe_peer;
		--gd->gd_pipeqcount;
		cpipe->pipe_peer = NULL;
	} else {
		cpipe = kmalloc(sizeof(struct pipe), M_PIPE, M_WAITOK|M_ZERO);
	}
	*cpipep = cpipe;
	if ((error = pipespace(cpipe, PIPE_SIZE)) != 0)
		return (error);
	vfs_timestamp(&cpipe->pipe_ctime);
	cpipe->pipe_atime = cpipe->pipe_ctime;
	cpipe->pipe_mtime = cpipe->pipe_ctime;
	return (0);
}


/*
 * lock a pipe for I/O, blocking other access
 */
static __inline int
pipelock(struct pipe *cpipe, int catch)
{
	int error;

	while (cpipe->pipe_state & PIPE_LOCK) {
		cpipe->pipe_state |= PIPE_LWANT;
		error = tsleep(cpipe, (catch ? PCATCH : 0), "pipelk", 0);
		if (error != 0)
			return (error);
	}
	cpipe->pipe_state |= PIPE_LOCK;
	return (0);
}

/*
 * unlock a pipe I/O lock
 */
static __inline void
pipeunlock(struct pipe *cpipe)
{
	cpipe->pipe_state &= ~PIPE_LOCK;
	if (cpipe->pipe_state & PIPE_LWANT) {
		cpipe->pipe_state &= ~PIPE_LWANT;
		wakeup(cpipe);
	}
}

static __inline void
pipeselwakeup(struct pipe *cpipe)
{
	if (cpipe->pipe_state & PIPE_SEL) {
		cpipe->pipe_state &= ~PIPE_SEL;
		selwakeup(&cpipe->pipe_sel);
	}
	if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio)
		pgsigio(cpipe->pipe_sigio, SIGIO, 0);
	KNOTE(&cpipe->pipe_sel.si_note, 0);
}

/*
 * MPALMOSTSAFE (acquires mplock)
 */
static int
pipe_read(struct file *fp, struct uio *uio, struct ucred *cred, int fflags)
{
	struct pipe *rpipe;
	int error;
	int nread = 0;
	int nbio;
	u_int size;

	get_mplock();
	rpipe = (struct pipe *) fp->f_data;
	++rpipe->pipe_busy;
	error = pipelock(rpipe, 1);
	if (error)
		goto unlocked_error;

	if (fflags & O_FBLOCKING)
		nbio = 0;
	else if (fflags & O_FNONBLOCKING)
		nbio = 1;
	else if (fp->f_flag & O_NONBLOCK)
		nbio = 1;
	else
		nbio = 0;

	while (uio->uio_resid) {
		caddr_t va;

		if (rpipe->pipe_buffer.cnt > 0) {
			/*
			 * normal pipe buffer receive
			 */
			size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
			if (size > rpipe->pipe_buffer.cnt)
				size = rpipe->pipe_buffer.cnt;
			if (size > (u_int) uio->uio_resid)
				size = (u_int) uio->uio_resid;

			error = uiomove(&rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
					size, uio);
			if (error)
				break;

			rpipe->pipe_buffer.out += size;
			if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
				rpipe->pipe_buffer.out = 0;

			rpipe->pipe_buffer.cnt -= size;

			/*
			 * If there is no more to read in the pipe, reset
			 * its pointers to the beginning.  This improves
			 * cache hit stats.
			 */
			if (rpipe->pipe_buffer.cnt == 0) {
				rpipe->pipe_buffer.in = 0;
				rpipe->pipe_buffer.out = 0;
			}
			nread += size;
#ifndef PIPE_NODIRECT
		} else if (rpipe->pipe_kva &&
			   rpipe->pipe_feature == PIPE_KMEM &&
			   (rpipe->pipe_state & (PIPE_DIRECTW|PIPE_DIRECTIP))
			       == PIPE_DIRECTW
		) {
			/*
			 * Direct copy using source-side kva mapping
			 */
			size = rpipe->pipe_map.xio_bytes -
				rpipe->pipe_buffer.out;
			if (size > (u_int)uio->uio_resid)
				size = (u_int)uio->uio_resid;
			va = (caddr_t)rpipe->pipe_kva +
				xio_kvaoffset(&rpipe->pipe_map, rpipe->pipe_buffer.out);
			error = uiomove(va, size, uio);
			if (error)
				break;
			nread += size;
			rpipe->pipe_buffer.out += size;
			if (rpipe->pipe_buffer.out == rpipe->pipe_map.xio_bytes) {
				rpipe->pipe_state |= PIPE_DIRECTIP;
				rpipe->pipe_state &= ~PIPE_DIRECTW;
				/* reset out index for copy mode */
				rpipe->pipe_buffer.out = 0;
				wakeup(rpipe);
			}
		} else if (rpipe->pipe_buffer.out != rpipe->pipe_map.xio_bytes &&
			   rpipe->pipe_kva &&
			   rpipe->pipe_feature == PIPE_SFBUF2 &&
			   (rpipe->pipe_state & (PIPE_DIRECTW|PIPE_DIRECTIP))
			       == PIPE_DIRECTW
		) {
			/*
			 * Direct copy, bypassing a kernel buffer.  We cannot
			 * mess with the direct-write buffer until
			 * PIPE_DIRECTIP is cleared.  In order to prevent
			 * the pipe_write code from racing itself in
			 * direct_write, we set DIRECTIP when we clear
			 * DIRECTW after we have exhausted the buffer.
			 */
			if (pipe_dwrite_sfbuf == 3)
				rpipe->pipe_kvamask = 0;
			pmap_qenter2(rpipe->pipe_kva, rpipe->pipe_map.xio_pages,
				    rpipe->pipe_map.xio_npages,
				    &rpipe->pipe_kvamask);
			size = rpipe->pipe_map.xio_bytes -
				rpipe->pipe_buffer.out;
			if (size > (u_int)uio->uio_resid)
				size = (u_int)uio->uio_resid;
			va = (caddr_t)rpipe->pipe_kva + xio_kvaoffset(&rpipe->pipe_map, rpipe->pipe_buffer.out);
			error = uiomove(va, size, uio);
			if (error)
				break;
			nread += size;
			rpipe->pipe_buffer.out += size;
			if (rpipe->pipe_buffer.out == rpipe->pipe_map.xio_bytes) {
				rpipe->pipe_state |= PIPE_DIRECTIP;
				rpipe->pipe_state &= ~PIPE_DIRECTW;
				/* reset out index for copy mode */
				rpipe->pipe_buffer.out = 0;
				wakeup(rpipe);
			}
		} else if (rpipe->pipe_buffer.out != rpipe->pipe_map.xio_bytes &&
			   rpipe->pipe_feature == PIPE_SFBUF1 &&
			   (rpipe->pipe_state & (PIPE_DIRECTW|PIPE_DIRECTIP))
			       == PIPE_DIRECTW
		) {
			/*
			 * Direct copy, bypassing a kernel buffer.  We cannot
			 * mess with the direct-write buffer until
			 * PIPE_DIRECTIP is cleared.  In order to prevent
			 * the pipe_write code from racing itself in
			 * direct_write, we set DIRECTIP when we clear
			 * DIRECTW after we have exhausted the buffer.
			 */
			error = xio_uio_copy(&rpipe->pipe_map, rpipe->pipe_buffer.out, uio, &size);
			if (error)
				break;
			nread += size;
			rpipe->pipe_buffer.out += size;
			if (rpipe->pipe_buffer.out == rpipe->pipe_map.xio_bytes) {
				rpipe->pipe_state |= PIPE_DIRECTIP;
				rpipe->pipe_state &= ~PIPE_DIRECTW;
				/* reset out index for copy mode */
				rpipe->pipe_buffer.out = 0;
				wakeup(rpipe);
			}
#endif
		} else {
			/*
			 * detect EOF condition
			 * read returns 0 on EOF, no need to set error
			 */
			if (rpipe->pipe_state & PIPE_EOF)
				break;

			/*
			 * If the "write-side" has been blocked, wake it up now.
			 */
			if (rpipe->pipe_state & PIPE_WANTW) {
				rpipe->pipe_state &= ~PIPE_WANTW;
				wakeup(rpipe);
			}

			/*
			 * Break if some data was read.
			 */
			if (nread > 0)
				break;

			/*
			 * Unlock the pipe buffer for our remaining
			 * processing.  We will either break out with an
			 * error or we will sleep and relock to loop.
			 */
			pipeunlock(rpipe);

			/*
			 * Handle non-blocking mode operation or
			 * wait for more data.
			 */
			if (nbio) {
				error = EAGAIN;
			} else {
				rpipe->pipe_state |= PIPE_WANTR;
				if ((error = tsleep(rpipe, PCATCH|PNORESCHED,
						    "piperd", 0)) == 0) {
					error = pipelock(rpipe, 1);
				}
			}
			if (error)
				goto unlocked_error;
		}
	}
	pipeunlock(rpipe);

	if (error == 0)
		vfs_timestamp(&rpipe->pipe_atime);
unlocked_error:
	--rpipe->pipe_busy;

	/*
	 * PIPE_WANT processing only makes sense if pipe_busy is 0.
	 */
	if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) {
		rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW);
		wakeup(rpipe);
	} else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) {
		/*
		 * Handle write blocking hysteresis.
		 */
		if (rpipe->pipe_state & PIPE_WANTW) {
			rpipe->pipe_state &= ~PIPE_WANTW;
			wakeup(rpipe);
		}
	}

	if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF)
		pipeselwakeup(rpipe);
	rel_mplock();
	return (error);
}

#ifndef PIPE_NODIRECT
/*
 * Map the sending process's buffer into kernel space and wire it.
 * This is similar to a physical write operation.
 */
static int
pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio)
{
	int error;
	u_int size;

	size = (u_int) uio->uio_iov->iov_len;
	if (size > wpipe->pipe_buffer.size)
		size = wpipe->pipe_buffer.size;

	if (uio->uio_segflg == UIO_SYSSPACE) {
		error = xio_init_kbuf(&wpipe->pipe_map, uio->uio_iov->iov_base,
					size);
	} else {
		error = xio_init_ubuf(&wpipe->pipe_map, uio->uio_iov->iov_base,
					size, XIOF_READ);
	}
	wpipe->pipe_buffer.out = 0;
	if (error)
		return(error);

	/*
	 * Create a kernel map for KMEM and SFBUF2 copy modes.  SFBUF2 will
	 * map the pages on the target while KMEM maps the pages now.
	 */
	switch(wpipe->pipe_feature) {
	case PIPE_KMEM:
	case PIPE_SFBUF2:
		if (wpipe->pipe_kva == NULL) {
			wpipe->pipe_kva =
			    kmem_alloc_nofault(kernel_map, XIO_INTERNAL_SIZE);
			wpipe->pipe_kvamask = 0;
		}
		if (wpipe->pipe_feature == PIPE_KMEM) {
			pmap_qenter(wpipe->pipe_kva, wpipe->pipe_map.xio_pages,
				    wpipe->pipe_map.xio_npages);
		}
		break;
	default:
		break;
	}

	/*
	 * And update the uio data.  The XIO might have loaded fewer bytes
	 * than requested so reload 'size'.
	 */
	size = wpipe->pipe_map.xio_bytes;
	uio->uio_iov->iov_len -= size;
	uio->uio_iov->iov_base += size;
	if (uio->uio_iov->iov_len == 0)
		uio->uio_iov++;
	uio->uio_resid -= size;
	uio->uio_offset += size;
	return (0);
}

/*
 * In the case of a signal, the writing process might go away.  This
 * code copies the data into the circular buffer so that the source
 * pages can be freed without loss of data.
 *
 * Note that in direct mode pipe_buffer.out is used to track the
 * XIO offset.  We are converting the direct mode into buffered mode
 * which changes the meaning of pipe_buffer.out.
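 *
 * For example, if the writer mapped 6000 bytes and the reader had already
 * consumed 4000 of them, pipe_buffer.out is 4000 on entry; after the clone
 * the remaining 2000 bytes sit at the start of the circular buffer with
 * in == cnt == 2000 and out == 0, and the reader continues in ordinary
 * buffered mode.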
 */
static void
pipe_clone_write_buffer(struct pipe *wpipe)
{
	int size;
	int offset;

	offset = wpipe->pipe_buffer.out;
	size = wpipe->pipe_map.xio_bytes - offset;

	KKASSERT(size <= wpipe->pipe_buffer.size);

	wpipe->pipe_buffer.in = size;
	wpipe->pipe_buffer.out = 0;
	wpipe->pipe_buffer.cnt = size;
	wpipe->pipe_state &= ~(PIPE_DIRECTW | PIPE_DIRECTIP);

	xio_copy_xtok(&wpipe->pipe_map, offset, wpipe->pipe_buffer.buffer, size);
	xio_release(&wpipe->pipe_map);
	if (wpipe->pipe_kva) {
		pmap_qremove(wpipe->pipe_kva, XIO_INTERNAL_PAGES);
		kmem_free(kernel_map, wpipe->pipe_kva, XIO_INTERNAL_SIZE);
		wpipe->pipe_kva = NULL;
	}
}

/*
 * This implements the pipe buffer write mechanism.  Note that only
 * a direct write OR a normal pipe write can be pending at any given time.
 * If there are any characters in the pipe buffer, the direct write will
 * be deferred until the receiving process grabs all of the bytes from
 * the pipe buffer.  Then the direct mapping write is set-up.
 */
static int
pipe_direct_write(struct pipe *wpipe, struct uio *uio)
{
	int error;

retry:
	while (wpipe->pipe_state & (PIPE_DIRECTW|PIPE_DIRECTIP)) {
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
		wpipe->pipe_state |= PIPE_WANTW;
		error = tsleep(wpipe, PCATCH, "pipdww", 0);
		if (error)
			goto error2;
		if (wpipe->pipe_state & PIPE_EOF) {
			error = EPIPE;
			goto error2;
		}
	}
	KKASSERT(wpipe->pipe_map.xio_bytes == 0);
	if (wpipe->pipe_buffer.cnt > 0) {
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}

		wpipe->pipe_state |= PIPE_WANTW;
		error = tsleep(wpipe, PCATCH, "pipdwc", 0);
		if (error)
			goto error2;
		if (wpipe->pipe_state & PIPE_EOF) {
			error = EPIPE;
			goto error2;
		}
		goto retry;
	}

	/*
	 * Build our direct-write buffer
	 */
	wpipe->pipe_state |= PIPE_DIRECTW | PIPE_DIRECTIP;
	error = pipe_build_write_buffer(wpipe, uio);
	if (error)
		goto error1;
	wpipe->pipe_state &= ~PIPE_DIRECTIP;

	/*
	 * Wait until the receiver has snarfed the data.  Since we are likely
	 * going to sleep we optimize the case and yield synchronously,
	 * possibly avoiding the tsleep().
	 */
	error = 0;
	while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
		if (wpipe->pipe_state & PIPE_EOF) {
			pipelock(wpipe, 0);
			xio_release(&wpipe->pipe_map);
			if (wpipe->pipe_kva) {
				pmap_qremove(wpipe->pipe_kva, XIO_INTERNAL_PAGES);
				kmem_free(kernel_map, wpipe->pipe_kva, XIO_INTERNAL_SIZE);
				wpipe->pipe_kva = NULL;
			}
			pipeunlock(wpipe);
			pipeselwakeup(wpipe);
			error = EPIPE;
			goto error1;
		}
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
		pipeselwakeup(wpipe);
		error = tsleep(wpipe, PCATCH|PNORESCHED, "pipdwt", 0);
	}
	pipelock(wpipe,0);
	if (wpipe->pipe_state & PIPE_DIRECTW) {
		/*
		 * this bit of trickery substitutes a kernel buffer for
		 * the process that might be going away.
		 */
		pipe_clone_write_buffer(wpipe);
		KKASSERT((wpipe->pipe_state & PIPE_DIRECTIP) == 0);
	} else {
		/*
		 * note: The pipe_kva mapping is not qremove'd here.  For
		 * legacy PIPE_KMEM mode this constitutes an improvement
		 * over the original FreeBSD-4 algorithm.
		 * For PIPE_SFBUF2 mode the kva mapping must not be removed
		 * to get the caching benefit.
		 *
		 * For testing purposes we will give the original algorithm
		 * the benefit of the doubt 'what it could have been', and
		 * keep the optimization.
		 */
		KKASSERT(wpipe->pipe_state & PIPE_DIRECTIP);
		xio_release(&wpipe->pipe_map);
		wpipe->pipe_state &= ~PIPE_DIRECTIP;
	}
	pipeunlock(wpipe);
	return (error);

	/*
	 * Direct-write error, clear the direct write flags.
	 */
error1:
	wpipe->pipe_state &= ~(PIPE_DIRECTW | PIPE_DIRECTIP);
	/* fallthrough */

	/*
	 * General error, wakeup the other side if it happens to be sleeping.
	 */
error2:
	wakeup(wpipe);
	return (error);
}
#endif

/*
 * MPALMOSTSAFE - acquires mplock
 */
static int
pipe_write(struct file *fp, struct uio *uio, struct ucred *cred, int fflags)
{
	int error = 0;
	int orig_resid;
	int nbio;
	struct pipe *wpipe, *rpipe;

	get_mplock();
	rpipe = (struct pipe *) fp->f_data;
	wpipe = rpipe->pipe_peer;

	/*
	 * detect loss of pipe read side, issue SIGPIPE if lost.
	 */
	if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
		rel_mplock();
		return (EPIPE);
	}
	++wpipe->pipe_busy;

	if (fflags & O_FBLOCKING)
		nbio = 0;
	else if (fflags & O_FNONBLOCKING)
		nbio = 1;
	else if (fp->f_flag & O_NONBLOCK)
		nbio = 1;
	else
		nbio = 0;

	/*
	 * If it is advantageous to resize the pipe buffer, do
	 * so.
	 */
	if ((uio->uio_resid > PIPE_SIZE) &&
	    (pipe_nbig < pipe_maxbig) &&
	    (wpipe->pipe_state & (PIPE_DIRECTW|PIPE_DIRECTIP)) == 0 &&
	    (wpipe->pipe_buffer.size <= PIPE_SIZE) &&
	    (wpipe->pipe_buffer.cnt == 0)) {

		if ((error = pipelock(wpipe,1)) == 0) {
			if (pipespace(wpipe, BIG_PIPE_SIZE) == 0)
				pipe_nbig++;
			pipeunlock(wpipe);
		}
	}

	/*
	 * If an early error occurred unbusy and return, waking up any pending
	 * readers.
	 */
	if (error) {
		--wpipe->pipe_busy;
		if ((wpipe->pipe_busy == 0) &&
		    (wpipe->pipe_state & PIPE_WANT)) {
			wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
			wakeup(wpipe);
		}
		rel_mplock();
		return(error);
	}

	KASSERT(wpipe->pipe_buffer.buffer != NULL, ("pipe buffer gone"));

	orig_resid = uio->uio_resid;

	while (uio->uio_resid) {
		int space;

#ifndef PIPE_NODIRECT
		/*
		 * If the transfer is large, we can gain performance if
		 * we do process-to-process copies directly.
		 * If the write is non-blocking, we don't use the
		 * direct write mechanism.
		 *
		 * The direct write mechanism will detect the reader going
		 * away on us.
		 */
		if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT ||
		     pipe_dwrite_enable > 1) &&
		    nbio == 0 &&
		    pipe_dwrite_enable) {
			error = pipe_direct_write(wpipe, uio);
			if (error)
				break;
			continue;
		}
#endif

		/*
		 * Pipe buffered writes cannot be coincidental with
		 * direct writes.  We wait until the currently executing
		 * direct write is completed before we start filling the
		 * pipe buffer.  We break out if a signal occurs or the
		 * reader goes away.
		 */
	retrywrite:
		while (wpipe->pipe_state & (PIPE_DIRECTW|PIPE_DIRECTIP)) {
			if (wpipe->pipe_state & PIPE_WANTR) {
				wpipe->pipe_state &= ~PIPE_WANTR;
				wakeup(wpipe);
			}
			error = tsleep(wpipe, PCATCH, "pipbww", 0);
			if (wpipe->pipe_state & PIPE_EOF)
				break;
			if (error)
				break;
		}
		if (wpipe->pipe_state & PIPE_EOF) {
			error = EPIPE;
			break;
		}

		space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;

		/* Writes of size <= PIPE_BUF must be atomic. */
		if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF))
			space = 0;

		/*
		 * Write to fill, read size handles write hysteresis.  Also
		 * additional restrictions can cause select-based non-blocking
		 * writes to spin.
		 */
		if (space > 0) {
			if ((error = pipelock(wpipe,1)) == 0) {
				int size;	/* Transfer size */
				int segsize;	/* first segment to transfer */

				/*
				 * It is possible for a direct write to
				 * slip in on us... handle it here...
				 */
				if (wpipe->pipe_state & (PIPE_DIRECTW|PIPE_DIRECTIP)) {
					pipeunlock(wpipe);
					goto retrywrite;
				}
				/*
				 * If a process blocked in uiomove, our
				 * value for space might be bad.
				 *
				 * XXX will we be ok if the reader has gone
				 * away here?
				 */
				if (space > wpipe->pipe_buffer.size -
				    wpipe->pipe_buffer.cnt) {
					pipeunlock(wpipe);
					goto retrywrite;
				}

				/*
				 * Transfer size is minimum of uio transfer
				 * and free space in pipe buffer.
				 */
				if (space > uio->uio_resid)
					size = uio->uio_resid;
				else
					size = space;
				/*
				 * First segment to transfer is minimum of
				 * transfer size and contiguous space in
				 * pipe buffer.  If first segment to transfer
				 * is less than the transfer size, we've got
				 * a wraparound in the buffer.
				 */
				segsize = wpipe->pipe_buffer.size -
					wpipe->pipe_buffer.in;
				if (segsize > size)
					segsize = size;

				/* Transfer first segment */

				error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
						segsize, uio);

				if (error == 0 && segsize < size) {
					/*
					 * Transfer remaining part now, to
					 * support atomic writes.  Wraparound
					 * happened.
					 */
					if (wpipe->pipe_buffer.in + segsize !=
					    wpipe->pipe_buffer.size)
						panic("Expected pipe buffer wraparound disappeared");

					error = uiomove(&wpipe->pipe_buffer.buffer[0],
							size - segsize, uio);
				}
				if (error == 0) {
					wpipe->pipe_buffer.in += size;
					if (wpipe->pipe_buffer.in >=
					    wpipe->pipe_buffer.size) {
						if (wpipe->pipe_buffer.in != size - segsize + wpipe->pipe_buffer.size)
							panic("Expected wraparound bad");
						wpipe->pipe_buffer.in = size - segsize;
					}

					wpipe->pipe_buffer.cnt += size;
					if (wpipe->pipe_buffer.cnt > wpipe->pipe_buffer.size)
						panic("Pipe buffer overflow");

				}
				pipeunlock(wpipe);
			}
			if (error)
				break;

		} else {
			/*
			 * If the "read-side" has been blocked, wake it up now
			 * and yield to let it drain synchronously rather
			 * than block.
			 */
			if (wpipe->pipe_state & PIPE_WANTR) {
				wpipe->pipe_state &= ~PIPE_WANTR;
				wakeup(wpipe);
			}

			/*
			 * don't block on non-blocking I/O
			 */
			if (nbio) {
				error = EAGAIN;
				break;
			}

			/*
			 * We have no more space and have something to offer,
			 * wake up select/poll.
			 */
			pipeselwakeup(wpipe);

			wpipe->pipe_state |= PIPE_WANTW;
			error = tsleep(wpipe, PCATCH|PNORESCHED, "pipewr", 0);
			if (error != 0)
				break;
			/*
			 * If read side wants to go away, we just issue a signal
			 * to ourselves.
			 */
			if (wpipe->pipe_state & PIPE_EOF) {
				error = EPIPE;
				break;
			}
		}
	}

	--wpipe->pipe_busy;

	if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) {
		wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
		wakeup(wpipe);
	} else if (wpipe->pipe_buffer.cnt > 0) {
		/*
		 * If we have put any characters in the buffer, we wake up
		 * the reader.
		 */
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
	}

	/*
	 * Don't return EPIPE if I/O was successful
	 */
	if ((wpipe->pipe_buffer.cnt == 0) &&
	    (uio->uio_resid == 0) &&
	    (error == EPIPE)) {
		error = 0;
	}

	if (error == 0)
		vfs_timestamp(&wpipe->pipe_mtime);

	/*
	 * We have something to offer,
	 * wake up select/poll.
	 */
	if (wpipe->pipe_buffer.cnt)
		pipeselwakeup(wpipe);
	rel_mplock();
	return (error);
}

/*
 * MPALMOSTSAFE - acquires mplock
 *
 * we implement a very minimal set of ioctls for compatibility with sockets.
 */
int
pipe_ioctl(struct file *fp, u_long cmd, caddr_t data, struct ucred *cred)
{
	struct pipe *mpipe;
	int error;

	get_mplock();
	mpipe = (struct pipe *)fp->f_data;

	switch (cmd) {
	case FIOASYNC:
		if (*(int *)data) {
			mpipe->pipe_state |= PIPE_ASYNC;
		} else {
			mpipe->pipe_state &= ~PIPE_ASYNC;
		}
		error = 0;
		break;
	case FIONREAD:
		if (mpipe->pipe_state & PIPE_DIRECTW) {
			*(int *)data = mpipe->pipe_map.xio_bytes -
					mpipe->pipe_buffer.out;
		} else {
			*(int *)data = mpipe->pipe_buffer.cnt;
		}
		error = 0;
		break;
	case FIOSETOWN:
		error = fsetown(*(int *)data, &mpipe->pipe_sigio);
		break;
	case FIOGETOWN:
		*(int *)data = fgetown(mpipe->pipe_sigio);
		error = 0;
		break;
	case TIOCSPGRP:
		/* This is deprecated, FIOSETOWN should be used instead. */
		error = fsetown(-(*(int *)data), &mpipe->pipe_sigio);
		break;

	case TIOCGPGRP:
		/* This is deprecated, FIOGETOWN should be used instead. */
		*(int *)data = -fgetown(mpipe->pipe_sigio);
		error = 0;
		break;
	default:
		error = ENOTTY;
		break;
	}
	rel_mplock();
	return (error);
}

/*
 * MPALMOSTSAFE - acquires mplock
 */
int
pipe_poll(struct file *fp, int events, struct ucred *cred)
{
	struct pipe *rpipe;
	struct pipe *wpipe;
	int revents = 0;

	get_mplock();
	rpipe = (struct pipe *)fp->f_data;
	wpipe = rpipe->pipe_peer;
	if (events & (POLLIN | POLLRDNORM))
		if ((rpipe->pipe_state & PIPE_DIRECTW) ||
		    (rpipe->pipe_buffer.cnt > 0) ||
		    (rpipe->pipe_state & PIPE_EOF))
			revents |= events & (POLLIN | POLLRDNORM);

	if (events & (POLLOUT | POLLWRNORM))
		if (wpipe == NULL || (wpipe->pipe_state & PIPE_EOF) ||
		    (((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
		     (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF))
			revents |= events & (POLLOUT | POLLWRNORM);

	if ((rpipe->pipe_state & PIPE_EOF) ||
	    (wpipe == NULL) ||
	    (wpipe->pipe_state & PIPE_EOF))
		revents |= POLLHUP;

	if (revents == 0) {
		if (events & (POLLIN | POLLRDNORM)) {
			selrecord(curthread, &rpipe->pipe_sel);
			rpipe->pipe_state |= PIPE_SEL;
		}

		if (events & (POLLOUT | POLLWRNORM)) {
			selrecord(curthread, &wpipe->pipe_sel);
			wpipe->pipe_state |= PIPE_SEL;
		}
	}
	rel_mplock();
	return (revents);
}

/*
 * MPALMOSTSAFE - acquires mplock
 */
static int
pipe_stat(struct file *fp, struct stat *ub, struct ucred *cred)
{
	struct pipe *pipe;

	get_mplock();
	pipe = (struct pipe *)fp->f_data;

	bzero((caddr_t)ub, sizeof(*ub));
	ub->st_mode = S_IFIFO;
	ub->st_blksize = pipe->pipe_buffer.size;
	ub->st_size = pipe->pipe_buffer.cnt;
	if (ub->st_size == 0 && (pipe->pipe_state & PIPE_DIRECTW)) {
		ub->st_size = pipe->pipe_map.xio_bytes -
				pipe->pipe_buffer.out;
	}
	ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize;
	ub->st_atimespec = pipe->pipe_atime;
	ub->st_mtimespec = pipe->pipe_mtime;
	ub->st_ctimespec = pipe->pipe_ctime;
	/*
	 * Left as 0: st_dev, st_ino, st_nlink, st_uid, st_gid, st_rdev,
	 * st_flags, st_gen.
	 * XXX (st_dev, st_ino) should be unique.
	 */
	rel_mplock();
	return (0);
}

/*
 * MPALMOSTSAFE - acquires mplock
 */
static int
pipe_close(struct file *fp)
{
	struct pipe *cpipe = (struct pipe *)fp->f_data;

	get_mplock();
	fp->f_ops = &badfileops;
	fp->f_data = NULL;
	funsetown(cpipe->pipe_sigio);
	pipeclose(cpipe);
	rel_mplock();
	return (0);
}

/*
 * Shutdown one or both directions of a full-duplex pipe.
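 *
 * Either descriptor may be used.  SHUT_RD marks this end's pipe with
 * PIPE_EOF so reads on this descriptor return EOF; SHUT_WR (and SHUT_RDWR)
 * marks the peer, so for example shutdown(fd, SHUT_WR) on the write-side
 * descriptor lets the reader see EOF and makes further writes fail with
 * EPIPE without giving up the descriptor.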
1316 * 1317 * MPALMOSTSAFE - acquires mplock 1318 */ 1319 static int 1320 pipe_shutdown(struct file *fp, int how) 1321 { 1322 struct pipe *rpipe; 1323 struct pipe *wpipe; 1324 int error = EPIPE; 1325 1326 get_mplock(); 1327 rpipe = (struct pipe *)fp->f_data; 1328 1329 switch(how) { 1330 case SHUT_RDWR: 1331 case SHUT_RD: 1332 if (rpipe) { 1333 rpipe->pipe_state |= PIPE_EOF; 1334 pipeselwakeup(rpipe); 1335 if (rpipe->pipe_busy) 1336 wakeup(rpipe); 1337 error = 0; 1338 } 1339 if (how == SHUT_RD) 1340 break; 1341 /* fall through */ 1342 case SHUT_WR: 1343 if (rpipe && (wpipe = rpipe->pipe_peer) != NULL) { 1344 wpipe->pipe_state |= PIPE_EOF; 1345 pipeselwakeup(wpipe); 1346 if (wpipe->pipe_busy) 1347 wakeup(wpipe); 1348 error = 0; 1349 } 1350 } 1351 rel_mplock(); 1352 return (error); 1353 } 1354 1355 static void 1356 pipe_free_kmem(struct pipe *cpipe) 1357 { 1358 if (cpipe->pipe_buffer.buffer != NULL) { 1359 if (cpipe->pipe_buffer.size > PIPE_SIZE) 1360 --pipe_nbig; 1361 kmem_free(kernel_map, 1362 (vm_offset_t)cpipe->pipe_buffer.buffer, 1363 cpipe->pipe_buffer.size); 1364 cpipe->pipe_buffer.buffer = NULL; 1365 cpipe->pipe_buffer.object = NULL; 1366 } 1367 #ifndef PIPE_NODIRECT 1368 KKASSERT(cpipe->pipe_map.xio_bytes == 0 && 1369 cpipe->pipe_map.xio_offset == 0 && 1370 cpipe->pipe_map.xio_npages == 0); 1371 #endif 1372 } 1373 1374 /* 1375 * shutdown the pipe 1376 */ 1377 static void 1378 pipeclose(struct pipe *cpipe) 1379 { 1380 globaldata_t gd; 1381 struct pipe *ppipe; 1382 1383 if (cpipe == NULL) 1384 return; 1385 1386 pipeselwakeup(cpipe); 1387 1388 /* 1389 * If the other side is blocked, wake it up saying that 1390 * we want to close it down. 1391 */ 1392 while (cpipe->pipe_busy) { 1393 wakeup(cpipe); 1394 cpipe->pipe_state |= PIPE_WANT | PIPE_EOF; 1395 tsleep(cpipe, 0, "pipecl", 0); 1396 } 1397 1398 /* 1399 * Disconnect from peer 1400 */ 1401 if ((ppipe = cpipe->pipe_peer) != NULL) { 1402 pipeselwakeup(ppipe); 1403 1404 ppipe->pipe_state |= PIPE_EOF; 1405 wakeup(ppipe); 1406 KNOTE(&ppipe->pipe_sel.si_note, 0); 1407 ppipe->pipe_peer = NULL; 1408 } 1409 1410 if (cpipe->pipe_kva) { 1411 pmap_qremove(cpipe->pipe_kva, XIO_INTERNAL_PAGES); 1412 kmem_free(kernel_map, cpipe->pipe_kva, XIO_INTERNAL_SIZE); 1413 cpipe->pipe_kva = NULL; 1414 } 1415 1416 /* 1417 * free or cache resources 1418 */ 1419 gd = mycpu; 1420 if (gd->gd_pipeqcount >= pipe_maxcache || 1421 cpipe->pipe_buffer.size != PIPE_SIZE 1422 ) { 1423 pipe_free_kmem(cpipe); 1424 kfree(cpipe, M_PIPE); 1425 } else { 1426 KKASSERT(cpipe->pipe_map.xio_npages == 0 && 1427 cpipe->pipe_map.xio_bytes == 0 && 1428 cpipe->pipe_map.xio_offset == 0); 1429 cpipe->pipe_state = 0; 1430 cpipe->pipe_busy = 0; 1431 cpipe->pipe_peer = gd->gd_pipeq; 1432 gd->gd_pipeq = cpipe; 1433 ++gd->gd_pipeqcount; 1434 } 1435 } 1436 1437 /* 1438 * MPALMOSTSAFE - acquires mplock 1439 */ 1440 static int 1441 pipe_kqfilter(struct file *fp, struct knote *kn) 1442 { 1443 struct pipe *cpipe; 1444 1445 get_mplock(); 1446 cpipe = (struct pipe *)kn->kn_fp->f_data; 1447 1448 switch (kn->kn_filter) { 1449 case EVFILT_READ: 1450 kn->kn_fop = &pipe_rfiltops; 1451 break; 1452 case EVFILT_WRITE: 1453 kn->kn_fop = &pipe_wfiltops; 1454 cpipe = cpipe->pipe_peer; 1455 if (cpipe == NULL) { 1456 /* other end of pipe has been closed */ 1457 rel_mplock(); 1458 return (EPIPE); 1459 } 1460 break; 1461 default: 1462 return (1); 1463 } 1464 kn->kn_hook = (caddr_t)cpipe; 1465 1466 SLIST_INSERT_HEAD(&cpipe->pipe_sel.si_note, kn, kn_selnext); 1467 rel_mplock(); 1468 return (0); 1469 } 1470 1471 static 
filt_pipedetach(struct knote *kn)
{
	struct pipe *cpipe = (struct pipe *)kn->kn_hook;

	SLIST_REMOVE(&cpipe->pipe_sel.si_note, kn, knote, kn_selnext);
}

/*ARGSUSED*/
static int
filt_piperead(struct knote *kn, long hint)
{
	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
	struct pipe *wpipe = rpipe->pipe_peer;

	kn->kn_data = rpipe->pipe_buffer.cnt;
	if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW)) {
		kn->kn_data = rpipe->pipe_map.xio_bytes -
				rpipe->pipe_buffer.out;
	}

	if ((rpipe->pipe_state & PIPE_EOF) ||
	    (wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
		kn->kn_flags |= EV_EOF;
		return (1);
	}
	return (kn->kn_data > 0);
}

/*ARGSUSED*/
static int
filt_pipewrite(struct knote *kn, long hint)
{
	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
	struct pipe *wpipe = rpipe->pipe_peer;

	if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
		kn->kn_data = 0;
		kn->kn_flags |= EV_EOF;
		return (1);
	}
	kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
	if (wpipe->pipe_state & PIPE_DIRECTW)
		kn->kn_data = 0;

	return (kn->kn_data >= PIPE_BUF);
}