/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_vnops.c	8.2 (Berkeley) 1/21/94
 * $FreeBSD: src/sys/kern/vfs_vnops.c,v 1.87.2.13 2002/12/29 18:19:53 dillon Exp $
 * $DragonFly: src/sys/kern/vfs_vnops.c,v 1.58 2008/06/28 17:59:49 dillon Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/proc.h>
#include <sys/priv.h>
#include <sys/mount.h>
#include <sys/nlookup.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/filio.h>
#include <sys/ttycom.h>
#include <sys/conf.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>

#include <sys/thread2.h>

static int vn_closefile (struct file *fp);
static int vn_ioctl (struct file *fp, u_long com, caddr_t data,
		struct ucred *cred);
static int vn_read (struct file *fp, struct uio *uio,
		struct ucred *cred, int flags);
static int svn_read (struct file *fp, struct uio *uio,
		struct ucred *cred, int flags);
static int vn_poll (struct file *fp, int events, struct ucred *cred);
static int vn_kqfilter (struct file *fp, struct knote *kn);
static int vn_statfile (struct file *fp, struct stat *sb, struct ucred *cred);
static int vn_write (struct file *fp, struct uio *uio,
		struct ucred *cred, int flags);
static int svn_write (struct file *fp, struct uio *uio,
		struct ucred *cred, int flags);

#ifdef SMP
static int read_mpsafe = 0;
SYSCTL_INT(_vfs, OID_AUTO, read_mpsafe, CTLFLAG_RW, &read_mpsafe, 0, "");
static int write_mpsafe = 0;
SYSCTL_INT(_vfs, OID_AUTO, write_mpsafe, CTLFLAG_RW, &write_mpsafe, 0, "");
static int getattr_mpsafe = 0;
SYSCTL_INT(_vfs, OID_AUTO, getattr_mpsafe, CTLFLAG_RW, &getattr_mpsafe, 0, "");
#else
#define	read_mpsafe	0
#define	write_mpsafe	0
#define	getattr_mpsafe	0
#endif

struct fileops vnode_fileops = {
	.fo_read = vn_read,
	.fo_write = vn_write,
	.fo_ioctl = vn_ioctl,
	.fo_poll = vn_poll,
	.fo_kqfilter = vn_kqfilter,
	.fo_stat = vn_statfile,
	.fo_close = vn_closefile,
	.fo_shutdown = nofo_shutdown
};

struct fileops specvnode_fileops = {
	.fo_read = svn_read,
	.fo_write = svn_write,
	.fo_ioctl = vn_ioctl,
	.fo_poll = vn_poll,
	.fo_kqfilter = vn_kqfilter,
	.fo_stat = vn_statfile,
	.fo_close = vn_closefile,
	.fo_shutdown = nofo_shutdown
};

/*
 * Shortcut the device read/write.  This avoids a lot of vnode junk.
 * Basically the specfs vnops for read and write take the locked vnode,
 * unlock it (because we can't hold the vnode locked while reading or writing
 * a device which may block indefinitely), issue the device operation, then
 * relock the vnode before returning, plus other junk.  This bypasses all
 * of that and just does the device operation.
 */
void
vn_setspecops(struct file *fp)
{
	if (vfs_fastdev && fp->f_ops == &vnode_fileops) {
		fp->f_ops = &specvnode_fileops;
	}
}

/*
 * Common code for vnode open operations.  Check permissions, and call
 * the VOP_NOPEN or VOP_NCREATE routine.
 *
 * The caller is responsible for setting up nd with nlookup_init() and
 * for cleaning it up with nlookup_done(), whether we return an error
 * or not.
 *
 * On success nd->nl_open_vp will hold a referenced and, if requested,
 * locked vnode.  A locked vnode is requested via NLC_LOCKVP.  If fp
 * is non-NULL the vnode will be installed in the file pointer.
 *
 * NOTE: The vnode is referenced just once on return whether or not it
 * is also installed in the file pointer.
 */
int
vn_open(struct nlookupdata *nd, struct file *fp, int fmode, int cmode)
{
	struct vnode *vp;
	struct ucred *cred = nd->nl_cred;
	struct vattr vat;
	struct vattr *vap = &vat;
	int error;

	/*
	 * Certain combinations are illegal
	 */
	if ((fmode & (FWRITE | O_TRUNC)) == O_TRUNC)
		return(EACCES);

	/*
	 * Lookup the path and create or obtain the vnode.  After a
	 * successful lookup a locked nd->nl_nch will be returned.
	 *
	 * The result of this section should be a locked vnode.
	 *
	 * XXX with only a little work we should be able to avoid locking
	 * the vnode if FWRITE, O_CREAT, and O_TRUNC are *not* set.
	 */
	nd->nl_flags |= NLC_OPEN;
	if (fmode & O_APPEND)
		nd->nl_flags |= NLC_APPEND;
	if (fmode & O_TRUNC)
		nd->nl_flags |= NLC_TRUNCATE;
	if (fmode & FREAD)
		nd->nl_flags |= NLC_READ;
	if (fmode & FWRITE)
		nd->nl_flags |= NLC_WRITE;
	if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
		nd->nl_flags |= NLC_FOLLOW;

	if (fmode & O_CREAT) {
		/*
		 * CONDITIONAL CREATE FILE CASE
		 *
		 * Setting NLC_CREATE causes a negative hit to store
		 * the negative hit ncp and not return an error.  Then
		 * nc_error or nc_vp may be checked to see if the ncp
		 * represents a negative hit.  NLC_CREATE also requires
		 * write permission on the governing directory or EPERM
		 * is returned.
		 */
		nd->nl_flags |= NLC_CREATE;
		nd->nl_flags |= NLC_REFDVP;
		bwillinode(1);
		error = nlookup(nd);
	} else {
		/*
		 * NORMAL OPEN FILE CASE
		 */
		error = nlookup(nd);
	}

	if (error)
		return (error);

	/*
	 * split case to allow us to re-resolve and retry the ncp in case
	 * we get ESTALE.
	 */
again:
	if (fmode & O_CREAT) {
		if (nd->nl_nch.ncp->nc_vp == NULL) {
			if ((error = ncp_writechk(&nd->nl_nch)) != 0)
				return (error);
			VATTR_NULL(vap);
			vap->va_type = VREG;
			vap->va_mode = cmode;
			if (fmode & O_EXCL)
				vap->va_vaflags |= VA_EXCLUSIVE;
			error = VOP_NCREATE(&nd->nl_nch, nd->nl_dvp, &vp,
					    nd->nl_cred, vap);
			if (error)
				return (error);
			fmode &= ~O_TRUNC;
			/* locked vnode is returned */
		} else {
			if (fmode & O_EXCL) {
				error = EEXIST;
			} else {
				error = cache_vget(&nd->nl_nch, cred,
						   LK_EXCLUSIVE, &vp);
			}
			if (error)
				return (error);
			fmode &= ~O_CREAT;
		}
	} else {
		error = cache_vget(&nd->nl_nch, cred, LK_EXCLUSIVE, &vp);
		if (error)
			return (error);
	}

	/*
	 * We have a locked vnode and ncp now.  Note that the ncp will
	 * be cleaned up by the caller if nd->nl_nch is left intact.
	 */
	if (vp->v_type == VLNK) {
		error = EMLINK;
		goto bad;
	}
	if (vp->v_type == VSOCK) {
		error = EOPNOTSUPP;
		goto bad;
	}
	if ((fmode & O_CREAT) == 0) {
		if (fmode & (FWRITE | O_TRUNC)) {
			if (vp->v_type == VDIR) {
				error = EISDIR;
				goto bad;
			}
			error = vn_writechk(vp, &nd->nl_nch);
			if (error) {
				/*
				 * Special stale handling, re-resolve the
				 * vnode.
				 */
				if (error == ESTALE) {
					vput(vp);
					vp = NULL;
					cache_setunresolved(&nd->nl_nch);
					error = cache_resolve(&nd->nl_nch, cred);
					if (error == 0)
						goto again;
				}
				goto bad;
			}
		}
	}
	if (fmode & O_TRUNC) {
		vn_unlock(vp);				/* XXX */
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);	/* XXX */
		VATTR_NULL(vap);
		vap->va_size = 0;
		error = VOP_SETATTR(vp, vap, cred);
		if (error)
			goto bad;
	}

	/*
	 * Setup the fp so VOP_OPEN can override it.  No descriptor has been
	 * associated with the fp yet so we own it clean.
	 *
	 * f_nchandle inherits nl_nch.  This used to be necessary only for
	 * directories but now we do it unconditionally so f*() ops
	 * such as fchmod() can access the actual namespace that was
	 * used to open the file.
	 */
	if (fp) {
		if (nd->nl_flags & NLC_APPENDONLY)
			fmode |= FAPPENDONLY;
		fp->f_nchandle = nd->nl_nch;
		cache_zero(&nd->nl_nch);
		cache_unlock(&fp->f_nchandle);
	}

	/*
	 * Get rid of nl_nch.  vn_open does not return it (it returns the
	 * vnode or the file pointer).  Note: we can't leave nl_nch locked
	 * through the VOP_OPEN anyway since the VOP_OPEN may block, e.g.
	 * on /dev/ttyd0
	 */
	if (nd->nl_nch.ncp)
		cache_put(&nd->nl_nch);

	error = VOP_OPEN(vp, fmode, cred, fp);
	if (error) {
		/*
		 * setting f_ops to &badfileops will prevent the descriptor
		 * code from trying to close and release the vnode, since
		 * the open failed we do not want to call close.
		 */
		if (fp) {
			fp->f_data = NULL;
			fp->f_ops = &badfileops;
		}
		goto bad;
	}

#if 0
	/*
	 * Assert that VREG files have been setup for vmio.
	 */
	KASSERT(vp->v_type != VREG || vp->v_object != NULL,
		("vn_open: regular file was not VMIO enabled!"));
#endif

	/*
	 * Return the vnode.  XXX needs some cleaning up.  The vnode is
	 * only returned in the fp == NULL case.
	 */
	if (fp == NULL) {
		nd->nl_open_vp = vp;
		nd->nl_vp_fmode = fmode;
		if ((nd->nl_flags & NLC_LOCKVP) == 0)
			vn_unlock(vp);
	} else {
		vput(vp);
	}
	return (0);
bad:
	if (vp)
		vput(vp);
	return (error);
}
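/*
 * Illustrative calling sequence for vn_open() with fp == NULL.  This is
 * only a sketch based on the contract documented above (error handling
 * abbreviated, `path' is a hypothetical kernel-space string); it is not
 * a definitive reference.  The nlookupdata is always cleaned up with
 * nlookup_done(), whether or not vn_open() succeeded:
 *
 *	struct nlookupdata nd;
 *	struct vnode *vp = NULL;
 *	int error;
 *
 *	error = nlookup_init(&nd, path, UIO_SYSSPACE, NLC_LOCKVP);
 *	if (error == 0)
 *		error = vn_open(&nd, NULL, FREAD, 0);
 *	if (error == 0) {
 *		vp = nd.nl_open_vp;	(referenced, and locked because
 *					 NLC_LOCKVP was requested)
 *		nd.nl_open_vp = NULL;	(take the vnode over from nd)
 *	}
 *	nlookup_done(&nd);
 */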
int
vn_opendisk(const char *devname, int fmode, struct vnode **vpp)
{
	struct vnode *vp;
	int error;

	if (strncmp(devname, "/dev/", 5) == 0)
		devname += 5;
	if ((vp = getsynthvnode(devname)) == NULL) {
		error = ENODEV;
	} else {
		error = VOP_OPEN(vp, fmode, proc0.p_ucred, NULL);
		vn_unlock(vp);
		if (error) {
			vrele(vp);
			vp = NULL;
		}
	}
	*vpp = vp;
	return (error);
}

/*
 * Check for write permissions on the specified vnode.  nch may be NULL.
 */
int
vn_writechk(struct vnode *vp, struct nchandle *nch)
{
	/*
	 * If there's shared text associated with
	 * the vnode, try to free it up once.  If
	 * we fail, we can't allow writing.
	 */
	if (vp->v_flag & VTEXT)
		return (ETXTBSY);

	/*
	 * If the vnode represents a regular file, check the mount
	 * point via the nch.  This may be a different mount point
	 * than the one embedded in the vnode (e.g. nullfs).
	 *
	 * We can still write to non-regular files (e.g. devices)
	 * via read-only mounts.
	 */
	if (nch && nch->ncp && vp->v_type == VREG)
		return (ncp_writechk(nch));
	return (0);
}

/*
 * Check whether the underlying mount is read-only.  The mount point
 * referenced by the namecache may be different from the mount point
 * used by the underlying vnode in the case of NULLFS, so a separate
 * check is needed.
 */
int
ncp_writechk(struct nchandle *nch)
{
	if (nch->mount && (nch->mount->mnt_flag & MNT_RDONLY))
		return (EROFS);
	return(0);
}

/*
 * Vnode close call
 */
int
vn_close(struct vnode *vp, int flags)
{
	int error;

	error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	if (error == 0) {
		error = VOP_CLOSE(vp, flags);
		vn_unlock(vp);
	}
	vrele(vp);
	return (error);
}

static __inline
int
sequential_heuristic(struct uio *uio, struct file *fp)
{
	/*
	 * Sequential heuristic - detect sequential operation
	 *
	 * NOTE: SMP: We allow f_seqcount updates to race.
	 */
	if ((uio->uio_offset == 0 && fp->f_seqcount > 0) ||
	    uio->uio_offset == fp->f_nextoff) {
		int tmpseq = fp->f_seqcount;
		/*
		 * XXX we assume that the filesystem block size is
		 * the default.  Not true, but still gives us a pretty
		 * good indicator of how sequential the read operations
		 * are.
		 */
		tmpseq += (uio->uio_resid + BKVASIZE - 1) / BKVASIZE;
		if (tmpseq > IO_SEQMAX)
			tmpseq = IO_SEQMAX;
		fp->f_seqcount = tmpseq;
		return(fp->f_seqcount << IO_SEQSHIFT);
	}

	/*
	 * Not sequential, quick draw-down of seqcount
	 *
	 * NOTE: SMP: We allow f_seqcount updates to race.
	 */
	if (fp->f_seqcount > 1)
		fp->f_seqcount = 1;
	else
		fp->f_seqcount = 0;
	return(0);
}

/*
 * get - lock and return the f_offset field.
 * set - set and unlock the f_offset field.
 *
 * These routines serve the dual purpose of serializing access to the
 * f_offset field (at least on i386) and guaranteeing operational integrity
 * when multiple read()ers and write()ers are present on the same fp.
 */
static __inline off_t
vn_get_fpf_offset(struct file *fp)
{
	u_int	flags;
	u_int	nflags;

	/*
	 * Shortcut critical path.
	 */
	flags = fp->f_flag & ~FOFFSETLOCK;
	if (atomic_cmpset_int(&fp->f_flag, flags, flags | FOFFSETLOCK))
		return(fp->f_offset);

	/*
	 * The hard way
	 */
	for (;;) {
		flags = fp->f_flag;
		if (flags & FOFFSETLOCK) {
			nflags = flags | FOFFSETWAKE;
			crit_enter();
			tsleep_interlock(&fp->f_flag);
			if (atomic_cmpset_int(&fp->f_flag, flags, nflags))
				tsleep(&fp->f_flag, 0, "fpoff", 0);
			crit_exit();
		} else {
			nflags = flags | FOFFSETLOCK;
			if (atomic_cmpset_int(&fp->f_flag, flags, nflags))
				break;
		}
	}
	return(fp->f_offset);
}

static __inline void
vn_set_fpf_offset(struct file *fp, off_t offset)
{
	u_int	flags;
	u_int	nflags;

	/*
	 * We hold the lock so we can set the offset without interference.
	 */
	fp->f_offset = offset;

	/*
	 * Normal release is already a reasonably critical path.
	 */
	for (;;) {
		flags = fp->f_flag;
		nflags = flags & ~(FOFFSETLOCK | FOFFSETWAKE);
		if (atomic_cmpset_int(&fp->f_flag, flags, nflags)) {
			if (flags & FOFFSETWAKE)
				wakeup(&fp->f_flag);
			break;
		}
	}
}

static __inline off_t
vn_poll_fpf_offset(struct file *fp)
{
#if defined(__amd64__) || !defined(SMP)
	return(fp->f_offset);
#else
	off_t off = vn_get_fpf_offset(fp);
	vn_set_fpf_offset(fp, off);
	return(off);
#endif
}

/*
 * Package up an I/O request on a vnode into a uio and do it.
 */
int
vn_rdwr(enum uio_rw rw, struct vnode *vp, caddr_t base, int len,
	off_t offset, enum uio_seg segflg, int ioflg,
	struct ucred *cred, int *aresid)
{
	struct uio auio;
	struct iovec aiov;
	struct ccms_lock ccms_lock;
	int error;

	if ((ioflg & IO_NODELOCKED) == 0)
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	aiov.iov_base = base;
	aiov.iov_len = len;
	auio.uio_resid = len;
	auio.uio_offset = offset;
	auio.uio_segflg = segflg;
	auio.uio_rw = rw;
	auio.uio_td = curthread;
	ccms_lock_get_uio(&vp->v_ccms, &ccms_lock, &auio);
	if (rw == UIO_READ) {
		error = VOP_READ(vp, &auio, ioflg, cred);
	} else {
		error = VOP_WRITE(vp, &auio, ioflg, cred);
	}
	ccms_lock_put(&vp->v_ccms, &ccms_lock);
	if (aresid)
		*aresid = auio.uio_resid;
	else
		if (auio.uio_resid && error == 0)
			error = EIO;
	if ((ioflg & IO_NODELOCKED) == 0)
		vn_unlock(vp);
	return (error);
}

/*
 * Package up an I/O request on a vnode into a uio and do it.  The I/O
 * request is split up into smaller chunks and we try to avoid saturating
 * the buffer cache while potentially holding a vnode locked, so we
 * check bwillwrite() before calling vn_rdwr().  We also call uio_yield()
 * to give other processes a chance to lock the vnode (either other processes
 * core'ing the same binary, or unrelated processes scanning the directory).
 */
int
vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, caddr_t base, int len,
		 off_t offset, enum uio_seg segflg, int ioflg,
		 struct ucred *cred, int *aresid)
{
	int error = 0;

	do {
		int chunk;

		/*
		 * Force `offset' to a multiple of MAXBSIZE except possibly
		 * for the first chunk, so that filesystems only need to
		 * write full blocks except possibly for the first and last
		 * chunks.
		 */
		chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE;

		if (chunk > len)
			chunk = len;
		if (vp->v_type == VREG) {
			switch(rw) {
			case UIO_READ:
				bwillread(chunk);
				break;
			case UIO_WRITE:
				bwillwrite(chunk);
				break;
			}
		}
		error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
				ioflg, cred, aresid);
		len -= chunk;	/* aresid calc already includes length */
		if (error)
			break;
		offset += chunk;
		base += chunk;
		uio_yield();
	} while (len);
	if (aresid)
		*aresid += len;
	return (error);
}
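/*
 * Illustrative use of vn_rdwr() (a sketch only; `buf', `len', `offset',
 * `cred' and `resid' are hypothetical caller-supplied names).  This reads
 * len bytes at offset from a referenced vnode into a kernel buffer.
 * Passing a NULL aresid turns a short transfer into EIO, while passing
 * &resid lets the caller deal with partial I/O itself:
 *
 *	int resid;
 *
 *	error = vn_rdwr(UIO_READ, vp, buf, len, offset,
 *			UIO_SYSSPACE, 0, cred, &resid);
 *
 * With ioflg == 0 vn_rdwr() acquires and releases the vnode lock itself;
 * pass IO_NODELOCKED when the caller already holds the lock.
 * vn_rdwr_inchunks() takes the same arguments but splits large transfers
 * into MAXBSIZE-aligned pieces so the buffer cache is not saturated.
 */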
/*
 * MPALMOSTSAFE - acquires mplock
 *
 * File pointers can no longer get ripped up by revoke so
 * we don't need to lock access to the vp.
 *
 * f_offset updates are not guaranteed against multiple readers
 */
static int
vn_read(struct file *fp, struct uio *uio, struct ucred *cred, int flags)
{
	struct ccms_lock ccms_lock;
	struct vnode *vp;
	int error, ioflag;

	KASSERT(uio->uio_td == curthread,
		("uio_td %p is not td %p", uio->uio_td, curthread));
	vp = (struct vnode *)fp->f_data;

	ioflag = 0;
	if (flags & O_FBLOCKING) {
		/* ioflag &= ~IO_NDELAY; */
	} else if (flags & O_FNONBLOCKING) {
		ioflag |= IO_NDELAY;
	} else if (fp->f_flag & FNONBLOCK) {
		ioflag |= IO_NDELAY;
	}
	if (flags & O_FBUFFERED) {
		/* ioflag &= ~IO_DIRECT; */
	} else if (flags & O_FUNBUFFERED) {
		ioflag |= IO_DIRECT;
	} else if (fp->f_flag & O_DIRECT) {
		ioflag |= IO_DIRECT;
	}
	if ((flags & O_FOFFSET) == 0 && (vp->v_flag & VNOTSEEKABLE) == 0)
		uio->uio_offset = vn_get_fpf_offset(fp);
	vn_lock(vp, LK_SHARED | LK_RETRY);
	ioflag |= sequential_heuristic(uio, fp);

	ccms_lock_get_uio(&vp->v_ccms, &ccms_lock, uio);
	if (read_mpsafe && (vp->v_flag & VMP_READ)) {
		error = VOP_READ(vp, uio, ioflag, cred);
	} else {
		get_mplock();
		error = VOP_READ(vp, uio, ioflag, cred);
		rel_mplock();
	}
	ccms_lock_put(&vp->v_ccms, &ccms_lock);
	fp->f_nextoff = uio->uio_offset;
	vn_unlock(vp);
	if ((flags & O_FOFFSET) == 0 && (vp->v_flag & VNOTSEEKABLE) == 0)
		vn_set_fpf_offset(fp, uio->uio_offset);
	return (error);
}

/*
 * Device-optimized file table vnode read routine.
 *
 * This bypasses the VOP table and talks directly to the device.  Most
 * filesystems just route to specfs and can make this optimization.
 *
 * MPALMOSTSAFE - acquires mplock
 */
static int
svn_read(struct file *fp, struct uio *uio, struct ucred *cred, int flags)
{
	struct vnode *vp;
	int ioflag;
	int error;
	cdev_t dev;

	get_mplock();
	KASSERT(uio->uio_td == curthread,
		("uio_td %p is not td %p", uio->uio_td, curthread));

	vp = (struct vnode *)fp->f_data;
	if (vp == NULL || vp->v_type == VBAD) {
		error = EBADF;
		goto done;
	}

	if ((dev = vp->v_rdev) == NULL) {
		error = EBADF;
		goto done;
	}
	reference_dev(dev);

	if (uio->uio_resid == 0) {
		error = 0;
		goto done;
	}
	if ((flags & O_FOFFSET) == 0 && (vp->v_flag & VNOTSEEKABLE) == 0)
		uio->uio_offset = vn_get_fpf_offset(fp);

	ioflag = 0;
	if (flags & O_FBLOCKING) {
		/* ioflag &= ~IO_NDELAY; */
	} else if (flags & O_FNONBLOCKING) {
		ioflag |= IO_NDELAY;
	} else if (fp->f_flag & FNONBLOCK) {
		ioflag |= IO_NDELAY;
	}
	if (flags & O_FBUFFERED) {
		/* ioflag &= ~IO_DIRECT; */
	} else if (flags & O_FUNBUFFERED) {
		ioflag |= IO_DIRECT;
	} else if (fp->f_flag & O_DIRECT) {
		ioflag |= IO_DIRECT;
	}
	ioflag |= sequential_heuristic(uio, fp);

	error = dev_dread(dev, uio, ioflag);

	release_dev(dev);
	fp->f_nextoff = uio->uio_offset;
	if ((flags & O_FOFFSET) == 0 && (vp->v_flag & VNOTSEEKABLE) == 0)
		vn_set_fpf_offset(fp, uio->uio_offset);
done:
	rel_mplock();
	return (error);
}

/*
 * MPALMOSTSAFE - acquires mplock
 */
static int
vn_write(struct file *fp, struct uio *uio, struct ucred *cred, int flags)
{
	struct ccms_lock ccms_lock;
	struct vnode *vp;
	int error, ioflag;

	KASSERT(uio->uio_td == curthread,
		("uio_td %p is not p %p", uio->uio_td, curthread));
	vp = (struct vnode *)fp->f_data;

	ioflag = IO_UNIT;
	if (vp->v_type == VREG &&
	   ((fp->f_flag & O_APPEND) || (flags & O_FAPPEND))) {
		ioflag |= IO_APPEND;
	}

	if (flags & O_FBLOCKING) {
		/* ioflag &= ~IO_NDELAY; */
	} else if (flags & O_FNONBLOCKING) {
		ioflag |= IO_NDELAY;
	} else if (fp->f_flag & FNONBLOCK) {
		ioflag |= IO_NDELAY;
	}
	if (flags & O_FBUFFERED) {
		/* ioflag &= ~IO_DIRECT; */
	} else if (flags & O_FUNBUFFERED) {
		ioflag |= IO_DIRECT;
	} else if (fp->f_flag & O_DIRECT) {
		ioflag |= IO_DIRECT;
	}
	if (flags & O_FASYNCWRITE) {
		/* ioflag &= ~IO_SYNC; */
	} else if (flags & O_FSYNCWRITE) {
		ioflag |= IO_SYNC;
	} else if (fp->f_flag & O_FSYNC) {
		ioflag |= IO_SYNC;
	}

	if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS))
		ioflag |= IO_SYNC;
	if ((flags & O_FOFFSET) == 0)
		uio->uio_offset = vn_get_fpf_offset(fp);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	ioflag |= sequential_heuristic(uio, fp);
	ccms_lock_get_uio(&vp->v_ccms, &ccms_lock, uio);
	if (write_mpsafe && (vp->v_flag & VMP_WRITE)) {
		error = VOP_WRITE(vp, uio, ioflag, cred);
	} else {
		get_mplock();
		error = VOP_WRITE(vp, uio, ioflag, cred);
		rel_mplock();
	}
	ccms_lock_put(&vp->v_ccms, &ccms_lock);
	fp->f_nextoff = uio->uio_offset;
	vn_unlock(vp);
	if ((flags & O_FOFFSET) == 0)
		vn_set_fpf_offset(fp, uio->uio_offset);
	return (error);
}

/*
 * Device-optimized file table vnode write routine.
 *
 * This bypasses the VOP table and talks directly to the device.  Most
 * filesystems just route to specfs and can make this optimization.
 *
 * MPALMOSTSAFE - acquires mplock
 */
static int
svn_write(struct file *fp, struct uio *uio, struct ucred *cred, int flags)
{
	struct vnode *vp;
	int ioflag;
	int error;
	cdev_t dev;

	get_mplock();
	KASSERT(uio->uio_td == curthread,
		("uio_td %p is not p %p", uio->uio_td, curthread));

	vp = (struct vnode *)fp->f_data;
	if (vp == NULL || vp->v_type == VBAD) {
		error = EBADF;
		goto done;
	}
	if (vp->v_type == VREG)
		bwillwrite(uio->uio_resid);
	vp = (struct vnode *)fp->f_data;	/* XXX needed? */

	if ((dev = vp->v_rdev) == NULL) {
		error = EBADF;
		goto done;
	}
	reference_dev(dev);

	if ((flags & O_FOFFSET) == 0)
		uio->uio_offset = vn_get_fpf_offset(fp);

	ioflag = IO_UNIT;
	if (vp->v_type == VREG &&
	   ((fp->f_flag & O_APPEND) || (flags & O_FAPPEND))) {
		ioflag |= IO_APPEND;
	}

	if (flags & O_FBLOCKING) {
		/* ioflag &= ~IO_NDELAY; */
	} else if (flags & O_FNONBLOCKING) {
		ioflag |= IO_NDELAY;
	} else if (fp->f_flag & FNONBLOCK) {
		ioflag |= IO_NDELAY;
	}
	if (flags & O_FBUFFERED) {
		/* ioflag &= ~IO_DIRECT; */
	} else if (flags & O_FUNBUFFERED) {
		ioflag |= IO_DIRECT;
	} else if (fp->f_flag & O_DIRECT) {
		ioflag |= IO_DIRECT;
	}
	if (flags & O_FASYNCWRITE) {
		/* ioflag &= ~IO_SYNC; */
	} else if (flags & O_FSYNCWRITE) {
		ioflag |= IO_SYNC;
	} else if (fp->f_flag & O_FSYNC) {
		ioflag |= IO_SYNC;
	}

	if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS))
		ioflag |= IO_SYNC;
	ioflag |= sequential_heuristic(uio, fp);

	error = dev_dwrite(dev, uio, ioflag);

	release_dev(dev);
	fp->f_nextoff = uio->uio_offset;
	if ((flags & O_FOFFSET) == 0)
		vn_set_fpf_offset(fp, uio->uio_offset);
done:
	rel_mplock();
	return (error);
}

/*
 * MPSAFE
 */
static int
vn_statfile(struct file *fp, struct stat *sb, struct ucred *cred)
{
	struct vnode *vp;
	int error;

	vp = (struct vnode *)fp->f_data;
	error = vn_stat(vp, sb, cred);
	return (error);
}

/*
 * MPSAFE (if vnode has VMP_GETATTR)
 */
int
vn_stat(struct vnode *vp, struct stat *sb, struct ucred *cred)
{
	struct vattr vattr;
	struct vattr *vap;
	int error;
	u_short mode;
	cdev_t dev;

	vap = &vattr;
	if (getattr_mpsafe && (vp->v_flag & VMP_GETATTR)) {
		error = VOP_GETATTR(vp, vap);
	} else {
		get_mplock();
		error = VOP_GETATTR(vp, vap);
		rel_mplock();
	}
	if (error)
		return (error);

	/*
	 * Zero the spare stat fields
	 */
	sb->st_lspare = 0;
	sb->st_qspare = 0;

	/*
	 * Copy from vattr table
	 */
	if (vap->va_fsid != VNOVAL)
		sb->st_dev = vap->va_fsid;
	else
		sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0];
	sb->st_ino = vap->va_fileid;
	mode = vap->va_mode;
	switch (vap->va_type) {
	case VREG:
		mode |= S_IFREG;
		break;
	case VDATABASE:
		mode |= S_IFDB;
		break;
	case VDIR:
		mode |= S_IFDIR;
		break;
	case VBLK:
		mode |= S_IFBLK;
		break;
	case VCHR:
		mode |= S_IFCHR;
		break;
	case VLNK:
		mode |= S_IFLNK;
		/* This is a cosmetic change, symlinks do not have a mode. */
		if (vp->v_mount->mnt_flag & MNT_NOSYMFOLLOW)
			sb->st_mode &= ~ACCESSPERMS;	/* 0000 */
		else
			sb->st_mode |= ACCESSPERMS;	/* 0777 */
		break;
	case VSOCK:
		mode |= S_IFSOCK;
		break;
	case VFIFO:
		mode |= S_IFIFO;
		break;
	default:
		return (EBADF);
	}
	sb->st_mode = mode;
	if (vap->va_nlink > (nlink_t)-1)
		sb->st_nlink = (nlink_t)-1;
	else
		sb->st_nlink = vap->va_nlink;
	sb->st_uid = vap->va_uid;
	sb->st_gid = vap->va_gid;
	sb->st_rdev = makeudev(vap->va_rmajor, vap->va_rminor);
	sb->st_size = vap->va_size;
	sb->st_atimespec = vap->va_atime;
	sb->st_mtimespec = vap->va_mtime;
	sb->st_ctimespec = vap->va_ctime;

	/*
	 * A VCHR and VBLK device may track the last access and last modified
	 * time independently of the filesystem.  This is particularly true
	 * because device read and write calls may bypass the filesystem.
	 */
	if (vp->v_type == VCHR || vp->v_type == VBLK) {
		dev = vp->v_rdev;
		if (dev != NULL) {
			if (dev->si_lastread) {
				sb->st_atimespec.tv_sec = dev->si_lastread;
				sb->st_atimespec.tv_nsec = 0;
			}
			if (dev->si_lastwrite) {
				sb->st_mtimespec.tv_sec = dev->si_lastwrite;
				sb->st_mtimespec.tv_nsec = 0;
			}
		}
	}

	/*
	 * According to www.opengroup.org, the meaning of st_blksize is
	 * "a filesystem-specific preferred I/O block size for this
	 *  object.  In some filesystem types, this may vary from file
	 *  to file"
	 * Default to PAGE_SIZE after much discussion.
	 */

	if (vap->va_type == VREG) {
		sb->st_blksize = vap->va_blocksize;
	} else if (vn_isdisk(vp, NULL)) {
		/*
		 * XXX this is broken.  If the device is not yet open (aka
		 * stat() call, aka v_rdev == NULL), how are we supposed
		 * to get a valid block size out of it?
		 */
		dev = vp->v_rdev;
		if (dev == NULL && vp->v_type == VCHR) {
			get_mplock();
			dev = get_dev(vp->v_umajor, vp->v_uminor);
			rel_mplock();
		}
		sb->st_blksize = dev->si_bsize_best;
		if (sb->st_blksize < dev->si_bsize_phys)
			sb->st_blksize = dev->si_bsize_phys;
		if (sb->st_blksize < BLKDEV_IOSIZE)
			sb->st_blksize = BLKDEV_IOSIZE;
	} else {
		sb->st_blksize = PAGE_SIZE;
	}

	sb->st_flags = vap->va_flags;

	error = priv_check_cred(cred, PRIV_VFS_GENERATION, 0);
	if (error)
		sb->st_gen = 0;
	else
		sb->st_gen = (u_int32_t)vap->va_gen;

	sb->st_blocks = vap->va_bytes / S_BLKSIZE;
	sb->st_fsmid = vap->va_fsmid;
	return (0);
}

/*
 * MPALMOSTSAFE - acquires mplock
 */
static int
vn_ioctl(struct file *fp, u_long com, caddr_t data, struct ucred *ucred)
{
	struct vnode *vp = ((struct vnode *)fp->f_data);
	struct vnode *ovp;
	struct vattr vattr;
	int error;
	off_t size;

	get_mplock();

	switch (vp->v_type) {
	case VREG:
	case VDIR:
		if (com == FIONREAD) {
			error = VOP_GETATTR(vp, &vattr);
			if (error)
				break;
			size = vattr.va_size;
			if ((vp->v_flag & VNOTSEEKABLE) == 0)
				size -= vn_poll_fpf_offset(fp);
			if (size > 0x7FFFFFFF)
				size = 0x7FFFFFFF;
			*(int *)data = size;
			error = 0;
			break;
		}
		if (com == FIOASYNC) {				/* XXX */
			error = 0;				/* XXX */
			break;
		}
		/* fall into ... */
	default:
#if 0
		return (ENOTTY);
#endif
	case VFIFO:
	case VCHR:
	case VBLK:
		if (com == FIODTYPE) {
			if (vp->v_type != VCHR && vp->v_type != VBLK) {
				error = ENOTTY;
				break;
			}
			*(int *)data = dev_dflags(vp->v_rdev) & D_TYPEMASK;
			error = 0;
			break;
		}
		error = VOP_IOCTL(vp, com, data, fp->f_flag, ucred);
		if (error == 0 && com == TIOCSCTTY) {
			struct proc *p = curthread->td_proc;
			struct session *sess;

			if (p == NULL) {
				error = ENOTTY;
				break;
			}

			sess = p->p_session;
			/* Do nothing if reassigning same control tty */
			if (sess->s_ttyvp == vp) {
				error = 0;
				break;
			}

			/* Get rid of reference to old control tty */
			ovp = sess->s_ttyvp;
			vref(vp);
			sess->s_ttyvp = vp;
			if (ovp)
				vrele(ovp);
		}
		break;
	}
	rel_mplock();
	return (error);
}

/*
 * MPALMOSTSAFE - acquires mplock
 */
static int
vn_poll(struct file *fp, int events, struct ucred *cred)
{
	int error;

	get_mplock();
	error = VOP_POLL(((struct vnode *)fp->f_data), events, cred);
	rel_mplock();
	return (error);
}

/*
 * Check that the vnode is still valid, and if so
 * acquire requested lock.
 */
int
#ifndef	DEBUG_LOCKS
vn_lock(struct vnode *vp, int flags)
#else
debug_vn_lock(struct vnode *vp, int flags, const char *filename, int line)
#endif
{
	int error;

	do {
#ifdef	DEBUG_LOCKS
		vp->filename = filename;
		vp->line = line;
		error = debuglockmgr(&vp->v_lock, flags,
				     "vn_lock", filename, line);
#else
		error = lockmgr(&vp->v_lock, flags);
#endif
		if (error == 0)
			break;
	} while (flags & LK_RETRY);

	/*
	 * Because we (had better!) have a ref on the vnode, once it
	 * goes to VRECLAIMED state it will not be recycled until all
	 * refs go away.  So we can just check the flag.
	 */
	if (error == 0 && (vp->v_flag & VRECLAIMED)) {
		lockmgr(&vp->v_lock, LK_RELEASE);
		error = ENOENT;
	}
	return (error);
}

void
vn_unlock(struct vnode *vp)
{
	lockmgr(&vp->v_lock, LK_RELEASE);
}

int
vn_islocked(struct vnode *vp)
{
	return (lockstatus(&vp->v_lock, curthread));
}
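/*
 * Usage note for the locking helpers above (a sketch, not a definitive
 * reference).  With LK_RETRY the lockmgr attempt is retried until it
 * succeeds, but vn_lock() can still return ENOENT if the vnode went
 * VRECLAIMED, in which case the lock has already been released and the
 * vnode must not be used:
 *
 *	if (vn_lock(vp, LK_EXCLUSIVE | LK_RETRY) == 0) {
 *		... operate on the locked vnode ...
 *		vn_unlock(vp);
 *	}
 */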
/*
 * MPALMOSTSAFE - acquires mplock
 */
static int
vn_closefile(struct file *fp)
{
	int error;

	get_mplock();
	fp->f_ops = &badfileops;
	error = vn_close(((struct vnode *)fp->f_data), fp->f_flag);
	rel_mplock();
	return (error);
}

/*
 * MPALMOSTSAFE - acquires mplock
 */
static int
vn_kqfilter(struct file *fp, struct knote *kn)
{
	int error;

	get_mplock();
	error = VOP_KQFILTER(((struct vnode *)fp->f_data), kn);
	rel_mplock();
	return (error);
}