/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * @(#)vfs_vnops.c	8.2 (Berkeley) 1/21/94
 * $FreeBSD: src/sys/kern/vfs_vnops.c,v 1.87.2.13 2002/12/29 18:19:53 dillon Exp $
 * $DragonFly: src/sys/kern/vfs_vnops.c,v 1.58 2008/06/28 17:59:49 dillon Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/proc.h>
#include <sys/priv.h>
#include <sys/mount.h>
#include <sys/nlookup.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/filio.h>
#include <sys/ttycom.h>
#include <sys/conf.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>

#include <sys/thread2.h>
#include <sys/mplock2.h>

static int vn_closefile (struct file *fp);
static int vn_ioctl (struct file *fp, u_long com, caddr_t data,
		struct ucred *cred, struct sysmsg *msg);
static int vn_read (struct file *fp, struct uio *uio,
		struct ucred *cred, int flags);
static int vn_kqfilter (struct file *fp, struct knote *kn);
static int vn_statfile (struct file *fp, struct stat *sb, struct ucred *cred);
static int vn_write (struct file *fp, struct uio *uio,
		struct ucred *cred, int flags);

struct fileops vnode_fileops = {
	.fo_read = vn_read,
	.fo_write = vn_write,
	.fo_ioctl = vn_ioctl,
	.fo_kqfilter = vn_kqfilter,
	.fo_stat = vn_statfile,
	.fo_close = vn_closefile,
	.fo_shutdown = nofo_shutdown
};

/*
 * Common code for vnode open operations.  Check permissions, and call
 * the VOP_NOPEN or VOP_NCREATE routine.
 *
 * The caller is responsible for setting up nd with nlookup_init() and
 * for cleaning it up with nlookup_done(), whether we return an error
 * or not.
 *
 * On success nd->nl_open_vp will hold a referenced and, if requested,
 * locked vnode.  A locked vnode is requested via NLC_LOCKVP.  If fp
 * is non-NULL the vnode will be installed in the file pointer.
 *
 * NOTE: The vnode is referenced just once on return whether or not it
 * is also installed in the file pointer.
 */
int
vn_open(struct nlookupdata *nd, struct file *fp, int fmode, int cmode)
{
	struct vnode *vp;
	struct ucred *cred = nd->nl_cred;
	struct vattr vat;
	struct vattr *vap = &vat;
	int error;
	u_int flags;
	uint64_t osize;
	struct mount *mp;

	/*
	 * Certain combinations are illegal
	 */
	if ((fmode & (FWRITE | O_TRUNC)) == O_TRUNC)
		return(EACCES);

	/*
	 * Lookup the path and create or obtain the vnode.  After a
	 * successful lookup a locked nd->nl_nch will be returned.
	 *
	 * The result of this section should be a locked vnode.
	 *
	 * XXX with only a little work we should be able to avoid locking
	 * the vnode if FWRITE, O_CREAT, and O_TRUNC are *not* set.
	 */
	nd->nl_flags |= NLC_OPEN;
	if (fmode & O_APPEND)
		nd->nl_flags |= NLC_APPEND;
	if (fmode & O_TRUNC)
		nd->nl_flags |= NLC_TRUNCATE;
	if (fmode & FREAD)
		nd->nl_flags |= NLC_READ;
	if (fmode & FWRITE)
		nd->nl_flags |= NLC_WRITE;
	if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
		nd->nl_flags |= NLC_FOLLOW;

	if (fmode & O_CREAT) {
		/*
		 * CONDITIONAL CREATE FILE CASE
		 *
		 * Setting NLC_CREATE causes a negative hit to store
		 * the negative hit ncp and not return an error.  Then
		 * nc_error or nc_vp may be checked to see if the ncp
		 * represents a negative hit.  NLC_CREATE also requires
		 * write permission on the governing directory or EPERM
		 * is returned.
		 */
		nd->nl_flags |= NLC_CREATE;
		nd->nl_flags |= NLC_REFDVP;
		bwillinode(1);
		error = nlookup(nd);
	} else {
		/*
		 * NORMAL OPEN FILE CASE
		 */
		error = nlookup(nd);
	}

	if (error)
		return (error);

	/*
	 * split case to allow us to re-resolve and retry the ncp in case
	 * we get ESTALE.
	 */
again:
	if (fmode & O_CREAT) {
		if (nd->nl_nch.ncp->nc_vp == NULL) {
			if ((error = ncp_writechk(&nd->nl_nch)) != 0)
				return (error);
			VATTR_NULL(vap);
			vap->va_type = VREG;
			vap->va_mode = cmode;
			if (fmode & O_EXCL)
				vap->va_vaflags |= VA_EXCLUSIVE;
			error = VOP_NCREATE(&nd->nl_nch, nd->nl_dvp, &vp,
					    nd->nl_cred, vap);
			if (error)
				return (error);
			fmode &= ~O_TRUNC;
			/* locked vnode is returned */
		} else {
			if (fmode & O_EXCL) {
				error = EEXIST;
			} else {
				error = cache_vget(&nd->nl_nch, cred,
						   LK_EXCLUSIVE, &vp);
			}
			if (error)
				return (error);
			fmode &= ~O_CREAT;
		}
	} else {
		error = cache_vget(&nd->nl_nch, cred, LK_EXCLUSIVE, &vp);
		if (error)
			return (error);
	}

	/*
	 * We have a locked vnode and ncp now.  Note that the ncp will
	 * be cleaned up by the caller if nd->nl_nch is left intact.
	 */
	if (vp->v_type == VLNK) {
		error = EMLINK;
		goto bad;
	}
	if (vp->v_type == VSOCK) {
		error = EOPNOTSUPP;
		goto bad;
	}
	if (vp->v_type != VDIR && (fmode & O_DIRECTORY)) {
		error = ENOTDIR;
		goto bad;
	}
	if ((fmode & O_CREAT) == 0) {
		if (fmode & (FWRITE | O_TRUNC)) {
			if (vp->v_type == VDIR) {
				error = EISDIR;
				goto bad;
			}
			error = vn_writechk(vp, &nd->nl_nch);
			if (error) {
				/*
				 * Special stale handling, re-resolve the
				 * vnode.
				 */
				if (error == ESTALE) {
					vput(vp);
					vp = NULL;
					cache_setunresolved(&nd->nl_nch);
					error = cache_resolve(&nd->nl_nch, cred);
					if (error == 0)
						goto again;
				}
				goto bad;
			}
		}
	}
	if (fmode & O_TRUNC) {
		vn_unlock(vp);				/* XXX */
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);	/* XXX */
		osize = vp->v_filesize;
		VATTR_NULL(vap);
		vap->va_size = 0;
		error = VOP_SETATTR(vp, vap, cred);
		if (error)
			goto bad;
		error = VOP_GETATTR(vp, vap);
		if (error)
			goto bad;
		mp = vq_vptomp(vp);
		VFS_ACCOUNT(mp, vap->va_uid, vap->va_gid, -osize);
	}

	/*
	 * Set or clear VSWAPCACHE on the vp based on nd->nl_nch.ncp->nc_flag.
	 * These particular bits are tracked all the way from the root.
	 *
	 * NOTE: Might not work properly on NFS servers due to the
	 * disconnected namecache.
	 */
	flags = nd->nl_nch.ncp->nc_flag;
	if ((flags & (NCF_UF_CACHE | NCF_UF_PCACHE)) &&
	    (flags & (NCF_SF_NOCACHE | NCF_SF_PNOCACHE)) == 0) {
		vsetflags(vp, VSWAPCACHE);
	} else {
		vclrflags(vp, VSWAPCACHE);
	}

	/*
	 * Setup the fp so VOP_OPEN can override it.  No descriptor has been
	 * associated with the fp yet so we own it clean.
	 *
	 * f_nchandle inherits nl_nch.  This used to be necessary only for
	 * directories but now we do it unconditionally so f*() ops
	 * such as fchmod() can access the actual namespace that was
	 * used to open the file.
	 */
	if (fp) {
		if (nd->nl_flags & NLC_APPENDONLY)
			fmode |= FAPPENDONLY;
		fp->f_nchandle = nd->nl_nch;
		cache_zero(&nd->nl_nch);
		cache_unlock(&fp->f_nchandle);
	}

	/*
	 * Get rid of nl_nch.  vn_open does not return it (it returns the
	 * vnode or the file pointer).  Note: we can't leave nl_nch locked
	 * through the VOP_OPEN anyway since the VOP_OPEN may block, e.g.
	 * on /dev/ttyd0
	 */
	if (nd->nl_nch.ncp)
		cache_put(&nd->nl_nch);

	error = VOP_OPEN(vp, fmode, cred, fp);
	if (error) {
		/*
		 * setting f_ops to &badfileops will prevent the descriptor
		 * code from trying to close and release the vnode, since
		 * the open failed we do not want to call close.
		 */
		if (fp) {
			fp->f_data = NULL;
			fp->f_ops = &badfileops;
		}
		goto bad;
	}

#if 0
	/*
	 * Assert that VREG files have been setup for vmio.
	 */
	KASSERT(vp->v_type != VREG || vp->v_object != NULL,
		("vn_open: regular file was not VMIO enabled!"));
#endif

	/*
	 * Return the vnode.  XXX needs some cleaning up.  The vnode is
	 * only returned in the fp == NULL case.
	 */
	if (fp == NULL) {
		nd->nl_open_vp = vp;
		nd->nl_vp_fmode = fmode;
		if ((nd->nl_flags & NLC_LOCKVP) == 0)
			vn_unlock(vp);
	} else {
		vput(vp);
	}
	return (0);
bad:
	if (vp)
		vput(vp);
	return (error);
}

int
vn_opendisk(const char *devname, int fmode, struct vnode **vpp)
{
	struct vnode *vp;
	int error;

	if (strncmp(devname, "/dev/", 5) == 0)
		devname += 5;
	if ((vp = getsynthvnode(devname)) == NULL) {
		error = ENODEV;
	} else {
		error = VOP_OPEN(vp, fmode, proc0.p_ucred, NULL);
		vn_unlock(vp);
		if (error) {
			vrele(vp);
			vp = NULL;
		}
	}
	*vpp = vp;
	return (error);
}

/*
 * Check for write permissions on the specified vnode.  nch may be NULL.
 */
int
vn_writechk(struct vnode *vp, struct nchandle *nch)
{
	/*
	 * If there's shared text associated with
	 * the vnode, try to free it up once.  If
	 * we fail, we can't allow writing.
	 */
	if (vp->v_flag & VTEXT)
		return (ETXTBSY);

	/*
	 * If the vnode represents a regular file, check the mount
	 * point via the nch.  This may be a different mount point
	 * than the one embedded in the vnode (e.g. nullfs).
	 *
	 * We can still write to non-regular files (e.g. devices)
	 * via read-only mounts.
	 */
	if (nch && nch->ncp && vp->v_type == VREG)
		return (ncp_writechk(nch));
	return (0);
}

/*
 * Check whether the underlying mount is read-only.  The mount point
 * referenced by the namecache may be different from the mount point
 * used by the underlying vnode in the case of NULLFS, so a separate
 * check is needed.
 */
int
ncp_writechk(struct nchandle *nch)
{
	if (nch->mount && (nch->mount->mnt_flag & MNT_RDONLY))
		return (EROFS);
	return(0);
}

/*
 * Vnode close call
 *
 * MPSAFE
 */
int
vn_close(struct vnode *vp, int flags)
{
	int error;

	error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	if (error == 0) {
		error = VOP_CLOSE(vp, flags);
		vn_unlock(vp);
	}
	vrele(vp);
	return (error);
}

/*
 * Sequential heuristic.
 *
 * MPSAFE (f_seqcount and f_nextoff are allowed to race)
 */
static __inline
int
sequential_heuristic(struct uio *uio, struct file *fp)
{
	/*
	 * Sequential heuristic - detect sequential operation
	 *
	 * NOTE: SMP: We allow f_seqcount updates to race.
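	 *
	 * A sequential request bumps f_seqcount by the number of
	 * BKVASIZE-sized chunks in the request, capped at IO_SEQMAX.
	 * The count is returned shifted by IO_SEQSHIFT so it can be
	 * OR'd directly into the ioflags handed to VOP_READ/VOP_WRITE
	 * as a sequential-access hint.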
	 */
	if ((uio->uio_offset == 0 && fp->f_seqcount > 0) ||
	    uio->uio_offset == fp->f_nextoff) {
		int tmpseq = fp->f_seqcount;

		tmpseq += (uio->uio_resid + BKVASIZE - 1) / BKVASIZE;
		if (tmpseq > IO_SEQMAX)
			tmpseq = IO_SEQMAX;
		fp->f_seqcount = tmpseq;
		return(fp->f_seqcount << IO_SEQSHIFT);
	}

	/*
	 * Not sequential, quick draw-down of seqcount
	 *
	 * NOTE: SMP: We allow f_seqcount updates to race.
	 */
	if (fp->f_seqcount > 1)
		fp->f_seqcount = 1;
	else
		fp->f_seqcount = 0;
	return(0);
}

/*
 * get - lock and return the f_offset field.
 * set - set and unlock the f_offset field.
 *
 * These routines serve the dual purpose of serializing access to the
 * f_offset field (at least on i386) and guaranteeing operational integrity
 * when multiple read()ers and write()ers are present on the same fp.
 *
 * MPSAFE
 */
static __inline off_t
vn_get_fpf_offset(struct file *fp)
{
	u_int flags;
	u_int nflags;

	/*
	 * Shortcut critical path.
	 */
	flags = fp->f_flag & ~FOFFSETLOCK;
	if (atomic_cmpset_int(&fp->f_flag, flags, flags | FOFFSETLOCK))
		return(fp->f_offset);

	/*
	 * The hard way
	 */
	for (;;) {
		flags = fp->f_flag;
		if (flags & FOFFSETLOCK) {
			nflags = flags | FOFFSETWAKE;
			tsleep_interlock(&fp->f_flag, 0);
			if (atomic_cmpset_int(&fp->f_flag, flags, nflags))
				tsleep(&fp->f_flag, PINTERLOCKED, "fpoff", 0);
		} else {
			nflags = flags | FOFFSETLOCK;
			if (atomic_cmpset_int(&fp->f_flag, flags, nflags))
				break;
		}
	}
	return(fp->f_offset);
}

/*
 * MPSAFE
 */
static __inline void
vn_set_fpf_offset(struct file *fp, off_t offset)
{
	u_int flags;
	u_int nflags;

	/*
	 * We hold the lock so we can set the offset without interference.
	 */
	fp->f_offset = offset;

	/*
	 * Normal release is already a reasonably critical path.
	 */
	for (;;) {
		flags = fp->f_flag;
		nflags = flags & ~(FOFFSETLOCK | FOFFSETWAKE);
		if (atomic_cmpset_int(&fp->f_flag, flags, nflags)) {
			if (flags & FOFFSETWAKE)
				wakeup(&fp->f_flag);
			break;
		}
	}
}

/*
 * MPSAFE
 */
static __inline off_t
vn_poll_fpf_offset(struct file *fp)
{
#if defined(__x86_64__) || !defined(SMP)
	return(fp->f_offset);
#else
	off_t off = vn_get_fpf_offset(fp);
	vn_set_fpf_offset(fp, off);
	return(off);
#endif
}

/*
 * Package up an I/O request on a vnode into a uio and do it.
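 *
 * For example, a caller holding a referenced vnode can read len bytes
 * at byte offset off into a kernel buffer with:
 *
 *	error = vn_rdwr(UIO_READ, vp, buf, len, off, UIO_SYSSPACE,
 *			0, cred, NULL);
 *
 * Unless IO_NODELOCKED is passed in ioflg the vnode is locked exclusively
 * around the underlying VOP_READ/VOP_WRITE call.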
 *
 * MPSAFE
 */
int
vn_rdwr(enum uio_rw rw, struct vnode *vp, caddr_t base, int len,
	off_t offset, enum uio_seg segflg, int ioflg,
	struct ucred *cred, int *aresid)
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if ((ioflg & IO_NODELOCKED) == 0)
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	aiov.iov_base = base;
	aiov.iov_len = len;
	auio.uio_resid = len;
	auio.uio_offset = offset;
	auio.uio_segflg = segflg;
	auio.uio_rw = rw;
	auio.uio_td = curthread;
	if (rw == UIO_READ) {
		error = VOP_READ(vp, &auio, ioflg, cred);
	} else {
		error = VOP_WRITE(vp, &auio, ioflg, cred);
	}
	if (aresid)
		*aresid = auio.uio_resid;
	else
		if (auio.uio_resid && error == 0)
			error = EIO;
	if ((ioflg & IO_NODELOCKED) == 0)
		vn_unlock(vp);
	return (error);
}

/*
 * Package up an I/O request on a vnode into a uio and do it.  The I/O
 * request is split up into smaller chunks and we try to avoid saturating
 * the buffer cache while potentially holding a vnode locked, so we
 * check bwillwrite() before calling vn_rdwr().  We also call lwkt_user_yield()
 * to give other processes a chance to lock the vnode (either other processes
 * core'ing the same binary, or unrelated processes scanning the directory).
 *
 * MPSAFE
 */
int
vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, caddr_t base, int len,
		 off_t offset, enum uio_seg segflg, int ioflg,
		 struct ucred *cred, int *aresid)
{
	int error = 0;

	do {
		int chunk;

		/*
		 * Force `offset' to a multiple of MAXBSIZE except possibly
		 * for the first chunk, so that filesystems only need to
		 * write full blocks except possibly for the first and last
		 * chunks.
		 */
		chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE;

		if (chunk > len)
			chunk = len;
		if (vp->v_type == VREG) {
			switch(rw) {
			case UIO_READ:
				bwillread(chunk);
				break;
			case UIO_WRITE:
				bwillwrite(chunk);
				break;
			}
		}
		error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
				ioflg, cred, aresid);
		len -= chunk;	/* aresid calc already includes length */
		if (error)
			break;
		offset += chunk;
		base += chunk;
		lwkt_user_yield();
	} while (len);
	if (aresid)
		*aresid += len;
	return (error);
}

/*
 * File pointers can no longer get ripped up by revoke so
 * we don't need to lock access to the vp.
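 *
 * The per-call O_FBLOCKING/O_FNONBLOCKING and O_FBUFFERED/O_FUNBUFFERED
 * flags override the fp defaults when constructing IO_NDELAY and
 * IO_DIRECT, and O_FOFFSET makes the call use the uio's own offset
 * instead of the fp's seek position.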
 *
 * f_offset updates are not guaranteed against multiple readers
 *
 * MPSAFE
 */
static int
vn_read(struct file *fp, struct uio *uio, struct ucred *cred, int flags)
{
	struct vnode *vp;
	int error, ioflag;

	KASSERT(uio->uio_td == curthread,
		("uio_td %p is not td %p", uio->uio_td, curthread));
	vp = (struct vnode *)fp->f_data;

	ioflag = 0;
	if (flags & O_FBLOCKING) {
		/* ioflag &= ~IO_NDELAY; */
	} else if (flags & O_FNONBLOCKING) {
		ioflag |= IO_NDELAY;
	} else if (fp->f_flag & FNONBLOCK) {
		ioflag |= IO_NDELAY;
	}
	if (flags & O_FBUFFERED) {
		/* ioflag &= ~IO_DIRECT; */
	} else if (flags & O_FUNBUFFERED) {
		ioflag |= IO_DIRECT;
	} else if (fp->f_flag & O_DIRECT) {
		ioflag |= IO_DIRECT;
	}
	if ((flags & O_FOFFSET) == 0 && (vp->v_flag & VNOTSEEKABLE) == 0)
		uio->uio_offset = vn_get_fpf_offset(fp);
	vn_lock(vp, LK_SHARED | LK_RETRY);
	ioflag |= sequential_heuristic(uio, fp);

	error = VOP_READ(vp, uio, ioflag, cred);
	fp->f_nextoff = uio->uio_offset;
	vn_unlock(vp);
	if ((flags & O_FOFFSET) == 0 && (vp->v_flag & VNOTSEEKABLE) == 0)
		vn_set_fpf_offset(fp, uio->uio_offset);
	return (error);
}

/*
 * MPSAFE
 */
static int
vn_write(struct file *fp, struct uio *uio, struct ucred *cred, int flags)
{
	struct vnode *vp;
	int error, ioflag;

	KASSERT(uio->uio_td == curthread,
		("uio_td %p is not p %p", uio->uio_td, curthread));
	vp = (struct vnode *)fp->f_data;

	ioflag = IO_UNIT;
	if (vp->v_type == VREG &&
	   ((fp->f_flag & O_APPEND) || (flags & O_FAPPEND))) {
		ioflag |= IO_APPEND;
	}

	if (flags & O_FBLOCKING) {
		/* ioflag &= ~IO_NDELAY; */
	} else if (flags & O_FNONBLOCKING) {
		ioflag |= IO_NDELAY;
	} else if (fp->f_flag & FNONBLOCK) {
		ioflag |= IO_NDELAY;
	}
	if (flags & O_FBUFFERED) {
		/* ioflag &= ~IO_DIRECT; */
	} else if (flags & O_FUNBUFFERED) {
		ioflag |= IO_DIRECT;
	} else if (fp->f_flag & O_DIRECT) {
		ioflag |= IO_DIRECT;
	}
	if (flags & O_FASYNCWRITE) {
		/* ioflag &= ~IO_SYNC; */
	} else if (flags & O_FSYNCWRITE) {
		ioflag |= IO_SYNC;
	} else if (fp->f_flag & O_FSYNC) {
		ioflag |= IO_SYNC;
	}

	if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS))
		ioflag |= IO_SYNC;
	if ((flags & O_FOFFSET) == 0)
		uio->uio_offset = vn_get_fpf_offset(fp);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	ioflag |= sequential_heuristic(uio, fp);
	error = VOP_WRITE(vp, uio, ioflag, cred);
	fp->f_nextoff = uio->uio_offset;
	vn_unlock(vp);
	if ((flags & O_FOFFSET) == 0)
		vn_set_fpf_offset(fp, uio->uio_offset);
	return (error);
}

/*
 * MPSAFE
 */
static int
vn_statfile(struct file *fp, struct stat *sb, struct ucred *cred)
{
	struct vnode *vp;
	int error;

	vp = (struct vnode *)fp->f_data;
	error = vn_stat(vp, sb, cred);
	return (error);
}

/*
 * MPSAFE
 */
int
vn_stat(struct vnode *vp, struct stat *sb, struct ucred *cred)
{
	struct vattr vattr;
	struct vattr *vap;
	int error;
	u_short mode;
	cdev_t dev;

	vap = &vattr;
	error = VOP_GETATTR(vp, vap);
	if (error)
		return (error);

	/*
	 * Zero the spare stat fields
	 */
	sb->st_lspare = 0;
	sb->st_qspare1 = 0;
	sb->st_qspare2 = 0;

	/*
	 * Copy from vattr table
	 */
	if (vap->va_fsid != VNOVAL)
		sb->st_dev = vap->va_fsid;
	else
		sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0];
	sb->st_ino = vap->va_fileid;
	mode = vap->va_mode;
	switch (vap->va_type) {
	case VREG:
		mode |= S_IFREG;
		break;
	case VDATABASE:
		mode |= S_IFDB;
		break;
	case VDIR:
		mode |= S_IFDIR;
		break;
	case VBLK:
		mode |= S_IFBLK;
		break;
	case VCHR:
		mode |= S_IFCHR;
		break;
	case VLNK:
		mode |= S_IFLNK;
		/* This is a cosmetic change, symlinks do not have a mode. */
		if (vp->v_mount->mnt_flag & MNT_NOSYMFOLLOW)
			sb->st_mode &= ~ACCESSPERMS;	/* 0000 */
		else
			sb->st_mode |= ACCESSPERMS;	/* 0777 */
		break;
	case VSOCK:
		mode |= S_IFSOCK;
		break;
	case VFIFO:
		mode |= S_IFIFO;
		break;
	default:
		return (EBADF);
	}
	sb->st_mode = mode;
	if (vap->va_nlink > (nlink_t)-1)
		sb->st_nlink = (nlink_t)-1;
	else
		sb->st_nlink = vap->va_nlink;
	sb->st_uid = vap->va_uid;
	sb->st_gid = vap->va_gid;
	sb->st_rdev = dev2udev(vp->v_rdev);
	sb->st_size = vap->va_size;
	sb->st_atimespec = vap->va_atime;
	sb->st_mtimespec = vap->va_mtime;
	sb->st_ctimespec = vap->va_ctime;

	/*
	 * A VCHR and VBLK device may track the last access and last modified
	 * time independently of the filesystem.  This is particularly true
	 * because device read and write calls may bypass the filesystem.
	 */
	if (vp->v_type == VCHR || vp->v_type == VBLK) {
		dev = vp->v_rdev;
		if (dev != NULL) {
			if (dev->si_lastread) {
				sb->st_atimespec.tv_sec = dev->si_lastread;
				sb->st_atimespec.tv_nsec = 0;
			}
			if (dev->si_lastwrite) {
				sb->st_mtimespec.tv_sec = dev->si_lastwrite;
				sb->st_mtimespec.tv_nsec = 0;
			}
		}
	}

	/*
	 * According to www.opengroup.org, the meaning of st_blksize is
	 * "a filesystem-specific preferred I/O block size for this
	 *  object.  In some filesystem types, this may vary from file
	 *  to file"
	 * Default to PAGE_SIZE after much discussion.
	 */

	if (vap->va_type == VREG) {
		sb->st_blksize = vap->va_blocksize;
	} else if (vn_isdisk(vp, NULL)) {
		/*
		 * XXX this is broken.  If the device is not yet open (aka
		 * stat() call, aka v_rdev == NULL), how are we supposed
		 * to get a valid block size out of it?
		 */
		dev = vp->v_rdev;

		sb->st_blksize = dev->si_bsize_best;
		if (sb->st_blksize < dev->si_bsize_phys)
			sb->st_blksize = dev->si_bsize_phys;
		if (sb->st_blksize < BLKDEV_IOSIZE)
			sb->st_blksize = BLKDEV_IOSIZE;
	} else {
		sb->st_blksize = PAGE_SIZE;
	}

	sb->st_flags = vap->va_flags;

	error = priv_check_cred(cred, PRIV_VFS_GENERATION, 0);
	if (error)
		sb->st_gen = 0;
	else
		sb->st_gen = (u_int32_t)vap->va_gen;

	sb->st_blocks = vap->va_bytes / S_BLKSIZE;
	return (0);
}

/*
 * MPALMOSTSAFE - acquires mplock
 */
static int
vn_ioctl(struct file *fp, u_long com, caddr_t data, struct ucred *ucred,
	 struct sysmsg *msg)
{
	struct vnode *vp = ((struct vnode *)fp->f_data);
	struct vnode *ovp;
	struct vattr vattr;
	int error;
	off_t size;

	switch (vp->v_type) {
	case VREG:
	case VDIR:
		if (com == FIONREAD) {
			error = VOP_GETATTR(vp, &vattr);
			if (error)
				break;
			size = vattr.va_size;
			if ((vp->v_flag & VNOTSEEKABLE) == 0)
				size -= vn_poll_fpf_offset(fp);
			if (size > 0x7FFFFFFF)
				size = 0x7FFFFFFF;
			*(int *)data = size;
			error = 0;
			break;
		}
		if (com == FIOASYNC) {				/* XXX */
			error = 0;				/* XXX */
			break;
		}
		/* fall into ... */
	default:
#if 0
		return (ENOTTY);
#endif
	case VFIFO:
	case VCHR:
	case VBLK:
		if (com == FIODTYPE) {
			if (vp->v_type != VCHR && vp->v_type != VBLK) {
				error = ENOTTY;
				break;
			}
			*(int *)data = dev_dflags(vp->v_rdev) & D_TYPEMASK;
			error = 0;
			break;
		}
		error = VOP_IOCTL(vp, com, data, fp->f_flag, ucred, msg);
		if (error == 0 && com == TIOCSCTTY) {
			struct proc *p = curthread->td_proc;
			struct session *sess;

			if (p == NULL) {
				error = ENOTTY;
				break;
			}

			get_mplock();
			sess = p->p_session;
			/* Do nothing if reassigning same control tty */
			if (sess->s_ttyvp == vp) {
				error = 0;
				rel_mplock();
				break;
			}

			/* Get rid of reference to old control tty */
			ovp = sess->s_ttyvp;
			vref(vp);
			sess->s_ttyvp = vp;
			if (ovp)
				vrele(ovp);
			rel_mplock();
		}
		break;
	}
	return (error);
}

/*
 * Check that the vnode is still valid, and if so
 * acquire requested lock.
 */
int
#ifndef	DEBUG_LOCKS
vn_lock(struct vnode *vp, int flags)
#else
debug_vn_lock(struct vnode *vp, int flags, const char *filename, int line)
#endif
{
	int error;

	do {
#ifdef	DEBUG_LOCKS
		vp->filename = filename;
		vp->line = line;
		error = debuglockmgr(&vp->v_lock, flags,
				     "vn_lock", filename, line);
#else
		error = lockmgr(&vp->v_lock, flags);
#endif
		if (error == 0)
			break;
	} while (flags & LK_RETRY);

	/*
	 * Because we (had better!) have a ref on the vnode, once it
	 * goes to VRECLAIMED state it will not be recycled until all
	 * refs go away.  So we can just check the flag.
	 */
	if (error == 0 && (vp->v_flag & VRECLAIMED)) {
		lockmgr(&vp->v_lock, LK_RELEASE);
		error = ENOENT;
	}
	return (error);
}

#ifdef DEBUG_VN_UNLOCK

void
debug_vn_unlock(struct vnode *vp, const char *filename, int line)
{
	kprintf("vn_unlock from %s:%d\n", filename, line);
	lockmgr(&vp->v_lock, LK_RELEASE);
}

#else

void
vn_unlock(struct vnode *vp)
{
	lockmgr(&vp->v_lock, LK_RELEASE);
}

#endif

/*
 * MPSAFE
 */
int
vn_islocked(struct vnode *vp)
{
	return (lockstatus(&vp->v_lock, curthread));
}

/*
 * Return the lock status of a vnode and unlock the vnode
 * if we owned the lock.  This is not a boolean, if the
 * caller cares what the lock status is the caller must
 * check the various possible values.
 *
 * This only unlocks exclusive locks held by the caller,
 * it will NOT unlock shared locks (there is no way to
 * tell who the shared lock belongs to).
 *
 * MPSAFE
 */
int
vn_islocked_unlock(struct vnode *vp)
{
	int vpls;

	vpls = lockstatus(&vp->v_lock, curthread);
	if (vpls == LK_EXCLUSIVE)
		lockmgr(&vp->v_lock, LK_RELEASE);
	return(vpls);
}

/*
 * Restore a vnode lock that we previously released via
 * vn_islocked_unlock().  This is a NOP if we did not
 * own the original lock.
 *
 * MPSAFE
 */
void
vn_islocked_relock(struct vnode *vp, int vpls)
{
	int error;

	if (vpls == LK_EXCLUSIVE)
		error = lockmgr(&vp->v_lock, vpls);
}

/*
 * MPSAFE
 */
static int
vn_closefile(struct file *fp)
{
	int error;

	fp->f_ops = &badfileops;
	error = vn_close(((struct vnode *)fp->f_data), fp->f_flag);
	return (error);
}

/*
 * MPSAFE
 */
static int
vn_kqfilter(struct file *fp, struct knote *kn)
{
	int error;

	error = VOP_KQFILTER(((struct vnode *)fp->f_data), kn);
	return (error);
}
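
#if 0
/*
 * Illustrative sketch only: a typical in-kernel consumer of the routines
 * above, opening a file by path, reading the first bytes into a kernel
 * buffer, and closing it again.  The helper name, path argument, and
 * buffer size are placeholders, not part of this file.
 */
static int
vn_open_read_sketch(const char *path)
{
	struct nlookupdata nd;
	struct vnode *vp;
	char buf[128];
	int error;

	error = nlookup_init(&nd, path, UIO_SYSSPACE, NLC_FOLLOW);
	if (error == 0)
		error = vn_open(&nd, NULL, FREAD, 0);

	/*
	 * Take over the vnode reference (if any) before tearing down the
	 * nlookupdata, which the caller must always do per vn_open()'s
	 * contract.
	 */
	vp = nd.nl_open_vp;
	nd.nl_open_vp = NULL;
	nlookup_done(&nd);
	if (error)
		return (error);

	/*
	 * The vnode is referenced but unlocked (NLC_LOCKVP was not set).
	 * vn_rdwr() locks it around the I/O since IO_NODELOCKED is clear.
	 */
	error = vn_rdwr(UIO_READ, vp, buf, sizeof(buf), 0, UIO_SYSSPACE,
			0, proc0.p_ucred, NULL);

	vn_close(vp, FREAD);		/* drops the reference */
	return (error);
}
#endif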