/*	$OpenBSD: spec_vnops.c,v 1.53 2008/07/24 18:48:18 thib Exp $	*/
/*	$NetBSD: spec_vnops.c,v 1.29 1996/04/22 01:42:38 christos Exp $	*/

/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)spec_vnops.c	8.8 (Berkeley) 11/21/94
 */

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/conf.h>
#include <sys/buf.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/stat.h>
#include <sys/errno.h>
#include <sys/ioctl.h>
#include <sys/file.h>
#include <sys/disklabel.h>
#include <sys/lockf.h>
#include <sys/poll.h>

#include <miscfs/specfs/specdev.h>

#define	v_lastr	v_specinfo->si_lastr

struct vnode *speclisth[SPECHSZ];

int (**spec_vnodeop_p)(void *);
struct vnodeopv_entry_desc spec_vnodeop_entries[] = {
	{ &vop_default_desc, eopnotsupp },
	{ &vop_lookup_desc, vop_generic_lookup },	/* lookup */
	{ &vop_create_desc, spec_badop },		/* create */
	{ &vop_mknod_desc, spec_badop },		/* mknod */
	{ &vop_open_desc, spec_open },			/* open */
	{ &vop_close_desc, spec_close },		/* close */
	{ &vop_access_desc, spec_access },		/* access */
	{ &vop_getattr_desc, spec_getattr },		/* getattr */
	{ &vop_setattr_desc, spec_setattr },		/* setattr */
	{ &vop_read_desc, spec_read },			/* read */
	{ &vop_write_desc, spec_write },		/* write */
	{ &vop_ioctl_desc, spec_ioctl },		/* ioctl */
	{ &vop_poll_desc, spec_poll },			/* poll */
	{ &vop_kqfilter_desc, spec_kqfilter },		/* kqfilter */
	{ &vop_revoke_desc, vop_generic_revoke },	/* revoke */
	{ &vop_fsync_desc, spec_fsync },		/* fsync */
	{ &vop_remove_desc, spec_badop },		/* remove */
	{ &vop_link_desc, spec_badop },			/* link */
	{ &vop_rename_desc, spec_badop },		/* rename */
	{ &vop_mkdir_desc, spec_badop },		/* mkdir */
	{ &vop_rmdir_desc, spec_badop },		/* rmdir */
	{ &vop_symlink_desc, spec_badop },		/* symlink */
	{ &vop_readdir_desc, spec_badop },		/* readdir */
	{ &vop_readlink_desc, spec_badop },		/* readlink */
	{ &vop_abortop_desc, spec_badop },		/* abortop */
	{ &vop_inactive_desc, spec_inactive },		/* inactive */
	{ &vop_reclaim_desc, nullop },			/* reclaim */
	{ &vop_lock_desc, vop_generic_lock },		/* lock */
	{ &vop_unlock_desc, vop_generic_unlock },	/* unlock */
	{ &vop_bmap_desc, vop_generic_bmap },		/* bmap */
	{ &vop_strategy_desc, spec_strategy },		/* strategy */
	{ &vop_print_desc, spec_print },		/* print */
	{ &vop_islocked_desc, vop_generic_islocked },	/* islocked */
	{ &vop_pathconf_desc, spec_pathconf },		/* pathconf */
	{ &vop_advlock_desc, spec_advlock },		/* advlock */
	{ &vop_bwrite_desc, vop_generic_bwrite },	/* bwrite */
	{ NULL, NULL }
};
struct vnodeopv_desc spec_vnodeop_opv_desc =
	{ &spec_vnodeop_p, spec_vnodeop_entries };

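/*
 * Generic entry point: forward a vnode operation on a special file to
 * the handler registered for it in the table above, indexed by the
 * operation descriptor's table offset.
 */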
int
spec_vnoperate(void *v)
{
	struct vop_generic_args *ap = v;

	return (VOCALL(spec_vnodeop_p, ap->a_desc->vdesc_offset, ap));
}

/*
 * Open a special file.
 */
int
spec_open(void *v)
{
	struct vop_open_args *ap = v;
	struct proc *p = ap->a_p;
	struct vnode *vp = ap->a_vp;
	struct vnode *bvp;
	dev_t bdev;
	dev_t dev = (dev_t)vp->v_rdev;
	int maj = major(dev);
	int error;

	/*
	 * Don't allow open if fs is mounted -nodev.
	 */
	if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_NODEV))
		return (ENXIO);

	switch (vp->v_type) {

	case VCHR:
		if ((u_int)maj >= nchrdev)
			return (ENXIO);
		if (ap->a_cred != FSCRED && (ap->a_mode & FWRITE)) {
			/*
			 * When running in very secure mode, do not allow
			 * opens for writing of any disk character devices.
			 */
			if (securelevel >= 2 && cdevsw[maj].d_type == D_DISK)
				return (EPERM);
			/*
			 * When running in secure mode, do not allow opens
			 * for writing of /dev/mem, /dev/kmem, or character
			 * devices whose corresponding block devices are
			 * currently mounted.
			 */
			if (securelevel >= 1) {
				if ((bdev = chrtoblk(dev)) != NODEV &&
				    vfinddev(bdev, VBLK, &bvp) &&
				    bvp->v_usecount > 0 &&
				    (error = vfs_mountedon(bvp)))
					return (error);
				if (iskmemdev(dev))
					return (EPERM);
			}
		}
		if (cdevsw[maj].d_type == D_TTY)
			vp->v_flag |= VISTTY;
		if (cdevsw[maj].d_flags & D_CLONE)
			return (spec_open_clone(ap));
		VOP_UNLOCK(vp, 0, p);
		error = (*cdevsw[maj].d_open)(dev, ap->a_mode, S_IFCHR, ap->a_p);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
		return (error);

	case VBLK:
		if ((u_int)maj >= nblkdev)
			return (ENXIO);
		/*
		 * When running in very secure mode, do not allow
		 * opens for writing of any disk block devices.
		 */
		if (securelevel >= 2 && ap->a_cred != FSCRED &&
		    (ap->a_mode & FWRITE) && bdevsw[maj].d_type == D_DISK)
			return (EPERM);
		/*
		 * Do not allow opens of block devices that are
		 * currently mounted.
		 */
		if ((error = vfs_mountedon(vp)) != 0)
			return (error);
		return ((*bdevsw[maj].d_open)(dev, ap->a_mode, S_IFBLK, ap->a_p));
	case VNON:
	case VLNK:
	case VDIR:
	case VREG:
	case VBAD:
	case VFIFO:
	case VSOCK:
		break;
	}
	return (0);
}

/*
 * Vnode op for read
 */
int
spec_read(void *v)
{
	struct vop_read_args *ap = v;
	struct vnode *vp = ap->a_vp;
	struct uio *uio = ap->a_uio;
	struct proc *p = uio->uio_procp;
	struct buf *bp;
	daddr64_t bn, nextbn, bscale;
	int bsize;
	struct partinfo dpart;
	int n, on, majordev;
	int (*ioctl)(dev_t, u_long, caddr_t, int, struct proc *);
	int error = 0;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_READ)
		panic("spec_read mode");
	if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc)
		panic("spec_read proc");
#endif
	if (uio->uio_resid == 0)
		return (0);

	switch (vp->v_type) {

	case VCHR:
		VOP_UNLOCK(vp, 0, p);
		error = (*cdevsw[major(vp->v_rdev)].d_read)
			(vp->v_rdev, uio, ap->a_ioflag);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
		return (error);

	case VBLK:
		if (uio->uio_offset < 0)
			return (EINVAL);
		bsize = BLKDEV_IOSIZE;
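		/*
		 * If the driver exports a disklabel and the partition
		 * holds an FFS file system, use the file system's
		 * fragment size as the transfer size instead of the
		 * BLKDEV_IOSIZE default.
		 */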
		if ((majordev = major(vp->v_rdev)) < nblkdev &&
		    (ioctl = bdevsw[majordev].d_ioctl) != NULL &&
		    (*ioctl)(vp->v_rdev, DIOCGPART, (caddr_t)&dpart, FREAD, p) == 0) {
			u_int32_t frag =
			    DISKLABELV1_FFS_FRAG(dpart.part->p_fragblock);
			u_int32_t fsize =
			    DISKLABELV1_FFS_FSIZE(dpart.part->p_fragblock);
			if (dpart.part->p_fstype == FS_BSDFFS && frag != 0 &&
			    fsize != 0)
				bsize = frag * fsize;
		}
		bscale = btodb(bsize);
		do {
			bn = btodb(uio->uio_offset) & ~(bscale - 1);
			on = uio->uio_offset % bsize;
			n = min((bsize - on), uio->uio_resid);
			if (vp->v_lastr + bscale == bn) {
				nextbn = bn + bscale;
				error = breadn(vp, bn, bsize, &nextbn, &bsize,
				    1, NOCRED, &bp);
			} else
				error = bread(vp, bn, bsize, NOCRED, &bp);
			vp->v_lastr = bn;
			n = min(n, bsize - bp->b_resid);
			if (error) {
				brelse(bp);
				return (error);
			}
			error = uiomove((char *)bp->b_data + on, n, uio);
			brelse(bp);
		} while (error == 0 && uio->uio_resid > 0 && n != 0);
		return (error);

	default:
		panic("spec_read type");
	}
	/* NOTREACHED */
}

int
spec_inactive(void *v)
{
	struct vop_inactive_args *ap = v;

	VOP_UNLOCK(ap->a_vp, 0, ap->a_p);
	return (0);
}

/*
 * Vnode op for write
 */
int
spec_write(void *v)
{
	struct vop_write_args *ap = v;
	struct vnode *vp = ap->a_vp;
	struct uio *uio = ap->a_uio;
	struct proc *p = uio->uio_procp;
	struct buf *bp;
	daddr64_t bn, bscale;
	int bsize;
	struct partinfo dpart;
	int n, on, majordev;
	int (*ioctl)(dev_t, u_long, caddr_t, int, struct proc *);
	int error = 0;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("spec_write mode");
	if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc)
		panic("spec_write proc");
#endif

	switch (vp->v_type) {

	case VCHR:
		VOP_UNLOCK(vp, 0, p);
		error = (*cdevsw[major(vp->v_rdev)].d_write)
			(vp->v_rdev, uio, ap->a_ioflag);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
		return (error);

	case VBLK:
		if (uio->uio_resid == 0)
			return (0);
		if (uio->uio_offset < 0)
			return (EINVAL);
		bsize = BLKDEV_IOSIZE;
		if ((majordev = major(vp->v_rdev)) < nblkdev &&
		    (ioctl = bdevsw[majordev].d_ioctl) != NULL &&
		    (*ioctl)(vp->v_rdev, DIOCGPART, (caddr_t)&dpart, FREAD, p) == 0) {
			u_int32_t frag =
			    DISKLABELV1_FFS_FRAG(dpart.part->p_fragblock);
			u_int32_t fsize =
			    DISKLABELV1_FFS_FSIZE(dpart.part->p_fragblock);
			if (dpart.part->p_fstype == FS_BSDFFS && frag != 0 &&
			    fsize != 0)
				bsize = frag * fsize;
		}
		bscale = btodb(bsize);
		do {
			bn = btodb(uio->uio_offset) & ~(bscale - 1);
			on = uio->uio_offset % bsize;
			n = min((bsize - on), uio->uio_resid);
			error = bread(vp, bn, bsize, NOCRED, &bp);
			n = min(n, bsize - bp->b_resid);
			if (error) {
				brelse(bp);
				return (error);
			}
			error = uiomove((char *)bp->b_data + on, n, uio);
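			/*
			 * If the write reached the end of the block,
			 * push the buffer out asynchronously now;
			 * otherwise delay the write in case the rest
			 * of the block is written soon.
			 */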
			if (n + on == bsize)
				bawrite(bp);
			else
				bdwrite(bp);
		} while (error == 0 && uio->uio_resid > 0 && n != 0);
		return (error);

	default:
		panic("spec_write type");
	}
	/* NOTREACHED */
}

/*
 * Device ioctl operation.
 */
int
spec_ioctl(void *v)
{
	struct vop_ioctl_args *ap = v;
	dev_t dev = ap->a_vp->v_rdev;
	int maj = major(dev);

	switch (ap->a_vp->v_type) {

	case VCHR:
		return ((*cdevsw[maj].d_ioctl)(dev, ap->a_command, ap->a_data,
		    ap->a_fflag, ap->a_p));

	case VBLK:
		return ((*bdevsw[maj].d_ioctl)(dev, ap->a_command, ap->a_data,
		    ap->a_fflag, ap->a_p));

	default:
		panic("spec_ioctl");
		/* NOTREACHED */
	}
}

int
spec_poll(void *v)
{
	struct vop_poll_args *ap = v;
	dev_t dev;

	switch (ap->a_vp->v_type) {

	default:
		return (ap->a_events &
		    (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));

	case VCHR:
		dev = ap->a_vp->v_rdev;
		return (*cdevsw[major(dev)].d_poll)(dev, ap->a_events, ap->a_p);
	}
}

int
spec_kqfilter(void *v)
{
	struct vop_kqfilter_args *ap = v;
	dev_t dev;

	dev = ap->a_vp->v_rdev;
	if (cdevsw[major(dev)].d_flags & D_KQFILTER)
		return (*cdevsw[major(dev)].d_kqfilter)(dev, ap->a_kn);
	return (1);
}

/*
 * Synch buffers associated with a block device
 */
int
spec_fsync(void *v)
{
	struct vop_fsync_args *ap = v;
	struct vnode *vp = ap->a_vp;
	struct buf *bp;
	struct buf *nbp;
	int s;

	if (vp->v_type == VCHR)
		return (0);
	/*
	 * Flush all dirty buffers associated with a block device.
	 */
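	/*
	 * The scan below restarts from the head of the list after each
	 * buffer is written, since the list can change once splbio()
	 * is dropped for the write.
	 */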
loop:
	s = splbio();
	for (bp = LIST_FIRST(&vp->v_dirtyblkhd);
	    bp != LIST_END(&vp->v_dirtyblkhd); bp = nbp) {
		nbp = LIST_NEXT(bp, b_vnbufs);
		if (bp->b_flags & B_BUSY)
			continue;
		if ((bp->b_flags & B_DELWRI) == 0)
			panic("spec_fsync: not dirty");
		bremfree(bp);
		buf_acquire(bp);
		splx(s);
		bawrite(bp);
		goto loop;
	}
	if (ap->a_waitfor == MNT_WAIT) {
		vwaitforio(vp, 0, "spec_fsync", 0);

#ifdef DIAGNOSTIC
		if (!LIST_EMPTY(&vp->v_dirtyblkhd)) {
			splx(s);
			vprint("spec_fsync: dirty", vp);
			goto loop;
		}
#endif
	}
	splx(s);
	return (0);
}

int
spec_strategy(void *v)
{
	struct vop_strategy_args *ap = v;
	struct buf *bp = ap->a_bp;
	int maj = major(bp->b_dev);

	if (LIST_FIRST(&bp->b_dep) != NULL)
		buf_start(bp);

	(*bdevsw[maj].d_strategy)(bp);
	return (0);
}

/*
 * Device close routine
 */
int
spec_close(void *v)
{
	struct vop_close_args *ap = v;
	struct vnode *vp = ap->a_vp;
	dev_t dev = vp->v_rdev;
	int (*devclose)(dev_t, int, int, struct proc *);
	int mode, error;

	switch (vp->v_type) {

	case VCHR:
		/*
		 * Hack: a tty device that is a controlling terminal
		 * has a reference from the session structure.
		 * We cannot easily tell that a character device is
		 * a controlling terminal, unless it is the closing
		 * process' controlling terminal.  In that case,
		 * if the reference count is 2 (this last descriptor
		 * plus the session), release the reference from the session.
		 */
		if (vcount(vp) == 2 && ap->a_p &&
		    vp == ap->a_p->p_session->s_ttyvp) {
			vrele(vp);
			ap->a_p->p_session->s_ttyvp = NULL;
		}
		if (cdevsw[major(dev)].d_flags & D_CLONE)
			return (spec_close_clone(ap));
		/*
		 * If the vnode is locked, then we are in the midst
		 * of forcibly closing the device, otherwise we only
		 * close on last reference.
		 */
		if (vcount(vp) > 1 && (vp->v_flag & VXLOCK) == 0)
			return (0);
		devclose = cdevsw[major(dev)].d_close;
		mode = S_IFCHR;
		break;

	case VBLK:
		/*
		 * On last close of a block device (that isn't mounted)
		 * we must invalidate any in-core blocks, so that
		 * we can, for instance, change floppy disks.  In order to do
		 * that, we must lock the vnode.  If we are coming from
		 * vclean(), the vnode is already locked.
		 */
		if (!(vp->v_flag & VXLOCK))
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, ap->a_p);
		error = vinvalbuf(vp, V_SAVE, ap->a_cred, ap->a_p, 0, 0);
		if (!(vp->v_flag & VXLOCK))
			VOP_UNLOCK(vp, 0, ap->a_p);
		if (error)
			return (error);
		/*
		 * We do not want to really close the device if it
		 * is still in use unless we are trying to close it
		 * forcibly.  Since every use (buffer, vnode, swap, cmap)
		 * holds a reference to the vnode, and because we mark
		 * any other vnodes that alias this device, when the
		 * sum of the reference counts on all the aliased
		 * vnodes descends to one, we are on last close.
		 */
		if (vcount(vp) > 1 && (vp->v_flag & VXLOCK) == 0)
			return (0);
		devclose = bdevsw[major(dev)].d_close;
		mode = S_IFBLK;
		break;

	default:
		panic("spec_close: not special");
	}

	return ((*devclose)(dev, ap->a_fflag, mode, ap->a_p));
}

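/*
 * The following attribute and access operations are only meaningful
 * for clone devices; they are forwarded to the parent vnode the
 * clone was created from.
 */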
int
spec_getattr(void *v)
{
	struct vop_getattr_args *ap = v;
	struct vnode *vp = ap->a_vp;

	if (!(vp->v_flag & VCLONE))
		return (EBADF);

	return (VOP_GETATTR(vp->v_specparent, ap->a_vap, ap->a_cred, ap->a_p));
}

int
spec_setattr(void *v)
{
	struct vop_setattr_args *ap = v;
	struct vnode *vp = ap->a_vp;
	int error;

	if (!(vp->v_flag & VCLONE))
		return (EBADF);

	vn_lock(vp->v_specparent, LK_EXCLUSIVE | LK_RETRY, ap->a_p);
	error = VOP_SETATTR(vp->v_specparent, ap->a_vap, ap->a_cred, ap->a_p);
	VOP_UNLOCK(vp->v_specparent, 0, ap->a_p);

	return (error);
}

int
spec_access(void *v)
{
	struct vop_access_args *ap = v;
	struct vnode *vp = ap->a_vp;

	if (!(vp->v_flag & VCLONE))
		return (EBADF);

	return (VOP_ACCESS(vp->v_specparent, ap->a_mode, ap->a_cred, ap->a_p));
}

/*
 * Print out the contents of a special device vnode.
 */
int
spec_print(void *v)
{
	struct vop_print_args *ap = v;

	printf("tag VT_NON, dev %d, %d\n", major(ap->a_vp->v_rdev),
	    minor(ap->a_vp->v_rdev));
	return (0);
}

/*
 * Return POSIX pathconf information applicable to special devices.
 */
int
spec_pathconf(void *v)
{
	struct vop_pathconf_args *ap = v;

	switch (ap->a_name) {
	case _PC_LINK_MAX:
		*ap->a_retval = LINK_MAX;
		return (0);
	case _PC_MAX_CANON:
		*ap->a_retval = MAX_CANON;
		return (0);
	case _PC_MAX_INPUT:
		*ap->a_retval = MAX_INPUT;
		return (0);
	case _PC_PIPE_BUF:
		*ap->a_retval = PIPE_BUF;
		return (0);
	case _PC_CHOWN_RESTRICTED:
		*ap->a_retval = 1;
		return (0);
	case _PC_VDISABLE:
		*ap->a_retval = _POSIX_VDISABLE;
		return (0);
	default:
		return (EINVAL);
	}
	/* NOTREACHED */
}

/*
 * Special device advisory byte-level locks.
 */
int
spec_advlock(void *v)
{
	struct vop_advlock_args *ap = v;
	struct vnode *vp = ap->a_vp;

	return (lf_advlock(&vp->v_speclockf, (off_t)0, ap->a_id,
	    ap->a_op, ap->a_fl, ap->a_flags));
}

/*
 * Special device bad operation
 */
/*ARGSUSED*/
int
spec_badop(void *v)
{

	panic("spec_badop called");
	/* NOTREACHED */
}