1 /* $NetBSD: spec_vnops.c,v 1.219 2025/01/06 09:45:49 mlelstv Exp $ */ 2 3 /*- 4 * Copyright (c) 2008 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 26 * POSSIBILITY OF SUCH DAMAGE. 27 */ 28 29 /* 30 * Copyright (c) 1989, 1993 31 * The Regents of the University of California. All rights reserved. 32 * 33 * Redistribution and use in source and binary forms, with or without 34 * modification, are permitted provided that the following conditions 35 * are met: 36 * 1. Redistributions of source code must retain the above copyright 37 * notice, this list of conditions and the following disclaimer. 38 * 2. Redistributions in binary form must reproduce the above copyright 39 * notice, this list of conditions and the following disclaimer in the 40 * documentation and/or other materials provided with the distribution. 41 * 3. Neither the name of the University nor the names of its contributors 42 * may be used to endorse or promote products derived from this software 43 * without specific prior written permission. 44 * 45 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 46 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 47 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 48 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 49 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 50 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 51 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 52 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 53 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 54 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 55 * SUCH DAMAGE. 
56 * 57 * @(#)spec_vnops.c 8.15 (Berkeley) 7/14/95 58 */ 59 60 #include <sys/cdefs.h> 61 __KERNEL_RCSID(0, "$NetBSD: spec_vnops.c,v 1.219 2025/01/06 09:45:49 mlelstv Exp $"); 62 63 #ifdef _KERNEL_OPT 64 #include "opt_ddb.h" 65 #endif 66 67 #include <sys/param.h> 68 #include <sys/proc.h> 69 #include <sys/systm.h> 70 #include <sys/kernel.h> 71 #include <sys/conf.h> 72 #include <sys/buf.h> 73 #include <sys/mount.h> 74 #include <sys/namei.h> 75 #include <sys/vnode_impl.h> 76 #include <sys/stat.h> 77 #include <sys/errno.h> 78 #include <sys/ioctl.h> 79 #include <sys/poll.h> 80 #include <sys/file.h> 81 #include <sys/disklabel.h> 82 #include <sys/disk.h> 83 #include <sys/lockf.h> 84 #include <sys/tty.h> 85 #include <sys/kauth.h> 86 #include <sys/fstrans.h> 87 #include <sys/module.h> 88 #include <sys/atomic.h> 89 90 #include <miscfs/genfs/genfs.h> 91 #include <miscfs/specfs/specdev.h> 92 93 #ifdef DDB 94 #include <ddb/ddb.h> 95 #endif 96 97 /* 98 * Lock order: 99 * 100 * vnode lock 101 * -> device_lock 102 * -> struct vnode::v_interlock 103 */ 104 105 /* symbolic sleep message strings for devices */ 106 const char devopn[] = "devopn"; 107 const char devio[] = "devio"; 108 const char devwait[] = "devwait"; 109 const char devin[] = "devin"; 110 const char devout[] = "devout"; 111 const char devioc[] = "devioc"; 112 const char devcls[] = "devcls"; 113 114 #define SPECHSZ 64 115 #if ((SPECHSZ&(SPECHSZ-1)) == 0) 116 #define SPECHASH(rdev) (((rdev>>5)+(rdev))&(SPECHSZ-1)) 117 #else 118 #define SPECHASH(rdev) (((unsigned)((rdev>>5)+(rdev)))%SPECHSZ) 119 #endif 120 121 static vnode_t *specfs_hash[SPECHSZ]; 122 extern struct mount *dead_rootmount; 123 124 /* 125 * This vnode operations vector is used for special device nodes 126 * created from whole cloth by the kernel. For the ops vector for 127 * vnodes built from special devices found in a filesystem, see (e.g) 128 * ffs_specop_entries[] in ffs_vnops.c or the equivalent for other 129 * filesystems. 
130 */ 131 132 int (**spec_vnodeop_p)(void *); 133 const struct vnodeopv_entry_desc spec_vnodeop_entries[] = { 134 { &vop_default_desc, vn_default_error }, 135 { &vop_parsepath_desc, genfs_parsepath }, /* parsepath */ 136 { &vop_lookup_desc, spec_lookup }, /* lookup */ 137 { &vop_create_desc, genfs_badop }, /* create */ 138 { &vop_mknod_desc, genfs_badop }, /* mknod */ 139 { &vop_open_desc, spec_open }, /* open */ 140 { &vop_close_desc, spec_close }, /* close */ 141 { &vop_access_desc, genfs_ebadf }, /* access */ 142 { &vop_accessx_desc, genfs_ebadf }, /* accessx */ 143 { &vop_getattr_desc, genfs_ebadf }, /* getattr */ 144 { &vop_setattr_desc, genfs_ebadf }, /* setattr */ 145 { &vop_read_desc, spec_read }, /* read */ 146 { &vop_write_desc, spec_write }, /* write */ 147 { &vop_fallocate_desc, genfs_eopnotsupp }, /* fallocate */ 148 { &vop_fdiscard_desc, spec_fdiscard }, /* fdiscard */ 149 { &vop_fcntl_desc, genfs_fcntl }, /* fcntl */ 150 { &vop_ioctl_desc, spec_ioctl }, /* ioctl */ 151 { &vop_poll_desc, spec_poll }, /* poll */ 152 { &vop_kqfilter_desc, spec_kqfilter }, /* kqfilter */ 153 { &vop_revoke_desc, genfs_revoke }, /* revoke */ 154 { &vop_mmap_desc, spec_mmap }, /* mmap */ 155 { &vop_fsync_desc, spec_fsync }, /* fsync */ 156 { &vop_seek_desc, spec_seek }, /* seek */ 157 { &vop_remove_desc, genfs_badop }, /* remove */ 158 { &vop_link_desc, genfs_badop }, /* link */ 159 { &vop_rename_desc, genfs_badop }, /* rename */ 160 { &vop_mkdir_desc, genfs_badop }, /* mkdir */ 161 { &vop_rmdir_desc, genfs_badop }, /* rmdir */ 162 { &vop_symlink_desc, genfs_badop }, /* symlink */ 163 { &vop_readdir_desc, genfs_badop }, /* readdir */ 164 { &vop_readlink_desc, genfs_badop }, /* readlink */ 165 { &vop_abortop_desc, genfs_badop }, /* abortop */ 166 { &vop_inactive_desc, spec_inactive }, /* inactive */ 167 { &vop_reclaim_desc, spec_reclaim }, /* reclaim */ 168 { &vop_lock_desc, genfs_lock }, /* lock */ 169 { &vop_unlock_desc, genfs_unlock }, /* unlock */ 170 { &vop_bmap_desc, spec_bmap }, /* bmap */ 171 { &vop_strategy_desc, spec_strategy }, /* strategy */ 172 { &vop_print_desc, spec_print }, /* print */ 173 { &vop_islocked_desc, genfs_islocked }, /* islocked */ 174 { &vop_pathconf_desc, spec_pathconf }, /* pathconf */ 175 { &vop_advlock_desc, spec_advlock }, /* advlock */ 176 { &vop_bwrite_desc, vn_bwrite }, /* bwrite */ 177 { &vop_getpages_desc, genfs_getpages }, /* getpages */ 178 { &vop_putpages_desc, genfs_putpages }, /* putpages */ 179 { NULL, NULL } 180 }; 181 const struct vnodeopv_desc spec_vnodeop_opv_desc = 182 { &spec_vnodeop_p, spec_vnodeop_entries }; 183 184 static kauth_listener_t rawio_listener; 185 static struct kcondvar specfs_iocv; 186 187 /* 188 * Returns true if vnode is /dev/mem or /dev/kmem. 189 */ 190 bool 191 iskmemvp(struct vnode *vp) 192 { 193 return ((vp->v_type == VCHR) && iskmemdev(vp->v_rdev)); 194 } 195 196 /* 197 * Returns true if dev is /dev/mem or /dev/kmem. 
 */
int
iskmemdev(dev_t dev)
{
	/* mem_no is emitted by config(8) to generated devsw.c */
	extern const int mem_no;

	/* minor 14 is /dev/io on i386 with COMPAT_10 */
	return (major(dev) == mem_no && (minor(dev) < 2 || minor(dev) == 14));
}

static int
rawio_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
    void *arg0, void *arg1, void *arg2, void *arg3)
{
	int result;

	result = KAUTH_RESULT_DEFER;

	if ((action != KAUTH_DEVICE_RAWIO_SPEC) &&
	    (action != KAUTH_DEVICE_RAWIO_PASSTHRU))
		return result;

	/* Access is mandated by permissions. */
	result = KAUTH_RESULT_ALLOW;

	return result;
}

void
spec_init(void)
{

	rawio_listener = kauth_listen_scope(KAUTH_SCOPE_DEVICE,
	    rawio_listener_cb, NULL);
	cv_init(&specfs_iocv, "specio");
}

/*
 * spec_io_enter(vp, &sn, &dev)
 *
 *	Enter an operation that may not hold vp's vnode lock or an
 *	fstrans on vp's mount.  Until spec_io_exit, the vnode will not
 *	be revoked.
 *
 *	On success, set sn to the specnode pointer and dev to the dev_t
 *	number and return zero.  Caller must later call spec_io_exit
 *	when done.
 *
 *	On failure, return ENXIO -- the device has been revoked and no
 *	longer exists.
 */
static int
spec_io_enter(struct vnode *vp, struct specnode **snp, dev_t *devp)
{
	dev_t dev;
	struct specnode *sn;
	unsigned iocnt;
	int error = 0;

	mutex_enter(vp->v_interlock);

	/*
	 * Extract all the info we need from the vnode, unless the
	 * vnode has already been reclaimed.  This can happen if the
	 * underlying device has been removed and all the device nodes
	 * for it have been revoked.  The caller may not hold a vnode
	 * lock or fstrans to prevent this from happening before it has
	 * had an opportunity to notice the vnode is dead.
	 */
	if (vdead_check(vp, VDEAD_NOWAIT) != 0 ||
	    (sn = vp->v_specnode) == NULL ||
	    (dev = vp->v_rdev) == NODEV) {
		error = ENXIO;
		goto out;
	}

	/*
	 * Notify spec_close that we are doing an I/O operation which
	 * may not be bracketed by fstrans(9) and thus is not blocked
	 * by vfs suspension.
	 *
	 * We could hold this reference with psref(9) instead, but we
	 * already have to take the interlock for vdead_check, so
	 * there's not much more cost here to another atomic operation.
	 */
	do {
		iocnt = atomic_load_relaxed(&sn->sn_dev->sd_iocnt);
		if (__predict_false(iocnt == UINT_MAX)) {
			/*
			 * The I/O count is limited by the number of
			 * LWPs (which will never overflow this) --
			 * unless one driver uses another driver via
			 * specfs, which is rather unusual, but which
			 * could happen via pud(4) userspace drivers.
			 * We could use a 64-bit count, but can't use
			 * atomics for that on all platforms.
			 * (Probably better to switch to psref or
			 * localcount instead.)
			 */
			error = EBUSY;
			goto out;
		}
	} while (atomic_cas_uint(&sn->sn_dev->sd_iocnt, iocnt, iocnt + 1)
	    != iocnt);

	/* Success! */
	*snp = sn;
	*devp = dev;
	error = 0;

out:	mutex_exit(vp->v_interlock);
	return error;
}

/*
 * spec_io_exit(vp, sn)
 *
 *	Exit an operation entered with a successful spec_io_enter --
 *	allow concurrent spec_node_revoke to proceed.  The argument sn
 *	must match the struct specnode pointer returned by
 *	spec_io_enter for vp.
 */
static void
spec_io_exit(struct vnode *vp, struct specnode *sn)
{
	struct specdev *sd = sn->sn_dev;
	unsigned iocnt;

	KASSERT(vp->v_specnode == sn);

	/*
	 * We are done.  Notify spec_close if appropriate.  The
	 * transition of 1 -> 0 must happen under device_lock so
	 * spec_close doesn't miss a wakeup.
	 */
	do {
		iocnt = atomic_load_relaxed(&sd->sd_iocnt);
		KASSERT(iocnt > 0);
		if (iocnt == 1) {
			mutex_enter(&device_lock);
			if (atomic_dec_uint_nv(&sd->sd_iocnt) == 0)
				cv_broadcast(&specfs_iocv);
			mutex_exit(&device_lock);
			break;
		}
	} while (atomic_cas_uint(&sd->sd_iocnt, iocnt, iocnt - 1) != iocnt);
}
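/*
 * Illustrative sketch (compiled out, not part of this file): the
 * canonical way a vop brackets a devsw call with spec_io_enter and
 * spec_io_exit, mirroring what spec_ioctl and spec_poll below do.
 * The helper name and the particular cdev operation are hypothetical.
 */
#if 0
static int
example_spec_devsw_call(struct vnode *vp, u_long cmd, void *data)
{
	struct specnode *sn;
	dev_t dev;
	int error;

	/* Fails with ENXIO if vp has already been revoked. */
	error = spec_io_enter(vp, &sn, &dev);
	if (error)
		return error;

	/* The device may be touched now; a concurrent close waits for us. */
	error = cdev_ioctl(dev, cmd, data, FREAD, curlwp);

	/* Drop the I/O reference so close/revoke can drain. */
	spec_io_exit(vp, sn);
	return error;
}
#endif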
/*
 * spec_io_drain(sd)
 *
 *	Wait for all existing spec_io_enter/exit sections to complete.
 *	Caller must ensure spec_io_enter will fail at this point.
 */
static void
spec_io_drain(struct specdev *sd)
{

	/*
	 * I/O at the same time as closing is unlikely -- it often
	 * indicates an application bug.
	 */
	if (__predict_true(atomic_load_relaxed(&sd->sd_iocnt) == 0))
		return;

	mutex_enter(&device_lock);
	while (atomic_load_relaxed(&sd->sd_iocnt) > 0)
		cv_wait(&specfs_iocv, &device_lock);
	mutex_exit(&device_lock);
}

/*
 * Initialize a vnode that represents a device.
 */
void
spec_node_init(vnode_t *vp, dev_t rdev)
{
	specnode_t *sn;
	specdev_t *sd;
	vnode_t *vp2;
	vnode_t **vpp;

	KASSERT(vp->v_type == VBLK || vp->v_type == VCHR);
	KASSERT(vp->v_specnode == NULL);

	/*
	 * Search the hash table for this device.  If known, add a
	 * reference to the device structure.  If not known, create
	 * a new entry to represent the device.  In all cases add
	 * the vnode to the hash table.
	 */
	sn = kmem_alloc(sizeof(*sn), KM_SLEEP);
	sd = kmem_alloc(sizeof(*sd), KM_SLEEP);
	mutex_enter(&device_lock);
	vpp = &specfs_hash[SPECHASH(rdev)];
	for (vp2 = *vpp; vp2 != NULL; vp2 = vp2->v_specnext) {
		KASSERT(vp2->v_specnode != NULL);
		if (rdev == vp2->v_rdev && vp->v_type == vp2->v_type) {
			break;
		}
	}
	if (vp2 == NULL) {
		/* No existing record, create a new one. */
		sd->sd_mountpoint = NULL;
		sd->sd_lockf = NULL;
		sd->sd_refcnt = 1;
		sd->sd_opencnt = 0;
		sd->sd_bdevvp = NULL;
		sd->sd_iocnt = 0;
		sd->sd_opened = false;
		sd->sd_closing = false;
		sn->sn_dev = sd;
		sd = NULL;
	} else {
		/* Use the existing record. */
		sn->sn_dev = vp2->v_specnode->sn_dev;
		sn->sn_dev->sd_refcnt++;
	}
	/* Insert vnode into the hash chain. */
	sn->sn_opencnt = 0;
	sn->sn_rdev = rdev;
	sn->sn_gone = false;
	vp->v_specnode = sn;
	vp->v_specnext = *vpp;
	*vpp = vp;
	mutex_exit(&device_lock);

	/* Free the record we allocated if unused. */
	if (sd != NULL) {
		kmem_free(sd, sizeof(*sd));
	}
}
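/*
 * Illustrative sketch (compiled out, not part of this file): finding
 * the file system, if any, mounted on a block device given only its
 * dev_t, using spec_node_lookup_by_dev and spec_node_getmountedfs
 * defined below.  The helper name is hypothetical and error and
 * lifetime handling are minimal -- once the lock and reference are
 * dropped, the mount pointer is only a hint.
 */
#if 0
static struct mount *
example_mount_for_dev(dev_t dev)
{
	struct vnode *vp;
	struct mount *mp;

	/* VDEAD_NOWAIT: skip vnodes that are currently being revoked. */
	if (spec_node_lookup_by_dev(VBLK, dev, VDEAD_NOWAIT, &vp) != 0)
		return NULL;

	/* Hold the vnode lock while looking at the mount point. */
	vn_lock(vp, LK_SHARED | LK_RETRY);
	mp = spec_node_getmountedfs(vp);
	VOP_UNLOCK(vp);

	vrele(vp);		/* drop the reference from the lookup */
	return mp;
}
#endif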
/*
 * Lookup a vnode by device number and return it referenced.
 */
int
spec_node_lookup_by_dev(enum vtype type, dev_t dev, int flags, vnode_t **vpp)
{
	int error;
	vnode_t *vp;

top:	mutex_enter(&device_lock);
	for (vp = specfs_hash[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
		if (type == vp->v_type && dev == vp->v_rdev) {
			mutex_enter(vp->v_interlock);
			/* If clean or being cleaned, then ignore it. */
			if (vdead_check(vp, VDEAD_NOWAIT) == 0)
				break;
			if ((flags & VDEAD_NOWAIT) == 0) {
				mutex_exit(&device_lock);
				/*
				 * It may be being revoked as we speak,
				 * and the caller wants to wait until
				 * all revocation has completed.  Let
				 * vcache_vget wait for it to finish
				 * dying; as a side effect, vcache_vget
				 * releases vp->v_interlock.  Note that
				 * vcache_vget cannot succeed at this
				 * point because vdead_check already
				 * failed.
				 */
				error = vcache_vget(vp);
				KASSERT(error);
				goto top;
			}
			mutex_exit(vp->v_interlock);
		}
	}
	KASSERT(vp == NULL || mutex_owned(vp->v_interlock));
	if (vp == NULL) {
		mutex_exit(&device_lock);
		return ENOENT;
	}
	/*
	 * If it is an opened block device return the opened vnode.
	 */
	if (type == VBLK && vp->v_specnode->sn_dev->sd_bdevvp != NULL) {
		mutex_exit(vp->v_interlock);
		vp = vp->v_specnode->sn_dev->sd_bdevvp;
		mutex_enter(vp->v_interlock);
	}
	mutex_exit(&device_lock);
	error = vcache_vget(vp);
	if (error)
		return error;
	*vpp = vp;

	return 0;
}

/*
 * Lookup a vnode by file system mounted on and return it referenced.
 */
int
spec_node_lookup_by_mount(struct mount *mp, vnode_t **vpp)
{
	int i, error;
	vnode_t *vp, *vq;

	mutex_enter(&device_lock);
	for (i = 0, vq = NULL; i < SPECHSZ && vq == NULL; i++) {
		for (vp = specfs_hash[i]; vp; vp = vp->v_specnext) {
			if (vp->v_type != VBLK)
				continue;
			vq = vp->v_specnode->sn_dev->sd_bdevvp;
			if (vq != NULL &&
			    vq->v_specnode->sn_dev->sd_mountpoint == mp)
				break;
			vq = NULL;
		}
	}
	if (vq == NULL) {
		mutex_exit(&device_lock);
		return ENOENT;
	}
	mutex_enter(vq->v_interlock);
	mutex_exit(&device_lock);
	error = vcache_vget(vq);
	if (error)
		return error;
	*vpp = vq;

	return 0;
}

/*
 * Get the file system mounted on this block device.
 *
 * XXX Caller should hold the vnode lock -- shared or exclusive -- so
 * that this can't be changed, and the vnode can't be revoked while we
 * examine it.  But not all callers do, and they're scattered through a
 * lot of file systems, so we can't assert this yet.
 */
struct mount *
spec_node_getmountedfs(vnode_t *devvp)
{
	struct mount *mp;

	KASSERT(devvp->v_type == VBLK);
	mp = devvp->v_specnode->sn_dev->sd_mountpoint;

	return mp;
}

/*
 * Set the file system mounted on this block device.
 *
 * XXX Caller should hold the vnode lock exclusively so this can't be
 * changed or assumed by spec_node_getmountedfs while we change it, and
 * the vnode can't be revoked while we handle it.  But not all callers
 * do, and they're scattered through a lot of file systems, so we can't
 * assert this yet.  Instead, for now, we'll take an I/O reference so
 * at least the ioctl doesn't race with revoke/detach.
 *
 * If you do change this to assert an exclusive vnode lock, you must
 * also do vdead_check before trying bdev_ioctl, because the vnode may
 * have been revoked by the time the caller locked it, and this is
 * _not_ a vop -- calls to spec_node_setmountedfs don't go through
 * v_op, so revoking the vnode doesn't prevent further calls.
 *
 * XXX Caller should additionally have the vnode open, at least if mp
 * is nonnull, but I'm not sure all callers do that -- need to audit.
563 * Currently udf closes the vnode before clearing the mount. 564 */ 565 void 566 spec_node_setmountedfs(vnode_t *devvp, struct mount *mp) 567 { 568 struct dkwedge_info dkw; 569 struct specnode *sn; 570 dev_t dev; 571 int error; 572 573 KASSERT(devvp->v_type == VBLK); 574 575 error = spec_io_enter(devvp, &sn, &dev); 576 if (error) 577 return; 578 579 KASSERT(sn->sn_dev->sd_mountpoint == NULL || mp == NULL); 580 sn->sn_dev->sd_mountpoint = mp; 581 if (mp == NULL) 582 goto out; 583 584 error = bdev_ioctl(dev, DIOCGWEDGEINFO, &dkw, FREAD, curlwp); 585 if (error) 586 goto out; 587 588 strlcpy(mp->mnt_stat.f_mntfromlabel, dkw.dkw_wname, 589 sizeof(mp->mnt_stat.f_mntfromlabel)); 590 591 out: spec_io_exit(devvp, sn); 592 } 593 594 /* 595 * A vnode representing a special device is going away. Close 596 * the device if the vnode holds it open. 597 */ 598 void 599 spec_node_revoke(vnode_t *vp) 600 { 601 specnode_t *sn; 602 specdev_t *sd; 603 struct vnode **vpp; 604 605 KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE); 606 607 sn = vp->v_specnode; 608 sd = sn->sn_dev; 609 610 KASSERT(vp->v_type == VBLK || vp->v_type == VCHR); 611 KASSERT(vp->v_specnode != NULL); 612 KASSERT(sn->sn_gone == false); 613 614 mutex_enter(&device_lock); 615 KASSERTMSG(sn->sn_opencnt <= sd->sd_opencnt, 616 "sn_opencnt=%u > sd_opencnt=%u", 617 sn->sn_opencnt, sd->sd_opencnt); 618 sn->sn_gone = true; 619 if (sn->sn_opencnt != 0) { 620 sd->sd_opencnt -= (sn->sn_opencnt - 1); 621 sn->sn_opencnt = 1; 622 mutex_exit(&device_lock); 623 624 VOP_CLOSE(vp, FNONBLOCK, NOCRED); 625 626 mutex_enter(&device_lock); 627 KASSERT(sn->sn_opencnt == 0); 628 } 629 630 /* 631 * We may have revoked the vnode in this thread while another 632 * thread was in the middle of spec_close, in the window when 633 * spec_close releases the vnode lock to call .d_close for the 634 * last close. In that case, wait for the concurrent 635 * spec_close to complete. 636 */ 637 while (sd->sd_closing) 638 cv_wait(&specfs_iocv, &device_lock); 639 640 /* 641 * Remove from the hash so lookups stop returning this 642 * specnode. We will dissociate it from the specdev -- and 643 * possibly free the specdev -- in spec_node_destroy. 644 */ 645 KASSERT(sn->sn_gone); 646 KASSERT(sn->sn_opencnt == 0); 647 for (vpp = &specfs_hash[SPECHASH(vp->v_rdev)];; 648 vpp = &(*vpp)->v_specnext) { 649 if (*vpp == vp) { 650 *vpp = vp->v_specnext; 651 vp->v_specnext = NULL; 652 break; 653 } 654 } 655 mutex_exit(&device_lock); 656 } 657 658 /* 659 * A vnode representing a special device is being recycled. 660 * Destroy the specfs component. 661 */ 662 void 663 spec_node_destroy(vnode_t *vp) 664 { 665 specnode_t *sn; 666 specdev_t *sd; 667 int refcnt; 668 669 sn = vp->v_specnode; 670 sd = sn->sn_dev; 671 672 KASSERT(vp->v_type == VBLK || vp->v_type == VCHR); 673 KASSERT(vp->v_specnode != NULL); 674 KASSERT(sn->sn_opencnt == 0); 675 676 mutex_enter(&device_lock); 677 sn = vp->v_specnode; 678 vp->v_specnode = NULL; 679 refcnt = sd->sd_refcnt--; 680 KASSERT(refcnt > 0); 681 mutex_exit(&device_lock); 682 683 /* If the device is no longer in use, destroy our record. */ 684 if (refcnt == 1) { 685 KASSERT(sd->sd_iocnt == 0); 686 KASSERT(sd->sd_opencnt == 0); 687 KASSERT(sd->sd_bdevvp == NULL); 688 kmem_free(sd, sizeof(*sd)); 689 } 690 kmem_free(sn, sizeof(*sn)); 691 } 692 693 /* 694 * Trivial lookup routine that always fails. 
695 */ 696 int 697 spec_lookup(void *v) 698 { 699 struct vop_lookup_v2_args /* { 700 struct vnode *a_dvp; 701 struct vnode **a_vpp; 702 struct componentname *a_cnp; 703 } */ *ap = v; 704 705 *ap->a_vpp = NULL; 706 return ENOTDIR; 707 } 708 709 typedef int (*spec_ioctl_t)(dev_t, u_long, void *, int, struct lwp *); 710 711 /* 712 * Open a special file. 713 */ 714 /* ARGSUSED */ 715 int 716 spec_open(void *v) 717 { 718 struct vop_open_args /* { 719 struct vnode *a_vp; 720 int a_mode; 721 kauth_cred_t a_cred; 722 } */ *ap = v; 723 struct lwp *l = curlwp; 724 struct vnode *vp = ap->a_vp; 725 dev_t dev, dev1; 726 int error; 727 enum kauth_device_req req; 728 specnode_t *sn, *sn1; 729 specdev_t *sd; 730 int dtype; 731 spec_ioctl_t ioctl; 732 u_int gen = 0; 733 const char *name = NULL; 734 bool needclose = false; 735 736 KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE); 737 KASSERTMSG(vp->v_type == VBLK || vp->v_type == VCHR, "type=%d", 738 vp->v_type); 739 740 dev = vp->v_rdev; 741 sn = vp->v_specnode; 742 sd = sn->sn_dev; 743 744 /* 745 * Don't allow open if fs is mounted -nodev. 746 */ 747 if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_NODEV)) 748 return ENXIO; 749 750 switch (ap->a_mode & (FREAD | FWRITE)) { 751 case FREAD | FWRITE: 752 req = KAUTH_REQ_DEVICE_RAWIO_SPEC_RW; 753 break; 754 case FWRITE: 755 req = KAUTH_REQ_DEVICE_RAWIO_SPEC_WRITE; 756 break; 757 default: 758 req = KAUTH_REQ_DEVICE_RAWIO_SPEC_READ; 759 break; 760 } 761 error = kauth_authorize_device_spec(ap->a_cred, req, vp); 762 if (error) 763 return error; 764 765 /* 766 * Acquire an open reference -- as long as we hold onto it, and 767 * the vnode isn't revoked, it can't be closed, and the vnode 768 * can't be revoked until we release the vnode lock. 769 */ 770 mutex_enter(&device_lock); 771 KASSERT(!sn->sn_gone); 772 switch (vp->v_type) { 773 case VCHR: 774 /* 775 * Character devices can accept opens from multiple 776 * vnodes. But first, wait for any close to finish. 777 * Wait under the vnode lock so we don't have to worry 778 * about the vnode being revoked while we wait. 779 */ 780 while (sd->sd_closing) { 781 error = cv_wait_sig(&specfs_iocv, &device_lock); 782 if (error) 783 break; 784 } 785 if (error) 786 break; 787 sd->sd_opencnt++; 788 sn->sn_opencnt++; 789 KASSERTMSG(sn->sn_opencnt <= sd->sd_opencnt, 790 "sn_opencnt=%u > sd_opencnt=%u", 791 sn->sn_opencnt, sd->sd_opencnt); 792 break; 793 case VBLK: 794 /* 795 * For block devices, permit only one open. The buffer 796 * cache cannot remain self-consistent with multiple 797 * vnodes holding a block device open. 798 * 799 * Treat zero opencnt with non-NULL mountpoint as open. 800 * This may happen after forced detach of a mounted device. 801 * 802 * Also treat sd_closing, meaning there is a concurrent 803 * close in progress, as still open. 804 */ 805 if (sd->sd_opencnt != 0 || 806 sd->sd_mountpoint != NULL || 807 sd->sd_closing) { 808 error = EBUSY; 809 break; 810 } 811 KASSERTMSG(sn->sn_opencnt == 0, "sn_opencnt=%u", 812 sn->sn_opencnt); 813 sn->sn_opencnt = 1; 814 sd->sd_opencnt = 1; 815 sd->sd_bdevvp = vp; 816 break; 817 default: 818 panic("invalid specfs vnode type: %d", vp->v_type); 819 } 820 mutex_exit(&device_lock); 821 if (error) 822 return error; 823 824 /* 825 * Set VV_ISTTY if this is a tty cdev. 826 * 827 * XXX This does the wrong thing if the module has to be 828 * autoloaded. 
We should maybe set this after autoloading 829 * modules and calling .d_open successfully, except (a) we need 830 * the vnode lock to touch it, and (b) once we acquire the 831 * vnode lock again, the vnode may have been revoked, and 832 * deadfs's dead_read needs VV_ISTTY to be already set in order 833 * to return the right answer. So this needs some additional 834 * synchronization to be made to work correctly with tty driver 835 * module autoload. For now, let's just hope it doesn't cause 836 * too much trouble for a tty from an autoloaded driver module 837 * to fail with EIO instead of returning EOF. 838 */ 839 if (vp->v_type == VCHR) { 840 if (cdev_type(dev) == D_TTY) 841 vp->v_vflag |= VV_ISTTY; 842 } 843 844 /* 845 * Because opening the device may block indefinitely, e.g. when 846 * opening a tty, and loading a module may cross into many 847 * other subsystems, we must not hold the vnode lock while 848 * calling .d_open, so release it now and reacquire it when 849 * done. 850 * 851 * Take an I/O reference so that any concurrent spec_close via 852 * spec_node_revoke will wait for us to finish calling .d_open. 853 * The vnode can't be dead at this point because we have it 854 * locked. Note that if revoked, the driver must interrupt 855 * .d_open before spec_close starts waiting for I/O to drain so 856 * this doesn't deadlock. 857 */ 858 VOP_UNLOCK(vp); 859 error = spec_io_enter(vp, &sn1, &dev1); 860 if (error) { 861 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 862 return error; 863 } 864 KASSERT(sn1 == sn); 865 KASSERT(dev1 == dev); 866 867 /* 868 * Open the device. If .d_open returns ENXIO (device not 869 * configured), the driver may not be loaded, so try 870 * autoloading a module and then try .d_open again if anything 871 * got loaded. 872 */ 873 switch (vp->v_type) { 874 case VCHR: 875 do { 876 const struct cdevsw *cdev; 877 878 gen = module_gen; 879 error = cdev_open(dev, ap->a_mode, S_IFCHR, l); 880 if (error != ENXIO) 881 break; 882 883 /* Check if we already have a valid driver */ 884 mutex_enter(&device_lock); 885 cdev = cdevsw_lookup(dev); 886 mutex_exit(&device_lock); 887 if (cdev != NULL) 888 break; 889 890 /* Get device name from devsw_conv array */ 891 if ((name = cdevsw_getname(major(dev))) == NULL) 892 break; 893 894 /* Try to autoload device module */ 895 (void)module_autoload(name, MODULE_CLASS_DRIVER); 896 } while (gen != module_gen); 897 break; 898 899 case VBLK: 900 do { 901 const struct bdevsw *bdev; 902 903 gen = module_gen; 904 error = bdev_open(dev, ap->a_mode, S_IFBLK, l); 905 if (error != ENXIO) 906 break; 907 908 /* Check if we already have a valid driver */ 909 mutex_enter(&device_lock); 910 bdev = bdevsw_lookup(dev); 911 mutex_exit(&device_lock); 912 if (bdev != NULL) 913 break; 914 915 /* Get device name from devsw_conv array */ 916 if ((name = bdevsw_getname(major(dev))) == NULL) 917 break; 918 919 /* Try to autoload device module */ 920 (void)module_autoload(name, MODULE_CLASS_DRIVER); 921 } while (gen != module_gen); 922 break; 923 924 default: 925 __unreachable(); 926 } 927 928 /* 929 * Release the I/O reference now that we have called .d_open, 930 * and reacquire the vnode lock. At this point, the device may 931 * have been revoked, so we must tread carefully. However, sn 932 * and sd remain valid pointers until we drop our reference. 
933 */ 934 spec_io_exit(vp, sn); 935 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 936 KASSERT(vp->v_specnode == sn); 937 938 /* 939 * If it has been revoked since we released the vnode lock and 940 * reacquired it, then spec_node_revoke has closed it, and we 941 * must fail with EBADF. 942 * 943 * Otherwise, if opening it failed, back out and release the 944 * open reference. If it was ever successfully opened and we 945 * got the last reference this way, it's now our job to close 946 * it. This might happen in the following scenario: 947 * 948 * Thread 1 Thread 2 949 * VOP_OPEN 950 * ... 951 * .d_open -> 0 (success) 952 * acquire vnode lock 953 * do stuff VOP_OPEN 954 * release vnode lock ... 955 * .d_open -> EBUSY 956 * VOP_CLOSE 957 * acquire vnode lock 958 * --sd_opencnt != 0 959 * => no .d_close 960 * release vnode lock 961 * acquire vnode lock 962 * --sd_opencnt == 0 963 * 964 * We can't resolve this by making spec_close wait for .d_open 965 * to complete before examining sd_opencnt, because .d_open can 966 * hang indefinitely, e.g. for a tty. 967 */ 968 mutex_enter(&device_lock); 969 if (sn->sn_gone) { 970 if (error == 0) 971 error = EBADF; 972 } else if (error == 0) { 973 /* 974 * Device has not been revoked, so our opencnt can't 975 * have gone away at this point -- transition to 976 * sn_gone=true happens before transition to 977 * sn_opencnt=0 in spec_node_revoke. 978 */ 979 KASSERT(sd->sd_opencnt); 980 KASSERT(sn->sn_opencnt); 981 KASSERTMSG(sn->sn_opencnt <= sd->sd_opencnt, 982 "sn_opencnt=%u > sd_opencnt=%u", 983 sn->sn_opencnt, sd->sd_opencnt); 984 KASSERT(!sd->sd_closing); 985 sd->sd_opened = true; 986 } else if (sd->sd_opencnt == 1 && sd->sd_opened) { 987 /* 988 * We're the last reference to a _previous_ open even 989 * though this one failed, so we have to close it. 990 * Don't decrement the reference count here -- 991 * spec_close will do that. 992 */ 993 KASSERT(sn->sn_opencnt == 1); 994 needclose = true; 995 } else { 996 KASSERT(sd->sd_opencnt); 997 KASSERT(sn->sn_opencnt); 998 KASSERTMSG(sn->sn_opencnt <= sd->sd_opencnt, 999 "sn_opencnt=%u > sd_opencnt=%u", 1000 sn->sn_opencnt, sd->sd_opencnt); 1001 sd->sd_opencnt--; 1002 sn->sn_opencnt--; 1003 if (vp->v_type == VBLK) 1004 sd->sd_bdevvp = NULL; 1005 } 1006 mutex_exit(&device_lock); 1007 1008 /* 1009 * If this open failed, but the device was previously opened, 1010 * and another thread concurrently closed the vnode while we 1011 * were in the middle of reopening it, the other thread will 1012 * see sd_opencnt > 0 and thus decide not to call .d_close -- 1013 * it is now our responsibility to do so. 1014 * 1015 * XXX The flags passed to VOP_CLOSE here are wrong, but 1016 * drivers can't rely on FREAD|FWRITE anyway -- e.g., consider 1017 * a device opened by thread 0 with O_READ, then opened by 1018 * thread 1 with O_WRITE, then closed by thread 0, and finally 1019 * closed by thread 1; the last .d_close call will have FWRITE 1020 * but not FREAD. We should just eliminate the FREAD/FWRITE 1021 * parameter to .d_close altogether. 1022 */ 1023 if (needclose) { 1024 KASSERT(error); 1025 VOP_CLOSE(vp, FNONBLOCK, NOCRED); 1026 } 1027 1028 /* If anything went wrong, we're done. */ 1029 if (error) 1030 return error; 1031 1032 /* 1033 * For disk devices, automagically set the vnode size to the 1034 * partition size, if we can. This applies to block devices 1035 * and character devices alike -- every block device must have 1036 * a corresponding character device. 
And if the module is 1037 * loaded it will remain loaded until we're done here (it is 1038 * forbidden to devsw_detach until closed). So it is safe to 1039 * query cdev_type unconditionally here. 1040 */ 1041 switch (vp->v_type) { 1042 case VCHR: 1043 ioctl = cdev_ioctl; 1044 dtype = cdev_type(dev); 1045 break; 1046 default: 1047 ioctl = bdev_ioctl; 1048 dtype = bdev_type(dev); 1049 break; 1050 } 1051 if (dtype == D_DISK) { 1052 struct partinfo pi; 1053 off_t sz; 1054 1055 error = (*ioctl)(dev, DIOCGPARTINFO, &pi, FREAD, curlwp); 1056 if (error == 0) 1057 sz = (off_t)pi.pi_size * pi.pi_secsize; 1058 else if (error == ENOTTY) 1059 error = (*ioctl)(dev, DIOCGMEDIASIZE, &sz, FREAD, curlwp); 1060 1061 if (error == 0) 1062 uvm_vnp_setsize(vp, (voff_t)sz); 1063 } 1064 1065 /* Success! */ 1066 return 0; 1067 } 1068 1069 /* 1070 * Vnode op for read 1071 */ 1072 /* ARGSUSED */ 1073 int 1074 spec_read(void *v) 1075 { 1076 struct vop_read_args /* { 1077 struct vnode *a_vp; 1078 struct uio *a_uio; 1079 int a_ioflag; 1080 kauth_cred_t a_cred; 1081 } */ *ap = v; 1082 struct vnode *vp = ap->a_vp; 1083 struct uio *uio = ap->a_uio; 1084 struct lwp *l = curlwp; 1085 struct specnode *sn; 1086 dev_t dev; 1087 struct buf *bp; 1088 daddr_t bn; 1089 int bsize, bscale; 1090 struct partinfo pi; 1091 int n, on; 1092 int error = 0; 1093 int i, nra; 1094 daddr_t lastbn, *rablks; 1095 int *rasizes; 1096 int nrablks, ratogo; 1097 1098 KASSERT(uio->uio_rw == UIO_READ); 1099 KASSERTMSG((VMSPACE_IS_KERNEL_P(uio->uio_vmspace) || 1100 uio->uio_vmspace == curproc->p_vmspace), 1101 "vmspace belongs to neither kernel nor curproc"); 1102 1103 if (uio->uio_resid == 0) 1104 return 0; 1105 1106 switch (vp->v_type) { 1107 1108 case VCHR: 1109 /* 1110 * Release the lock while we sleep -- possibly 1111 * indefinitely, if this is, e.g., a tty -- in 1112 * cdev_read, so we don't hold up everything else that 1113 * might want access to the vnode. 1114 * 1115 * But before we issue the read, take an I/O reference 1116 * to the specnode so close will know when we're done 1117 * reading. Note that the moment we release the lock, 1118 * the vnode's identity may change; hence spec_io_enter 1119 * may fail, and the caller may have a dead vnode on 1120 * their hands, if the file system on which vp lived 1121 * has been unmounted. 1122 */ 1123 VOP_UNLOCK(vp); 1124 error = spec_io_enter(vp, &sn, &dev); 1125 if (error) 1126 goto out; 1127 error = cdev_read(dev, uio, ap->a_ioflag); 1128 spec_io_exit(vp, sn); 1129 out: /* XXX What if the caller held an exclusive lock? 
*/ 1130 vn_lock(vp, LK_SHARED | LK_RETRY); 1131 return error; 1132 1133 case VBLK: 1134 KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp); 1135 if (uio->uio_offset < 0) 1136 return EINVAL; 1137 1138 if (bdev_ioctl(vp->v_rdev, DIOCGPARTINFO, &pi, FREAD, l) == 0) 1139 bsize = imin(imax(pi.pi_bsize, DEV_BSIZE), MAXBSIZE); 1140 else 1141 bsize = BLKDEV_IOSIZE; 1142 1143 bscale = bsize >> DEV_BSHIFT; 1144 1145 nra = uimax(16 * MAXPHYS / bsize - 1, 511); 1146 rablks = kmem_alloc(nra * sizeof(*rablks), KM_SLEEP); 1147 rasizes = kmem_alloc(nra * sizeof(*rasizes), KM_SLEEP); 1148 lastbn = ((uio->uio_offset + uio->uio_resid - 1) >> DEV_BSHIFT) 1149 &~ (bscale - 1); 1150 nrablks = ratogo = 0; 1151 do { 1152 bn = (uio->uio_offset >> DEV_BSHIFT) &~ (bscale - 1); 1153 on = uio->uio_offset % bsize; 1154 n = uimin((unsigned)(bsize - on), uio->uio_resid); 1155 1156 if (ratogo == 0) { 1157 nrablks = uimin((lastbn - bn) / bscale, nra); 1158 ratogo = nrablks; 1159 1160 for (i = 0; i < nrablks; ++i) { 1161 rablks[i] = bn + (i+1) * bscale; 1162 rasizes[i] = bsize; 1163 } 1164 1165 error = breadn(vp, bn, bsize, 1166 rablks, rasizes, nrablks, 1167 0, &bp); 1168 } else { 1169 if (ratogo > 0) 1170 --ratogo; 1171 error = bread(vp, bn, bsize, 0, &bp); 1172 } 1173 if (error) 1174 break; 1175 n = uimin(n, bsize - bp->b_resid); 1176 error = uiomove((char *)bp->b_data + on, n, uio); 1177 brelse(bp, 0); 1178 } while (error == 0 && uio->uio_resid > 0 && n != 0); 1179 1180 kmem_free(rablks, nra * sizeof(*rablks)); 1181 kmem_free(rasizes, nra * sizeof(*rasizes)); 1182 1183 return error; 1184 1185 default: 1186 panic("spec_read type"); 1187 } 1188 /* NOTREACHED */ 1189 } 1190 1191 /* 1192 * Vnode op for write 1193 */ 1194 /* ARGSUSED */ 1195 int 1196 spec_write(void *v) 1197 { 1198 struct vop_write_args /* { 1199 struct vnode *a_vp; 1200 struct uio *a_uio; 1201 int a_ioflag; 1202 kauth_cred_t a_cred; 1203 } */ *ap = v; 1204 struct vnode *vp = ap->a_vp; 1205 struct uio *uio = ap->a_uio; 1206 struct lwp *l = curlwp; 1207 struct specnode *sn; 1208 dev_t dev; 1209 struct buf *bp; 1210 daddr_t bn; 1211 int bsize, bscale; 1212 struct partinfo pi; 1213 int n, on; 1214 int error = 0; 1215 1216 KASSERT(uio->uio_rw == UIO_WRITE); 1217 KASSERTMSG((VMSPACE_IS_KERNEL_P(uio->uio_vmspace) || 1218 uio->uio_vmspace == curproc->p_vmspace), 1219 "vmspace belongs to neither kernel nor curproc"); 1220 1221 switch (vp->v_type) { 1222 1223 case VCHR: 1224 /* 1225 * Release the lock while we sleep -- possibly 1226 * indefinitely, if this is, e.g., a tty -- in 1227 * cdev_write, so we don't hold up everything else that 1228 * might want access to the vnode. 1229 * 1230 * But before we issue the write, take an I/O reference 1231 * to the specnode so close will know when we're done 1232 * writing. Note that the moment we release the lock, 1233 * the vnode's identity may change; hence spec_io_enter 1234 * may fail, and the caller may have a dead vnode on 1235 * their hands, if the file system on which vp lived 1236 * has been unmounted. 
1237 */ 1238 VOP_UNLOCK(vp); 1239 error = spec_io_enter(vp, &sn, &dev); 1240 if (error) 1241 goto out; 1242 error = cdev_write(dev, uio, ap->a_ioflag); 1243 spec_io_exit(vp, sn); 1244 out: vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 1245 return error; 1246 1247 case VBLK: 1248 KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp); 1249 if (uio->uio_resid == 0) 1250 return 0; 1251 if (uio->uio_offset < 0) 1252 return EINVAL; 1253 1254 if (bdev_ioctl(vp->v_rdev, DIOCGPARTINFO, &pi, FREAD, l) == 0) 1255 bsize = imin(imax(pi.pi_bsize, DEV_BSIZE), MAXBSIZE); 1256 else 1257 bsize = BLKDEV_IOSIZE; 1258 1259 bscale = bsize >> DEV_BSHIFT; 1260 do { 1261 bn = (uio->uio_offset >> DEV_BSHIFT) &~ (bscale - 1); 1262 on = uio->uio_offset % bsize; 1263 n = uimin((unsigned)(bsize - on), uio->uio_resid); 1264 if (n == bsize) 1265 bp = getblk(vp, bn, bsize, 0, 0); 1266 else 1267 error = bread(vp, bn, bsize, B_MODIFY, &bp); 1268 if (error) { 1269 return error; 1270 } 1271 n = uimin(n, bsize - bp->b_resid); 1272 error = uiomove((char *)bp->b_data + on, n, uio); 1273 if (error) 1274 brelse(bp, 0); 1275 else { 1276 if (n + on == bsize) 1277 bawrite(bp); 1278 else 1279 bdwrite(bp); 1280 error = bp->b_error; 1281 } 1282 } while (error == 0 && uio->uio_resid > 0 && n != 0); 1283 return error; 1284 1285 default: 1286 panic("spec_write type"); 1287 } 1288 /* NOTREACHED */ 1289 } 1290 1291 /* 1292 * fdiscard, which on disk devices becomes TRIM. 1293 */ 1294 int 1295 spec_fdiscard(void *v) 1296 { 1297 struct vop_fdiscard_args /* { 1298 struct vnode *a_vp; 1299 off_t a_pos; 1300 off_t a_len; 1301 } */ *ap = v; 1302 struct vnode *vp = ap->a_vp; 1303 dev_t dev; 1304 1305 KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE); 1306 1307 dev = vp->v_rdev; 1308 1309 switch (vp->v_type) { 1310 case VCHR: 1311 #if 0 /* This is not stored for character devices. */ 1312 KASSERT(vp == vp->v_specnode->sn_dev->sd_cdevvp); 1313 #endif 1314 return cdev_discard(dev, ap->a_pos, ap->a_len); 1315 case VBLK: 1316 KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp); 1317 return bdev_discard(dev, ap->a_pos, ap->a_len); 1318 default: 1319 panic("spec_fdiscard: not a device\n"); 1320 } 1321 } 1322 1323 /* 1324 * Device ioctl operation. 
 */
/* ARGSUSED */
int
spec_ioctl(void *v)
{
	struct vop_ioctl_args /* {
		struct vnode *a_vp;
		u_long a_command;
		void *a_data;
		int a_fflag;
		kauth_cred_t a_cred;
	} */ *ap = v;
	struct vnode *vp = ap->a_vp;
	struct specnode *sn;
	dev_t dev;
	int error;

	error = spec_io_enter(vp, &sn, &dev);
	if (error)
		return error;

	switch (vp->v_type) {
	case VCHR:
		error = cdev_ioctl(dev, ap->a_command, ap->a_data,
		    ap->a_fflag, curlwp);
		break;
	case VBLK:
		KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp);
		error = bdev_ioctl(dev, ap->a_command, ap->a_data,
		    ap->a_fflag, curlwp);
		break;
	default:
		panic("spec_ioctl");
		/* NOTREACHED */
	}

	spec_io_exit(vp, sn);
	return error;
}
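/*
 * Illustrative sketch (compiled out, not part of this file): issuing
 * an ioctl on a device vnode from elsewhere in the kernel.  VOP_IOCTL
 * on a specfs vnode is called without the vnode lock held; spec_ioctl
 * above takes its own I/O reference instead.  The helper name is
 * hypothetical.
 */
#if 0
static int
example_get_wedge_info(struct vnode *devvp, struct dkwedge_info *dkw)
{

	KASSERT(devvp->v_type == VBLK || devvp->v_type == VCHR);
	return VOP_IOCTL(devvp, DIOCGWEDGEINFO, dkw, FREAD, NOCRED);
}
#endif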
/* ARGSUSED */
int
spec_poll(void *v)
{
	struct vop_poll_args /* {
		struct vnode *a_vp;
		int a_events;
	} */ *ap = v;
	struct vnode *vp = ap->a_vp;
	struct specnode *sn;
	dev_t dev;
	int revents;

	if (spec_io_enter(vp, &sn, &dev) != 0)
		return POLLERR;

	switch (vp->v_type) {
	case VCHR:
		revents = cdev_poll(dev, ap->a_events, curlwp);
		break;
	default:
		revents = genfs_poll(v);
		break;
	}

	spec_io_exit(vp, sn);
	return revents;
}

/* ARGSUSED */
int
spec_kqfilter(void *v)
{
	struct vop_kqfilter_args /* {
		struct vnode *a_vp;
		struct knote *a_kn;
	} */ *ap = v;
	struct vnode *vp = ap->a_vp;
	struct specnode *sn;
	dev_t dev;
	int error;

	error = spec_io_enter(vp, &sn, &dev);
	if (error)
		return error;

	switch (vp->v_type) {
	case VCHR:
		error = cdev_kqfilter(dev, ap->a_kn);
		break;
	default:
		/*
		 * Block devices don't support kqfilter, and refuse it
		 * for any other files (like those vflush()ed) too.
		 */
		error = EOPNOTSUPP;
		break;
	}

	spec_io_exit(vp, sn);
	return error;
}

/*
 * Allow mapping of only D_DISK.  This is called only for VBLK.
 */
int
spec_mmap(void *v)
{
	struct vop_mmap_args /* {
		struct vnode *a_vp;
		vm_prot_t a_prot;
		kauth_cred_t a_cred;
	} */ *ap = v;
	struct vnode *vp = ap->a_vp;
	struct specnode *sn;
	dev_t dev;
	int error;

	KASSERT(vp->v_type == VBLK);

	error = spec_io_enter(vp, &sn, &dev);
	if (error)
		return error;

	error = bdev_type(dev) == D_DISK ? 0 : EINVAL;

	spec_io_exit(vp, sn);
	return error;
}

/*
 * Synch buffers associated with a block device
 */
/* ARGSUSED */
int
spec_fsync(void *v)
{
	struct vop_fsync_args /* {
		struct vnode *a_vp;
		kauth_cred_t a_cred;
		int a_flags;
		off_t offlo;
		off_t offhi;
	} */ *ap = v;
	struct vnode *vp = ap->a_vp;
	struct mount *mp;
	int error;

	if (vp->v_type == VBLK) {
		if ((mp = spec_node_getmountedfs(vp)) != NULL) {
			error = VFS_FSYNC(mp, vp, ap->a_flags);
			if (error != EOPNOTSUPP)
				return error;
		}
		return vflushbuf(vp, ap->a_flags);
	}
	return 0;
}

/*
 * Just call the device strategy routine
 */
int
spec_strategy(void *v)
{
	struct vop_strategy_args /* {
		struct vnode *a_vp;
		struct buf *a_bp;
	} */ *ap = v;
	struct vnode *vp = ap->a_vp;
	struct buf *bp = ap->a_bp;
	struct specnode *sn = NULL;
	dev_t dev;
	int error;

	error = spec_io_enter(vp, &sn, &dev);
	if (error)
		goto out;

	bp->b_dev = dev;

	if (!(bp->b_flags & B_READ)) {
#ifdef DIAGNOSTIC
		if (bp->b_vp && bp->b_vp->v_type == VBLK) {
			struct mount *mp = spec_node_getmountedfs(bp->b_vp);

			if (mp && (mp->mnt_flag & MNT_RDONLY)) {
				printf("%s blk %"PRId64" written while ro!\n",
				    mp->mnt_stat.f_mntonname, bp->b_blkno);
#ifdef DDB
				db_stacktrace();
#endif
			}
		}
#endif /* DIAGNOSTIC */
		error = fscow_run(bp, false);
		if (error)
			goto out;
	}
	bdev_strategy(bp);

	error = 0;

out:	if (sn)
		spec_io_exit(vp, sn);
	if (error) {
		bp->b_error = error;
		bp->b_resid = bp->b_bcount;
		biodone(bp);
	}
	return error;
}

int
spec_inactive(void *v)
{
	struct vop_inactive_v2_args /* {
		struct vnode *a_vp;
		bool *a_recycle;
	} */ *ap = v;

	KASSERT(ap->a_vp->v_mount == dead_rootmount);
	*ap->a_recycle = true;

	return 0;
}

int
spec_reclaim(void *v)
{
	struct vop_reclaim_v2_args /* {
		struct vnode *a_vp;
	} */ *ap = v;
	struct vnode *vp = ap->a_vp;

	KASSERT(vp->v_specnode->sn_opencnt == 0);

	VOP_UNLOCK(vp);

	KASSERT(vp->v_mount == dead_rootmount);
	return 0;
}

/*
 * This is a noop, simply returning what one has been given.
1571 */ 1572 int 1573 spec_bmap(void *v) 1574 { 1575 struct vop_bmap_args /* { 1576 struct vnode *a_vp; 1577 daddr_t a_bn; 1578 struct vnode **a_vpp; 1579 daddr_t *a_bnp; 1580 int *a_runp; 1581 } */ *ap = v; 1582 1583 if (ap->a_vpp != NULL) 1584 *ap->a_vpp = ap->a_vp; 1585 if (ap->a_bnp != NULL) 1586 *ap->a_bnp = ap->a_bn; 1587 if (ap->a_runp != NULL) 1588 *ap->a_runp = (MAXBSIZE >> DEV_BSHIFT) - 1; 1589 return 0; 1590 } 1591 1592 /* 1593 * Device close routine 1594 */ 1595 /* ARGSUSED */ 1596 int 1597 spec_close(void *v) 1598 { 1599 struct vop_close_args /* { 1600 struct vnode *a_vp; 1601 int a_fflag; 1602 kauth_cred_t a_cred; 1603 } */ *ap = v; 1604 struct vnode *vp = ap->a_vp; 1605 struct session *sess; 1606 dev_t dev; 1607 int flags = ap->a_fflag; 1608 int mode, error, count; 1609 specnode_t *sn; 1610 specdev_t *sd; 1611 1612 KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE); 1613 1614 mutex_enter(vp->v_interlock); 1615 sn = vp->v_specnode; 1616 dev = vp->v_rdev; 1617 sd = sn->sn_dev; 1618 /* 1619 * If we're going away soon, make this non-blocking. 1620 * Also ensures that we won't wedge in vn_lock below. 1621 */ 1622 if (vdead_check(vp, VDEAD_NOWAIT) != 0) 1623 flags |= FNONBLOCK; 1624 mutex_exit(vp->v_interlock); 1625 1626 switch (vp->v_type) { 1627 1628 case VCHR: 1629 /* 1630 * Hack: a tty device that is a controlling terminal 1631 * has a reference from the session structure. We 1632 * cannot easily tell that a character device is a 1633 * controlling terminal, unless it is the closing 1634 * process' controlling terminal. In that case, if the 1635 * open count is 1 release the reference from the 1636 * session. Also, remove the link from the tty back to 1637 * the session and pgrp. 1638 * 1639 * XXX V. fishy. 1640 */ 1641 mutex_enter(&proc_lock); 1642 sess = curlwp->l_proc->p_session; 1643 if (sn->sn_opencnt == 1 && vp == sess->s_ttyvp) { 1644 mutex_spin_enter(&tty_lock); 1645 sess->s_ttyvp = NULL; 1646 if (sess->s_ttyp->t_session != NULL) { 1647 sess->s_ttyp->t_pgrp = NULL; 1648 sess->s_ttyp->t_session = NULL; 1649 mutex_spin_exit(&tty_lock); 1650 /* Releases proc_lock. */ 1651 proc_sessrele(sess); 1652 } else { 1653 mutex_spin_exit(&tty_lock); 1654 if (sess->s_ttyp->t_pgrp != NULL) 1655 panic("spec_close: spurious pgrp ref"); 1656 mutex_exit(&proc_lock); 1657 } 1658 vrele(vp); 1659 } else 1660 mutex_exit(&proc_lock); 1661 1662 /* 1663 * If the vnode is locked, then we are in the midst 1664 * of forcably closing the device, otherwise we only 1665 * close on last reference. 1666 */ 1667 mode = S_IFCHR; 1668 break; 1669 1670 case VBLK: 1671 KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp); 1672 /* 1673 * On last close of a block device (that isn't mounted) 1674 * we must invalidate any in core blocks, so that 1675 * we can, for instance, change floppy disks. 1676 */ 1677 error = vinvalbuf(vp, V_SAVE, ap->a_cred, curlwp, 0, 0); 1678 if (error) 1679 return error; 1680 /* 1681 * We do not want to really close the device if it 1682 * is still in use unless we are trying to close it 1683 * forcibly. Since every use (buffer, vnode, swap, cmap) 1684 * holds a reference to the vnode, and because we mark 1685 * any other vnodes that alias this device, when the 1686 * sum of the reference counts on all the aliased 1687 * vnodes descends to one, we are on last close. 1688 */ 1689 mode = S_IFBLK; 1690 break; 1691 1692 default: 1693 panic("spec_close: not special"); 1694 } 1695 1696 /* 1697 * Decrement the open reference count of this node and the 1698 * device. 
For block devices, the open reference count must be 1699 * 1 at this point. If the device's open reference count goes 1700 * to zero, we're the last one out so get the lights. 1701 * 1702 * We may find --sd->sd_opencnt gives zero, and yet 1703 * sd->sd_opened is false. This happens if the vnode is 1704 * revoked at the same time as it is being opened, which can 1705 * happen when opening a tty blocks indefinitely. In that 1706 * case, we still must call close -- it is the job of close to 1707 * interrupt the open. Either way, the device will be no 1708 * longer opened, so we have to clear sd->sd_opened; subsequent 1709 * opens will have responsibility for issuing close. 1710 * 1711 * This has the side effect that the sequence of opens might 1712 * happen out of order -- we might end up doing open, open, 1713 * close, close, instead of open, close, open, close. This is 1714 * unavoidable with the current devsw API, where open is 1715 * allowed to block and close must be able to run concurrently 1716 * to interrupt it. It is the driver's responsibility to 1717 * ensure that close is idempotent so that this works. Drivers 1718 * requiring per-open state and exact 1:1 correspondence 1719 * between open and close can use fd_clone. 1720 */ 1721 mutex_enter(&device_lock); 1722 KASSERT(sn->sn_opencnt); 1723 KASSERT(sd->sd_opencnt); 1724 KASSERTMSG(sn->sn_opencnt <= sd->sd_opencnt, 1725 "sn_opencnt=%u > sd_opencnt=%u", 1726 sn->sn_opencnt, sd->sd_opencnt); 1727 sn->sn_opencnt--; 1728 count = --sd->sd_opencnt; 1729 if (vp->v_type == VBLK) { 1730 KASSERTMSG(count == 0, "block device with %u opens", 1731 count + 1); 1732 sd->sd_bdevvp = NULL; 1733 } 1734 if (count == 0) { 1735 KASSERTMSG(sn->sn_opencnt == 0, "sn_opencnt=%u", 1736 sn->sn_opencnt); 1737 KASSERT(!sd->sd_closing); 1738 sd->sd_opened = false; 1739 sd->sd_closing = true; 1740 } 1741 mutex_exit(&device_lock); 1742 1743 if (count != 0) 1744 return 0; 1745 1746 /* 1747 * If we're able to block, release the vnode lock & reacquire. We 1748 * might end up sleeping for someone else who wants our queues. They 1749 * won't get them if we hold the vnode locked. 1750 */ 1751 if (!(flags & FNONBLOCK)) 1752 VOP_UNLOCK(vp); 1753 1754 /* 1755 * If we can cancel all outstanding I/O, then wait for it to 1756 * drain before we call .d_close. Drivers that split up 1757 * .d_cancel and .d_close this way need not have any internal 1758 * mechanism for waiting in .d_close for I/O to drain. 1759 */ 1760 if (vp->v_type == VBLK) 1761 error = bdev_cancel(dev, flags, mode, curlwp); 1762 else 1763 error = cdev_cancel(dev, flags, mode, curlwp); 1764 if (error == 0) 1765 spec_io_drain(sd); 1766 else 1767 KASSERTMSG(error == ENODEV, "cancel dev=0x%lx failed with %d", 1768 (unsigned long)dev, error); 1769 1770 if (vp->v_type == VBLK) 1771 error = bdev_close(dev, flags, mode, curlwp); 1772 else 1773 error = cdev_close(dev, flags, mode, curlwp); 1774 1775 /* 1776 * Wait for all other devsw operations to drain. After this 1777 * point, no bdev/cdev_* can be active for this specdev. 1778 */ 1779 spec_io_drain(sd); 1780 1781 /* 1782 * Wake any spec_open calls waiting for close to finish -- do 1783 * this before reacquiring the vnode lock, because spec_open 1784 * holds the vnode lock while waiting, so doing this after 1785 * reacquiring the lock would deadlock. 
1786 */ 1787 mutex_enter(&device_lock); 1788 KASSERT(!sd->sd_opened); 1789 KASSERT(sd->sd_closing); 1790 sd->sd_closing = false; 1791 cv_broadcast(&specfs_iocv); 1792 mutex_exit(&device_lock); 1793 1794 if (!(flags & FNONBLOCK)) 1795 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 1796 1797 return error; 1798 } 1799 1800 /* 1801 * Print out the contents of a special device vnode. 1802 */ 1803 int 1804 spec_print(void *v) 1805 { 1806 struct vop_print_args /* { 1807 struct vnode *a_vp; 1808 } */ *ap = v; 1809 1810 printf("dev %llu, %llu\n", (unsigned long long)major(ap->a_vp->v_rdev), 1811 (unsigned long long)minor(ap->a_vp->v_rdev)); 1812 return 0; 1813 } 1814 1815 /* 1816 * Return POSIX pathconf information applicable to special devices. 1817 */ 1818 int 1819 spec_pathconf(void *v) 1820 { 1821 struct vop_pathconf_args /* { 1822 struct vnode *a_vp; 1823 int a_name; 1824 register_t *a_retval; 1825 } */ *ap = v; 1826 1827 switch (ap->a_name) { 1828 case _PC_LINK_MAX: 1829 *ap->a_retval = LINK_MAX; 1830 return 0; 1831 case _PC_MAX_CANON: 1832 *ap->a_retval = MAX_CANON; 1833 return 0; 1834 case _PC_MAX_INPUT: 1835 *ap->a_retval = MAX_INPUT; 1836 return 0; 1837 case _PC_PIPE_BUF: 1838 *ap->a_retval = PIPE_BUF; 1839 return 0; 1840 case _PC_CHOWN_RESTRICTED: 1841 *ap->a_retval = 1; 1842 return 0; 1843 case _PC_VDISABLE: 1844 *ap->a_retval = _POSIX_VDISABLE; 1845 return 0; 1846 case _PC_SYNC_IO: 1847 *ap->a_retval = 1; 1848 return 0; 1849 default: 1850 return genfs_pathconf(ap); 1851 } 1852 /* NOTREACHED */ 1853 } 1854 1855 /* 1856 * Advisory record locking support. 1857 */ 1858 int 1859 spec_advlock(void *v) 1860 { 1861 struct vop_advlock_args /* { 1862 struct vnode *a_vp; 1863 void *a_id; 1864 int a_op; 1865 struct flock *a_fl; 1866 int a_flags; 1867 } */ *ap = v; 1868 struct vnode *vp = ap->a_vp; 1869 1870 return lf_advlock(ap, &vp->v_speclockf, (off_t)0); 1871 } 1872
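/*
 * Illustrative sketch (compiled out, not part of this file): how a
 * file system typically builds its own ops vector for device special
 * files by delegating to the spec_* operations above, in the style of
 * ffs_specop_entries[] in ffs_vnops.c.  The "examplefs" names are
 * hypothetical; a real file system wraps several of these entries with
 * its own locking and attribute handling.
 */
#if 0
int (**examplefs_specop_p)(void *);
const struct vnodeopv_entry_desc examplefs_specop_entries[] = {
	{ &vop_default_desc, vn_default_error },
	{ &vop_lookup_desc, spec_lookup },		/* lookup */
	{ &vop_open_desc, spec_open },			/* open */
	{ &vop_close_desc, spec_close },		/* close */
	{ &vop_read_desc, spec_read },			/* read */
	{ &vop_write_desc, spec_write },		/* write */
	{ &vop_ioctl_desc, spec_ioctl },		/* ioctl */
	{ &vop_strategy_desc, spec_strategy },		/* strategy */
	{ &vop_bmap_desc, spec_bmap },			/* bmap */
	{ NULL, NULL }
};
const struct vnodeopv_desc examplefs_specop_opv_desc =
	{ &examplefs_specop_p, examplefs_specop_entries };
#endif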