1 /* $NetBSD: spec_vnops.c,v 1.218 2023/04/22 15:32:49 riastradh Exp $ */ 2 3 /*- 4 * Copyright (c) 2008 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 26 * POSSIBILITY OF SUCH DAMAGE. 27 */ 28 29 /* 30 * Copyright (c) 1989, 1993 31 * The Regents of the University of California. All rights reserved. 32 * 33 * Redistribution and use in source and binary forms, with or without 34 * modification, are permitted provided that the following conditions 35 * are met: 36 * 1. Redistributions of source code must retain the above copyright 37 * notice, this list of conditions and the following disclaimer. 38 * 2. Redistributions in binary form must reproduce the above copyright 39 * notice, this list of conditions and the following disclaimer in the 40 * documentation and/or other materials provided with the distribution. 41 * 3. Neither the name of the University nor the names of its contributors 42 * may be used to endorse or promote products derived from this software 43 * without specific prior written permission. 44 * 45 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 46 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 47 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 48 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 49 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 50 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 51 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 52 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 53 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 54 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 55 * SUCH DAMAGE. 
56 * 57 * @(#)spec_vnops.c 8.15 (Berkeley) 7/14/95 58 */ 59 60 #include <sys/cdefs.h> 61 __KERNEL_RCSID(0, "$NetBSD: spec_vnops.c,v 1.218 2023/04/22 15:32:49 riastradh Exp $"); 62 63 #ifdef _KERNEL_OPT 64 #include "opt_ddb.h" 65 #endif 66 67 #include <sys/param.h> 68 #include <sys/proc.h> 69 #include <sys/systm.h> 70 #include <sys/kernel.h> 71 #include <sys/conf.h> 72 #include <sys/buf.h> 73 #include <sys/mount.h> 74 #include <sys/namei.h> 75 #include <sys/vnode_impl.h> 76 #include <sys/stat.h> 77 #include <sys/errno.h> 78 #include <sys/ioctl.h> 79 #include <sys/poll.h> 80 #include <sys/file.h> 81 #include <sys/disklabel.h> 82 #include <sys/disk.h> 83 #include <sys/lockf.h> 84 #include <sys/tty.h> 85 #include <sys/kauth.h> 86 #include <sys/fstrans.h> 87 #include <sys/module.h> 88 #include <sys/atomic.h> 89 90 #include <miscfs/genfs/genfs.h> 91 #include <miscfs/specfs/specdev.h> 92 93 #ifdef DDB 94 #include <ddb/ddb.h> 95 #endif 96 97 /* 98 * Lock order: 99 * 100 * vnode lock 101 * -> device_lock 102 * -> struct vnode::v_interlock 103 */ 104 105 /* symbolic sleep message strings for devices */ 106 const char devopn[] = "devopn"; 107 const char devio[] = "devio"; 108 const char devwait[] = "devwait"; 109 const char devin[] = "devin"; 110 const char devout[] = "devout"; 111 const char devioc[] = "devioc"; 112 const char devcls[] = "devcls"; 113 114 #define SPECHSZ 64 115 #if ((SPECHSZ&(SPECHSZ-1)) == 0) 116 #define SPECHASH(rdev) (((rdev>>5)+(rdev))&(SPECHSZ-1)) 117 #else 118 #define SPECHASH(rdev) (((unsigned)((rdev>>5)+(rdev)))%SPECHSZ) 119 #endif 120 121 static vnode_t *specfs_hash[SPECHSZ]; 122 extern struct mount *dead_rootmount; 123 124 /* 125 * This vnode operations vector is used for special device nodes 126 * created from whole cloth by the kernel. For the ops vector for 127 * vnodes built from special devices found in a filesystem, see (e.g) 128 * ffs_specop_entries[] in ffs_vnops.c or the equivalent for other 129 * filesystems. 
130 */ 131 132 int (**spec_vnodeop_p)(void *); 133 const struct vnodeopv_entry_desc spec_vnodeop_entries[] = { 134 { &vop_default_desc, vn_default_error }, 135 { &vop_parsepath_desc, genfs_parsepath }, /* parsepath */ 136 { &vop_lookup_desc, spec_lookup }, /* lookup */ 137 { &vop_create_desc, genfs_badop }, /* create */ 138 { &vop_mknod_desc, genfs_badop }, /* mknod */ 139 { &vop_open_desc, spec_open }, /* open */ 140 { &vop_close_desc, spec_close }, /* close */ 141 { &vop_access_desc, genfs_ebadf }, /* access */ 142 { &vop_accessx_desc, genfs_ebadf }, /* accessx */ 143 { &vop_getattr_desc, genfs_ebadf }, /* getattr */ 144 { &vop_setattr_desc, genfs_ebadf }, /* setattr */ 145 { &vop_read_desc, spec_read }, /* read */ 146 { &vop_write_desc, spec_write }, /* write */ 147 { &vop_fallocate_desc, genfs_eopnotsupp }, /* fallocate */ 148 { &vop_fdiscard_desc, spec_fdiscard }, /* fdiscard */ 149 { &vop_fcntl_desc, genfs_fcntl }, /* fcntl */ 150 { &vop_ioctl_desc, spec_ioctl }, /* ioctl */ 151 { &vop_poll_desc, spec_poll }, /* poll */ 152 { &vop_kqfilter_desc, spec_kqfilter }, /* kqfilter */ 153 { &vop_revoke_desc, genfs_revoke }, /* revoke */ 154 { &vop_mmap_desc, spec_mmap }, /* mmap */ 155 { &vop_fsync_desc, spec_fsync }, /* fsync */ 156 { &vop_seek_desc, spec_seek }, /* seek */ 157 { &vop_remove_desc, genfs_badop }, /* remove */ 158 { &vop_link_desc, genfs_badop }, /* link */ 159 { &vop_rename_desc, genfs_badop }, /* rename */ 160 { &vop_mkdir_desc, genfs_badop }, /* mkdir */ 161 { &vop_rmdir_desc, genfs_badop }, /* rmdir */ 162 { &vop_symlink_desc, genfs_badop }, /* symlink */ 163 { &vop_readdir_desc, genfs_badop }, /* readdir */ 164 { &vop_readlink_desc, genfs_badop }, /* readlink */ 165 { &vop_abortop_desc, genfs_badop }, /* abortop */ 166 { &vop_inactive_desc, spec_inactive }, /* inactive */ 167 { &vop_reclaim_desc, spec_reclaim }, /* reclaim */ 168 { &vop_lock_desc, genfs_lock }, /* lock */ 169 { &vop_unlock_desc, genfs_unlock }, /* unlock */ 170 { &vop_bmap_desc, spec_bmap }, /* bmap */ 171 { &vop_strategy_desc, spec_strategy }, /* strategy */ 172 { &vop_print_desc, spec_print }, /* print */ 173 { &vop_islocked_desc, genfs_islocked }, /* islocked */ 174 { &vop_pathconf_desc, spec_pathconf }, /* pathconf */ 175 { &vop_advlock_desc, spec_advlock }, /* advlock */ 176 { &vop_bwrite_desc, vn_bwrite }, /* bwrite */ 177 { &vop_getpages_desc, genfs_getpages }, /* getpages */ 178 { &vop_putpages_desc, genfs_putpages }, /* putpages */ 179 { NULL, NULL } 180 }; 181 const struct vnodeopv_desc spec_vnodeop_opv_desc = 182 { &spec_vnodeop_p, spec_vnodeop_entries }; 183 184 static kauth_listener_t rawio_listener; 185 static struct kcondvar specfs_iocv; 186 187 /* 188 * Returns true if vnode is /dev/mem or /dev/kmem. 189 */ 190 bool 191 iskmemvp(struct vnode *vp) 192 { 193 return ((vp->v_type == VCHR) && iskmemdev(vp->v_rdev)); 194 } 195 196 /* 197 * Returns true if dev is /dev/mem or /dev/kmem. 
198 */
199 int
200 iskmemdev(dev_t dev)
201 {
202 /* mem_no is emitted by config(8) to generated devsw.c */
203 extern const int mem_no;
204
205 /* minor 14 is /dev/io on i386 with COMPAT_10 */
206 return (major(dev) == mem_no && (minor(dev) < 2 || minor(dev) == 14));
207 }
208
209 static int
210 rawio_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
211 void *arg0, void *arg1, void *arg2, void *arg3)
212 {
213 int result;
214
215 result = KAUTH_RESULT_DEFER;
216
217 if ((action != KAUTH_DEVICE_RAWIO_SPEC) &&
218 (action != KAUTH_DEVICE_RAWIO_PASSTHRU))
219 return result;
220
221 /* Access is mandated by permissions. */
222 result = KAUTH_RESULT_ALLOW;
223
224 return result;
225 }
226
227 void
228 spec_init(void)
229 {
230
231 rawio_listener = kauth_listen_scope(KAUTH_SCOPE_DEVICE,
232 rawio_listener_cb, NULL);
233 cv_init(&specfs_iocv, "specio");
234 }
235
236 /*
237 * spec_io_enter(vp, &sn, &dev)
238 *
239 * Enter an operation that may not hold vp's vnode lock or an
240 * fstrans on vp's mount. Until spec_io_exit, the vnode will not
241 * be revoked.
242 *
243 * On success, set sn to the specnode pointer and dev to the dev_t
244 * number and return zero. Caller must later call spec_io_exit
245 * when done.
246 *
247 * On failure, return ENXIO -- the device has been revoked and no
248 * longer exists.
249 */
250 static int
251 spec_io_enter(struct vnode *vp, struct specnode **snp, dev_t *devp)
252 {
253 dev_t dev;
254 struct specnode *sn;
255 unsigned iocnt;
256 int error = 0;
257
258 mutex_enter(vp->v_interlock);
259
260 /*
261 * Extract all the info we need from the vnode, unless the
262 * vnode has already been reclaimed. This can happen if the
263 * underlying device has been removed and all the device nodes
264 * for it have been revoked. The caller may not hold a vnode
265 * lock or fstrans to prevent this from happening before it has
266 * had an opportunity to notice the vnode is dead.
267 */
268 if (vdead_check(vp, VDEAD_NOWAIT) != 0 ||
269 (sn = vp->v_specnode) == NULL ||
270 (dev = vp->v_rdev) == NODEV) {
271 error = ENXIO;
272 goto out;
273 }
274
275 /*
276 * Notify spec_close that we are doing an I/O operation which
277 * may not be bracketed by fstrans(9) and thus is not
278 * blocked by vfs suspension.
279 *
280 * We could hold this reference with psref(9) instead, but we
281 * already have to take the interlock for vdead_check, so
282 * there's not much more cost here to another atomic operation.
283 */
284 do {
285 iocnt = atomic_load_relaxed(&sn->sn_dev->sd_iocnt);
286 if (__predict_false(iocnt == UINT_MAX)) {
287 /*
288 * The I/O count is limited by the number of
289 * LWPs (which will never overflow this) --
290 * unless one driver uses another driver via
291 * specfs, which is rather unusual, but which
292 * could happen via pud(4) userspace drivers.
293 * We could use a 64-bit count, but can't use
294 * atomics for that on all platforms.
295 * (Probably better to switch to psref or
296 * localcount instead.)
297 */
298 error = EBUSY;
299 goto out;
300 }
301 } while (atomic_cas_uint(&sn->sn_dev->sd_iocnt, iocnt, iocnt + 1)
302 != iocnt);
303
304 /* Success! */
305 *snp = sn;
306 *devp = dev;
307 error = 0;
308
309 out: mutex_exit(vp->v_interlock);
310 return error;
311 }
312
313 /*
314 * spec_io_exit(vp, sn)
315 *
316 * Exit an operation entered with a successful spec_io_enter --
317 * allow concurrent spec_node_revoke to proceed. The argument sn
318 * must match the struct specnode pointer returned by spec_io_enter
319 * for vp.
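 * (That is, sn must be the specnode pointer that the matching,
 * successful spec_io_enter call stored through its snp argument for
 * this vnode.)
 *
 * Typical usage -- a sketch only, modelled on spec_read and
 * spec_ioctl below, with locking and error handling abbreviated:
 *
 *	error = spec_io_enter(vp, &sn, &dev);
 *	if (error)
 *		return error;
 *	error = cdev_read(dev, uio, ioflag);
 *	spec_io_exit(vp, sn);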
320 */ 321 static void 322 spec_io_exit(struct vnode *vp, struct specnode *sn) 323 { 324 struct specdev *sd = sn->sn_dev; 325 unsigned iocnt; 326 327 KASSERT(vp->v_specnode == sn); 328 329 /* 330 * We are done. Notify spec_close if appropriate. The 331 * transition of 1 -> 0 must happen under device_lock so 332 * spec_close doesn't miss a wakeup. 333 */ 334 do { 335 iocnt = atomic_load_relaxed(&sd->sd_iocnt); 336 KASSERT(iocnt > 0); 337 if (iocnt == 1) { 338 mutex_enter(&device_lock); 339 if (atomic_dec_uint_nv(&sd->sd_iocnt) == 0) 340 cv_broadcast(&specfs_iocv); 341 mutex_exit(&device_lock); 342 break; 343 } 344 } while (atomic_cas_uint(&sd->sd_iocnt, iocnt, iocnt - 1) != iocnt); 345 } 346 347 /* 348 * spec_io_drain(sd) 349 * 350 * Wait for all existing spec_io_enter/exit sections to complete. 351 * Caller must ensure spec_io_enter will fail at this point. 352 */ 353 static void 354 spec_io_drain(struct specdev *sd) 355 { 356 357 /* 358 * I/O at the same time as closing is unlikely -- it often 359 * indicates an application bug. 360 */ 361 if (__predict_true(atomic_load_relaxed(&sd->sd_iocnt) == 0)) 362 return; 363 364 mutex_enter(&device_lock); 365 while (atomic_load_relaxed(&sd->sd_iocnt) > 0) 366 cv_wait(&specfs_iocv, &device_lock); 367 mutex_exit(&device_lock); 368 } 369 370 /* 371 * Initialize a vnode that represents a device. 372 */ 373 void 374 spec_node_init(vnode_t *vp, dev_t rdev) 375 { 376 specnode_t *sn; 377 specdev_t *sd; 378 vnode_t *vp2; 379 vnode_t **vpp; 380 381 KASSERT(vp->v_type == VBLK || vp->v_type == VCHR); 382 KASSERT(vp->v_specnode == NULL); 383 384 /* 385 * Search the hash table for this device. If known, add a 386 * reference to the device structure. If not known, create 387 * a new entry to represent the device. In all cases add 388 * the vnode to the hash table. 389 */ 390 sn = kmem_alloc(sizeof(*sn), KM_SLEEP); 391 sd = kmem_alloc(sizeof(*sd), KM_SLEEP); 392 mutex_enter(&device_lock); 393 vpp = &specfs_hash[SPECHASH(rdev)]; 394 for (vp2 = *vpp; vp2 != NULL; vp2 = vp2->v_specnext) { 395 KASSERT(vp2->v_specnode != NULL); 396 if (rdev == vp2->v_rdev && vp->v_type == vp2->v_type) { 397 break; 398 } 399 } 400 if (vp2 == NULL) { 401 /* No existing record, create a new one. */ 402 sd->sd_mountpoint = NULL; 403 sd->sd_lockf = NULL; 404 sd->sd_refcnt = 1; 405 sd->sd_opencnt = 0; 406 sd->sd_bdevvp = NULL; 407 sd->sd_iocnt = 0; 408 sd->sd_opened = false; 409 sd->sd_closing = false; 410 sn->sn_dev = sd; 411 sd = NULL; 412 } else { 413 /* Use the existing record. */ 414 sn->sn_dev = vp2->v_specnode->sn_dev; 415 sn->sn_dev->sd_refcnt++; 416 } 417 /* Insert vnode into the hash chain. */ 418 sn->sn_opencnt = 0; 419 sn->sn_rdev = rdev; 420 sn->sn_gone = false; 421 vp->v_specnode = sn; 422 vp->v_specnext = *vpp; 423 *vpp = vp; 424 mutex_exit(&device_lock); 425 426 /* Free the record we allocated if unused. */ 427 if (sd != NULL) { 428 kmem_free(sd, sizeof(*sd)); 429 } 430 } 431 432 /* 433 * Lookup a vnode by device number and return it referenced. 434 */ 435 int 436 spec_node_lookup_by_dev(enum vtype type, dev_t dev, int flags, vnode_t **vpp) 437 { 438 int error; 439 vnode_t *vp; 440 441 top: mutex_enter(&device_lock); 442 for (vp = specfs_hash[SPECHASH(dev)]; vp; vp = vp->v_specnext) { 443 if (type == vp->v_type && dev == vp->v_rdev) { 444 mutex_enter(vp->v_interlock); 445 /* If clean or being cleaned, then ignore it. 
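(vdead_check() returns zero for a live vnode, so the break below accepts it with the interlock held; a nonzero return means the vnode is dead or dying, and we either skip it or, when the caller did not pass VDEAD_NOWAIT, let vcache_vget wait out the revoke and retry the search.)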
*/ 446 if (vdead_check(vp, VDEAD_NOWAIT) == 0) 447 break; 448 if ((flags & VDEAD_NOWAIT) == 0) { 449 mutex_exit(&device_lock); 450 /* 451 * It may be being revoked as we speak, 452 * and the caller wants to wait until 453 * all revocation has completed. Let 454 * vcache_vget wait for it to finish 455 * dying; as a side effect, vcache_vget 456 * releases vp->v_interlock. Note that 457 * vcache_vget cannot succeed at this 458 * point because vdead_check already 459 * failed. 460 */ 461 error = vcache_vget(vp); 462 KASSERT(error); 463 goto top; 464 } 465 mutex_exit(vp->v_interlock); 466 } 467 } 468 KASSERT(vp == NULL || mutex_owned(vp->v_interlock)); 469 if (vp == NULL) { 470 mutex_exit(&device_lock); 471 return ENOENT; 472 } 473 /* 474 * If it is an opened block device return the opened vnode. 475 */ 476 if (type == VBLK && vp->v_specnode->sn_dev->sd_bdevvp != NULL) { 477 mutex_exit(vp->v_interlock); 478 vp = vp->v_specnode->sn_dev->sd_bdevvp; 479 mutex_enter(vp->v_interlock); 480 } 481 mutex_exit(&device_lock); 482 error = vcache_vget(vp); 483 if (error) 484 return error; 485 *vpp = vp; 486 487 return 0; 488 } 489 490 /* 491 * Lookup a vnode by file system mounted on and return it referenced. 492 */ 493 int 494 spec_node_lookup_by_mount(struct mount *mp, vnode_t **vpp) 495 { 496 int i, error; 497 vnode_t *vp, *vq; 498 499 mutex_enter(&device_lock); 500 for (i = 0, vq = NULL; i < SPECHSZ && vq == NULL; i++) { 501 for (vp = specfs_hash[i]; vp; vp = vp->v_specnext) { 502 if (vp->v_type != VBLK) 503 continue; 504 vq = vp->v_specnode->sn_dev->sd_bdevvp; 505 if (vq != NULL && 506 vq->v_specnode->sn_dev->sd_mountpoint == mp) 507 break; 508 vq = NULL; 509 } 510 } 511 if (vq == NULL) { 512 mutex_exit(&device_lock); 513 return ENOENT; 514 } 515 mutex_enter(vq->v_interlock); 516 mutex_exit(&device_lock); 517 error = vcache_vget(vq); 518 if (error) 519 return error; 520 *vpp = vq; 521 522 return 0; 523 524 } 525 526 /* 527 * Get the file system mounted on this block device. 528 * 529 * XXX Caller should hold the vnode lock -- shared or exclusive -- so 530 * that this can't changed, and the vnode can't be revoked while we 531 * examine it. But not all callers do, and they're scattered through a 532 * lot of file systems, so we can't assert this yet. 533 */ 534 struct mount * 535 spec_node_getmountedfs(vnode_t *devvp) 536 { 537 struct mount *mp; 538 539 KASSERT(devvp->v_type == VBLK); 540 mp = devvp->v_specnode->sn_dev->sd_mountpoint; 541 542 return mp; 543 } 544 545 /* 546 * Set the file system mounted on this block device. 547 * 548 * XXX Caller should hold the vnode lock exclusively so this can't be 549 * changed or assumed by spec_node_getmountedfs while we change it, and 550 * the vnode can't be revoked while we handle it. But not all callers 551 * do, and they're scattered through a lot of file systems, so we can't 552 * assert this yet. Instead, for now, we'll take an I/O reference so 553 * at least the ioctl doesn't race with revoke/detach. 554 * 555 * If you do change this to assert an exclusive vnode lock, you must 556 * also do vdead_check before trying bdev_ioctl, because the vnode may 557 * have been revoked by the time the caller locked it, and this is 558 * _not_ a vop -- calls to spec_node_setmountedfs don't go through 559 * v_op, so revoking the vnode doesn't prevent further calls. 560 * 561 * XXX Caller should additionally have the vnode open, at least if mp 562 * is nonnull, but I'm not sure all callers do that -- need to audit. 
563 * Currently udf closes the vnode before clearing the mount. 564 */ 565 void 566 spec_node_setmountedfs(vnode_t *devvp, struct mount *mp) 567 { 568 struct dkwedge_info dkw; 569 struct specnode *sn; 570 dev_t dev; 571 int error; 572 573 KASSERT(devvp->v_type == VBLK); 574 575 error = spec_io_enter(devvp, &sn, &dev); 576 if (error) 577 return; 578 579 KASSERT(sn->sn_dev->sd_mountpoint == NULL || mp == NULL); 580 sn->sn_dev->sd_mountpoint = mp; 581 if (mp == NULL) 582 goto out; 583 584 error = bdev_ioctl(dev, DIOCGWEDGEINFO, &dkw, FREAD, curlwp); 585 if (error) 586 goto out; 587 588 strlcpy(mp->mnt_stat.f_mntfromlabel, dkw.dkw_wname, 589 sizeof(mp->mnt_stat.f_mntfromlabel)); 590 591 out: spec_io_exit(devvp, sn); 592 } 593 594 /* 595 * A vnode representing a special device is going away. Close 596 * the device if the vnode holds it open. 597 */ 598 void 599 spec_node_revoke(vnode_t *vp) 600 { 601 specnode_t *sn; 602 specdev_t *sd; 603 struct vnode **vpp; 604 605 KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE); 606 607 sn = vp->v_specnode; 608 sd = sn->sn_dev; 609 610 KASSERT(vp->v_type == VBLK || vp->v_type == VCHR); 611 KASSERT(vp->v_specnode != NULL); 612 KASSERT(sn->sn_gone == false); 613 614 mutex_enter(&device_lock); 615 KASSERTMSG(sn->sn_opencnt <= sd->sd_opencnt, 616 "sn_opencnt=%u > sd_opencnt=%u", 617 sn->sn_opencnt, sd->sd_opencnt); 618 sn->sn_gone = true; 619 if (sn->sn_opencnt != 0) { 620 sd->sd_opencnt -= (sn->sn_opencnt - 1); 621 sn->sn_opencnt = 1; 622 mutex_exit(&device_lock); 623 624 VOP_CLOSE(vp, FNONBLOCK, NOCRED); 625 626 mutex_enter(&device_lock); 627 KASSERT(sn->sn_opencnt == 0); 628 } 629 630 /* 631 * We may have revoked the vnode in this thread while another 632 * thread was in the middle of spec_close, in the window when 633 * spec_close releases the vnode lock to call .d_close for the 634 * last close. In that case, wait for the concurrent 635 * spec_close to complete. 636 */ 637 while (sd->sd_closing) 638 cv_wait(&specfs_iocv, &device_lock); 639 640 /* 641 * Remove from the hash so lookups stop returning this 642 * specnode. We will dissociate it from the specdev -- and 643 * possibly free the specdev -- in spec_node_destroy. 644 */ 645 KASSERT(sn->sn_gone); 646 KASSERT(sn->sn_opencnt == 0); 647 for (vpp = &specfs_hash[SPECHASH(vp->v_rdev)];; 648 vpp = &(*vpp)->v_specnext) { 649 if (*vpp == vp) { 650 *vpp = vp->v_specnext; 651 vp->v_specnext = NULL; 652 break; 653 } 654 } 655 mutex_exit(&device_lock); 656 } 657 658 /* 659 * A vnode representing a special device is being recycled. 660 * Destroy the specfs component. 661 */ 662 void 663 spec_node_destroy(vnode_t *vp) 664 { 665 specnode_t *sn; 666 specdev_t *sd; 667 int refcnt; 668 669 sn = vp->v_specnode; 670 sd = sn->sn_dev; 671 672 KASSERT(vp->v_type == VBLK || vp->v_type == VCHR); 673 KASSERT(vp->v_specnode != NULL); 674 KASSERT(sn->sn_opencnt == 0); 675 676 mutex_enter(&device_lock); 677 sn = vp->v_specnode; 678 vp->v_specnode = NULL; 679 refcnt = sd->sd_refcnt--; 680 KASSERT(refcnt > 0); 681 mutex_exit(&device_lock); 682 683 /* If the device is no longer in use, destroy our record. */ 684 if (refcnt == 1) { 685 KASSERT(sd->sd_iocnt == 0); 686 KASSERT(sd->sd_opencnt == 0); 687 KASSERT(sd->sd_bdevvp == NULL); 688 kmem_free(sd, sizeof(*sd)); 689 } 690 kmem_free(sn, sizeof(*sn)); 691 } 692 693 /* 694 * Trivial lookup routine that always fails. 
695 */ 696 int 697 spec_lookup(void *v) 698 { 699 struct vop_lookup_v2_args /* { 700 struct vnode *a_dvp; 701 struct vnode **a_vpp; 702 struct componentname *a_cnp; 703 } */ *ap = v; 704 705 *ap->a_vpp = NULL; 706 return ENOTDIR; 707 } 708 709 typedef int (*spec_ioctl_t)(dev_t, u_long, void *, int, struct lwp *); 710 711 /* 712 * Open a special file. 713 */ 714 /* ARGSUSED */ 715 int 716 spec_open(void *v) 717 { 718 struct vop_open_args /* { 719 struct vnode *a_vp; 720 int a_mode; 721 kauth_cred_t a_cred; 722 } */ *ap = v; 723 struct lwp *l = curlwp; 724 struct vnode *vp = ap->a_vp; 725 dev_t dev, dev1; 726 int error; 727 enum kauth_device_req req; 728 specnode_t *sn, *sn1; 729 specdev_t *sd; 730 spec_ioctl_t ioctl; 731 u_int gen = 0; 732 const char *name = NULL; 733 bool needclose = false; 734 struct partinfo pi; 735 736 KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE); 737 KASSERTMSG(vp->v_type == VBLK || vp->v_type == VCHR, "type=%d", 738 vp->v_type); 739 740 dev = vp->v_rdev; 741 sn = vp->v_specnode; 742 sd = sn->sn_dev; 743 744 /* 745 * Don't allow open if fs is mounted -nodev. 746 */ 747 if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_NODEV)) 748 return ENXIO; 749 750 switch (ap->a_mode & (FREAD | FWRITE)) { 751 case FREAD | FWRITE: 752 req = KAUTH_REQ_DEVICE_RAWIO_SPEC_RW; 753 break; 754 case FWRITE: 755 req = KAUTH_REQ_DEVICE_RAWIO_SPEC_WRITE; 756 break; 757 default: 758 req = KAUTH_REQ_DEVICE_RAWIO_SPEC_READ; 759 break; 760 } 761 error = kauth_authorize_device_spec(ap->a_cred, req, vp); 762 if (error) 763 return error; 764 765 /* 766 * Acquire an open reference -- as long as we hold onto it, and 767 * the vnode isn't revoked, it can't be closed, and the vnode 768 * can't be revoked until we release the vnode lock. 769 */ 770 mutex_enter(&device_lock); 771 KASSERT(!sn->sn_gone); 772 switch (vp->v_type) { 773 case VCHR: 774 /* 775 * Character devices can accept opens from multiple 776 * vnodes. But first, wait for any close to finish. 777 * Wait under the vnode lock so we don't have to worry 778 * about the vnode being revoked while we wait. 779 */ 780 while (sd->sd_closing) { 781 error = cv_wait_sig(&specfs_iocv, &device_lock); 782 if (error) 783 break; 784 } 785 if (error) 786 break; 787 sd->sd_opencnt++; 788 sn->sn_opencnt++; 789 KASSERTMSG(sn->sn_opencnt <= sd->sd_opencnt, 790 "sn_opencnt=%u > sd_opencnt=%u", 791 sn->sn_opencnt, sd->sd_opencnt); 792 break; 793 case VBLK: 794 /* 795 * For block devices, permit only one open. The buffer 796 * cache cannot remain self-consistent with multiple 797 * vnodes holding a block device open. 798 * 799 * Treat zero opencnt with non-NULL mountpoint as open. 800 * This may happen after forced detach of a mounted device. 801 * 802 * Also treat sd_closing, meaning there is a concurrent 803 * close in progress, as still open. 804 */ 805 if (sd->sd_opencnt != 0 || 806 sd->sd_mountpoint != NULL || 807 sd->sd_closing) { 808 error = EBUSY; 809 break; 810 } 811 KASSERTMSG(sn->sn_opencnt == 0, "sn_opencnt=%u", 812 sn->sn_opencnt); 813 sn->sn_opencnt = 1; 814 sd->sd_opencnt = 1; 815 sd->sd_bdevvp = vp; 816 break; 817 default: 818 panic("invalid specfs vnode type: %d", vp->v_type); 819 } 820 mutex_exit(&device_lock); 821 if (error) 822 return error; 823 824 /* 825 * Set VV_ISTTY if this is a tty cdev. 826 * 827 * XXX This does the wrong thing if the module has to be 828 * autoloaded. 
We should maybe set this after autoloading 829 * modules and calling .d_open successfully, except (a) we need 830 * the vnode lock to touch it, and (b) once we acquire the 831 * vnode lock again, the vnode may have been revoked, and 832 * deadfs's dead_read needs VV_ISTTY to be already set in order 833 * to return the right answer. So this needs some additional 834 * synchronization to be made to work correctly with tty driver 835 * module autoload. For now, let's just hope it doesn't cause 836 * too much trouble for a tty from an autoloaded driver module 837 * to fail with EIO instead of returning EOF. 838 */ 839 if (vp->v_type == VCHR) { 840 if (cdev_type(dev) == D_TTY) 841 vp->v_vflag |= VV_ISTTY; 842 } 843 844 /* 845 * Because opening the device may block indefinitely, e.g. when 846 * opening a tty, and loading a module may cross into many 847 * other subsystems, we must not hold the vnode lock while 848 * calling .d_open, so release it now and reacquire it when 849 * done. 850 * 851 * Take an I/O reference so that any concurrent spec_close via 852 * spec_node_revoke will wait for us to finish calling .d_open. 853 * The vnode can't be dead at this point because we have it 854 * locked. Note that if revoked, the driver must interrupt 855 * .d_open before spec_close starts waiting for I/O to drain so 856 * this doesn't deadlock. 857 */ 858 VOP_UNLOCK(vp); 859 error = spec_io_enter(vp, &sn1, &dev1); 860 if (error) { 861 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 862 return error; 863 } 864 KASSERT(sn1 == sn); 865 KASSERT(dev1 == dev); 866 867 /* 868 * Open the device. If .d_open returns ENXIO (device not 869 * configured), the driver may not be loaded, so try 870 * autoloading a module and then try .d_open again if anything 871 * got loaded. 872 */ 873 switch (vp->v_type) { 874 case VCHR: 875 do { 876 const struct cdevsw *cdev; 877 878 gen = module_gen; 879 error = cdev_open(dev, ap->a_mode, S_IFCHR, l); 880 if (error != ENXIO) 881 break; 882 883 /* Check if we already have a valid driver */ 884 mutex_enter(&device_lock); 885 cdev = cdevsw_lookup(dev); 886 mutex_exit(&device_lock); 887 if (cdev != NULL) 888 break; 889 890 /* Get device name from devsw_conv array */ 891 if ((name = cdevsw_getname(major(dev))) == NULL) 892 break; 893 894 /* Try to autoload device module */ 895 (void)module_autoload(name, MODULE_CLASS_DRIVER); 896 } while (gen != module_gen); 897 break; 898 899 case VBLK: 900 do { 901 const struct bdevsw *bdev; 902 903 gen = module_gen; 904 error = bdev_open(dev, ap->a_mode, S_IFBLK, l); 905 if (error != ENXIO) 906 break; 907 908 /* Check if we already have a valid driver */ 909 mutex_enter(&device_lock); 910 bdev = bdevsw_lookup(dev); 911 mutex_exit(&device_lock); 912 if (bdev != NULL) 913 break; 914 915 /* Get device name from devsw_conv array */ 916 if ((name = bdevsw_getname(major(dev))) == NULL) 917 break; 918 919 /* Try to autoload device module */ 920 (void)module_autoload(name, MODULE_CLASS_DRIVER); 921 } while (gen != module_gen); 922 break; 923 924 default: 925 __unreachable(); 926 } 927 928 /* 929 * Release the I/O reference now that we have called .d_open, 930 * and reacquire the vnode lock. At this point, the device may 931 * have been revoked, so we must tread carefully. However, sn 932 * and sd remain valid pointers until we drop our reference. 
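 * (They stay valid because the caller's reference keeps the vnode
 * from being reclaimed, so spec_node_destroy -- which frees the
 * specnode and drops the last specdev reference -- cannot run yet.)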
933 */ 934 spec_io_exit(vp, sn); 935 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 936 KASSERT(vp->v_specnode == sn); 937 938 /* 939 * If it has been revoked since we released the vnode lock and 940 * reacquired it, then spec_node_revoke has closed it, and we 941 * must fail with EBADF. 942 * 943 * Otherwise, if opening it failed, back out and release the 944 * open reference. If it was ever successfully opened and we 945 * got the last reference this way, it's now our job to close 946 * it. This might happen in the following scenario: 947 * 948 * Thread 1 Thread 2 949 * VOP_OPEN 950 * ... 951 * .d_open -> 0 (success) 952 * acquire vnode lock 953 * do stuff VOP_OPEN 954 * release vnode lock ... 955 * .d_open -> EBUSY 956 * VOP_CLOSE 957 * acquire vnode lock 958 * --sd_opencnt != 0 959 * => no .d_close 960 * release vnode lock 961 * acquire vnode lock 962 * --sd_opencnt == 0 963 * 964 * We can't resolve this by making spec_close wait for .d_open 965 * to complete before examining sd_opencnt, because .d_open can 966 * hang indefinitely, e.g. for a tty. 967 */ 968 mutex_enter(&device_lock); 969 if (sn->sn_gone) { 970 if (error == 0) 971 error = EBADF; 972 } else if (error == 0) { 973 /* 974 * Device has not been revoked, so our opencnt can't 975 * have gone away at this point -- transition to 976 * sn_gone=true happens before transition to 977 * sn_opencnt=0 in spec_node_revoke. 978 */ 979 KASSERT(sd->sd_opencnt); 980 KASSERT(sn->sn_opencnt); 981 KASSERTMSG(sn->sn_opencnt <= sd->sd_opencnt, 982 "sn_opencnt=%u > sd_opencnt=%u", 983 sn->sn_opencnt, sd->sd_opencnt); 984 KASSERT(!sd->sd_closing); 985 sd->sd_opened = true; 986 } else if (sd->sd_opencnt == 1 && sd->sd_opened) { 987 /* 988 * We're the last reference to a _previous_ open even 989 * though this one failed, so we have to close it. 990 * Don't decrement the reference count here -- 991 * spec_close will do that. 992 */ 993 KASSERT(sn->sn_opencnt == 1); 994 needclose = true; 995 } else { 996 KASSERT(sd->sd_opencnt); 997 KASSERT(sn->sn_opencnt); 998 KASSERTMSG(sn->sn_opencnt <= sd->sd_opencnt, 999 "sn_opencnt=%u > sd_opencnt=%u", 1000 sn->sn_opencnt, sd->sd_opencnt); 1001 sd->sd_opencnt--; 1002 sn->sn_opencnt--; 1003 if (vp->v_type == VBLK) 1004 sd->sd_bdevvp = NULL; 1005 } 1006 mutex_exit(&device_lock); 1007 1008 /* 1009 * If this open failed, but the device was previously opened, 1010 * and another thread concurrently closed the vnode while we 1011 * were in the middle of reopening it, the other thread will 1012 * see sd_opencnt > 0 and thus decide not to call .d_close -- 1013 * it is now our responsibility to do so. 1014 * 1015 * XXX The flags passed to VOP_CLOSE here are wrong, but 1016 * drivers can't rely on FREAD|FWRITE anyway -- e.g., consider 1017 * a device opened by thread 0 with O_READ, then opened by 1018 * thread 1 with O_WRITE, then closed by thread 0, and finally 1019 * closed by thread 1; the last .d_close call will have FWRITE 1020 * but not FREAD. We should just eliminate the FREAD/FWRITE 1021 * parameter to .d_close altogether. 1022 */ 1023 if (needclose) { 1024 KASSERT(error); 1025 VOP_CLOSE(vp, FNONBLOCK, NOCRED); 1026 } 1027 1028 /* If anything went wrong, we're done. */ 1029 if (error) 1030 return error; 1031 1032 /* 1033 * For disk devices, automagically set the vnode size to the 1034 * partition size, if we can. This applies to block devices 1035 * and character devices alike -- every block device must have 1036 * a corresponding character device. 
And if the module is 1037 * loaded it will remain loaded until we're done here (it is 1038 * forbidden to devsw_detach until closed). So it is safe to 1039 * query cdev_type unconditionally here. 1040 */ 1041 if (cdev_type(dev) == D_DISK) { 1042 ioctl = vp->v_type == VCHR ? cdev_ioctl : bdev_ioctl; 1043 if ((*ioctl)(dev, DIOCGPARTINFO, &pi, FREAD, curlwp) == 0) 1044 uvm_vnp_setsize(vp, 1045 (voff_t)pi.pi_secsize * pi.pi_size); 1046 } 1047 1048 /* Success! */ 1049 return 0; 1050 } 1051 1052 /* 1053 * Vnode op for read 1054 */ 1055 /* ARGSUSED */ 1056 int 1057 spec_read(void *v) 1058 { 1059 struct vop_read_args /* { 1060 struct vnode *a_vp; 1061 struct uio *a_uio; 1062 int a_ioflag; 1063 kauth_cred_t a_cred; 1064 } */ *ap = v; 1065 struct vnode *vp = ap->a_vp; 1066 struct uio *uio = ap->a_uio; 1067 struct lwp *l = curlwp; 1068 struct specnode *sn; 1069 dev_t dev; 1070 struct buf *bp; 1071 daddr_t bn; 1072 int bsize, bscale; 1073 struct partinfo pi; 1074 int n, on; 1075 int error = 0; 1076 int i, nra; 1077 daddr_t lastbn, *rablks; 1078 int *rasizes; 1079 int nrablks, ratogo; 1080 1081 KASSERT(uio->uio_rw == UIO_READ); 1082 KASSERTMSG((VMSPACE_IS_KERNEL_P(uio->uio_vmspace) || 1083 uio->uio_vmspace == curproc->p_vmspace), 1084 "vmspace belongs to neither kernel nor curproc"); 1085 1086 if (uio->uio_resid == 0) 1087 return 0; 1088 1089 switch (vp->v_type) { 1090 1091 case VCHR: 1092 /* 1093 * Release the lock while we sleep -- possibly 1094 * indefinitely, if this is, e.g., a tty -- in 1095 * cdev_read, so we don't hold up everything else that 1096 * might want access to the vnode. 1097 * 1098 * But before we issue the read, take an I/O reference 1099 * to the specnode so close will know when we're done 1100 * reading. Note that the moment we release the lock, 1101 * the vnode's identity may change; hence spec_io_enter 1102 * may fail, and the caller may have a dead vnode on 1103 * their hands, if the file system on which vp lived 1104 * has been unmounted. 1105 */ 1106 VOP_UNLOCK(vp); 1107 error = spec_io_enter(vp, &sn, &dev); 1108 if (error) 1109 goto out; 1110 error = cdev_read(dev, uio, ap->a_ioflag); 1111 spec_io_exit(vp, sn); 1112 out: /* XXX What if the caller held an exclusive lock? 
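We reacquire only LK_SHARED below, so such a caller would get the vnode back with a weaker lock than it started with.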
*/ 1113 vn_lock(vp, LK_SHARED | LK_RETRY); 1114 return error; 1115 1116 case VBLK: 1117 KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp); 1118 if (uio->uio_offset < 0) 1119 return EINVAL; 1120 1121 if (bdev_ioctl(vp->v_rdev, DIOCGPARTINFO, &pi, FREAD, l) == 0) 1122 bsize = imin(imax(pi.pi_bsize, DEV_BSIZE), MAXBSIZE); 1123 else 1124 bsize = BLKDEV_IOSIZE; 1125 1126 bscale = bsize >> DEV_BSHIFT; 1127 1128 nra = uimax(16 * MAXPHYS / bsize - 1, 511); 1129 rablks = kmem_alloc(nra * sizeof(*rablks), KM_SLEEP); 1130 rasizes = kmem_alloc(nra * sizeof(*rasizes), KM_SLEEP); 1131 lastbn = ((uio->uio_offset + uio->uio_resid - 1) >> DEV_BSHIFT) 1132 &~ (bscale - 1); 1133 nrablks = ratogo = 0; 1134 do { 1135 bn = (uio->uio_offset >> DEV_BSHIFT) &~ (bscale - 1); 1136 on = uio->uio_offset % bsize; 1137 n = uimin((unsigned)(bsize - on), uio->uio_resid); 1138 1139 if (ratogo == 0) { 1140 nrablks = uimin((lastbn - bn) / bscale, nra); 1141 ratogo = nrablks; 1142 1143 for (i = 0; i < nrablks; ++i) { 1144 rablks[i] = bn + (i+1) * bscale; 1145 rasizes[i] = bsize; 1146 } 1147 1148 error = breadn(vp, bn, bsize, 1149 rablks, rasizes, nrablks, 1150 0, &bp); 1151 } else { 1152 if (ratogo > 0) 1153 --ratogo; 1154 error = bread(vp, bn, bsize, 0, &bp); 1155 } 1156 if (error) 1157 break; 1158 n = uimin(n, bsize - bp->b_resid); 1159 error = uiomove((char *)bp->b_data + on, n, uio); 1160 brelse(bp, 0); 1161 } while (error == 0 && uio->uio_resid > 0 && n != 0); 1162 1163 kmem_free(rablks, nra * sizeof(*rablks)); 1164 kmem_free(rasizes, nra * sizeof(*rasizes)); 1165 1166 return error; 1167 1168 default: 1169 panic("spec_read type"); 1170 } 1171 /* NOTREACHED */ 1172 } 1173 1174 /* 1175 * Vnode op for write 1176 */ 1177 /* ARGSUSED */ 1178 int 1179 spec_write(void *v) 1180 { 1181 struct vop_write_args /* { 1182 struct vnode *a_vp; 1183 struct uio *a_uio; 1184 int a_ioflag; 1185 kauth_cred_t a_cred; 1186 } */ *ap = v; 1187 struct vnode *vp = ap->a_vp; 1188 struct uio *uio = ap->a_uio; 1189 struct lwp *l = curlwp; 1190 struct specnode *sn; 1191 dev_t dev; 1192 struct buf *bp; 1193 daddr_t bn; 1194 int bsize, bscale; 1195 struct partinfo pi; 1196 int n, on; 1197 int error = 0; 1198 1199 KASSERT(uio->uio_rw == UIO_WRITE); 1200 KASSERTMSG((VMSPACE_IS_KERNEL_P(uio->uio_vmspace) || 1201 uio->uio_vmspace == curproc->p_vmspace), 1202 "vmspace belongs to neither kernel nor curproc"); 1203 1204 switch (vp->v_type) { 1205 1206 case VCHR: 1207 /* 1208 * Release the lock while we sleep -- possibly 1209 * indefinitely, if this is, e.g., a tty -- in 1210 * cdev_write, so we don't hold up everything else that 1211 * might want access to the vnode. 1212 * 1213 * But before we issue the write, take an I/O reference 1214 * to the specnode so close will know when we're done 1215 * writing. Note that the moment we release the lock, 1216 * the vnode's identity may change; hence spec_io_enter 1217 * may fail, and the caller may have a dead vnode on 1218 * their hands, if the file system on which vp lived 1219 * has been unmounted. 
1220 */ 1221 VOP_UNLOCK(vp); 1222 error = spec_io_enter(vp, &sn, &dev); 1223 if (error) 1224 goto out; 1225 error = cdev_write(dev, uio, ap->a_ioflag); 1226 spec_io_exit(vp, sn); 1227 out: vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 1228 return error; 1229 1230 case VBLK: 1231 KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp); 1232 if (uio->uio_resid == 0) 1233 return 0; 1234 if (uio->uio_offset < 0) 1235 return EINVAL; 1236 1237 if (bdev_ioctl(vp->v_rdev, DIOCGPARTINFO, &pi, FREAD, l) == 0) 1238 bsize = imin(imax(pi.pi_bsize, DEV_BSIZE), MAXBSIZE); 1239 else 1240 bsize = BLKDEV_IOSIZE; 1241 1242 bscale = bsize >> DEV_BSHIFT; 1243 do { 1244 bn = (uio->uio_offset >> DEV_BSHIFT) &~ (bscale - 1); 1245 on = uio->uio_offset % bsize; 1246 n = uimin((unsigned)(bsize - on), uio->uio_resid); 1247 if (n == bsize) 1248 bp = getblk(vp, bn, bsize, 0, 0); 1249 else 1250 error = bread(vp, bn, bsize, B_MODIFY, &bp); 1251 if (error) { 1252 return error; 1253 } 1254 n = uimin(n, bsize - bp->b_resid); 1255 error = uiomove((char *)bp->b_data + on, n, uio); 1256 if (error) 1257 brelse(bp, 0); 1258 else { 1259 if (n + on == bsize) 1260 bawrite(bp); 1261 else 1262 bdwrite(bp); 1263 error = bp->b_error; 1264 } 1265 } while (error == 0 && uio->uio_resid > 0 && n != 0); 1266 return error; 1267 1268 default: 1269 panic("spec_write type"); 1270 } 1271 /* NOTREACHED */ 1272 } 1273 1274 /* 1275 * fdiscard, which on disk devices becomes TRIM. 1276 */ 1277 int 1278 spec_fdiscard(void *v) 1279 { 1280 struct vop_fdiscard_args /* { 1281 struct vnode *a_vp; 1282 off_t a_pos; 1283 off_t a_len; 1284 } */ *ap = v; 1285 struct vnode *vp = ap->a_vp; 1286 dev_t dev; 1287 1288 KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE); 1289 1290 dev = vp->v_rdev; 1291 1292 switch (vp->v_type) { 1293 case VCHR: 1294 #if 0 /* This is not stored for character devices. */ 1295 KASSERT(vp == vp->v_specnode->sn_dev->sd_cdevvp); 1296 #endif 1297 return cdev_discard(dev, ap->a_pos, ap->a_len); 1298 case VBLK: 1299 KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp); 1300 return bdev_discard(dev, ap->a_pos, ap->a_len); 1301 default: 1302 panic("spec_fdiscard: not a device\n"); 1303 } 1304 } 1305 1306 /* 1307 * Device ioctl operation. 
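 * The call is bracketed by spec_io_enter/spec_io_exit rather than by
 * the vnode lock or fstrans, so it does not block vfs suspension, and
 * a concurrent revoke or device detach waits for it to drain before
 * completing the close.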
1308 */ 1309 /* ARGSUSED */ 1310 int 1311 spec_ioctl(void *v) 1312 { 1313 struct vop_ioctl_args /* { 1314 struct vnode *a_vp; 1315 u_long a_command; 1316 void *a_data; 1317 int a_fflag; 1318 kauth_cred_t a_cred; 1319 } */ *ap = v; 1320 struct vnode *vp = ap->a_vp; 1321 struct specnode *sn; 1322 dev_t dev; 1323 int error; 1324 1325 error = spec_io_enter(vp, &sn, &dev); 1326 if (error) 1327 return error; 1328 1329 switch (vp->v_type) { 1330 case VCHR: 1331 error = cdev_ioctl(dev, ap->a_command, ap->a_data, 1332 ap->a_fflag, curlwp); 1333 break; 1334 case VBLK: 1335 KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp); 1336 error = bdev_ioctl(dev, ap->a_command, ap->a_data, 1337 ap->a_fflag, curlwp); 1338 break; 1339 default: 1340 panic("spec_ioctl"); 1341 /* NOTREACHED */ 1342 } 1343 1344 spec_io_exit(vp, sn); 1345 return error; 1346 } 1347 1348 /* ARGSUSED */ 1349 int 1350 spec_poll(void *v) 1351 { 1352 struct vop_poll_args /* { 1353 struct vnode *a_vp; 1354 int a_events; 1355 } */ *ap = v; 1356 struct vnode *vp = ap->a_vp; 1357 struct specnode *sn; 1358 dev_t dev; 1359 int revents; 1360 1361 if (spec_io_enter(vp, &sn, &dev) != 0) 1362 return POLLERR; 1363 1364 switch (vp->v_type) { 1365 case VCHR: 1366 revents = cdev_poll(dev, ap->a_events, curlwp); 1367 break; 1368 default: 1369 revents = genfs_poll(v); 1370 break; 1371 } 1372 1373 spec_io_exit(vp, sn); 1374 return revents; 1375 } 1376 1377 /* ARGSUSED */ 1378 int 1379 spec_kqfilter(void *v) 1380 { 1381 struct vop_kqfilter_args /* { 1382 struct vnode *a_vp; 1383 struct proc *a_kn; 1384 } */ *ap = v; 1385 struct vnode *vp = ap->a_vp; 1386 struct specnode *sn; 1387 dev_t dev; 1388 int error; 1389 1390 error = spec_io_enter(vp, &sn, &dev); 1391 if (error) 1392 return error; 1393 1394 switch (vp->v_type) { 1395 case VCHR: 1396 error = cdev_kqfilter(dev, ap->a_kn); 1397 break; 1398 default: 1399 /* 1400 * Block devices don't support kqfilter, and refuse it 1401 * for any other files (like those vflush()ed) too. 1402 */ 1403 error = EOPNOTSUPP; 1404 break; 1405 } 1406 1407 spec_io_exit(vp, sn); 1408 return error; 1409 } 1410 1411 /* 1412 * Allow mapping of only D_DISK. This is called only for VBLK. 1413 */ 1414 int 1415 spec_mmap(void *v) 1416 { 1417 struct vop_mmap_args /* { 1418 struct vnode *a_vp; 1419 vm_prot_t a_prot; 1420 kauth_cred_t a_cred; 1421 } */ *ap = v; 1422 struct vnode *vp = ap->a_vp; 1423 struct specnode *sn; 1424 dev_t dev; 1425 int error; 1426 1427 KASSERT(vp->v_type == VBLK); 1428 1429 error = spec_io_enter(vp, &sn, &dev); 1430 if (error) 1431 return error; 1432 1433 error = bdev_type(dev) == D_DISK ? 
0 : EINVAL;
1434
1435 spec_io_exit(vp, sn);
1436 return error;
1437 }
1438
1439 /*
1440 * Synch buffers associated with a block device
1441 */
1442 /* ARGSUSED */
1443 int
1444 spec_fsync(void *v)
1445 {
1446 struct vop_fsync_args /* {
1447 struct vnode *a_vp;
1448 kauth_cred_t a_cred;
1449 int a_flags;
1450 off_t offlo;
1451 off_t offhi;
1452 } */ *ap = v;
1453 struct vnode *vp = ap->a_vp;
1454 struct mount *mp;
1455 int error;
1456
1457 if (vp->v_type == VBLK) {
1458 if ((mp = spec_node_getmountedfs(vp)) != NULL) {
1459 error = VFS_FSYNC(mp, vp, ap->a_flags);
1460 if (error != EOPNOTSUPP)
1461 return error;
1462 }
1463 return vflushbuf(vp, ap->a_flags);
1464 }
1465 return 0;
1466 }
1467
1468 /*
1469 * Just call the device strategy routine
1470 */
1471 int
1472 spec_strategy(void *v)
1473 {
1474 struct vop_strategy_args /* {
1475 struct vnode *a_vp;
1476 struct buf *a_bp;
1477 } */ *ap = v;
1478 struct vnode *vp = ap->a_vp;
1479 struct buf *bp = ap->a_bp;
1480 struct specnode *sn = NULL;
1481 dev_t dev;
1482 int error;
1483
1484 error = spec_io_enter(vp, &sn, &dev);
1485 if (error)
1486 goto out;
1487
1488 bp->b_dev = dev;
1489
1490 if (!(bp->b_flags & B_READ)) {
1491 #ifdef DIAGNOSTIC
1492 if (bp->b_vp && bp->b_vp->v_type == VBLK) {
1493 struct mount *mp = spec_node_getmountedfs(bp->b_vp);
1494
1495 if (mp && (mp->mnt_flag & MNT_RDONLY)) {
1496 printf("%s blk %"PRId64" written while ro!\n",
1497 mp->mnt_stat.f_mntonname, bp->b_blkno);
1498 #ifdef DDB
1499 db_stacktrace();
1500 #endif
1501 }
1502 }
1503 #endif /* DIAGNOSTIC */
1504 error = fscow_run(bp, false);
1505 if (error)
1506 goto out;
1507 }
1508 bdev_strategy(bp);
1509
1510 error = 0;
1511
1512 out: if (sn)
1513 spec_io_exit(vp, sn);
1514 if (error) {
1515 bp->b_error = error;
1516 bp->b_resid = bp->b_bcount;
1517 biodone(bp);
1518 }
1519 return error;
1520 }
1521
1522 int
1523 spec_inactive(void *v)
1524 {
1525 struct vop_inactive_v2_args /* {
1526 struct vnode *a_vp;
1527 bool *a_recycle;
1528 } */ *ap = v;
1529
1530 KASSERT(ap->a_vp->v_mount == dead_rootmount);
1531 *ap->a_recycle = true;
1532
1533 return 0;
1534 }
1535
1536 int
1537 spec_reclaim(void *v)
1538 {
1539 struct vop_reclaim_v2_args /* {
1540 struct vnode *a_vp;
1541 } */ *ap = v;
1542 struct vnode *vp = ap->a_vp;
1543
1544 KASSERT(vp->v_specnode->sn_opencnt == 0);
1545
1546 VOP_UNLOCK(vp);
1547
1548 KASSERT(vp->v_mount == dead_rootmount);
1549 return 0;
1550 }
1551
1552 /*
1553 * This is a noop, simply returning what one has been given.
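 * For a device there is no file system block translation: the block
 * number maps to itself on the same vnode, and the run reported back
 * simply allows contiguous I/O of up to MAXBSIZE starting at that
 * block.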
1554 */ 1555 int 1556 spec_bmap(void *v) 1557 { 1558 struct vop_bmap_args /* { 1559 struct vnode *a_vp; 1560 daddr_t a_bn; 1561 struct vnode **a_vpp; 1562 daddr_t *a_bnp; 1563 int *a_runp; 1564 } */ *ap = v; 1565 1566 if (ap->a_vpp != NULL) 1567 *ap->a_vpp = ap->a_vp; 1568 if (ap->a_bnp != NULL) 1569 *ap->a_bnp = ap->a_bn; 1570 if (ap->a_runp != NULL) 1571 *ap->a_runp = (MAXBSIZE >> DEV_BSHIFT) - 1; 1572 return 0; 1573 } 1574 1575 /* 1576 * Device close routine 1577 */ 1578 /* ARGSUSED */ 1579 int 1580 spec_close(void *v) 1581 { 1582 struct vop_close_args /* { 1583 struct vnode *a_vp; 1584 int a_fflag; 1585 kauth_cred_t a_cred; 1586 } */ *ap = v; 1587 struct vnode *vp = ap->a_vp; 1588 struct session *sess; 1589 dev_t dev; 1590 int flags = ap->a_fflag; 1591 int mode, error, count; 1592 specnode_t *sn; 1593 specdev_t *sd; 1594 1595 KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE); 1596 1597 mutex_enter(vp->v_interlock); 1598 sn = vp->v_specnode; 1599 dev = vp->v_rdev; 1600 sd = sn->sn_dev; 1601 /* 1602 * If we're going away soon, make this non-blocking. 1603 * Also ensures that we won't wedge in vn_lock below. 1604 */ 1605 if (vdead_check(vp, VDEAD_NOWAIT) != 0) 1606 flags |= FNONBLOCK; 1607 mutex_exit(vp->v_interlock); 1608 1609 switch (vp->v_type) { 1610 1611 case VCHR: 1612 /* 1613 * Hack: a tty device that is a controlling terminal 1614 * has a reference from the session structure. We 1615 * cannot easily tell that a character device is a 1616 * controlling terminal, unless it is the closing 1617 * process' controlling terminal. In that case, if the 1618 * open count is 1 release the reference from the 1619 * session. Also, remove the link from the tty back to 1620 * the session and pgrp. 1621 * 1622 * XXX V. fishy. 1623 */ 1624 mutex_enter(&proc_lock); 1625 sess = curlwp->l_proc->p_session; 1626 if (sn->sn_opencnt == 1 && vp == sess->s_ttyvp) { 1627 mutex_spin_enter(&tty_lock); 1628 sess->s_ttyvp = NULL; 1629 if (sess->s_ttyp->t_session != NULL) { 1630 sess->s_ttyp->t_pgrp = NULL; 1631 sess->s_ttyp->t_session = NULL; 1632 mutex_spin_exit(&tty_lock); 1633 /* Releases proc_lock. */ 1634 proc_sessrele(sess); 1635 } else { 1636 mutex_spin_exit(&tty_lock); 1637 if (sess->s_ttyp->t_pgrp != NULL) 1638 panic("spec_close: spurious pgrp ref"); 1639 mutex_exit(&proc_lock); 1640 } 1641 vrele(vp); 1642 } else 1643 mutex_exit(&proc_lock); 1644 1645 /* 1646 * If the vnode is locked, then we are in the midst 1647 * of forcably closing the device, otherwise we only 1648 * close on last reference. 1649 */ 1650 mode = S_IFCHR; 1651 break; 1652 1653 case VBLK: 1654 KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp); 1655 /* 1656 * On last close of a block device (that isn't mounted) 1657 * we must invalidate any in core blocks, so that 1658 * we can, for instance, change floppy disks. 1659 */ 1660 error = vinvalbuf(vp, V_SAVE, ap->a_cred, curlwp, 0, 0); 1661 if (error) 1662 return error; 1663 /* 1664 * We do not want to really close the device if it 1665 * is still in use unless we are trying to close it 1666 * forcibly. Since every use (buffer, vnode, swap, cmap) 1667 * holds a reference to the vnode, and because we mark 1668 * any other vnodes that alias this device, when the 1669 * sum of the reference counts on all the aliased 1670 * vnodes descends to one, we are on last close. 1671 */ 1672 mode = S_IFBLK; 1673 break; 1674 1675 default: 1676 panic("spec_close: not special"); 1677 } 1678 1679 /* 1680 * Decrement the open reference count of this node and the 1681 * device. 
For block devices, the open reference count must be 1682 * 1 at this point. If the device's open reference count goes 1683 * to zero, we're the last one out so get the lights. 1684 * 1685 * We may find --sd->sd_opencnt gives zero, and yet 1686 * sd->sd_opened is false. This happens if the vnode is 1687 * revoked at the same time as it is being opened, which can 1688 * happen when opening a tty blocks indefinitely. In that 1689 * case, we still must call close -- it is the job of close to 1690 * interrupt the open. Either way, the device will be no 1691 * longer opened, so we have to clear sd->sd_opened; subsequent 1692 * opens will have responsibility for issuing close. 1693 * 1694 * This has the side effect that the sequence of opens might 1695 * happen out of order -- we might end up doing open, open, 1696 * close, close, instead of open, close, open, close. This is 1697 * unavoidable with the current devsw API, where open is 1698 * allowed to block and close must be able to run concurrently 1699 * to interrupt it. It is the driver's responsibility to 1700 * ensure that close is idempotent so that this works. Drivers 1701 * requiring per-open state and exact 1:1 correspondence 1702 * between open and close can use fd_clone. 1703 */ 1704 mutex_enter(&device_lock); 1705 KASSERT(sn->sn_opencnt); 1706 KASSERT(sd->sd_opencnt); 1707 KASSERTMSG(sn->sn_opencnt <= sd->sd_opencnt, 1708 "sn_opencnt=%u > sd_opencnt=%u", 1709 sn->sn_opencnt, sd->sd_opencnt); 1710 sn->sn_opencnt--; 1711 count = --sd->sd_opencnt; 1712 if (vp->v_type == VBLK) { 1713 KASSERTMSG(count == 0, "block device with %u opens", 1714 count + 1); 1715 sd->sd_bdevvp = NULL; 1716 } 1717 if (count == 0) { 1718 KASSERTMSG(sn->sn_opencnt == 0, "sn_opencnt=%u", 1719 sn->sn_opencnt); 1720 KASSERT(!sd->sd_closing); 1721 sd->sd_opened = false; 1722 sd->sd_closing = true; 1723 } 1724 mutex_exit(&device_lock); 1725 1726 if (count != 0) 1727 return 0; 1728 1729 /* 1730 * If we're able to block, release the vnode lock & reacquire. We 1731 * might end up sleeping for someone else who wants our queues. They 1732 * won't get them if we hold the vnode locked. 1733 */ 1734 if (!(flags & FNONBLOCK)) 1735 VOP_UNLOCK(vp); 1736 1737 /* 1738 * If we can cancel all outstanding I/O, then wait for it to 1739 * drain before we call .d_close. Drivers that split up 1740 * .d_cancel and .d_close this way need not have any internal 1741 * mechanism for waiting in .d_close for I/O to drain. 1742 */ 1743 if (vp->v_type == VBLK) 1744 error = bdev_cancel(dev, flags, mode, curlwp); 1745 else 1746 error = cdev_cancel(dev, flags, mode, curlwp); 1747 if (error == 0) 1748 spec_io_drain(sd); 1749 else 1750 KASSERTMSG(error == ENODEV, "cancel dev=0x%lx failed with %d", 1751 (unsigned long)dev, error); 1752 1753 if (vp->v_type == VBLK) 1754 error = bdev_close(dev, flags, mode, curlwp); 1755 else 1756 error = cdev_close(dev, flags, mode, curlwp); 1757 1758 /* 1759 * Wait for all other devsw operations to drain. After this 1760 * point, no bdev/cdev_* can be active for this specdev. 1761 */ 1762 spec_io_drain(sd); 1763 1764 /* 1765 * Wake any spec_open calls waiting for close to finish -- do 1766 * this before reacquiring the vnode lock, because spec_open 1767 * holds the vnode lock while waiting, so doing this after 1768 * reacquiring the lock would deadlock. 
1769 */ 1770 mutex_enter(&device_lock); 1771 KASSERT(!sd->sd_opened); 1772 KASSERT(sd->sd_closing); 1773 sd->sd_closing = false; 1774 cv_broadcast(&specfs_iocv); 1775 mutex_exit(&device_lock); 1776 1777 if (!(flags & FNONBLOCK)) 1778 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 1779 1780 return error; 1781 } 1782 1783 /* 1784 * Print out the contents of a special device vnode. 1785 */ 1786 int 1787 spec_print(void *v) 1788 { 1789 struct vop_print_args /* { 1790 struct vnode *a_vp; 1791 } */ *ap = v; 1792 1793 printf("dev %llu, %llu\n", (unsigned long long)major(ap->a_vp->v_rdev), 1794 (unsigned long long)minor(ap->a_vp->v_rdev)); 1795 return 0; 1796 } 1797 1798 /* 1799 * Return POSIX pathconf information applicable to special devices. 1800 */ 1801 int 1802 spec_pathconf(void *v) 1803 { 1804 struct vop_pathconf_args /* { 1805 struct vnode *a_vp; 1806 int a_name; 1807 register_t *a_retval; 1808 } */ *ap = v; 1809 1810 switch (ap->a_name) { 1811 case _PC_LINK_MAX: 1812 *ap->a_retval = LINK_MAX; 1813 return 0; 1814 case _PC_MAX_CANON: 1815 *ap->a_retval = MAX_CANON; 1816 return 0; 1817 case _PC_MAX_INPUT: 1818 *ap->a_retval = MAX_INPUT; 1819 return 0; 1820 case _PC_PIPE_BUF: 1821 *ap->a_retval = PIPE_BUF; 1822 return 0; 1823 case _PC_CHOWN_RESTRICTED: 1824 *ap->a_retval = 1; 1825 return 0; 1826 case _PC_VDISABLE: 1827 *ap->a_retval = _POSIX_VDISABLE; 1828 return 0; 1829 case _PC_SYNC_IO: 1830 *ap->a_retval = 1; 1831 return 0; 1832 default: 1833 return genfs_pathconf(ap); 1834 } 1835 /* NOTREACHED */ 1836 } 1837 1838 /* 1839 * Advisory record locking support. 1840 */ 1841 int 1842 spec_advlock(void *v) 1843 { 1844 struct vop_advlock_args /* { 1845 struct vnode *a_vp; 1846 void *a_id; 1847 int a_op; 1848 struct flock *a_fl; 1849 int a_flags; 1850 } */ *ap = v; 1851 struct vnode *vp = ap->a_vp; 1852 1853 return lf_advlock(ap, &vp->v_speclockf, (off_t)0); 1854 } 1855