1 /* $NetBSD: spec_vnops.c,v 1.216 2022/10/15 15:20:46 riastradh Exp $ */ 2 3 /*- 4 * Copyright (c) 2008 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 26 * POSSIBILITY OF SUCH DAMAGE. 27 */ 28 29 /* 30 * Copyright (c) 1989, 1993 31 * The Regents of the University of California. All rights reserved. 32 * 33 * Redistribution and use in source and binary forms, with or without 34 * modification, are permitted provided that the following conditions 35 * are met: 36 * 1. Redistributions of source code must retain the above copyright 37 * notice, this list of conditions and the following disclaimer. 38 * 2. Redistributions in binary form must reproduce the above copyright 39 * notice, this list of conditions and the following disclaimer in the 40 * documentation and/or other materials provided with the distribution. 41 * 3. Neither the name of the University nor the names of its contributors 42 * may be used to endorse or promote products derived from this software 43 * without specific prior written permission. 44 * 45 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 46 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 47 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 48 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 49 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 50 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 51 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 52 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 53 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 54 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 55 * SUCH DAMAGE. 
56 * 57 * @(#)spec_vnops.c 8.15 (Berkeley) 7/14/95 58 */ 59 60 #include <sys/cdefs.h> 61 __KERNEL_RCSID(0, "$NetBSD: spec_vnops.c,v 1.216 2022/10/15 15:20:46 riastradh Exp $"); 62 63 #ifdef _KERNEL_OPT 64 #include "opt_ddb.h" 65 #endif 66 67 #include <sys/param.h> 68 #include <sys/proc.h> 69 #include <sys/systm.h> 70 #include <sys/kernel.h> 71 #include <sys/conf.h> 72 #include <sys/buf.h> 73 #include <sys/mount.h> 74 #include <sys/namei.h> 75 #include <sys/vnode_impl.h> 76 #include <sys/stat.h> 77 #include <sys/errno.h> 78 #include <sys/ioctl.h> 79 #include <sys/poll.h> 80 #include <sys/file.h> 81 #include <sys/disklabel.h> 82 #include <sys/disk.h> 83 #include <sys/lockf.h> 84 #include <sys/tty.h> 85 #include <sys/kauth.h> 86 #include <sys/fstrans.h> 87 #include <sys/module.h> 88 #include <sys/atomic.h> 89 90 #include <miscfs/genfs/genfs.h> 91 #include <miscfs/specfs/specdev.h> 92 93 #ifdef DDB 94 #include <ddb/ddb.h> 95 #endif 96 97 /* 98 * Lock order: 99 * 100 * vnode lock 101 * -> device_lock 102 * -> struct vnode::v_interlock 103 */ 104 105 /* symbolic sleep message strings for devices */ 106 const char devopn[] = "devopn"; 107 const char devio[] = "devio"; 108 const char devwait[] = "devwait"; 109 const char devin[] = "devin"; 110 const char devout[] = "devout"; 111 const char devioc[] = "devioc"; 112 const char devcls[] = "devcls"; 113 114 #define SPECHSZ 64 115 #if ((SPECHSZ&(SPECHSZ-1)) == 0) 116 #define SPECHASH(rdev) (((rdev>>5)+(rdev))&(SPECHSZ-1)) 117 #else 118 #define SPECHASH(rdev) (((unsigned)((rdev>>5)+(rdev)))%SPECHSZ) 119 #endif 120 121 static vnode_t *specfs_hash[SPECHSZ]; 122 extern struct mount *dead_rootmount; 123 124 /* 125 * This vnode operations vector is used for special device nodes 126 * created from whole cloth by the kernel. For the ops vector for 127 * vnodes built from special devices found in a filesystem, see (e.g) 128 * ffs_specop_entries[] in ffs_vnops.c or the equivalent for other 129 * filesystems. 
130 */ 131 132 int (**spec_vnodeop_p)(void *); 133 const struct vnodeopv_entry_desc spec_vnodeop_entries[] = { 134 { &vop_default_desc, vn_default_error }, 135 { &vop_parsepath_desc, genfs_parsepath }, /* parsepath */ 136 { &vop_lookup_desc, spec_lookup }, /* lookup */ 137 { &vop_create_desc, genfs_badop }, /* create */ 138 { &vop_mknod_desc, genfs_badop }, /* mknod */ 139 { &vop_open_desc, spec_open }, /* open */ 140 { &vop_close_desc, spec_close }, /* close */ 141 { &vop_access_desc, genfs_ebadf }, /* access */ 142 { &vop_accessx_desc, genfs_ebadf }, /* accessx */ 143 { &vop_getattr_desc, genfs_ebadf }, /* getattr */ 144 { &vop_setattr_desc, genfs_ebadf }, /* setattr */ 145 { &vop_read_desc, spec_read }, /* read */ 146 { &vop_write_desc, spec_write }, /* write */ 147 { &vop_fallocate_desc, genfs_eopnotsupp }, /* fallocate */ 148 { &vop_fdiscard_desc, spec_fdiscard }, /* fdiscard */ 149 { &vop_fcntl_desc, genfs_fcntl }, /* fcntl */ 150 { &vop_ioctl_desc, spec_ioctl }, /* ioctl */ 151 { &vop_poll_desc, spec_poll }, /* poll */ 152 { &vop_kqfilter_desc, spec_kqfilter }, /* kqfilter */ 153 { &vop_revoke_desc, genfs_revoke }, /* revoke */ 154 { &vop_mmap_desc, spec_mmap }, /* mmap */ 155 { &vop_fsync_desc, spec_fsync }, /* fsync */ 156 { &vop_seek_desc, spec_seek }, /* seek */ 157 { &vop_remove_desc, genfs_badop }, /* remove */ 158 { &vop_link_desc, genfs_badop }, /* link */ 159 { &vop_rename_desc, genfs_badop }, /* rename */ 160 { &vop_mkdir_desc, genfs_badop }, /* mkdir */ 161 { &vop_rmdir_desc, genfs_badop }, /* rmdir */ 162 { &vop_symlink_desc, genfs_badop }, /* symlink */ 163 { &vop_readdir_desc, genfs_badop }, /* readdir */ 164 { &vop_readlink_desc, genfs_badop }, /* readlink */ 165 { &vop_abortop_desc, genfs_badop }, /* abortop */ 166 { &vop_inactive_desc, spec_inactive }, /* inactive */ 167 { &vop_reclaim_desc, spec_reclaim }, /* reclaim */ 168 { &vop_lock_desc, genfs_lock }, /* lock */ 169 { &vop_unlock_desc, genfs_unlock }, /* unlock */ 170 { &vop_bmap_desc, spec_bmap }, /* bmap */ 171 { &vop_strategy_desc, spec_strategy }, /* strategy */ 172 { &vop_print_desc, spec_print }, /* print */ 173 { &vop_islocked_desc, genfs_islocked }, /* islocked */ 174 { &vop_pathconf_desc, spec_pathconf }, /* pathconf */ 175 { &vop_advlock_desc, spec_advlock }, /* advlock */ 176 { &vop_bwrite_desc, vn_bwrite }, /* bwrite */ 177 { &vop_getpages_desc, genfs_getpages }, /* getpages */ 178 { &vop_putpages_desc, genfs_putpages }, /* putpages */ 179 { NULL, NULL } 180 }; 181 const struct vnodeopv_desc spec_vnodeop_opv_desc = 182 { &spec_vnodeop_p, spec_vnodeop_entries }; 183 184 static kauth_listener_t rawio_listener; 185 static struct kcondvar specfs_iocv; 186 187 /* Returns true if vnode is /dev/mem or /dev/kmem. */ 188 bool 189 iskmemvp(struct vnode *vp) 190 { 191 return ((vp->v_type == VCHR) && iskmemdev(vp->v_rdev)); 192 } 193 194 /* 195 * Returns true if dev is /dev/mem or /dev/kmem. 
 */
int
iskmemdev(dev_t dev)
{
	/* mem_no is emitted by config(8) to generated devsw.c */
	extern const int mem_no;

	/* minor 14 is /dev/io on i386 with COMPAT_10 */
	return (major(dev) == mem_no && (minor(dev) < 2 || minor(dev) == 14));
}

static int
rawio_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
    void *arg0, void *arg1, void *arg2, void *arg3)
{
	int result;

	result = KAUTH_RESULT_DEFER;

	if ((action != KAUTH_DEVICE_RAWIO_SPEC) &&
	    (action != KAUTH_DEVICE_RAWIO_PASSTHRU))
		return result;

	/* Access is mandated by permissions. */
	result = KAUTH_RESULT_ALLOW;

	return result;
}

void
spec_init(void)
{

	rawio_listener = kauth_listen_scope(KAUTH_SCOPE_DEVICE,
	    rawio_listener_cb, NULL);
	cv_init(&specfs_iocv, "specio");
}

/*
 * spec_io_enter(vp, &sn, &dev)
 *
 *	Enter an operation that may not hold vp's vnode lock or an
 *	fstrans on vp's mount.  Until spec_io_exit, the vnode will not
 *	be revoked.
 *
 *	On success, set sn to the specnode pointer and dev to the dev_t
 *	number and return zero.  Caller must later call spec_io_exit
 *	when done.
 *
 *	On failure, return ENXIO -- the device has been revoked and no
 *	longer exists.
 */
static int
spec_io_enter(struct vnode *vp, struct specnode **snp, dev_t *devp)
{
	dev_t dev;
	struct specnode *sn;
	unsigned iocnt;
	int error = 0;

	mutex_enter(vp->v_interlock);

	/*
	 * Extract all the info we need from the vnode, unless the
	 * vnode has already been reclaimed.  This can happen if the
	 * underlying device has been removed and all the device nodes
	 * for it have been revoked.  The caller may not hold a vnode
	 * lock or fstrans to prevent this from happening before it has
	 * had an opportunity to notice the vnode is dead.
	 */
	if (vdead_check(vp, VDEAD_NOWAIT) != 0 ||
	    (sn = vp->v_specnode) == NULL ||
	    (dev = vp->v_rdev) == NODEV) {
		error = ENXIO;
		goto out;
	}

	/*
	 * Notify spec_close that we are doing an I/O operation which
	 * may not be bracketed by fstrans(9) and thus is not
	 * blocked by vfs suspension.
	 *
	 * We could hold this reference with psref(9) instead, but we
	 * already have to take the interlock for vdead_check, so
	 * there's not much more cost here to another atomic operation.
	 */
	do {
		iocnt = atomic_load_relaxed(&sn->sn_dev->sd_iocnt);
		if (__predict_false(iocnt == UINT_MAX)) {
			/*
			 * The I/O count is limited by the number of
			 * LWPs (which will never overflow this) --
			 * unless one driver uses another driver via
			 * specfs, which is rather unusual, but which
			 * could happen via pud(4) userspace drivers.
			 * We could use a 64-bit count, but can't use
			 * atomics for that on all platforms.
			 * (Probably better to switch to psref or
			 * localcount instead.)
			 */
			error = EBUSY;
			goto out;
		}
	} while (atomic_cas_uint(&sn->sn_dev->sd_iocnt, iocnt, iocnt + 1)
	    != iocnt);

	/* Success! */
	*snp = sn;
	*devp = dev;
	error = 0;

out:	mutex_exit(vp->v_interlock);
	return error;
}

/*
 * spec_io_exit(vp, sn)
 *
 *	Exit an operation entered with a successful spec_io_enter --
 *	allow concurrent spec_node_revoke to proceed.  The argument sn
 *	must match the struct specnode pointer returned by spec_io_enter
 *	for vp.
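 *
 *	A minimal sketch of the intended pairing, mirroring the pattern
 *	spec_ioctl and spec_read use below (cmd, data, and fflag here are
 *	placeholders for the caller's ioctl arguments, not fixed names):
 *
 *		error = spec_io_enter(vp, &sn, &dev);
 *		if (error)
 *			return error;
 *		error = cdev_ioctl(dev, cmd, data, fflag, curlwp);
 *		spec_io_exit(vp, sn);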
318 */ 319 static void 320 spec_io_exit(struct vnode *vp, struct specnode *sn) 321 { 322 struct specdev *sd = sn->sn_dev; 323 unsigned iocnt; 324 325 KASSERT(vp->v_specnode == sn); 326 327 /* 328 * We are done. Notify spec_close if appropriate. The 329 * transition of 1 -> 0 must happen under device_lock so 330 * spec_close doesn't miss a wakeup. 331 */ 332 do { 333 iocnt = atomic_load_relaxed(&sd->sd_iocnt); 334 KASSERT(iocnt > 0); 335 if (iocnt == 1) { 336 mutex_enter(&device_lock); 337 if (atomic_dec_uint_nv(&sd->sd_iocnt) == 0) 338 cv_broadcast(&specfs_iocv); 339 mutex_exit(&device_lock); 340 break; 341 } 342 } while (atomic_cas_uint(&sd->sd_iocnt, iocnt, iocnt - 1) != iocnt); 343 } 344 345 /* 346 * spec_io_drain(sd) 347 * 348 * Wait for all existing spec_io_enter/exit sections to complete. 349 * Caller must ensure spec_io_enter will fail at this point. 350 */ 351 static void 352 spec_io_drain(struct specdev *sd) 353 { 354 355 /* 356 * I/O at the same time as closing is unlikely -- it often 357 * indicates an application bug. 358 */ 359 if (__predict_true(atomic_load_relaxed(&sd->sd_iocnt) == 0)) 360 return; 361 362 mutex_enter(&device_lock); 363 while (atomic_load_relaxed(&sd->sd_iocnt) > 0) 364 cv_wait(&specfs_iocv, &device_lock); 365 mutex_exit(&device_lock); 366 } 367 368 /* 369 * Initialize a vnode that represents a device. 370 */ 371 void 372 spec_node_init(vnode_t *vp, dev_t rdev) 373 { 374 specnode_t *sn; 375 specdev_t *sd; 376 vnode_t *vp2; 377 vnode_t **vpp; 378 379 KASSERT(vp->v_type == VBLK || vp->v_type == VCHR); 380 KASSERT(vp->v_specnode == NULL); 381 382 /* 383 * Search the hash table for this device. If known, add a 384 * reference to the device structure. If not known, create 385 * a new entry to represent the device. In all cases add 386 * the vnode to the hash table. 387 */ 388 sn = kmem_alloc(sizeof(*sn), KM_SLEEP); 389 sd = kmem_alloc(sizeof(*sd), KM_SLEEP); 390 mutex_enter(&device_lock); 391 vpp = &specfs_hash[SPECHASH(rdev)]; 392 for (vp2 = *vpp; vp2 != NULL; vp2 = vp2->v_specnext) { 393 KASSERT(vp2->v_specnode != NULL); 394 if (rdev == vp2->v_rdev && vp->v_type == vp2->v_type) { 395 break; 396 } 397 } 398 if (vp2 == NULL) { 399 /* No existing record, create a new one. */ 400 sd->sd_rdev = rdev; 401 sd->sd_mountpoint = NULL; 402 sd->sd_lockf = NULL; 403 sd->sd_refcnt = 1; 404 sd->sd_opencnt = 0; 405 sd->sd_bdevvp = NULL; 406 sd->sd_iocnt = 0; 407 sd->sd_opened = false; 408 sd->sd_closing = false; 409 sn->sn_dev = sd; 410 sd = NULL; 411 } else { 412 /* Use the existing record. */ 413 sn->sn_dev = vp2->v_specnode->sn_dev; 414 sn->sn_dev->sd_refcnt++; 415 } 416 /* Insert vnode into the hash chain. */ 417 sn->sn_opencnt = 0; 418 sn->sn_rdev = rdev; 419 sn->sn_gone = false; 420 vp->v_specnode = sn; 421 vp->v_specnext = *vpp; 422 *vpp = vp; 423 mutex_exit(&device_lock); 424 425 /* Free the record we allocated if unused. */ 426 if (sd != NULL) { 427 kmem_free(sd, sizeof(*sd)); 428 } 429 } 430 431 /* 432 * Lookup a vnode by device number and return it referenced. 433 */ 434 int 435 spec_node_lookup_by_dev(enum vtype type, dev_t dev, int flags, vnode_t **vpp) 436 { 437 int error; 438 vnode_t *vp; 439 440 top: mutex_enter(&device_lock); 441 for (vp = specfs_hash[SPECHASH(dev)]; vp; vp = vp->v_specnext) { 442 if (type == vp->v_type && dev == vp->v_rdev) { 443 mutex_enter(vp->v_interlock); 444 /* If clean or being cleaned, then ignore it. 
			 */
			if (vdead_check(vp, VDEAD_NOWAIT) == 0)
				break;
			if ((flags & VDEAD_NOWAIT) == 0) {
				mutex_exit(&device_lock);
				/*
				 * It may be being revoked as we speak,
				 * and the caller wants to wait until
				 * all revocation has completed.  Let
				 * vcache_vget wait for it to finish
				 * dying; as a side effect, vcache_vget
				 * releases vp->v_interlock.  Note that
				 * vcache_vget cannot succeed at this
				 * point because vdead_check already
				 * failed.
				 */
				error = vcache_vget(vp);
				KASSERT(error);
				goto top;
			}
			mutex_exit(vp->v_interlock);
		}
	}
	KASSERT(vp == NULL || mutex_owned(vp->v_interlock));
	if (vp == NULL) {
		mutex_exit(&device_lock);
		return ENOENT;
	}
	/*
	 * If it is an opened block device return the opened vnode.
	 */
	if (type == VBLK && vp->v_specnode->sn_dev->sd_bdevvp != NULL) {
		mutex_exit(vp->v_interlock);
		vp = vp->v_specnode->sn_dev->sd_bdevvp;
		mutex_enter(vp->v_interlock);
	}
	mutex_exit(&device_lock);
	error = vcache_vget(vp);
	if (error != 0)
		return error;
	*vpp = vp;

	return 0;
}

/*
 * Lookup a vnode by file system mounted on and return it referenced.
 */
int
spec_node_lookup_by_mount(struct mount *mp, vnode_t **vpp)
{
	int i, error;
	vnode_t *vp, *vq;

	mutex_enter(&device_lock);
	for (i = 0, vq = NULL; i < SPECHSZ && vq == NULL; i++) {
		for (vp = specfs_hash[i]; vp; vp = vp->v_specnext) {
			if (vp->v_type != VBLK)
				continue;
			vq = vp->v_specnode->sn_dev->sd_bdevvp;
			if (vq != NULL &&
			    vq->v_specnode->sn_dev->sd_mountpoint == mp)
				break;
			vq = NULL;
		}
	}
	if (vq == NULL) {
		mutex_exit(&device_lock);
		return ENOENT;
	}
	mutex_enter(vq->v_interlock);
	mutex_exit(&device_lock);
	error = vcache_vget(vq);
	if (error != 0)
		return error;
	*vpp = vq;

	return 0;
}

/*
 * Get the file system mounted on this block device.
 *
 * XXX Caller should hold the vnode lock -- shared or exclusive -- so
 * that this can't change, and the vnode can't be revoked while we
 * examine it.  But not all callers do, and they're scattered through a
 * lot of file systems, so we can't assert this yet.
 */
struct mount *
spec_node_getmountedfs(vnode_t *devvp)
{
	struct mount *mp;

	KASSERT(devvp->v_type == VBLK);
	mp = devvp->v_specnode->sn_dev->sd_mountpoint;

	return mp;
}

/*
 * Set the file system mounted on this block device.
 *
 * XXX Caller should hold the vnode lock exclusively so this can't be
 * changed or assumed by spec_node_getmountedfs while we change it, and
 * the vnode can't be revoked while we handle it.  But not all callers
 * do, and they're scattered through a lot of file systems, so we can't
 * assert this yet.  Instead, for now, we'll take an I/O reference so
 * at least the ioctl doesn't race with revoke/detach.
 *
 * If you do change this to assert an exclusive vnode lock, you must
 * also do vdead_check before trying bdev_ioctl, because the vnode may
 * have been revoked by the time the caller locked it, and this is
 * _not_ a vop -- calls to spec_node_setmountedfs don't go through
 * v_op, so revoking the vnode doesn't prevent further calls.
 *
 * XXX Caller should additionally have the vnode open, at least if mp
 * is nonnull, but I'm not sure all callers do that -- need to audit.
562 * Currently udf closes the vnode before clearing the mount. 563 */ 564 void 565 spec_node_setmountedfs(vnode_t *devvp, struct mount *mp) 566 { 567 struct dkwedge_info dkw; 568 struct specnode *sn; 569 dev_t dev; 570 int error; 571 572 KASSERT(devvp->v_type == VBLK); 573 574 error = spec_io_enter(devvp, &sn, &dev); 575 if (error) 576 return; 577 578 KASSERT(sn->sn_dev->sd_mountpoint == NULL || mp == NULL); 579 sn->sn_dev->sd_mountpoint = mp; 580 if (mp == NULL) 581 goto out; 582 583 error = bdev_ioctl(dev, DIOCGWEDGEINFO, &dkw, FREAD, curlwp); 584 if (error) 585 goto out; 586 587 strlcpy(mp->mnt_stat.f_mntfromlabel, dkw.dkw_wname, 588 sizeof(mp->mnt_stat.f_mntfromlabel)); 589 590 out: spec_io_exit(devvp, sn); 591 } 592 593 /* 594 * A vnode representing a special device is going away. Close 595 * the device if the vnode holds it open. 596 */ 597 void 598 spec_node_revoke(vnode_t *vp) 599 { 600 specnode_t *sn; 601 specdev_t *sd; 602 struct vnode **vpp; 603 604 KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE); 605 606 sn = vp->v_specnode; 607 sd = sn->sn_dev; 608 609 KASSERT(vp->v_type == VBLK || vp->v_type == VCHR); 610 KASSERT(vp->v_specnode != NULL); 611 KASSERT(sn->sn_gone == false); 612 613 mutex_enter(&device_lock); 614 KASSERTMSG(sn->sn_opencnt <= sd->sd_opencnt, 615 "sn_opencnt=%u > sd_opencnt=%u", 616 sn->sn_opencnt, sd->sd_opencnt); 617 sn->sn_gone = true; 618 if (sn->sn_opencnt != 0) { 619 sd->sd_opencnt -= (sn->sn_opencnt - 1); 620 sn->sn_opencnt = 1; 621 mutex_exit(&device_lock); 622 623 VOP_CLOSE(vp, FNONBLOCK, NOCRED); 624 625 mutex_enter(&device_lock); 626 KASSERT(sn->sn_opencnt == 0); 627 } 628 629 /* 630 * We may have revoked the vnode in this thread while another 631 * thread was in the middle of spec_close, in the window when 632 * spec_close releases the vnode lock to call .d_close for the 633 * last close. In that case, wait for the concurrent 634 * spec_close to complete. 635 */ 636 while (sd->sd_closing) 637 cv_wait(&specfs_iocv, &device_lock); 638 639 /* 640 * Remove from the hash so lookups stop returning this 641 * specnode. We will dissociate it from the specdev -- and 642 * possibly free the specdev -- in spec_node_destroy. 643 */ 644 KASSERT(sn->sn_gone); 645 KASSERT(sn->sn_opencnt == 0); 646 for (vpp = &specfs_hash[SPECHASH(vp->v_rdev)];; 647 vpp = &(*vpp)->v_specnext) { 648 if (*vpp == vp) { 649 *vpp = vp->v_specnext; 650 vp->v_specnext = NULL; 651 break; 652 } 653 } 654 mutex_exit(&device_lock); 655 } 656 657 /* 658 * A vnode representing a special device is being recycled. 659 * Destroy the specfs component. 660 */ 661 void 662 spec_node_destroy(vnode_t *vp) 663 { 664 specnode_t *sn; 665 specdev_t *sd; 666 int refcnt; 667 668 sn = vp->v_specnode; 669 sd = sn->sn_dev; 670 671 KASSERT(vp->v_type == VBLK || vp->v_type == VCHR); 672 KASSERT(vp->v_specnode != NULL); 673 KASSERT(sn->sn_opencnt == 0); 674 675 mutex_enter(&device_lock); 676 sn = vp->v_specnode; 677 vp->v_specnode = NULL; 678 refcnt = sd->sd_refcnt--; 679 KASSERT(refcnt > 0); 680 mutex_exit(&device_lock); 681 682 /* If the device is no longer in use, destroy our record. */ 683 if (refcnt == 1) { 684 KASSERT(sd->sd_iocnt == 0); 685 KASSERT(sd->sd_opencnt == 0); 686 KASSERT(sd->sd_bdevvp == NULL); 687 kmem_free(sd, sizeof(*sd)); 688 } 689 kmem_free(sn, sizeof(*sn)); 690 } 691 692 /* 693 * Trivial lookup routine that always fails. 
694 */ 695 int 696 spec_lookup(void *v) 697 { 698 struct vop_lookup_v2_args /* { 699 struct vnode *a_dvp; 700 struct vnode **a_vpp; 701 struct componentname *a_cnp; 702 } */ *ap = v; 703 704 *ap->a_vpp = NULL; 705 return (ENOTDIR); 706 } 707 708 typedef int (*spec_ioctl_t)(dev_t, u_long, void *, int, struct lwp *); 709 710 /* 711 * Open a special file. 712 */ 713 /* ARGSUSED */ 714 int 715 spec_open(void *v) 716 { 717 struct vop_open_args /* { 718 struct vnode *a_vp; 719 int a_mode; 720 kauth_cred_t a_cred; 721 } */ *ap = v; 722 struct lwp *l = curlwp; 723 struct vnode *vp = ap->a_vp; 724 dev_t dev, dev1; 725 int error; 726 enum kauth_device_req req; 727 specnode_t *sn, *sn1; 728 specdev_t *sd; 729 spec_ioctl_t ioctl; 730 u_int gen = 0; 731 const char *name = NULL; 732 bool needclose = false; 733 struct partinfo pi; 734 735 KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE); 736 KASSERTMSG(vp->v_type == VBLK || vp->v_type == VCHR, "type=%d", 737 vp->v_type); 738 739 dev = vp->v_rdev; 740 sn = vp->v_specnode; 741 sd = sn->sn_dev; 742 743 /* 744 * Don't allow open if fs is mounted -nodev. 745 */ 746 if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_NODEV)) 747 return (ENXIO); 748 749 switch (ap->a_mode & (FREAD | FWRITE)) { 750 case FREAD | FWRITE: 751 req = KAUTH_REQ_DEVICE_RAWIO_SPEC_RW; 752 break; 753 case FWRITE: 754 req = KAUTH_REQ_DEVICE_RAWIO_SPEC_WRITE; 755 break; 756 default: 757 req = KAUTH_REQ_DEVICE_RAWIO_SPEC_READ; 758 break; 759 } 760 error = kauth_authorize_device_spec(ap->a_cred, req, vp); 761 if (error != 0) 762 return (error); 763 764 /* 765 * Acquire an open reference -- as long as we hold onto it, and 766 * the vnode isn't revoked, it can't be closed, and the vnode 767 * can't be revoked until we release the vnode lock. 768 */ 769 mutex_enter(&device_lock); 770 KASSERT(!sn->sn_gone); 771 switch (vp->v_type) { 772 case VCHR: 773 /* 774 * Character devices can accept opens from multiple 775 * vnodes. But first, wait for any close to finish. 776 * Wait under the vnode lock so we don't have to worry 777 * about the vnode being revoked while we wait. 778 */ 779 while (sd->sd_closing) { 780 error = cv_wait_sig(&specfs_iocv, &device_lock); 781 if (error) 782 break; 783 } 784 if (error) 785 break; 786 sd->sd_opencnt++; 787 sn->sn_opencnt++; 788 KASSERTMSG(sn->sn_opencnt <= sd->sd_opencnt, 789 "sn_opencnt=%u > sd_opencnt=%u", 790 sn->sn_opencnt, sd->sd_opencnt); 791 break; 792 case VBLK: 793 /* 794 * For block devices, permit only one open. The buffer 795 * cache cannot remain self-consistent with multiple 796 * vnodes holding a block device open. 797 * 798 * Treat zero opencnt with non-NULL mountpoint as open. 799 * This may happen after forced detach of a mounted device. 800 * 801 * Also treat sd_closing, meaning there is a concurrent 802 * close in progress, as still open. 803 */ 804 if (sd->sd_opencnt != 0 || 805 sd->sd_mountpoint != NULL || 806 sd->sd_closing) { 807 error = EBUSY; 808 break; 809 } 810 KASSERTMSG(sn->sn_opencnt == 0, "sn_opencnt=%u", 811 sn->sn_opencnt); 812 sn->sn_opencnt = 1; 813 sd->sd_opencnt = 1; 814 sd->sd_bdevvp = vp; 815 break; 816 default: 817 panic("invalid specfs vnode type: %d", vp->v_type); 818 } 819 mutex_exit(&device_lock); 820 if (error) 821 return error; 822 823 /* 824 * Set VV_ISTTY if this is a tty cdev. 825 * 826 * XXX This does the wrong thing if the module has to be 827 * autoloaded. 
We should maybe set this after autoloading 828 * modules and calling .d_open successfully, except (a) we need 829 * the vnode lock to touch it, and (b) once we acquire the 830 * vnode lock again, the vnode may have been revoked, and 831 * deadfs's dead_read needs VV_ISTTY to be already set in order 832 * to return the right answer. So this needs some additional 833 * synchronization to be made to work correctly with tty driver 834 * module autoload. For now, let's just hope it doesn't cause 835 * too much trouble for a tty from an autoloaded driver module 836 * to fail with EIO instead of returning EOF. 837 */ 838 if (vp->v_type == VCHR) { 839 if (cdev_type(dev) == D_TTY) 840 vp->v_vflag |= VV_ISTTY; 841 } 842 843 /* 844 * Because opening the device may block indefinitely, e.g. when 845 * opening a tty, and loading a module may cross into many 846 * other subsystems, we must not hold the vnode lock while 847 * calling .d_open, so release it now and reacquire it when 848 * done. 849 * 850 * Take an I/O reference so that any concurrent spec_close via 851 * spec_node_revoke will wait for us to finish calling .d_open. 852 * The vnode can't be dead at this point because we have it 853 * locked. Note that if revoked, the driver must interrupt 854 * .d_open before spec_close starts waiting for I/O to drain so 855 * this doesn't deadlock. 856 */ 857 VOP_UNLOCK(vp); 858 error = spec_io_enter(vp, &sn1, &dev1); 859 if (error) { 860 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 861 return error; 862 } 863 KASSERT(sn1 == sn); 864 KASSERT(dev1 == dev); 865 866 /* 867 * Open the device. If .d_open returns ENXIO (device not 868 * configured), the driver may not be loaded, so try 869 * autoloading a module and then try .d_open again if anything 870 * got loaded. 871 */ 872 switch (vp->v_type) { 873 case VCHR: 874 do { 875 const struct cdevsw *cdev; 876 877 gen = module_gen; 878 error = cdev_open(dev, ap->a_mode, S_IFCHR, l); 879 if (error != ENXIO) 880 break; 881 882 /* Check if we already have a valid driver */ 883 mutex_enter(&device_lock); 884 cdev = cdevsw_lookup(dev); 885 mutex_exit(&device_lock); 886 if (cdev != NULL) 887 break; 888 889 /* Get device name from devsw_conv array */ 890 if ((name = cdevsw_getname(major(dev))) == NULL) 891 break; 892 893 /* Try to autoload device module */ 894 (void) module_autoload(name, MODULE_CLASS_DRIVER); 895 } while (gen != module_gen); 896 break; 897 898 case VBLK: 899 do { 900 const struct bdevsw *bdev; 901 902 gen = module_gen; 903 error = bdev_open(dev, ap->a_mode, S_IFBLK, l); 904 if (error != ENXIO) 905 break; 906 907 /* Check if we already have a valid driver */ 908 mutex_enter(&device_lock); 909 bdev = bdevsw_lookup(dev); 910 mutex_exit(&device_lock); 911 if (bdev != NULL) 912 break; 913 914 /* Get device name from devsw_conv array */ 915 if ((name = bdevsw_getname(major(dev))) == NULL) 916 break; 917 918 /* Try to autoload device module */ 919 (void) module_autoload(name, MODULE_CLASS_DRIVER); 920 } while (gen != module_gen); 921 break; 922 923 default: 924 __unreachable(); 925 } 926 927 /* 928 * Release the I/O reference now that we have called .d_open, 929 * and reacquire the vnode lock. At this point, the device may 930 * have been revoked, so we must tread carefully. However, sn 931 * and sd remain valid pointers until we drop our reference. 
932 */ 933 spec_io_exit(vp, sn); 934 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 935 KASSERT(vp->v_specnode == sn); 936 937 /* 938 * If it has been revoked since we released the vnode lock and 939 * reacquired it, then spec_node_revoke has closed it, and we 940 * must fail with EBADF. 941 * 942 * Otherwise, if opening it failed, back out and release the 943 * open reference. If it was ever successfully opened and we 944 * got the last reference this way, it's now our job to close 945 * it. This might happen in the following scenario: 946 * 947 * Thread 1 Thread 2 948 * VOP_OPEN 949 * ... 950 * .d_open -> 0 (success) 951 * acquire vnode lock 952 * do stuff VOP_OPEN 953 * release vnode lock ... 954 * .d_open -> EBUSY 955 * VOP_CLOSE 956 * acquire vnode lock 957 * --sd_opencnt != 0 958 * => no .d_close 959 * release vnode lock 960 * acquire vnode lock 961 * --sd_opencnt == 0 962 * 963 * We can't resolve this by making spec_close wait for .d_open 964 * to complete before examining sd_opencnt, because .d_open can 965 * hang indefinitely, e.g. for a tty. 966 */ 967 mutex_enter(&device_lock); 968 if (sn->sn_gone) { 969 if (error == 0) 970 error = EBADF; 971 } else if (error == 0) { 972 /* 973 * Device has not been revoked, so our opencnt can't 974 * have gone away at this point -- transition to 975 * sn_gone=true happens before transition to 976 * sn_opencnt=0 in spec_node_revoke. 977 */ 978 KASSERT(sd->sd_opencnt); 979 KASSERT(sn->sn_opencnt); 980 KASSERTMSG(sn->sn_opencnt <= sd->sd_opencnt, 981 "sn_opencnt=%u > sd_opencnt=%u", 982 sn->sn_opencnt, sd->sd_opencnt); 983 KASSERT(!sd->sd_closing); 984 sd->sd_opened = true; 985 } else if (sd->sd_opencnt == 1 && sd->sd_opened) { 986 /* 987 * We're the last reference to a _previous_ open even 988 * though this one failed, so we have to close it. 989 * Don't decrement the reference count here -- 990 * spec_close will do that. 991 */ 992 KASSERT(sn->sn_opencnt == 1); 993 needclose = true; 994 } else { 995 KASSERT(sd->sd_opencnt); 996 KASSERT(sn->sn_opencnt); 997 KASSERTMSG(sn->sn_opencnt <= sd->sd_opencnt, 998 "sn_opencnt=%u > sd_opencnt=%u", 999 sn->sn_opencnt, sd->sd_opencnt); 1000 sd->sd_opencnt--; 1001 sn->sn_opencnt--; 1002 if (vp->v_type == VBLK) 1003 sd->sd_bdevvp = NULL; 1004 } 1005 mutex_exit(&device_lock); 1006 1007 /* 1008 * If this open failed, but the device was previously opened, 1009 * and another thread concurrently closed the vnode while we 1010 * were in the middle of reopening it, the other thread will 1011 * see sd_opencnt > 0 and thus decide not to call .d_close -- 1012 * it is now our responsibility to do so. 1013 * 1014 * XXX The flags passed to VOP_CLOSE here are wrong, but 1015 * drivers can't rely on FREAD|FWRITE anyway -- e.g., consider 1016 * a device opened by thread 0 with O_READ, then opened by 1017 * thread 1 with O_WRITE, then closed by thread 0, and finally 1018 * closed by thread 1; the last .d_close call will have FWRITE 1019 * but not FREAD. We should just eliminate the FREAD/FWRITE 1020 * parameter to .d_close altogether. 1021 */ 1022 if (needclose) { 1023 KASSERT(error); 1024 VOP_CLOSE(vp, FNONBLOCK, NOCRED); 1025 } 1026 1027 /* If anything went wrong, we're done. */ 1028 if (error) 1029 return error; 1030 1031 /* 1032 * For disk devices, automagically set the vnode size to the 1033 * partition size, if we can. This applies to block devices 1034 * and character devices alike -- every block device must have 1035 * a corresponding character device. 
And if the module is 1036 * loaded it will remain loaded until we're done here (it is 1037 * forbidden to devsw_detach until closed). So it is safe to 1038 * query cdev_type unconditionally here. 1039 */ 1040 if (cdev_type(dev) == D_DISK) { 1041 ioctl = vp->v_type == VCHR ? cdev_ioctl : bdev_ioctl; 1042 if ((*ioctl)(dev, DIOCGPARTINFO, &pi, FREAD, curlwp) == 0) 1043 uvm_vnp_setsize(vp, 1044 (voff_t)pi.pi_secsize * pi.pi_size); 1045 } 1046 1047 /* Success! */ 1048 return 0; 1049 } 1050 1051 /* 1052 * Vnode op for read 1053 */ 1054 /* ARGSUSED */ 1055 int 1056 spec_read(void *v) 1057 { 1058 struct vop_read_args /* { 1059 struct vnode *a_vp; 1060 struct uio *a_uio; 1061 int a_ioflag; 1062 kauth_cred_t a_cred; 1063 } */ *ap = v; 1064 struct vnode *vp = ap->a_vp; 1065 struct uio *uio = ap->a_uio; 1066 struct lwp *l = curlwp; 1067 struct specnode *sn; 1068 dev_t dev; 1069 struct buf *bp; 1070 daddr_t bn; 1071 int bsize, bscale; 1072 struct partinfo pi; 1073 int n, on; 1074 int error = 0; 1075 int i, nra; 1076 daddr_t lastbn, *rablks; 1077 int *rasizes; 1078 int nrablks, ratogo; 1079 1080 KASSERT(uio->uio_rw == UIO_READ); 1081 KASSERTMSG(VMSPACE_IS_KERNEL_P(uio->uio_vmspace) || 1082 uio->uio_vmspace == curproc->p_vmspace, 1083 "vmspace belongs to neither kernel nor curproc"); 1084 1085 if (uio->uio_resid == 0) 1086 return (0); 1087 1088 switch (vp->v_type) { 1089 1090 case VCHR: 1091 /* 1092 * Release the lock while we sleep -- possibly 1093 * indefinitely, if this is, e.g., a tty -- in 1094 * cdev_read, so we don't hold up everything else that 1095 * might want access to the vnode. 1096 * 1097 * But before we issue the read, take an I/O reference 1098 * to the specnode so close will know when we're done 1099 * reading. Note that the moment we release the lock, 1100 * the vnode's identity may change; hence spec_io_enter 1101 * may fail, and the caller may have a dead vnode on 1102 * their hands, if the file system on which vp lived 1103 * has been unmounted. 1104 */ 1105 VOP_UNLOCK(vp); 1106 error = spec_io_enter(vp, &sn, &dev); 1107 if (error) 1108 goto out; 1109 error = cdev_read(dev, uio, ap->a_ioflag); 1110 spec_io_exit(vp, sn); 1111 out: /* XXX What if the caller held an exclusive lock? 
		 */
		vn_lock(vp, LK_SHARED | LK_RETRY);
		return (error);

	case VBLK:
		KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp);
		if (uio->uio_offset < 0)
			return (EINVAL);

		if (bdev_ioctl(vp->v_rdev, DIOCGPARTINFO, &pi, FREAD, l) == 0)
			bsize = imin(imax(pi.pi_bsize, DEV_BSIZE), MAXBSIZE);
		else
			bsize = BLKDEV_IOSIZE;

		bscale = bsize >> DEV_BSHIFT;

		nra = uimin(16 * MAXPHYS / bsize - 1, 511);
		rablks = kmem_alloc(nra * sizeof(*rablks), KM_SLEEP);
		rasizes = kmem_alloc(nra * sizeof(*rasizes), KM_SLEEP);
		lastbn = ((uio->uio_offset + uio->uio_resid - 1) >> DEV_BSHIFT)
		    &~ (bscale - 1);
		nrablks = ratogo = 0;
		do {
			bn = (uio->uio_offset >> DEV_BSHIFT) &~ (bscale - 1);
			on = uio->uio_offset % bsize;
			n = uimin((unsigned)(bsize - on), uio->uio_resid);

			if (ratogo == 0) {
				nrablks = uimin((lastbn - bn) / bscale, nra);
				ratogo = nrablks;

				for (i = 0; i < nrablks; ++i) {
					rablks[i] = bn + (i+1) * bscale;
					rasizes[i] = bsize;
				}

				error = breadn(vp, bn, bsize,
				    rablks, rasizes, nrablks,
				    0, &bp);
			} else {
				if (ratogo > 0)
					--ratogo;
				error = bread(vp, bn, bsize, 0, &bp);
			}
			if (error)
				break;
			n = uimin(n, bsize - bp->b_resid);
			error = uiomove((char *)bp->b_data + on, n, uio);
			brelse(bp, 0);
		} while (error == 0 && uio->uio_resid > 0 && n != 0);

		kmem_free(rablks, nra * sizeof(*rablks));
		kmem_free(rasizes, nra * sizeof(*rasizes));

		return (error);

	default:
		panic("spec_read type");
	}
	/* NOTREACHED */
}

/*
 * Vnode op for write
 */
/* ARGSUSED */
int
spec_write(void *v)
{
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		kauth_cred_t a_cred;
	} */ *ap = v;
	struct vnode *vp = ap->a_vp;
	struct uio *uio = ap->a_uio;
	struct lwp *l = curlwp;
	struct specnode *sn;
	dev_t dev;
	struct buf *bp;
	daddr_t bn;
	int bsize, bscale;
	struct partinfo pi;
	int n, on;
	int error = 0;

	KASSERT(uio->uio_rw == UIO_WRITE);
	KASSERTMSG(VMSPACE_IS_KERNEL_P(uio->uio_vmspace) ||
	    uio->uio_vmspace == curproc->p_vmspace,
	    "vmspace belongs to neither kernel nor curproc");

	switch (vp->v_type) {

	case VCHR:
		/*
		 * Release the lock while we sleep -- possibly
		 * indefinitely, if this is, e.g., a tty -- in
		 * cdev_write, so we don't hold up everything else that
		 * might want access to the vnode.
		 *
		 * But before we issue the write, take an I/O reference
		 * to the specnode so close will know when we're done
		 * writing.  Note that the moment we release the lock,
		 * the vnode's identity may change; hence spec_io_enter
		 * may fail, and the caller may have a dead vnode on
		 * their hands, if the file system on which vp lived
		 * has been unmounted.
1219 */ 1220 VOP_UNLOCK(vp); 1221 error = spec_io_enter(vp, &sn, &dev); 1222 if (error) 1223 goto out; 1224 error = cdev_write(dev, uio, ap->a_ioflag); 1225 spec_io_exit(vp, sn); 1226 out: vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 1227 return (error); 1228 1229 case VBLK: 1230 KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp); 1231 if (uio->uio_resid == 0) 1232 return (0); 1233 if (uio->uio_offset < 0) 1234 return (EINVAL); 1235 1236 if (bdev_ioctl(vp->v_rdev, DIOCGPARTINFO, &pi, FREAD, l) == 0) 1237 bsize = imin(imax(pi.pi_bsize, DEV_BSIZE), MAXBSIZE); 1238 else 1239 bsize = BLKDEV_IOSIZE; 1240 1241 bscale = bsize >> DEV_BSHIFT; 1242 do { 1243 bn = (uio->uio_offset >> DEV_BSHIFT) &~ (bscale - 1); 1244 on = uio->uio_offset % bsize; 1245 n = uimin((unsigned)(bsize - on), uio->uio_resid); 1246 if (n == bsize) 1247 bp = getblk(vp, bn, bsize, 0, 0); 1248 else 1249 error = bread(vp, bn, bsize, B_MODIFY, &bp); 1250 if (error) { 1251 return (error); 1252 } 1253 n = uimin(n, bsize - bp->b_resid); 1254 error = uiomove((char *)bp->b_data + on, n, uio); 1255 if (error) 1256 brelse(bp, 0); 1257 else { 1258 if (n + on == bsize) 1259 bawrite(bp); 1260 else 1261 bdwrite(bp); 1262 error = bp->b_error; 1263 } 1264 } while (error == 0 && uio->uio_resid > 0 && n != 0); 1265 return (error); 1266 1267 default: 1268 panic("spec_write type"); 1269 } 1270 /* NOTREACHED */ 1271 } 1272 1273 /* 1274 * fdiscard, which on disk devices becomes TRIM. 1275 */ 1276 int 1277 spec_fdiscard(void *v) 1278 { 1279 struct vop_fdiscard_args /* { 1280 struct vnode *a_vp; 1281 off_t a_pos; 1282 off_t a_len; 1283 } */ *ap = v; 1284 struct vnode *vp = ap->a_vp; 1285 dev_t dev; 1286 1287 KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE); 1288 1289 dev = vp->v_rdev; 1290 1291 switch (vp->v_type) { 1292 case VCHR: 1293 // this is not stored for character devices 1294 //KASSERT(vp == vp->v_specnode->sn_dev->sd_cdevvp); 1295 return cdev_discard(dev, ap->a_pos, ap->a_len); 1296 case VBLK: 1297 KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp); 1298 return bdev_discard(dev, ap->a_pos, ap->a_len); 1299 default: 1300 panic("spec_fdiscard: not a device\n"); 1301 } 1302 } 1303 1304 /* 1305 * Device ioctl operation. 
1306 */ 1307 /* ARGSUSED */ 1308 int 1309 spec_ioctl(void *v) 1310 { 1311 struct vop_ioctl_args /* { 1312 struct vnode *a_vp; 1313 u_long a_command; 1314 void *a_data; 1315 int a_fflag; 1316 kauth_cred_t a_cred; 1317 } */ *ap = v; 1318 struct vnode *vp = ap->a_vp; 1319 struct specnode *sn; 1320 dev_t dev; 1321 int error; 1322 1323 error = spec_io_enter(vp, &sn, &dev); 1324 if (error) 1325 return error; 1326 1327 switch (vp->v_type) { 1328 case VCHR: 1329 error = cdev_ioctl(dev, ap->a_command, ap->a_data, 1330 ap->a_fflag, curlwp); 1331 break; 1332 case VBLK: 1333 KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp); 1334 error = bdev_ioctl(dev, ap->a_command, ap->a_data, 1335 ap->a_fflag, curlwp); 1336 break; 1337 default: 1338 panic("spec_ioctl"); 1339 /* NOTREACHED */ 1340 } 1341 1342 spec_io_exit(vp, sn); 1343 return error; 1344 } 1345 1346 /* ARGSUSED */ 1347 int 1348 spec_poll(void *v) 1349 { 1350 struct vop_poll_args /* { 1351 struct vnode *a_vp; 1352 int a_events; 1353 } */ *ap = v; 1354 struct vnode *vp = ap->a_vp; 1355 struct specnode *sn; 1356 dev_t dev; 1357 int revents; 1358 1359 if (spec_io_enter(vp, &sn, &dev) != 0) 1360 return POLLERR; 1361 1362 switch (vp->v_type) { 1363 case VCHR: 1364 revents = cdev_poll(dev, ap->a_events, curlwp); 1365 break; 1366 default: 1367 revents = genfs_poll(v); 1368 break; 1369 } 1370 1371 spec_io_exit(vp, sn); 1372 return revents; 1373 } 1374 1375 /* ARGSUSED */ 1376 int 1377 spec_kqfilter(void *v) 1378 { 1379 struct vop_kqfilter_args /* { 1380 struct vnode *a_vp; 1381 struct proc *a_kn; 1382 } */ *ap = v; 1383 struct vnode *vp = ap->a_vp; 1384 struct specnode *sn; 1385 dev_t dev; 1386 int error; 1387 1388 error = spec_io_enter(vp, &sn, &dev); 1389 if (error) 1390 return error; 1391 1392 switch (vp->v_type) { 1393 case VCHR: 1394 error = cdev_kqfilter(dev, ap->a_kn); 1395 break; 1396 default: 1397 /* 1398 * Block devices don't support kqfilter, and refuse it 1399 * for any other files (like those vflush()ed) too. 1400 */ 1401 error = EOPNOTSUPP; 1402 break; 1403 } 1404 1405 spec_io_exit(vp, sn); 1406 return error; 1407 } 1408 1409 /* 1410 * Allow mapping of only D_DISK. This is called only for VBLK. 1411 */ 1412 int 1413 spec_mmap(void *v) 1414 { 1415 struct vop_mmap_args /* { 1416 struct vnode *a_vp; 1417 vm_prot_t a_prot; 1418 kauth_cred_t a_cred; 1419 } */ *ap = v; 1420 struct vnode *vp = ap->a_vp; 1421 struct specnode *sn; 1422 dev_t dev; 1423 int error; 1424 1425 KASSERT(vp->v_type == VBLK); 1426 1427 error = spec_io_enter(vp, &sn, &dev); 1428 if (error) 1429 return error; 1430 1431 error = bdev_type(dev) == D_DISK ? 
	    0 : EINVAL;

	spec_io_exit(vp, sn);
	return error;
}

/*
 * Synch buffers associated with a block device
 */
/* ARGSUSED */
int
spec_fsync(void *v)
{
	struct vop_fsync_args /* {
		struct vnode *a_vp;
		kauth_cred_t a_cred;
		int a_flags;
		off_t offlo;
		off_t offhi;
	} */ *ap = v;
	struct vnode *vp = ap->a_vp;
	struct mount *mp;
	int error;

	if (vp->v_type == VBLK) {
		if ((mp = spec_node_getmountedfs(vp)) != NULL) {
			error = VFS_FSYNC(mp, vp, ap->a_flags);
			if (error != EOPNOTSUPP)
				return error;
		}
		return vflushbuf(vp, ap->a_flags);
	}
	return (0);
}

/*
 * Just call the device strategy routine
 */
int
spec_strategy(void *v)
{
	struct vop_strategy_args /* {
		struct vnode *a_vp;
		struct buf *a_bp;
	} */ *ap = v;
	struct vnode *vp = ap->a_vp;
	struct buf *bp = ap->a_bp;
	struct specnode *sn = NULL;
	dev_t dev;
	int error;

	error = spec_io_enter(vp, &sn, &dev);
	if (error)
		goto out;

	bp->b_dev = dev;

	if (!(bp->b_flags & B_READ)) {
#ifdef DIAGNOSTIC
		if (bp->b_vp && bp->b_vp->v_type == VBLK) {
			struct mount *mp = spec_node_getmountedfs(bp->b_vp);

			if (mp && (mp->mnt_flag & MNT_RDONLY)) {
				printf("%s blk %"PRId64" written while ro!\n",
				    mp->mnt_stat.f_mntonname, bp->b_blkno);
#ifdef DDB
				db_stacktrace();
#endif
			}
		}
#endif /* DIAGNOSTIC */
		error = fscow_run(bp, false);
		if (error)
			goto out;
	}
	bdev_strategy(bp);

	error = 0;

out:	if (sn)
		spec_io_exit(vp, sn);
	if (error) {
		bp->b_error = error;
		bp->b_resid = bp->b_bcount;
		biodone(bp);
	}
	return error;
}

int
spec_inactive(void *v)
{
	struct vop_inactive_v2_args /* {
		struct vnode *a_vp;
		bool *a_recycle;
	} */ *ap = v;

	KASSERT(ap->a_vp->v_mount == dead_rootmount);
	*ap->a_recycle = true;

	return 0;
}

int
spec_reclaim(void *v)
{
	struct vop_reclaim_v2_args /* {
		struct vnode *a_vp;
	} */ *ap = v;
	struct vnode *vp = ap->a_vp;

	KASSERT(vp->v_specnode->sn_opencnt == 0);

	VOP_UNLOCK(vp);

	KASSERT(vp->v_mount == dead_rootmount);
	return 0;
}

/*
 * This is a noop, simply returning what one has been given.
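 *
 * For example, a call such as VOP_BMAP(vp, 100, &vpp, &bnp, &runp) on a
 * block device vnode (the values are purely illustrative) just yields
 * vpp == vp, bnp == 100, and a maximal run of (MAXBSIZE >> DEV_BSHIFT) - 1,
 * since device blocks map 1:1 onto themselves.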
1552 */ 1553 int 1554 spec_bmap(void *v) 1555 { 1556 struct vop_bmap_args /* { 1557 struct vnode *a_vp; 1558 daddr_t a_bn; 1559 struct vnode **a_vpp; 1560 daddr_t *a_bnp; 1561 int *a_runp; 1562 } */ *ap = v; 1563 1564 if (ap->a_vpp != NULL) 1565 *ap->a_vpp = ap->a_vp; 1566 if (ap->a_bnp != NULL) 1567 *ap->a_bnp = ap->a_bn; 1568 if (ap->a_runp != NULL) 1569 *ap->a_runp = (MAXBSIZE >> DEV_BSHIFT) - 1; 1570 return (0); 1571 } 1572 1573 /* 1574 * Device close routine 1575 */ 1576 /* ARGSUSED */ 1577 int 1578 spec_close(void *v) 1579 { 1580 struct vop_close_args /* { 1581 struct vnode *a_vp; 1582 int a_fflag; 1583 kauth_cred_t a_cred; 1584 } */ *ap = v; 1585 struct vnode *vp = ap->a_vp; 1586 struct session *sess; 1587 dev_t dev; 1588 int flags = ap->a_fflag; 1589 int mode, error, count; 1590 specnode_t *sn; 1591 specdev_t *sd; 1592 1593 KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE); 1594 1595 mutex_enter(vp->v_interlock); 1596 sn = vp->v_specnode; 1597 dev = vp->v_rdev; 1598 sd = sn->sn_dev; 1599 /* 1600 * If we're going away soon, make this non-blocking. 1601 * Also ensures that we won't wedge in vn_lock below. 1602 */ 1603 if (vdead_check(vp, VDEAD_NOWAIT) != 0) 1604 flags |= FNONBLOCK; 1605 mutex_exit(vp->v_interlock); 1606 1607 switch (vp->v_type) { 1608 1609 case VCHR: 1610 /* 1611 * Hack: a tty device that is a controlling terminal 1612 * has a reference from the session structure. We 1613 * cannot easily tell that a character device is a 1614 * controlling terminal, unless it is the closing 1615 * process' controlling terminal. In that case, if the 1616 * open count is 1 release the reference from the 1617 * session. Also, remove the link from the tty back to 1618 * the session and pgrp. 1619 * 1620 * XXX V. fishy. 1621 */ 1622 mutex_enter(&proc_lock); 1623 sess = curlwp->l_proc->p_session; 1624 if (sn->sn_opencnt == 1 && vp == sess->s_ttyvp) { 1625 mutex_spin_enter(&tty_lock); 1626 sess->s_ttyvp = NULL; 1627 if (sess->s_ttyp->t_session != NULL) { 1628 sess->s_ttyp->t_pgrp = NULL; 1629 sess->s_ttyp->t_session = NULL; 1630 mutex_spin_exit(&tty_lock); 1631 /* Releases proc_lock. */ 1632 proc_sessrele(sess); 1633 } else { 1634 mutex_spin_exit(&tty_lock); 1635 if (sess->s_ttyp->t_pgrp != NULL) 1636 panic("spec_close: spurious pgrp ref"); 1637 mutex_exit(&proc_lock); 1638 } 1639 vrele(vp); 1640 } else 1641 mutex_exit(&proc_lock); 1642 1643 /* 1644 * If the vnode is locked, then we are in the midst 1645 * of forcably closing the device, otherwise we only 1646 * close on last reference. 1647 */ 1648 mode = S_IFCHR; 1649 break; 1650 1651 case VBLK: 1652 KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp); 1653 /* 1654 * On last close of a block device (that isn't mounted) 1655 * we must invalidate any in core blocks, so that 1656 * we can, for instance, change floppy disks. 1657 */ 1658 error = vinvalbuf(vp, V_SAVE, ap->a_cred, curlwp, 0, 0); 1659 if (error) 1660 return (error); 1661 /* 1662 * We do not want to really close the device if it 1663 * is still in use unless we are trying to close it 1664 * forcibly. Since every use (buffer, vnode, swap, cmap) 1665 * holds a reference to the vnode, and because we mark 1666 * any other vnodes that alias this device, when the 1667 * sum of the reference counts on all the aliased 1668 * vnodes descends to one, we are on last close. 1669 */ 1670 mode = S_IFBLK; 1671 break; 1672 1673 default: 1674 panic("spec_close: not special"); 1675 } 1676 1677 /* 1678 * Decrement the open reference count of this node and the 1679 * device. 
For block devices, the open reference count must be 1680 * 1 at this point. If the device's open reference count goes 1681 * to zero, we're the last one out so get the lights. 1682 * 1683 * We may find --sd->sd_opencnt gives zero, and yet 1684 * sd->sd_opened is false. This happens if the vnode is 1685 * revoked at the same time as it is being opened, which can 1686 * happen when opening a tty blocks indefinitely. In that 1687 * case, we still must call close -- it is the job of close to 1688 * interrupt the open. Either way, the device will be no 1689 * longer opened, so we have to clear sd->sd_opened; subsequent 1690 * opens will have responsibility for issuing close. 1691 * 1692 * This has the side effect that the sequence of opens might 1693 * happen out of order -- we might end up doing open, open, 1694 * close, close, instead of open, close, open, close. This is 1695 * unavoidable with the current devsw API, where open is 1696 * allowed to block and close must be able to run concurrently 1697 * to interrupt it. It is the driver's responsibility to 1698 * ensure that close is idempotent so that this works. Drivers 1699 * requiring per-open state and exact 1:1 correspondence 1700 * between open and close can use fd_clone. 1701 */ 1702 mutex_enter(&device_lock); 1703 KASSERT(sn->sn_opencnt); 1704 KASSERT(sd->sd_opencnt); 1705 KASSERTMSG(sn->sn_opencnt <= sd->sd_opencnt, 1706 "sn_opencnt=%u > sd_opencnt=%u", 1707 sn->sn_opencnt, sd->sd_opencnt); 1708 sn->sn_opencnt--; 1709 count = --sd->sd_opencnt; 1710 if (vp->v_type == VBLK) { 1711 KASSERTMSG(count == 0, "block device with %u opens", 1712 count + 1); 1713 sd->sd_bdevvp = NULL; 1714 } 1715 if (count == 0) { 1716 KASSERTMSG(sn->sn_opencnt == 0, "sn_opencnt=%u", 1717 sn->sn_opencnt); 1718 KASSERT(!sd->sd_closing); 1719 sd->sd_opened = false; 1720 sd->sd_closing = true; 1721 } 1722 mutex_exit(&device_lock); 1723 1724 if (count != 0) 1725 return 0; 1726 1727 /* 1728 * If we're able to block, release the vnode lock & reacquire. We 1729 * might end up sleeping for someone else who wants our queues. They 1730 * won't get them if we hold the vnode locked. 1731 */ 1732 if (!(flags & FNONBLOCK)) 1733 VOP_UNLOCK(vp); 1734 1735 /* 1736 * If we can cancel all outstanding I/O, then wait for it to 1737 * drain before we call .d_close. Drivers that split up 1738 * .d_cancel and .d_close this way need not have any internal 1739 * mechanism for waiting in .d_close for I/O to drain. 1740 */ 1741 if (vp->v_type == VBLK) 1742 error = bdev_cancel(dev, flags, mode, curlwp); 1743 else 1744 error = cdev_cancel(dev, flags, mode, curlwp); 1745 if (error == 0) 1746 spec_io_drain(sd); 1747 else 1748 KASSERTMSG(error == ENODEV, "cancel dev=0x%lx failed with %d", 1749 (unsigned long)dev, error); 1750 1751 if (vp->v_type == VBLK) 1752 error = bdev_close(dev, flags, mode, curlwp); 1753 else 1754 error = cdev_close(dev, flags, mode, curlwp); 1755 1756 /* 1757 * Wait for all other devsw operations to drain. After this 1758 * point, no bdev/cdev_* can be active for this specdev. 1759 */ 1760 spec_io_drain(sd); 1761 1762 /* 1763 * Wake any spec_open calls waiting for close to finish -- do 1764 * this before reacquiring the vnode lock, because spec_open 1765 * holds the vnode lock while waiting, so doing this after 1766 * reacquiring the lock would deadlock. 
1767 */ 1768 mutex_enter(&device_lock); 1769 KASSERT(!sd->sd_opened); 1770 KASSERT(sd->sd_closing); 1771 sd->sd_closing = false; 1772 cv_broadcast(&specfs_iocv); 1773 mutex_exit(&device_lock); 1774 1775 if (!(flags & FNONBLOCK)) 1776 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 1777 1778 return (error); 1779 } 1780 1781 /* 1782 * Print out the contents of a special device vnode. 1783 */ 1784 int 1785 spec_print(void *v) 1786 { 1787 struct vop_print_args /* { 1788 struct vnode *a_vp; 1789 } */ *ap = v; 1790 1791 printf("dev %llu, %llu\n", (unsigned long long)major(ap->a_vp->v_rdev), 1792 (unsigned long long)minor(ap->a_vp->v_rdev)); 1793 return 0; 1794 } 1795 1796 /* 1797 * Return POSIX pathconf information applicable to special devices. 1798 */ 1799 int 1800 spec_pathconf(void *v) 1801 { 1802 struct vop_pathconf_args /* { 1803 struct vnode *a_vp; 1804 int a_name; 1805 register_t *a_retval; 1806 } */ *ap = v; 1807 1808 switch (ap->a_name) { 1809 case _PC_LINK_MAX: 1810 *ap->a_retval = LINK_MAX; 1811 return (0); 1812 case _PC_MAX_CANON: 1813 *ap->a_retval = MAX_CANON; 1814 return (0); 1815 case _PC_MAX_INPUT: 1816 *ap->a_retval = MAX_INPUT; 1817 return (0); 1818 case _PC_PIPE_BUF: 1819 *ap->a_retval = PIPE_BUF; 1820 return (0); 1821 case _PC_CHOWN_RESTRICTED: 1822 *ap->a_retval = 1; 1823 return (0); 1824 case _PC_VDISABLE: 1825 *ap->a_retval = _POSIX_VDISABLE; 1826 return (0); 1827 case _PC_SYNC_IO: 1828 *ap->a_retval = 1; 1829 return (0); 1830 default: 1831 return genfs_pathconf(ap); 1832 } 1833 /* NOTREACHED */ 1834 } 1835 1836 /* 1837 * Advisory record locking support. 1838 */ 1839 int 1840 spec_advlock(void *v) 1841 { 1842 struct vop_advlock_args /* { 1843 struct vnode *a_vp; 1844 void *a_id; 1845 int a_op; 1846 struct flock *a_fl; 1847 int a_flags; 1848 } */ *ap = v; 1849 struct vnode *vp = ap->a_vp; 1850 1851 return lf_advlock(ap, &vp->v_speclockf, (off_t)0); 1852 } 1853