/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 *
 * Copyright (c) 2006-2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
 * All rights reserved.
 *
 * Portions Copyright 2010 Robert Milkowski
 *
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 * Copyright (c) 2024, Klara, Inc.
 */

/* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */

/*
 * ZFS volume emulation driver.
 *
 * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
 * Volumes are accessed through the symbolic links named:
 *
 *	/dev/zvol/<pool_name>/<dataset_name>
 *
 * Volumes are persistent through reboot.  No user command needs to be
 * run before opening and using a device.
 *
 * On FreeBSD ZVOLs are simply GEOM providers like any other storage device
 * in the system.  Except when they're simply character devices (volmode=dev).
 */
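/*
 * For illustration only (hypothetical pool and dataset names): a volume
 * created with "zfs create -V 10G tank/vol" shows up as /dev/zvol/tank/vol,
 * either as a GEOM provider (typically the default, volmode=geom) or as a
 * plain character device (volmode=dev).
 */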
#include <sys/types.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/errno.h>
#include <sys/uio.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/cmn_err.h>
#include <sys/stat.h>
#include <sys/proc.h>
#include <sys/zap.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/disk.h>
#include <sys/dmu_traverse.h>
#include <sys/dnode.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dir.h>
#include <sys/byteorder.h>
#include <sys/sunddi.h>
#include <sys/dirent.h>
#include <sys/policy.h>
#include <sys/queue.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_ioctl.h>
#include <sys/zil.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_rlock.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_raidz.h>
#include <sys/zvol.h>
#include <sys/zil_impl.h>
#include <sys/dataset_kstats.h>
#include <sys/dbuf.h>
#include <sys/dmu_tx.h>
#include <sys/zfeature.h>
#include <sys/zio_checksum.h>
#include <sys/filio.h>
#include <sys/freebsd_event.h>

#include <geom/geom.h>
#include <sys/zvol_impl.h>

#include "zfs_namecheck.h"

#define	ZVOL_DUMPSIZE		"dumpsize"

#ifdef ZVOL_LOCK_DEBUG
#define	ZVOL_RW_READER		RW_WRITER
#define	ZVOL_RW_READ_HELD	RW_WRITE_HELD
#else
#define	ZVOL_RW_READER		RW_READER
#define	ZVOL_RW_READ_HELD	RW_READ_HELD
#endif

enum zvol_geom_state {
	ZVOL_GEOM_UNINIT,
	ZVOL_GEOM_STOPPED,
	ZVOL_GEOM_RUNNING,
};

struct zvol_state_os {
#define	zso_dev		_zso_state._zso_dev
#define	zso_geom	_zso_state._zso_geom
	union {
		/* volmode=dev */
		struct zvol_state_dev {
			struct cdev *zsd_cdev;
			struct selinfo zsd_selinfo;
		} _zso_dev;

		/* volmode=geom */
		struct zvol_state_geom {
			struct g_provider *zsg_provider;
			struct bio_queue_head zsg_queue;
			struct mtx zsg_queue_mtx;
			enum zvol_geom_state zsg_state;
		} _zso_geom;
	} _zso_state;
	int zso_dying;
};

static uint32_t zvol_minors;

SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW, 0, "ZFS VOLUME");
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, mode, CTLFLAG_RWTUN, &zvol_volmode, 0,
	"Expose as GEOM providers (1), device files (2) or neither");
static boolean_t zpool_on_zvol = B_FALSE;
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0,
	"Allow zpools to use zvols as vdevs (DANGEROUS)");

/*
 * Toggle unmap functionality.
 */
boolean_t zvol_unmap_enabled = B_TRUE;

SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN,
	&zvol_unmap_enabled, 0, "Enable UNMAP functionality");
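/*
 * The knobs above are reachable from userland; for example (illustrative
 * only):
 *
 *	sysctl vfs.zfs.vol.mode=2		# expose new zvols as cdevs
 *	sysctl vfs.zfs.vol.unmap_enabled=0	# disable UNMAP/BIO_DELETE
 *
 * All three are CTLFLAG_RWTUN, so they can also be set as loader tunables.
 */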
/*
 * zvol maximum transfer size in one DMU tx.
 */
int zvol_maxphys = DMU_MAX_ACCESS / 2;

static void zvol_ensure_zilog(zvol_state_t *zv);

static d_open_t		zvol_cdev_open;
static d_close_t	zvol_cdev_close;
static d_ioctl_t	zvol_cdev_ioctl;
static d_read_t		zvol_cdev_read;
static d_write_t	zvol_cdev_write;
static d_strategy_t	zvol_geom_bio_strategy;
static d_kqfilter_t	zvol_cdev_kqfilter;

static struct cdevsw zvol_cdevsw = {
	.d_name =	"zvol",
	.d_version =	D_VERSION,
	.d_flags =	D_DISK | D_TRACKCLOSE,
	.d_open =	zvol_cdev_open,
	.d_close =	zvol_cdev_close,
	.d_ioctl =	zvol_cdev_ioctl,
	.d_read =	zvol_cdev_read,
	.d_write =	zvol_cdev_write,
	.d_strategy =	zvol_geom_bio_strategy,
	.d_kqfilter =	zvol_cdev_kqfilter,
};

static void zvol_filter_detach(struct knote *kn);
static int zvol_filter_vnode(struct knote *kn, long hint);

static struct filterops zvol_filterops_vnode = {
	.f_isfd = 1,
	.f_detach = zvol_filter_detach,
	.f_event = zvol_filter_vnode,
};

extern uint_t zfs_geom_probe_vdev_key;

struct g_class zfs_zvol_class = {
	.name = "ZFS::ZVOL",
	.version = G_VERSION,
};

DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);

static int zvol_geom_open(struct g_provider *pp, int flag, int count);
static int zvol_geom_close(struct g_provider *pp, int flag, int count);
static void zvol_geom_run(zvol_state_t *zv);
static void zvol_geom_destroy(zvol_state_t *zv);
static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
static void zvol_geom_worker(void *arg);
static void zvol_geom_bio_start(struct bio *bp);
static int zvol_geom_bio_getattr(struct bio *bp);
/* zvol_geom_bio_strategy is declared with the cdevsw handlers above. */

/*
 * GEOM mode implementation
 */
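/*
 * Locking summary for the open/close paths below: zvol_state_lock is taken
 * first so that pp->private (or dev->si_drv2) can be dereferenced safely;
 * zv_suspend_lock must be acquired before zv_state_lock; and
 * spa_namespace_lock is only tried, retrying the whole open on failure, to
 * avoid inversions when a zvol from one pool is opened as a vdev of another.
 */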
static int
zvol_geom_open(struct g_provider *pp, int flag, int count)
{
	zvol_state_t *zv;
	int err = 0;
	boolean_t drop_suspend = B_FALSE;

	if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) {
		/*
		 * If zfs_geom_probe_vdev_key is set, that means that zfs is
		 * attempting to probe geom providers while looking for a
		 * replacement for a missing VDEV.  In this case, the
		 * spa_namespace_lock will not be held, but it is still illegal
		 * to use a zvol as a vdev.  Deadlocks can result if another
		 * thread has spa_namespace_lock.
		 */
		return (SET_ERROR(EOPNOTSUPP));
	}

retry:
	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	/*
	 * Obtain a copy of private under zvol_state_lock to make sure either
	 * the result of zvol free code setting private to NULL is observed,
	 * or the zv is protected from being freed because of the positive
	 * zv_open_count.
	 */
	zv = pp->private;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_locked;
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_zso->zso_dying || (zv->zv_flags & ZVOL_REMOVING)) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_zv_locked;
	}
	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	/*
	 * Make sure zvol is not suspended during first open
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	if (zv->zv_open_count == 0) {
		drop_suspend = B_TRUE;
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* Check to see if zv_suspend_lock is needed. */
			if (zv->zv_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if (zv->zv_open_count == 0) {
		boolean_t drop_namespace = B_FALSE;

		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));

		/*
		 * Take spa_namespace_lock to prevent lock inversion when
		 * zvols from one pool are opened as vdevs in another.
		 */
		if (!mutex_owned(&spa_namespace_lock)) {
			if (!mutex_tryenter(&spa_namespace_lock)) {
				mutex_exit(&zv->zv_state_lock);
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
				kern_yield(PRI_USER);
				goto retry;
			} else {
				drop_namespace = B_TRUE;
			}
		}
		err = zvol_first_open(zv, !(flag & FWRITE));
		if (drop_namespace)
			mutex_exit(&spa_namespace_lock);
		if (err)
			goto out_zv_locked;
		pp->mediasize = zv->zv_volsize;
		pp->stripeoffset = 0;
		pp->stripesize = zv->zv_volblocksize;
	}

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/*
	 * Check for a bad on-disk format version now since we
	 * lied about owning the dataset readonly before.
	 */
	if ((flag & FWRITE) && ((zv->zv_flags & ZVOL_RDONLY) ||
	    dmu_objset_incompatible_encryption_version(zv->zv_objset))) {
		err = SET_ERROR(EROFS);
		goto out_opened;
	}
	if (zv->zv_flags & ZVOL_EXCL) {
		err = SET_ERROR(EBUSY);
		goto out_opened;
	}
	if (flag & O_EXCL) {
		if (zv->zv_open_count != 0) {
			err = SET_ERROR(EBUSY);
			goto out_opened;
		}
		zv->zv_flags |= ZVOL_EXCL;
	}

	zv->zv_open_count += count;
out_opened:
	if (zv->zv_open_count == 0) {
		zvol_last_close(zv);
		wakeup(zv);
	}
out_zv_locked:
	mutex_exit(&zv->zv_state_lock);
out_locked:
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (err);
}
static int
zvol_geom_close(struct g_provider *pp, int flag, int count)
{
	(void) flag;
	zvol_state_t *zv;
	boolean_t drop_suspend = B_TRUE;
	int new_open_count;

	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	zv = pp->private;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (SET_ERROR(ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_flags & ZVOL_EXCL) {
		ASSERT3U(zv->zv_open_count, ==, 1);
		zv->zv_flags &= ~ZVOL_EXCL;
	}

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	/*
	 * If the open count is zero, this is a spurious close.
	 * That indicates a bug in the kernel / DDI framework.
	 */
	ASSERT3U(zv->zv_open_count, >, 0);

	/*
	 * Make sure zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	new_open_count = zv->zv_open_count - count;
	if (new_open_count == 0) {
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* Check to see if zv_suspend_lock is needed. */
			new_open_count = zv->zv_open_count - count;
			if (new_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/*
	 * You may get multiple opens, but only one close.
	 */
	zv->zv_open_count = new_open_count;
	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		zvol_last_close(zv);
		wakeup(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (0);
}

static void
zvol_geom_run(zvol_state_t *zv)
{
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct g_provider *pp = zsg->zsg_provider;

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	g_error_provider(pp, 0);

	kproc_kthread_add(zvol_geom_worker, zv, &system_proc, NULL, 0, 0,
	    "zfskern", "zvol %s", pp->name + sizeof (ZVOL_DRIVER));
}

static void
zvol_geom_destroy(zvol_state_t *zv)
{
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct g_provider *pp = zsg->zsg_provider;

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	g_topology_assert();

	mutex_enter(&zv->zv_state_lock);
	VERIFY3S(zsg->zsg_state, ==, ZVOL_GEOM_RUNNING);
	mutex_exit(&zv->zv_state_lock);
	zsg->zsg_provider = NULL;
	g_wither_geom(pp->geom, ENXIO);
}

void
zvol_wait_close(zvol_state_t *zv)
{

	if (zv->zv_volmode != ZFS_VOLMODE_GEOM)
		return;
	mutex_enter(&zv->zv_state_lock);
	zv->zv_zso->zso_dying = B_TRUE;

	if (zv->zv_open_count)
		msleep(zv, &zv->zv_state_lock,
		    PRIBIO, "zvol:dying", 10*hz);
	mutex_exit(&zv->zv_state_lock);
}
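/*
 * GEOM calls g_access() with deltas for the read, write and exclusive
 * counts.  The KASSERT below relies on GEOM never mixing positive and
 * negative deltas in a single request, so each call is either a pure open
 * or a pure close, and the summed delta can be passed straight through to
 * zvol_geom_open() or zvol_geom_close().
 */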
static int
zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
{
	int count, error, flags;

	g_topology_assert();

	/*
	 * To make it easier we expect either open or close, but not both
	 * at the same time.
	 */
	KASSERT((acr >= 0 && acw >= 0 && ace >= 0) ||
	    (acr <= 0 && acw <= 0 && ace <= 0),
	    ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
	    pp->name, acr, acw, ace));

	if (pp->private == NULL) {
		if (acr <= 0 && acw <= 0 && ace <= 0)
			return (0);
		return (pp->error);
	}

	/*
	 * We don't pass FEXCL flag to zvol_geom_open()/zvol_geom_close() if
	 * ace != 0, because GEOM already handles that and handles it a bit
	 * differently.  GEOM allows for multiple read/exclusive consumers and
	 * ZFS allows only one exclusive consumer, no matter if it is reader or
	 * writer.  I prefer the way GEOM works, so I'll leave it to GEOM to
	 * decide what to do.
	 */

	count = acr + acw + ace;
	if (count == 0)
		return (0);

	flags = 0;
	if (acr != 0 || ace != 0)
		flags |= FREAD;
	if (acw != 0)
		flags |= FWRITE;

	g_topology_unlock();
	if (count > 0)
		error = zvol_geom_open(pp, flags, count);
	else
		error = zvol_geom_close(pp, flags, -count);
	g_topology_lock();
	return (error);
}

static void
zvol_geom_worker(void *arg)
{
	zvol_state_t *zv = arg;
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct bio *bp;

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	thread_lock(curthread);
	sched_prio(curthread, PRIBIO);
	thread_unlock(curthread);

	for (;;) {
		mtx_lock(&zsg->zsg_queue_mtx);
		bp = bioq_takefirst(&zsg->zsg_queue);
		if (bp == NULL) {
			if (zsg->zsg_state == ZVOL_GEOM_STOPPED) {
				zsg->zsg_state = ZVOL_GEOM_RUNNING;
				wakeup(&zsg->zsg_state);
				mtx_unlock(&zsg->zsg_queue_mtx);
				kthread_exit();
			}
			msleep(&zsg->zsg_queue, &zsg->zsg_queue_mtx,
			    PRIBIO | PDROP, "zvol:io", 0);
			continue;
		}
		mtx_unlock(&zsg->zsg_queue_mtx);
		zvol_geom_bio_strategy(bp);
	}
}

static void
zvol_geom_bio_start(struct bio *bp)
{
	zvol_state_t *zv = bp->bio_to->private;
	struct zvol_state_geom *zsg;
	boolean_t first;

	if (zv == NULL) {
		g_io_deliver(bp, ENXIO);
		return;
	}
	if (bp->bio_cmd == BIO_GETATTR) {
		if (zvol_geom_bio_getattr(bp))
			g_io_deliver(bp, EOPNOTSUPP);
		return;
	}

	if (!THREAD_CAN_SLEEP()) {
		zsg = &zv->zv_zso->zso_geom;
		mtx_lock(&zsg->zsg_queue_mtx);
		first = (bioq_first(&zsg->zsg_queue) == NULL);
		bioq_insert_tail(&zsg->zsg_queue, bp);
		mtx_unlock(&zsg->zsg_queue_mtx);
		if (first)
			wakeup_one(&zsg->zsg_queue);
		return;
	}

	zvol_geom_bio_strategy(bp);
}

static int
zvol_geom_bio_getattr(struct bio *bp)
{
	zvol_state_t *zv;

	zv = bp->bio_to->private;
	ASSERT3P(zv, !=, NULL);

	spa_t *spa = dmu_objset_spa(zv->zv_objset);
	uint64_t refd, avail, usedobjs, availobjs;

	if (g_handleattr_int(bp, "GEOM::candelete", 1))
		return (0);
	if (strcmp(bp->bio_attribute, "blocksavail") == 0) {
		dmu_objset_space(zv->zv_objset, &refd, &avail,
		    &usedobjs, &availobjs);
		if (g_handleattr_off_t(bp, "blocksavail", avail / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "blocksused") == 0) {
		dmu_objset_space(zv->zv_objset, &refd, &avail,
		    &usedobjs, &availobjs);
		if (g_handleattr_off_t(bp, "blocksused", refd / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) {
		avail = metaslab_class_get_space(spa_normal_class(spa));
		avail -= metaslab_class_get_alloc(spa_normal_class(spa));
		if (g_handleattr_off_t(bp, "poolblocksavail",
		    avail / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) {
		refd = metaslab_class_get_alloc(spa_normal_class(spa));
		if (g_handleattr_off_t(bp, "poolblocksused", refd / DEV_BSIZE))
			return (0);
	}
	return (1);
}
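/*
 * kqueue support for volmode=dev devices: only EVFILT_VNODE with
 * NOTE_ATTRIB is accepted (see zvol_cdev_kqfilter() below), and the note
 * is fired from zvol_os_update_volsize() so userland can watch a zvol's
 * device node for resizes.
 */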
static void
zvol_filter_detach(struct knote *kn)
{
	zvol_state_t *zv;
	struct zvol_state_dev *zsd;

	zv = kn->kn_hook;
	zsd = &zv->zv_zso->zso_dev;

	knlist_remove(&zsd->zsd_selinfo.si_note, kn, 0);
}

static int
zvol_filter_vnode(struct knote *kn, long hint)
{
	kn->kn_fflags |= kn->kn_sfflags & hint;

	return (kn->kn_fflags != 0);
}

static int
zvol_cdev_kqfilter(struct cdev *dev, struct knote *kn)
{
	zvol_state_t *zv;
	struct zvol_state_dev *zsd;

	zv = dev->si_drv2;
	zsd = &zv->zv_zso->zso_dev;

	if (kn->kn_filter != EVFILT_VNODE)
		return (EINVAL);

	/* XXX: extend support for other NOTE_* events */
	if (kn->kn_sfflags != NOTE_ATTRIB)
		return (EINVAL);

	kn->kn_fop = &zvol_filterops_vnode;
	kn->kn_hook = zv;
	knlist_add(&zsd->zsd_selinfo.si_note, kn, 0);

	return (0);
}
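/*
 * Common strategy routine for both the GEOM and character-device paths:
 * the bio is classified (read, write, flush, delete), the affected range
 * is locked, and the request is carved into chunks of at most zvol_maxphys
 * bytes, each handled in its own DMU transaction.
 */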
static void
zvol_geom_bio_strategy(struct bio *bp)
{
	zvol_state_t *zv;
	uint64_t off, volsize;
	size_t resid;
	char *addr;
	objset_t *os;
	zfs_locked_range_t *lr;
	int error = 0;
	boolean_t doread = B_FALSE;
	boolean_t is_dumpified;
	boolean_t commit;

	if (bp->bio_to)
		zv = bp->bio_to->private;
	else
		zv = bp->bio_dev->si_drv2;

	if (zv == NULL) {
		error = SET_ERROR(ENXIO);
		goto out;
	}

	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);

	if (zv->zv_flags & ZVOL_REMOVING) {
		error = SET_ERROR(ENXIO);
		goto resume;
	}

	switch (bp->bio_cmd) {
	case BIO_READ:
		doread = B_TRUE;
		break;
	case BIO_WRITE:
	case BIO_FLUSH:
	case BIO_DELETE:
		if (zv->zv_flags & ZVOL_RDONLY) {
			error = SET_ERROR(EROFS);
			goto resume;
		}
		zvol_ensure_zilog(zv);
		if (bp->bio_cmd == BIO_FLUSH)
			goto commit;
		break;
	default:
		error = SET_ERROR(EOPNOTSUPP);
		goto resume;
	}

	off = bp->bio_offset;
	volsize = zv->zv_volsize;

	os = zv->zv_objset;
	ASSERT3P(os, !=, NULL);

	addr = bp->bio_data;
	resid = bp->bio_length;

	if (resid > 0 && off >= volsize) {
		error = SET_ERROR(EIO);
		goto resume;
	}

	is_dumpified = B_FALSE;
	commit = !doread && !is_dumpified &&
	    zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;

	/*
	 * There must be no buffer changes when doing a dmu_sync() because
	 * we can't change the data whilst calculating the checksum.
	 */
	lr = zfs_rangelock_enter(&zv->zv_rangelock, off, resid,
	    doread ? RL_READER : RL_WRITER);

	if (bp->bio_cmd == BIO_DELETE) {
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error != 0) {
			dmu_tx_abort(tx);
		} else {
			zvol_log_truncate(zv, tx, off, resid);
			dmu_tx_commit(tx);
			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
			    off, resid);
			resid = 0;
		}
		goto unlock;
	}
	while (resid != 0 && off < volsize) {
		size_t size = MIN(resid, zvol_maxphys);
		if (doread) {
			error = dmu_read(os, ZVOL_OBJ, off, size, addr,
			    DMU_READ_PREFETCH);
		} else {
			dmu_tx_t *tx = dmu_tx_create(os);
			dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, size);
			error = dmu_tx_assign(tx, TXG_WAIT);
			if (error) {
				dmu_tx_abort(tx);
			} else {
				dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
				zvol_log_write(zv, tx, off, size, commit);
				dmu_tx_commit(tx);
			}
		}
		if (error) {
			/* Convert checksum errors into IO errors. */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
		off += size;
		addr += size;
		resid -= size;
	}
unlock:
	zfs_rangelock_exit(lr);

	bp->bio_completed = bp->bio_length - resid;
	if (bp->bio_completed < bp->bio_length && off > volsize)
		error = SET_ERROR(EINVAL);

	switch (bp->bio_cmd) {
	case BIO_FLUSH:
		break;
	case BIO_READ:
		dataset_kstats_update_read_kstats(&zv->zv_kstat,
		    bp->bio_completed);
		break;
	case BIO_WRITE:
		dataset_kstats_update_write_kstats(&zv->zv_kstat,
		    bp->bio_completed);
		break;
	case BIO_DELETE:
		break;
	default:
		break;
	}

	if (commit) {
commit:
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
	}
resume:
	rw_exit(&zv->zv_suspend_lock);
out:
	if (bp->bio_to)
		g_io_deliver(bp, error);
	else
		biofinish(bp, NULL, error);
}

/*
 * Character device mode implementation
 */

static int
zvol_cdev_read(struct cdev *dev, struct uio *uio_s, int ioflag)
{
	zvol_state_t *zv;
	uint64_t volsize;
	zfs_locked_range_t *lr;
	int error = 0;
	zfs_uio_t uio;

	zfs_uio_init(&uio, uio_s);

	zv = dev->si_drv2;

	volsize = zv->zv_volsize;
	/*
	 * uio_loffset == volsize isn't an error as
	 * it's required for EOF processing.
	 */
	if (zfs_uio_resid(&uio) > 0 &&
	    (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
		return (SET_ERROR(EIO));

	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
	ssize_t start_resid = zfs_uio_resid(&uio);
	lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
	    zfs_uio_resid(&uio), RL_READER);
	while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
		uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);

		/* Don't read past the end. */
		if (bytes > volsize - zfs_uio_offset(&uio))
			bytes = volsize - zfs_uio_offset(&uio);

		error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
		if (error) {
			/* Convert checksum errors into IO errors. */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
	}
	zfs_rangelock_exit(lr);
	int64_t nread = start_resid - zfs_uio_resid(&uio);
	dataset_kstats_update_read_kstats(&zv->zv_kstat, nread);
	rw_exit(&zv->zv_suspend_lock);

	return (error);
}
static int
zvol_cdev_write(struct cdev *dev, struct uio *uio_s, int ioflag)
{
	zvol_state_t *zv;
	uint64_t volsize;
	zfs_locked_range_t *lr;
	int error = 0;
	boolean_t commit;
	zfs_uio_t uio;

	zv = dev->si_drv2;

	volsize = zv->zv_volsize;

	zfs_uio_init(&uio, uio_s);

	if (zfs_uio_resid(&uio) > 0 &&
	    (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
		return (SET_ERROR(EIO));

	ssize_t start_resid = zfs_uio_resid(&uio);
	commit = (ioflag & IO_SYNC) ||
	    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);

	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
	zvol_ensure_zilog(zv);

	lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
	    zfs_uio_resid(&uio), RL_WRITER);
	while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
		uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);
		uint64_t off = zfs_uio_offset(&uio);
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);

		/* Don't write past the end. */
		if (bytes > volsize - off)
			bytes = volsize - off;

		dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			break;
		}
		error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
		if (error == 0)
			zvol_log_write(zv, tx, off, bytes, commit);
		dmu_tx_commit(tx);

		if (error)
			break;
	}
	zfs_rangelock_exit(lr);
	int64_t nwritten = start_resid - zfs_uio_resid(&uio);
	dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
	if (commit)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
	rw_exit(&zv->zv_suspend_lock);

	return (error);
}
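/*
 * zvol_cdev_open()/zvol_cdev_close() below mirror the GEOM open/close
 * paths above, with dev->si_drv2 playing the role of pp->private.
 */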
static int
zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
{
	zvol_state_t *zv;
	int err = 0;
	boolean_t drop_suspend = B_FALSE;

retry:
	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	/*
	 * Obtain a copy of si_drv2 under zvol_state_lock to make sure either
	 * the result of zvol free code setting si_drv2 to NULL is observed,
	 * or the zv is protected from being freed because of the positive
	 * zv_open_count.
	 */
	zv = dev->si_drv2;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_locked;
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_zso->zso_dying) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_zv_locked;
	}
	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);

	/*
	 * Make sure zvol is not suspended during first open
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	if (zv->zv_open_count == 0) {
		drop_suspend = B_TRUE;
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* Check to see if zv_suspend_lock is needed. */
			if (zv->zv_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if (zv->zv_open_count == 0) {
		boolean_t drop_namespace = B_FALSE;

		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));

		/*
		 * Take spa_namespace_lock to prevent lock inversion when
		 * zvols from one pool are opened as vdevs in another.
		 */
		if (!mutex_owned(&spa_namespace_lock)) {
			if (!mutex_tryenter(&spa_namespace_lock)) {
				mutex_exit(&zv->zv_state_lock);
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
				kern_yield(PRI_USER);
				goto retry;
			} else {
				drop_namespace = B_TRUE;
			}
		}
		err = zvol_first_open(zv, !(flags & FWRITE));
		if (drop_namespace)
			mutex_exit(&spa_namespace_lock);
		if (err)
			goto out_zv_locked;
	}

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
		err = SET_ERROR(EROFS);
		goto out_opened;
	}
	if (zv->zv_flags & ZVOL_EXCL) {
		err = SET_ERROR(EBUSY);
		goto out_opened;
	}
	if (flags & O_EXCL) {
		if (zv->zv_open_count != 0) {
			err = SET_ERROR(EBUSY);
			goto out_opened;
		}
		zv->zv_flags |= ZVOL_EXCL;
	}

	zv->zv_open_count++;
out_opened:
	if (zv->zv_open_count == 0) {
		zvol_last_close(zv);
		wakeup(zv);
	}
out_zv_locked:
	mutex_exit(&zv->zv_state_lock);
out_locked:
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (err);
}
static int
zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
{
	zvol_state_t *zv;
	boolean_t drop_suspend = B_TRUE;

	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	zv = dev->si_drv2;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (SET_ERROR(ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_flags & ZVOL_EXCL) {
		ASSERT3U(zv->zv_open_count, ==, 1);
		zv->zv_flags &= ~ZVOL_EXCL;
	}

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);

	/*
	 * If the open count is zero, this is a spurious close.
	 * That indicates a bug in the kernel / DDI framework.
	 */
	ASSERT3U(zv->zv_open_count, >, 0);
	/*
	 * Make sure zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	if (zv->zv_open_count == 1) {
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* Check to see if zv_suspend_lock is needed. */
			if (zv->zv_open_count != 1) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/*
	 * You may get multiple opens, but only one close.
	 */
	zv->zv_open_count--;

	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		zvol_last_close(zv);
		wakeup(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (0);
}
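/*
 * Character-device ioctls.  The DIOCG* set answers standard FreeBSD disk
 * geometry and attribute queries, DIOCGDELETE punches holes via
 * dmu_free_long_range(), and FIOSEEKHOLE/FIOSEEKDATA map directly to
 * dmu_offset_next().
 */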
static int
zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
    int fflag, struct thread *td)
{
	zvol_state_t *zv;
	zfs_locked_range_t *lr;
	off_t offset, length;
	int error;
	boolean_t sync;

	zv = dev->si_drv2;

	error = 0;
	KASSERT(zv->zv_open_count > 0,
	    ("Device with zero access count in %s", __func__));

	switch (cmd) {
	case DIOCGSECTORSIZE:
		*(uint32_t *)data = DEV_BSIZE;
		break;
	case DIOCGMEDIASIZE:
		*(off_t *)data = zv->zv_volsize;
		break;
	case DIOCGFLUSH:
		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
		if (zv->zv_zilog != NULL)
			zil_commit(zv->zv_zilog, ZVOL_OBJ);
		rw_exit(&zv->zv_suspend_lock);
		break;
	case DIOCGDELETE:
		if (!zvol_unmap_enabled)
			break;

		offset = ((off_t *)data)[0];
		length = ((off_t *)data)[1];
		if ((offset % DEV_BSIZE) != 0 || (length % DEV_BSIZE) != 0 ||
		    offset < 0 || offset >= zv->zv_volsize ||
		    length <= 0) {
			printf("%s: offset=%jd length=%jd\n", __func__, offset,
			    length);
			error = SET_ERROR(EINVAL);
			break;
		}
		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
		zvol_ensure_zilog(zv);
		lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, length,
		    RL_WRITER);
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error != 0) {
			sync = FALSE;
			dmu_tx_abort(tx);
		} else {
			sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
			zvol_log_truncate(zv, tx, offset, length);
			dmu_tx_commit(tx);
			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
			    offset, length);
		}
		zfs_rangelock_exit(lr);
		if (sync)
			zil_commit(zv->zv_zilog, ZVOL_OBJ);
		rw_exit(&zv->zv_suspend_lock);
		break;
	case DIOCGSTRIPESIZE:
		*(off_t *)data = zv->zv_volblocksize;
		break;
	case DIOCGSTRIPEOFFSET:
		*(off_t *)data = 0;
		break;
	case DIOCGATTR: {
		spa_t *spa = dmu_objset_spa(zv->zv_objset);
		struct diocgattr_arg *arg = (struct diocgattr_arg *)data;
		uint64_t refd, avail, usedobjs, availobjs;

		if (strcmp(arg->name, "GEOM::candelete") == 0)
			arg->value.i = 1;
		else if (strcmp(arg->name, "blocksavail") == 0) {
			dmu_objset_space(zv->zv_objset, &refd, &avail,
			    &usedobjs, &availobjs);
			arg->value.off = avail / DEV_BSIZE;
		} else if (strcmp(arg->name, "blocksused") == 0) {
			dmu_objset_space(zv->zv_objset, &refd, &avail,
			    &usedobjs, &availobjs);
			arg->value.off = refd / DEV_BSIZE;
		} else if (strcmp(arg->name, "poolblocksavail") == 0) {
			avail = metaslab_class_get_space(spa_normal_class(spa));
			avail -= metaslab_class_get_alloc(
			    spa_normal_class(spa));
			arg->value.off = avail / DEV_BSIZE;
		} else if (strcmp(arg->name, "poolblocksused") == 0) {
			refd = metaslab_class_get_alloc(spa_normal_class(spa));
			arg->value.off = refd / DEV_BSIZE;
		} else
			error = SET_ERROR(ENOIOCTL);
		break;
	}
	case FIOSEEKHOLE:
	case FIOSEEKDATA: {
		off_t *off = (off_t *)data;
		uint64_t noff;
		boolean_t hole;

		hole = (cmd == FIOSEEKHOLE);
		noff = *off;
		lr = zfs_rangelock_enter(&zv->zv_rangelock, 0, UINT64_MAX,
		    RL_READER);
		error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff);
		zfs_rangelock_exit(lr);
		*off = noff;
		break;
	}
	default:
		error = SET_ERROR(ENOIOCTL);
	}

	return (error);
}

/*
 * Misc. helpers
 */

static void
zvol_ensure_zilog(zvol_state_t *zv)
{
	ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));

	/*
	 * Open a ZIL if this is the first time we have written to this
	 * zvol.  We protect zv->zv_zilog with zv_suspend_lock rather
	 * than zv_state_lock so that we don't need to acquire an
	 * additional lock in this path.
	 */
	if (zv->zv_zilog == NULL) {
		if (!rw_tryupgrade(&zv->zv_suspend_lock)) {
			rw_exit(&zv->zv_suspend_lock);
			rw_enter(&zv->zv_suspend_lock, RW_WRITER);
		}
		if (zv->zv_zilog == NULL) {
			zv->zv_zilog = zil_open(zv->zv_objset,
			    zvol_get_data, &zv->zv_kstat.dk_zil_sums);
			zv->zv_flags |= ZVOL_WRITTEN_TO;
			/* replay / destroy done in zvol_os_create_minor() */
			VERIFY0(zv->zv_zilog->zl_header->zh_flags &
			    ZIL_REPLAY_NEEDED);
		}
		rw_downgrade(&zv->zv_suspend_lock);
	}
}

boolean_t
zvol_os_is_zvol(const char *device)
{
	return (device && strncmp(device, ZVOL_DIR, strlen(ZVOL_DIR)) == 0);
}

void
zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
{
	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/* Move to a new hashtable entry. */
	zv->zv_hash = zvol_name_hash(newname);
	hlist_del(&zv->zv_hlink);
	hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));

	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;
		struct g_geom *gp;

		g_topology_lock();
		gp = pp->geom;
		ASSERT3P(gp, !=, NULL);

		zsg->zsg_provider = NULL;
		g_wither_provider(pp, ENXIO);

		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname);
		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
		pp->sectorsize = DEV_BSIZE;
		pp->mediasize = zv->zv_volsize;
		pp->private = zv;
		zsg->zsg_provider = pp;
		g_error_provider(pp, 0);
		g_topology_unlock();
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev;
		struct make_dev_args args;

		dev = zsd->zsd_cdev;
		if (dev != NULL) {
			destroy_dev(dev);
			dev = zsd->zsd_cdev = NULL;
			if (zv->zv_open_count > 0) {
				zv->zv_flags &= ~ZVOL_EXCL;
				zv->zv_open_count = 0;
				/* XXX need suspend lock but lock order */
				zvol_last_close(zv);
			}
		}

		make_dev_args_init(&args);
		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
		args.mda_devsw = &zvol_cdevsw;
		args.mda_cr = NULL;
		args.mda_uid = UID_ROOT;
		args.mda_gid = GID_OPERATOR;
		args.mda_mode = 0640;
		args.mda_si_drv2 = zv;
		if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname)
		    == 0) {
			dev->si_iosize_max = maxphys;
			zsd->zsd_cdev = dev;
		}
	}
	strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
	dataset_kstats_rename(&zv->zv_kstat, newname);
}
/*
 * Remove minor node for the specified volume.
 */
void
zvol_os_free(zvol_state_t *zv)
{
	ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
	ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
	ASSERT0(zv->zv_open_count);

	ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);

	rw_destroy(&zv->zv_suspend_lock);
	zfs_rangelock_fini(&zv->zv_rangelock);

	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp __maybe_unused = zsg->zsg_provider;

		ASSERT3P(pp->private, ==, NULL);

		g_topology_lock();
		zvol_geom_destroy(zv);
		g_topology_unlock();
		mtx_destroy(&zsg->zsg_queue_mtx);
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev = zsd->zsd_cdev;

		if (dev != NULL) {
			ASSERT3P(dev->si_drv2, ==, NULL);
			destroy_dev(dev);
			knlist_clear(&zsd->zsd_selinfo.si_note, 0);
			knlist_destroy(&zsd->zsd_selinfo.si_note);
		}
	}

	mutex_destroy(&zv->zv_state_lock);
	cv_destroy(&zv->zv_removing_cv);
	dataset_kstats_destroy(&zv->zv_kstat);
	kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
	kmem_free(zv, sizeof (zvol_state_t));
	zvol_minors--;
}

/*
 * Create a minor node (plus a whole lot more) for the specified volume.
 */
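/*
 * The sequence, in brief: own the objset (read-only at first), read the
 * volume size and volmode properties, allocate the zvol_state_t, attach
 * either a GEOM provider or a cdev according to volmode, replay (or
 * destroy) the ZIL, and finally disown the dataset until first open.
 */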
int
zvol_os_create_minor(const char *name)
{
	zvol_state_t *zv;
	objset_t *os;
	dmu_object_info_t *doi;
	uint64_t volsize;
	uint64_t volmode, hash;
	int error;
	bool replayed_zil = B_FALSE;

	ZFS_LOG(1, "Creating ZVOL %s...", name);
	hash = zvol_name_hash(name);
	if ((zv = zvol_find_by_name_hash(name, hash, RW_NONE)) != NULL) {
		ASSERT(MUTEX_HELD(&zv->zv_state_lock));
		mutex_exit(&zv->zv_state_lock);
		return (SET_ERROR(EEXIST));
	}

	DROP_GIANT();

	doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);

	/* Lie and say we're read-only. */
	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
	if (error)
		goto out_doi;

	error = dmu_object_info(os, ZVOL_OBJ, doi);
	if (error)
		goto out_dmu_objset_disown;

	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
	if (error)
		goto out_dmu_objset_disown;

	error = dsl_prop_get_integer(name,
	    zfs_prop_to_name(ZFS_PROP_VOLMODE), &volmode, NULL);
	if (error || volmode == ZFS_VOLMODE_DEFAULT)
		volmode = zvol_volmode;
	error = 0;

	/*
	 * zvol_alloc equivalent ...
	 */
	zv = kmem_zalloc(sizeof (*zv), KM_SLEEP);
	zv->zv_hash = hash;
	mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&zv->zv_removing_cv, NULL, CV_DEFAULT, NULL);
	zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
	zv->zv_volmode = volmode;
	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp;
		struct g_geom *gp;

		zsg->zsg_state = ZVOL_GEOM_UNINIT;
		mtx_init(&zsg->zsg_queue_mtx, "zvol", NULL, MTX_DEF);

		g_topology_lock();
		gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
		gp->start = zvol_geom_bio_start;
		gp->access = zvol_geom_access;
		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
		pp->sectorsize = DEV_BSIZE;
		pp->mediasize = 0;
		pp->private = zv;

		zsg->zsg_provider = pp;
		bioq_init(&zsg->zsg_queue);
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev;
		struct make_dev_args args;

		make_dev_args_init(&args);
		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
		args.mda_devsw = &zvol_cdevsw;
		args.mda_cr = NULL;
		args.mda_uid = UID_ROOT;
		args.mda_gid = GID_OPERATOR;
		args.mda_mode = 0640;
		args.mda_si_drv2 = zv;
		if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name)
		    == 0) {
			dev->si_iosize_max = maxphys;
			zsd->zsd_cdev = dev;
			knlist_init_sx(&zsd->zsd_selinfo.si_note,
			    &zv->zv_state_lock);
		}
	}
	(void) strlcpy(zv->zv_name, name, MAXPATHLEN);
	rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
	zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);

	if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
		zv->zv_flags |= ZVOL_RDONLY;

	zv->zv_volblocksize = doi->doi_data_block_size;
	zv->zv_volsize = volsize;
	zv->zv_objset = os;

	ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
	error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);
	if (error)
		goto out_dmu_objset_disown;
	ASSERT3P(zv->zv_zilog, ==, NULL);
	zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums);
	if (spa_writeable(dmu_objset_spa(os))) {
		if (zil_replay_disable)
			replayed_zil = zil_destroy(zv->zv_zilog, B_FALSE);
		else
			replayed_zil = zil_replay(os, zv, zvol_replay_vector);
	}
	if (replayed_zil)
		zil_close(zv->zv_zilog);
	zv->zv_zilog = NULL;

	/* TODO: prefetch for geom tasting */

	zv->zv_objset = NULL;
out_dmu_objset_disown:
	dmu_objset_disown(os, B_TRUE, FTAG);

	if (error == 0 && volmode == ZFS_VOLMODE_GEOM) {
		zvol_geom_run(zv);
		g_topology_unlock();
	}
out_doi:
	kmem_free(doi, sizeof (dmu_object_info_t));
	if (error == 0) {
		rw_enter(&zvol_state_lock, RW_WRITER);
		zvol_insert(zv);
		zvol_minors++;
		rw_exit(&zvol_state_lock);
		ZFS_LOG(1, "ZVOL %s created.", name);
	}
	PICKUP_GIANT();
	return (error);
}
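/*
 * Detach the zvol from its device node while zvol_state_lock is held:
 * the GEOM worker thread is told to stop and is waited for, or the cdev's
 * si_drv2 back-pointer is cleared so that late opens see ENXIO.
 */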
void
zvol_os_clear_private(zvol_state_t *zv)
{
	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;

		if (pp->private == NULL) /* already cleared */
			return;

		mtx_lock(&zsg->zsg_queue_mtx);
		zsg->zsg_state = ZVOL_GEOM_STOPPED;
		pp->private = NULL;
		wakeup_one(&zsg->zsg_queue);
		while (zsg->zsg_state != ZVOL_GEOM_RUNNING)
			msleep(&zsg->zsg_state, &zsg->zsg_queue_mtx,
			    0, "zvol:w", 0);
		mtx_unlock(&zsg->zsg_queue_mtx);
		ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev = zsd->zsd_cdev;

		if (dev != NULL)
			dev->si_drv2 = NULL;
	}
}

int
zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize)
{
	zv->zv_volsize = volsize;
	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;

		g_topology_lock();

		if (pp->private == NULL) {
			g_topology_unlock();
			return (SET_ERROR(ENXIO));
		}

		/*
		 * Do not invoke resize event when initial size was zero.
		 * ZVOL initializes the size on first open; this is not
		 * real resizing.
		 */
		if (pp->mediasize == 0)
			pp->mediasize = zv->zv_volsize;
		else
			g_resize_provider(pp, zv->zv_volsize);

		g_topology_unlock();
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;

		KNOTE_UNLOCKED(&zsd->zsd_selinfo.si_note, NOTE_ATTRIB);
	}
	return (0);
}

void
zvol_os_set_disk_ro(zvol_state_t *zv, int flags)
{
	/* XXX: no-op on FreeBSD; the Linux port calls set_disk_ro() here. */
}

void
zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity)
{
	/* XXX: no-op on FreeBSD; the Linux port calls set_capacity() here. */
}

/*
 * Public interfaces
 */

int
zvol_busy(void)
{
	return (zvol_minors != 0);
}

int
zvol_init(void)
{
	zvol_init_impl();
	return (0);
}

void
zvol_fini(void)
{
	zvol_fini_impl();
}