/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 *
 * Copyright (c) 2006-2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
 * All rights reserved.
 *
 * Portions Copyright 2010 Robert Milkowski
 *
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

/* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */

/*
 * ZFS volume emulation driver.
 *
 * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
 * Volumes are accessed through the symbolic links named:
 *
 *	/dev/zvol/<pool_name>/<dataset_name>
 *
 * Volumes are persistent through reboot. No user command needs to be
 * run before opening and using a device.
 *
 * On FreeBSD ZVOLs are simply GEOM providers like any other storage device
 * in the system. Except when they're exposed as plain character devices
 * (volmode=dev).
 */
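
/*
 * For example (illustrative), a volume created from userland with
 * "zfs create -V 10G tank/vol" shows up as /dev/zvol/tank/vol.
 */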

#include <sys/types.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/errno.h>
#include <sys/uio.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/cmn_err.h>
#include <sys/stat.h>
#include <sys/proc.h>
#include <sys/zap.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/disk.h>
#include <sys/dmu_traverse.h>
#include <sys/dnode.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dir.h>
#include <sys/byteorder.h>
#include <sys/sunddi.h>
#include <sys/dirent.h>
#include <sys/policy.h>
#include <sys/queue.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_ioctl.h>
#include <sys/zil.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_rlock.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_raidz.h>
#include <sys/zvol.h>
#include <sys/zil_impl.h>
#include <sys/dataset_kstats.h>
#include <sys/dbuf.h>
#include <sys/dmu_tx.h>
#include <sys/zfeature.h>
#include <sys/zio_checksum.h>
#include <sys/filio.h>

#include <geom/geom.h>
#include <sys/zvol_impl.h>

#include "zfs_namecheck.h"

#define	ZVOL_DUMPSIZE		"dumpsize"

#ifdef ZVOL_LOCK_DEBUG
#define	ZVOL_RW_READER		RW_WRITER
#define	ZVOL_RW_READ_HELD	RW_WRITE_HELD
#else
#define	ZVOL_RW_READER		RW_READER
#define	ZVOL_RW_READ_HELD	RW_READ_HELD
#endif

enum zvol_geom_state {
	ZVOL_GEOM_UNINIT,
	ZVOL_GEOM_STOPPED,
	ZVOL_GEOM_RUNNING,
};

struct zvol_state_os {
#define	zso_dev		_zso_state._zso_dev
#define	zso_geom	_zso_state._zso_geom
	union {
		/* volmode=dev */
		struct zvol_state_dev {
			struct cdev *zsd_cdev;
			uint64_t zsd_sync_cnt;
		} _zso_dev;

		/* volmode=geom */
		struct zvol_state_geom {
			struct g_provider *zsg_provider;
			struct bio_queue_head zsg_queue;
			struct mtx zsg_queue_mtx;
			enum zvol_geom_state zsg_state;
		} _zso_geom;
	} _zso_state;
	int zso_dying;
};

static uint32_t zvol_minors;

SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW, 0, "ZFS VOLUME");
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, mode, CTLFLAG_RWTUN, &zvol_volmode, 0,
    "Expose as GEOM providers (1), device files (2) or neither");
static boolean_t zpool_on_zvol = B_FALSE;
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0,
    "Allow zpools to use zvols as vdevs (DANGEROUS)");

/*
 * Toggle unmap functionality.
 */
boolean_t zvol_unmap_enabled = B_TRUE;

SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN,
    &zvol_unmap_enabled, 0, "Enable UNMAP functionality");
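
/*
 * The knobs above live under the vfs.zfs.vol sysctl tree; CTLFLAG_RWTUN
 * means they can also be set as loader tunables. For example
 * (illustrative):
 *
 *	sysctl vfs.zfs.vol.unmap_enabled=0
 *	echo 'vfs.zfs.vol.mode=2' >> /boot/loader.conf
 */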

/*
 * zvol maximum transfer in one DMU tx.
 */
int zvol_maxphys = DMU_MAX_ACCESS / 2;

static void zvol_ensure_zilog(zvol_state_t *zv);

static d_open_t zvol_cdev_open;
static d_close_t zvol_cdev_close;
static d_ioctl_t zvol_cdev_ioctl;
static d_read_t zvol_cdev_read;
static d_write_t zvol_cdev_write;
static d_strategy_t zvol_geom_bio_strategy;

static struct cdevsw zvol_cdevsw = {
	.d_name =	"zvol",
	.d_version =	D_VERSION,
	.d_flags =	D_DISK | D_TRACKCLOSE,
	.d_open =	zvol_cdev_open,
	.d_close =	zvol_cdev_close,
	.d_ioctl =	zvol_cdev_ioctl,
	.d_read =	zvol_cdev_read,
	.d_write =	zvol_cdev_write,
	.d_strategy =	zvol_geom_bio_strategy,
};

extern uint_t zfs_geom_probe_vdev_key;

struct g_class zfs_zvol_class = {
	.name = "ZFS::ZVOL",
	.version = G_VERSION,
};

DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);

static int zvol_geom_open(struct g_provider *pp, int flag, int count);
static int zvol_geom_close(struct g_provider *pp, int flag, int count);
static void zvol_geom_run(zvol_state_t *zv);
static void zvol_geom_destroy(zvol_state_t *zv);
static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
static void zvol_geom_worker(void *arg);
static void zvol_geom_bio_start(struct bio *bp);
static int zvol_geom_bio_getattr(struct bio *bp);
/* static d_strategy_t zvol_geom_bio_strategy; (declared elsewhere) */

/*
 * GEOM mode implementation
 */
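
/*
 * GEOM provider open routine. On first open it takes zv_suspend_lock,
 * opens the underlying dataset via zvol_first_open() and publishes the
 * media size; it also enforces ZFS's single exclusive (O_EXCL) opener.
 */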
static int
zvol_geom_open(struct g_provider *pp, int flag, int count)
{
	zvol_state_t *zv;
	int err = 0;
	boolean_t drop_suspend = B_FALSE;

	if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) {
		/*
		 * If zfs_geom_probe_vdev_key is set, that means that zfs is
		 * attempting to probe geom providers while looking for a
		 * replacement for a missing VDEV. In this case, the
		 * spa_namespace_lock will not be held, but it is still illegal
		 * to use a zvol as a vdev. Deadlocks can result if another
		 * thread has spa_namespace_lock.
		 */
		return (SET_ERROR(EOPNOTSUPP));
	}

retry:
	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	/*
	 * Obtain a copy of private under zvol_state_lock to make sure either
	 * the result of zvol free code setting private to NULL is observed,
	 * or the zv is protected from being freed because of the positive
	 * zv_open_count.
	 */
	zv = pp->private;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_locked;
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_zso->zso_dying) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_zv_locked;
	}
	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	/*
	 * Make sure zvol is not suspended during first open
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	if (zv->zv_open_count == 0) {
		drop_suspend = B_TRUE;
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* Check to see if zv_suspend_lock is needed. */
			if (zv->zv_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if (zv->zv_open_count == 0) {
		boolean_t drop_namespace = B_FALSE;

		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));

		/*
		 * Take spa_namespace_lock to prevent lock inversion when
		 * zvols from one pool are opened as vdevs in another.
		 */
		if (!mutex_owned(&spa_namespace_lock)) {
			if (!mutex_tryenter(&spa_namespace_lock)) {
				mutex_exit(&zv->zv_state_lock);
				rw_exit(&zv->zv_suspend_lock);
				kern_yield(PRI_USER);
				goto retry;
			} else {
				drop_namespace = B_TRUE;
			}
		}
		err = zvol_first_open(zv, !(flag & FWRITE));
		if (drop_namespace)
			mutex_exit(&spa_namespace_lock);
		if (err)
			goto out_zv_locked;
		pp->mediasize = zv->zv_volsize;
		pp->stripeoffset = 0;
		pp->stripesize = zv->zv_volblocksize;
	}

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/*
	 * Check for a bad on-disk format version now since we
	 * lied about owning the dataset readonly before.
	 */
	if ((flag & FWRITE) && ((zv->zv_flags & ZVOL_RDONLY) ||
	    dmu_objset_incompatible_encryption_version(zv->zv_objset))) {
		err = SET_ERROR(EROFS);
		goto out_opened;
	}
	if (zv->zv_flags & ZVOL_EXCL) {
		err = SET_ERROR(EBUSY);
		goto out_opened;
	}
	if (flag & O_EXCL) {
		if (zv->zv_open_count != 0) {
			err = SET_ERROR(EBUSY);
			goto out_opened;
		}
		zv->zv_flags |= ZVOL_EXCL;
	}

	zv->zv_open_count += count;
out_opened:
	if (zv->zv_open_count == 0) {
		zvol_last_close(zv);
		wakeup(zv);
	}
out_zv_locked:
	mutex_exit(&zv->zv_state_lock);
out_locked:
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (err);
}
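
/*
 * GEOM provider close routine. Drops the open count by "count"; on last
 * close it releases the dataset via zvol_last_close() and wakes anyone
 * sleeping in zvol_wait_close().
 */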
static int
zvol_geom_close(struct g_provider *pp, int flag, int count)
{
	(void) flag;
	zvol_state_t *zv;
	boolean_t drop_suspend = B_TRUE;
	int new_open_count;

	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	zv = pp->private;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (SET_ERROR(ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_flags & ZVOL_EXCL) {
		ASSERT3U(zv->zv_open_count, ==, 1);
		zv->zv_flags &= ~ZVOL_EXCL;
	}

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	/*
	 * If the open count is zero, this is a spurious close.
	 * That indicates a bug in the kernel / DDI framework.
	 */
	ASSERT3U(zv->zv_open_count, >, 0);

	/*
	 * Make sure zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	new_open_count = zv->zv_open_count - count;
	if (new_open_count == 0) {
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* Check to see if zv_suspend_lock is needed. */
			new_open_count = zv->zv_open_count - count;
			if (new_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/*
	 * You may get multiple opens, but only one close.
	 */
	zv->zv_open_count = new_open_count;
	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		zvol_last_close(zv);
		wakeup(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (0);
}

static void
zvol_geom_run(zvol_state_t *zv)
{
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct g_provider *pp = zsg->zsg_provider;

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	g_error_provider(pp, 0);

	kproc_kthread_add(zvol_geom_worker, zv, &system_proc, NULL, 0, 0,
	    "zfskern", "zvol %s", pp->name + sizeof (ZVOL_DRIVER));
}

static void
zvol_geom_destroy(zvol_state_t *zv)
{
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct g_provider *pp = zsg->zsg_provider;

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	g_topology_assert();

	mutex_enter(&zv->zv_state_lock);
	VERIFY3S(zsg->zsg_state, ==, ZVOL_GEOM_RUNNING);
	mutex_exit(&zv->zv_state_lock);
	zsg->zsg_provider = NULL;
	g_wither_geom(pp->geom, ENXIO);
}

void
zvol_wait_close(zvol_state_t *zv)
{

	if (zv->zv_volmode != ZFS_VOLMODE_GEOM)
		return;
	mutex_enter(&zv->zv_state_lock);
	zv->zv_zso->zso_dying = B_TRUE;

	if (zv->zv_open_count)
		msleep(zv, &zv->zv_state_lock,
		    PRIBIO, "zvol:dying", 10*hz);
	mutex_exit(&zv->zv_state_lock);
}
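
/*
 * GEOM ->access() method. GEOM passes deltas for the read, write and
 * exclusive reference counts; fold them into a single open or close
 * with the corresponding FREAD/FWRITE flags.
 */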
static int
zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
{
	int count, error, flags;

	g_topology_assert();

	/*
	 * To make it easier, we expect either open or close, but not both
	 * at the same time.
	 */
	KASSERT((acr >= 0 && acw >= 0 && ace >= 0) ||
	    (acr <= 0 && acw <= 0 && ace <= 0),
	    ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
	    pp->name, acr, acw, ace));

	if (pp->private == NULL) {
		if (acr <= 0 && acw <= 0 && ace <= 0)
			return (0);
		return (pp->error);
	}

	/*
	 * We don't pass FEXCL flag to zvol_geom_open()/zvol_geom_close() if
	 * ace != 0, because GEOM already handles that and handles it a bit
	 * differently. GEOM allows for multiple read/exclusive consumers and
	 * ZFS allows only one exclusive consumer, no matter whether it is a
	 * reader or a writer. I prefer the way GEOM works, so I'll leave it
	 * to GEOM to decide what to do.
	 */

	count = acr + acw + ace;
	if (count == 0)
		return (0);

	flags = 0;
	if (acr != 0 || ace != 0)
		flags |= FREAD;
	if (acw != 0)
		flags |= FWRITE;

	g_topology_unlock();
	if (count > 0)
		error = zvol_geom_open(pp, flags, count);
	else
		error = zvol_geom_close(pp, flags, -count);
	g_topology_lock();
	return (error);
}

static void
zvol_geom_worker(void *arg)
{
	zvol_state_t *zv = arg;
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct bio *bp;

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	thread_lock(curthread);
	sched_prio(curthread, PRIBIO);
	thread_unlock(curthread);

	for (;;) {
		mtx_lock(&zsg->zsg_queue_mtx);
		bp = bioq_takefirst(&zsg->zsg_queue);
		if (bp == NULL) {
			if (zsg->zsg_state == ZVOL_GEOM_STOPPED) {
				zsg->zsg_state = ZVOL_GEOM_RUNNING;
				wakeup(&zsg->zsg_state);
				mtx_unlock(&zsg->zsg_queue_mtx);
				kthread_exit();
			}
			msleep(&zsg->zsg_queue, &zsg->zsg_queue_mtx,
			    PRIBIO | PDROP, "zvol:io", 0);
			continue;
		}
		mtx_unlock(&zsg->zsg_queue_mtx);
		zvol_geom_bio_strategy(bp);
	}
}

static void
zvol_geom_bio_start(struct bio *bp)
{
	zvol_state_t *zv = bp->bio_to->private;
	struct zvol_state_geom *zsg;
	boolean_t first;

	if (zv == NULL) {
		g_io_deliver(bp, ENXIO);
		return;
	}
	if (bp->bio_cmd == BIO_GETATTR) {
		if (zvol_geom_bio_getattr(bp))
			g_io_deliver(bp, EOPNOTSUPP);
		return;
	}

	if (!THREAD_CAN_SLEEP()) {
		zsg = &zv->zv_zso->zso_geom;
		mtx_lock(&zsg->zsg_queue_mtx);
		first = (bioq_first(&zsg->zsg_queue) == NULL);
		bioq_insert_tail(&zsg->zsg_queue, bp);
		mtx_unlock(&zsg->zsg_queue_mtx);
		if (first)
			wakeup_one(&zsg->zsg_queue);
		return;
	}

	zvol_geom_bio_strategy(bp);
}

static int
zvol_geom_bio_getattr(struct bio *bp)
{
	zvol_state_t *zv;

	zv = bp->bio_to->private;
	ASSERT3P(zv, !=, NULL);

	spa_t *spa = dmu_objset_spa(zv->zv_objset);
	uint64_t refd, avail, usedobjs, availobjs;

	if (g_handleattr_int(bp, "GEOM::candelete", 1))
		return (0);
	if (strcmp(bp->bio_attribute, "blocksavail") == 0) {
		dmu_objset_space(zv->zv_objset, &refd, &avail,
		    &usedobjs, &availobjs);
		if (g_handleattr_off_t(bp, "blocksavail", avail / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "blocksused") == 0) {
		dmu_objset_space(zv->zv_objset, &refd, &avail,
		    &usedobjs, &availobjs);
		if (g_handleattr_off_t(bp, "blocksused", refd / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) {
		avail = metaslab_class_get_space(spa_normal_class(spa));
		avail -= metaslab_class_get_alloc(spa_normal_class(spa));
		if (g_handleattr_off_t(bp, "poolblocksavail",
		    avail / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) {
		refd = metaslab_class_get_alloc(spa_normal_class(spa));
		if (g_handleattr_off_t(bp, "poolblocksused", refd / DEV_BSIZE))
			return (0);
	}
	return (1);
}
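
/*
 * Main I/O path, shared by the GEOM and character-device front ends.
 * Translates a struct bio into DMU operations under a range lock:
 * BIO_READ/BIO_WRITE become dmu_read()/dmu_write(), BIO_DELETE becomes
 * dmu_free_long_range(), and BIO_FLUSH commits the ZIL.
 */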
static void
zvol_geom_bio_strategy(struct bio *bp)
{
	zvol_state_t *zv;
	uint64_t off, volsize;
	size_t resid;
	char *addr;
	objset_t *os;
	zfs_locked_range_t *lr;
	int error = 0;
	boolean_t doread = B_FALSE;
	boolean_t is_dumpified;
	boolean_t sync;

	if (bp->bio_to)
		zv = bp->bio_to->private;
	else
		zv = bp->bio_dev->si_drv2;

	if (zv == NULL) {
		error = SET_ERROR(ENXIO);
		goto out;
	}

	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);

	switch (bp->bio_cmd) {
	case BIO_READ:
		doread = B_TRUE;
		break;
	case BIO_WRITE:
	case BIO_FLUSH:
	case BIO_DELETE:
		if (zv->zv_flags & ZVOL_RDONLY) {
			error = SET_ERROR(EROFS);
			goto resume;
		}
		zvol_ensure_zilog(zv);
		if (bp->bio_cmd == BIO_FLUSH)
			goto sync;
		break;
	default:
		error = SET_ERROR(EOPNOTSUPP);
		goto resume;
	}

	off = bp->bio_offset;
	volsize = zv->zv_volsize;

	os = zv->zv_objset;
	ASSERT3P(os, !=, NULL);

	addr = bp->bio_data;
	resid = bp->bio_length;

	if (resid > 0 && off >= volsize) {
		error = SET_ERROR(EIO);
		goto resume;
	}

	is_dumpified = B_FALSE;
	sync = !doread && !is_dumpified &&
	    zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;

	/*
	 * There must be no buffer changes when doing a dmu_sync() because
	 * we can't change the data whilst calculating the checksum.
	 */
	lr = zfs_rangelock_enter(&zv->zv_rangelock, off, resid,
	    doread ? RL_READER : RL_WRITER);

	if (bp->bio_cmd == BIO_DELETE) {
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error != 0) {
			dmu_tx_abort(tx);
		} else {
			zvol_log_truncate(zv, tx, off, resid, sync);
			dmu_tx_commit(tx);
			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
			    off, resid);
			resid = 0;
		}
		goto unlock;
	}
	while (resid != 0 && off < volsize) {
		size_t size = MIN(resid, zvol_maxphys);
		if (doread) {
			error = dmu_read(os, ZVOL_OBJ, off, size, addr,
			    DMU_READ_PREFETCH);
		} else {
			dmu_tx_t *tx = dmu_tx_create(os);
			dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, size);
			error = dmu_tx_assign(tx, TXG_WAIT);
			if (error) {
				dmu_tx_abort(tx);
			} else {
				dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
				zvol_log_write(zv, tx, off, size, sync);
				dmu_tx_commit(tx);
			}
		}
		if (error) {
			/* Convert checksum errors into IO errors. */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
		off += size;
		addr += size;
		resid -= size;
	}
unlock:
	zfs_rangelock_exit(lr);

	bp->bio_completed = bp->bio_length - resid;
	if (bp->bio_completed < bp->bio_length && off > volsize)
		error = SET_ERROR(EINVAL);

	switch (bp->bio_cmd) {
	case BIO_FLUSH:
		break;
	case BIO_READ:
		dataset_kstats_update_read_kstats(&zv->zv_kstat,
		    bp->bio_completed);
		break;
	case BIO_WRITE:
		dataset_kstats_update_write_kstats(&zv->zv_kstat,
		    bp->bio_completed);
		break;
	case BIO_DELETE:
		break;
	default:
		break;
	}

	if (sync) {
sync:
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
	}
resume:
	rw_exit(&zv->zv_suspend_lock);
out:
	if (bp->bio_to)
		g_io_deliver(bp, error);
	else
		biofinish(bp, NULL, error);
}

/*
 * Character device mode implementation
 */
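
/*
 * Character device read entry point. Reads are clamped to the volume
 * size and done through the DMU in chunks under a shared range lock.
 */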
static int
zvol_cdev_read(struct cdev *dev, struct uio *uio_s, int ioflag)
{
	zvol_state_t *zv;
	uint64_t volsize;
	zfs_locked_range_t *lr;
	int error = 0;
	zfs_uio_t uio;

	zfs_uio_init(&uio, uio_s);

	zv = dev->si_drv2;

	volsize = zv->zv_volsize;
	/*
	 * uio_loffset == volsize isn't an error as
	 * it's required for EOF processing.
	 */
	if (zfs_uio_resid(&uio) > 0 &&
	    (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
		return (SET_ERROR(EIO));

	ssize_t start_resid = zfs_uio_resid(&uio);
	lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
	    zfs_uio_resid(&uio), RL_READER);
	while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
		uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);

		/* Don't read past the end. */
		if (bytes > volsize - zfs_uio_offset(&uio))
			bytes = volsize - zfs_uio_offset(&uio);

		error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
		if (error) {
			/* Convert checksum errors into IO errors. */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
	}
	zfs_rangelock_exit(lr);
	int64_t nread = start_resid - zfs_uio_resid(&uio);
	dataset_kstats_update_read_kstats(&zv->zv_kstat, nread);

	return (error);
}

static int
zvol_cdev_write(struct cdev *dev, struct uio *uio_s, int ioflag)
{
	zvol_state_t *zv;
	uint64_t volsize;
	zfs_locked_range_t *lr;
	int error = 0;
	boolean_t sync;
	zfs_uio_t uio;

	zv = dev->si_drv2;

	volsize = zv->zv_volsize;

	zfs_uio_init(&uio, uio_s);

	if (zfs_uio_resid(&uio) > 0 &&
	    (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
		return (SET_ERROR(EIO));

	ssize_t start_resid = zfs_uio_resid(&uio);
	sync = (ioflag & IO_SYNC) ||
	    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);

	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
	zvol_ensure_zilog(zv);

	lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
	    zfs_uio_resid(&uio), RL_WRITER);
	while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
		uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);
		uint64_t off = zfs_uio_offset(&uio);
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);

		if (bytes > volsize - off)	/* Don't write past the end. */
			bytes = volsize - off;

		dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			break;
		}
		error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
		if (error == 0)
			zvol_log_write(zv, tx, off, bytes, sync);
		dmu_tx_commit(tx);

		if (error)
			break;
	}
	zfs_rangelock_exit(lr);
	int64_t nwritten = start_resid - zfs_uio_resid(&uio);
	dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
	if (sync)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
	rw_exit(&zv->zv_suspend_lock);
	return (error);
}
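
/*
 * Character device open routine; the volmode=dev counterpart of
 * zvol_geom_open(). Additionally tracks O_SYNC opens (zsd_sync_cnt) so
 * the ZIL can be switched to synchronous mode for such consumers.
 */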
static int
zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
{
	zvol_state_t *zv;
	struct zvol_state_dev *zsd;
	int err = 0;
	boolean_t drop_suspend = B_FALSE;

retry:
	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	/*
	 * Obtain a copy of si_drv2 under zvol_state_lock to make sure either
	 * the result of zvol free code setting si_drv2 to NULL is observed,
	 * or the zv is protected from being freed because of the positive
	 * zv_open_count.
	 */
	zv = dev->si_drv2;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_locked;
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_zso->zso_dying) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_zv_locked;
	}
	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);

	/*
	 * Make sure zvol is not suspended during first open
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	if (zv->zv_open_count == 0) {
		drop_suspend = B_TRUE;
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* Check to see if zv_suspend_lock is needed. */
			if (zv->zv_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if (zv->zv_open_count == 0) {
		boolean_t drop_namespace = B_FALSE;

		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));

		/*
		 * Take spa_namespace_lock to prevent lock inversion when
		 * zvols from one pool are opened as vdevs in another.
		 */
		if (!mutex_owned(&spa_namespace_lock)) {
			if (!mutex_tryenter(&spa_namespace_lock)) {
				mutex_exit(&zv->zv_state_lock);
				rw_exit(&zv->zv_suspend_lock);
				kern_yield(PRI_USER);
				goto retry;
			} else {
				drop_namespace = B_TRUE;
			}
		}
		err = zvol_first_open(zv, !(flags & FWRITE));
		if (drop_namespace)
			mutex_exit(&spa_namespace_lock);
		if (err)
			goto out_zv_locked;
	}

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
		err = SET_ERROR(EROFS);
		goto out_opened;
	}
	if (zv->zv_flags & ZVOL_EXCL) {
		err = SET_ERROR(EBUSY);
		goto out_opened;
	}
	if (flags & O_EXCL) {
		if (zv->zv_open_count != 0) {
			err = SET_ERROR(EBUSY);
			goto out_opened;
		}
		zv->zv_flags |= ZVOL_EXCL;
	}

	zv->zv_open_count++;
	if (flags & O_SYNC) {
		zsd = &zv->zv_zso->zso_dev;
		zsd->zsd_sync_cnt++;
		if (zsd->zsd_sync_cnt == 1 &&
		    (zv->zv_flags & ZVOL_WRITTEN_TO) != 0)
			zil_async_to_sync(zv->zv_zilog, ZVOL_OBJ);
	}
out_opened:
	if (zv->zv_open_count == 0) {
		zvol_last_close(zv);
		wakeup(zv);
	}
out_zv_locked:
	mutex_exit(&zv->zv_state_lock);
out_locked:
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (err);
}
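
/*
 * Character device close routine; the counterpart of zvol_cdev_open().
 * Drops the O_SYNC count when applicable and releases the dataset on
 * last close.
 */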
static int
zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
{
	zvol_state_t *zv;
	struct zvol_state_dev *zsd;
	boolean_t drop_suspend = B_TRUE;

	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	zv = dev->si_drv2;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (SET_ERROR(ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_flags & ZVOL_EXCL) {
		ASSERT3U(zv->zv_open_count, ==, 1);
		zv->zv_flags &= ~ZVOL_EXCL;
	}

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);

	/*
	 * If the open count is zero, this is a spurious close.
	 * That indicates a bug in the kernel / DDI framework.
	 */
	ASSERT3U(zv->zv_open_count, >, 0);
	/*
	 * Make sure zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	if (zv->zv_open_count == 1) {
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* Check to see if zv_suspend_lock is needed. */
			if (zv->zv_open_count != 1) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/*
	 * You may get multiple opens, but only one close.
	 */
	zv->zv_open_count--;
	if (flags & O_SYNC) {
		zsd = &zv->zv_zso->zso_dev;
		zsd->zsd_sync_cnt--;
	}

	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		zvol_last_close(zv);
		wakeup(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (0);
}
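
/*
 * Character device ioctl handler. Implements the disk(4)-style ioctls
 * used against /dev/zvol nodes: geometry queries (DIOCGSECTORSIZE,
 * DIOCGMEDIASIZE, DIOCGSTRIPE*), cache flush (DIOCGFLUSH), deletion /
 * TRIM (DIOCGDELETE), attribute queries (DIOCGATTR) and hole/data
 * seeking (FIOSEEKHOLE/FIOSEEKDATA).
 */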
static int
zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
    int fflag, struct thread *td)
{
	zvol_state_t *zv;
	zfs_locked_range_t *lr;
	off_t offset, length;
	int error;
	boolean_t sync;

	zv = dev->si_drv2;

	error = 0;
	KASSERT(zv->zv_open_count > 0,
	    ("Device with zero access count in %s", __func__));

	switch (cmd) {
	case DIOCGSECTORSIZE:
		*(uint32_t *)data = DEV_BSIZE;
		break;
	case DIOCGMEDIASIZE:
		*(off_t *)data = zv->zv_volsize;
		break;
	case DIOCGFLUSH:
		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
		if (zv->zv_zilog != NULL)
			zil_commit(zv->zv_zilog, ZVOL_OBJ);
		rw_exit(&zv->zv_suspend_lock);
		break;
	case DIOCGDELETE:
		if (!zvol_unmap_enabled)
			break;

		offset = ((off_t *)data)[0];
		length = ((off_t *)data)[1];
		if ((offset % DEV_BSIZE) != 0 || (length % DEV_BSIZE) != 0 ||
		    offset < 0 || offset >= zv->zv_volsize ||
		    length <= 0) {
			printf("%s: offset=%jd length=%jd\n", __func__, offset,
			    length);
			error = SET_ERROR(EINVAL);
			break;
		}
		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
		zvol_ensure_zilog(zv);
		lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, length,
		    RL_WRITER);
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error != 0) {
			sync = FALSE;
			dmu_tx_abort(tx);
		} else {
			sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
			zvol_log_truncate(zv, tx, offset, length, sync);
			dmu_tx_commit(tx);
			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
			    offset, length);
		}
		zfs_rangelock_exit(lr);
		if (sync)
			zil_commit(zv->zv_zilog, ZVOL_OBJ);
		rw_exit(&zv->zv_suspend_lock);
		break;
	case DIOCGSTRIPESIZE:
		*(off_t *)data = zv->zv_volblocksize;
		break;
	case DIOCGSTRIPEOFFSET:
		*(off_t *)data = 0;
		break;
	case DIOCGATTR: {
		spa_t *spa = dmu_objset_spa(zv->zv_objset);
		struct diocgattr_arg *arg = (struct diocgattr_arg *)data;
		uint64_t refd, avail, usedobjs, availobjs;

		if (strcmp(arg->name, "GEOM::candelete") == 0)
			arg->value.i = 1;
		else if (strcmp(arg->name, "blocksavail") == 0) {
			dmu_objset_space(zv->zv_objset, &refd, &avail,
			    &usedobjs, &availobjs);
			arg->value.off = avail / DEV_BSIZE;
		} else if (strcmp(arg->name, "blocksused") == 0) {
			dmu_objset_space(zv->zv_objset, &refd, &avail,
			    &usedobjs, &availobjs);
			arg->value.off = refd / DEV_BSIZE;
		} else if (strcmp(arg->name, "poolblocksavail") == 0) {
			avail = metaslab_class_get_space(spa_normal_class(spa));
			avail -= metaslab_class_get_alloc(
			    spa_normal_class(spa));
			arg->value.off = avail / DEV_BSIZE;
		} else if (strcmp(arg->name, "poolblocksused") == 0) {
			refd = metaslab_class_get_alloc(spa_normal_class(spa));
			arg->value.off = refd / DEV_BSIZE;
		} else
			error = SET_ERROR(ENOIOCTL);
		break;
	}
	case FIOSEEKHOLE:
	case FIOSEEKDATA: {
		off_t *off = (off_t *)data;
		uint64_t noff;
		boolean_t hole;

		hole = (cmd == FIOSEEKHOLE);
		noff = *off;
		error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff);
		*off = noff;
		break;
	}
	default:
		error = SET_ERROR(ENOIOCTL);
	}

	return (error);
}

/*
 * Misc. helpers
 */

static void
zvol_ensure_zilog(zvol_state_t *zv)
{
	ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));

	/*
	 * Open a ZIL if this is the first time we have written to this
	 * zvol. We protect zv->zv_zilog with zv_suspend_lock rather
	 * than zv_state_lock so that we don't need to acquire an
	 * additional lock in this path.
	 */
	if (zv->zv_zilog == NULL) {
		if (!rw_tryupgrade(&zv->zv_suspend_lock)) {
			rw_exit(&zv->zv_suspend_lock);
			rw_enter(&zv->zv_suspend_lock, RW_WRITER);
		}
		if (zv->zv_zilog == NULL) {
			zv->zv_zilog = zil_open(zv->zv_objset,
			    zvol_get_data);
			zv->zv_flags |= ZVOL_WRITTEN_TO;
			/* replay / destroy done in zvol_os_create_minor() */
			VERIFY0(zv->zv_zilog->zl_header->zh_flags &
			    ZIL_REPLAY_NEEDED);
		}
		rw_downgrade(&zv->zv_suspend_lock);
	}
}

boolean_t
zvol_os_is_zvol(const char *device)
{
	return (device && strncmp(device, ZVOL_DIR, strlen(ZVOL_DIR)) == 0);
}
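
/*
 * Rename the minor node(s) for a volume: the GEOM provider or cdev is
 * destroyed and recreated under the new name, so consumers holding the
 * old node see ENXIO.
 */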
void
zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
{
	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/* Move to a new hashtable entry. */
	zv->zv_hash = zvol_name_hash(zv->zv_name);
	hlist_del(&zv->zv_hlink);
	hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));

	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;
		struct g_geom *gp;

		g_topology_lock();
		gp = pp->geom;
		ASSERT3P(gp, !=, NULL);

		zsg->zsg_provider = NULL;
		g_wither_provider(pp, ENXIO);

		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname);
		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
		pp->sectorsize = DEV_BSIZE;
		pp->mediasize = zv->zv_volsize;
		pp->private = zv;
		zsg->zsg_provider = pp;
		g_error_provider(pp, 0);
		g_topology_unlock();
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev;
		struct make_dev_args args;

		dev = zsd->zsd_cdev;
		if (dev != NULL) {
			destroy_dev(dev);
			dev = zsd->zsd_cdev = NULL;
			if (zv->zv_open_count > 0) {
				zv->zv_flags &= ~ZVOL_EXCL;
				zv->zv_open_count = 0;
				/* XXX need suspend lock but lock order */
				zvol_last_close(zv);
			}
		}

		make_dev_args_init(&args);
		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
		args.mda_devsw = &zvol_cdevsw;
		args.mda_cr = NULL;
		args.mda_uid = UID_ROOT;
		args.mda_gid = GID_OPERATOR;
		args.mda_mode = 0640;
		args.mda_si_drv2 = zv;
		if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname)
		    == 0) {
#if __FreeBSD_version > 1300130
			dev->si_iosize_max = maxphys;
#else
			dev->si_iosize_max = MAXPHYS;
#endif
			zsd->zsd_cdev = dev;
		}
	}
	strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
}

/*
 * Remove minor node for the specified volume.
 */
void
zvol_os_free(zvol_state_t *zv)
{
	ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
	ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
	ASSERT0(zv->zv_open_count);

	ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);

	rw_destroy(&zv->zv_suspend_lock);
	zfs_rangelock_fini(&zv->zv_rangelock);

	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp __maybe_unused = zsg->zsg_provider;

		ASSERT3P(pp->private, ==, NULL);

		g_topology_lock();
		zvol_geom_destroy(zv);
		g_topology_unlock();
		mtx_destroy(&zsg->zsg_queue_mtx);
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev = zsd->zsd_cdev;

		if (dev != NULL) {
			ASSERT3P(dev->si_drv2, ==, NULL);
			destroy_dev(dev);
		}
	}

	mutex_destroy(&zv->zv_state_lock);
	dataset_kstats_destroy(&zv->zv_kstat);
	kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
	kmem_free(zv, sizeof (zvol_state_t));
	zvol_minors--;
}
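
/*
 * For example (illustrative), the volmode property ("geom", "dev",
 * "none" or "default") of each volume selects which flavor of minor
 * node is created below, e.g. "zfs set volmode=dev tank/vol";
 * volmode=default defers to the vfs.zfs.vol.mode sysctl.
 */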

/*
 * Create a minor node (plus a whole lot more) for the specified volume.
 */
int
zvol_os_create_minor(const char *name)
{
	zvol_state_t *zv;
	objset_t *os;
	dmu_object_info_t *doi;
	uint64_t volsize;
	uint64_t volmode, hash;
	int error;

	ZFS_LOG(1, "Creating ZVOL %s...", name);
	hash = zvol_name_hash(name);
	if ((zv = zvol_find_by_name_hash(name, hash, RW_NONE)) != NULL) {
		ASSERT(MUTEX_HELD(&zv->zv_state_lock));
		mutex_exit(&zv->zv_state_lock);
		return (SET_ERROR(EEXIST));
	}

	DROP_GIANT();

	doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);

	/* Lie and say we're read-only. */
	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
	if (error)
		goto out_doi;

	error = dmu_object_info(os, ZVOL_OBJ, doi);
	if (error)
		goto out_dmu_objset_disown;

	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
	if (error)
		goto out_dmu_objset_disown;

	error = dsl_prop_get_integer(name,
	    zfs_prop_to_name(ZFS_PROP_VOLMODE), &volmode, NULL);
	if (error || volmode == ZFS_VOLMODE_DEFAULT)
		volmode = zvol_volmode;
	error = 0;

	/*
	 * zvol_alloc equivalent ...
	 */
	zv = kmem_zalloc(sizeof (*zv), KM_SLEEP);
	zv->zv_hash = hash;
	mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
	zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
	zv->zv_volmode = volmode;
	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp;
		struct g_geom *gp;

		zsg->zsg_state = ZVOL_GEOM_UNINIT;
		mtx_init(&zsg->zsg_queue_mtx, "zvol", NULL, MTX_DEF);

		g_topology_lock();
		gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
		gp->start = zvol_geom_bio_start;
		gp->access = zvol_geom_access;
		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
		pp->sectorsize = DEV_BSIZE;
		pp->mediasize = 0;
		pp->private = zv;

		zsg->zsg_provider = pp;
		bioq_init(&zsg->zsg_queue);
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev;
		struct make_dev_args args;

		make_dev_args_init(&args);
		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
		args.mda_devsw = &zvol_cdevsw;
		args.mda_cr = NULL;
		args.mda_uid = UID_ROOT;
		args.mda_gid = GID_OPERATOR;
		args.mda_mode = 0640;
		args.mda_si_drv2 = zv;
		if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name)
		    == 0) {
#if __FreeBSD_version > 1300130
			dev->si_iosize_max = maxphys;
#else
			dev->si_iosize_max = MAXPHYS;
#endif
			zsd->zsd_cdev = dev;
		}
	}
	(void) strlcpy(zv->zv_name, name, MAXPATHLEN);
	rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
	zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);

	if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
		zv->zv_flags |= ZVOL_RDONLY;

	zv->zv_volblocksize = doi->doi_data_block_size;
	zv->zv_volsize = volsize;
	zv->zv_objset = os;

	ASSERT3P(zv->zv_zilog, ==, NULL);
	zv->zv_zilog = zil_open(os, zvol_get_data);
	if (spa_writeable(dmu_objset_spa(os))) {
		if (zil_replay_disable)
			zil_destroy(zv->zv_zilog, B_FALSE);
		else
			zil_replay(os, zv, zvol_replay_vector);
	}
	zil_close(zv->zv_zilog);
	zv->zv_zilog = NULL;
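
	/*
	 * Replay is done and the ZIL is closed again; it will be reopened
	 * lazily by zvol_ensure_zilog() on the first write.
	 */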

	ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
	dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);

	/* TODO: prefetch for geom tasting */

	zv->zv_objset = NULL;
out_dmu_objset_disown:
	dmu_objset_disown(os, B_TRUE, FTAG);

	if (error == 0 && volmode == ZFS_VOLMODE_GEOM) {
		zvol_geom_run(zv);
		g_topology_unlock();
	}
out_doi:
	kmem_free(doi, sizeof (dmu_object_info_t));
	if (error == 0) {
		rw_enter(&zvol_state_lock, RW_WRITER);
		zvol_insert(zv);
		zvol_minors++;
		rw_exit(&zvol_state_lock);
		ZFS_LOG(1, "ZVOL %s created.", name);
	}
	PICKUP_GIANT();
	return (error);
}

void
zvol_os_clear_private(zvol_state_t *zv)
{
	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;

		if (pp->private == NULL) /* already cleared */
			return;

		mtx_lock(&zsg->zsg_queue_mtx);
		zsg->zsg_state = ZVOL_GEOM_STOPPED;
		pp->private = NULL;
		wakeup_one(&zsg->zsg_queue);
		while (zsg->zsg_state != ZVOL_GEOM_RUNNING)
			msleep(&zsg->zsg_state, &zsg->zsg_queue_mtx,
			    0, "zvol:w", 0);
		mtx_unlock(&zsg->zsg_queue_mtx);
		ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev = zsd->zsd_cdev;

		if (dev != NULL)
			dev->si_drv2 = NULL;
	}
}

int
zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize)
{
	zv->zv_volsize = volsize;
	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;

		g_topology_lock();

		if (pp->private == NULL) {
			g_topology_unlock();
			return (SET_ERROR(ENXIO));
		}

		/*
		 * Do not invoke resize event when initial size was zero.
		 * ZVOL initializes the size on first open, this is not
		 * real resizing.
		 */
		if (pp->mediasize == 0)
			pp->mediasize = zv->zv_volsize;
		else
			g_resize_provider(pp, zv->zv_volsize);

		g_topology_unlock();
	}
	return (0);
}

void
zvol_os_set_disk_ro(zvol_state_t *zv, int flags)
{
	/* XXX? set_disk_ro(zv->zv_zso->zvo_disk, flags); -- Linux-only hook,
	 * currently a no-op on FreeBSD. */
}

void
zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity)
{
	/* XXX? set_capacity(zv->zv_zso->zvo_disk, capacity); -- Linux-only
	 * hook, currently a no-op on FreeBSD. */
}

/*
 * Public interfaces
 */

int
zvol_busy(void)
{
	return (zvol_minors != 0);
}

int
zvol_init(void)
{
	zvol_init_impl();
	return (0);
}

void
zvol_fini(void)
{
	zvol_fini_impl();
}