/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 *
 * Copyright (c) 2006-2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
 * All rights reserved.
 *
 * Portions Copyright 2010 Robert Milkowski
 *
 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

/* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */

/*
 * ZFS volume emulation driver.
 *
 * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
 * Volumes are accessed through the symbolic links named:
 *
 *	/dev/zvol/<pool_name>/<dataset_name>
 *
 * Volumes are persistent through reboot.  No user command needs to be
 * run before opening and using a device.
 *
 * On FreeBSD ZVOLs are simply GEOM providers like any other storage device
 * in the system, except when they are exposed as plain character devices
 * (volmode=dev).
 */
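/*
 * Two personalities are implemented below: a GEOM provider (volmode=geom,
 * serviced by the zvol_geom_* functions) and a raw character device
 * (volmode=dev, serviced by the zvol_cdev_* functions).  For example, after
 * "zfs create -V 10g tank/vol" the volume appears as /dev/zvol/tank/vol and
 * can be used like any other disk (newfs, swapon, etc.).
 */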
#include <sys/types.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/errno.h>
#include <sys/uio.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/cmn_err.h>
#include <sys/stat.h>
#include <sys/proc.h>
#include <sys/zap.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/disk.h>
#include <sys/dmu_traverse.h>
#include <sys/dnode.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dir.h>
#include <sys/byteorder.h>
#include <sys/sunddi.h>
#include <sys/dirent.h>
#include <sys/policy.h>
#include <sys/queue.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_ioctl.h>
#include <sys/zil.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_rlock.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_raidz.h>
#include <sys/zvol.h>
#include <sys/zil_impl.h>
#include <sys/dataset_kstats.h>
#include <sys/dbuf.h>
#include <sys/dmu_tx.h>
#include <sys/zfeature.h>
#include <sys/zio_checksum.h>
#include <sys/filio.h>

#include <geom/geom.h>
#include <sys/zvol_impl.h>

#include "zfs_namecheck.h"

#define	ZVOL_DUMPSIZE	"dumpsize"

#ifdef ZVOL_LOCK_DEBUG
#define	ZVOL_RW_READER		RW_WRITER
#define	ZVOL_RW_READ_HELD	RW_WRITE_HELD
#else
#define	ZVOL_RW_READER		RW_READER
#define	ZVOL_RW_READ_HELD	RW_READ_HELD
#endif

enum zvol_geom_state {
	ZVOL_GEOM_UNINIT,
	ZVOL_GEOM_STOPPED,
	ZVOL_GEOM_RUNNING,
};

struct zvol_state_os {
	int zso_volmode;
#define	zso_dev		_zso_state._zso_dev
#define	zso_geom	_zso_state._zso_geom
	union {
		/* volmode=dev */
		struct zvol_state_dev {
			struct cdev *zsd_cdev;
			uint64_t zsd_sync_cnt;
		} _zso_dev;

		/* volmode=geom */
		struct zvol_state_geom {
			struct g_provider *zsg_provider;
			struct bio_queue_head zsg_queue;
			struct mtx zsg_queue_mtx;
			enum zvol_geom_state zsg_state;
		} _zso_geom;
	} _zso_state;
};

static uint32_t zvol_minors;

SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW, 0, "ZFS VOLUME");
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, mode, CTLFLAG_RWTUN, &zvol_volmode, 0,
	"Expose as GEOM providers (1), device files (2) or neither");
static boolean_t zpool_on_zvol = B_FALSE;
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0,
	"Allow zpools to use zvols as vdevs (DANGEROUS)");

/*
 * Toggle unmap functionality.
 */
boolean_t zvol_unmap_enabled = B_TRUE;

SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN,
	&zvol_unmap_enabled, 0, "Enable UNMAP functionality");

/*
 * Maximum zvol transfer size in one DMU tx.
 */
int zvol_maxphys = DMU_MAX_ACCESS / 2;
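/*
 * Illustrative usage of the tunables above (they surface under the
 * vfs.zfs.vol sysctl node declared above):
 *
 *	sysctl vfs.zfs.vol.mode=2		# prefer character devices
 *	sysctl vfs.zfs.vol.unmap_enabled=0	# disable BIO_DELETE/UNMAP
 */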
static void zvol_ensure_zilog(zvol_state_t *zv);

static d_open_t zvol_cdev_open;
static d_close_t zvol_cdev_close;
static d_ioctl_t zvol_cdev_ioctl;
static d_read_t zvol_cdev_read;
static d_write_t zvol_cdev_write;
static d_strategy_t zvol_geom_bio_strategy;

static struct cdevsw zvol_cdevsw = {
	.d_name =	"zvol",
	.d_version =	D_VERSION,
	.d_flags =	D_DISK | D_TRACKCLOSE,
	.d_open =	zvol_cdev_open,
	.d_close =	zvol_cdev_close,
	.d_ioctl =	zvol_cdev_ioctl,
	.d_read =	zvol_cdev_read,
	.d_write =	zvol_cdev_write,
	.d_strategy =	zvol_geom_bio_strategy,
};

extern uint_t zfs_geom_probe_vdev_key;

struct g_class zfs_zvol_class = {
	.name = "ZFS::ZVOL",
	.version = G_VERSION,
};

DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);

static int zvol_geom_open(struct g_provider *pp, int flag, int count);
static int zvol_geom_close(struct g_provider *pp, int flag, int count);
static void zvol_geom_run(zvol_state_t *zv);
static void zvol_geom_destroy(zvol_state_t *zv);
static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
static void zvol_geom_worker(void *arg);
static void zvol_geom_bio_start(struct bio *bp);
static int zvol_geom_bio_getattr(struct bio *bp);
/* zvol_geom_bio_strategy is declared with the cdevsw methods above */

/*
 * GEOM mode implementation
 */

/*ARGSUSED*/
static int
zvol_geom_open(struct g_provider *pp, int flag, int count)
{
	zvol_state_t *zv;
	int err = 0;
	boolean_t drop_suspend = B_TRUE;
	boolean_t drop_namespace = B_FALSE;

	if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) {
		/*
		 * If zfs_geom_probe_vdev_key is set, that means that zfs is
		 * attempting to probe geom providers while looking for a
		 * replacement for a missing VDEV.  In this case, the
		 * spa_namespace_lock will not be held, but it is still illegal
		 * to use a zvol as a vdev.  Deadlocks can result if another
		 * thread has spa_namespace_lock.
		 */
		return (SET_ERROR(EOPNOTSUPP));
	}

retry:
	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	zv = pp->private;
	if (zv == NULL) {
		if (drop_namespace)
			mutex_exit(&spa_namespace_lock);
		rw_exit(&zvol_state_lock);
		return (SET_ERROR(ENXIO));
	}

	if (zv->zv_open_count == 0 && !mutex_owned(&spa_namespace_lock)) {
		/*
		 * We need to guarantee that the namespace lock is held
		 * to avoid spurious failures in zvol_first_open.
		 */
		drop_namespace = B_TRUE;
		if (!mutex_tryenter(&spa_namespace_lock)) {
			rw_exit(&zvol_state_lock);
			mutex_enter(&spa_namespace_lock);
			goto retry;
		}
	}
	mutex_enter(&zv->zv_state_lock);

	ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM);

	/*
	 * Make sure the zvol is not suspended during first open
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	if (zv->zv_open_count == 0) {
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* check to see if zv_suspend_lock is needed */
			if (zv->zv_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		err = zvol_first_open(zv, !(flag & FWRITE));
		if (err)
			goto out_mutex;
		pp->mediasize = zv->zv_volsize;
		pp->stripeoffset = 0;
		pp->stripesize = zv->zv_volblocksize;
	}

	/*
	 * Check for a bad on-disk format version now since we
	 * lied about owning the dataset readonly before.
	 */
	if ((flag & FWRITE) && ((zv->zv_flags & ZVOL_RDONLY) ||
	    dmu_objset_incompatible_encryption_version(zv->zv_objset))) {
		err = EROFS;
		goto out_open_count;
	}
	if (zv->zv_flags & ZVOL_EXCL) {
		err = EBUSY;
		goto out_open_count;
	}
#ifdef FEXCL
	if (flag & FEXCL) {
		if (zv->zv_open_count != 0) {
			err = EBUSY;
			goto out_open_count;
		}
		zv->zv_flags |= ZVOL_EXCL;
	}
#endif

	zv->zv_open_count += count;
	if (drop_namespace)
		mutex_exit(&spa_namespace_lock);
	mutex_exit(&zv->zv_state_lock);
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (0);

out_open_count:
	if (zv->zv_open_count == 0)
		zvol_last_close(zv);
out_mutex:
	if (drop_namespace)
		mutex_exit(&spa_namespace_lock);
	mutex_exit(&zv->zv_state_lock);
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (SET_ERROR(err));
}

/*ARGSUSED*/
static int
zvol_geom_close(struct g_provider *pp, int flag, int count)
{
	zvol_state_t *zv;
	boolean_t drop_suspend = B_TRUE;

	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	zv = pp->private;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (SET_ERROR(ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_flags & ZVOL_EXCL) {
		ASSERT(zv->zv_open_count == 1);
		zv->zv_flags &= ~ZVOL_EXCL;
	}

	ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM);

	/*
	 * If the open count is zero, this is a spurious close.
	 * That indicates a bug in the kernel / DDI framework.
	 */
	ASSERT(zv->zv_open_count > 0);

	/*
	 * Make sure the zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	if ((zv->zv_open_count - count) == 0) {
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* check to see if zv_suspend_lock is needed */
			if (zv->zv_open_count != 1) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/*
	 * You may get multiple opens, but only one close.
	 */
	zv->zv_open_count -= count;

	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		zvol_last_close(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (0);
}

static void
zvol_geom_run(zvol_state_t *zv)
{
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct g_provider *pp = zsg->zsg_provider;

	ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM);

	g_error_provider(pp, 0);

	kproc_kthread_add(zvol_geom_worker, zv, &system_proc, NULL, 0, 0,
	    "zfskern", "zvol %s", pp->name + sizeof (ZVOL_DRIVER));
}

static void
zvol_geom_destroy(zvol_state_t *zv)
{
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct g_provider *pp = zsg->zsg_provider;

	ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM);

	g_topology_assert();

	mutex_enter(&zv->zv_state_lock);
	VERIFY(zsg->zsg_state == ZVOL_GEOM_RUNNING);
	mutex_exit(&zv->zv_state_lock);
	zsg->zsg_provider = NULL;
	pp->private = NULL;
	g_wither_geom(pp->geom, ENXIO);
}

static int
zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
{
	int count, error, flags;

	g_topology_assert();

	/*
	 * To make it easier we expect either open or close, but not both
	 * at the same time.
	 */
	KASSERT((acr >= 0 && acw >= 0 && ace >= 0) ||
	    (acr <= 0 && acw <= 0 && ace <= 0),
	    ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
	    pp->name, acr, acw, ace));

	if (pp->private == NULL) {
		if (acr <= 0 && acw <= 0 && ace <= 0)
			return (0);
		return (pp->error);
	}

	/*
	 * We don't pass the FEXCL flag to zvol_geom_open()/zvol_geom_close()
	 * if ace != 0, because GEOM already handles that and handles it a bit
	 * differently.  GEOM allows for multiple read/exclusive consumers and
	 * ZFS allows only one exclusive consumer, no matter if it is reader or
	 * writer.  I prefer the way GEOM works, so I'll leave it to GEOM
	 * to decide what to do.
	 */

	count = acr + acw + ace;
	if (count == 0)
		return (0);

	flags = 0;
	if (acr != 0 || ace != 0)
		flags |= FREAD;
	if (acw != 0)
		flags |= FWRITE;

	g_topology_unlock();
	if (count > 0)
		error = zvol_geom_open(pp, flags, count);
	else
		error = zvol_geom_close(pp, flags, -count);
	g_topology_lock();
	return (error);
}

static void
zvol_geom_worker(void *arg)
{
	zvol_state_t *zv = arg;
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct bio *bp;

	ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM);

	thread_lock(curthread);
	sched_prio(curthread, PRIBIO);
	thread_unlock(curthread);

	for (;;) {
		mtx_lock(&zsg->zsg_queue_mtx);
		bp = bioq_takefirst(&zsg->zsg_queue);
		if (bp == NULL) {
			if (zsg->zsg_state == ZVOL_GEOM_STOPPED) {
				zsg->zsg_state = ZVOL_GEOM_RUNNING;
				wakeup(&zsg->zsg_state);
				mtx_unlock(&zsg->zsg_queue_mtx);
				kthread_exit();
			}
			msleep(&zsg->zsg_queue, &zsg->zsg_queue_mtx,
			    PRIBIO | PDROP, "zvol:io", 0);
			continue;
		}
		mtx_unlock(&zsg->zsg_queue_mtx);
		zvol_geom_bio_strategy(bp);
	}
}

static void
zvol_geom_bio_start(struct bio *bp)
{
	zvol_state_t *zv = bp->bio_to->private;
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	boolean_t first;

	if (bp->bio_cmd == BIO_GETATTR) {
		if (zvol_geom_bio_getattr(bp))
			g_io_deliver(bp, EOPNOTSUPP);
		return;
	}

	if (!THREAD_CAN_SLEEP()) {
		/*
		 * Defer the I/O to the worker thread; the strategy path may
		 * sleep (e.g. in dmu_tx_assign()) and we cannot sleep here.
		 */
		mtx_lock(&zsg->zsg_queue_mtx);
		first = (bioq_first(&zsg->zsg_queue) == NULL);
		bioq_insert_tail(&zsg->zsg_queue, bp);
		mtx_unlock(&zsg->zsg_queue_mtx);
		if (first)
			wakeup_one(&zsg->zsg_queue);
		return;
	}

	zvol_geom_bio_strategy(bp);
}

static int
zvol_geom_bio_getattr(struct bio *bp)
{
	zvol_state_t *zv;

	zv = bp->bio_to->private;
	ASSERT(zv != NULL);

	spa_t *spa = dmu_objset_spa(zv->zv_objset);
	uint64_t refd, avail, usedobjs, availobjs;

	if (g_handleattr_int(bp, "GEOM::candelete", 1))
		return (0);
	if (strcmp(bp->bio_attribute, "blocksavail") == 0) {
		dmu_objset_space(zv->zv_objset, &refd, &avail,
		    &usedobjs, &availobjs);
		if (g_handleattr_off_t(bp, "blocksavail", avail / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "blocksused") == 0) {
		dmu_objset_space(zv->zv_objset, &refd, &avail,
		    &usedobjs, &availobjs);
		if (g_handleattr_off_t(bp, "blocksused", refd / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) {
		avail = metaslab_class_get_space(spa_normal_class(spa));
		avail -= metaslab_class_get_alloc(spa_normal_class(spa));
		if (g_handleattr_off_t(bp, "poolblocksavail",
		    avail / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) {
		refd = metaslab_class_get_alloc(spa_normal_class(spa));
		if (g_handleattr_off_t(bp, "poolblocksused", refd / DEV_BSIZE))
			return (0);
	}
	return (1);
}

static void
zvol_geom_bio_strategy(struct bio *bp)
{
	zvol_state_t *zv;
	uint64_t off, volsize;
	size_t resid;
	char *addr;
	objset_t *os;
	zfs_locked_range_t *lr;
	int error = 0;
	boolean_t doread = B_FALSE;
	boolean_t is_dumpified;
	boolean_t sync;

	if (bp->bio_to)
		zv = bp->bio_to->private;
	else
		zv = bp->bio_dev->si_drv2;

	if (zv == NULL) {
		error = SET_ERROR(ENXIO);
		goto out;
	}

	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);

	switch (bp->bio_cmd) {
	case BIO_READ:
		doread = B_TRUE;
		break;
	case BIO_WRITE:
	case BIO_FLUSH:
	case BIO_DELETE:
		if (zv->zv_flags & ZVOL_RDONLY) {
			error = SET_ERROR(EROFS);
			goto resume;
		}
		zvol_ensure_zilog(zv);
		if (bp->bio_cmd == BIO_FLUSH)
			goto sync;
		break;
	default:
		error = EOPNOTSUPP;
		goto resume;
	}

	off = bp->bio_offset;
	volsize = zv->zv_volsize;

	os = zv->zv_objset;
	ASSERT(os != NULL);

	addr = bp->bio_data;
	resid = bp->bio_length;

	if (resid > 0 && off >= volsize) {
		error = SET_ERROR(EIO);
		goto resume;
	}

	is_dumpified = B_FALSE;
	sync = !doread && !is_dumpified &&
	    zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;

	/*
	 * There must be no buffer changes when doing a dmu_sync() because
	 * we can't change the data whilst calculating the checksum.
	 */
	lr = zfs_rangelock_enter(&zv->zv_rangelock, off, resid,
	    doread ? RL_READER : RL_WRITER);

	if (bp->bio_cmd == BIO_DELETE) {
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error != 0) {
			dmu_tx_abort(tx);
		} else {
			zvol_log_truncate(zv, tx, off, resid, sync);
			dmu_tx_commit(tx);
			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
			    off, resid);
			resid = 0;
		}
		goto unlock;
	}
	while (resid != 0 && off < volsize) {
		size_t size = MIN(resid, zvol_maxphys);
		if (doread) {
			error = dmu_read(os, ZVOL_OBJ, off, size, addr,
			    DMU_READ_PREFETCH);
		} else {
			dmu_tx_t *tx = dmu_tx_create(os);
			dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, size);
			error = dmu_tx_assign(tx, TXG_WAIT);
			if (error) {
				dmu_tx_abort(tx);
			} else {
				dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
				zvol_log_write(zv, tx, off, size, sync);
				dmu_tx_commit(tx);
			}
		}
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
		off += size;
		addr += size;
		resid -= size;
	}
unlock:
	zfs_rangelock_exit(lr);

	bp->bio_completed = bp->bio_length - resid;
	if (bp->bio_completed < bp->bio_length && off > volsize)
		error = EINVAL;

	switch (bp->bio_cmd) {
	case BIO_FLUSH:
		break;
	case BIO_READ:
		dataset_kstats_update_read_kstats(&zv->zv_kstat,
		    bp->bio_completed);
		break;
	case BIO_WRITE:
		dataset_kstats_update_write_kstats(&zv->zv_kstat,
		    bp->bio_completed);
		break;
	case BIO_DELETE:
		break;
	default:
		break;
	}

	if (sync) {
sync:
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
	}
resume:
	rw_exit(&zv->zv_suspend_lock);
out:
	if (bp->bio_to)
		g_io_deliver(bp, error);
	else
		biofinish(bp, NULL, error);
}

/*
 * Character device mode implementation
 */

static int
zvol_cdev_read(struct cdev *dev, struct uio *uio, int ioflag)
{
	zvol_state_t *zv;
	uint64_t volsize;
	zfs_locked_range_t *lr;
	int error = 0;

	zv = dev->si_drv2;

	volsize = zv->zv_volsize;
	/*
	 * uio_loffset == volsize isn't an error as
	 * it's required for EOF processing.
	 */
	if (uio->uio_resid > 0 &&
	    (uio->uio_loffset < 0 || uio->uio_loffset > volsize))
		return (SET_ERROR(EIO));

	lr = zfs_rangelock_enter(&zv->zv_rangelock, uio->uio_loffset,
	    uio->uio_resid, RL_READER);
	while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
		uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);

		/* don't read past the end */
		if (bytes > volsize - uio->uio_loffset)
			bytes = volsize - uio->uio_loffset;

		error = dmu_read_uio_dnode(zv->zv_dn, uio, bytes);
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
	}
	zfs_rangelock_exit(lr);

	return (error);
}

static int
zvol_cdev_write(struct cdev *dev, struct uio *uio, int ioflag)
{
	zvol_state_t *zv;
	uint64_t volsize;
	zfs_locked_range_t *lr;
	int error = 0;
	boolean_t sync;

	zv = dev->si_drv2;

	volsize = zv->zv_volsize;

	if (uio->uio_resid > 0 &&
	    (uio->uio_loffset < 0 || uio->uio_loffset > volsize))
		return (SET_ERROR(EIO));

	sync = (ioflag & IO_SYNC) ||
	    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);

	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
	zvol_ensure_zilog(zv);

	lr = zfs_rangelock_enter(&zv->zv_rangelock, uio->uio_loffset,
	    uio->uio_resid, RL_WRITER);
	while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
		uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
		uint64_t off = uio->uio_loffset;
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);

		if (bytes > volsize - off)	/* don't write past the end */
			bytes = volsize - off;

		dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			break;
		}
		error = dmu_write_uio_dnode(zv->zv_dn, uio, bytes, tx);
		if (error == 0)
			zvol_log_write(zv, tx, off, bytes, sync);
		dmu_tx_commit(tx);

		if (error)
			break;
	}
	zfs_rangelock_exit(lr);
	if (sync)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
	rw_exit(&zv->zv_suspend_lock);
	return (error);
}

static int
zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
{
	zvol_state_t *zv;
	struct zvol_state_dev *zsd;
	int err = 0;
	boolean_t drop_suspend = B_TRUE;

	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	zv = dev->si_drv2;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (SET_ERROR(ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);

	ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV);

	/*
	 * Make sure the zvol is not suspended during first open
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	if (zv->zv_open_count == 0) {
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* check to see if zv_suspend_lock is needed */
			if (zv->zv_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		err = zvol_first_open(zv, !(flags & FWRITE));
		if (err)
			goto out_locked;
	}

	if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
		err = EROFS;
		goto out_opened;
	}
	if (zv->zv_flags & ZVOL_EXCL) {
		err = EBUSY;
		goto out_opened;
	}
#ifdef FEXCL
	if (flags & FEXCL) {
		if (zv->zv_open_count != 0) {
			err = EBUSY;
			goto out_opened;
		}
		zv->zv_flags |= ZVOL_EXCL;
	}
#endif

	zv->zv_open_count++;
	if (flags & (FSYNC | FDSYNC)) {
		zsd = &zv->zv_zso->zso_dev;
		zsd->zsd_sync_cnt++;
		if (zsd->zsd_sync_cnt == 1 &&
		    (zv->zv_flags & ZVOL_WRITTEN_TO) != 0)
			zil_async_to_sync(zv->zv_zilog, ZVOL_OBJ);
	}

	mutex_exit(&zv->zv_state_lock);
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (0);

out_opened:
	if (zv->zv_open_count == 0)
		zvol_last_close(zv);
out_locked:
	mutex_exit(&zv->zv_state_lock);
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (SET_ERROR(err));
}

static int
zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
{
	zvol_state_t *zv;
	struct zvol_state_dev *zsd;
	boolean_t drop_suspend = B_TRUE;

	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	zv = dev->si_drv2;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (SET_ERROR(ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_flags & ZVOL_EXCL) {
		ASSERT(zv->zv_open_count == 1);
		zv->zv_flags &= ~ZVOL_EXCL;
	}

	ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV);

	/*
	 * If the open count is zero, this is a spurious close.
	 * That indicates a bug in the kernel / DDI framework.
	 */
	ASSERT(zv->zv_open_count > 0);
	/*
	 * Make sure the zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	if (zv->zv_open_count == 1) {
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* check to see if zv_suspend_lock is needed */
			if (zv->zv_open_count != 1) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/*
	 * You may get multiple opens, but only one close.
	 */
	zv->zv_open_count--;
	if (flags & (FSYNC | FDSYNC)) {
		zsd = &zv->zv_zso->zso_dev;
		zsd->zsd_sync_cnt--;
	}

	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		zvol_last_close(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (0);
}

static int
zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
    int fflag, struct thread *td)
{
	zvol_state_t *zv;
	zfs_locked_range_t *lr;
	off_t offset, length;
	int i, error;
	boolean_t sync;

	zv = dev->si_drv2;

	error = 0;
	KASSERT(zv->zv_open_count > 0,
	    ("Device with zero access count in %s", __func__));

	i = IOCPARM_LEN(cmd);
	switch (cmd) {
	case DIOCGSECTORSIZE:
		*(uint32_t *)data = DEV_BSIZE;
		break;
	case DIOCGMEDIASIZE:
		*(off_t *)data = zv->zv_volsize;
		break;
	case DIOCGFLUSH:
		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
		if (zv->zv_zilog != NULL)
			zil_commit(zv->zv_zilog, ZVOL_OBJ);
		rw_exit(&zv->zv_suspend_lock);
		break;
	case DIOCGDELETE:
		if (!zvol_unmap_enabled)
			break;

		offset = ((off_t *)data)[0];
		length = ((off_t *)data)[1];
		if ((offset % DEV_BSIZE) != 0 || (length % DEV_BSIZE) != 0 ||
		    offset < 0 || offset >= zv->zv_volsize ||
		    length <= 0) {
			printf("%s: offset=%jd length=%jd\n", __func__, offset,
			    length);
			error = EINVAL;
			break;
		}
		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
		zvol_ensure_zilog(zv);
		lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, length,
		    RL_WRITER);
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error != 0) {
			sync = FALSE;
			dmu_tx_abort(tx);
		} else {
			sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
			zvol_log_truncate(zv, tx, offset, length, sync);
			dmu_tx_commit(tx);
			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
			    offset, length);
		}
		zfs_rangelock_exit(lr);
		if (sync)
			zil_commit(zv->zv_zilog, ZVOL_OBJ);
		rw_exit(&zv->zv_suspend_lock);
		break;
	case DIOCGSTRIPESIZE:
		*(off_t *)data = zv->zv_volblocksize;
		break;
	case DIOCGSTRIPEOFFSET:
		*(off_t *)data = 0;
		break;
	case DIOCGATTR: {
		spa_t *spa = dmu_objset_spa(zv->zv_objset);
		struct diocgattr_arg *arg = (struct diocgattr_arg *)data;
		uint64_t refd, avail, usedobjs, availobjs;

		if (strcmp(arg->name, "GEOM::candelete") == 0)
			arg->value.i = 1;
		else if (strcmp(arg->name, "blocksavail") == 0) {
			dmu_objset_space(zv->zv_objset, &refd, &avail,
			    &usedobjs, &availobjs);
			arg->value.off = avail / DEV_BSIZE;
		} else if (strcmp(arg->name, "blocksused") == 0) {
			dmu_objset_space(zv->zv_objset, &refd, &avail,
			    &usedobjs, &availobjs);
			arg->value.off = refd / DEV_BSIZE;
		} else if (strcmp(arg->name, "poolblocksavail") == 0) {
			avail = metaslab_class_get_space(spa_normal_class(spa));
			avail -= metaslab_class_get_alloc(
			    spa_normal_class(spa));
			arg->value.off = avail / DEV_BSIZE;
		} else if (strcmp(arg->name, "poolblocksused") == 0) {
			refd = metaslab_class_get_alloc(spa_normal_class(spa));
			arg->value.off = refd / DEV_BSIZE;
		} else
			error = ENOIOCTL;
		break;
	}
	case FIOSEEKHOLE:
	case FIOSEEKDATA: {
		off_t *off = (off_t *)data;
		uint64_t noff;
		boolean_t hole;
		hole = (cmd == FIOSEEKHOLE);
		noff = *off;
		error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff);
		*off = noff;
		break;
	}
	default:
		error = ENOIOCTL;
	}

	return (error);
}

/*
 * Misc. helpers
 */

static void
zvol_ensure_zilog(zvol_state_t *zv)
{
	ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));

	/*
	 * Open a ZIL if this is the first time we have written to this
	 * zvol.  We protect zv->zv_zilog with zv_suspend_lock rather
	 * than zv_state_lock so that we don't need to acquire an
	 * additional lock in this path.
	 */
	if (zv->zv_zilog == NULL) {
		if (!rw_tryupgrade(&zv->zv_suspend_lock)) {
			rw_exit(&zv->zv_suspend_lock);
			rw_enter(&zv->zv_suspend_lock, RW_WRITER);
		}
		if (zv->zv_zilog == NULL) {
			zv->zv_zilog = zil_open(zv->zv_objset,
			    zvol_get_data);
			zv->zv_flags |= ZVOL_WRITTEN_TO;
		}
		rw_downgrade(&zv->zv_suspend_lock);
	}
}

static boolean_t
zvol_is_zvol_impl(const char *device)
{
	return (device && strncmp(device, ZVOL_DIR, strlen(ZVOL_DIR)) == 0);
}

static void
zvol_rename_minor(zvol_state_t *zv, const char *newname)
{
	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/* move to new hashtable entry */
	zv->zv_hash = zvol_name_hash(zv->zv_name);
	hlist_del(&zv->zv_hlink);
	hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));

	if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;
		struct g_geom *gp;

		g_topology_lock();
		gp = pp->geom;
		ASSERT(gp != NULL);

		zsg->zsg_provider = NULL;
		g_wither_provider(pp, ENXIO);

		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname);
		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
		pp->sectorsize = DEV_BSIZE;
		pp->mediasize = zv->zv_volsize;
		pp->private = zv;
		zsg->zsg_provider = pp;
		g_error_provider(pp, 0);
		g_topology_unlock();
	} else if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev;
		struct make_dev_args args;

		dev = zsd->zsd_cdev;
		if (dev != NULL) {
			destroy_dev(dev);
			dev = zsd->zsd_cdev = NULL;
			if (zv->zv_open_count > 0) {
				zv->zv_flags &= ~ZVOL_EXCL;
				zv->zv_open_count = 0;
				/* XXX need suspend lock but lock order */
				zvol_last_close(zv);
			}
		}

		make_dev_args_init(&args);
		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
		args.mda_devsw = &zvol_cdevsw;
		args.mda_cr = NULL;
		args.mda_uid = UID_ROOT;
		args.mda_gid = GID_OPERATOR;
		args.mda_mode = 0640;
		args.mda_si_drv2 = zv;
		if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname)
		    == 0) {
			dev->si_iosize_max = MAXPHYS;
			zsd->zsd_cdev = dev;
		}
	}
	strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
}
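/*
 * Note on rename semantics: for volmode=dev the cdev is destroyed and
 * recreated under the new name, which force-closes any open instances (see
 * the zv_open_count reset above).  GEOM consumers instead observe the old
 * provider withering and a new one arriving under the new name.
 */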
/*
 * Remove minor node for the specified volume.
 */
static void
zvol_free(zvol_state_t *zv)
{
	ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
	ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
	ASSERT(zv->zv_open_count == 0);

	ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);

	rw_destroy(&zv->zv_suspend_lock);
	zfs_rangelock_fini(&zv->zv_rangelock);

	if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;

		g_topology_lock();
		zvol_geom_destroy(zv);
		g_topology_unlock();
		mtx_destroy(&zsg->zsg_queue_mtx);
	} else if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev = zsd->zsd_cdev;

		if (dev != NULL)
			destroy_dev(dev);
	}

	mutex_destroy(&zv->zv_state_lock);
	dataset_kstats_destroy(&zv->zv_kstat);
	kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
	kmem_free(zv, sizeof (zvol_state_t));
	zvol_minors--;
}

/*
 * Create a minor node (plus a whole lot more) for the specified volume.
 */
static int
zvol_create_minor_impl(const char *name)
{
	zvol_state_t *zv;
	objset_t *os;
	dmu_object_info_t *doi;
	uint64_t volsize;
	uint64_t volmode, hash;
	int error;

	ZFS_LOG(1, "Creating ZVOL %s...", name);

	hash = zvol_name_hash(name);
	if ((zv = zvol_find_by_name_hash(name, hash, RW_NONE)) != NULL) {
		ASSERT(MUTEX_HELD(&zv->zv_state_lock));
		mutex_exit(&zv->zv_state_lock);
		return (SET_ERROR(EEXIST));
	}

	DROP_GIANT();
	/* lie and say we're read-only */
	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
	doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);

	if (error)
		goto out_doi;

	error = dmu_object_info(os, ZVOL_OBJ, doi);
	if (error)
		goto out_dmu_objset_disown;

	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
	if (error)
		goto out_dmu_objset_disown;

	error = dsl_prop_get_integer(name,
	    zfs_prop_to_name(ZFS_PROP_VOLMODE), &volmode, NULL);
	if (error != 0 || volmode == ZFS_VOLMODE_DEFAULT)
		volmode = zvol_volmode;
	/*
	 * zvol_alloc equivalent ...
	 */
	zv = kmem_zalloc(sizeof (*zv), KM_SLEEP);
	zv->zv_hash = hash;
	mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
	zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
	zv->zv_zso->zso_volmode = volmode;
	if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp;
		struct g_geom *gp;

		zsg->zsg_state = ZVOL_GEOM_UNINIT;
		mtx_init(&zsg->zsg_queue_mtx, "zvol", NULL, MTX_DEF);

		g_topology_lock();
		gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
		gp->start = zvol_geom_bio_start;
		gp->access = zvol_geom_access;
		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
		/* TODO: NULL check? */
		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
		pp->sectorsize = DEV_BSIZE;
		pp->mediasize = 0;
		pp->private = zv;

		zsg->zsg_provider = pp;
		bioq_init(&zsg->zsg_queue);
	} else if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev;
		struct make_dev_args args;

		make_dev_args_init(&args);
		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
		args.mda_devsw = &zvol_cdevsw;
		args.mda_cr = NULL;
		args.mda_uid = UID_ROOT;
		args.mda_gid = GID_OPERATOR;
		args.mda_mode = 0640;
		args.mda_si_drv2 = zv;
		error = make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name);
		if (error != 0) {
			mutex_destroy(&zv->zv_state_lock);
			kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
			kmem_free(zv, sizeof (*zv));
			dmu_objset_disown(os, B_TRUE, FTAG);
			goto out_giant;
		}
		dev->si_iosize_max = MAXPHYS;
		zsd->zsd_cdev = dev;
	}
	(void) strlcpy(zv->zv_name, name, MAXPATHLEN);
	rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
	zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);

	if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
		zv->zv_flags |= ZVOL_RDONLY;

	zv->zv_volblocksize = doi->doi_data_block_size;
	zv->zv_volsize = volsize;
	zv->zv_objset = os;

	if (spa_writeable(dmu_objset_spa(os))) {
		if (zil_replay_disable)
			zil_destroy(dmu_objset_zil(os), B_FALSE);
		else
			zil_replay(os, zv, zvol_replay_vector);
	}
	ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
	dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);

	/* XXX do prefetch */

	zv->zv_objset = NULL;
out_dmu_objset_disown:
	dmu_objset_disown(os, B_TRUE, FTAG);

	if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) {
		if (error == 0)
			zvol_geom_run(zv);
		g_topology_unlock();
	}
out_doi:
	kmem_free(doi, sizeof (dmu_object_info_t));
	if (error == 0) {
		rw_enter(&zvol_state_lock, RW_WRITER);
		zvol_insert(zv);
		zvol_minors++;
		rw_exit(&zvol_state_lock);
	}
	ZFS_LOG(1, "ZVOL %s created.", name);
out_giant:
	PICKUP_GIANT();
	return (error);
}

static void
zvol_clear_private(zvol_state_t *zv)
{
	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
	if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;

		if (pp == NULL) /* XXX when? */
			return;

		mtx_lock(&zsg->zsg_queue_mtx);
		zsg->zsg_state = ZVOL_GEOM_STOPPED;
		pp->private = NULL;
		wakeup_one(&zsg->zsg_queue);
		while (zsg->zsg_state != ZVOL_GEOM_RUNNING)
			msleep(&zsg->zsg_state,
			    &zsg->zsg_queue_mtx,
			    0, "zvol:w", 0);
		mtx_unlock(&zsg->zsg_queue_mtx);
		ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
	}
}

static int
zvol_update_volsize(zvol_state_t *zv, uint64_t volsize)
{
	zv->zv_volsize = volsize;
	if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;

		if (pp == NULL) /* XXX when? */
			return (0);

		g_topology_lock();

		/*
		 * Do not invoke resize event when initial size was zero.
		 * ZVOL initializes the size on first open, this is not
		 * real resizing.
		 */
		if (pp->mediasize == 0)
			pp->mediasize = zv->zv_volsize;
		else
			g_resize_provider(pp, zv->zv_volsize);

		g_topology_unlock();
	}
	return (0);
}

static void
zvol_set_disk_ro_impl(zvol_state_t *zv, int flags)
{
	/* XXX: no-op on FreeBSD; the Linux port uses set_disk_ro() here. */
}

static void
zvol_set_capacity_impl(zvol_state_t *zv, uint64_t capacity)
{
	/* XXX: no-op on FreeBSD; the Linux port uses set_capacity() here. */
}

const static zvol_platform_ops_t zvol_freebsd_ops = {
	.zv_free = zvol_free,
	.zv_rename_minor = zvol_rename_minor,
	.zv_create_minor = zvol_create_minor_impl,
	.zv_update_volsize = zvol_update_volsize,
	.zv_clear_private = zvol_clear_private,
	.zv_is_zvol = zvol_is_zvol_impl,
	.zv_set_disk_ro = zvol_set_disk_ro_impl,
	.zv_set_capacity = zvol_set_capacity_impl,
};

/*
 * Public interfaces
 */

int
zvol_busy(void)
{
	return (zvol_minors != 0);
}

int
zvol_init(void)
{
	zvol_init_impl();
	zvol_register_ops(&zvol_freebsd_ops);
	return (0);
}

void
zvol_fini(void)
{
	zvol_fini_impl();
}
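/*
 * The three functions above form the platform entry points: the module
 * load path is expected to call zvol_init() once (registering
 * zvol_freebsd_ops with the platform-independent zvol code) and
 * zvol_fini() on unload; zvol_busy() lets the caller refuse to unload
 * while minor nodes still exist.
 */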