/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
 */

#include <sys/dataset_kstats.h>
#include <sys/dbuf.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dir.h>
#include <sys/zap.h>
#include <sys/zfeature.h>
#include <sys/zil_impl.h>
#include <sys/dmu_tx.h>
#include <sys/zio.h>
#include <sys/zfs_rlock.h>
#include <sys/spa_impl.h>
#include <sys/zvol.h>
#include <sys/zvol_impl.h>
#include <cityhash.h>

#include <linux/blkdev_compat.h>
#include <linux/task_io_accounting_ops.h>

#ifdef HAVE_BLK_MQ
#include <linux/blk-mq.h>
#endif

static void zvol_request_impl(zvol_state_t *zv, struct bio *bio,
    struct request *rq, boolean_t force_sync);

static unsigned int zvol_major = ZVOL_MAJOR;
static unsigned int zvol_request_sync = 0;
static unsigned int zvol_prefetch_bytes = (128 * 1024);
static unsigned long zvol_max_discard_blocks = 16384;

/*
 * Switch taskq at multiple of 512 MB offset. This can be set to a lower value
 * to utilize more threads for small files but may affect prefetch hits.
 */
#define	ZVOL_TASKQ_OFFSET_SHIFT 29

#ifndef HAVE_BLKDEV_GET_ERESTARTSYS
static unsigned int zvol_open_timeout_ms = 1000;
#endif

static unsigned int zvol_threads = 0;
#ifdef HAVE_BLK_MQ
static unsigned int zvol_blk_mq_threads = 0;
static unsigned int zvol_blk_mq_actual_threads;
static boolean_t zvol_use_blk_mq = B_FALSE;

/*
 * The maximum number of volblocksize blocks to process per thread. Typically,
 * write heavy workloads perform better with higher values here, and read
 * heavy workloads perform better with lower values, but that's not a hard
 * and fast rule. It's basically a knob to tune between "less overhead with
 * less parallelism" and "more overhead, but more parallelism".
 *
 * '8' was chosen as a reasonable, balanced, default based off of sequential
 * read and write tests to a zvol in an NVMe pool (with 16 CPUs).
 */
static unsigned int zvol_blk_mq_blocks_per_thread = 8;
#endif

static unsigned int zvol_num_taskqs = 0;

#ifndef BLKDEV_DEFAULT_RQ
/* BLKDEV_MAX_RQ was renamed to BLKDEV_DEFAULT_RQ in the 5.16 kernel */
#define	BLKDEV_DEFAULT_RQ BLKDEV_MAX_RQ
#endif

/*
 * Finalize our BIO or request.
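 * END_IO() completes whichever of the two is set: the BIO is finished with
 * BIO_END_IO(), or the blk-mq request is finished with blk_mq_end_request().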
 */
#ifdef HAVE_BLK_MQ
#define	END_IO(zv, bio, rq, error)  do { \
	if (bio) { \
		BIO_END_IO(bio, error); \
	} else { \
		blk_mq_end_request(rq, errno_to_bi_status(error)); \
	} \
} while (0)
#else
#define	END_IO(zv, bio, rq, error)  BIO_END_IO(bio, error)
#endif

#ifdef HAVE_BLK_MQ
static unsigned int zvol_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ;
static unsigned int zvol_actual_blk_mq_queue_depth;
#endif

struct zvol_state_os {
	struct gendisk *zvo_disk;	/* generic disk */
	struct request_queue *zvo_queue;	/* request queue */
	dev_t zvo_dev;			/* device id */

#ifdef HAVE_BLK_MQ
	struct blk_mq_tag_set tag_set;
#endif

	/* Set from the global 'zvol_use_blk_mq' at zvol load */
	boolean_t use_blk_mq;
};

typedef struct zv_taskq {
	uint_t tqs_cnt;
	taskq_t **tqs_taskq;
} zv_taskq_t;
static zv_taskq_t zvol_taskqs;
static struct ida zvol_ida;

typedef struct zv_request_stack {
	zvol_state_t *zv;
	struct bio *bio;
	struct request *rq;
} zv_request_t;

typedef struct zv_work {
	struct request *rq;
	struct work_struct work;
} zv_work_t;

typedef struct zv_request_task {
	zv_request_t zvr;
	taskq_ent_t ent;
} zv_request_task_t;

static zv_request_task_t *
zv_request_task_create(zv_request_t zvr)
{
	zv_request_task_t *task;
	task = kmem_alloc(sizeof (zv_request_task_t), KM_SLEEP);
	taskq_init_ent(&task->ent);
	task->zvr = zvr;
	return (task);
}

static void
zv_request_task_free(zv_request_task_t *task)
{
	kmem_free(task, sizeof (*task));
}

#ifdef HAVE_BLK_MQ

/*
 * This is called when a new block multiqueue request comes in. A request
 * contains one or more BIOs.
 */
static blk_status_t zvol_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
    const struct blk_mq_queue_data *bd)
{
	struct request *rq = bd->rq;
	zvol_state_t *zv = rq->q->queuedata;

	/* Tell the kernel that we are starting to process this request */
	blk_mq_start_request(rq);

	if (blk_rq_is_passthrough(rq)) {
		/* Skip non filesystem request */
		blk_mq_end_request(rq, BLK_STS_IOERR);
		return (BLK_STS_IOERR);
	}

	zvol_request_impl(zv, NULL, rq, 0);

	/* Acknowledge to the kernel that we got this request */
	return (BLK_STS_OK);
}

static struct blk_mq_ops zvol_blk_mq_queue_ops = {
	.queue_rq = zvol_mq_queue_rq,
};

/* Initialize our blk-mq struct */
static int zvol_blk_mq_alloc_tag_set(zvol_state_t *zv)
{
	struct zvol_state_os *zso = zv->zv_zso;

	memset(&zso->tag_set, 0, sizeof (zso->tag_set));

	/* Initialize tag set. */
	zso->tag_set.ops = &zvol_blk_mq_queue_ops;
	zso->tag_set.nr_hw_queues = zvol_blk_mq_actual_threads;
	zso->tag_set.queue_depth = zvol_actual_blk_mq_queue_depth;
	zso->tag_set.numa_node = NUMA_NO_NODE;
	zso->tag_set.cmd_size = 0;

	/*
	 * We need BLK_MQ_F_BLOCKING here since we do blocking calls in
	 * zvol_request_impl()
	 */
	zso->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
	zso->tag_set.driver_data = zv;

	return (blk_mq_alloc_tag_set(&zso->tag_set));
}
#endif /* HAVE_BLK_MQ */

/*
 * Given a path, return TRUE if path is a ZVOL.
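 * The path is resolved to a block device and its major number is compared
 * against the zvol_major module parameter.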
 */
boolean_t
zvol_os_is_zvol(const char *path)
{
	dev_t dev = 0;

	if (vdev_lookup_bdev(path, &dev) != 0)
		return (B_FALSE);

	if (MAJOR(dev) == zvol_major)
		return (B_TRUE);

	return (B_FALSE);
}

static void
zvol_write(zv_request_t *zvr)
{
	struct bio *bio = zvr->bio;
	struct request *rq = zvr->rq;
	int error = 0;
	zfs_uio_t uio;
	zvol_state_t *zv = zvr->zv;
	struct request_queue *q;
	struct gendisk *disk;
	unsigned long start_time = 0;
	boolean_t acct = B_FALSE;

	ASSERT3P(zv, !=, NULL);
	ASSERT3U(zv->zv_open_count, >, 0);
	ASSERT3P(zv->zv_zilog, !=, NULL);

	q = zv->zv_zso->zvo_queue;
	disk = zv->zv_zso->zvo_disk;

	/* A bio marked as FLUSH needs to flush before the write */
	if (io_is_flush(bio, rq))
		zil_commit(zv->zv_zilog, ZVOL_OBJ);

	/* Some requests are just for flush and nothing else. */
	if (io_size(bio, rq) == 0) {
		rw_exit(&zv->zv_suspend_lock);
		END_IO(zv, bio, rq, 0);
		return;
	}

	zfs_uio_bvec_init(&uio, bio, rq);

	ssize_t start_resid = uio.uio_resid;

	/*
	 * With use_blk_mq, accounting is done by blk_mq_start_request()
	 * and blk_mq_end_request(), so we can skip it here.
	 */
	if (bio) {
		acct = blk_queue_io_stat(q);
		if (acct) {
			start_time = blk_generic_start_io_acct(q, disk, WRITE,
			    bio);
		}
	}

	boolean_t sync =
	    io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;

	zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
	    uio.uio_loffset, uio.uio_resid, RL_WRITER);

	uint64_t volsize = zv->zv_volsize;
	while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
		uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
		uint64_t off = uio.uio_loffset;
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);

		if (bytes > volsize - off)	/* don't write past the end */
			bytes = volsize - off;

		dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);

		/* This will only fail for ENOSPC */
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			break;
		}
		error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
		if (error == 0) {
			zvol_log_write(zv, tx, off, bytes, sync);
		}
		dmu_tx_commit(tx);

		if (error)
			break;
	}
	zfs_rangelock_exit(lr);

	int64_t nwritten = start_resid - uio.uio_resid;
	dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
	task_io_account_write(nwritten);

	if (sync)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);

	rw_exit(&zv->zv_suspend_lock);

	if (bio && acct) {
		blk_generic_end_io_acct(q, disk, WRITE, bio, start_time);
	}

	END_IO(zv, bio, rq, -error);
}

static void
zvol_write_task(void *arg)
{
	zv_request_task_t *task = arg;
	zvol_write(&task->zvr);
	zv_request_task_free(task);
}

static void
zvol_discard(zv_request_t *zvr)
{
	struct bio *bio = zvr->bio;
	struct request *rq = zvr->rq;
	zvol_state_t *zv = zvr->zv;
	uint64_t start = io_offset(bio, rq);
	uint64_t size = io_size(bio, rq);
	uint64_t end = start + size;
	boolean_t sync;
	int error = 0;
	dmu_tx_t *tx;
	struct request_queue *q = zv->zv_zso->zvo_queue;
	struct gendisk *disk = zv->zv_zso->zvo_disk;
	unsigned long start_time = 0;
	boolean_t acct = B_FALSE;

	ASSERT3P(zv, !=, NULL);
	ASSERT3U(zv->zv_open_count, >, 0);
	ASSERT3P(zv->zv_zilog, !=, NULL);
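
	/*
	 * As in zvol_write() and zvol_read(), generic I/O accounting below
	 * is only needed for the BIO path; blk-mq requests are accounted by
	 * blk_mq_start_request() and blk_mq_end_request().
	 */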
	if (bio) {
		acct = blk_queue_io_stat(q);
		if (acct) {
			start_time = blk_generic_start_io_acct(q, disk, WRITE,
			    bio);
		}
	}

	sync = io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;

	if (end > zv->zv_volsize) {
		error = SET_ERROR(EIO);
		goto unlock;
	}

	/*
	 * Align the request to volume block boundaries when a secure erase is
	 * not required. This will prevent dnode_free_range() from zeroing out
	 * the unaligned parts which is slow (read-modify-write) and useless
	 * since we are not freeing any space by doing so.
	 */
	if (!io_is_secure_erase(bio, rq)) {
		start = P2ROUNDUP(start, zv->zv_volblocksize);
		end = P2ALIGN_TYPED(end, zv->zv_volblocksize, uint64_t);
		size = end - start;
	}

	if (start >= end)
		goto unlock;

	zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
	    start, size, RL_WRITER);

	tx = dmu_tx_create(zv->zv_objset);
	dmu_tx_mark_netfree(tx);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error != 0) {
		dmu_tx_abort(tx);
	} else {
		zvol_log_truncate(zv, tx, start, size);
		dmu_tx_commit(tx);
		error = dmu_free_long_range(zv->zv_objset,
		    ZVOL_OBJ, start, size);
	}
	zfs_rangelock_exit(lr);

	if (error == 0 && sync)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);

unlock:
	rw_exit(&zv->zv_suspend_lock);

	if (bio && acct) {
		blk_generic_end_io_acct(q, disk, WRITE, bio,
		    start_time);
	}

	END_IO(zv, bio, rq, -error);
}

static void
zvol_discard_task(void *arg)
{
	zv_request_task_t *task = arg;
	zvol_discard(&task->zvr);
	zv_request_task_free(task);
}

static void
zvol_read(zv_request_t *zvr)
{
	struct bio *bio = zvr->bio;
	struct request *rq = zvr->rq;
	int error = 0;
	zfs_uio_t uio;
	boolean_t acct = B_FALSE;
	zvol_state_t *zv = zvr->zv;
	struct request_queue *q;
	struct gendisk *disk;
	unsigned long start_time = 0;

	ASSERT3P(zv, !=, NULL);
	ASSERT3U(zv->zv_open_count, >, 0);

	zfs_uio_bvec_init(&uio, bio, rq);

	q = zv->zv_zso->zvo_queue;
	disk = zv->zv_zso->zvo_disk;

	ssize_t start_resid = uio.uio_resid;

	/*
	 * When blk-mq is being used, accounting is done by
	 * blk_mq_start_request() and blk_mq_end_request().
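	 * The blk_generic_start_io_acct()/blk_generic_end_io_acct() calls
	 * below therefore only run for the BIO path.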
	 */
	if (bio) {
		acct = blk_queue_io_stat(q);
		if (acct)
			start_time = blk_generic_start_io_acct(q, disk, READ,
			    bio);
	}

	zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
	    uio.uio_loffset, uio.uio_resid, RL_READER);

	uint64_t volsize = zv->zv_volsize;

	while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
		uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);

		/* don't read past the end */
		if (bytes > volsize - uio.uio_loffset)
			bytes = volsize - uio.uio_loffset;

		error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
	}
	zfs_rangelock_exit(lr);

	int64_t nread = start_resid - uio.uio_resid;
	dataset_kstats_update_read_kstats(&zv->zv_kstat, nread);
	task_io_account_read(nread);

	rw_exit(&zv->zv_suspend_lock);

	if (bio && acct) {
		blk_generic_end_io_acct(q, disk, READ, bio, start_time);
	}

	END_IO(zv, bio, rq, -error);
}

static void
zvol_read_task(void *arg)
{
	zv_request_task_t *task = arg;
	zvol_read(&task->zvr);
	zv_request_task_free(task);
}


/*
 * Process a BIO or request
 *
 * Either 'bio' or 'rq' should be set depending on if we are processing a
 * bio or a request (both should not be set).
 *
 * force_sync:	Set to 0 to defer processing to a background taskq
 *		Set to 1 to process data synchronously
 */
static void
zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
    boolean_t force_sync)
{
	fstrans_cookie_t cookie = spl_fstrans_mark();
	uint64_t offset = io_offset(bio, rq);
	uint64_t size = io_size(bio, rq);
	int rw = io_data_dir(bio, rq);

	if (zvol_request_sync || zv->zv_threading == B_FALSE)
		force_sync = 1;

	zv_request_t zvr = {
		.zv = zv,
		.bio = bio,
		.rq = rq,
	};

	if (io_has_data(bio, rq) && offset + size > zv->zv_volsize) {
		printk(KERN_INFO "%s: bad access: offset=%llu, size=%lu\n",
		    zv->zv_zso->zvo_disk->disk_name,
		    (long long unsigned)offset,
		    (long unsigned)size);

		END_IO(zv, bio, rq, -SET_ERROR(EIO));
		goto out;
	}

	zv_request_task_t *task;
	zv_taskq_t *ztqs = &zvol_taskqs;
	uint_t blk_mq_hw_queue = 0;
	uint_t tq_idx;
	uint_t taskq_hash;
#ifdef HAVE_BLK_MQ
	if (rq)
#ifdef HAVE_BLK_MQ_RQ_HCTX
		blk_mq_hw_queue = rq->mq_hctx->queue_num;
#else
		blk_mq_hw_queue =
		    rq->q->queue_hw_ctx[rq->q->mq_map[rq->cpu]]->queue_num;
#endif
#endif
	taskq_hash = cityhash4((uintptr_t)zv, offset >> ZVOL_TASKQ_OFFSET_SHIFT,
	    blk_mq_hw_queue, 0);
	tq_idx = taskq_hash % ztqs->tqs_cnt;

	if (rw == WRITE) {
		if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
			END_IO(zv, bio, rq, -SET_ERROR(EROFS));
			goto out;
		}

		/*
		 * Prevents the zvol from being suspended, or the ZIL being
		 * concurrently opened. Will be released after the i/o
		 * completes.
		 */
		rw_enter(&zv->zv_suspend_lock, RW_READER);

		/*
		 * Open a ZIL if this is the first time we have written to this
		 * zvol. We protect zv->zv_zilog with zv_suspend_lock rather
		 * than zv_state_lock so that we don't need to acquire an
		 * additional lock in this path.
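		 * The NULL check is repeated after upgrading to the write
		 * lock below, since another thread may have opened the ZIL
		 * while zv_suspend_lock was briefly dropped.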
		 */
		if (zv->zv_zilog == NULL) {
			rw_exit(&zv->zv_suspend_lock);
			rw_enter(&zv->zv_suspend_lock, RW_WRITER);
			if (zv->zv_zilog == NULL) {
				zv->zv_zilog = zil_open(zv->zv_objset,
				    zvol_get_data, &zv->zv_kstat.dk_zil_sums);
				zv->zv_flags |= ZVOL_WRITTEN_TO;
				/* replay / destroy done in zvol_create_minor */
				VERIFY0((zv->zv_zilog->zl_header->zh_flags &
				    ZIL_REPLAY_NEEDED));
			}
			rw_downgrade(&zv->zv_suspend_lock);
		}

		/*
		 * We don't want this thread to be blocked waiting for i/o to
		 * complete, so we instead wait from a taskq callback. The
		 * i/o may be a ZIL write (via zil_commit()), or a read of an
		 * indirect block, or a read of a data block (if this is a
		 * partial-block write). We will indicate that the i/o is
		 * complete by calling END_IO() from the taskq callback.
		 *
		 * This design allows the calling thread to continue and
		 * initiate more concurrent operations by calling
		 * zvol_request() again. There are typically only a small
		 * number of threads available to call zvol_request() (e.g.
		 * one per iSCSI target), so keeping the latency of
		 * zvol_request() low is important for performance.
		 *
		 * The zvol_request_sync module parameter allows this
		 * behavior to be altered, for performance evaluation
		 * purposes. If the callback blocks, setting
		 * zvol_request_sync=1 will result in much worse performance.
		 *
		 * We can have up to zvol_threads concurrent i/o's being
		 * processed for all zvols on the system. This is typically
		 * a vast improvement over the zvol_request_sync=1 behavior
		 * of one i/o at a time per zvol. However, an even better
		 * design would be for zvol_request() to initiate the zio
		 * directly, and then be notified by the zio_done callback,
		 * which would call END_IO(). Unfortunately, the DMU/ZIL
		 * interfaces lack this functionality (they block waiting for
		 * the i/o to complete).
		 */
		if (io_is_discard(bio, rq) || io_is_secure_erase(bio, rq)) {
			if (force_sync) {
				zvol_discard(&zvr);
			} else {
				task = zv_request_task_create(zvr);
				taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],
				    zvol_discard_task, task, 0, &task->ent);
			}
		} else {
			if (force_sync) {
				zvol_write(&zvr);
			} else {
				task = zv_request_task_create(zvr);
				taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],
				    zvol_write_task, task, 0, &task->ent);
			}
		}
	} else {
		/*
		 * The SCST driver, and possibly others, may issue READ I/Os
		 * with a length of zero bytes. These empty I/Os contain no
		 * data and require no additional handling.
		 */
		if (size == 0) {
			END_IO(zv, bio, rq, 0);
			goto out;
		}

		rw_enter(&zv->zv_suspend_lock, RW_READER);

		/* See comment in WRITE case above. */
		if (force_sync) {
			zvol_read(&zvr);
		} else {
			task = zv_request_task_create(zvr);
			taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],
			    zvol_read_task, task, 0, &task->ent);
		}
	}

out:
	spl_fstrans_unmark(cookie);
}

#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
#ifdef HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID
static void
zvol_submit_bio(struct bio *bio)
#else
static blk_qc_t
zvol_submit_bio(struct bio *bio)
#endif
#else
static MAKE_REQUEST_FN_RET
zvol_request(struct request_queue *q, struct bio *bio)
#endif
{
#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
#if defined(HAVE_BIO_BDEV_DISK)
	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
#else
	struct request_queue *q = bio->bi_disk->queue;
#endif
#endif
	zvol_state_t *zv = q->queuedata;

	zvol_request_impl(zv, bio, NULL, 0);
#if defined(HAVE_MAKE_REQUEST_FN_RET_QC) || \
	defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \
	!defined(HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID)
	return (BLK_QC_T_NONE);
#endif
}

static int
#ifdef HAVE_BLK_MODE_T
zvol_open(struct gendisk *disk, blk_mode_t flag)
#else
zvol_open(struct block_device *bdev, fmode_t flag)
#endif
{
	zvol_state_t *zv;
	int error = 0;
	boolean_t drop_suspend = B_FALSE;
#ifndef HAVE_BLKDEV_GET_ERESTARTSYS
	hrtime_t timeout = MSEC2NSEC(zvol_open_timeout_ms);
	hrtime_t start = gethrtime();

retry:
#endif
	rw_enter(&zvol_state_lock, RW_READER);
	/*
	 * Obtain a copy of private_data under the zvol_state_lock to make
	 * sure that either the result of zvol free code path setting
	 * disk->private_data to NULL is observed, or zvol_os_free()
	 * is not called on this zv because of the positive zv_open_count.
	 */
#ifdef HAVE_BLK_MODE_T
	zv = disk->private_data;
#else
	zv = bdev->bd_disk->private_data;
#endif
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (SET_ERROR(-ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);
	/*
	 * Make sure zvol is not suspended during first open
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock
	 */
	if (zv->zv_open_count == 0) {
		if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* check to see if zv_suspend_lock is needed */
			if (zv->zv_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
			} else {
				drop_suspend = B_TRUE;
			}
		} else {
			drop_suspend = B_TRUE;
		}
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if (zv->zv_open_count == 0) {
		boolean_t drop_namespace = B_FALSE;

		ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));

		/*
		 * In all other call paths the spa_namespace_lock is taken
		 * before the bdev->bd_mutex lock. However, on open(2)
		 * the __blkdev_get() function calls fops->open() with the
		 * bdev->bd_mutex lock held. This can result in a deadlock
		 * when zvols from one pool are used as vdevs in another.
		 *
		 * To prevent a lock inversion deadlock we preemptively
		 * take the spa_namespace_lock. Normally the lock will not
		 * be contended and this is safe because spa_open_common()
		 * handles the case where the caller already holds the
		 * spa_namespace_lock.
		 *
		 * When the lock cannot be acquired after multiple retries
		 * this must be the vdev on zvol deadlock case and we have
		 * no choice but to return an error. For 5.12 and older
		 * kernels returning -ERESTARTSYS will result in the
		 * bdev->bd_mutex being dropped, then reacquired, and
		 * fops->open() being called again. This process can be
		 * repeated safely until both locks are acquired. For 5.13
		 * and newer the -ERESTARTSYS retry logic was removed from
		 * the kernel so the only option is to return the error for
		 * the caller to handle it.
		 */
		if (!mutex_owned(&spa_namespace_lock)) {
			if (!mutex_tryenter(&spa_namespace_lock)) {
				mutex_exit(&zv->zv_state_lock);
				rw_exit(&zv->zv_suspend_lock);

#ifdef HAVE_BLKDEV_GET_ERESTARTSYS
				schedule();
				return (SET_ERROR(-ERESTARTSYS));
#else
				if ((gethrtime() - start) > timeout)
					return (SET_ERROR(-ERESTARTSYS));

				schedule_timeout_interruptible(
				    MSEC_TO_TICK(10));
				goto retry;
#endif
			} else {
				drop_namespace = B_TRUE;
			}
		}

		error = -zvol_first_open(zv, !(blk_mode_is_open_write(flag)));

		if (drop_namespace)
			mutex_exit(&spa_namespace_lock);
	}

	if (error == 0) {
		if ((blk_mode_is_open_write(flag)) &&
		    (zv->zv_flags & ZVOL_RDONLY)) {
			if (zv->zv_open_count == 0)
				zvol_last_close(zv);

			error = SET_ERROR(-EROFS);
		} else {
			zv->zv_open_count++;
		}
	}

	mutex_exit(&zv->zv_state_lock);
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);

	if (error == 0)
#ifdef HAVE_BLK_MODE_T
		disk_check_media_change(disk);
#else
		zfs_check_media_change(bdev);
#endif

	return (error);
}

static void
#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG
zvol_release(struct gendisk *disk)
#else
zvol_release(struct gendisk *disk, fmode_t unused)
#endif
{
#if !defined(HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG)
	(void) unused;
#endif
	zvol_state_t *zv;
	boolean_t drop_suspend = B_TRUE;

	rw_enter(&zvol_state_lock, RW_READER);
	zv = disk->private_data;

	mutex_enter(&zv->zv_state_lock);
	ASSERT3U(zv->zv_open_count, >, 0);
	/*
	 * make sure zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock
	 */
	if (zv->zv_open_count == 1) {
		if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* check to see if zv_suspend_lock is needed */
			if (zv->zv_open_count != 1) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	zv->zv_open_count--;
	if (zv->zv_open_count == 0) {
		ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
		zvol_last_close(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
}

static int
zvol_ioctl(struct block_device *bdev, fmode_t mode,
    unsigned int cmd, unsigned long arg)
{
	zvol_state_t *zv = bdev->bd_disk->private_data;
	int error = 0;

	ASSERT3U(zv->zv_open_count, >, 0);

	switch (cmd) {
	case BLKFLSBUF:
#ifdef HAVE_FSYNC_BDEV
		fsync_bdev(bdev);
#elif defined(HAVE_SYNC_BLOCKDEV)
		sync_blockdev(bdev);
#else
#error "Neither fsync_bdev() nor sync_blockdev() found"
#endif
		invalidate_bdev(bdev);
		rw_enter(&zv->zv_suspend_lock, RW_READER);

		if (!(zv->zv_flags & ZVOL_RDONLY))
			txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);

		rw_exit(&zv->zv_suspend_lock);
		break;

	case BLKZNAME:
		mutex_enter(&zv->zv_state_lock);
		error = copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN);
		mutex_exit(&zv->zv_state_lock);
		break;

	default:
		error = -ENOTTY;
		break;
	}

	return (SET_ERROR(error));
}

#ifdef CONFIG_COMPAT
static int
zvol_compat_ioctl(struct block_device *bdev, fmode_t mode,
    unsigned cmd, unsigned long arg)
{
	return (zvol_ioctl(bdev, mode, cmd, arg));
}
#else
#define	zvol_compat_ioctl	NULL
#endif

static unsigned int
zvol_check_events(struct gendisk *disk, unsigned int clearing)
{
	unsigned int mask = 0;

	rw_enter(&zvol_state_lock, RW_READER);

	zvol_state_t *zv = disk->private_data;
	if (zv != NULL) {
		mutex_enter(&zv->zv_state_lock);
		mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0;
		zv->zv_changed = 0;
		mutex_exit(&zv->zv_state_lock);
	}

	rw_exit(&zvol_state_lock);

	return (mask);
}

static int
zvol_revalidate_disk(struct gendisk *disk)
{
	rw_enter(&zvol_state_lock, RW_READER);

	zvol_state_t *zv = disk->private_data;
	if (zv != NULL) {
		mutex_enter(&zv->zv_state_lock);
		set_capacity(zv->zv_zso->zvo_disk,
		    zv->zv_volsize >> SECTOR_BITS);
		mutex_exit(&zv->zv_state_lock);
	}

	rw_exit(&zvol_state_lock);

	return (0);
}

int
zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize)
{
	struct gendisk *disk = zv->zv_zso->zvo_disk;

#if defined(HAVE_REVALIDATE_DISK_SIZE)
	revalidate_disk_size(disk, zvol_revalidate_disk(disk) == 0);
#elif defined(HAVE_REVALIDATE_DISK)
	revalidate_disk(disk);
#else
	zvol_revalidate_disk(disk);
#endif
	return (0);
}

void
zvol_os_clear_private(zvol_state_t *zv)
{
	/*
	 * Cleared while holding zvol_state_lock as a writer
	 * which will prevent zvol_open() from opening it.
	 */
	zv->zv_zso->zvo_disk->private_data = NULL;
}

/*
 * Provide a simple virtual geometry for legacy compatibility. For devices
 * smaller than 1 MiB a small head and sector count is used to allow very
 * tiny devices. For devices over 1 MiB a standard head and sector count
 * is used to keep the cylinders count reasonable.
 */
static int
zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
	zvol_state_t *zv = bdev->bd_disk->private_data;
	sector_t sectors;

	ASSERT3U(zv->zv_open_count, >, 0);

	sectors = get_capacity(zv->zv_zso->zvo_disk);

	if (sectors > 2048) {
		geo->heads = 16;
		geo->sectors = 63;
	} else {
		geo->heads = 2;
		geo->sectors = 4;
	}

	geo->start = 0;
	geo->cylinders = sectors / (geo->heads * geo->sectors);

	return (0);
}

/*
 * Why have two separate block_device_operations structs?
 *
 * Normally we'd just have one, and assign 'submit_bio' as needed. However,
 * it's possible the user's kernel is built with CONSTIFY_PLUGIN, meaning we
 * can't just change submit_bio dynamically at runtime. So just create two
 * separate structs to get around this.
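 * zvol_ops_blk_mq omits submit_bio entirely; with blk-mq the I/O is
 * delivered through zvol_mq_queue_rq() instead.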
 */
static const struct block_device_operations zvol_ops_blk_mq = {
	.open = zvol_open,
	.release = zvol_release,
	.ioctl = zvol_ioctl,
	.compat_ioctl = zvol_compat_ioctl,
	.check_events = zvol_check_events,
#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK
	.revalidate_disk = zvol_revalidate_disk,
#endif
	.getgeo = zvol_getgeo,
	.owner = THIS_MODULE,
};

static const struct block_device_operations zvol_ops = {
	.open = zvol_open,
	.release = zvol_release,
	.ioctl = zvol_ioctl,
	.compat_ioctl = zvol_compat_ioctl,
	.check_events = zvol_check_events,
#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK
	.revalidate_disk = zvol_revalidate_disk,
#endif
	.getgeo = zvol_getgeo,
	.owner = THIS_MODULE,
#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
	.submit_bio = zvol_submit_bio,
#endif
};

static int
zvol_alloc_non_blk_mq(struct zvol_state_os *zso)
{
#if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS)
#if defined(HAVE_BLK_ALLOC_DISK)
	zso->zvo_disk = blk_alloc_disk(NUMA_NO_NODE);
	if (zso->zvo_disk == NULL)
		return (1);

	zso->zvo_disk->minors = ZVOL_MINORS;
	zso->zvo_queue = zso->zvo_disk->queue;
#elif defined(HAVE_BLK_ALLOC_DISK_2ARG)
	struct gendisk *disk = blk_alloc_disk(NULL, NUMA_NO_NODE);
	if (IS_ERR(disk)) {
		zso->zvo_disk = NULL;
		return (1);
	}

	zso->zvo_disk = disk;
	zso->zvo_disk->minors = ZVOL_MINORS;
	zso->zvo_queue = zso->zvo_disk->queue;
#else
	zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE);
	if (zso->zvo_queue == NULL)
		return (1);

	zso->zvo_disk = alloc_disk(ZVOL_MINORS);
	if (zso->zvo_disk == NULL) {
		blk_cleanup_queue(zso->zvo_queue);
		return (1);
	}

	zso->zvo_disk->queue = zso->zvo_queue;
#endif /* HAVE_BLK_ALLOC_DISK */
#else
	zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE);
	if (zso->zvo_queue == NULL)
		return (1);

	zso->zvo_disk = alloc_disk(ZVOL_MINORS);
	if (zso->zvo_disk == NULL) {
		blk_cleanup_queue(zso->zvo_queue);
		return (1);
	}

	zso->zvo_disk->queue = zso->zvo_queue;
#endif /* HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */
	return (0);

}

static int
zvol_alloc_blk_mq(zvol_state_t *zv)
{
#ifdef HAVE_BLK_MQ
	struct zvol_state_os *zso = zv->zv_zso;

	/* Allocate our blk-mq tag_set */
	if (zvol_blk_mq_alloc_tag_set(zv) != 0)
		return (1);

#if defined(HAVE_BLK_ALLOC_DISK)
	zso->zvo_disk = blk_mq_alloc_disk(&zso->tag_set, zv);
	if (zso->zvo_disk == NULL) {
		blk_mq_free_tag_set(&zso->tag_set);
		return (1);
	}
	zso->zvo_queue = zso->zvo_disk->queue;
	zso->zvo_disk->minors = ZVOL_MINORS;
#elif defined(HAVE_BLK_ALLOC_DISK_2ARG)
	struct gendisk *disk = blk_mq_alloc_disk(&zso->tag_set, NULL, zv);
	if (IS_ERR(disk)) {
		zso->zvo_disk = NULL;
		blk_mq_free_tag_set(&zso->tag_set);
		return (1);
	}

	zso->zvo_disk = disk;
	zso->zvo_queue = zso->zvo_disk->queue;
	zso->zvo_disk->minors = ZVOL_MINORS;
#else
	zso->zvo_disk = alloc_disk(ZVOL_MINORS);
	if (zso->zvo_disk == NULL) {
		blk_cleanup_queue(zso->zvo_queue);
		blk_mq_free_tag_set(&zso->tag_set);
		return (1);
	}
	/* Allocate queue */
	zso->zvo_queue = blk_mq_init_queue(&zso->tag_set);
	if (IS_ERR(zso->zvo_queue)) {
		blk_mq_free_tag_set(&zso->tag_set);
		return (1);
	}

	/* Our queue is now created, assign it to our disk */
	zso->zvo_disk->queue = zso->zvo_queue;

#endif
#endif
	return (0);
}

/*
 * Allocate memory for a new zvol_state_t and setup the required
 * request queue and generic disk structures for the block device.
 */
static zvol_state_t *
zvol_alloc(dev_t dev, const char *name)
{
	zvol_state_t *zv;
	struct zvol_state_os *zso;
	uint64_t volmode;
	int ret;

	if (dsl_prop_get_integer(name, "volmode", &volmode, NULL) != 0)
		return (NULL);

	if (volmode == ZFS_VOLMODE_DEFAULT)
		volmode = zvol_volmode;

	if (volmode == ZFS_VOLMODE_NONE)
		return (NULL);

	zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
	zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
	zv->zv_zso = zso;
	zv->zv_volmode = volmode;

	list_link_init(&zv->zv_next);
	mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);

#ifdef HAVE_BLK_MQ
	zv->zv_zso->use_blk_mq = zvol_use_blk_mq;
#endif

	/*
	 * The block layer has 3 interfaces for getting BIOs:
	 *
	 * 1. blk-mq request queues (new)
	 * 2. submit_bio() (oldest)
	 * 3. regular request queues (old).
	 *
	 * Each of those interfaces has two permutations:
	 *
	 * a) We have blk_alloc_disk()/blk_mq_alloc_disk(), which allocates
	 *    both the disk and its queue (5.14 kernel or newer)
	 *
	 * b) We don't have blk_*alloc_disk(), and have to allocate the
	 *    disk and the queue separately. (5.13 kernel or older)
	 */
	if (zv->zv_zso->use_blk_mq) {
		ret = zvol_alloc_blk_mq(zv);
		zso->zvo_disk->fops = &zvol_ops_blk_mq;
	} else {
		ret = zvol_alloc_non_blk_mq(zso);
		zso->zvo_disk->fops = &zvol_ops;
	}
	if (ret != 0)
		goto out_kmem;

	blk_queue_set_write_cache(zso->zvo_queue, B_TRUE, B_TRUE);

	/* Limit read-ahead to a single page to prevent over-prefetching. */
	blk_queue_set_read_ahead(zso->zvo_queue, 1);

	if (!zv->zv_zso->use_blk_mq) {
		/* Disable write merging in favor of the ZIO pipeline. */
		blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue);
	}

	/* Enable /proc/diskstats */
	blk_queue_flag_set(QUEUE_FLAG_IO_STAT, zso->zvo_queue);

	zso->zvo_queue->queuedata = zv;
	zso->zvo_dev = dev;
	zv->zv_open_count = 0;
	strlcpy(zv->zv_name, name, sizeof (zv->zv_name));

	zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);
	rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);

	zso->zvo_disk->major = zvol_major;
	zso->zvo_disk->events = DISK_EVENT_MEDIA_CHANGE;

	/*
	 * Setting ZFS_VOLMODE_DEV disables partitioning on ZVOL devices.
	 * This is accomplished by limiting the number of minors for the
	 * device to one and explicitly disabling partition scanning.
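	 * Both adjustments are made just below (minors = 1 plus the
	 * ZFS_GENHD_FL_* flag updates).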
	 */
	if (volmode == ZFS_VOLMODE_DEV) {
		zso->zvo_disk->minors = 1;
		zso->zvo_disk->flags &= ~ZFS_GENHD_FL_EXT_DEVT;
		zso->zvo_disk->flags |= ZFS_GENHD_FL_NO_PART;
	}

	zso->zvo_disk->first_minor = (dev & MINORMASK);
	zso->zvo_disk->private_data = zv;
	snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d",
	    ZVOL_DEV_NAME, (dev & MINORMASK));

	return (zv);

out_kmem:
	kmem_free(zso, sizeof (struct zvol_state_os));
	kmem_free(zv, sizeof (zvol_state_t));
	return (NULL);
}

/*
 * Cleanup then free a zvol_state_t which was created by zvol_alloc().
 * At this time, the structure is not opened by anyone, is taken off
 * the zvol_state_list, and has its private data set to NULL.
 * The zvol_state_lock is dropped.
 *
 * This function may take many milliseconds to complete (e.g. we've seen
 * it take over 256ms), due to the calls to "blk_cleanup_queue" and
 * "del_gendisk". Thus, consumers need to be careful to account for this
 * latency when calling this function.
 */
void
zvol_os_free(zvol_state_t *zv)
{

	ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
	ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
	ASSERT0(zv->zv_open_count);
	ASSERT3P(zv->zv_zso->zvo_disk->private_data, ==, NULL);

	rw_destroy(&zv->zv_suspend_lock);
	zfs_rangelock_fini(&zv->zv_rangelock);

	del_gendisk(zv->zv_zso->zvo_disk);
#if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \
	(defined(HAVE_BLK_ALLOC_DISK) || defined(HAVE_BLK_ALLOC_DISK_2ARG))
#if defined(HAVE_BLK_CLEANUP_DISK)
	blk_cleanup_disk(zv->zv_zso->zvo_disk);
#else
	put_disk(zv->zv_zso->zvo_disk);
#endif
#else
	blk_cleanup_queue(zv->zv_zso->zvo_queue);
	put_disk(zv->zv_zso->zvo_disk);
#endif

#ifdef HAVE_BLK_MQ
	if (zv->zv_zso->use_blk_mq)
		blk_mq_free_tag_set(&zv->zv_zso->tag_set);
#endif

	ida_simple_remove(&zvol_ida,
	    MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS);

	mutex_destroy(&zv->zv_state_lock);
	dataset_kstats_destroy(&zv->zv_kstat);

	kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
	kmem_free(zv, sizeof (zvol_state_t));
}

void
zvol_wait_close(zvol_state_t *zv)
{
}

/*
 * Create a block device minor node and setup the linkage between it
 * and the specified volume. Once this function returns the block
 * device is live and ready for use.
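 * A minor number is reserved from zvol_ida up front and released again on
 * every error path.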
 */
int
zvol_os_create_minor(const char *name)
{
	zvol_state_t *zv;
	objset_t *os;
	dmu_object_info_t *doi;
	uint64_t volsize;
	uint64_t len;
	unsigned minor = 0;
	int error = 0;
	int idx;
	uint64_t hash = zvol_name_hash(name);
	uint64_t volthreading;
	bool replayed_zil = B_FALSE;

	if (zvol_inhibit_dev)
		return (0);

	idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP));
	if (idx < 0)
		return (SET_ERROR(-idx));
	minor = idx << ZVOL_MINOR_BITS;
	if (MINOR(minor) != minor) {
		/* too many partitions can cause an overflow */
		zfs_dbgmsg("zvol: create minor overflow: %s, minor %u/%u",
		    name, minor, MINOR(minor));
		ida_simple_remove(&zvol_ida, idx);
		return (SET_ERROR(EINVAL));
	}

	zv = zvol_find_by_name_hash(name, hash, RW_NONE);
	if (zv) {
		ASSERT(MUTEX_HELD(&zv->zv_state_lock));
		mutex_exit(&zv->zv_state_lock);
		ida_simple_remove(&zvol_ida, idx);
		return (SET_ERROR(EEXIST));
	}

	doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);

	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
	if (error)
		goto out_doi;

	error = dmu_object_info(os, ZVOL_OBJ, doi);
	if (error)
		goto out_dmu_objset_disown;

	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
	if (error)
		goto out_dmu_objset_disown;

	zv = zvol_alloc(MKDEV(zvol_major, minor), name);
	if (zv == NULL) {
		error = SET_ERROR(EAGAIN);
		goto out_dmu_objset_disown;
	}
	zv->zv_hash = hash;

	if (dmu_objset_is_snapshot(os))
		zv->zv_flags |= ZVOL_RDONLY;

	zv->zv_volblocksize = doi->doi_data_block_size;
	zv->zv_volsize = volsize;
	zv->zv_objset = os;

	/* Default */
	zv->zv_threading = B_TRUE;
	if (dsl_prop_get_integer(name, "volthreading", &volthreading, NULL)
	    == 0)
		zv->zv_threading = volthreading;

	set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9);

	blk_queue_max_hw_sectors(zv->zv_zso->zvo_queue,
	    (DMU_MAX_ACCESS / 4) >> 9);

	if (zv->zv_zso->use_blk_mq) {
		/*
		 * IO requests can be really big (1MB). When an IO request
		 * comes in, it is passed off to zvol_read() or zvol_write()
		 * in a new thread, where it is chunked up into 'volblocksize'
		 * sized pieces and processed. So for example, if the request
		 * is a 1MB write and your volblocksize is 128k, one zvol_write
		 * thread will take that request and sequentially do eight 128k
		 * IOs. This is due to the fact that the thread needs to lock
		 * each volblocksize sized block. So you might be wondering:
		 * "instead of passing the whole 1MB request to one thread,
		 * why not pass eight individual 128k chunks to eight threads
		 * and process the whole write in parallel?" The short answer
		 * is that there's a sweet spot number of chunks that balances
		 * the greater parallelism with the added overhead of more
		 * threads. The sweet spot can be different depending on if you
		 * have a read or write heavy workload. Writes typically want
		 * high chunk counts while reads typically want lower ones. On
		 * a test pool with 6 NVMe drives in a 3x 2-disk mirror
		 * configuration, with volblocksize=8k, the sweet spot for good
		 * sequential reads and writes was at 8 chunks.
		 */

		/*
		 * Below we tell the kernel how big we want our requests
		 * to be. You would think that blk_queue_io_opt() would be
		 * used to do this since it is used to "set optimal request
		 * size for the queue", but that doesn't seem to do
		 * anything - the kernel still gives you huge requests
		 * with tons of little PAGE_SIZE segments contained within it.
		 *
		 * Knowing that the kernel will just give you PAGE_SIZE segments
		 * no matter what, you can say "ok, I want PAGE_SIZE byte
		 * segments, and I want 'N' of them per request", where N is
		 * the correct number of segments for the volblocksize and
		 * number of chunks you want.
		 */
#ifdef HAVE_BLK_MQ
		if (zvol_blk_mq_blocks_per_thread != 0) {
			unsigned int chunks;
			chunks = MIN(zvol_blk_mq_blocks_per_thread, UINT16_MAX);

			blk_queue_max_segment_size(zv->zv_zso->zvo_queue,
			    PAGE_SIZE);
			blk_queue_max_segments(zv->zv_zso->zvo_queue,
			    (zv->zv_volblocksize * chunks) / PAGE_SIZE);
		} else {
			/*
			 * Special case: zvol_blk_mq_blocks_per_thread = 0
			 * Max everything out.
			 */
			blk_queue_max_segments(zv->zv_zso->zvo_queue,
			    UINT16_MAX);
			blk_queue_max_segment_size(zv->zv_zso->zvo_queue,
			    UINT_MAX);
		}
#endif
	} else {
		blk_queue_max_segments(zv->zv_zso->zvo_queue, UINT16_MAX);
		blk_queue_max_segment_size(zv->zv_zso->zvo_queue, UINT_MAX);
	}

	blk_queue_physical_block_size(zv->zv_zso->zvo_queue,
	    zv->zv_volblocksize);
	blk_queue_io_opt(zv->zv_zso->zvo_queue, zv->zv_volblocksize);
	blk_queue_max_discard_sectors(zv->zv_zso->zvo_queue,
	    (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9);
	blk_queue_discard_granularity(zv->zv_zso->zvo_queue,
	    zv->zv_volblocksize);
#ifdef QUEUE_FLAG_DISCARD
	blk_queue_flag_set(QUEUE_FLAG_DISCARD, zv->zv_zso->zvo_queue);
#endif
#ifdef QUEUE_FLAG_NONROT
	blk_queue_flag_set(QUEUE_FLAG_NONROT, zv->zv_zso->zvo_queue);
#endif
#ifdef QUEUE_FLAG_ADD_RANDOM
	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zv->zv_zso->zvo_queue);
#endif
	/* This flag was introduced in kernel version 4.12. */
#ifdef QUEUE_FLAG_SCSI_PASSTHROUGH
	blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_zso->zvo_queue);
#endif

	ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
	error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);
	if (error)
		goto out_dmu_objset_disown;
	ASSERT3P(zv->zv_zilog, ==, NULL);
	zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums);
	if (spa_writeable(dmu_objset_spa(os))) {
		if (zil_replay_disable)
			replayed_zil = zil_destroy(zv->zv_zilog, B_FALSE);
		else
			replayed_zil = zil_replay(os, zv, zvol_replay_vector);
	}
	if (replayed_zil)
		zil_close(zv->zv_zilog);
	zv->zv_zilog = NULL;

	/*
	 * When udev detects the addition of the device it will immediately
	 * invoke blkid(8) to determine the type of content on the device.
	 * Prefetching the blocks commonly scanned by blkid(8) will speed
	 * up this process.
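	 * Only the first and last zvol_prefetch_bytes of the volume are
	 * prefetched below.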
	 */
	len = MIN(zvol_prefetch_bytes, SPA_MAXBLOCKSIZE);
	if (len > 0) {
		dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_SYNC_READ);
		dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len,
		    ZIO_PRIORITY_SYNC_READ);
	}

	zv->zv_objset = NULL;
out_dmu_objset_disown:
	dmu_objset_disown(os, B_TRUE, FTAG);
out_doi:
	kmem_free(doi, sizeof (dmu_object_info_t));

	/*
	 * Keep in mind that once add_disk() is called, the zvol is
	 * announced to the world, and zvol_open()/zvol_release() can
	 * be called at any time. Incidentally, add_disk() itself calls
	 * zvol_open()->zvol_first_open() and zvol_release()->zvol_last_close()
	 * directly as well.
	 */
	if (error == 0) {
		rw_enter(&zvol_state_lock, RW_WRITER);
		zvol_insert(zv);
		rw_exit(&zvol_state_lock);
#ifdef HAVE_ADD_DISK_RET
		error = add_disk(zv->zv_zso->zvo_disk);
#else
		add_disk(zv->zv_zso->zvo_disk);
#endif
	} else {
		ida_simple_remove(&zvol_ida, idx);
	}

	return (error);
}

void
zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
{
	int readonly = get_disk_ro(zv->zv_zso->zvo_disk);

	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));

	/* move to new hashtable entry */
	zv->zv_hash = zvol_name_hash(newname);
	hlist_del(&zv->zv_hlink);
	hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));

	/*
	 * The block device's read-only state is briefly changed causing
	 * a KOBJ_CHANGE uevent to be issued. This ensures udev detects
	 * the name change and fixes the symlinks. This does not change
	 * ZVOL_RDONLY in zv->zv_flags so the actual read-only state never
	 * changes. This would normally be done using kobject_uevent() but
	 * that is a GPL-only symbol which is why we need this workaround.
	 */
	set_disk_ro(zv->zv_zso->zvo_disk, !readonly);
	set_disk_ro(zv->zv_zso->zvo_disk, readonly);

	dataset_kstats_rename(&zv->zv_kstat, newname);
}

void
zvol_os_set_disk_ro(zvol_state_t *zv, int flags)
{

	set_disk_ro(zv->zv_zso->zvo_disk, flags);
}

void
zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity)
{

	set_capacity(zv->zv_zso->zvo_disk, capacity);
}

int
zvol_init(void)
{
	int error;

	/*
	 * zvol_threads is the module param the user passes in.
	 *
	 * zvol_actual_threads is what we use internally, since the user can
	 * pass zvol_threads = 0 to mean "use all the CPUs" (the default).
	 */
	static unsigned int zvol_actual_threads;

	if (zvol_threads == 0) {
		/*
		 * See dde9380a1 for why 32 was chosen here. This should
		 * probably be refined to be some multiple of the number
		 * of CPUs.
		 */
		zvol_actual_threads = MAX(num_online_cpus(), 32);
	} else {
		zvol_actual_threads = MIN(MAX(zvol_threads, 1), 1024);
	}

	/*
	 * Use at least 32 zvol_threads, but on many-core systems prefer
	 * 6 threads per taskq, with no more taskqs than threads in them
	 * on large systems.
	 *
	 *                 taskq   total
	 * cpus    taskqs  threads threads
	 * ------- ------- ------- -------
	 * 1       1       32      32
	 * 2       1       32      32
	 * 4       1       32      32
	 * 8       2       16      32
	 * 16      3       11      33
	 * 32      5       7       35
	 * 64      8       8       64
	 * 128     11      12      132
	 * 256     16      16      256
	 */
	zv_taskq_t *ztqs = &zvol_taskqs;
	uint_t num_tqs = MIN(num_online_cpus(), zvol_num_taskqs);
	if (num_tqs == 0) {
		num_tqs = 1 + num_online_cpus() / 6;
		while (num_tqs * num_tqs > zvol_actual_threads)
			num_tqs--;
	}
	uint_t per_tq_thread = zvol_actual_threads / num_tqs;
	if (per_tq_thread * num_tqs < zvol_actual_threads)
		per_tq_thread++;
	ztqs->tqs_cnt = num_tqs;
	ztqs->tqs_taskq = kmem_alloc(num_tqs * sizeof (taskq_t *), KM_SLEEP);
	error = register_blkdev(zvol_major, ZVOL_DRIVER);
	if (error) {
		kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * sizeof (taskq_t *));
		ztqs->tqs_taskq = NULL;
		printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error);
		return (error);
	}

#ifdef HAVE_BLK_MQ
	if (zvol_blk_mq_queue_depth == 0) {
		zvol_actual_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ;
	} else {
		zvol_actual_blk_mq_queue_depth =
		    MAX(zvol_blk_mq_queue_depth, BLKDEV_MIN_RQ);
	}

	if (zvol_blk_mq_threads == 0) {
		zvol_blk_mq_actual_threads = num_online_cpus();
	} else {
		zvol_blk_mq_actual_threads = MIN(MAX(zvol_blk_mq_threads, 1),
		    1024);
	}
#endif
	for (uint_t i = 0; i < num_tqs; i++) {
		char name[32];
		(void) snprintf(name, sizeof (name), "%s_tq-%u",
		    ZVOL_DRIVER, i);
		ztqs->tqs_taskq[i] = taskq_create(name, per_tq_thread,
		    maxclsyspri, per_tq_thread, INT_MAX,
		    TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
		if (ztqs->tqs_taskq[i] == NULL) {
			for (int j = i - 1; j >= 0; j--)
				taskq_destroy(ztqs->tqs_taskq[j]);
			unregister_blkdev(zvol_major, ZVOL_DRIVER);
			kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt *
			    sizeof (taskq_t *));
			ztqs->tqs_taskq = NULL;
			return (-ENOMEM);
		}
	}

	zvol_init_impl();
	ida_init(&zvol_ida);
	return (0);
}

void
zvol_fini(void)
{
	zv_taskq_t *ztqs = &zvol_taskqs;
	zvol_fini_impl();
	unregister_blkdev(zvol_major, ZVOL_DRIVER);

	if (ztqs->tqs_taskq == NULL) {
		ASSERT3U(ztqs->tqs_cnt, ==, 0);
	} else {
		for (uint_t i = 0; i < ztqs->tqs_cnt; i++) {
			ASSERT3P(ztqs->tqs_taskq[i], !=, NULL);
			taskq_destroy(ztqs->tqs_taskq[i]);
		}
		kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt *
		    sizeof (taskq_t *));
		ztqs->tqs_taskq = NULL;
	}

	ida_destroy(&zvol_ida);
}

/* BEGIN CSTYLED */
module_param(zvol_inhibit_dev, uint, 0644);
MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes");

module_param(zvol_major, uint, 0444);
MODULE_PARM_DESC(zvol_major, "Major number for zvol device");

module_param(zvol_threads, uint, 0444);
MODULE_PARM_DESC(zvol_threads, "Number of threads to handle I/O requests. Set "
	"to 0 to use all active CPUs");

module_param(zvol_request_sync, uint, 0644);
MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests");

module_param(zvol_max_discard_blocks, ulong, 0444);
MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard");

module_param(zvol_num_taskqs, uint, 0444);
MODULE_PARM_DESC(zvol_num_taskqs, "Number of zvol taskqs");

module_param(zvol_prefetch_bytes, uint, 0644);
MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end");

module_param(zvol_volmode, uint, 0644);
MODULE_PARM_DESC(zvol_volmode, "Default volmode property value");

#ifdef HAVE_BLK_MQ
module_param(zvol_blk_mq_queue_depth, uint, 0644);
MODULE_PARM_DESC(zvol_blk_mq_queue_depth, "Default blk-mq queue depth");

module_param(zvol_use_blk_mq, uint, 0644);
MODULE_PARM_DESC(zvol_use_blk_mq, "Use the blk-mq API for zvols");

module_param(zvol_blk_mq_blocks_per_thread, uint, 0644);
MODULE_PARM_DESC(zvol_blk_mq_blocks_per_thread,
	"Process volblocksize blocks per thread");
#endif

#ifndef HAVE_BLKDEV_GET_ERESTARTSYS
module_param(zvol_open_timeout_ms, uint, 0644);
MODULE_PARM_DESC(zvol_open_timeout_ms, "Timeout for ZVOL open retries");
#endif

/* END CSTYLED */
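
/*
 * Hypothetical usage example: assuming the zvol code is built into the zfs
 * module (as in a typical OpenZFS build), the parameters above can be set at
 * load time, e.g.:
 *
 *   modprobe zfs zvol_use_blk_mq=1 zvol_blk_mq_queue_depth=128 zvol_threads=0
 */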