/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
 * Copyright (c) 2024, Rob Norris <robn@despairlabs.com>
 * Copyright (c) 2024, Klara, Inc.
 */

#include <sys/dataset_kstats.h>
#include <sys/dbuf.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dir.h>
#include <sys/zap.h>
#include <sys/zfeature.h>
#include <sys/zil_impl.h>
#include <sys/dmu_tx.h>
#include <sys/zio.h>
#include <sys/zfs_rlock.h>
#include <sys/spa_impl.h>
#include <sys/zvol.h>
#include <sys/zvol_impl.h>
#include <cityhash.h>

#include <linux/blkdev_compat.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/workqueue.h>
#include <linux/blk-mq.h>

static void zvol_request_impl(zvol_state_t *zv, struct bio *bio,
    struct request *rq, boolean_t force_sync);

static unsigned int zvol_major = ZVOL_MAJOR;
static unsigned int zvol_request_sync = 0;
static unsigned int zvol_prefetch_bytes = (128 * 1024);
static unsigned long zvol_max_discard_blocks = 16384;

/*
 * Switch taskqs at multiples of 512 MB offset. This can be set to a lower
 * value to utilize more threads for small files, but may affect prefetch
 * hits.
 */
#define	ZVOL_TASKQ_OFFSET_SHIFT	29
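
/*
 * For illustration only (example offsets, not driver logic):
 * zvol_request_impl() below mixes (offset >> ZVOL_TASKQ_OFFSET_SHIFT) into
 * its taskq-selection hash, so I/Os within the same 512 MB window map to
 * the same taskq:
 *
 *	offset 0x00000000 (0 MB)	>> 29 == 0
 *	offset 0x1fffffff (<512 MB)	>> 29 == 0
 *	offset 0x20000000 (512 MB)	>> 29 == 1
 *	offset 0x30000000 (768 MB)	>> 29 == 1
 */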

#ifndef HAVE_BLKDEV_GET_ERESTARTSYS
static unsigned int zvol_open_timeout_ms = 1000;
#endif

static unsigned int zvol_threads = 0;
static unsigned int zvol_blk_mq_threads = 0;
static unsigned int zvol_blk_mq_actual_threads;
static boolean_t zvol_use_blk_mq = B_FALSE;

/*
 * The maximum number of volblocksize blocks to process per thread. Typically,
 * write heavy workloads perform better with higher values here, and read
 * heavy workloads perform better with lower values, but that's not a hard
 * and fast rule. It's basically a knob to tune between "less overhead with
 * less parallelism" and "more overhead, but more parallelism".
 *
 * '8' was chosen as a reasonable, balanced, default based off of sequential
 * read and write tests to a zvol in an NVMe pool (with 16 CPUs).
 */
static unsigned int zvol_blk_mq_blocks_per_thread = 8;
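
/*
 * For illustration (example numbers only): with the default of 8 and a
 * 128k volblocksize, the queue limits set up in zvol_queue_limits_init()
 * below size each blk-mq request at up to 8 * 128k = 1MB, which is then
 * processed by a single thread.
 */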

static unsigned int zvol_num_taskqs = 0;

#ifndef BLKDEV_DEFAULT_RQ
/* BLKDEV_MAX_RQ was renamed to BLKDEV_DEFAULT_RQ in the 5.16 kernel */
#define	BLKDEV_DEFAULT_RQ	BLKDEV_MAX_RQ
#endif

/*
 * Finalize our BIO or request.
 */
static inline void
zvol_end_io(struct bio *bio, struct request *rq, int error)
{
	if (bio) {
		bio->bi_status = errno_to_bi_status(-error);
		bio_endio(bio);
	} else {
		blk_mq_end_request(rq, errno_to_bi_status(error));
	}
}

static unsigned int zvol_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ;
static unsigned int zvol_actual_blk_mq_queue_depth;

struct zvol_state_os {
	struct gendisk *zvo_disk;		/* generic disk */
	struct request_queue *zvo_queue;	/* request queue */
	dev_t zvo_dev;				/* device id */

	struct blk_mq_tag_set tag_set;

	/* Set from the global 'zvol_use_blk_mq' at zvol load */
	boolean_t use_blk_mq;
};

typedef struct zv_taskq {
	uint_t tqs_cnt;
	taskq_t **tqs_taskq;
} zv_taskq_t;
static zv_taskq_t zvol_taskqs;
static struct ida zvol_ida;

typedef struct zv_request_stack {
	zvol_state_t *zv;
	struct bio *bio;
	struct request *rq;
} zv_request_t;

typedef struct zv_work {
	struct request *rq;
	struct work_struct work;
} zv_work_t;

typedef struct zv_request_task {
	zv_request_t zvr;
	taskq_ent_t ent;
} zv_request_task_t;

static zv_request_task_t *
zv_request_task_create(zv_request_t zvr)
{
	zv_request_task_t *task;
	task = kmem_alloc(sizeof (zv_request_task_t), KM_SLEEP);
	taskq_init_ent(&task->ent);
	task->zvr = zvr;
	return (task);
}

static void
zv_request_task_free(zv_request_task_t *task)
{
	kmem_free(task, sizeof (*task));
}
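
/*
 * For reference, a sketch of how these helpers are used later in this
 * file (existing flow, not new behavior):
 *
 *	zv_request_task_t *task = zv_request_task_create(zvr);
 *	taskq_dispatch_ent(tq, zvol_write_task, task, 0, &task->ent);
 *
 * The dispatched callback (here zvol_write_task()) performs the I/O and
 * then calls zv_request_task_free() on its own task.
 */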

/*
 * This is called when a new block multiqueue request comes in. A request
 * contains one or more BIOs.
 */
static blk_status_t zvol_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
    const struct blk_mq_queue_data *bd)
{
	struct request *rq = bd->rq;
	zvol_state_t *zv = rq->q->queuedata;

	/* Tell the kernel that we are starting to process this request */
	blk_mq_start_request(rq);

	if (blk_rq_is_passthrough(rq)) {
		/* Skip non-filesystem requests */
		blk_mq_end_request(rq, BLK_STS_IOERR);
		return (BLK_STS_IOERR);
	}

	zvol_request_impl(zv, NULL, rq, 0);

	/* Acknowledge to the kernel that we got this request */
	return (BLK_STS_OK);
}

static struct blk_mq_ops zvol_blk_mq_queue_ops = {
	.queue_rq = zvol_mq_queue_rq,
};

/* Initialize our blk-mq struct */
static int zvol_blk_mq_alloc_tag_set(zvol_state_t *zv)
{
	struct zvol_state_os *zso = zv->zv_zso;

	memset(&zso->tag_set, 0, sizeof (zso->tag_set));

	/* Initialize tag set. */
	zso->tag_set.ops = &zvol_blk_mq_queue_ops;
	zso->tag_set.nr_hw_queues = zvol_blk_mq_actual_threads;
	zso->tag_set.queue_depth = zvol_actual_blk_mq_queue_depth;
	zso->tag_set.numa_node = NUMA_NO_NODE;
	zso->tag_set.cmd_size = 0;

	/*
	 * We need BLK_MQ_F_BLOCKING here since we do blocking calls in
	 * zvol_request_impl()
	 */
	zso->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
	zso->tag_set.driver_data = zv;

	return (blk_mq_alloc_tag_set(&zso->tag_set));
}
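
/*
 * A summary of the resulting blk-mq dispatch path (existing flow): the
 * kernel invokes zvol_mq_queue_rq() once per request, which hands it to
 * zvol_request_impl(zv, NULL, rq, 0). Because BLK_MQ_F_BLOCKING is set
 * above, that call is allowed to sleep.
 */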

/*
 * Given a path, return TRUE if path is a ZVOL.
 */
boolean_t
zvol_os_is_zvol(const char *path)
{
	dev_t dev = 0;

	if (vdev_lookup_bdev(path, &dev) != 0)
		return (B_FALSE);

	if (MAJOR(dev) == zvol_major)
		return (B_TRUE);

	return (B_FALSE);
}
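
/*
 * For illustration ("/dev/zd0" is just an example node name):
 * zvol_os_is_zvol("/dev/zd0") resolves the path to a dev_t and returns
 * B_TRUE only if its major number matches zvol_major.
 */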

static void
zvol_write(zv_request_t *zvr)
{
	struct bio *bio = zvr->bio;
	struct request *rq = zvr->rq;
	int error = 0;
	zfs_uio_t uio;
	zvol_state_t *zv = zvr->zv;
	struct request_queue *q;
	struct gendisk *disk;
	unsigned long start_time = 0;
	boolean_t acct = B_FALSE;

	ASSERT3P(zv, !=, NULL);
	ASSERT3U(zv->zv_open_count, >, 0);
	ASSERT3P(zv->zv_zilog, !=, NULL);

	q = zv->zv_zso->zvo_queue;
	disk = zv->zv_zso->zvo_disk;

	/* A bio marked as FLUSH needs to flush before the write */
	if (io_is_flush(bio, rq))
		zil_commit(zv->zv_zilog, ZVOL_OBJ);

	/* Some requests are just for flush and nothing else. */
	if (io_size(bio, rq) == 0) {
		rw_exit(&zv->zv_suspend_lock);
		zvol_end_io(bio, rq, 0);
		return;
	}

	zfs_uio_bvec_init(&uio, bio, rq);

	ssize_t start_resid = uio.uio_resid;

	/*
	 * With use_blk_mq, accounting is done by blk_mq_start_request()
	 * and blk_mq_end_request(), so we can skip it here.
	 */
	if (bio) {
		acct = blk_queue_io_stat(q);
		if (acct) {
			start_time = blk_generic_start_io_acct(q, disk, WRITE,
			    bio);
		}
	}

	boolean_t sync =
	    io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;

	zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
	    uio.uio_loffset, uio.uio_resid, RL_WRITER);

	uint64_t volsize = zv->zv_volsize;
	while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
		uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
		uint64_t off = uio.uio_loffset;
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);

		if (bytes > volsize - off)	/* don't write past the end */
			bytes = volsize - off;

		dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);

		/* This will only fail for ENOSPC */
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			break;
		}
		error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
		if (error == 0) {
			zvol_log_write(zv, tx, off, bytes, sync);
		}
		dmu_tx_commit(tx);

		if (error)
			break;
	}
	zfs_rangelock_exit(lr);

	int64_t nwritten = start_resid - uio.uio_resid;
	dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
	task_io_account_write(nwritten);

	if (sync)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);

	rw_exit(&zv->zv_suspend_lock);

	if (bio && acct) {
		blk_generic_end_io_acct(q, disk, WRITE, bio, start_time);
	}

	zvol_end_io(bio, rq, -error);
}

static void
zvol_write_task(void *arg)
{
	zv_request_task_t *task = arg;
	zvol_write(&task->zvr);
	zv_request_task_free(task);
}
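
/*
 * A worked example of the zvol_write() loop above, assuming
 * DMU_MAX_ACCESS is 64MB (its usual value): each pass caps 'bytes' at
 * 32MB (DMU_MAX_ACCESS >> 1), clamps it to the space remaining before
 * zv_volsize, and commits one transaction per pass until uio.uio_resid
 * reaches zero or an error occurs.
 */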

static void
zvol_discard(zv_request_t *zvr)
{
	struct bio *bio = zvr->bio;
	struct request *rq = zvr->rq;
	zvol_state_t *zv = zvr->zv;
	uint64_t start = io_offset(bio, rq);
	uint64_t size = io_size(bio, rq);
	uint64_t end = start + size;
	boolean_t sync;
	int error = 0;
	dmu_tx_t *tx;
	struct request_queue *q = zv->zv_zso->zvo_queue;
	struct gendisk *disk = zv->zv_zso->zvo_disk;
	unsigned long start_time = 0;
	boolean_t acct = B_FALSE;

	ASSERT3P(zv, !=, NULL);
	ASSERT3U(zv->zv_open_count, >, 0);
	ASSERT3P(zv->zv_zilog, !=, NULL);

	if (bio) {
		acct = blk_queue_io_stat(q);
		if (acct) {
			start_time = blk_generic_start_io_acct(q, disk, WRITE,
			    bio);
		}
	}

	sync = io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;

	if (end > zv->zv_volsize) {
		error = SET_ERROR(EIO);
		goto unlock;
	}

	/*
	 * Align the request to volume block boundaries when a secure erase is
	 * not required. This will prevent dnode_free_range() from zeroing out
	 * the unaligned parts which is slow (read-modify-write) and useless
	 * since we are not freeing any space by doing so.
	 */
	if (!io_is_secure_erase(bio, rq)) {
		start = P2ROUNDUP(start, zv->zv_volblocksize);
		end = P2ALIGN_TYPED(end, zv->zv_volblocksize, uint64_t);
		size = end - start;
	}

	if (start >= end)
		goto unlock;

	zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
	    start, size, RL_WRITER);

	tx = dmu_tx_create(zv->zv_objset);
	dmu_tx_mark_netfree(tx);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error != 0) {
		dmu_tx_abort(tx);
	} else {
		zvol_log_truncate(zv, tx, start, size);
		dmu_tx_commit(tx);
		error = dmu_free_long_range(zv->zv_objset,
		    ZVOL_OBJ, start, size);
	}
	zfs_rangelock_exit(lr);

	if (error == 0 && sync)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);

unlock:
	rw_exit(&zv->zv_suspend_lock);

	if (bio && acct) {
		blk_generic_end_io_acct(q, disk, WRITE, bio,
		    start_time);
	}

	zvol_end_io(bio, rq, -error);
}

static void
zvol_discard_task(void *arg)
{
	zv_request_task_t *task = arg;
	zvol_discard(&task->zvr);
	zv_request_task_free(task);
}
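
/*
 * A worked example of the alignment in zvol_discard() above (illustrative
 * numbers, 16k volblocksize): a discard of [20k, 60k) becomes
 * start = P2ROUNDUP(20k, 16k) = 32k and end = P2ALIGN(60k, 16k) = 48k,
 * so only the fully covered block [32k, 48k) is freed and the ragged
 * edges are left untouched.
 */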

static void
zvol_read(zv_request_t *zvr)
{
	struct bio *bio = zvr->bio;
	struct request *rq = zvr->rq;
	int error = 0;
	zfs_uio_t uio;
	boolean_t acct = B_FALSE;
	zvol_state_t *zv = zvr->zv;
	struct request_queue *q;
	struct gendisk *disk;
	unsigned long start_time = 0;

	ASSERT3P(zv, !=, NULL);
	ASSERT3U(zv->zv_open_count, >, 0);

	zfs_uio_bvec_init(&uio, bio, rq);

	q = zv->zv_zso->zvo_queue;
	disk = zv->zv_zso->zvo_disk;

	ssize_t start_resid = uio.uio_resid;

	/*
	 * When blk-mq is being used, accounting is done by
	 * blk_mq_start_request() and blk_mq_end_request().
	 */
	if (bio) {
		acct = blk_queue_io_stat(q);
		if (acct)
			start_time = blk_generic_start_io_acct(q, disk, READ,
			    bio);
	}

	zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
	    uio.uio_loffset, uio.uio_resid, RL_READER);

	uint64_t volsize = zv->zv_volsize;

	while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
		uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);

		/* don't read past the end */
		if (bytes > volsize - uio.uio_loffset)
			bytes = volsize - uio.uio_loffset;

		error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
	}
	zfs_rangelock_exit(lr);

	int64_t nread = start_resid - uio.uio_resid;
	dataset_kstats_update_read_kstats(&zv->zv_kstat, nread);
	task_io_account_read(nread);

	rw_exit(&zv->zv_suspend_lock);

	if (bio && acct) {
		blk_generic_end_io_acct(q, disk, READ, bio, start_time);
	}

	zvol_end_io(bio, rq, -error);
}

static void
zvol_read_task(void *arg)
{
	zv_request_task_t *task = arg;
	zvol_read(&task->zvr);
	zv_request_task_free(task);
}
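
/*
 * A worked example of the clamp in zvol_read() above (illustrative): on a
 * 1 GB zvol, an 8k read at offset (1 GB - 4k) is trimmed to the 4k that
 * remains before volsize; a read starting at or past volsize simply ends
 * the loop with no data transferred.
 */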

/*
 * Process a BIO or request.
 *
 * Either 'bio' or 'rq' should be set depending on whether we are processing
 * a bio or a request (both should not be set).
 *
 * force_sync:	Set to 0 to defer processing to a background taskq
 *		Set to 1 to process data synchronously
 */
static void
zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
    boolean_t force_sync)
{
	fstrans_cookie_t cookie = spl_fstrans_mark();
	uint64_t offset = io_offset(bio, rq);
	uint64_t size = io_size(bio, rq);
	int rw = io_data_dir(bio, rq);

	if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
		zvol_end_io(bio, rq, -SET_ERROR(ENXIO));
		goto out;
	}

	if (zvol_request_sync || zv->zv_threading == B_FALSE)
		force_sync = 1;

	zv_request_t zvr = {
		.zv = zv,
		.bio = bio,
		.rq = rq,
	};

	if (io_has_data(bio, rq) && offset + size > zv->zv_volsize) {
		printk(KERN_INFO "%s: bad access: offset=%llu, size=%lu\n",
		    zv->zv_zso->zvo_disk->disk_name,
		    (long long unsigned)offset,
		    (long unsigned)size);

		zvol_end_io(bio, rq, -SET_ERROR(EIO));
		goto out;
	}

	zv_request_task_t *task;
	zv_taskq_t *ztqs = &zvol_taskqs;
	uint_t blk_mq_hw_queue = 0;
	uint_t tq_idx;
	uint_t taskq_hash;
	if (rq)
#ifdef HAVE_BLK_MQ_RQ_HCTX
		blk_mq_hw_queue = rq->mq_hctx->queue_num;
#else
		blk_mq_hw_queue =
		    rq->q->queue_hw_ctx[rq->q->mq_map[rq->cpu]]->queue_num;
#endif
	taskq_hash = cityhash3((uintptr_t)zv, offset >> ZVOL_TASKQ_OFFSET_SHIFT,
	    blk_mq_hw_queue);
	tq_idx = taskq_hash % ztqs->tqs_cnt;

	if (rw == WRITE) {
		if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
			zvol_end_io(bio, rq, -SET_ERROR(EROFS));
			goto out;
		}

		/*
		 * Prevents the zvol from being suspended, or the ZIL being
		 * concurrently opened. Will be released after the i/o
		 * completes.
		 */
		rw_enter(&zv->zv_suspend_lock, RW_READER);

		/*
		 * Open a ZIL if this is the first time we have written to this
		 * zvol. We protect zv->zv_zilog with zv_suspend_lock rather
		 * than zv_state_lock so that we don't need to acquire an
		 * additional lock in this path.
		 */
		if (zv->zv_zilog == NULL) {
			rw_exit(&zv->zv_suspend_lock);
			rw_enter(&zv->zv_suspend_lock, RW_WRITER);
			if (zv->zv_zilog == NULL) {
				zv->zv_zilog = zil_open(zv->zv_objset,
				    zvol_get_data, &zv->zv_kstat.dk_zil_sums);
				zv->zv_flags |= ZVOL_WRITTEN_TO;
				/* replay / destroy done in zvol_create_minor */
				VERIFY0((zv->zv_zilog->zl_header->zh_flags &
				    ZIL_REPLAY_NEEDED));
			}
			rw_downgrade(&zv->zv_suspend_lock);
		}

		/*
		 * We don't want this thread to be blocked waiting for i/o to
		 * complete, so we instead wait from a taskq callback. The
		 * i/o may be a ZIL write (via zil_commit()), or a read of an
		 * indirect block, or a read of a data block (if this is a
		 * partial-block write). We will indicate that the i/o is
		 * complete by calling zvol_end_io() from the taskq callback.
		 *
		 * This design allows the calling thread to continue and
		 * initiate more concurrent operations by calling
		 * zvol_request() again. There are typically only a small
		 * number of threads available to call zvol_request() (e.g.
		 * one per iSCSI target), so keeping the latency of
		 * zvol_request() low is important for performance.
		 *
		 * The zvol_request_sync module parameter allows this
		 * behavior to be altered, for performance evaluation
		 * purposes. If the callback blocks, setting
		 * zvol_request_sync=1 will result in much worse performance.
		 *
		 * We can have up to zvol_threads concurrent i/o's being
		 * processed for all zvols on the system. This is typically
		 * a vast improvement over the zvol_request_sync=1 behavior
		 * of one i/o at a time per zvol. However, an even better
		 * design would be for zvol_request() to initiate the zio
		 * directly, and then be notified by the zio_done callback,
		 * which would call zvol_end_io(). Unfortunately, the DMU/ZIL
		 * interfaces lack this functionality (they block waiting for
		 * the i/o to complete).
		 */
		if (io_is_discard(bio, rq) || io_is_secure_erase(bio, rq)) {
			if (force_sync) {
				zvol_discard(&zvr);
			} else {
				task = zv_request_task_create(zvr);
				taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],
				    zvol_discard_task, task, 0, &task->ent);
			}
		} else {
			if (force_sync) {
				zvol_write(&zvr);
			} else {
				task = zv_request_task_create(zvr);
				taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],
				    zvol_write_task, task, 0, &task->ent);
			}
		}
	} else {
		/*
		 * The SCST driver, and possibly others, may issue READ I/Os
		 * with a length of zero bytes. These empty I/Os contain no
		 * data and require no additional handling.
		 */
		if (size == 0) {
			zvol_end_io(bio, rq, 0);
			goto out;
		}

		rw_enter(&zv->zv_suspend_lock, RW_READER);

		/* See comment in WRITE case above. */
		if (force_sync) {
			zvol_read(&zvr);
		} else {
			task = zv_request_task_create(zvr);
			taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],
			    zvol_read_task, task, 0, &task->ent);
		}
	}

out:
	spl_fstrans_unmark(cookie);
}

#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
#ifdef HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID
static void
zvol_submit_bio(struct bio *bio)
#else
static blk_qc_t
zvol_submit_bio(struct bio *bio)
#endif
#else
static MAKE_REQUEST_FN_RET
zvol_request(struct request_queue *q, struct bio *bio)
#endif
{
#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
#if defined(HAVE_BIO_BDEV_DISK)
	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
#else
	struct request_queue *q = bio->bi_disk->queue;
#endif
#endif
	zvol_state_t *zv = q->queuedata;

	zvol_request_impl(zv, bio, NULL, 0);
#if defined(HAVE_MAKE_REQUEST_FN_RET_QC) || \
    defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \
    !defined(HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID)
	return (BLK_QC_T_NONE);
#endif
}
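
/*
 * For illustration: the asynchronous dispatch above can be disabled
 * system-wide via the zvol_request_sync module parameter, e.g.
 *
 *	echo 1 > /sys/module/zfs/parameters/zvol_request_sync
 *
 * which makes zvol_request_impl() treat every I/O as force_sync.
 */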

static int
#ifdef HAVE_BLK_MODE_T
zvol_open(struct gendisk *disk, blk_mode_t flag)
#else
zvol_open(struct block_device *bdev, fmode_t flag)
#endif
{
	zvol_state_t *zv;
	int error = 0;
	boolean_t drop_suspend = B_FALSE;
#ifndef HAVE_BLKDEV_GET_ERESTARTSYS
	hrtime_t timeout = MSEC2NSEC(zvol_open_timeout_ms);
	hrtime_t start = gethrtime();

retry:
#endif
	rw_enter(&zvol_state_lock, RW_READER);
	/*
	 * Obtain a copy of private_data under the zvol_state_lock to make
	 * sure that either the result of zvol free code path setting
	 * disk->private_data to NULL is observed, or zvol_os_free()
	 * is not called on this zv because of the positive zv_open_count.
	 */
#ifdef HAVE_BLK_MODE_T
	zv = disk->private_data;
#else
	zv = bdev->bd_disk->private_data;
#endif
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (-SET_ERROR(ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);

	if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
		mutex_exit(&zv->zv_state_lock);
		rw_exit(&zvol_state_lock);
		return (-SET_ERROR(ENXIO));
	}

	/*
	 * Make sure zvol is not suspended during first open
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock
	 */
	if (zv->zv_open_count == 0) {
		if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* check to see if zv_suspend_lock is needed */
			if (zv->zv_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
			} else {
				drop_suspend = B_TRUE;
			}
		} else {
			drop_suspend = B_TRUE;
		}
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if (zv->zv_open_count == 0) {
		boolean_t drop_namespace = B_FALSE;

		ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));

		/*
		 * In all other call paths the spa_namespace_lock is taken
		 * before the bdev->bd_mutex lock. However, on open(2)
		 * the __blkdev_get() function calls fops->open() with the
		 * bdev->bd_mutex lock held. This can result in a deadlock
		 * when zvols from one pool are used as vdevs in another.
		 *
		 * To prevent a lock inversion deadlock we preemptively
		 * take the spa_namespace_lock. Normally the lock will not
		 * be contended and this is safe because spa_open_common()
		 * handles the case where the caller already holds the
		 * spa_namespace_lock.
		 *
		 * When the lock cannot be acquired after multiple retries
		 * this must be the vdev on zvol deadlock case and we have
		 * no choice but to return an error. For 5.12 and older
		 * kernels returning -ERESTARTSYS will result in the
		 * bdev->bd_mutex being dropped, then reacquired, and
		 * fops->open() being called again. This process can be
		 * repeated safely until both locks are acquired. For 5.13
		 * and newer the -ERESTARTSYS retry logic was removed from
		 * the kernel so the only option is to return the error for
		 * the caller to handle it.
		 */
		if (!mutex_owned(&spa_namespace_lock)) {
			if (!mutex_tryenter(&spa_namespace_lock)) {
				mutex_exit(&zv->zv_state_lock);
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;

#ifdef HAVE_BLKDEV_GET_ERESTARTSYS
				schedule();
				return (-SET_ERROR(ERESTARTSYS));
#else
				if ((gethrtime() - start) > timeout)
					return (-SET_ERROR(ERESTARTSYS));

				schedule_timeout_interruptible(
				    MSEC_TO_TICK(10));
				goto retry;
#endif
			} else {
				drop_namespace = B_TRUE;
			}
		}

		error = -zvol_first_open(zv, !(blk_mode_is_open_write(flag)));

		if (drop_namespace)
			mutex_exit(&spa_namespace_lock);
	}

	if (error == 0) {
		if ((blk_mode_is_open_write(flag)) &&
		    (zv->zv_flags & ZVOL_RDONLY)) {
			if (zv->zv_open_count == 0)
				zvol_last_close(zv);

			error = -SET_ERROR(EROFS);
		} else {
			zv->zv_open_count++;
		}
	}

	mutex_exit(&zv->zv_state_lock);
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);

	if (error == 0)
#ifdef HAVE_BLK_MODE_T
		disk_check_media_change(disk);
#else
		zfs_check_media_change(bdev);
#endif

	return (error);
}
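
/*
 * A worked example of the retry loop in zvol_open() above: with the
 * default zvol_open_timeout_ms of 1000 and a 10ms sleep per attempt, a
 * contended spa_namespace_lock is retried roughly 100 times before
 * ERESTARTSYS is finally returned to the caller.
 */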

static void
#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG
zvol_release(struct gendisk *disk)
#else
zvol_release(struct gendisk *disk, fmode_t unused)
#endif
{
#if !defined(HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG)
	(void) unused;
#endif
	zvol_state_t *zv;
	boolean_t drop_suspend = B_TRUE;

	rw_enter(&zvol_state_lock, RW_READER);
	zv = disk->private_data;

	mutex_enter(&zv->zv_state_lock);
	ASSERT3U(zv->zv_open_count, >, 0);
	/*
	 * make sure zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock
	 */
	if (zv->zv_open_count == 1) {
		if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* check to see if zv_suspend_lock is needed */
			if (zv->zv_open_count != 1) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	zv->zv_open_count--;
	if (zv->zv_open_count == 0) {
		ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
		zvol_last_close(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
}

static int
zvol_ioctl(struct block_device *bdev, fmode_t mode,
    unsigned int cmd, unsigned long arg)
{
	zvol_state_t *zv = bdev->bd_disk->private_data;
	int error = 0;

	ASSERT3U(zv->zv_open_count, >, 0);

	switch (cmd) {
	case BLKFLSBUF:
#ifdef HAVE_FSYNC_BDEV
		fsync_bdev(bdev);
#elif defined(HAVE_SYNC_BLOCKDEV)
		sync_blockdev(bdev);
#else
#error "Neither fsync_bdev() nor sync_blockdev() found"
#endif
		invalidate_bdev(bdev);
		rw_enter(&zv->zv_suspend_lock, RW_READER);

		if (!(zv->zv_flags & ZVOL_RDONLY))
			txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);

		rw_exit(&zv->zv_suspend_lock);
		break;

	case BLKZNAME:
		mutex_enter(&zv->zv_state_lock);
		error = copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN);
		mutex_exit(&zv->zv_state_lock);
		break;

	default:
		error = -ENOTTY;
		break;
	}

	return (SET_ERROR(error));
}

#ifdef CONFIG_COMPAT
static int
zvol_compat_ioctl(struct block_device *bdev, fmode_t mode,
    unsigned cmd, unsigned long arg)
{
	return (zvol_ioctl(bdev, mode, cmd, arg));
}
#else
#define	zvol_compat_ioctl	NULL
#endif
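
/*
 * For illustration only (a userspace sketch, not part of this driver):
 * the BLKZNAME ioctl above could be consumed roughly as follows, assuming
 * the ZFS headers that define BLKZNAME and MAXNAMELEN:
 *
 *	char name[MAXNAMELEN];
 *	int fd = open("/dev/zd0", O_RDONLY);
 *	if (fd >= 0 && ioctl(fd, BLKZNAME, name) == 0)
 *		printf("dataset: %s\n", name);
 */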

static unsigned int
zvol_check_events(struct gendisk *disk, unsigned int clearing)
{
	unsigned int mask = 0;

	rw_enter(&zvol_state_lock, RW_READER);

	zvol_state_t *zv = disk->private_data;
	if (zv != NULL) {
		mutex_enter(&zv->zv_state_lock);
		mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0;
		zv->zv_changed = 0;
		mutex_exit(&zv->zv_state_lock);
	}

	rw_exit(&zvol_state_lock);

	return (mask);
}

static int
zvol_revalidate_disk(struct gendisk *disk)
{
	rw_enter(&zvol_state_lock, RW_READER);

	zvol_state_t *zv = disk->private_data;
	if (zv != NULL) {
		mutex_enter(&zv->zv_state_lock);
		set_capacity(zv->zv_zso->zvo_disk,
		    zv->zv_volsize >> SECTOR_BITS);
		mutex_exit(&zv->zv_state_lock);
	}

	rw_exit(&zvol_state_lock);

	return (0);
}

int
zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize)
{
	struct gendisk *disk = zv->zv_zso->zvo_disk;

#if defined(HAVE_REVALIDATE_DISK_SIZE)
	revalidate_disk_size(disk, zvol_revalidate_disk(disk) == 0);
#elif defined(HAVE_REVALIDATE_DISK)
	revalidate_disk(disk);
#else
	zvol_revalidate_disk(disk);
#endif
	return (0);
}

void
zvol_os_clear_private(zvol_state_t *zv)
{
	/*
	 * Cleared while holding zvol_state_lock as a writer
	 * which will prevent zvol_open() from opening it.
	 */
	zv->zv_zso->zvo_disk->private_data = NULL;
}

/*
 * Provide a simple virtual geometry for legacy compatibility. For devices
 * smaller than 1 MiB a small head and sector count is used to allow very
 * tiny devices. For devices over 1 MiB a standard head and sector count
 * is used to keep the cylinders count reasonable.
 */
static int
zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
	zvol_state_t *zv = bdev->bd_disk->private_data;
	sector_t sectors;

	ASSERT3U(zv->zv_open_count, >, 0);

	sectors = get_capacity(zv->zv_zso->zvo_disk);

	if (sectors > 2048) {
		geo->heads = 16;
		geo->sectors = 63;
	} else {
		geo->heads = 2;
		geo->sectors = 4;
	}

	geo->start = 0;
	geo->cylinders = sectors / (geo->heads * geo->sectors);

	return (0);
}
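
/*
 * Worked examples for zvol_getgeo() above: a 16 GiB zvol has 33554432
 * 512-byte sectors, so it reports heads = 16, sectors = 63 and
 * cylinders = 33554432 / (16 * 63) = 33288. A 1 MiB zvol (2048 sectors)
 * takes the small branch: 2 heads, 4 sectors, 256 cylinders.
 */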

/*
 * Why have two separate block_device_operations structs?
 *
 * Normally we'd just have one, and assign 'submit_bio' as needed. However,
 * it's possible the user's kernel is built with CONSTIFY_PLUGIN, meaning we
 * can't just change submit_bio dynamically at runtime. So just create two
 * separate structs to get around this.
 */
static const struct block_device_operations zvol_ops_blk_mq = {
	.open = zvol_open,
	.release = zvol_release,
	.ioctl = zvol_ioctl,
	.compat_ioctl = zvol_compat_ioctl,
	.check_events = zvol_check_events,
#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK
	.revalidate_disk = zvol_revalidate_disk,
#endif
	.getgeo = zvol_getgeo,
	.owner = THIS_MODULE,
};

static const struct block_device_operations zvol_ops = {
	.open = zvol_open,
	.release = zvol_release,
	.ioctl = zvol_ioctl,
	.compat_ioctl = zvol_compat_ioctl,
	.check_events = zvol_check_events,
#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK
	.revalidate_disk = zvol_revalidate_disk,
#endif
	.getgeo = zvol_getgeo,
	.owner = THIS_MODULE,
#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
	.submit_bio = zvol_submit_bio,
#endif
};

/*
 * Since 6.9, Linux has been removing queue limit setters in favour of an
 * initial queue_limits struct applied when the device is open. Since 6.11,
 * queue_limits is being extended to allow more things to be applied when the
 * device is open. Setters are also being removed for this.
 *
 * For OpenZFS, this means that depending on kernel version, some options may
 * be set up before the device is open, and some applied to an open device
 * (queue) after the fact.
 *
 * We manage this complexity by having our own limits struct,
 * zvol_queue_limits_t, in which we carry any queue config that we're
 * interested in setting. This structure is the same on all kernels.
 *
 * These limits are then applied to the queue at device open time by the most
 * appropriate method for the kernel.
 *
 * zvol_queue_limits_convert() is used on 6.9+ (where the two-arg form of
 * blk_alloc_disk() exists). This converts our limits struct to a proper Linux
 * struct queue_limits, and passes it in. Any fields added in later kernels are
 * (obviously) not set up here.
 *
 * zvol_queue_limits_apply() is called on all kernel versions after the queue
 * is created, and applies any remaining config. Before 6.9 that will be
 * everything, via setter methods. After 6.9 that will be whatever couldn't be
 * put into struct queue_limits. (This implies that zvol_queue_limits_apply()
 * will always be a no-op on the latest kernel we support).
 */
typedef struct zvol_queue_limits {
	unsigned int zql_max_hw_sectors;
	unsigned short zql_max_segments;
	unsigned int zql_max_segment_size;
	unsigned int zql_io_opt;
	unsigned int zql_physical_block_size;
	unsigned int zql_max_discard_sectors;
	unsigned int zql_discard_granularity;
} zvol_queue_limits_t;

static void
zvol_queue_limits_init(zvol_queue_limits_t *limits, zvol_state_t *zv,
    boolean_t use_blk_mq)
{
	limits->zql_max_hw_sectors = (DMU_MAX_ACCESS / 4) >> 9;

	if (use_blk_mq) {
		/*
		 * IO requests can be really big (1MB). When an IO request
		 * comes in, it is passed off to zvol_read() or zvol_write()
		 * in a new thread, where it is chunked up into 'volblocksize'
		 * sized pieces and processed. So for example, if the request
		 * is a 1MB write and your volblocksize is 128k, one zvol_write
		 * thread will take that request and sequentially do eight 128k
		 * IOs. This is due to the fact that the thread needs to lock
		 * each volblocksize sized block. So you might be wondering:
		 * "instead of passing the whole 1MB request to one thread,
		 * why not pass eight individual 128k chunks to eight threads
		 * and process the whole write in parallel?" The short answer
		 * is that there's a sweet spot number of chunks that balances
		 * the greater parallelism with the added overhead of more
		 * threads. The sweet spot can be different depending on if you
		 * have a read or write heavy workload. Writes typically want
		 * high chunk counts while reads typically want lower ones. On
		 * a test pool with 6 NVMe drives in a 3x 2-disk mirror
		 * configuration, with volblocksize=8k, the sweet spot for good
		 * sequential reads and writes was at 8 chunks.
		 */

		/*
		 * Below we tell the kernel how big we want our requests
		 * to be. You would think that blk_queue_io_opt() would be
		 * used to do this since it is used to "set optimal request
		 * size for the queue", but that doesn't seem to do
		 * anything - the kernel still gives you huge requests
		 * with tons of little PAGE_SIZE segments contained within it.
		 *
		 * Knowing that the kernel will just give you PAGE_SIZE segments
		 * no matter what, you can say "ok, I want PAGE_SIZE byte
		 * segments, and I want 'N' of them per request", where N is
		 * the correct number of segments for the volblocksize and
		 * number of chunks you want.
		 */
		if (zvol_blk_mq_blocks_per_thread != 0) {
			unsigned int chunks;
			chunks = MIN(zvol_blk_mq_blocks_per_thread, UINT16_MAX);

			limits->zql_max_segment_size = PAGE_SIZE;
			limits->zql_max_segments =
			    (zv->zv_volblocksize * chunks) / PAGE_SIZE;
		} else {
			/*
			 * Special case: zvol_blk_mq_blocks_per_thread = 0
			 * Max everything out.
			 */
			limits->zql_max_segments = UINT16_MAX;
			limits->zql_max_segment_size = UINT_MAX;
		}
	} else {
		limits->zql_max_segments = UINT16_MAX;
		limits->zql_max_segment_size = UINT_MAX;
	}

	limits->zql_io_opt = DMU_MAX_ACCESS / 2;

	limits->zql_physical_block_size = zv->zv_volblocksize;
	limits->zql_max_discard_sectors =
	    (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9;
	limits->zql_discard_granularity = zv->zv_volblocksize;
}
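
/*
 * Worked examples for zvol_queue_limits_init() above (illustrative
 * numbers, 4k PAGE_SIZE assumed): with volblocksize = 128k and the
 * default of 8 blocks per thread, zql_max_segments = (128k * 8) / 4k =
 * 256 page-sized segments, i.e. 1MB requests. With volblocksize = 16k
 * and the default zvol_max_discard_blocks of 16384,
 * zql_max_discard_sectors = (16384 * 16k) >> 9 = 524288 sectors (256MB).
 */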
116675e1fea6SMartin Matuska } else { 116775e1fea6SMartin Matuska /* 116875e1fea6SMartin Matuska * Special case: zvol_blk_mq_blocks_per_thread = 0 116975e1fea6SMartin Matuska * Max everything out. 117075e1fea6SMartin Matuska */ 117175e1fea6SMartin Matuska limits->zql_max_segments = UINT16_MAX; 117275e1fea6SMartin Matuska limits->zql_max_segment_size = UINT_MAX; 117375e1fea6SMartin Matuska } 117475e1fea6SMartin Matuska } else { 117575e1fea6SMartin Matuska limits->zql_max_segments = UINT16_MAX; 117675e1fea6SMartin Matuska limits->zql_max_segment_size = UINT_MAX; 117775e1fea6SMartin Matuska } 117875e1fea6SMartin Matuska 1179*718519f4SMartin Matuska limits->zql_io_opt = DMU_MAX_ACCESS / 2; 118029dc9349SMartin Matuska 118129dc9349SMartin Matuska limits->zql_physical_block_size = zv->zv_volblocksize; 118229dc9349SMartin Matuska limits->zql_max_discard_sectors = 118329dc9349SMartin Matuska (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9; 118429dc9349SMartin Matuska limits->zql_discard_granularity = zv->zv_volblocksize; 118575e1fea6SMartin Matuska } 118675e1fea6SMartin Matuska 118775e1fea6SMartin Matuska #ifdef HAVE_BLK_ALLOC_DISK_2ARG 118875e1fea6SMartin Matuska static void 118975e1fea6SMartin Matuska zvol_queue_limits_convert(zvol_queue_limits_t *limits, 119075e1fea6SMartin Matuska struct queue_limits *qlimits) 119175e1fea6SMartin Matuska { 119275e1fea6SMartin Matuska memset(qlimits, 0, sizeof (struct queue_limits)); 119375e1fea6SMartin Matuska qlimits->max_hw_sectors = limits->zql_max_hw_sectors; 119475e1fea6SMartin Matuska qlimits->max_segments = limits->zql_max_segments; 119575e1fea6SMartin Matuska qlimits->max_segment_size = limits->zql_max_segment_size; 119675e1fea6SMartin Matuska qlimits->io_opt = limits->zql_io_opt; 119729dc9349SMartin Matuska qlimits->physical_block_size = limits->zql_physical_block_size; 119829dc9349SMartin Matuska qlimits->max_discard_sectors = limits->zql_max_discard_sectors; 1199e2df9bb4SMartin Matuska qlimits->max_hw_discard_sectors = limits->zql_max_discard_sectors; 120029dc9349SMartin Matuska qlimits->discard_granularity = limits->zql_discard_granularity; 120129dc9349SMartin Matuska #ifdef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES 120229dc9349SMartin Matuska qlimits->features = 120329dc9349SMartin Matuska BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | BLK_FEAT_IO_STAT; 120429dc9349SMartin Matuska #endif 120575e1fea6SMartin Matuska } 120629dc9349SMartin Matuska #endif 120729dc9349SMartin Matuska 120875e1fea6SMartin Matuska static void 120975e1fea6SMartin Matuska zvol_queue_limits_apply(zvol_queue_limits_t *limits, 121075e1fea6SMartin Matuska struct request_queue *queue) 121175e1fea6SMartin Matuska { 121229dc9349SMartin Matuska #ifndef HAVE_BLK_ALLOC_DISK_2ARG 121375e1fea6SMartin Matuska blk_queue_max_hw_sectors(queue, limits->zql_max_hw_sectors); 121475e1fea6SMartin Matuska blk_queue_max_segments(queue, limits->zql_max_segments); 121575e1fea6SMartin Matuska blk_queue_max_segment_size(queue, limits->zql_max_segment_size); 121675e1fea6SMartin Matuska blk_queue_io_opt(queue, limits->zql_io_opt); 121729dc9349SMartin Matuska blk_queue_physical_block_size(queue, limits->zql_physical_block_size); 121829dc9349SMartin Matuska blk_queue_max_discard_sectors(queue, limits->zql_max_discard_sectors); 121929dc9349SMartin Matuska blk_queue_discard_granularity(queue, limits->zql_discard_granularity); 122075e1fea6SMartin Matuska #endif 122129dc9349SMartin Matuska #ifndef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES 122229dc9349SMartin Matuska blk_queue_set_write_cache(queue, B_TRUE); 122329dc9349SMartin Matuska blk_queue_flag_set(QUEUE_FLAG_IO_STAT, queue); 122429dc9349SMartin Matuska #endif 122529dc9349SMartin Matuska } 122675e1fea6SMartin Matuska 12271f1e2261SMartin Matuska static int
122875e1fea6SMartin Matuska zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits) 12291f1e2261SMartin Matuska { 12301f1e2261SMartin Matuska #if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) 12311f1e2261SMartin Matuska #if defined(HAVE_BLK_ALLOC_DISK) 12321f1e2261SMartin Matuska zso->zvo_disk = blk_alloc_disk(NUMA_NO_NODE); 12331f1e2261SMartin Matuska if (zso->zvo_disk == NULL) 12341f1e2261SMartin Matuska return (1); 12351f1e2261SMartin Matuska 12361f1e2261SMartin Matuska zso->zvo_disk->minors = ZVOL_MINORS; 12371f1e2261SMartin Matuska zso->zvo_queue = zso->zvo_disk->queue; 12381719886fSMartin Matuska #elif defined(HAVE_BLK_ALLOC_DISK_2ARG) 123975e1fea6SMartin Matuska struct queue_limits qlimits; 124075e1fea6SMartin Matuska zvol_queue_limits_convert(limits, &qlimits); 124175e1fea6SMartin Matuska struct gendisk *disk = blk_alloc_disk(&qlimits, NUMA_NO_NODE); 12421719886fSMartin Matuska if (IS_ERR(disk)) { 12431719886fSMartin Matuska zso->zvo_disk = NULL; 12441719886fSMartin Matuska return (1); 12451719886fSMartin Matuska } 12461719886fSMartin Matuska 12471719886fSMartin Matuska zso->zvo_disk = disk; 12481719886fSMartin Matuska zso->zvo_disk->minors = ZVOL_MINORS; 12491719886fSMartin Matuska zso->zvo_queue = zso->zvo_disk->queue; 1250e2df9bb4SMartin Matuska 12511f1e2261SMartin Matuska #else 12521f1e2261SMartin Matuska zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE); 12531f1e2261SMartin Matuska if (zso->zvo_queue == NULL) 12541f1e2261SMartin Matuska return (1); 12551f1e2261SMartin Matuska 12561f1e2261SMartin Matuska zso->zvo_disk = alloc_disk(ZVOL_MINORS); 12571f1e2261SMartin Matuska if (zso->zvo_disk == NULL) { 12581f1e2261SMartin Matuska blk_cleanup_queue(zso->zvo_queue); 12591f1e2261SMartin Matuska return (1); 12601f1e2261SMartin Matuska } 12611f1e2261SMartin Matuska 12621f1e2261SMartin Matuska zso->zvo_disk->queue = zso->zvo_queue; 12631f1e2261SMartin Matuska #endif /* HAVE_BLK_ALLOC_DISK */ 12641f1e2261SMartin Matuska #else 12651f1e2261SMartin Matuska zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE); 12661f1e2261SMartin Matuska if (zso->zvo_queue == NULL) 12671f1e2261SMartin Matuska return (1); 12681f1e2261SMartin Matuska 12691f1e2261SMartin Matuska zso->zvo_disk = alloc_disk(ZVOL_MINORS); 12701f1e2261SMartin Matuska if (zso->zvo_disk == NULL) { 12711f1e2261SMartin Matuska blk_cleanup_queue(zso->zvo_queue); 12721f1e2261SMartin Matuska return (1); 12731f1e2261SMartin Matuska } 12741f1e2261SMartin Matuska 12751f1e2261SMartin Matuska zso->zvo_disk->queue = zso->zvo_queue; 12761f1e2261SMartin Matuska #endif /* HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */ 127729dc9349SMartin Matuska 127829dc9349SMartin Matuska zvol_queue_limits_apply(limits, zso->zvo_queue); 127929dc9349SMartin Matuska 12801f1e2261SMartin Matuska return (0); 12811f1e2261SMartin Matuska 12821f1e2261SMartin Matuska } 12831f1e2261SMartin Matuska 12841f1e2261SMartin Matuska static int 128575e1fea6SMartin Matuska zvol_alloc_blk_mq(zvol_state_t *zv, zvol_queue_limits_t *limits) 12861f1e2261SMartin Matuska { 12871f1e2261SMartin Matuska struct zvol_state_os *zso = zv->zv_zso; 12881f1e2261SMartin Matuska 12891f1e2261SMartin Matuska /* Allocate our blk-mq tag_set */ 12901f1e2261SMartin Matuska if (zvol_blk_mq_alloc_tag_set(zv) != 0) 12911f1e2261SMartin Matuska return (1); 12921f1e2261SMartin Matuska 12931f1e2261SMartin Matuska #if defined(HAVE_BLK_ALLOC_DISK) 12941f1e2261SMartin Matuska zso->zvo_disk = blk_mq_alloc_disk(&zso->tag_set, zv); 12951f1e2261SMartin Matuska if (zso->zvo_disk == 
NULL) { 12961f1e2261SMartin Matuska blk_mq_free_tag_set(&zso->tag_set); 12971f1e2261SMartin Matuska return (1); 12981f1e2261SMartin Matuska } 12991f1e2261SMartin Matuska zso->zvo_queue = zso->zvo_disk->queue; 13001f1e2261SMartin Matuska zso->zvo_disk->minors = ZVOL_MINORS; 13011719886fSMartin Matuska #elif defined(HAVE_BLK_ALLOC_DISK_2ARG) 130275e1fea6SMartin Matuska struct queue_limits qlimits; 130375e1fea6SMartin Matuska zvol_queue_limits_convert(limits, &qlimits); 130475e1fea6SMartin Matuska struct gendisk *disk = blk_mq_alloc_disk(&zso->tag_set, &qlimits, zv); 13051719886fSMartin Matuska if (IS_ERR(disk)) { 13061719886fSMartin Matuska zso->zvo_disk = NULL; 13071719886fSMartin Matuska blk_mq_free_tag_set(&zso->tag_set); 13081719886fSMartin Matuska return (1); 13091719886fSMartin Matuska } 13101719886fSMartin Matuska 13111719886fSMartin Matuska zso->zvo_disk = disk; 13121719886fSMartin Matuska zso->zvo_queue = zso->zvo_disk->queue; 13131719886fSMartin Matuska zso->zvo_disk->minors = ZVOL_MINORS; 13141f1e2261SMartin Matuska #else 13151f1e2261SMartin Matuska zso->zvo_disk = alloc_disk(ZVOL_MINORS); 13161f1e2261SMartin Matuska if (zso->zvo_disk == NULL) { 13171f1e2261SMartin Matuska blk_cleanup_queue(zso->zvo_queue); 13181f1e2261SMartin Matuska blk_mq_free_tag_set(&zso->tag_set); 13191f1e2261SMartin Matuska return (1); 13201f1e2261SMartin Matuska } 13211f1e2261SMartin Matuska /* Allocate queue */ 13221f1e2261SMartin Matuska zso->zvo_queue = blk_mq_init_queue(&zso->tag_set); 13231f1e2261SMartin Matuska if (IS_ERR(zso->zvo_queue)) { 13241f1e2261SMartin Matuska blk_mq_free_tag_set(&zso->tag_set); 13251f1e2261SMartin Matuska return (1); 13261f1e2261SMartin Matuska } 13271f1e2261SMartin Matuska 13281f1e2261SMartin Matuska /* Our queue is now created, assign it to our disk */ 13291f1e2261SMartin Matuska zso->zvo_disk->queue = zso->zvo_queue; 133029dc9349SMartin Matuska #endif 13311f1e2261SMartin Matuska 133229dc9349SMartin Matuska zvol_queue_limits_apply(limits, zso->zvo_queue); 133329dc9349SMartin Matuska 13341f1e2261SMartin Matuska return (0); 13351f1e2261SMartin Matuska } 13361f1e2261SMartin Matuska 1337eda14cbcSMatt Macy /* 1338eda14cbcSMatt Macy * Allocate memory for a new zvol_state_t and setup the required 1339eda14cbcSMatt Macy * request queue and generic disk structures for the block device. 
1340eda14cbcSMatt Macy */ 1341eda14cbcSMatt Macy static zvol_state_t * 1342e2df9bb4SMartin Matuska zvol_alloc(dev_t dev, const char *name, uint64_t volblocksize) 1343eda14cbcSMatt Macy { 1344eda14cbcSMatt Macy zvol_state_t *zv; 1345eda14cbcSMatt Macy struct zvol_state_os *zso; 1346eda14cbcSMatt Macy uint64_t volmode; 13471f1e2261SMartin Matuska int ret; 1348eda14cbcSMatt Macy 1349eda14cbcSMatt Macy if (dsl_prop_get_integer(name, "volmode", &volmode, NULL) != 0) 1350eda14cbcSMatt Macy return (NULL); 1351eda14cbcSMatt Macy 1352eda14cbcSMatt Macy if (volmode == ZFS_VOLMODE_DEFAULT) 1353eda14cbcSMatt Macy volmode = zvol_volmode; 1354eda14cbcSMatt Macy 1355eda14cbcSMatt Macy if (volmode == ZFS_VOLMODE_NONE) 1356eda14cbcSMatt Macy return (NULL); 1357eda14cbcSMatt Macy 1358eda14cbcSMatt Macy zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP); 1359eda14cbcSMatt Macy zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP); 1360eda14cbcSMatt Macy zv->zv_zso = zso; 13617877fdebSMatt Macy zv->zv_volmode = volmode; 1362e2df9bb4SMartin Matuska zv->zv_volblocksize = volblocksize; 1363eda14cbcSMatt Macy 1364eda14cbcSMatt Macy list_link_init(&zv->zv_next); 1365eda14cbcSMatt Macy mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL); 1366ce4dcb97SMartin Matuska cv_init(&zv->zv_removing_cv, NULL, CV_DEFAULT, NULL); 1367eda14cbcSMatt Macy 13681f1e2261SMartin Matuska zv->zv_zso->use_blk_mq = zvol_use_blk_mq; 13691f88aa09SMartin Matuska 137075e1fea6SMartin Matuska zvol_queue_limits_t limits; 137175e1fea6SMartin Matuska zvol_queue_limits_init(&limits, zv, zv->zv_zso->use_blk_mq); 137275e1fea6SMartin Matuska 13731f1e2261SMartin Matuska /* 13741f1e2261SMartin Matuska * The block layer has 3 interfaces for getting BIOs: 13751f1e2261SMartin Matuska * 13761f1e2261SMartin Matuska * 1. blk-mq request queues (new) 13771f1e2261SMartin Matuska * 2. submit_bio() (oldest) 13781f1e2261SMartin Matuska * 3. regular request queues (old). 13791f1e2261SMartin Matuska * 13801f1e2261SMartin Matuska * Each of those interfaces has two permutations: 13811f1e2261SMartin Matuska * 13821f1e2261SMartin Matuska * a) We have blk_alloc_disk()/blk_mq_alloc_disk(), which allocates 13831f1e2261SMartin Matuska * both the disk and its queue (5.14 kernel or newer) 13841f1e2261SMartin Matuska * 13851f1e2261SMartin Matuska * b) We don't have blk_*alloc_disk(), and have to allocate the 13861f1e2261SMartin Matuska * disk and the queue separately. (5.13 kernel or older) 13871f1e2261SMartin Matuska */ 13881f1e2261SMartin Matuska if (zv->zv_zso->use_blk_mq) { 138975e1fea6SMartin Matuska ret = zvol_alloc_blk_mq(zv, &limits); 13901f1e2261SMartin Matuska zso->zvo_disk->fops = &zvol_ops_blk_mq; 13911f1e2261SMartin Matuska } else { 139275e1fea6SMartin Matuska ret = zvol_alloc_non_blk_mq(zso, &limits); 13931f1e2261SMartin Matuska zso->zvo_disk->fops = &zvol_ops; 13941f88aa09SMartin Matuska } 13951f1e2261SMartin Matuska if (ret != 0) 13961f88aa09SMartin Matuska goto out_kmem; 13971f88aa09SMartin Matuska 1398eda14cbcSMatt Macy /* Limit read-ahead to a single page to prevent over-prefetching. */ 1399eda14cbcSMatt Macy blk_queue_set_read_ahead(zso->zvo_queue, 1); 1400eda14cbcSMatt Macy 14011f1e2261SMartin Matuska if (!zv->zv_zso->use_blk_mq) { 1402eda14cbcSMatt Macy /* Disable write merging in favor of the ZIO pipeline. 
*/ 1403eda14cbcSMatt Macy blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue); 14041f1e2261SMartin Matuska } 1405eda14cbcSMatt Macy 1406eda14cbcSMatt Macy zso->zvo_queue->queuedata = zv; 1407eda14cbcSMatt Macy zso->zvo_dev = dev; 1408eda14cbcSMatt Macy zv->zv_open_count = 0; 14092276e539SMartin Matuska strlcpy(zv->zv_name, name, sizeof (zv->zv_name)); 1410eda14cbcSMatt Macy 1411eda14cbcSMatt Macy zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL); 1412eda14cbcSMatt Macy rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL); 1413eda14cbcSMatt Macy 1414eda14cbcSMatt Macy zso->zvo_disk->major = zvol_major; 1415eda14cbcSMatt Macy zso->zvo_disk->events = DISK_EVENT_MEDIA_CHANGE; 1416eda14cbcSMatt Macy 1417eda14cbcSMatt Macy /* 1418716fd348SMartin Matuska * Setting ZFS_VOLMODE_DEV disables partitioning on ZVOL devices. 1419716fd348SMartin Matuska * This is accomplished by limiting the number of minors for the 1420716fd348SMartin Matuska * device to one and explicitly disabling partition scanning. 1421eda14cbcSMatt Macy */ 1422716fd348SMartin Matuska if (volmode == ZFS_VOLMODE_DEV) { 1423eda14cbcSMatt Macy zso->zvo_disk->minors = 1; 14247a7741afSMartin Matuska zso->zvo_disk->flags &= ~GENHD_FL_EXT_DEVT; 14257a7741afSMartin Matuska zso->zvo_disk->flags |= GENHD_FL_NO_PART; 1426eda14cbcSMatt Macy } 1427716fd348SMartin Matuska 1428eda14cbcSMatt Macy zso->zvo_disk->first_minor = (dev & MINORMASK); 1429eda14cbcSMatt Macy zso->zvo_disk->private_data = zv; 1430eda14cbcSMatt Macy snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d", 1431eda14cbcSMatt Macy ZVOL_DEV_NAME, (dev & MINORMASK)); 1432eda14cbcSMatt Macy 1433eda14cbcSMatt Macy return (zv); 1434eda14cbcSMatt Macy 1435eda14cbcSMatt Macy out_kmem: 1436eda14cbcSMatt Macy kmem_free(zso, sizeof (struct zvol_state_os)); 1437eda14cbcSMatt Macy kmem_free(zv, sizeof (zvol_state_t)); 1438eda14cbcSMatt Macy return (NULL); 1439eda14cbcSMatt Macy } 1440eda14cbcSMatt Macy 1441eda14cbcSMatt Macy /* 1442eda14cbcSMatt Macy * Cleanup then free a zvol_state_t which was created by zvol_alloc(). 1443eda14cbcSMatt Macy * At this time, the structure is not opened by anyone, is taken off 1444eda14cbcSMatt Macy * the zvol_state_list, and has its private data set to NULL. 1445eda14cbcSMatt Macy * The zvol_state_lock is dropped. 1446eda14cbcSMatt Macy * 1447eda14cbcSMatt Macy * This function may take many milliseconds to complete (e.g. we've seen 1448eda14cbcSMatt Macy * it take over 256ms), due to the calls to "blk_cleanup_queue" and 1449eda14cbcSMatt Macy * "del_gendisk". Thus, consumers need to be careful to account for this 1450eda14cbcSMatt Macy * latency when calling this function. 
1451eda14cbcSMatt Macy */ 1452c03c5b1cSMartin Matuska void 1453c03c5b1cSMartin Matuska zvol_os_free(zvol_state_t *zv) 1454eda14cbcSMatt Macy { 1455eda14cbcSMatt Macy 1456eda14cbcSMatt Macy ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); 1457eda14cbcSMatt Macy ASSERT(!MUTEX_HELD(&zv->zv_state_lock)); 14587877fdebSMatt Macy ASSERT0(zv->zv_open_count); 14597877fdebSMatt Macy ASSERT3P(zv->zv_zso->zvo_disk->private_data, ==, NULL); 1460eda14cbcSMatt Macy 1461eda14cbcSMatt Macy rw_destroy(&zv->zv_suspend_lock); 1462eda14cbcSMatt Macy zfs_rangelock_fini(&zv->zv_rangelock); 1463eda14cbcSMatt Macy 1464eda14cbcSMatt Macy del_gendisk(zv->zv_zso->zvo_disk); 14651f88aa09SMartin Matuska #if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \ 14661719886fSMartin Matuska (defined(HAVE_BLK_ALLOC_DISK) || defined(HAVE_BLK_ALLOC_DISK_2ARG)) 1467271171e0SMartin Matuska #if defined(HAVE_BLK_CLEANUP_DISK) 14681f88aa09SMartin Matuska blk_cleanup_disk(zv->zv_zso->zvo_disk); 14691f88aa09SMartin Matuska #else 1470271171e0SMartin Matuska put_disk(zv->zv_zso->zvo_disk); 1471271171e0SMartin Matuska #endif 1472271171e0SMartin Matuska #else 1473eda14cbcSMatt Macy blk_cleanup_queue(zv->zv_zso->zvo_queue); 1474eda14cbcSMatt Macy put_disk(zv->zv_zso->zvo_disk); 14751f88aa09SMartin Matuska #endif 1476eda14cbcSMatt Macy 14771f1e2261SMartin Matuska if (zv->zv_zso->use_blk_mq) 14781f1e2261SMartin Matuska blk_mq_free_tag_set(&zv->zv_zso->tag_set); 14791f1e2261SMartin Matuska 1480eda14cbcSMatt Macy ida_simple_remove(&zvol_ida, 1481eda14cbcSMatt Macy MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS); 1482eda14cbcSMatt Macy 1483ce4dcb97SMartin Matuska cv_destroy(&zv->zv_removing_cv); 1484eda14cbcSMatt Macy mutex_destroy(&zv->zv_state_lock); 1485eda14cbcSMatt Macy dataset_kstats_destroy(&zv->zv_kstat); 1486eda14cbcSMatt Macy 1487eda14cbcSMatt Macy kmem_free(zv->zv_zso, sizeof (struct zvol_state_os)); 1488eda14cbcSMatt Macy kmem_free(zv, sizeof (zvol_state_t)); 1489eda14cbcSMatt Macy } 1490eda14cbcSMatt Macy 14917877fdebSMatt Macy void 14927877fdebSMatt Macy zvol_wait_close(zvol_state_t *zv) 14937877fdebSMatt Macy { 14947877fdebSMatt Macy } 14957877fdebSMatt Macy 149675e1fea6SMartin Matuska struct add_disk_work { 149775e1fea6SMartin Matuska struct delayed_work work; 149875e1fea6SMartin Matuska struct gendisk *disk; 149975e1fea6SMartin Matuska int error; 150075e1fea6SMartin Matuska }; 150175e1fea6SMartin Matuska 150275e1fea6SMartin Matuska static int 150375e1fea6SMartin Matuska __zvol_os_add_disk(struct gendisk *disk) 150475e1fea6SMartin Matuska { 150575e1fea6SMartin Matuska int error = 0; 150675e1fea6SMartin Matuska #ifdef HAVE_ADD_DISK_RET 150775e1fea6SMartin Matuska error = add_disk(disk); 150875e1fea6SMartin Matuska #else 150975e1fea6SMartin Matuska add_disk(disk); 151075e1fea6SMartin Matuska #endif 151175e1fea6SMartin Matuska return (error); 151275e1fea6SMartin Matuska } 151375e1fea6SMartin Matuska 151475e1fea6SMartin Matuska #if defined(HAVE_BDEV_FILE_OPEN_BY_PATH) 151575e1fea6SMartin Matuska static void 151675e1fea6SMartin Matuska zvol_os_add_disk_work(struct work_struct *work) 151775e1fea6SMartin Matuska { 151875e1fea6SMartin Matuska struct add_disk_work *add_disk_work; 151975e1fea6SMartin Matuska add_disk_work = container_of(work, struct add_disk_work, work.work); 152075e1fea6SMartin Matuska add_disk_work->error = __zvol_os_add_disk(add_disk_work->disk); 152175e1fea6SMartin Matuska } 152275e1fea6SMartin Matuska #endif 152375e1fea6SMartin Matuska 152475e1fea6SMartin Matuska /* 152575e1fea6SMartin Matuska * SPECIAL CASE: 
152675e1fea6SMartin Matuska * 152775e1fea6SMartin Matuska * This function basically calls add_disk() from a workqueue. You may be 152875e1fea6SMartin Matuska * thinking: why not just call add_disk() directly? 152975e1fea6SMartin Matuska * 153075e1fea6SMartin Matuska * When you call add_disk(), the zvol appears to the world. When this happens, 153175e1fea6SMartin Matuska * the kernel calls disk_scan_partitions() on the zvol, which behaves 153275e1fea6SMartin Matuska * differently on the 6.9+ kernels: 153375e1fea6SMartin Matuska * 153475e1fea6SMartin Matuska * - 6.8 and older kernels - 153575e1fea6SMartin Matuska * disk_scan_partitions() 153675e1fea6SMartin Matuska * handle = bdev_open_by_dev( 153775e1fea6SMartin Matuska * zvol_open() 153875e1fea6SMartin Matuska * bdev_release(handle); 153975e1fea6SMartin Matuska * zvol_release() 154075e1fea6SMartin Matuska * 154175e1fea6SMartin Matuska * 154275e1fea6SMartin Matuska * - 6.9+ kernels - 154375e1fea6SMartin Matuska * disk_scan_partitions() 154475e1fea6SMartin Matuska * file = bdev_file_open_by_dev() 154575e1fea6SMartin Matuska * zvol_open() 154675e1fea6SMartin Matuska * fput(file) 154775e1fea6SMartin Matuska * < wait for return to userspace > 154875e1fea6SMartin Matuska * zvol_release() 154975e1fea6SMartin Matuska * 155075e1fea6SMartin Matuska * The difference is that the bdev_release() from the 6.8 kernel is synchronous 155175e1fea6SMartin Matuska * while the fput() from the 6.9 kernel is async. Or, more specifically, it's 155275e1fea6SMartin Matuska * async but has to wait until we return to userspace (since it adds the fput 155375e1fea6SMartin Matuska * into the caller's work queue with the TWA_RESUME flag set). This is not the 155475e1fea6SMartin Matuska * behavior we want, since we want to do things like create+destroy a zvol within 155575e1fea6SMartin Matuska * a single ZFS_IOC_CREATE ioctl, and the "create" part needs to release the 155675e1fea6SMartin Matuska * reference to the zvol while we're in the IOCTL, which can't wait until we 155775e1fea6SMartin Matuska * return to userspace. 155875e1fea6SMartin Matuska * 155975e1fea6SMartin Matuska * We can get around this since fput() has a special codepath for when it's 156075e1fea6SMartin Matuska * running in a kernel thread or interrupt. In those cases, it just puts the 156175e1fea6SMartin Matuska * fput into the system workqueue, which we can force to run with 156275e1fea6SMartin Matuska * __flush_workqueue(). That is why we call add_disk() from a workqueue - so it 156375e1fea6SMartin Matuska * runs from a kernel thread and "tricks" the fput() codepaths. 156475e1fea6SMartin Matuska * 156575e1fea6SMartin Matuska * Note that __flush_workqueue() is slowly getting deprecated. This may be ok 156675e1fea6SMartin Matuska * though, since our IOCTL will spin on EBUSY waiting for the zvol release (via 156775e1fea6SMartin Matuska * fput) to happen, which it eventually, naturally, will from the system_wq 156875e1fea6SMartin Matuska * without us explicitly calling __flush_workqueue().
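 *
 * For reference, the fput() behavior we rely on looks roughly like this
 * (paraphrased from fs/file_table.c; an illustrative sketch only, since
 * the details vary by kernel version):
 *
 *     void fput(struct file *file)
 *     {
 *         if (atomic_long_dec_and_test(&file->f_count)) {
 *             if (!in_interrupt() && !(current->flags & PF_KTHREAD)) {
 *                 // Normal task: queue the release on the task's
 *                 // return-to-userspace work list (TWA_RESUME), i.e.
 *                 // the async path described above.
 *                 if (!task_work_add(current, ..., TWA_RESUME))
 *                     return;
 *             }
 *             // Kernel thread or interrupt: defer to the system
 *             // workqueue, which __flush_workqueue() can force to run.
 *             schedule_delayed_work(&delayed_fput_work, 1);
 *         }
 *     }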
156975e1fea6SMartin Matuska */ 157075e1fea6SMartin Matuska static int 157175e1fea6SMartin Matuska zvol_os_add_disk(struct gendisk *disk) 157275e1fea6SMartin Matuska { 157375e1fea6SMartin Matuska #if defined(HAVE_BDEV_FILE_OPEN_BY_PATH) /* 6.9+ kernel */ 157475e1fea6SMartin Matuska struct add_disk_work add_disk_work; 157575e1fea6SMartin Matuska 157675e1fea6SMartin Matuska INIT_DELAYED_WORK(&add_disk_work.work, zvol_os_add_disk_work); 157775e1fea6SMartin Matuska add_disk_work.disk = disk; 157875e1fea6SMartin Matuska add_disk_work.error = 0; 157975e1fea6SMartin Matuska 158075e1fea6SMartin Matuska /* Use *_delayed_work functions since they're not GPL'd */ 158175e1fea6SMartin Matuska schedule_delayed_work(&add_disk_work.work, 0); 158275e1fea6SMartin Matuska flush_delayed_work(&add_disk_work.work); 158375e1fea6SMartin Matuska 158475e1fea6SMartin Matuska __flush_workqueue(system_wq); 158575e1fea6SMartin Matuska return (add_disk_work.error); 158675e1fea6SMartin Matuska #else /* <= 6.8 kernel */ 158775e1fea6SMartin Matuska return (__zvol_os_add_disk(disk)); 158875e1fea6SMartin Matuska #endif 158975e1fea6SMartin Matuska } 159075e1fea6SMartin Matuska 1591eda14cbcSMatt Macy /* 1592eda14cbcSMatt Macy * Create a block device minor node and setup the linkage between it 1593eda14cbcSMatt Macy * and the specified volume. Once this function returns the block 1594eda14cbcSMatt Macy * device is live and ready for use. 1595eda14cbcSMatt Macy */ 1596c03c5b1cSMartin Matuska int 1597eda14cbcSMatt Macy zvol_os_create_minor(const char *name) 1598eda14cbcSMatt Macy { 1599eda14cbcSMatt Macy zvol_state_t *zv; 1600eda14cbcSMatt Macy objset_t *os; 1601eda14cbcSMatt Macy dmu_object_info_t *doi; 1602eda14cbcSMatt Macy uint64_t volsize; 1603eda14cbcSMatt Macy uint64_t len; 1604eda14cbcSMatt Macy unsigned minor = 0; 1605eda14cbcSMatt Macy int error = 0; 1606eda14cbcSMatt Macy int idx; 1607eda14cbcSMatt Macy uint64_t hash = zvol_name_hash(name); 1608f8b1db88SMartin Matuska uint64_t volthreading; 1609dbd5678dSMartin Matuska bool replayed_zil = B_FALSE; 1610eda14cbcSMatt Macy 1611eda14cbcSMatt Macy if (zvol_inhibit_dev) 1612eda14cbcSMatt Macy return (0); 1613eda14cbcSMatt Macy 1614eda14cbcSMatt Macy idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP)); 1615eda14cbcSMatt Macy if (idx < 0) 1616eda14cbcSMatt Macy return (SET_ERROR(-idx)); 1617eda14cbcSMatt Macy minor = idx << ZVOL_MINOR_BITS; 1618783d3ff6SMartin Matuska if (MINOR(minor) != minor) { 1619783d3ff6SMartin Matuska /* too many partitions can cause an overflow */ 1620783d3ff6SMartin Matuska zfs_dbgmsg("zvol: create minor overflow: %s, minor %u/%u", 1621783d3ff6SMartin Matuska name, minor, MINOR(minor)); 1622783d3ff6SMartin Matuska ida_simple_remove(&zvol_ida, idx); 1623783d3ff6SMartin Matuska return (SET_ERROR(EINVAL)); 1624783d3ff6SMartin Matuska } 1625eda14cbcSMatt Macy 1626eda14cbcSMatt Macy zv = zvol_find_by_name_hash(name, hash, RW_NONE); 1627eda14cbcSMatt Macy if (zv) { 1628eda14cbcSMatt Macy ASSERT(MUTEX_HELD(&zv->zv_state_lock)); 1629eda14cbcSMatt Macy mutex_exit(&zv->zv_state_lock); 1630eda14cbcSMatt Macy ida_simple_remove(&zvol_ida, idx); 1631eda14cbcSMatt Macy return (SET_ERROR(EEXIST)); 1632eda14cbcSMatt Macy } 1633eda14cbcSMatt Macy 1634eda14cbcSMatt Macy doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP); 1635eda14cbcSMatt Macy 1636eda14cbcSMatt Macy error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os); 1637eda14cbcSMatt Macy if (error) 1638eda14cbcSMatt Macy goto out_doi; 1639eda14cbcSMatt Macy 1640eda14cbcSMatt Macy 
error = dmu_object_info(os, ZVOL_OBJ, doi); 1641eda14cbcSMatt Macy if (error) 1642eda14cbcSMatt Macy goto out_dmu_objset_disown; 1643eda14cbcSMatt Macy 1644eda14cbcSMatt Macy error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); 1645eda14cbcSMatt Macy if (error) 1646eda14cbcSMatt Macy goto out_dmu_objset_disown; 1647eda14cbcSMatt Macy 1648e2df9bb4SMartin Matuska zv = zvol_alloc(MKDEV(zvol_major, minor), name, 1649e2df9bb4SMartin Matuska doi->doi_data_block_size); 1650eda14cbcSMatt Macy if (zv == NULL) { 1651eda14cbcSMatt Macy error = SET_ERROR(EAGAIN); 1652eda14cbcSMatt Macy goto out_dmu_objset_disown; 1653eda14cbcSMatt Macy } 1654eda14cbcSMatt Macy zv->zv_hash = hash; 1655eda14cbcSMatt Macy 1656eda14cbcSMatt Macy if (dmu_objset_is_snapshot(os)) 1657eda14cbcSMatt Macy zv->zv_flags |= ZVOL_RDONLY; 1658eda14cbcSMatt Macy 1659eda14cbcSMatt Macy zv->zv_volsize = volsize; 1660eda14cbcSMatt Macy zv->zv_objset = os; 1661eda14cbcSMatt Macy 1662f8b1db88SMartin Matuska /* Default */ 1663f8b1db88SMartin Matuska zv->zv_threading = B_TRUE; 1664f8b1db88SMartin Matuska if (dsl_prop_get_integer(name, "volthreading", &volthreading, NULL) 1665f8b1db88SMartin Matuska == 0) 1666f8b1db88SMartin Matuska zv->zv_threading = volthreading; 1667f8b1db88SMartin Matuska 1668eda14cbcSMatt Macy set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9); 1669eda14cbcSMatt Macy 1670e3aa18adSMartin Matuska #ifdef QUEUE_FLAG_DISCARD 1671eda14cbcSMatt Macy blk_queue_flag_set(QUEUE_FLAG_DISCARD, zv->zv_zso->zvo_queue); 1672e3aa18adSMartin Matuska #endif 1673eda14cbcSMatt Macy #ifdef QUEUE_FLAG_NONROT 1674eda14cbcSMatt Macy blk_queue_flag_set(QUEUE_FLAG_NONROT, zv->zv_zso->zvo_queue); 1675eda14cbcSMatt Macy #endif 1676eda14cbcSMatt Macy #ifdef QUEUE_FLAG_ADD_RANDOM 1677eda14cbcSMatt Macy blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zv->zv_zso->zvo_queue); 1678eda14cbcSMatt Macy #endif 1679eda14cbcSMatt Macy /* This flag was introduced in kernel version 4.12. */ 1680eda14cbcSMatt Macy #ifdef QUEUE_FLAG_SCSI_PASSTHROUGH 1681eda14cbcSMatt Macy blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_zso->zvo_queue); 1682eda14cbcSMatt Macy #endif 1683eda14cbcSMatt Macy 1684271171e0SMartin Matuska ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL); 1685271171e0SMartin Matuska error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset); 1686271171e0SMartin Matuska if (error) 1687271171e0SMartin Matuska goto out_dmu_objset_disown; 16889db44a8eSMartin Matuska ASSERT3P(zv->zv_zilog, ==, NULL); 1689271171e0SMartin Matuska zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums); 1690eda14cbcSMatt Macy if (spa_writeable(dmu_objset_spa(os))) { 1691eda14cbcSMatt Macy if (zil_replay_disable) 1692dbd5678dSMartin Matuska replayed_zil = zil_destroy(zv->zv_zilog, B_FALSE); 1693eda14cbcSMatt Macy else 1694dbd5678dSMartin Matuska replayed_zil = zil_replay(os, zv, zvol_replay_vector); 1695eda14cbcSMatt Macy } 1696dbd5678dSMartin Matuska if (replayed_zil) 16979db44a8eSMartin Matuska zil_close(zv->zv_zilog); 16989db44a8eSMartin Matuska zv->zv_zilog = NULL; 1699eda14cbcSMatt Macy 1700eda14cbcSMatt Macy /* 1701eda14cbcSMatt Macy * When udev detects the addition of the device it will immediately 1702eda14cbcSMatt Macy * invoke blkid(8) to determine the type of content on the device. 1703eda14cbcSMatt Macy * Prefetching the blocks commonly scanned by blkid(8) will speed 1704eda14cbcSMatt Macy * up this process. 
1705eda14cbcSMatt Macy */ 1706be181ee2SMartin Matuska len = MIN(zvol_prefetch_bytes, SPA_MAXBLOCKSIZE); 1707eda14cbcSMatt Macy if (len > 0) { 1708eda14cbcSMatt Macy dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_SYNC_READ); 1709eda14cbcSMatt Macy dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len, 1710eda14cbcSMatt Macy ZIO_PRIORITY_SYNC_READ); 1711eda14cbcSMatt Macy } 1712eda14cbcSMatt Macy 1713eda14cbcSMatt Macy zv->zv_objset = NULL; 1714eda14cbcSMatt Macy out_dmu_objset_disown: 1715eda14cbcSMatt Macy dmu_objset_disown(os, B_TRUE, FTAG); 1716eda14cbcSMatt Macy out_doi: 1717eda14cbcSMatt Macy kmem_free(doi, sizeof (dmu_object_info_t)); 1718eda14cbcSMatt Macy 1719eda14cbcSMatt Macy /* 1720eda14cbcSMatt Macy * Keep in mind that once add_disk() is called, the zvol is 1721eda14cbcSMatt Macy * announced to the world, and zvol_open()/zvol_release() can 1722eda14cbcSMatt Macy * be called at any time. Incidentally, add_disk() itself calls 1723eda14cbcSMatt Macy * zvol_open()->zvol_first_open() and zvol_release()->zvol_last_close() 1724eda14cbcSMatt Macy * directly as well. 1725eda14cbcSMatt Macy */ 1726eda14cbcSMatt Macy if (error == 0) { 1727eda14cbcSMatt Macy rw_enter(&zvol_state_lock, RW_WRITER); 1728eda14cbcSMatt Macy zvol_insert(zv); 1729eda14cbcSMatt Macy rw_exit(&zvol_state_lock); 173075e1fea6SMartin Matuska error = zvol_os_add_disk(zv->zv_zso->zvo_disk); 1731eda14cbcSMatt Macy } else { 1732eda14cbcSMatt Macy ida_simple_remove(&zvol_ida, idx); 1733eda14cbcSMatt Macy } 1734eda14cbcSMatt Macy 1735eda14cbcSMatt Macy return (error); 1736eda14cbcSMatt Macy } 1737eda14cbcSMatt Macy 1738c03c5b1cSMartin Matuska void 1739c03c5b1cSMartin Matuska zvol_os_rename_minor(zvol_state_t *zv, const char *newname) 1740eda14cbcSMatt Macy { 1741eda14cbcSMatt Macy int readonly = get_disk_ro(zv->zv_zso->zvo_disk); 1742eda14cbcSMatt Macy 1743eda14cbcSMatt Macy ASSERT(RW_LOCK_HELD(&zvol_state_lock)); 1744eda14cbcSMatt Macy ASSERT(MUTEX_HELD(&zv->zv_state_lock)); 1745eda14cbcSMatt Macy 1746eda14cbcSMatt Macy strlcpy(zv->zv_name, newname, sizeof (zv->zv_name)); 1747eda14cbcSMatt Macy 1748eda14cbcSMatt Macy /* move to new hashtable entry */ 1749b985c9caSMartin Matuska zv->zv_hash = zvol_name_hash(newname); 1750eda14cbcSMatt Macy hlist_del(&zv->zv_hlink); 1751eda14cbcSMatt Macy hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash)); 1752eda14cbcSMatt Macy 1753eda14cbcSMatt Macy /* 1754eda14cbcSMatt Macy * The block device's read-only state is briefly changed causing 1755eda14cbcSMatt Macy * a KOBJ_CHANGE uevent to be issued. This ensures udev detects 1756eda14cbcSMatt Macy * the name change and fixes the symlinks. This does not change 1757eda14cbcSMatt Macy * ZVOL_RDONLY in zv->zv_flags so the actual read-only state never 1758eda14cbcSMatt Macy * changes. This would normally be done using kobject_uevent() but 1759eda14cbcSMatt Macy * that is a GPL-only symbol which is why we need this workaround. 
1760eda14cbcSMatt Macy */ 1761eda14cbcSMatt Macy set_disk_ro(zv->zv_zso->zvo_disk, !readonly); 1762eda14cbcSMatt Macy set_disk_ro(zv->zv_zso->zvo_disk, readonly); 176314c2e0a0SMartin Matuska 176414c2e0a0SMartin Matuska dataset_kstats_rename(&zv->zv_kstat, newname); 1765eda14cbcSMatt Macy } 1766eda14cbcSMatt Macy 1767c03c5b1cSMartin Matuska void 1768c03c5b1cSMartin Matuska zvol_os_set_disk_ro(zvol_state_t *zv, int flags) 1769eda14cbcSMatt Macy { 1770eda14cbcSMatt Macy 1771eda14cbcSMatt Macy set_disk_ro(zv->zv_zso->zvo_disk, flags); 1772eda14cbcSMatt Macy } 1773eda14cbcSMatt Macy 1774c03c5b1cSMartin Matuska void 1775c03c5b1cSMartin Matuska zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity) 1776eda14cbcSMatt Macy { 1777eda14cbcSMatt Macy 1778eda14cbcSMatt Macy set_capacity(zv->zv_zso->zvo_disk, capacity); 1779eda14cbcSMatt Macy } 1780eda14cbcSMatt Macy 1781eda14cbcSMatt Macy int 1782eda14cbcSMatt Macy zvol_init(void) 1783eda14cbcSMatt Macy { 1784eda14cbcSMatt Macy int error; 17851f1e2261SMartin Matuska 17861f1e2261SMartin Matuska /* 17871f1e2261SMartin Matuska * zvol_threads is the module param the user passes in. 17881f1e2261SMartin Matuska * 17891f1e2261SMartin Matuska * zvol_actual_threads is what we use internally, since the user can 17901f1e2261SMartin Matuska * pass zvol_threads = 0 to mean "use all the CPUs" (the default). 17911f1e2261SMartin Matuska */ 17921f1e2261SMartin Matuska static unsigned int zvol_actual_threads; 17931f1e2261SMartin Matuska 17941f1e2261SMartin Matuska if (zvol_threads == 0) { 17951f1e2261SMartin Matuska /* 17961f1e2261SMartin Matuska * See dde9380a1 for why 32 was chosen here. This should 17971f1e2261SMartin Matuska * probably be refined to be some multiple of the number 17981f1e2261SMartin Matuska * of CPUs. 17991f1e2261SMartin Matuska */ 18001f1e2261SMartin Matuska zvol_actual_threads = MAX(num_online_cpus(), 32); 18011f1e2261SMartin Matuska } else { 18021f1e2261SMartin Matuska zvol_actual_threads = MIN(MAX(zvol_threads, 1), 1024); 18031f1e2261SMartin Matuska } 1804eda14cbcSMatt Macy 18051719886fSMartin Matuska /* 18061719886fSMartin Matuska * Use at least 32 zvol_threads, but on many-core systems 18071719886fSMartin Matuska * prefer 6 threads per taskq, with no more taskqs 18081719886fSMartin Matuska * than threads in them on large systems. 18091719886fSMartin Matuska * 18101719886fSMartin Matuska * taskq total 18111719886fSMartin Matuska * cpus taskqs threads threads 18121719886fSMartin Matuska * ------- ------- ------- ------- 18131719886fSMartin Matuska * 1 1 32 32 18141719886fSMartin Matuska * 2 1 32 32 18151719886fSMartin Matuska * 4 1 32 32 18161719886fSMartin Matuska * 8 2 16 32 18171719886fSMartin Matuska * 16 3 11 33 18181719886fSMartin Matuska * 32 5 7 35 18191719886fSMartin Matuska * 64 8 8 64 18201719886fSMartin Matuska * 128 11 12 132 18211719886fSMartin Matuska * 256 16 16 256 18221719886fSMartin Matuska */
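/*
 * As an assumed worked example for the 16-CPU row above: num_tqs =
 * 1 + 16 / 6 = 3 (and 3 * 3 = 9 <= 32, so it stands); per_tq_thread =
 * 32 / 3 = 10, bumped to 11 because 10 * 3 < 32; giving 3 taskqs of
 * 11 threads each, 33 threads total.
 */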
18231719886fSMartin Matuska zv_taskq_t *ztqs = &zvol_taskqs; 18241719886fSMartin Matuska uint_t num_tqs = MIN(num_online_cpus(), zvol_num_taskqs); 18251719886fSMartin Matuska if (num_tqs == 0) { 18261719886fSMartin Matuska num_tqs = 1 + num_online_cpus() / 6; 18271719886fSMartin Matuska while (num_tqs * num_tqs > zvol_actual_threads) 18281719886fSMartin Matuska num_tqs--; 18291719886fSMartin Matuska } 18301719886fSMartin Matuska uint_t per_tq_thread = zvol_actual_threads / num_tqs; 18311719886fSMartin Matuska if (per_tq_thread * num_tqs < zvol_actual_threads) 18321719886fSMartin Matuska per_tq_thread++; 18331719886fSMartin Matuska ztqs->tqs_cnt = num_tqs; 18341719886fSMartin Matuska ztqs->tqs_taskq = kmem_alloc(num_tqs * sizeof (taskq_t *), KM_SLEEP); 1835eda14cbcSMatt Macy error = register_blkdev(zvol_major, ZVOL_DRIVER); 1836eda14cbcSMatt Macy if (error) { 18371719886fSMartin Matuska kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * sizeof (taskq_t *)); 18381719886fSMartin Matuska ztqs->tqs_taskq = NULL; 1839eda14cbcSMatt Macy printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error); 1840eda14cbcSMatt Macy return (error); 1841eda14cbcSMatt Macy } 18421f1e2261SMartin Matuska 18431f1e2261SMartin Matuska if (zvol_blk_mq_queue_depth == 0) { 18441f1e2261SMartin Matuska zvol_actual_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ; 18451f1e2261SMartin Matuska } else { 18461f1e2261SMartin Matuska zvol_actual_blk_mq_queue_depth = 18471f1e2261SMartin Matuska MAX(zvol_blk_mq_queue_depth, BLKDEV_MIN_RQ); 18481f1e2261SMartin Matuska } 18491f1e2261SMartin Matuska 18501f1e2261SMartin Matuska if (zvol_blk_mq_threads == 0) { 18511f1e2261SMartin Matuska zvol_blk_mq_actual_threads = num_online_cpus(); 18521f1e2261SMartin Matuska } else { 18531f1e2261SMartin Matuska zvol_blk_mq_actual_threads = MIN(MAX(zvol_blk_mq_threads, 1), 18541f1e2261SMartin Matuska 1024); 18551f1e2261SMartin Matuska } 18567a7741afSMartin Matuska 18571719886fSMartin Matuska for (uint_t i = 0; i < num_tqs; i++) { 18581719886fSMartin Matuska char name[32]; 18591719886fSMartin Matuska (void) snprintf(name, sizeof (name), "%s_tq-%u", 18601719886fSMartin Matuska ZVOL_DRIVER, i); 18611719886fSMartin Matuska ztqs->tqs_taskq[i] = taskq_create(name, per_tq_thread, 18621719886fSMartin Matuska maxclsyspri, per_tq_thread, INT_MAX, 18631719886fSMartin Matuska TASKQ_PREPOPULATE | TASKQ_DYNAMIC); 18641719886fSMartin Matuska if (ztqs->tqs_taskq[i] == NULL) { 18651719886fSMartin Matuska for (int j = i - 1; j >= 0; j--) 18661719886fSMartin Matuska taskq_destroy(ztqs->tqs_taskq[j]); 1867eda14cbcSMatt Macy unregister_blkdev(zvol_major, ZVOL_DRIVER); 18681719886fSMartin Matuska kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * 18691719886fSMartin Matuska sizeof (taskq_t *)); 18701719886fSMartin Matuska ztqs->tqs_taskq = NULL; 1871eda14cbcSMatt Macy return (-ENOMEM); 1872eda14cbcSMatt Macy } 18731719886fSMartin Matuska } 18741f1e2261SMartin Matuska 1875eda14cbcSMatt Macy zvol_init_impl();
1876eda14cbcSMatt Macy ida_init(&zvol_ida); 1877eda14cbcSMatt Macy return (0); 1878eda14cbcSMatt Macy } 1879eda14cbcSMatt Macy 1880eda14cbcSMatt Macy void 1881eda14cbcSMatt Macy zvol_fini(void) 1882eda14cbcSMatt Macy { 18831719886fSMartin Matuska zv_taskq_t *ztqs = &zvol_taskqs; 1884eda14cbcSMatt Macy zvol_fini_impl(); 1885eda14cbcSMatt Macy unregister_blkdev(zvol_major, ZVOL_DRIVER); 18861719886fSMartin Matuska 18871719886fSMartin Matuska if (ztqs->tqs_taskq == NULL) { 18881719886fSMartin Matuska ASSERT3U(ztqs->tqs_cnt, ==, 0); 18891719886fSMartin Matuska } else { 18901719886fSMartin Matuska for (uint_t i = 0; i < ztqs->tqs_cnt; i++) { 18911719886fSMartin Matuska ASSERT3P(ztqs->tqs_taskq[i], !=, NULL); 18921719886fSMartin Matuska taskq_destroy(ztqs->tqs_taskq[i]); 18931719886fSMartin Matuska } 18941719886fSMartin Matuska kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * 18951719886fSMartin Matuska sizeof (taskq_t *)); 18961719886fSMartin Matuska ztqs->tqs_taskq = NULL; 18971719886fSMartin Matuska } 18981719886fSMartin Matuska 1899eda14cbcSMatt Macy ida_destroy(&zvol_ida); 1900eda14cbcSMatt Macy } 1901eda14cbcSMatt Macy 1902eda14cbcSMatt Macy module_param(zvol_inhibit_dev, uint, 0644); 1903eda14cbcSMatt Macy MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes"); 1904eda14cbcSMatt Macy 1905eda14cbcSMatt Macy module_param(zvol_major, uint, 0444); 1906eda14cbcSMatt Macy MODULE_PARM_DESC(zvol_major, "Major number for zvol device"); 1907eda14cbcSMatt Macy 1908eda14cbcSMatt Macy module_param(zvol_threads, uint, 0444); 19091f1e2261SMartin Matuska MODULE_PARM_DESC(zvol_threads, "Number of threads to handle I/O requests. Set " 19101f1e2261SMartin Matuska "to 0 to use all active CPUs"); 1911eda14cbcSMatt Macy 1912eda14cbcSMatt Macy module_param(zvol_request_sync, uint, 0644); 1913eda14cbcSMatt Macy MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests"); 1914eda14cbcSMatt Macy 1915eda14cbcSMatt Macy module_param(zvol_max_discard_blocks, ulong, 0444); 1916eda14cbcSMatt Macy MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard"); 1917eda14cbcSMatt Macy 19181719886fSMartin Matuska module_param(zvol_num_taskqs, uint, 0444); 19191719886fSMartin Matuska MODULE_PARM_DESC(zvol_num_taskqs, "Number of zvol taskqs"); 19201719886fSMartin Matuska 1921eda14cbcSMatt Macy module_param(zvol_prefetch_bytes, uint, 0644); 1922eda14cbcSMatt Macy MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end"); 1923eda14cbcSMatt Macy 1924eda14cbcSMatt Macy module_param(zvol_volmode, uint, 0644); 1925eda14cbcSMatt Macy MODULE_PARM_DESC(zvol_volmode, "Default volmode property value"); 19261f1e2261SMartin Matuska 19276c1e79dfSMartin Matuska module_param(zvol_blk_mq_queue_depth, uint, 0644); 19286c1e79dfSMartin Matuska MODULE_PARM_DESC(zvol_blk_mq_queue_depth, "Default blk-mq queue depth"); 19296c1e79dfSMartin Matuska 19306c1e79dfSMartin Matuska module_param(zvol_use_blk_mq, uint, 0644); 19316c1e79dfSMartin Matuska MODULE_PARM_DESC(zvol_use_blk_mq, "Use the blk-mq API for zvols"); 19326c1e79dfSMartin Matuska 19336c1e79dfSMartin Matuska module_param(zvol_blk_mq_blocks_per_thread, uint, 0644); 19346c1e79dfSMartin Matuska MODULE_PARM_DESC(zvol_blk_mq_blocks_per_thread, 19356c1e79dfSMartin Matuska "Process volblocksize blocks per thread"); 19366c1e79dfSMartin Matuska 19370a97523dSMartin Matuska #ifndef HAVE_BLKDEV_GET_ERESTARTSYS 19380a97523dSMartin Matuska module_param(zvol_open_timeout_ms, uint, 0644); 19390a97523dSMartin Matuska MODULE_PARM_DESC(zvol_open_timeout_ms,
"Timeout for ZVOL open retries"); 19400a97523dSMartin Matuska #endif 1941