/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC.
 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
 * LLNL-CODE-403049.
 * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
 * Copyright (c) 2023, 2024, Klara Inc.
 */

#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_trim.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <linux/blkpg.h>
#include <linux/msdos_fs.h>
#include <linux/vfs_compat.h>
#include <linux/blk-cgroup.h>

/*
 * Linux 6.8.x uses a bdev_handle as an instance/refcount for an underlying
 * block_device. Since it carries the block_device inside, it's convenient to
 * just use the handle as a proxy.
 *
 * Linux 6.9.x uses a file for the same purpose.
 *
 * For pre-6.8, we just emulate this with a cast, since we don't need any of
 * the other fields inside the handle.
 */
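/*
 * The BDH_* macros below give the rest of this file a single way to work
 * with whichever representation the running kernel uses: open paths check
 * BDH_IS_ERR()/BDH_PTR_ERR() on the value returned by
 * vdev_blkdev_get_by_path(), and BDH_BDEV() recovers the underlying
 * struct block_device for the actual I/O calls.
 */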
#if defined(HAVE_BDEV_OPEN_BY_PATH)
typedef struct bdev_handle zfs_bdev_handle_t;
#define	BDH_BDEV(bdh)		((bdh)->bdev)
#define	BDH_IS_ERR(bdh)		(IS_ERR(bdh))
#define	BDH_PTR_ERR(bdh)	(PTR_ERR(bdh))
#define	BDH_ERR_PTR(err)	(ERR_PTR(err))
#elif defined(HAVE_BDEV_FILE_OPEN_BY_PATH)
typedef struct file zfs_bdev_handle_t;
#define	BDH_BDEV(bdh)		(file_bdev(bdh))
#define	BDH_IS_ERR(bdh)		(IS_ERR(bdh))
#define	BDH_PTR_ERR(bdh)	(PTR_ERR(bdh))
#define	BDH_ERR_PTR(err)	(ERR_PTR(err))
#else
typedef void zfs_bdev_handle_t;
#define	BDH_BDEV(bdh)		((struct block_device *)bdh)
#define	BDH_IS_ERR(bdh)		(IS_ERR(BDH_BDEV(bdh)))
#define	BDH_PTR_ERR(bdh)	(PTR_ERR(BDH_BDEV(bdh)))
#define	BDH_ERR_PTR(err)	(ERR_PTR(err))
#endif

typedef struct vdev_disk {
	zfs_bdev_handle_t *vd_bdh;
	krwlock_t vd_lock;
} vdev_disk_t;

/*
 * Maximum number of segments to add to a bio (min 4). If this is higher than
 * the maximum allowed by the device queue or the kernel itself, it will be
 * clamped. Setting it to zero will cause the kernel's ideal size to be used.
 */
uint_t zfs_vdev_disk_max_segs = 0;

/*
 * Unique identifier for the exclusive vdev holder.
 */
static void *zfs_vdev_holder = VDEV_HOLDER;

/*
 * Wait up to zfs_vdev_open_timeout_ms milliseconds before determining the
 * device is missing. The missing path may be transient since the links
 * can be briefly removed and recreated in response to udev events.
 */
static uint_t zfs_vdev_open_timeout_ms = 1000;

/*
 * Size of the "reserved" partition, in blocks.
 */
#define	EFI_MIN_RESV_SIZE	(16 * 1024)

/*
 * BIO request failfast mask.
 */

static unsigned int zfs_vdev_failfast_mask = 1;

/*
 * Convert SPA mode flags into bdev open mode flags.
 */
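/*
 * Note that the device is always opened exclusively (BLK_OPEN_EXCL or
 * FMODE_EXCL), and SPA_MODE_READ/SPA_MODE_WRITE map directly onto the
 * corresponding bdev read/write flags, so e.g. a read-only open of the pool
 * opens the underlying device without write access.
 */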
#ifdef HAVE_BLK_MODE_T
typedef blk_mode_t vdev_bdev_mode_t;
#define	VDEV_BDEV_MODE_READ	BLK_OPEN_READ
#define	VDEV_BDEV_MODE_WRITE	BLK_OPEN_WRITE
#define	VDEV_BDEV_MODE_EXCL	BLK_OPEN_EXCL
#define	VDEV_BDEV_MODE_MASK	(BLK_OPEN_READ|BLK_OPEN_WRITE|BLK_OPEN_EXCL)
#else
typedef fmode_t vdev_bdev_mode_t;
#define	VDEV_BDEV_MODE_READ	FMODE_READ
#define	VDEV_BDEV_MODE_WRITE	FMODE_WRITE
#define	VDEV_BDEV_MODE_EXCL	FMODE_EXCL
#define	VDEV_BDEV_MODE_MASK	(FMODE_READ|FMODE_WRITE|FMODE_EXCL)
#endif

static vdev_bdev_mode_t
vdev_bdev_mode(spa_mode_t smode)
{
	ASSERT3U(smode, !=, SPA_MODE_UNINIT);
	ASSERT0(smode & ~(SPA_MODE_READ|SPA_MODE_WRITE));

	vdev_bdev_mode_t bmode = VDEV_BDEV_MODE_EXCL;

	if (smode & SPA_MODE_READ)
		bmode |= VDEV_BDEV_MODE_READ;

	if (smode & SPA_MODE_WRITE)
		bmode |= VDEV_BDEV_MODE_WRITE;

	ASSERT(bmode & VDEV_BDEV_MODE_MASK);
	ASSERT0(bmode & ~VDEV_BDEV_MODE_MASK);

	return (bmode);
}

/*
 * Returns the usable capacity (in bytes) for the partition or disk.
 */
static uint64_t
bdev_capacity(struct block_device *bdev)
{
#ifdef HAVE_BDEV_NR_BYTES
	return (bdev_nr_bytes(bdev));
#else
	return (i_size_read(bdev->bd_inode));
#endif
}

#if !defined(HAVE_BDEV_WHOLE)
static inline struct block_device *
bdev_whole(struct block_device *bdev)
{
	return (bdev->bd_contains);
}
#endif

#if defined(HAVE_BDEVNAME)
#define	vdev_bdevname(bdev, name)	bdevname(bdev, name)
#else
static inline void
vdev_bdevname(struct block_device *bdev, char *name)
{
	snprintf(name, BDEVNAME_SIZE, "%pg", bdev);
}
#endif

/*
 * Returns the maximum expansion capacity of the block device (in bytes).
 *
 * It is possible to expand a vdev when it has been created as a wholedisk
 * and the containing block device has increased in capacity.
 * It can also be expanded when the partition containing the pool has been
 * manually increased in size.
 *
 * This function is only responsible for calculating the potential expansion
 * size so it can be reported by 'zpool list'. efi_use_whole_disk() is
 * responsible for verifying the expected partition layout in the wholedisk
 * case, and updating the partition table if appropriate. Once the partition
 * size has been increased the additional capacity will be visible using
 * bdev_capacity().
 *
 * The returned maximum expansion capacity is always expected to be larger
 * than, or at the very least equal to, the usable capacity to prevent
 * overestimating the pool expandsize.
 */
static uint64_t
bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk)
{
	uint64_t psize;
	int64_t available;

	if (wholedisk && bdev != bdev_whole(bdev)) {
		/*
		 * When reporting maximum expansion capacity for a wholedisk
		 * deduct any capacity which is expected to be lost due to
		 * alignment restrictions. Over-reporting this value isn't
		 * harmful and would only result in slightly less capacity
		 * than expected post expansion.
		 * The estimated available space may be slightly smaller than
		 * bdev_capacity() for devices where the number of sectors is
		 * not a multiple of the alignment size and the partition layout
		 * is keeping less than PARTITION_END_ALIGNMENT bytes after the
		 * "reserved" EFI partition: in such cases return the device
		 * usable capacity.
		 */
		available = bdev_capacity(bdev_whole(bdev)) -
		    ((EFI_MIN_RESV_SIZE + NEW_START_BLOCK +
		    PARTITION_END_ALIGNMENT) << SECTOR_BITS);
		psize = MAX(available, bdev_capacity(bdev));
	} else {
		psize = bdev_capacity(bdev);
	}

	return (psize);
}

static void
vdev_disk_error(zio_t *zio)
{
	/*
	 * This function can be called in interrupt context, for instance while
	 * handling IRQs coming from a misbehaving disk device; use printk()
	 * which is safe from any context.
	 */
	printk(KERN_WARNING "zio pool=%s vdev=%s error=%d type=%d "
	    "offset=%llu size=%llu flags=%llu\n", spa_name(zio->io_spa),
	    zio->io_vd->vdev_path, zio->io_error, zio->io_type,
	    (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size,
	    zio->io_flags);
}

static void
vdev_disk_kobj_evt_post(vdev_t *v)
{
	vdev_disk_t *vd = v->vdev_tsd;
	if (vd && vd->vd_bdh) {
		spl_signal_kobj_evt(BDH_BDEV(vd->vd_bdh));
	} else {
		vdev_dbgmsg(v, "vdev_disk_t is NULL for VDEV:%s\n",
		    v->vdev_path);
	}
}

static zfs_bdev_handle_t *
vdev_blkdev_get_by_path(const char *path, spa_mode_t smode, void *holder)
{
	vdev_bdev_mode_t bmode = vdev_bdev_mode(smode);

#if defined(HAVE_BDEV_FILE_OPEN_BY_PATH)
	return (bdev_file_open_by_path(path, bmode, holder, NULL));
#elif defined(HAVE_BDEV_OPEN_BY_PATH)
	return (bdev_open_by_path(path, bmode, holder, NULL));
#elif defined(HAVE_BLKDEV_GET_BY_PATH_4ARG)
	return (blkdev_get_by_path(path, bmode, holder, NULL));
#else
	return (blkdev_get_by_path(path, bmode, holder));
#endif
}

static void
vdev_blkdev_put(zfs_bdev_handle_t *bdh, spa_mode_t smode, void *holder)
{
#if defined(HAVE_BDEV_RELEASE)
	return (bdev_release(bdh));
#elif defined(HAVE_BLKDEV_PUT_HOLDER)
	return (blkdev_put(BDH_BDEV(bdh), holder));
#elif defined(HAVE_BLKDEV_PUT)
	return (blkdev_put(BDH_BDEV(bdh), vdev_bdev_mode(smode)));
#else
	fput(bdh);
#endif
}

static int
vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
    uint64_t *logical_ashift, uint64_t *physical_ashift)
{
	zfs_bdev_handle_t *bdh;
	spa_mode_t smode = spa_mode(v->vdev_spa);
	hrtime_t timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms);
	vdev_disk_t *vd;

	/* Must have a pathname and it must be absolute. */
	if (v->vdev_path == NULL || v->vdev_path[0] != '/') {
		v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		vdev_dbgmsg(v, "invalid vdev_path");
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Reopen the device if it is currently open. When expanding a
	 * partition, force re-scanning the partition table if userland
	 * did not take care of this already. We need to do this while closed
	 * in order to get an accurate updated block device size. Then,
	 * since udev may need to recreate the device links, increase the
	 * open retry timeout before reporting the device as unavailable.
	 */
	vd = v->vdev_tsd;
	if (vd) {
		char disk_name[BDEVNAME_SIZE + 6] = "/dev/";
		boolean_t reread_part = B_FALSE;

		rw_enter(&vd->vd_lock, RW_WRITER);
		bdh = vd->vd_bdh;
		vd->vd_bdh = NULL;

		if (bdh) {
			struct block_device *bdev = BDH_BDEV(bdh);
			if (v->vdev_expanding && bdev != bdev_whole(bdev)) {
				vdev_bdevname(bdev_whole(bdev), disk_name + 5);
				/*
				 * If userland has BLKPG_RESIZE_PARTITION,
				 * then it should have updated the partition
				 * table already. We can detect this by
				 * comparing our current physical size
				 * with that of the device. If they are
				 * the same, then we must not have
				 * BLKPG_RESIZE_PARTITION or it failed to
				 * update the partition table online. We
				 * fallback to rescanning the partition
				 * table from the kernel below. However,
				 * if the capacity already reflects the
				 * updated partition, then we skip
				 * rescanning the partition table here.
				 */
				if (v->vdev_psize == bdev_capacity(bdev))
					reread_part = B_TRUE;
			}

			vdev_blkdev_put(bdh, smode, zfs_vdev_holder);
		}

		if (reread_part) {
			bdh = vdev_blkdev_get_by_path(disk_name, smode,
			    zfs_vdev_holder);
			if (!BDH_IS_ERR(bdh)) {
				int error =
				    vdev_bdev_reread_part(BDH_BDEV(bdh));
				vdev_blkdev_put(bdh, smode, zfs_vdev_holder);
				if (error == 0) {
					timeout = MSEC2NSEC(
					    zfs_vdev_open_timeout_ms * 2);
				}
			}
		}
	} else {
		vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);

		rw_init(&vd->vd_lock, NULL, RW_DEFAULT, NULL);
		rw_enter(&vd->vd_lock, RW_WRITER);
	}

	/*
	 * Devices are always opened by the path provided at configuration
	 * time. This means that if the provided path is a udev by-id path
	 * then drives may be re-cabled without an issue. If the provided
	 * path is a udev by-path path, then the physical location information
	 * will be preserved. This can be critical for more complicated
	 * configurations where drives are located in specific physical
	 * locations to maximize the system's tolerance to component failure.
	 *
	 * Alternatively, you can provide your own udev rule to flexibly map
	 * the drives as you see fit. It is not advised that you use the
	 * /dev/[hd]d devices which may be reordered due to probing order.
	 * Devices in the wrong locations will be detected by the higher
	 * level vdev validation.
	 *
	 * The specified paths may be briefly removed and recreated in
	 * response to udev events. This should be exceptionally unlikely
	 * because the zpool command makes every effort to verify these paths
	 * have already settled prior to reaching this point. Therefore,
	 * an ENOENT failure at this point is highly likely to be transient
	 * and it is reasonable to sleep and retry before giving up. In
	 * practice delays have been observed to be on the order of 100ms.
	 *
	 * When ERESTARTSYS is returned it indicates the block device is
	 * a zvol which could not be opened due to the deadlock detection
	 * logic in zvol_open(). Extend the timeout and retry the open;
	 * subsequent attempts are expected to eventually succeed.
	 */
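	/*
	 * The loop below polls the device roughly every 10ms until the open
	 * succeeds, the timeout expires, or a hard error (anything other
	 * than ENOENT or ERESTARTSYS) is returned.
	 */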
	hrtime_t start = gethrtime();
	bdh = BDH_ERR_PTR(-ENXIO);
	while (BDH_IS_ERR(bdh) && ((gethrtime() - start) < timeout)) {
		bdh = vdev_blkdev_get_by_path(v->vdev_path, smode,
		    zfs_vdev_holder);
		if (unlikely(BDH_PTR_ERR(bdh) == -ENOENT)) {
			/*
			 * There is no point in waiting since the device
			 * was removed explicitly.
			 */
			if (v->vdev_removed)
				break;

			schedule_timeout_interruptible(MSEC_TO_TICK(10));
		} else if (unlikely(BDH_PTR_ERR(bdh) == -ERESTARTSYS)) {
			timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms * 10);
			continue;
		} else if (BDH_IS_ERR(bdh)) {
			break;
		}
	}

	if (BDH_IS_ERR(bdh)) {
		int error = -BDH_PTR_ERR(bdh);
		vdev_dbgmsg(v, "open error=%d timeout=%llu/%llu", error,
		    (u_longlong_t)(gethrtime() - start),
		    (u_longlong_t)timeout);
		vd->vd_bdh = NULL;
		v->vdev_tsd = vd;
		rw_exit(&vd->vd_lock);
		return (SET_ERROR(error));
	} else {
		vd->vd_bdh = bdh;
		v->vdev_tsd = vd;
		rw_exit(&vd->vd_lock);
	}

	struct block_device *bdev = BDH_BDEV(vd->vd_bdh);

	/* Determine the physical block size */
	int physical_block_size = bdev_physical_block_size(bdev);

	/* Determine the logical block size */
	int logical_block_size = bdev_logical_block_size(bdev);

	/*
	 * If the device has a write cache, clear the nowritecache flag,
	 * so that we start issuing flush requests again.
	 */
	v->vdev_nowritecache = !zfs_bdev_has_write_cache(bdev);

	/* Set when device reports it supports TRIM. */
	v->vdev_has_trim = bdev_discard_supported(bdev);

	/* Set when device reports it supports secure TRIM. */
	v->vdev_has_securetrim = bdev_secure_discard_supported(bdev);

	/* Inform the ZIO pipeline that we are non-rotational */
	v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(bdev));

	/* Physical volume size in bytes for the partition */
	*psize = bdev_capacity(bdev);

	/* Physical volume size in bytes including possible expansion space */
	*max_psize = bdev_max_capacity(bdev, v->vdev_wholedisk);

	/* Based on the minimum sector size set the block size */
	*physical_ashift = highbit64(MAX(physical_block_size,
	    SPA_MINBLOCKSIZE)) - 1;

	*logical_ashift = highbit64(MAX(logical_block_size,
	    SPA_MINBLOCKSIZE)) - 1;

	return (0);
}

static void
vdev_disk_close(vdev_t *v)
{
	vdev_disk_t *vd = v->vdev_tsd;

	if (v->vdev_reopening || vd == NULL)
		return;

	if (vd->vd_bdh != NULL)
		vdev_blkdev_put(vd->vd_bdh, spa_mode(v->vdev_spa),
		    zfs_vdev_holder);

	rw_destroy(&vd->vd_lock);
	kmem_free(vd, sizeof (vdev_disk_t));
	v->vdev_tsd = NULL;
}

/*
 * preempt_schedule_notrace is GPL-only which breaks the ZFS build, so
 * replace it with preempt_schedule under the following condition:
 */
#if defined(CONFIG_ARM64) && \
	defined(CONFIG_PREEMPTION) && \
	defined(CONFIG_BLK_CGROUP)
#define	preempt_schedule_notrace(x) preempt_schedule(x)
#endif

/*
 * As of the Linux 5.18 kernel, bio_alloc() expects a block_device struct
 * as an argument, removing the need to set it with bio_set_dev(). This
 * removes the need for all of the following compatibility code.
 */
#if !defined(HAVE_BIO_ALLOC_4ARG)

#if defined(CONFIG_BLK_CGROUP) && defined(HAVE_BIO_SET_DEV_GPL_ONLY)
/*
 * The Linux 5.5 kernel updated percpu_ref_tryget() which is inlined by
 * blkg_tryget() to use rcu_read_lock() instead of rcu_read_lock_sched().
 * As a side effect the function was converted to GPL-only. Define our
 * own version when needed which uses rcu_read_lock_sched().
 *
 * The Linux 5.17 kernel split linux/blk-cgroup.h into a private and a public
 * part, moving blkg_tryget into the private one. Define our own version.
 */
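/*
 * vdev_blkg_tryget() mirrors the kernel's blkg_tryget(): bump the percpu
 * counter while the reference is still in percpu mode, otherwise fall back
 * to the atomic reference count.
 */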
#if defined(HAVE_BLKG_TRYGET_GPL_ONLY) || !defined(HAVE_BLKG_TRYGET)
static inline bool
vdev_blkg_tryget(struct blkcg_gq *blkg)
{
	struct percpu_ref *ref = &blkg->refcnt;
	unsigned long __percpu *count;
	bool rc;

	rcu_read_lock_sched();

	if (__ref_is_percpu(ref, &count)) {
		this_cpu_inc(*count);
		rc = true;
	} else {
#ifdef ZFS_PERCPU_REF_COUNT_IN_DATA
		rc = atomic_long_inc_not_zero(&ref->data->count);
#else
		rc = atomic_long_inc_not_zero(&ref->count);
#endif
	}

	rcu_read_unlock_sched();

	return (rc);
}
#else
#define	vdev_blkg_tryget(bg)	blkg_tryget(bg)
#endif
#ifdef HAVE_BIO_SET_DEV_MACRO
/*
 * The Linux 5.0 kernel updated the bio_set_dev() macro so it calls the
 * GPL-only bio_associate_blkg() symbol thus inadvertently converting
 * the entire macro. Provide a minimal version which always assigns the
 * request queue's root_blkg to the bio.
 */
static inline void
vdev_bio_associate_blkg(struct bio *bio)
{
#if defined(HAVE_BIO_BDEV_DISK)
	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
#else
	struct request_queue *q = bio->bi_disk->queue;
#endif

	ASSERT3P(q, !=, NULL);
	ASSERT3P(bio->bi_blkg, ==, NULL);

	if (q->root_blkg && vdev_blkg_tryget(q->root_blkg))
		bio->bi_blkg = q->root_blkg;
}

#define	bio_associate_blkg vdev_bio_associate_blkg
#else
static inline void
vdev_bio_set_dev(struct bio *bio, struct block_device *bdev)
{
#if defined(HAVE_BIO_BDEV_DISK)
	struct request_queue *q = bdev->bd_disk->queue;
#else
	struct request_queue *q = bio->bi_disk->queue;
#endif
	bio_clear_flag(bio, BIO_REMAPPED);
	if (bio->bi_bdev != bdev)
		bio_clear_flag(bio, BIO_THROTTLED);
	bio->bi_bdev = bdev;

	ASSERT3P(q, !=, NULL);
	ASSERT3P(bio->bi_blkg, ==, NULL);

	if (q->root_blkg && vdev_blkg_tryget(q->root_blkg))
		bio->bi_blkg = q->root_blkg;
}
#define	bio_set_dev		vdev_bio_set_dev
#endif
#endif
#endif /* !HAVE_BIO_ALLOC_4ARG */

static inline void
vdev_submit_bio(struct bio *bio)
{
	struct bio_list *bio_list = current->bio_list;
	current->bio_list = NULL;
	(void) submit_bio(bio);
	current->bio_list = bio_list;
}

static inline struct bio *
vdev_bio_alloc(struct block_device *bdev, gfp_t gfp_mask,
    unsigned short nr_vecs)
{
	struct bio *bio;

#ifdef HAVE_BIO_ALLOC_4ARG
	bio = bio_alloc(bdev, nr_vecs, 0, gfp_mask);
#else
	bio = bio_alloc(gfp_mask, nr_vecs);
	if (likely(bio != NULL))
		bio_set_dev(bio, bdev);
#endif

	return (bio);
}

static inline uint_t
vdev_bio_max_segs(struct block_device *bdev)
{
	/*
	 * Smallest of the device max segs and the tuneable max segs. Minimum
	 * 4, so there's room to finish split pages if they come up.
	 */
	const uint_t dev_max_segs = queue_max_segments(bdev_get_queue(bdev));
	const uint_t tune_max_segs = (zfs_vdev_disk_max_segs > 0) ?
	    MAX(4, zfs_vdev_disk_max_segs) : dev_max_segs;
	const uint_t max_segs = MIN(tune_max_segs, dev_max_segs);

#ifdef HAVE_BIO_MAX_SEGS
	return (bio_max_segs(max_segs));
#else
	return (MIN(max_segs, BIO_MAX_PAGES));
#endif
}

static inline uint_t
vdev_bio_max_bytes(struct block_device *bdev)
{
	return (queue_max_sectors(bdev_get_queue(bdev)) << 9);
}


/*
 * Virtual block IO object (VBIO)
 *
 * Linux block IO (BIO) objects have a limit on how many data segments (pages)
 * they can hold. Depending on how they're allocated and structured, a large
 * ZIO can require more than one BIO to be submitted to the kernel, which then
 * all have to complete before we can return the completed ZIO back to ZFS.
 *
 * A VBIO is a wrapper around multiple BIOs, carrying everything needed to
 * translate a ZIO down into the kernel block layer and back again.
 *
 * Note that these are only used for data ZIOs (read/write). Meta-operations
 * (flush/trim) don't need multiple BIOs and so can just make the call
 * directly.
 */
typedef struct {
	zio_t		*vbio_zio;	/* parent zio */

	struct block_device *vbio_bdev;	/* blockdev to submit bios to */

	abd_t		*vbio_abd;	/* abd carrying borrowed linear buf */

	uint_t		vbio_max_segs;	/* max segs per bio */

	uint_t		vbio_max_bytes;	/* max bytes per bio */
	uint_t		vbio_lbs_mask;	/* logical block size mask */

	uint64_t	vbio_offset;	/* start offset of next bio */

	struct bio	*vbio_bio;	/* pointer to the current bio */
	int		vbio_flags;	/* bio flags */
} vbio_t;

static vbio_t *
vbio_alloc(zio_t *zio, struct block_device *bdev, int flags)
{
	vbio_t *vbio = kmem_zalloc(sizeof (vbio_t), KM_SLEEP);

	vbio->vbio_zio = zio;
	vbio->vbio_bdev = bdev;
	vbio->vbio_abd = NULL;
	vbio->vbio_max_segs = vdev_bio_max_segs(bdev);
	vbio->vbio_max_bytes = vdev_bio_max_bytes(bdev);
	vbio->vbio_lbs_mask = ~(bdev_logical_block_size(bdev)-1);
	vbio->vbio_offset = zio->io_offset;
	vbio->vbio_bio = NULL;
	vbio->vbio_flags = flags;

	return (vbio);
}

static void vbio_completion(struct bio *bio);

static int
vbio_add_page(vbio_t *vbio, struct page *page, uint_t size, uint_t offset)
{
	struct bio *bio = vbio->vbio_bio;
	uint_t ssize;

	while (size > 0) {
		if (bio == NULL) {
			/* New BIO, allocate and set up */
			bio = vdev_bio_alloc(vbio->vbio_bdev, GFP_NOIO,
			    vbio->vbio_max_segs);
			VERIFY(bio);

			BIO_BI_SECTOR(bio) = vbio->vbio_offset >> 9;
			bio_set_op_attrs(bio,
			    vbio->vbio_zio->io_type == ZIO_TYPE_WRITE ?
			    WRITE : READ, vbio->vbio_flags);

			if (vbio->vbio_bio) {
				bio_chain(vbio->vbio_bio, bio);
				vdev_submit_bio(vbio->vbio_bio);
			}
			vbio->vbio_bio = bio;
		}

		/*
		 * Only load as much of the current page data as will fit in
		 * the space left in the BIO, respecting lbs alignment. Older
		 * kernels will error if we try to overfill the BIO, while
		 * newer ones will accept it and split the BIO. This ensures
		 * everything works on older kernels, and avoids an additional
		 * overhead on the new.
		 */
		ssize = MIN(size, (vbio->vbio_max_bytes - BIO_BI_SIZE(bio)) &
		    vbio->vbio_lbs_mask);
		if (ssize > 0 &&
		    bio_add_page(bio, page, ssize, offset) == ssize) {
			/* Accepted, adjust and load any remaining. */
			size -= ssize;
			offset += ssize;
			continue;
		}

		/* No room, set up for a new BIO and loop */
		vbio->vbio_offset += BIO_BI_SIZE(bio);

		/* Signal new BIO allocation wanted */
		bio = NULL;
	}

	return (0);
}

/* Iterator callback to submit ABD pages to the vbio. */
static int
vbio_fill_cb(struct page *page, size_t off, size_t len, void *priv)
{
	vbio_t *vbio = priv;
	return (vbio_add_page(vbio, page, len, off));
}

/* Create some BIOs, fill them with data and submit them */
static void
vbio_submit(vbio_t *vbio, abd_t *abd, uint64_t size)
{
	/*
	 * We plug so we can submit the BIOs as we go and only unplug them when
	 * they are fully created and submitted. This is important; if we don't
	 * plug, then the kernel may start executing earlier BIOs while we're
	 * still creating and executing later ones, and if the device goes
	 * away while that's happening, older kernels can get confused and
	 * trample memory.
	 */
	struct blk_plug plug;
	blk_start_plug(&plug);

	(void) abd_iterate_page_func(abd, 0, size, vbio_fill_cb, vbio);
	ASSERT(vbio->vbio_bio);

	vbio->vbio_bio->bi_end_io = vbio_completion;
	vbio->vbio_bio->bi_private = vbio;

	/*
	 * Once submitted, vbio_bio now owns vbio (through bi_private) and we
	 * can't touch it again. The bio may complete and vbio_completion() be
	 * called and free the vbio before this task is run again, so we must
	 * consider it invalid from this point.
	 */
	vdev_submit_bio(vbio->vbio_bio);

	blk_finish_plug(&plug);
}

/* IO completion callback */
static void
vbio_completion(struct bio *bio)
{
	vbio_t *vbio = bio->bi_private;
	zio_t *zio = vbio->vbio_zio;

	ASSERT(zio);

	/* Capture and log any errors */
	zio->io_error = bi_status_to_errno(bio->bi_status);
	ASSERT3U(zio->io_error, >=, 0);

	if (zio->io_error)
		vdev_disk_error(zio);

	/* Return the BIO to the kernel */
	bio_put(bio);

	/*
	 * We're likely in an interrupt context so we can't do ABD/memory work
	 * here; instead we stash vbio on the zio and take care of it in the
	 * done callback.
	 */
	ASSERT3P(zio->io_bio, ==, NULL);
	zio->io_bio = vbio;

	zio_delay_interrupt(zio);
}

/*
 * Iterator callback to count ABD pages and check their size & alignment.
 *
 * On Linux, each BIO segment can take a page pointer, and an offset+length of
 * the data within that page. A page can be arbitrarily large ("compound"
 * pages) but we still have to ensure the data portion is correctly sized and
 * aligned to the logical block size, to ensure that if the kernel wants to
 * split the BIO, the two halves will still be properly aligned.
 *
 * NOTE: if you change this function, change the copy in
 * tests/zfs-tests/tests/functional/vdev_disk/page_alignment.c, and add test
 * data there to validate the change you're making.
 */
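/*
 * Concretely, the callback below rejects the ABD (returns non-zero) if any
 * vector length is not a whole multiple of the logical block size, if the
 * first vector does not start at an LBS-aligned offset, if any later vector
 * does not start on a page boundary, or if any vector other than the last
 * does not end on a page boundary.
 */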
typedef struct {
	size_t	blocksize;
	int	seen_first;
	int	seen_last;
} vdev_disk_check_alignment_t;

static int
vdev_disk_check_alignment_cb(struct page *page, size_t off, size_t len,
    void *priv)
{
	(void) page;
	vdev_disk_check_alignment_t *s = priv;

	/*
	 * The cardinal rule: a single on-disk block must never cross a
	 * physical (order-0) page boundary, as the kernel expects to be able
	 * to split at both LBS and page boundaries.
	 *
	 * This implies various alignment rules for the blocks in this
	 * (possibly compound) page, which we can check for.
	 */

	/*
	 * If the previous page did not end on a page boundary, then we
	 * can't proceed without creating a hole.
	 */
	if (s->seen_last)
		return (1);

	/* This page must contain only whole LBS-sized blocks. */
	if (!IS_P2ALIGNED(len, s->blocksize))
		return (1);

	/*
	 * If this is not the first page in the ABD, then the data must start
	 * on a page-aligned boundary (so the kernel can split on page
	 * boundaries without having to deal with a hole). If it is, then
	 * it can start on LBS-alignment.
	 */
	if (s->seen_first) {
		if (!IS_P2ALIGNED(off, PAGESIZE))
			return (1);
	} else {
		if (!IS_P2ALIGNED(off, s->blocksize))
			return (1);
		s->seen_first = 1;
	}

	/*
	 * If this data does not end on a page-aligned boundary, then this
	 * must be the last page in the ABD, for the same reason.
	 */
	s->seen_last = !IS_P2ALIGNED(off+len, PAGESIZE);

	return (0);
}

/*
 * Check if we can submit the pages in this ABD to the kernel as-is.
 * Returns B_TRUE if so, B_FALSE if the ABD can't be submitted like this.
 */
static boolean_t
vdev_disk_check_alignment(abd_t *abd, uint64_t size, struct block_device *bdev)
{
	vdev_disk_check_alignment_t s = {
		.blocksize = bdev_logical_block_size(bdev),
	};

	if (abd_iterate_page_func(abd, 0, size,
	    vdev_disk_check_alignment_cb, &s))
		return (B_FALSE);

	return (B_TRUE);
}

static int
vdev_disk_io_rw(zio_t *zio)
{
	vdev_t *v = zio->io_vd;
	vdev_disk_t *vd = v->vdev_tsd;
	struct block_device *bdev = BDH_BDEV(vd->vd_bdh);
	int flags = 0;

	/*
	 * Accessing outside the block device is never allowed.
	 */
	if (zio->io_offset + zio->io_size > bdev_capacity(bdev)) {
		vdev_dbgmsg(zio->io_vd,
		    "Illegal access %llu size %llu, device size %llu",
		    (u_longlong_t)zio->io_offset,
		    (u_longlong_t)zio->io_size,
		    (u_longlong_t)bdev_capacity(bdev));
		return (SET_ERROR(EIO));
	}

	if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) &&
	    v->vdev_failfast == B_TRUE) {
		bio_set_flags_failfast(bdev, &flags, zfs_vdev_failfast_mask & 1,
		    zfs_vdev_failfast_mask & 2, zfs_vdev_failfast_mask & 4);
	}

	/*
	 * Check alignment of the incoming ABD. If any part of it would require
	 * submitting a page that is not aligned to both the logical block size
	 * and the page size, then we take a copy into a new memory region with
	 * correct alignment. This should be impossible on a 512b LBS. On
	 * larger blocks, this can happen at least when a small number of
	 * blocks (usually 1) are allocated from a shared slab, or when
	 * abnormally-small data regions (eg gang headers) are mixed into the
	 * same ABD as larger allocations (eg aggregations).
	 */
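	/*
	 * If a copy is needed, the replacement ABD is remembered in vbio_abd
	 * below so the rest of the pipeline can find it again; for writes the
	 * original data is copied into it before submission.
	 */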
	abd_t *abd = zio->io_abd;
	if (!vdev_disk_check_alignment(abd, zio->io_size, bdev)) {
		/* Allocate a new memory region with guaranteed alignment */
		abd = abd_alloc_for_io(zio->io_size,
		    zio->io_abd->abd_flags & ABD_FLAG_META);

		/* If we're writing, copy our data into it */
		if (zio->io_type == ZIO_TYPE_WRITE)
			abd_copy(abd, zio->io_abd, zio->io_size);

		/*
		 * False here would mean the new allocation has an invalid
		 * alignment too, which would mean that abd_alloc() is not
		 * guaranteeing this, or our logic in
		 * vdev_disk_check_alignment() is wrong. In either case,
		 * something is seriously wrong and it's not safe to continue.
		 */
		VERIFY(vdev_disk_check_alignment(abd, zio->io_size, bdev));
	}

	/* Allocate vbio, with a pointer to the borrowed ABD if necessary */
	vbio_t *vbio = vbio_alloc(zio, bdev, flags);
	if (abd != zio->io_abd)
		vbio->vbio_abd = abd;

	/* Fill it with data pages and submit it to the kernel */
	vbio_submit(vbio, abd, zio->io_size);
	return (0);
}

/* ========== */

/*
 * This is the classic, battle-tested BIO submission code. Until we're totally
 * sure that the new code is safe and correct in all cases, this will remain
 * available and can be enabled by setting zfs_vdev_disk_classic=1 at module
 * load time.
 *
 * These functions have been renamed to vdev_classic_* to make it clear what
 * they belong to, but their implementations are unchanged.
 */

/*
 * Virtual device vector for disks.
 */
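/*
 * A dio_request_t tracks one classic submission: dr_ref holds one reference
 * per attached bio plus an extra one taken around submission, and the last
 * vdev_classic_dio_put() propagates dr_error to the zio and completes it.
 */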
typedef struct dio_request {
	zio_t		*dr_zio;	/* Parent ZIO */
	atomic_t	dr_ref;		/* References */
	int		dr_error;	/* Bio error */
	int		dr_bio_count;	/* Count of bio's */
	struct bio	*dr_bio[];	/* Attached bio's */
} dio_request_t;

static dio_request_t *
vdev_classic_dio_alloc(int bio_count)
{
	dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) +
	    sizeof (struct bio *) * bio_count, KM_SLEEP);
	atomic_set(&dr->dr_ref, 0);
	dr->dr_bio_count = bio_count;
	dr->dr_error = 0;

	for (int i = 0; i < dr->dr_bio_count; i++)
		dr->dr_bio[i] = NULL;

	return (dr);
}

static void
vdev_classic_dio_free(dio_request_t *dr)
{
	int i;

	for (i = 0; i < dr->dr_bio_count; i++)
		if (dr->dr_bio[i])
			bio_put(dr->dr_bio[i]);

	kmem_free(dr, sizeof (dio_request_t) +
	    sizeof (struct bio *) * dr->dr_bio_count);
}

static void
vdev_classic_dio_get(dio_request_t *dr)
{
	atomic_inc(&dr->dr_ref);
}

static void
vdev_classic_dio_put(dio_request_t *dr)
{
	int rc = atomic_dec_return(&dr->dr_ref);

	/*
	 * Free the dio_request when the last reference is dropped and
	 * ensure zio_interpret is called only once with the correct zio
	 */
	if (rc == 0) {
		zio_t *zio = dr->dr_zio;
		int error = dr->dr_error;

		vdev_classic_dio_free(dr);

		if (zio) {
			zio->io_error = error;
			ASSERT3S(zio->io_error, >=, 0);
			if (zio->io_error)
				vdev_disk_error(zio);

			zio_delay_interrupt(zio);
		}
	}
}

static void
vdev_classic_physio_completion(struct bio *bio)
{
	dio_request_t *dr = bio->bi_private;

	if (dr->dr_error == 0) {
		dr->dr_error = bi_status_to_errno(bio->bi_status);
	}

	/* Drop reference acquired by vdev_classic_physio */
	vdev_classic_dio_put(dr);
}

static inline unsigned int
vdev_classic_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset)
{
	unsigned long nr_segs = abd_nr_pages_off(zio->io_abd,
	    bio_size, abd_offset);

#ifdef HAVE_BIO_MAX_SEGS
	return (bio_max_segs(nr_segs));
#else
	return (MIN(nr_segs, BIO_MAX_PAGES));
#endif
}

static int
vdev_classic_physio(zio_t *zio)
{
	vdev_t *v = zio->io_vd;
	vdev_disk_t *vd = v->vdev_tsd;
	struct block_device *bdev = BDH_BDEV(vd->vd_bdh);
	size_t io_size = zio->io_size;
	uint64_t io_offset = zio->io_offset;
	int rw = zio->io_type == ZIO_TYPE_READ ? READ : WRITE;
	int flags = 0;

	dio_request_t *dr;
	uint64_t abd_offset;
	uint64_t bio_offset;
	int bio_size;
	int bio_count = 16;
	int error = 0;
	struct blk_plug plug;
	unsigned short nr_vecs;

	/*
	 * Accessing outside the block device is never allowed.
	 */
	if (io_offset + io_size > bdev_capacity(bdev)) {
		vdev_dbgmsg(zio->io_vd,
		    "Illegal access %llu size %llu, device size %llu",
		    (u_longlong_t)io_offset,
		    (u_longlong_t)io_size,
		    (u_longlong_t)bdev_capacity(bdev));
		return (SET_ERROR(EIO));
	}

retry:
	dr = vdev_classic_dio_alloc(bio_count);

	if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) &&
	    zio->io_vd->vdev_failfast == B_TRUE) {
		bio_set_flags_failfast(bdev, &flags, zfs_vdev_failfast_mask & 1,
		    zfs_vdev_failfast_mask & 2, zfs_vdev_failfast_mask & 4);
	}

	dr->dr_zio = zio;

	/*
	 * Since bio's can have up to BIO_MAX_PAGES=256 iovec's, each of which
	 * is at least 512 bytes and at most PAGESIZE (typically 4K), one bio
	 * can cover at least 128KB and at most 1MB.
When the required number 1124184c1b94SMartin Matuska * of iovec's exceeds this, we are forced to break the IO in multiple 1125184c1b94SMartin Matuska * bio's and wait for them all to complete. This is likely if the 1126184c1b94SMartin Matuska * recordsize property is increased beyond 1MB. The default 1127184c1b94SMartin Matuska * bio_count=16 should typically accommodate the maximum-size zio of 1128184c1b94SMartin Matuska * 16MB. 1129eda14cbcSMatt Macy */ 1130eda14cbcSMatt Macy 1131eda14cbcSMatt Macy abd_offset = 0; 1132eda14cbcSMatt Macy bio_offset = io_offset; 1133eda14cbcSMatt Macy bio_size = io_size; 1134184c1b94SMartin Matuska for (int i = 0; i <= dr->dr_bio_count; i++) { 1135eda14cbcSMatt Macy 1136eda14cbcSMatt Macy /* Finished constructing bio's for given buffer */ 1137eda14cbcSMatt Macy if (bio_size <= 0) 1138eda14cbcSMatt Macy break; 1139eda14cbcSMatt Macy 1140eda14cbcSMatt Macy /* 1141184c1b94SMartin Matuska * If additional bio's are required, we have to retry, but 1142184c1b94SMartin Matuska * this should be rare - see the comment above. 1143eda14cbcSMatt Macy */ 1144eda14cbcSMatt Macy if (dr->dr_bio_count == i) { 1145783d3ff6SMartin Matuska vdev_classic_dio_free(dr); 1146eda14cbcSMatt Macy bio_count *= 2; 1147eda14cbcSMatt Macy goto retry; 1148eda14cbcSMatt Macy } 1149eda14cbcSMatt Macy 1150783d3ff6SMartin Matuska nr_vecs = vdev_classic_bio_max_segs(zio, bio_size, abd_offset); 1151e3aa18adSMartin Matuska dr->dr_bio[i] = vdev_bio_alloc(bdev, GFP_NOIO, nr_vecs); 1152eda14cbcSMatt Macy if (unlikely(dr->dr_bio[i] == NULL)) { 1153783d3ff6SMartin Matuska vdev_classic_dio_free(dr); 1154eda14cbcSMatt Macy return (SET_ERROR(ENOMEM)); 1155eda14cbcSMatt Macy } 1156eda14cbcSMatt Macy 1157783d3ff6SMartin Matuska /* Matching put called by vdev_classic_physio_completion */ 1158783d3ff6SMartin Matuska vdev_classic_dio_get(dr); 1159eda14cbcSMatt Macy 1160eda14cbcSMatt Macy BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9; 1161783d3ff6SMartin Matuska dr->dr_bio[i]->bi_end_io = vdev_classic_physio_completion; 1162eda14cbcSMatt Macy dr->dr_bio[i]->bi_private = dr; 1163eda14cbcSMatt Macy bio_set_op_attrs(dr->dr_bio[i], rw, flags); 1164eda14cbcSMatt Macy 1165eda14cbcSMatt Macy /* Remaining size is returned to become the new size */ 1166eda14cbcSMatt Macy bio_size = abd_bio_map_off(dr->dr_bio[i], zio->io_abd, 1167eda14cbcSMatt Macy bio_size, abd_offset); 1168eda14cbcSMatt Macy 1169eda14cbcSMatt Macy /* Advance in buffer and construct another bio if needed */ 1170eda14cbcSMatt Macy abd_offset += BIO_BI_SIZE(dr->dr_bio[i]); 1171eda14cbcSMatt Macy bio_offset += BIO_BI_SIZE(dr->dr_bio[i]); 1172eda14cbcSMatt Macy } 1173eda14cbcSMatt Macy 1174eda14cbcSMatt Macy /* Extra reference to protect dio_request during vdev_submit_bio */ 1175783d3ff6SMartin Matuska vdev_classic_dio_get(dr); 1176eda14cbcSMatt Macy 1177eda14cbcSMatt Macy if (dr->dr_bio_count > 1) 1178eda14cbcSMatt Macy blk_start_plug(&plug); 1179eda14cbcSMatt Macy 1180eda14cbcSMatt Macy /* Submit all bio's associated with this dio */ 1181184c1b94SMartin Matuska for (int i = 0; i < dr->dr_bio_count; i++) { 1182eda14cbcSMatt Macy if (dr->dr_bio[i]) 1183eda14cbcSMatt Macy vdev_submit_bio(dr->dr_bio[i]); 1184184c1b94SMartin Matuska } 1185eda14cbcSMatt Macy 1186eda14cbcSMatt Macy if (dr->dr_bio_count > 1) 1187eda14cbcSMatt Macy blk_finish_plug(&plug); 1188eda14cbcSMatt Macy 1189783d3ff6SMartin Matuska vdev_classic_dio_put(dr); 1190eda14cbcSMatt Macy 1191eda14cbcSMatt Macy return (error); 1192eda14cbcSMatt Macy } 1193eda14cbcSMatt Macy 1194783d3ff6SMartin 
Matuska /* ========== */ 1195783d3ff6SMartin Matuska 11967a7741afSMartin Matuska static void 11977a7741afSMartin Matuska vdev_disk_io_flush_completion(struct bio *bio) 1198eda14cbcSMatt Macy { 1199eda14cbcSMatt Macy zio_t *zio = bio->bi_private; 12007a7741afSMartin Matuska zio->io_error = bi_status_to_errno(bio->bi_status); 1201*dd215568SMartin Matuska if (zio->io_error == EOPNOTSUPP || zio->io_error == ENOTTY) 1202*dd215568SMartin Matuska zio->io_error = SET_ERROR(ENOTSUP); 1203eda14cbcSMatt Macy 1204eda14cbcSMatt Macy bio_put(bio); 1205eda14cbcSMatt Macy ASSERT3S(zio->io_error, >=, 0); 1206eda14cbcSMatt Macy if (zio->io_error) 1207eda14cbcSMatt Macy vdev_disk_error(zio); 1208eda14cbcSMatt Macy zio_interrupt(zio); 1209eda14cbcSMatt Macy } 1210eda14cbcSMatt Macy 1211eda14cbcSMatt Macy static int 1212eda14cbcSMatt Macy vdev_disk_io_flush(struct block_device *bdev, zio_t *zio) 1213eda14cbcSMatt Macy { 1214eda14cbcSMatt Macy struct request_queue *q; 1215eda14cbcSMatt Macy struct bio *bio; 1216eda14cbcSMatt Macy 1217eda14cbcSMatt Macy q = bdev_get_queue(bdev); 1218eda14cbcSMatt Macy if (!q) 1219eda14cbcSMatt Macy return (SET_ERROR(ENXIO)); 1220eda14cbcSMatt Macy 1221e3aa18adSMartin Matuska bio = vdev_bio_alloc(bdev, GFP_NOIO, 0); 1222eda14cbcSMatt Macy if (unlikely(bio == NULL)) 1223eda14cbcSMatt Macy return (SET_ERROR(ENOMEM)); 1224eda14cbcSMatt Macy 1225eda14cbcSMatt Macy bio->bi_end_io = vdev_disk_io_flush_completion; 1226eda14cbcSMatt Macy bio->bi_private = zio; 1227eda14cbcSMatt Macy bio_set_flush(bio); 1228eda14cbcSMatt Macy vdev_submit_bio(bio); 1229eda14cbcSMatt Macy invalidate_bdev(bdev); 1230eda14cbcSMatt Macy 1231eda14cbcSMatt Macy return (0); 1232eda14cbcSMatt Macy } 1233eda14cbcSMatt Macy 12347a7741afSMartin Matuska static void 12357a7741afSMartin Matuska vdev_disk_discard_end_io(struct bio *bio) 1236a4e5e010SMartin Matuska { 1237a4e5e010SMartin Matuska zio_t *zio = bio->bi_private; 12387a7741afSMartin Matuska zio->io_error = bi_status_to_errno(bio->bi_status); 12397a7741afSMartin Matuska 1240a4e5e010SMartin Matuska bio_put(bio); 1241a4e5e010SMartin Matuska if (zio->io_error) 1242a4e5e010SMartin Matuska vdev_disk_error(zio); 1243a4e5e010SMartin Matuska zio_interrupt(zio); 1244a4e5e010SMartin Matuska } 1245a4e5e010SMartin Matuska 12461719886fSMartin Matuska /* 12471719886fSMartin Matuska * Wrappers for the different secure erase and discard APIs. We use async 12481719886fSMartin Matuska * when available; in this case, *biop is set to the last bio in the chain. 
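 * On return, a NULL *biop means the operation already completed
 * synchronously; a non-NULL *biop has not been submitted yet, so the caller
 * must set bi_private/bi_end_io on it and submit it (see vdev_disk_io_trim).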
12491719886fSMartin Matuska */ 1250a4e5e010SMartin Matuska static int 12511719886fSMartin Matuska vdev_bdev_issue_secure_erase(zfs_bdev_handle_t *bdh, sector_t sector, 12521719886fSMartin Matuska sector_t nsect, struct bio **biop) 1253a4e5e010SMartin Matuska { 12541719886fSMartin Matuska *biop = NULL; 12551719886fSMartin Matuska int error; 1256a4e5e010SMartin Matuska 12571719886fSMartin Matuska #if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) 12581719886fSMartin Matuska error = blkdev_issue_secure_erase(BDH_BDEV(bdh), 12591719886fSMartin Matuska sector, nsect, GFP_NOFS); 12601719886fSMartin Matuska #elif defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_FLAGS) 12611719886fSMartin Matuska error = __blkdev_issue_discard(BDH_BDEV(bdh), 12621719886fSMartin Matuska sector, nsect, GFP_NOFS, BLKDEV_DISCARD_SECURE, biop); 12631719886fSMartin Matuska #elif defined(HAVE_BLKDEV_ISSUE_DISCARD_FLAGS) 12641719886fSMartin Matuska error = blkdev_issue_discard(BDH_BDEV(bdh), 12651719886fSMartin Matuska sector, nsect, GFP_NOFS, BLKDEV_DISCARD_SECURE); 1266a4e5e010SMartin Matuska #else 12671719886fSMartin Matuska #error "unsupported kernel" 1268a4e5e010SMartin Matuska #endif 12691719886fSMartin Matuska 12701719886fSMartin Matuska return (error); 12711719886fSMartin Matuska } 12721719886fSMartin Matuska 12731719886fSMartin Matuska static int 12741719886fSMartin Matuska vdev_bdev_issue_discard(zfs_bdev_handle_t *bdh, sector_t sector, 12751719886fSMartin Matuska sector_t nsect, struct bio **biop) 12761719886fSMartin Matuska { 12771719886fSMartin Matuska *biop = NULL; 12781719886fSMartin Matuska int error; 12791719886fSMartin Matuska 12801719886fSMartin Matuska #if defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_FLAGS) 12811719886fSMartin Matuska error = __blkdev_issue_discard(BDH_BDEV(bdh), 12821719886fSMartin Matuska sector, nsect, GFP_NOFS, 0, biop); 12831719886fSMartin Matuska #elif defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_NOFLAGS) 12841719886fSMartin Matuska error = __blkdev_issue_discard(BDH_BDEV(bdh), 12851719886fSMartin Matuska sector, nsect, GFP_NOFS, biop); 12861719886fSMartin Matuska #elif defined(HAVE_BLKDEV_ISSUE_DISCARD_FLAGS) 12871719886fSMartin Matuska error = blkdev_issue_discard(BDH_BDEV(bdh), 12881719886fSMartin Matuska sector, nsect, GFP_NOFS, 0); 12891719886fSMartin Matuska #elif defined(HAVE_BLKDEV_ISSUE_DISCARD_NOFLAGS) 12901719886fSMartin Matuska error = blkdev_issue_discard(BDH_BDEV(bdh), 12911719886fSMartin Matuska sector, nsect, GFP_NOFS); 12921719886fSMartin Matuska #else 12931719886fSMartin Matuska #error "unsupported kernel" 12941719886fSMartin Matuska #endif 12951719886fSMartin Matuska 12961719886fSMartin Matuska return (error); 12971719886fSMartin Matuska } 12981719886fSMartin Matuska 12991719886fSMartin Matuska /* 13001719886fSMartin Matuska * Entry point for TRIM ops. This calls the right wrapper for secure erase or 13011719886fSMartin Matuska * discard, and then does the appropriate finishing work for error vs success 13021719886fSMartin Matuska * and async vs sync. 
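 * Offsets and sizes are converted to 512-byte sectors. A return of 0 means
 * the zio will be completed through the bio callback or has already been
 * re-queued via zio_interrupt(); a nonzero errno means the caller must fail
 * the zio itself.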
13031719886fSMartin Matuska */ 13041719886fSMartin Matuska static int 13051719886fSMartin Matuska vdev_disk_io_trim(zio_t *zio) 13061719886fSMartin Matuska { 13071719886fSMartin Matuska int error; 13081719886fSMartin Matuska struct bio *bio; 13091719886fSMartin Matuska 13101719886fSMartin Matuska zfs_bdev_handle_t *bdh = ((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh; 13111719886fSMartin Matuska sector_t sector = zio->io_offset >> 9; 13121719886fSMartin Matuska sector_t nsects = zio->io_size >> 9; 13131719886fSMartin Matuska 13141719886fSMartin Matuska if (zio->io_trim_flags & ZIO_TRIM_SECURE) 13151719886fSMartin Matuska error = vdev_bdev_issue_secure_erase(bdh, sector, nsects, &bio); 13161719886fSMartin Matuska else 13171719886fSMartin Matuska error = vdev_bdev_issue_discard(bdh, sector, nsects, &bio); 13181719886fSMartin Matuska 13191719886fSMartin Matuska if (error != 0) 13201719886fSMartin Matuska return (SET_ERROR(-error)); 13211719886fSMartin Matuska 13221719886fSMartin Matuska if (bio == NULL) { 13231719886fSMartin Matuska /* 13241719886fSMartin Matuska * This was a synchronous op that completed successfully, so 13251719886fSMartin Matuska * return it to ZFS immediately. 13261719886fSMartin Matuska */ 13271719886fSMartin Matuska zio_interrupt(zio); 13281719886fSMartin Matuska } else { 13291719886fSMartin Matuska /* 13301719886fSMartin Matuska * This was an asynchronous op; set up completion callback and 13311719886fSMartin Matuska * issue it. 13321719886fSMartin Matuska */ 1333a4e5e010SMartin Matuska bio->bi_private = zio; 1334a4e5e010SMartin Matuska bio->bi_end_io = vdev_disk_discard_end_io; 1335a4e5e010SMartin Matuska vdev_submit_bio(bio); 1336a4e5e010SMartin Matuska } 1337a4e5e010SMartin Matuska 13381719886fSMartin Matuska return (0); 1339e3aa18adSMartin Matuska } 1340e3aa18adSMartin Matuska 1341783d3ff6SMartin Matuska int (*vdev_disk_io_rw_fn)(zio_t *zio) = NULL; 1342783d3ff6SMartin Matuska 1343eda14cbcSMatt Macy static void 1344eda14cbcSMatt Macy vdev_disk_io_start(zio_t *zio) 1345eda14cbcSMatt Macy { 1346eda14cbcSMatt Macy vdev_t *v = zio->io_vd; 1347eda14cbcSMatt Macy vdev_disk_t *vd = v->vdev_tsd; 1348783d3ff6SMartin Matuska int error; 1349eda14cbcSMatt Macy 1350eda14cbcSMatt Macy /* 1351eda14cbcSMatt Macy * If the vdev is closed, it's likely in the REMOVED or FAULTED state. 1352eda14cbcSMatt Macy * Nothing to be done here but return failure. 1353eda14cbcSMatt Macy */ 1354eda14cbcSMatt Macy if (vd == NULL) { 1355eda14cbcSMatt Macy zio->io_error = ENXIO; 1356eda14cbcSMatt Macy zio_interrupt(zio); 1357eda14cbcSMatt Macy return; 1358eda14cbcSMatt Macy } 1359eda14cbcSMatt Macy 1360eda14cbcSMatt Macy rw_enter(&vd->vd_lock, RW_READER); 1361eda14cbcSMatt Macy 1362eda14cbcSMatt Macy /* 1363eda14cbcSMatt Macy * If the vdev is closed, it's likely due to a failed reopen and is 1364eda14cbcSMatt Macy * in the UNAVAIL state. Nothing to be done here but return failure. 
1365eda14cbcSMatt Macy */ 1366fd45b686SMartin Matuska if (vd->vd_bdh == NULL) { 1367eda14cbcSMatt Macy rw_exit(&vd->vd_lock); 1368eda14cbcSMatt Macy zio->io_error = ENXIO; 1369eda14cbcSMatt Macy zio_interrupt(zio); 1370eda14cbcSMatt Macy return; 1371eda14cbcSMatt Macy } 1372eda14cbcSMatt Macy 1373eda14cbcSMatt Macy switch (zio->io_type) { 13741719886fSMartin Matuska case ZIO_TYPE_FLUSH: 1375eda14cbcSMatt Macy 1376eda14cbcSMatt Macy if (!vdev_readable(v)) { 13771719886fSMartin Matuska /* Drive not there, can't flush */ 13781719886fSMartin Matuska error = SET_ERROR(ENXIO); 13791719886fSMartin Matuska } else if (zfs_nocacheflush) { 13801719886fSMartin Matuska /* Flushing disabled by operator, declare success */ 13811719886fSMartin Matuska error = 0; 13821719886fSMartin Matuska } else if (v->vdev_nowritecache) { 13831719886fSMartin Matuska /* This vdev not capable of flushing */ 13841719886fSMartin Matuska error = SET_ERROR(ENOTSUP); 13851719886fSMartin Matuska } else { 13861719886fSMartin Matuska /* 13871719886fSMartin Matuska * Issue the flush. If successful, the response will 13881719886fSMartin Matuska * be handled in the completion callback, so we're done. 13891719886fSMartin Matuska */ 1390fd45b686SMartin Matuska error = vdev_disk_io_flush(BDH_BDEV(vd->vd_bdh), zio); 1391eda14cbcSMatt Macy if (error == 0) { 1392eda14cbcSMatt Macy rw_exit(&vd->vd_lock); 1393eda14cbcSMatt Macy return; 1394eda14cbcSMatt Macy } 1395eda14cbcSMatt Macy } 1396eda14cbcSMatt Macy 13971719886fSMartin Matuska /* Couldn't issue the flush, so set the error and return it */ 1398eda14cbcSMatt Macy rw_exit(&vd->vd_lock); 13991719886fSMartin Matuska zio->io_error = error; 1400eda14cbcSMatt Macy zio_execute(zio); 1401eda14cbcSMatt Macy return; 1402eda14cbcSMatt Macy 1403eda14cbcSMatt Macy case ZIO_TYPE_TRIM: 14041719886fSMartin Matuska error = vdev_disk_io_trim(zio); 1405eda14cbcSMatt Macy rw_exit(&vd->vd_lock); 14061719886fSMartin Matuska if (error) { 14071719886fSMartin Matuska zio->io_error = error; 14081719886fSMartin Matuska zio_execute(zio); 14091719886fSMartin Matuska } 1410eda14cbcSMatt Macy return; 1411eda14cbcSMatt Macy 1412783d3ff6SMartin Matuska case ZIO_TYPE_READ: 1413783d3ff6SMartin Matuska case ZIO_TYPE_WRITE: 1414783d3ff6SMartin Matuska zio->io_target_timestamp = zio_handle_io_delay(zio); 1415783d3ff6SMartin Matuska error = vdev_disk_io_rw_fn(zio); 1416783d3ff6SMartin Matuska rw_exit(&vd->vd_lock); 1417783d3ff6SMartin Matuska if (error) { 1418783d3ff6SMartin Matuska zio->io_error = error; 1419783d3ff6SMartin Matuska zio_interrupt(zio); 1420783d3ff6SMartin Matuska } 1421783d3ff6SMartin Matuska return; 1422783d3ff6SMartin Matuska 1423eda14cbcSMatt Macy default: 1424783d3ff6SMartin Matuska /* 1425783d3ff6SMartin Matuska * Getting here means our parent vdev has made a very strange 1426783d3ff6SMartin Matuska * request of us, one that shouldn't happen. Assert here to force a 1427783d3ff6SMartin Matuska * crash in dev builds, but in production return the IO 1428783d3ff6SMartin Matuska * unhandled. The pool will likely suspend anyway but that's 1429783d3ff6SMartin Matuska * nicer than crashing the kernel. 
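 * (The ASSERT3S below compares io_type against -1, a value it can never
 * hold, so it always trips on debug builds while production builds fall
 * through to the ENOTSUP path.)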
1430783d3ff6SMartin Matuska */ 1431783d3ff6SMartin Matuska ASSERT3S(zio->io_type, ==, -1); 1432783d3ff6SMartin Matuska 1433eda14cbcSMatt Macy rw_exit(&vd->vd_lock); 1434eda14cbcSMatt Macy zio->io_error = SET_ERROR(ENOTSUP); 1435eda14cbcSMatt Macy zio_interrupt(zio); 1436eda14cbcSMatt Macy return; 1437eda14cbcSMatt Macy } 1438eda14cbcSMatt Macy 1439783d3ff6SMartin Matuska __builtin_unreachable(); 1440eda14cbcSMatt Macy } 1441eda14cbcSMatt Macy 1442eda14cbcSMatt Macy static void 1443eda14cbcSMatt Macy vdev_disk_io_done(zio_t *zio) 1444eda14cbcSMatt Macy { 14455c65a0a9SMartin Matuska /* If this was a read or write, we need to clean up the vbio */ 14465c65a0a9SMartin Matuska if (zio->io_bio != NULL) { 14475c65a0a9SMartin Matuska vbio_t *vbio = zio->io_bio; 14485c65a0a9SMartin Matuska zio->io_bio = NULL; 14495c65a0a9SMartin Matuska 14505c65a0a9SMartin Matuska /* 14515c65a0a9SMartin Matuska * If we copied the ABD before issuing it, copy the data back to the 14525c65a0a9SMartin Matuska * original ABD if appropriate, then free the copy. 14535c65a0a9SMartin Matuska */ 14545c65a0a9SMartin Matuska if (vbio->vbio_abd != NULL) { 14555c65a0a9SMartin Matuska if (zio->io_type == ZIO_TYPE_READ) 14565c65a0a9SMartin Matuska abd_copy(zio->io_abd, vbio->vbio_abd, 14575c65a0a9SMartin Matuska zio->io_size); 14585c65a0a9SMartin Matuska 14595c65a0a9SMartin Matuska abd_free(vbio->vbio_abd); 14605c65a0a9SMartin Matuska vbio->vbio_abd = NULL; 14615c65a0a9SMartin Matuska } 14625c65a0a9SMartin Matuska 14635c65a0a9SMartin Matuska /* Final cleanup */ 14645c65a0a9SMartin Matuska kmem_free(vbio, sizeof (vbio_t)); 14655c65a0a9SMartin Matuska } 14665c65a0a9SMartin Matuska 1467eda14cbcSMatt Macy /* 1468eda14cbcSMatt Macy * If the device returned EIO, we revalidate the media. If it is 1469eda14cbcSMatt Macy * determined the media has changed this triggers the asynchronous 1470eda14cbcSMatt Macy * removal of the device from the configuration. 1471eda14cbcSMatt Macy */ 1472eda14cbcSMatt Macy if (zio->io_error == EIO) { 1473eda14cbcSMatt Macy vdev_t *v = zio->io_vd; 1474eda14cbcSMatt Macy vdev_disk_t *vd = v->vdev_tsd; 1475eda14cbcSMatt Macy 1476fd45b686SMartin Matuska if (!zfs_check_disk_status(BDH_BDEV(vd->vd_bdh))) { 1477fd45b686SMartin Matuska invalidate_bdev(BDH_BDEV(vd->vd_bdh)); 1478eda14cbcSMatt Macy v->vdev_remove_wanted = B_TRUE; 1479eda14cbcSMatt Macy spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); 1480eda14cbcSMatt Macy } 1481eda14cbcSMatt Macy } 1482eda14cbcSMatt Macy } 1483eda14cbcSMatt Macy 1484eda14cbcSMatt Macy static void 1485eda14cbcSMatt Macy vdev_disk_hold(vdev_t *vd) 1486eda14cbcSMatt Macy { 1487eda14cbcSMatt Macy ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); 1488eda14cbcSMatt Macy 1489eda14cbcSMatt Macy /* We must have a pathname, and it must be absolute. */ 1490eda14cbcSMatt Macy if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') 1491eda14cbcSMatt Macy return; 1492eda14cbcSMatt Macy 1493eda14cbcSMatt Macy /* 1494eda14cbcSMatt Macy * Only prefetch path and devid info if the device has 1495eda14cbcSMatt Macy * never been opened. 
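 * (No prefetch is currently implemented on Linux; past this check the
 * function is a no-op.)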
1496eda14cbcSMatt Macy */ 1497eda14cbcSMatt Macy if (vd->vdev_tsd != NULL) 1498eda14cbcSMatt Macy return; 1499eda14cbcSMatt Macy 1500eda14cbcSMatt Macy } 1501eda14cbcSMatt Macy 1502eda14cbcSMatt Macy static void 1503eda14cbcSMatt Macy vdev_disk_rele(vdev_t *vd) 1504eda14cbcSMatt Macy { 1505eda14cbcSMatt Macy ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); 1506eda14cbcSMatt Macy 1507eda14cbcSMatt Macy /* XXX: Implement me as a vnode rele for the device */ 1508eda14cbcSMatt Macy } 1509eda14cbcSMatt Macy 1510783d3ff6SMartin Matuska /* 1511783d3ff6SMartin Matuska * BIO submission method. See comment above about vdev_classic. 1512783d3ff6SMartin Matuska * Set zfs_vdev_disk_classic=0 for new, =1 for classic 1513783d3ff6SMartin Matuska */ 1514783d3ff6SMartin Matuska static uint_t zfs_vdev_disk_classic = 0; /* default new */ 1515783d3ff6SMartin Matuska 1516783d3ff6SMartin Matuska /* Set submission function from module parameter */ 1517783d3ff6SMartin Matuska static int 1518783d3ff6SMartin Matuska vdev_disk_param_set_classic(const char *buf, zfs_kernel_param_t *kp) 1519783d3ff6SMartin Matuska { 1520783d3ff6SMartin Matuska int err = param_set_uint(buf, kp); 1521783d3ff6SMartin Matuska if (err < 0) 1522783d3ff6SMartin Matuska return (SET_ERROR(err)); 1523783d3ff6SMartin Matuska 1524783d3ff6SMartin Matuska vdev_disk_io_rw_fn = 1525783d3ff6SMartin Matuska zfs_vdev_disk_classic ? vdev_classic_physio : vdev_disk_io_rw; 1526783d3ff6SMartin Matuska 1527783d3ff6SMartin Matuska printk(KERN_INFO "ZFS: forcing %s BIO submission\n", 1528783d3ff6SMartin Matuska zfs_vdev_disk_classic ? "classic" : "new"); 1529783d3ff6SMartin Matuska 1530783d3ff6SMartin Matuska return (0); 1531783d3ff6SMartin Matuska } 1532783d3ff6SMartin Matuska 1533783d3ff6SMartin Matuska /* 1534783d3ff6SMartin Matuska * At first vdev use, set the submission function from the default value if 1535783d3ff6SMartin Matuska * it hasn't been set already. 1536783d3ff6SMartin Matuska */ 1537783d3ff6SMartin Matuska static int 1538783d3ff6SMartin Matuska vdev_disk_init(spa_t *spa, nvlist_t *nv, void **tsd) 1539783d3ff6SMartin Matuska { 1540783d3ff6SMartin Matuska (void) spa; 1541783d3ff6SMartin Matuska (void) nv; 1542783d3ff6SMartin Matuska (void) tsd; 1543783d3ff6SMartin Matuska 1544783d3ff6SMartin Matuska if (vdev_disk_io_rw_fn == NULL) 1545783d3ff6SMartin Matuska vdev_disk_io_rw_fn = zfs_vdev_disk_classic ? 
1546783d3ff6SMartin Matuska vdev_classic_physio : vdev_disk_io_rw; 1547783d3ff6SMartin Matuska 1548783d3ff6SMartin Matuska return (0); 1549783d3ff6SMartin Matuska } 1550783d3ff6SMartin Matuska 1551eda14cbcSMatt Macy vdev_ops_t vdev_disk_ops = { 1552783d3ff6SMartin Matuska .vdev_op_init = vdev_disk_init, 15537877fdebSMatt Macy .vdev_op_fini = NULL, 1554eda14cbcSMatt Macy .vdev_op_open = vdev_disk_open, 1555eda14cbcSMatt Macy .vdev_op_close = vdev_disk_close, 1556eda14cbcSMatt Macy .vdev_op_asize = vdev_default_asize, 15577877fdebSMatt Macy .vdev_op_min_asize = vdev_default_min_asize, 15587877fdebSMatt Macy .vdev_op_min_alloc = NULL, 1559eda14cbcSMatt Macy .vdev_op_io_start = vdev_disk_io_start, 1560eda14cbcSMatt Macy .vdev_op_io_done = vdev_disk_io_done, 1561eda14cbcSMatt Macy .vdev_op_state_change = NULL, 1562eda14cbcSMatt Macy .vdev_op_need_resilver = NULL, 1563eda14cbcSMatt Macy .vdev_op_hold = vdev_disk_hold, 1564eda14cbcSMatt Macy .vdev_op_rele = vdev_disk_rele, 1565eda14cbcSMatt Macy .vdev_op_remap = NULL, 1566eda14cbcSMatt Macy .vdev_op_xlate = vdev_default_xlate, 15677877fdebSMatt Macy .vdev_op_rebuild_asize = NULL, 15687877fdebSMatt Macy .vdev_op_metaslab_init = NULL, 15697877fdebSMatt Macy .vdev_op_config_generate = NULL, 15707877fdebSMatt Macy .vdev_op_nparity = NULL, 15717877fdebSMatt Macy .vdev_op_ndisks = NULL, 1572eda14cbcSMatt Macy .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */ 1573be181ee2SMartin Matuska .vdev_op_leaf = B_TRUE, /* leaf vdev */ 1574be181ee2SMartin Matuska .vdev_op_kobj_evt_post = vdev_disk_kobj_evt_post 1575eda14cbcSMatt Macy }; 1576eda14cbcSMatt Macy 1577eda14cbcSMatt Macy /* 1578eda14cbcSMatt Macy * The zfs_vdev_scheduler module option has been deprecated. Setting this 1579eda14cbcSMatt Macy * value no longer has any effect. It has not yet been entirely removed 1580eda14cbcSMatt Macy * to allow the module to be loaded if this option is specified in the 1581eda14cbcSMatt Macy * /etc/modprobe.d/zfs.conf file. The following warning will be logged. 
1582eda14cbcSMatt Macy */ 1583eda14cbcSMatt Macy static int 1584eda14cbcSMatt Macy param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp) 1585eda14cbcSMatt Macy { 1586eda14cbcSMatt Macy int error = param_set_charp(val, kp); 1587eda14cbcSMatt Macy if (error == 0) { 1588eda14cbcSMatt Macy printk(KERN_INFO "The 'zfs_vdev_scheduler' module option " 1589eda14cbcSMatt Macy "is not supported.\n"); 1590eda14cbcSMatt Macy } 1591eda14cbcSMatt Macy 1592eda14cbcSMatt Macy return (error); 1593eda14cbcSMatt Macy } 1594eda14cbcSMatt Macy 1595e92ffd9bSMartin Matuska static const char *zfs_vdev_scheduler = "unused"; 1596eda14cbcSMatt Macy module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler, 1597eda14cbcSMatt Macy param_get_charp, &zfs_vdev_scheduler, 0644); 1598eda14cbcSMatt Macy MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler"); 1599eda14cbcSMatt Macy 1600eda14cbcSMatt Macy int 1601eda14cbcSMatt Macy param_set_min_auto_ashift(const char *buf, zfs_kernel_param_t *kp) 1602eda14cbcSMatt Macy { 1603dbd5678dSMartin Matuska uint_t val; 1604eda14cbcSMatt Macy int error; 1605eda14cbcSMatt Macy 1606dbd5678dSMartin Matuska error = kstrtouint(buf, 0, &val); 1607eda14cbcSMatt Macy if (error < 0) 1608eda14cbcSMatt Macy return (SET_ERROR(error)); 1609eda14cbcSMatt Macy 1610eda14cbcSMatt Macy if (val < ASHIFT_MIN || val > zfs_vdev_max_auto_ashift) 1611eda14cbcSMatt Macy return (SET_ERROR(-EINVAL)); 1612eda14cbcSMatt Macy 1613dbd5678dSMartin Matuska error = param_set_uint(buf, kp); 1614eda14cbcSMatt Macy if (error < 0) 1615eda14cbcSMatt Macy return (SET_ERROR(error)); 1616eda14cbcSMatt Macy 1617eda14cbcSMatt Macy return (0); 1618eda14cbcSMatt Macy } 1619eda14cbcSMatt Macy 1620eda14cbcSMatt Macy int 1621eda14cbcSMatt Macy param_set_max_auto_ashift(const char *buf, zfs_kernel_param_t *kp) 1622eda14cbcSMatt Macy { 1623dbd5678dSMartin Matuska uint_t val; 1624eda14cbcSMatt Macy int error; 1625eda14cbcSMatt Macy 1626dbd5678dSMartin Matuska error = kstrtouint(buf, 0, &val); 1627eda14cbcSMatt Macy if (error < 0) 1628eda14cbcSMatt Macy return (SET_ERROR(error)); 1629eda14cbcSMatt Macy 1630eda14cbcSMatt Macy if (val > ASHIFT_MAX || val < zfs_vdev_min_auto_ashift) 1631eda14cbcSMatt Macy return (SET_ERROR(-EINVAL)); 1632eda14cbcSMatt Macy 1633dbd5678dSMartin Matuska error = param_set_uint(buf, kp); 1634eda14cbcSMatt Macy if (error < 0) 1635eda14cbcSMatt Macy return (SET_ERROR(error)); 1636eda14cbcSMatt Macy 1637eda14cbcSMatt Macy return (0); 1638eda14cbcSMatt Macy } 1639dbd5678dSMartin Matuska 1640dbd5678dSMartin Matuska ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, open_timeout_ms, UINT, ZMOD_RW, 1641dbd5678dSMartin Matuska "Timeout before determining that a device is missing"); 1642dbd5678dSMartin Matuska 1643dbd5678dSMartin Matuska ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, failfast_mask, UINT, ZMOD_RW, 1644dbd5678dSMartin Matuska "Defines failfast mask: 1 - device, 2 - transport, 4 - driver"); 1645783d3ff6SMartin Matuska 1646783d3ff6SMartin Matuska ZFS_MODULE_PARAM(zfs_vdev_disk, zfs_vdev_disk_, max_segs, UINT, ZMOD_RW, 1647783d3ff6SMartin Matuska "Maximum number of data segments to add to an IO request (min 4)"); 1648783d3ff6SMartin Matuska 1649783d3ff6SMartin Matuska ZFS_MODULE_PARAM_CALL(zfs_vdev_disk, zfs_vdev_disk_, classic, 1650783d3ff6SMartin Matuska vdev_disk_param_set_classic, param_get_uint, ZMOD_RD, 1651783d3ff6SMartin Matuska "Use classic BIO submission method"); 1652
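/*
 * Illustrative usage of the tunables above (an assumed typical invocation,
 * not part of this file): they are ordinary module parameters, so they are
 * normally chosen at load time, e.g.
 *
 *   modprobe zfs zfs_vdev_disk_classic=1 zfs_vdev_disk_max_segs=16
 *
 * zfs_vdev_disk_classic is registered ZMOD_RD and takes effect through
 * vdev_disk_param_set_classic()/vdev_disk_init(); the ZMOD_RW parameters
 * such as zfs_vdev_disk_max_segs can also be adjusted at runtime under
 * /sys/module/zfs/parameters/.
 */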