1789Sahrens /* 2789Sahrens * CDDL HEADER START 3789Sahrens * 4789Sahrens * The contents of this file are subject to the terms of the 51544Seschrock * Common Development and Distribution License (the "License"). 61544Seschrock * You may not use this file except in compliance with the License. 7789Sahrens * 8789Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9789Sahrens * or http://www.opensolaris.org/os/licensing. 10789Sahrens * See the License for the specific language governing permissions 11789Sahrens * and limitations under the License. 12789Sahrens * 13789Sahrens * When distributing Covered Code, include this CDDL HEADER in each 14789Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15789Sahrens * If applicable, add the following below this CDDL HEADER, with the 16789Sahrens * fields enclosed by brackets "[]" replaced with your own identifying 17789Sahrens * information: Portions Copyright [yyyy] [name of copyright owner] 18789Sahrens * 19789Sahrens * CDDL HEADER END 20789Sahrens */ 21789Sahrens /* 22*6523Sek110237 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23789Sahrens * Use is subject to license terms. 24789Sahrens */ 25789Sahrens 26789Sahrens #pragma ident "%Z%%M% %I% %E% SMI" 27789Sahrens 28789Sahrens #include <sys/zfs_context.h> 29789Sahrens #include <sys/spa.h> 30789Sahrens #include <sys/vdev_file.h> 31789Sahrens #include <sys/vdev_impl.h> 32789Sahrens #include <sys/zio.h> 33789Sahrens #include <sys/fs/zfs.h> 34789Sahrens 35789Sahrens /* 36789Sahrens * Virtual device vector for files. 37789Sahrens */ 38789Sahrens 39789Sahrens static int 405329Sgw25295 vdev_file_open_common(vdev_t *vd) 41789Sahrens { 42789Sahrens vdev_file_t *vf; 43789Sahrens vnode_t *vp; 44789Sahrens int error; 45789Sahrens 46789Sahrens /* 47789Sahrens * We must have a pathname, and it must be absolute. 48789Sahrens */ 49789Sahrens if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { 50789Sahrens vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 51789Sahrens return (EINVAL); 52789Sahrens } 53789Sahrens 54789Sahrens vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP); 55789Sahrens 56789Sahrens /* 57789Sahrens * We always open the files from the root of the global zone, even if 58789Sahrens * we're in a local zone. If the user has gotten to this point, the 59789Sahrens * administrator has already decided that the pool should be available 60789Sahrens * to local zone users, so the underlying devices should be as well. 61789Sahrens */ 62789Sahrens ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/'); 635329Sgw25295 error = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE, 645331Samw spa_mode | FOFFMAX, 0, &vp, 0, 0, rootdir, -1); 65789Sahrens 66789Sahrens if (error) { 67789Sahrens vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; 68789Sahrens return (error); 69789Sahrens } 70789Sahrens 71789Sahrens vf->vf_vnode = vp; 72789Sahrens 73789Sahrens #ifdef _KERNEL 74789Sahrens /* 75789Sahrens * Make sure it's a regular file. 76789Sahrens */ 77789Sahrens if (vp->v_type != VREG) { 78789Sahrens vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; 79789Sahrens return (ENODEV); 80789Sahrens } 81789Sahrens #endif 82789Sahrens 835329Sgw25295 return (0); 845329Sgw25295 } 855329Sgw25295 865329Sgw25295 static int 875329Sgw25295 vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) 885329Sgw25295 { 895329Sgw25295 vdev_file_t *vf; 905329Sgw25295 vattr_t vattr; 915329Sgw25295 int error; 925329Sgw25295 935329Sgw25295 if ((error = vdev_file_open_common(vd)) != 0) 945329Sgw25295 return (error); 955329Sgw25295 965329Sgw25295 vf = vd->vdev_tsd; 975329Sgw25295 98789Sahrens /* 99789Sahrens * Determine the physical size of the file. 100789Sahrens */ 101789Sahrens vattr.va_mask = AT_SIZE; 1025331Samw error = VOP_GETATTR(vf->vf_vnode, &vattr, 0, kcred, NULL); 103789Sahrens if (error) { 104789Sahrens vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; 105789Sahrens return (error); 106789Sahrens } 107789Sahrens 108789Sahrens *psize = vattr.va_size; 109789Sahrens *ashift = SPA_MINBLOCKSHIFT; 110789Sahrens 111789Sahrens return (0); 112789Sahrens } 113789Sahrens 114789Sahrens static void 115789Sahrens vdev_file_close(vdev_t *vd) 116789Sahrens { 117789Sahrens vdev_file_t *vf = vd->vdev_tsd; 118789Sahrens 119789Sahrens if (vf == NULL) 120789Sahrens return; 121789Sahrens 122789Sahrens if (vf->vf_vnode != NULL) { 1235331Samw (void) VOP_PUTPAGE(vf->vf_vnode, 0, 0, B_INVAL, kcred, NULL); 1245331Samw (void) VOP_CLOSE(vf->vf_vnode, spa_mode, 1, 0, kcred, NULL); 125789Sahrens VN_RELE(vf->vf_vnode); 126789Sahrens } 127789Sahrens 128789Sahrens kmem_free(vf, sizeof (vdev_file_t)); 129789Sahrens vd->vdev_tsd = NULL; 130789Sahrens } 131789Sahrens 1325329Sgw25295 static int 1335329Sgw25295 vdev_file_probe_io(vdev_t *vd, caddr_t data, size_t size, uint64_t offset, 1345329Sgw25295 enum uio_rw rw) 1355329Sgw25295 { 136*6523Sek110237 vdev_file_t *vf = vd ? vd->vdev_tsd : NULL; 1375329Sgw25295 ssize_t resid; 1385329Sgw25295 int error = 0; 1395329Sgw25295 1405329Sgw25295 if (vd == NULL || vf == NULL || vf->vf_vnode == NULL) 1415329Sgw25295 return (EINVAL); 1425329Sgw25295 1435329Sgw25295 ASSERT(rw == UIO_READ || rw == UIO_WRITE); 1445329Sgw25295 1455329Sgw25295 error = vn_rdwr(rw, vf->vf_vnode, data, size, offset, UIO_SYSSPACE, 1465329Sgw25295 0, RLIM64_INFINITY, kcred, &resid); 1475329Sgw25295 if (error || resid != 0) 1485329Sgw25295 return (EIO); 1495329Sgw25295 return (0); 1505329Sgw25295 } 1515329Sgw25295 1525369Sgw25295 /* 1535369Sgw25295 * Determine if the underlying device is accessible by reading and writing 1545369Sgw25295 * to a known location. We must be able to do this during syncing context 1555369Sgw25295 * and thus we cannot set the vdev state directly. 1565369Sgw25295 */ 1575329Sgw25295 static int 1585329Sgw25295 vdev_file_probe(vdev_t *vd) 1595329Sgw25295 { 1605329Sgw25295 vdev_t *nvd; 1615329Sgw25295 char *vl_boot; 1625329Sgw25295 uint64_t offset; 1635329Sgw25295 int l, error = 0, retries = 0; 1645329Sgw25295 1655329Sgw25295 if (vd == NULL) 1665329Sgw25295 return (EINVAL); 1675329Sgw25295 1685329Sgw25295 /* Hijack the current vdev */ 1695329Sgw25295 nvd = vd; 1705329Sgw25295 1715329Sgw25295 /* 1725329Sgw25295 * Pick a random label to rewrite. 1735329Sgw25295 */ 1745329Sgw25295 l = spa_get_random(VDEV_LABELS); 1755329Sgw25295 ASSERT(l < VDEV_LABELS); 1765329Sgw25295 1775329Sgw25295 offset = vdev_label_offset(vd->vdev_psize, l, 1785329Sgw25295 offsetof(vdev_label_t, vl_boot_header)); 1795329Sgw25295 1805329Sgw25295 vl_boot = kmem_alloc(VDEV_BOOT_HEADER_SIZE, KM_SLEEP); 1815329Sgw25295 1825329Sgw25295 while ((error = vdev_file_probe_io(nvd, vl_boot, VDEV_BOOT_HEADER_SIZE, 1835329Sgw25295 offset, UIO_READ)) != 0 && retries == 0) { 1845329Sgw25295 1855329Sgw25295 /* 1865329Sgw25295 * If we failed with the vdev that was passed in then 1875329Sgw25295 * try allocating a new one and try again. 1885329Sgw25295 */ 1895329Sgw25295 nvd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP); 1905329Sgw25295 if (vd->vdev_path) 1915329Sgw25295 nvd->vdev_path = spa_strdup(vd->vdev_path); 1925369Sgw25295 retries++; 1935369Sgw25295 1945329Sgw25295 error = vdev_file_open_common(nvd); 1955369Sgw25295 if (error) 1965329Sgw25295 break; 1975329Sgw25295 } 1985329Sgw25295 1995329Sgw25295 if ((spa_mode & FWRITE) && !error) { 2005329Sgw25295 error = vdev_file_probe_io(nvd, vl_boot, VDEV_BOOT_HEADER_SIZE, 2015329Sgw25295 offset, UIO_WRITE); 2025329Sgw25295 } 2035329Sgw25295 2045329Sgw25295 if (retries) { 2055329Sgw25295 vdev_file_close(nvd); 2065329Sgw25295 if (nvd->vdev_path) 2075329Sgw25295 spa_strfree(nvd->vdev_path); 2085329Sgw25295 kmem_free(nvd, sizeof (vdev_t)); 2095329Sgw25295 } 2105329Sgw25295 kmem_free(vl_boot, VDEV_BOOT_HEADER_SIZE); 2115329Sgw25295 2125329Sgw25295 if (!error) 2135329Sgw25295 vd->vdev_is_failing = B_FALSE; 2145329Sgw25295 2155329Sgw25295 return (error); 2165329Sgw25295 } 2175329Sgw25295 2185530Sbonwick static int 219789Sahrens vdev_file_io_start(zio_t *zio) 220789Sahrens { 221789Sahrens vdev_t *vd = zio->io_vd; 222789Sahrens vdev_file_t *vf = vd->vdev_tsd; 223789Sahrens ssize_t resid; 224789Sahrens int error; 225789Sahrens 226789Sahrens if (zio->io_type == ZIO_TYPE_IOCTL) { 227789Sahrens zio_vdev_io_bypass(zio); 228789Sahrens 229789Sahrens /* XXPOLICY */ 2305329Sgw25295 if (!vdev_readable(vd)) { 231789Sahrens zio->io_error = ENXIO; 2325530Sbonwick return (ZIO_PIPELINE_CONTINUE); 233789Sahrens } 234789Sahrens 235789Sahrens switch (zio->io_cmd) { 236789Sahrens case DKIOCFLUSHWRITECACHE: 237789Sahrens zio->io_error = VOP_FSYNC(vf->vf_vnode, FSYNC | FDSYNC, 2385331Samw kcred, NULL); 239789Sahrens dprintf("fsync(%s) = %d\n", vdev_description(vd), 240789Sahrens zio->io_error); 241789Sahrens break; 242789Sahrens default: 243789Sahrens zio->io_error = ENOTSUP; 244789Sahrens } 245789Sahrens 2465530Sbonwick return (ZIO_PIPELINE_CONTINUE); 247789Sahrens } 248789Sahrens 2493059Sahrens /* 2503059Sahrens * In the kernel, don't bother double-caching, but in userland, 2513059Sahrens * we want to test the vdev_cache code. 2523059Sahrens */ 2533059Sahrens #ifndef _KERNEL 254789Sahrens if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0) 2555530Sbonwick return (ZIO_PIPELINE_STOP); 2563059Sahrens #endif 257789Sahrens 258789Sahrens if ((zio = vdev_queue_io(zio)) == NULL) 2595530Sbonwick return (ZIO_PIPELINE_STOP); 260789Sahrens 261789Sahrens /* XXPOLICY */ 2625329Sgw25295 if (zio->io_type == ZIO_TYPE_WRITE) 2635329Sgw25295 error = vdev_writeable(vd) ? vdev_error_inject(vd, zio) : ENXIO; 2645329Sgw25295 else 2655329Sgw25295 error = vdev_readable(vd) ? vdev_error_inject(vd, zio) : ENXIO; 2665329Sgw25295 error = (vd->vdev_remove_wanted || vd->vdev_is_failing) ? ENXIO : error; 267789Sahrens if (error) { 268789Sahrens zio->io_error = error; 2695530Sbonwick zio_interrupt(zio); 2705530Sbonwick return (ZIO_PIPELINE_STOP); 271789Sahrens } 272789Sahrens 273789Sahrens zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ? 274789Sahrens UIO_READ : UIO_WRITE, vf->vf_vnode, zio->io_data, 275789Sahrens zio->io_size, zio->io_offset, UIO_SYSSPACE, 276789Sahrens 0, RLIM64_INFINITY, kcred, &resid); 277789Sahrens 278789Sahrens if (resid != 0 && zio->io_error == 0) 279789Sahrens zio->io_error = ENOSPC; 280789Sahrens 2815530Sbonwick zio_interrupt(zio); 2825530Sbonwick 2835530Sbonwick return (ZIO_PIPELINE_STOP); 284789Sahrens } 285789Sahrens 2865530Sbonwick static int 287789Sahrens vdev_file_io_done(zio_t *zio) 288789Sahrens { 2895530Sbonwick vdev_t *vd = zio->io_vd; 2905329Sgw25295 2915329Sgw25295 if (zio_injection_enabled && zio->io_error == 0) 2925530Sbonwick zio->io_error = zio_handle_device_injection(vd, EIO); 2935329Sgw25295 2945329Sgw25295 /* 2955369Sgw25295 * If an error has been encountered then attempt to probe the device 2965369Sgw25295 * to determine if it's still accessible. 2975329Sgw25295 */ 2985530Sbonwick if (zio->io_error == EIO && vdev_probe(vd) != 0) 2995530Sbonwick vd->vdev_is_failing = B_TRUE; 3005329Sgw25295 301789Sahrens vdev_queue_io_done(zio); 302789Sahrens 3033059Sahrens #ifndef _KERNEL 304789Sahrens if (zio->io_type == ZIO_TYPE_WRITE) 305789Sahrens vdev_cache_write(zio); 3063059Sahrens #endif 307789Sahrens 3085530Sbonwick return (ZIO_PIPELINE_CONTINUE); 309789Sahrens } 310789Sahrens 311789Sahrens vdev_ops_t vdev_file_ops = { 312789Sahrens vdev_file_open, 313789Sahrens vdev_file_close, 3145329Sgw25295 vdev_file_probe, 315789Sahrens vdev_default_asize, 316789Sahrens vdev_file_io_start, 317789Sahrens vdev_file_io_done, 318789Sahrens NULL, 319789Sahrens VDEV_TYPE_FILE, /* name of this vdev type */ 320789Sahrens B_TRUE /* leaf vdev */ 321789Sahrens }; 322789Sahrens 323789Sahrens /* 324789Sahrens * From userland we access disks just like files. 325789Sahrens */ 326789Sahrens #ifndef _KERNEL 327789Sahrens 328789Sahrens vdev_ops_t vdev_disk_ops = { 329789Sahrens vdev_file_open, 330789Sahrens vdev_file_close, 3315329Sgw25295 vdev_file_probe, 332789Sahrens vdev_default_asize, 333789Sahrens vdev_file_io_start, 334789Sahrens vdev_file_io_done, 335789Sahrens NULL, 336789Sahrens VDEV_TYPE_DISK, /* name of this vdev type */ 337789Sahrens B_TRUE /* leaf vdev */ 338789Sahrens }; 339789Sahrens 340789Sahrens #endif 341