1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2011, 2016 by Delphix. All rights reserved. 24 */ 25 26 #include <sys/zfs_context.h> 27 #include <sys/spa.h> 28 #include <sys/file.h> 29 #include <sys/vdev_file.h> 30 #include <sys/vdev_impl.h> 31 #include <sys/zio.h> 32 #include <sys/fs/zfs.h> 33 #include <sys/fm/fs/zfs.h> 34 #include <sys/abd.h> 35 #include <sys/stat.h> 36 37 /* 38 * Virtual device vector for files. 39 */ 40 41 static taskq_t *vdev_file_taskq; 42 43 void 44 vdev_file_init(void) 45 { 46 vdev_file_taskq = taskq_create("z_vdev_file", MAX(max_ncpus, 16), 47 minclsyspri, max_ncpus, INT_MAX, 0); 48 } 49 50 void 51 vdev_file_fini(void) 52 { 53 taskq_destroy(vdev_file_taskq); 54 } 55 56 static void 57 vdev_file_hold(vdev_t *vd) 58 { 59 ASSERT(vd->vdev_path != NULL); 60 } 61 62 static void 63 vdev_file_rele(vdev_t *vd) 64 { 65 ASSERT(vd->vdev_path != NULL); 66 } 67 68 static mode_t 69 vdev_file_open_mode(spa_mode_t spa_mode) 70 { 71 mode_t mode = 0; 72 73 if ((spa_mode & SPA_MODE_READ) && (spa_mode & SPA_MODE_WRITE)) { 74 mode = O_RDWR; 75 } else if (spa_mode & SPA_MODE_READ) { 76 mode = O_RDONLY; 77 } else if (spa_mode & SPA_MODE_WRITE) { 78 mode = O_WRONLY; 79 } 80 81 return (mode | O_LARGEFILE); 82 } 83 84 static int 85 vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, 86 uint64_t *logical_ashift, uint64_t *physical_ashift) 87 { 88 vdev_file_t *vf; 89 zfs_file_t *fp; 90 zfs_file_attr_t zfa; 91 int error; 92 93 /* 94 * Rotational optimizations only make sense on block devices. 95 */ 96 vd->vdev_nonrot = B_TRUE; 97 98 /* 99 * Allow TRIM on file based vdevs. This may not always be supported, 100 * since it depends on your kernel version and underlying filesystem 101 * type but it is always safe to attempt. 102 */ 103 vd->vdev_has_trim = B_TRUE; 104 105 /* 106 * Disable secure TRIM on file based vdevs. There is no way to 107 * request this behavior from the underlying filesystem. 108 */ 109 vd->vdev_has_securetrim = B_FALSE; 110 111 /* 112 * We must have a pathname, and it must be absolute. 113 */ 114 if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { 115 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 116 return (SET_ERROR(EINVAL)); 117 } 118 119 /* 120 * Reopen the device if it's not currently open. Otherwise, 121 * just update the physical size of the device. 122 */ 123 if (vd->vdev_tsd != NULL) { 124 ASSERT(vd->vdev_reopening); 125 vf = vd->vdev_tsd; 126 goto skip_open; 127 } 128 129 vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP); 130 131 /* 132 * We always open the files from the root of the global zone, even if 133 * we're in a local zone. If the user has gotten to this point, the 134 * administrator has already decided that the pool should be available 135 * to local zone users, so the underlying devices should be as well. 136 */ 137 ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/'); 138 139 error = zfs_file_open(vd->vdev_path, 140 vdev_file_open_mode(spa_mode(vd->vdev_spa)), 0, &fp); 141 if (error) { 142 vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; 143 return (error); 144 } 145 146 vf->vf_file = fp; 147 148 #ifdef _KERNEL 149 /* 150 * Make sure it's a regular file. 151 */ 152 if (zfs_file_getattr(fp, &zfa)) { 153 return (SET_ERROR(ENODEV)); 154 } 155 if (!S_ISREG(zfa.zfa_mode)) { 156 vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; 157 return (SET_ERROR(ENODEV)); 158 } 159 #endif 160 161 skip_open: 162 163 error = zfs_file_getattr(vf->vf_file, &zfa); 164 if (error) { 165 vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; 166 return (error); 167 } 168 169 *max_psize = *psize = zfa.zfa_size; 170 *logical_ashift = SPA_MINBLOCKSHIFT; 171 *physical_ashift = SPA_MINBLOCKSHIFT; 172 173 return (0); 174 } 175 176 static void 177 vdev_file_close(vdev_t *vd) 178 { 179 vdev_file_t *vf = vd->vdev_tsd; 180 181 if (vd->vdev_reopening || vf == NULL) 182 return; 183 184 if (vf->vf_file != NULL) { 185 zfs_file_close(vf->vf_file); 186 } 187 188 vd->vdev_delayed_close = B_FALSE; 189 kmem_free(vf, sizeof (vdev_file_t)); 190 vd->vdev_tsd = NULL; 191 } 192 193 /* 194 * Implements the interrupt side for file vdev types. This routine will be 195 * called when the I/O completes allowing us to transfer the I/O to the 196 * interrupt taskqs. For consistency, the code structure mimics disk vdev 197 * types. 198 */ 199 static void 200 vdev_file_io_intr(zio_t *zio) 201 { 202 zio_delay_interrupt(zio); 203 } 204 205 static void 206 vdev_file_io_strategy(void *arg) 207 { 208 zio_t *zio = arg; 209 vdev_t *vd = zio->io_vd; 210 vdev_file_t *vf; 211 void *buf; 212 ssize_t resid; 213 loff_t off; 214 ssize_t size; 215 int err; 216 217 off = zio->io_offset; 218 size = zio->io_size; 219 resid = 0; 220 221 vf = vd->vdev_tsd; 222 223 ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); 224 if (zio->io_type == ZIO_TYPE_READ) { 225 buf = abd_borrow_buf(zio->io_abd, zio->io_size); 226 err = zfs_file_pread(vf->vf_file, buf, size, off, &resid); 227 abd_return_buf_copy(zio->io_abd, buf, size); 228 } else { 229 buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size); 230 err = zfs_file_pwrite(vf->vf_file, buf, size, off, &resid); 231 abd_return_buf(zio->io_abd, buf, size); 232 } 233 if (resid != 0 && zio->io_error == 0) 234 zio->io_error = ENOSPC; 235 236 vdev_file_io_intr(zio); 237 } 238 239 static void 240 vdev_file_io_start(zio_t *zio) 241 { 242 vdev_t *vd = zio->io_vd; 243 vdev_file_t *vf = vd->vdev_tsd; 244 245 if (zio->io_type == ZIO_TYPE_IOCTL) { 246 /* XXPOLICY */ 247 if (!vdev_readable(vd)) { 248 zio->io_error = SET_ERROR(ENXIO); 249 zio_interrupt(zio); 250 return; 251 } 252 253 switch (zio->io_cmd) { 254 case DKIOCFLUSHWRITECACHE: 255 zio->io_error = zfs_file_fsync(vf->vf_file, 256 O_SYNC|O_DSYNC); 257 break; 258 default: 259 zio->io_error = SET_ERROR(ENOTSUP); 260 } 261 262 zio_execute(zio); 263 return; 264 } else if (zio->io_type == ZIO_TYPE_TRIM) { 265 #ifdef notyet 266 int mode = 0; 267 268 ASSERT3U(zio->io_size, !=, 0); 269 270 /* XXX FreeBSD has no fallocate routine in file ops */ 271 zio->io_error = zfs_file_fallocate(vf->vf_file, 272 mode, zio->io_offset, zio->io_size); 273 #endif 274 zio->io_error = SET_ERROR(ENOTSUP); 275 zio_execute(zio); 276 return; 277 } 278 ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); 279 zio->io_target_timestamp = zio_handle_io_delay(zio); 280 281 VERIFY3U(taskq_dispatch(vdev_file_taskq, vdev_file_io_strategy, zio, 282 TQ_SLEEP), !=, 0); 283 } 284 285 /* ARGSUSED */ 286 static void 287 vdev_file_io_done(zio_t *zio) 288 { 289 } 290 291 vdev_ops_t vdev_file_ops = { 292 vdev_file_open, 293 vdev_file_close, 294 vdev_default_asize, 295 vdev_file_io_start, 296 vdev_file_io_done, 297 NULL, 298 NULL, 299 vdev_file_hold, 300 vdev_file_rele, 301 NULL, 302 vdev_default_xlate, 303 VDEV_TYPE_FILE, /* name of this vdev type */ 304 B_TRUE /* leaf vdev */ 305 }; 306 307 /* 308 * From userland we access disks just like files. 309 */ 310 #ifndef _KERNEL 311 312 vdev_ops_t vdev_disk_ops = { 313 vdev_file_open, 314 vdev_file_close, 315 vdev_default_asize, 316 vdev_file_io_start, 317 vdev_file_io_done, 318 NULL, 319 NULL, 320 vdev_file_hold, 321 vdev_file_rele, 322 NULL, 323 vdev_default_xlate, 324 VDEV_TYPE_DISK, /* name of this vdev type */ 325 B_TRUE /* leaf vdev */ 326 }; 327 328 #endif 329