xref: /netbsd-src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_disk.c (revision 154bfe8e089c1a0a4e9ed8414f08d3da90949162)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
24  * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
25  * Copyright (c) 2013 Joyent, Inc.  All rights reserved.
26  */
27 
28 #include <sys/zfs_context.h>
29 #include <sys/spa.h>
30 #include <sys/refcount.h>
31 #include <sys/vdev_disk.h>
32 #include <sys/vdev_impl.h>
33 #include <sys/fs/zfs.h>
34 #include <sys/zio.h>
35 #include <sys/sunldi.h>
36 #include <sys/fm/fs/zfs.h>
37 #include <sys/disk.h>
38 #include <sys/dkio.h>
39 #include <sys/workqueue.h>
40 
41 #ifdef __NetBSD__
42 static int
43 geterror(struct buf *bp)
44 {
45 
46 	return (bp->b_error);
47 }
48 #endif
49 
50 /*
51  * Virtual device vector for disks.
52  */
53 
54 static void	vdev_disk_io_intr(buf_t *);
55 
56 static void
57 vdev_disk_alloc(vdev_t *vd)
58 {
59 	vdev_disk_t *dvd;
60 
61 	dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
62 
63 #ifdef illumos
64 	/*
65 	 * Create the LDI event callback list.
66 	 */
67 	list_create(&dvd->vd_ldi_cbs, sizeof (vdev_disk_ldi_cb_t),
68 	    offsetof(vdev_disk_ldi_cb_t, lcb_next));
69 #endif
70 }
71 
72 
73 static void
74 vdev_disk_free(vdev_t *vd)
75 {
76 	vdev_disk_t *dvd = vd->vdev_tsd;
77 #ifdef illumos
78 	vdev_disk_ldi_cb_t *lcb;
79 #endif
80 
81 	if (dvd == NULL)
82 		return;
83 
84 #ifdef illumos
85 	/*
86 	 * We have already closed the LDI handle. Clean up the LDI event
87 	 * callbacks and free vd->vdev_tsd.
88 	 */
89 	while ((lcb = list_head(&dvd->vd_ldi_cbs)) != NULL) {
90 		list_remove(&dvd->vd_ldi_cbs, lcb);
91 		(void) ldi_ev_remove_callbacks(lcb->lcb_id);
92 		kmem_free(lcb, sizeof (vdev_disk_ldi_cb_t));
93 	}
94 	list_destroy(&dvd->vd_ldi_cbs);
95 #endif
96 	kmem_free(dvd, sizeof (vdev_disk_t));
97 	vd->vdev_tsd = NULL;
98 }
99 
100 
101 /*
102  * It's not clear what these hold/rele functions are supposed to do.
103  */
104 static void
105 vdev_disk_hold(vdev_t *vd)
106 {
107 
108 	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
109 
110 }
111 
112 static void
113 vdev_disk_rele(vdev_t *vd)
114 {
115 
116 	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
117 
118 }
119 
120 static void
121 vdev_disk_flush(struct work *work, void *cookie)
122 {
123 	vdev_disk_t *dvd;
124 	int error, cmd;
125 	buf_t *bp;
126 	vnode_t *vp;
127 
128 	bp = (struct buf *)work;
129 	vp = bp->b_vp;
130 	dvd = cookie;
131 
132 	KASSERT(vp == dvd->vd_vp);
133 
134 	cmd = 1;
135 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
136 	error = VOP_IOCTL(vp, DIOCCACHESYNC, &cmd, FREAD|FWRITE, kcred);
137 	VOP_UNLOCK(vp, 0);
138 	bp->b_error = error;
139 	vdev_disk_io_intr(bp);
140 }
141 
142 static int
143 vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
144     uint64_t *ashift, uint64_t *pashift)
145 {
146 	spa_t *spa = vd->vdev_spa;
147 	vdev_disk_t *dvd;
148 	vnode_t *vp;
149 	int error, cmd;
150 	uint64_t numsecs;
151 	unsigned secsize;
152 	struct disk *pdk;
153 	struct dkwedge_info dkw;
154 	struct disk_sectoralign dsa;
155 
156 	/*
157 	 * We must have a pathname, and it must be absolute.
158 	 */
159 	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
160 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
161 		return (SET_ERROR(EINVAL));
162 	}
163 
164 	/*
165 	 * Reopen the device if it's not currently open. Otherwise,
166 	 * just update the physical size of the device.
167 	 */
168 	if (vd->vdev_tsd != NULL) {
169 		ASSERT(vd->vdev_reopening);
170 		dvd = vd->vdev_tsd;
171 		vp = dvd->vd_vp;
172 		KASSERT(vp != NULL);
173 		goto skip_open;
174 	}
175 
176 	/*
177 	 * Create vd->vdev_tsd.
178 	 */
179 	vdev_disk_alloc(vd);
180 	dvd = vd->vdev_tsd;
181 
182 	/*
183 	 * When opening a disk device, we want to preserve the user's original
184 	 * intent.  We always want to open the device by the path the user gave
185 	 * us, even if it is one of multiple paths to the save device.  But we
186 	 * also want to be able to survive disks being removed/recabled.
187 	 * Therefore the sequence of opening devices is:
188 	 *
189 	 * 1. Try opening the device by path.  For legacy pools without the
190 	 *    'whole_disk' property, attempt to fix the path by appending 's0'.
191 	 *
192 	 * 2. If the devid of the device matches the stored value, return
193 	 *    success.
194 	 *
195 	 * 3. Otherwise, the device may have moved.  Try opening the device
196 	 *    by the devid instead.
197 	 */
198 	if (vd->vdev_devid != NULL) {
199 		/* XXXNETBSD wedges */
200 #ifdef illumos
201 		if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid,
202 		    &dvd->vd_minor) != 0) {
203 			vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
204 			return (SET_ERROR(EINVAL));
205 		}
206 #endif
207 	}
208 
209 	error = EINVAL;		/* presume failure */
210 
211 	error = vn_open(vd->vdev_path, UIO_SYSSPACE, FREAD|FWRITE, 0,
212 	    &vp, CRCREAT, 0);
213 	if (error != 0) {
214 		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
215 		return (SET_ERROR(error));
216 	}
217 	if (vp->v_type != VBLK) {
218 		vrele(vp);
219 		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
220 		return (SET_ERROR(EINVAL));
221 	}
222 
223 	pdk = NULL;
224 	if (getdiskinfo(vp, &dkw) == 0)
225 		pdk = disk_find(dkw.dkw_devname);
226 
227 	/* XXXNETBSD Once tls-maxphys gets merged this block becomes:
228 		dvd->vd_maxphys = (pdk ? disk_maxphys(pdk) : MACHINE_MAXPHYS);
229 	*/
230 	{
231 		struct buf buf = {
232 			.b_dev = vp->v_rdev,
233 			.b_bcount = MAXPHYS,
234 		};
235 		if (pdk && pdk->dk_driver && pdk->dk_driver->d_minphys)
236 			(*pdk->dk_driver->d_minphys)(&buf);
237 		dvd->vd_maxphys = buf.b_bcount;
238 	}
239 
240 	/*
241 	 * XXXNETBSD Compare the devid to the stored value.
242 	 */
243 
244 	/*
245 	 * Create a workqueue to process cache-flushes concurrently.
246 	 */
247 	error = workqueue_create(&dvd->vd_wq, "vdevsync",
248 	    vdev_disk_flush, dvd, PRI_NONE, IPL_NONE, WQ_MPSAFE);
249 	if (error != 0) {
250 		vrele(vp);
251 		return (SET_ERROR(error));
252 	}
253 
254 	dvd->vd_vp = vp;
255 
256 skip_open:
257 	error = getdisksize(vp, &numsecs, &secsize);
258 	if (error != 0) {
259 		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
260 		return (SET_ERROR(error));
261 	}
262 
263 	*psize = numsecs * secsize;
264 	*max_psize = *psize;
265 
266 	*ashift = highbit(MAX(secsize, SPA_MINBLOCKSIZE)) - 1;
267 
268 	/*
269 	 * Try to determine whether the disk has a preferred physical
270 	 * sector size even if it can emulate a smaller logical sector
271 	 * size with r/m/w cycles, e.g. a disk with 4096-byte sectors
272 	 * that for compatibility claims to support 512-byte ones.
273 	 */
274 	if (VOP_IOCTL(vp, DIOCGSECTORALIGN, &dsa, FREAD, NOCRED) == 0) {
275 		*pashift = highbit(dsa.dsa_alignment * secsize) - 1;
276 		if (dsa.dsa_firstaligned % dsa.dsa_alignment)
277 			printf("ZFS WARNING: vdev %s: sectors are misaligned"
278 			    " (alignment=%"PRIu32", firstaligned=%"PRIu32")\n",
279 			    vd->vdev_path,
280 			    dsa.dsa_alignment, dsa.dsa_firstaligned);
281 	} else {
282 		*pashift = *ashift;
283 	}
284 
285 	vd->vdev_wholedisk = 0;
286 	if (getdiskinfo(vp, &dkw) != 0 &&
287 	    dkw.dkw_offset == 0 && dkw.dkw_size == numsecs)
288 		vd->vdev_wholedisk = 1,
289 
290 	/*
291 	 * Clear the nowritecache bit, so that on a vdev_reopen() we will
292 	 * try again.
293 	 */
294 	vd->vdev_nowritecache = B_FALSE;
295 
296 	return (0);
297 }
298 
299 static void
300 vdev_disk_close(vdev_t *vd)
301 {
302 	vdev_disk_t *dvd = vd->vdev_tsd;
303 
304 	if (vd->vdev_reopening || dvd == NULL)
305 		return;
306 
307 #ifdef illumos
308 	if (dvd->vd_minor != NULL) {
309 		ddi_devid_str_free(dvd->vd_minor);
310 		dvd->vd_minor = NULL;
311 	}
312 
313 	if (dvd->vd_devid != NULL) {
314 		ddi_devid_free(dvd->vd_devid);
315 		dvd->vd_devid = NULL;
316 	}
317 
318 	if (dvd->vd_lh != NULL) {
319 		(void) ldi_close(dvd->vd_lh, spa_mode(vd->vdev_spa), kcred);
320 		dvd->vd_lh = NULL;
321 	}
322 #endif
323 
324 #ifdef __NetBSD__
325 	if (dvd->vd_vp != NULL) {
326 		vn_close(dvd->vd_vp, FREAD|FWRITE, kcred);
327 		dvd->vd_vp = NULL;
328 	}
329 	if (dvd->vd_wq != NULL) {
330 		workqueue_destroy(dvd->vd_wq);
331 		dvd->vd_wq = NULL;
332 	}
333 #endif
334 
335 	vd->vdev_delayed_close = B_FALSE;
336 #ifdef illumos
337 	/*
338 	 * If we closed the LDI handle due to an offline notify from LDI,
339 	 * don't free vd->vdev_tsd or unregister the callbacks here;
340 	 * the offline finalize callback or a reopen will take care of it.
341 	 */
342 	if (dvd->vd_ldi_offline)
343 		return;
344 #endif
345 
346 	vdev_disk_free(vd);
347 }
348 
349 int
350 vdev_disk_physio(vdev_t *vd, caddr_t data,
351     size_t size, uint64_t offset, int flags, boolean_t isdump)
352 {
353 #ifdef illumos
354 	vdev_disk_t *dvd = vd->vdev_tsd;
355 
356 	/*
357 	 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
358 	 * Nothing to be done here but return failure.
359 	 */
360 	if (dvd == NULL || (dvd->vd_ldi_offline && dvd->vd_lh == NULL))
361 		return (EIO);
362 
363 	ASSERT(vd->vdev_ops == &vdev_disk_ops);
364 
365 	/*
366 	 * If in the context of an active crash dump, use the ldi_dump(9F)
367 	 * call instead of ldi_strategy(9F) as usual.
368 	 */
369 	if (isdump) {
370 		ASSERT3P(dvd, !=, NULL);
371 		return (ldi_dump(dvd->vd_lh, data, lbtodb(offset),
372 		    lbtodb(size)));
373 	}
374 
375 	return (vdev_disk_ldi_physio(dvd->vd_lh, data, size, offset, flags));
376 #endif
377 #ifdef __NetBSD__
378 	return (EIO);
379 #endif
380 }
381 
382 static void
383 vdev_disk_io_intr(buf_t *bp)
384 {
385 	zio_t *zio = bp->b_private;
386 
387 	/*
388 	 * The rest of the zio stack only deals with EIO, ECKSUM, and ENXIO.
389 	 * Rather than teach the rest of the stack about other error
390 	 * possibilities (EFAULT, etc), we normalize the error value here.
391 	 */
392 	zio->io_error = (geterror(bp) != 0 ? SET_ERROR(EIO) : 0);
393 
394 	if (zio->io_error == 0 && bp->b_resid != 0)
395 		zio->io_error = SET_ERROR(EIO);
396 
397 	putiobuf(bp);
398 	zio_delay_interrupt(zio);
399 }
400 
401 static void
402 vdev_disk_ioctl_free(zio_t *zio)
403 {
404 	kmem_free(zio->io_vsd, sizeof (struct dk_callback));
405 }
406 
407 static const zio_vsd_ops_t vdev_disk_vsd_ops = {
408 	vdev_disk_ioctl_free,
409 	zio_vsd_default_cksum_report
410 };
411 
412 static void
413 vdev_disk_ioctl_done(void *zio_arg, int error)
414 {
415 	zio_t *zio = zio_arg;
416 
417 	zio->io_error = error;
418 
419 	zio_interrupt(zio);
420 }
421 
422 static void
423 vdev_disk_io_start(zio_t *zio)
424 {
425 	vdev_t *vd = zio->io_vd;
426 	vdev_disk_t *dvd = vd->vdev_tsd;
427 	vnode_t *vp;
428 	buf_t *bp, *nbp;
429 	int error, size, off, resid;
430 
431 	/*
432 	 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
433 	 * Nothing to be done here but return failure.
434 	 */
435 #ifdef illumos
436 	if (dvd == NULL || (dvd->vd_ldi_offline && dvd->vd_lh == NULL)) {
437 		zio->io_error = SET_ERROR(ENXIO);
438 		zio_interrupt(zio);
439 		return;
440 	}
441 #endif
442 #ifdef __NetBSD__
443 	if (dvd == NULL) {
444 		zio->io_error = SET_ERROR(ENXIO);
445 		zio_interrupt(zio);
446 		return;
447 	}
448 	ASSERT3U(dvd->vd_maxphys, >, 0);
449 	vp = dvd->vd_vp;
450 #endif
451 
452 	if (zio->io_type == ZIO_TYPE_IOCTL) {
453 		/* XXPOLICY */
454 		if (!vdev_readable(vd)) {
455 			zio->io_error = SET_ERROR(ENXIO);
456 			zio_interrupt(zio);
457 			return;
458 		}
459 
460 		switch (zio->io_cmd) {
461 		case DKIOCFLUSHWRITECACHE:
462 
463 			if (zfs_nocacheflush)
464 				break;
465 
466 			if (vd->vdev_nowritecache) {
467 				zio->io_error = ENOTSUP;
468 				break;
469 			}
470 
471 			bp = getiobuf(vp, true);
472 			bp->b_private = zio;
473 			workqueue_enqueue(dvd->vd_wq, &bp->b_work, NULL);
474 			return;
475 
476 		default:
477 			zio->io_error = SET_ERROR(ENOTSUP);
478 			break;
479 		}
480 
481 		zio_execute(zio);
482 		return;
483 	}
484 
485 	bp = getiobuf(vp, true);
486 	bp->b_flags = (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE);
487 	bp->b_cflags = BC_BUSY | BC_NOCACHE;
488 	bp->b_data = zio->io_data;
489 	bp->b_blkno = btodb(zio->io_offset);
490 	bp->b_bcount = zio->io_size;
491 	bp->b_resid = zio->io_size;
492 	bp->b_iodone = vdev_disk_io_intr;
493 	bp->b_private = zio;
494 
495 	if (!(bp->b_flags & B_READ)) {
496 		mutex_enter(vp->v_interlock);
497 		vp->v_numoutput++;
498 		mutex_exit(vp->v_interlock);
499 	}
500 
501 	if (bp->b_bcount <= dvd->vd_maxphys) {
502 		/* We can do this I/O in one pass. */
503 		(void)VOP_STRATEGY(vp, bp);
504 	} else {
505 		/*
506 		 * The I/O is larger than we can process in one pass.
507 		 * Split it into smaller pieces.
508 		 */
509 		resid = zio->io_size;
510 		off = 0;
511 		while (resid != 0) {
512 			size = uimin(resid, dvd->vd_maxphys);
513 			nbp = getiobuf(vp, true);
514 			nbp->b_blkno = btodb(zio->io_offset + off);
515 			/* Below call increments v_numoutput. */
516 			nestiobuf_setup(bp, nbp, off, size);
517 			(void)VOP_STRATEGY(vp, nbp);
518 			resid -= size;
519 			off += size;
520 		}
521 	}
522 }
523 
524 static void
525 vdev_disk_io_done(zio_t *zio)
526 {
527 #ifdef illumos
528 	vdev_t *vd = zio->io_vd;
529 
530 	/*
531 	 * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if
532 	 * the device has been removed.  If this is the case, then we trigger an
533 	 * asynchronous removal of the device. Otherwise, probe the device and
534 	 * make sure it's still accessible.
535 	 */
536 	if (zio->io_error == EIO && !vd->vdev_remove_wanted) {
537 		vdev_disk_t *dvd = vd->vdev_tsd;
538 		int state = DKIO_NONE;
539 
540 		if (ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state,
541 		    FKIOCTL, kcred, NULL) == 0 && state != DKIO_INSERTED) {
542 			/*
543 			 * We post the resource as soon as possible, instead of
544 			 * when the async removal actually happens, because the
545 			 * DE is using this information to discard previous I/O
546 			 * errors.
547 			 */
548 			zfs_post_remove(zio->io_spa, vd);
549 			vd->vdev_remove_wanted = B_TRUE;
550 			spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
551 		} else if (!vd->vdev_delayed_close) {
552 			vd->vdev_delayed_close = B_TRUE;
553 		}
554 	}
555 #endif
556 }
557 
558 vdev_ops_t vdev_disk_ops = {
559 	vdev_disk_open,
560 	vdev_disk_close,
561 	vdev_default_asize,
562 	vdev_disk_io_start,
563 	vdev_disk_io_done,
564 	NULL,
565 	vdev_disk_hold,
566 	vdev_disk_rele,
567 	VDEV_TYPE_DISK,		/* name of this vdev type */
568 	B_TRUE			/* leaf vdev */
569 };
570 
571 /*
572  * Given the root disk device devid or pathname, read the label from
573  * the device, and construct a configuration nvlist.
574  */
575 int
576 vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config)
577 {
578 #ifdef __NetBSD__
579 	return (ENOTSUP);
580 #else
581 	ldi_handle_t vd_lh;
582 	vdev_label_t *label;
583 	uint64_t s, size;
584 	int l;
585 	ddi_devid_t tmpdevid;
586 	int error = -1;
587 	char *minor_name;
588 
589 	/*
590 	 * Read the device label and build the nvlist.
591 	 */
592 	if (devid != NULL && ddi_devid_str_decode(devid, &tmpdevid,
593 	    &minor_name) == 0) {
594 		error = ldi_open_by_devid(tmpdevid, minor_name,
595 		    FREAD, kcred, &vd_lh, zfs_li);
596 		ddi_devid_free(tmpdevid);
597 		ddi_devid_str_free(minor_name);
598 	}
599 
600 	if (error && (error = ldi_open_by_name(devpath, FREAD, kcred, &vd_lh,
601 	    zfs_li)))
602 		return (error);
603 
604 	if (ldi_get_size(vd_lh, &s)) {
605 		(void) ldi_close(vd_lh, FREAD, kcred);
606 		return (SET_ERROR(EIO));
607 	}
608 
609 	size = P2ALIGN_TYPED(s, sizeof (vdev_label_t), uint64_t);
610 	label = kmem_alloc(sizeof (vdev_label_t), KM_SLEEP);
611 
612 	*config = NULL;
613 	for (l = 0; l < VDEV_LABELS; l++) {
614 		uint64_t offset, state, txg = 0;
615 
616 		/* read vdev label */
617 		offset = vdev_label_offset(size, l, 0);
618 		if (vdev_disk_ldi_physio(vd_lh, (caddr_t)label,
619 		    VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, B_READ) != 0)
620 			continue;
621 
622 		if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist,
623 		    sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0) {
624 			*config = NULL;
625 			continue;
626 		}
627 
628 		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
629 		    &state) != 0 || state >= POOL_STATE_DESTROYED) {
630 			nvlist_free(*config);
631 			*config = NULL;
632 			continue;
633 		}
634 
635 		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
636 		    &txg) != 0 || txg == 0) {
637 			nvlist_free(*config);
638 			*config = NULL;
639 			continue;
640 		}
641 
642 		break;
643 	}
644 
645 	kmem_free(label, sizeof (vdev_label_t));
646 	(void) ldi_close(vd_lh, FREAD, kcred);
647 	if (*config == NULL)
648 		error = SET_ERROR(EIDRM);
649 
650 	return (error);
651 #endif
652 }
653