xref: /netbsd-src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_disk.c (revision e6c7e151de239c49d2e38720a061ed9d1fa99309)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
 * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
 * Copyright (c) 2013 Joyent, Inc.  All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/refcount.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_impl.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <sys/sunldi.h>
#include <sys/fm/fs/zfs.h>
#include <sys/disk.h>
#include <sys/dkio.h>
#include <sys/workqueue.h>

#ifdef __NetBSD__
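/*
 * illumos provides geterror(9F) to read a buf's error status; NetBSD
 * keeps it directly in b_error, so supply a minimal local equivalent
 * for the shared code below.
 */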
static int
geterror(struct buf *bp)
{

	return (bp->b_error);
}
#endif

/*
 * Virtual device vector for disks.
 */

static void	vdev_disk_io_intr(buf_t *);

static void
vdev_disk_alloc(vdev_t *vd)
{
	vdev_disk_t *dvd;

	dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);

#ifdef illumos
	/*
	 * Create the LDI event callback list.
	 */
	list_create(&dvd->vd_ldi_cbs, sizeof (vdev_disk_ldi_cb_t),
	    offsetof(vdev_disk_ldi_cb_t, lcb_next));
#endif
}


static void
vdev_disk_free(vdev_t *vd)
{
	vdev_disk_t *dvd = vd->vdev_tsd;
#ifdef illumos
	vdev_disk_ldi_cb_t *lcb;
#endif

	if (dvd == NULL)
		return;

#ifdef illumos
	/*
	 * We have already closed the LDI handle. Clean up the LDI event
	 * callbacks and free vd->vdev_tsd.
	 */
	while ((lcb = list_head(&dvd->vd_ldi_cbs)) != NULL) {
		list_remove(&dvd->vd_ldi_cbs, lcb);
		(void) ldi_ev_remove_callbacks(lcb->lcb_id);
		kmem_free(lcb, sizeof (vdev_disk_ldi_cb_t));
	}
	list_destroy(&dvd->vd_ldi_cbs);
#endif
	kmem_free(dvd, sizeof (vdev_disk_t));
	vd->vdev_tsd = NULL;
}


/*
 * It's not clear what these hold/rele functions are supposed to do.
 */
static void
vdev_disk_hold(vdev_t *vd)
{

	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

}

static void
vdev_disk_rele(vdev_t *vd)
{

	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

}

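/*
 * Workqueue handler for cache flushes.  DKIOCFLUSHWRITECACHE zios are
 * handed to this queue by vdev_disk_io_start() so that the
 * DIOCCACHESYNC ioctl, which may sleep, runs in thread context; the
 * result is reported back to ZFS via vdev_disk_io_intr().
 */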
static void
vdev_disk_flush(struct work *work, void *cookie)
{
	vdev_disk_t *dvd;
	int error, cmd;
	buf_t *bp;
	vnode_t *vp;

	bp = (struct buf *)work;
	vp = bp->b_vp;
	dvd = cookie;

	KASSERT(vp == dvd->vd_vp);

	cmd = 1;
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	error = VOP_IOCTL(vp, DIOCCACHESYNC, &cmd, FREAD|FWRITE, kcred);
	VOP_UNLOCK(vp, 0);
	bp->b_error = error;
	vdev_disk_io_intr(bp);
}

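/*
 * Open the disk device backing this vdev and report its geometry:
 * *psize and *max_psize in bytes, *ashift as log2 of the logical
 * sector size, and *pashift as log2 of the preferred physical sector
 * size.
 */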
static int
vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
    uint64_t *ashift, uint64_t *pashift)
{
	spa_t *spa = vd->vdev_spa;
	vdev_disk_t *dvd;
	vnode_t *vp;
	int error, cmd;
	uint64_t numsecs;
	unsigned secsize;
	struct disk *pdk;
	struct dkwedge_info dkw;
	struct disk_sectoralign dsa;

	/*
	 * We must have a pathname, and it must be absolute.
	 */
	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Reopen the device if it's not currently open. Otherwise,
	 * just update the physical size of the device.
	 */
	if (vd->vdev_tsd != NULL) {
		ASSERT(vd->vdev_reopening);
		dvd = vd->vdev_tsd;
		vp = dvd->vd_vp;
		KASSERT(vp != NULL);
		goto skip_open;
	}

	/*
	 * Create vd->vdev_tsd.
	 */
	vdev_disk_alloc(vd);
	dvd = vd->vdev_tsd;

	/*
	 * When opening a disk device, we want to preserve the user's original
	 * intent.  We always want to open the device by the path the user gave
	 * us, even if it is one of multiple paths to the same device.  But we
	 * also want to be able to survive disks being removed/recabled.
	 * Therefore the sequence of opening devices is:
	 *
	 * 1. Try opening the device by path.  For legacy pools without the
	 *    'whole_disk' property, attempt to fix the path by appending 's0'.
	 *
	 * 2. If the devid of the device matches the stored value, return
	 *    success.
	 *
	 * 3. Otherwise, the device may have moved.  Try opening the device
	 *    by the devid instead.
	 */
	if (vd->vdev_devid != NULL) {
		/* XXXNETBSD wedges */
#ifdef illumos
		if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid,
		    &dvd->vd_minor) != 0) {
			vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
			return (SET_ERROR(EINVAL));
		}
#endif
	}

	error = EINVAL;		/* presume failure */

	error = vn_open(vd->vdev_path, UIO_SYSSPACE, FREAD|FWRITE, 0,
	    &vp, CRCREAT, 0);
	if (error != 0) {
		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
		return (SET_ERROR(error));
	}
	if (vp->v_type != VBLK) {
		vrele(vp);
		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
		return (SET_ERROR(EINVAL));
	}

	pdk = NULL;
	if (getdiskinfo(vp, &dkw) == 0)
		pdk = disk_find(dkw.dkw_parent);

	/* XXXNETBSD Once tls-maxphys gets merged this block becomes:
		dvd->vd_maxphys = (pdk ? disk_maxphys(pdk) : MACHINE_MAXPHYS);
	*/
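	/*
	 * Start from MAXPHYS and let the driver's minphys routine clamp
	 * b_bcount down to its own transfer limit, if any; I/Os larger
	 * than the result are split up in vdev_disk_io_start().
	 */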
	{
		struct buf buf = { .b_bcount = MAXPHYS };
		if (pdk && pdk->dk_driver && pdk->dk_driver->d_minphys)
			(*pdk->dk_driver->d_minphys)(&buf);
		dvd->vd_maxphys = buf.b_bcount;
	}

	/*
	 * XXXNETBSD Compare the devid to the stored value.
	 */

	/*
	 * Create a workqueue to process cache-flushes concurrently.
	 */
	error = workqueue_create(&dvd->vd_wq, "vdevsync",
	    vdev_disk_flush, dvd, PRI_NONE, IPL_NONE, WQ_MPSAFE);
	if (error != 0) {
		vrele(vp);
		return (SET_ERROR(error));
	}

	dvd->vd_vp = vp;

skip_open:
	error = getdisksize(vp, &numsecs, &secsize);
	if (error != 0) {
		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
		return (SET_ERROR(error));
	}

	*psize = numsecs * secsize;
	*max_psize = *psize;

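	/*
	 * ashift is log2 of the logical sector size; highbit() returns
	 * the 1-based index of the most significant set bit, so e.g. a
	 * 512-byte sector size yields an ashift of 9.
	 */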
	*ashift = highbit(MAX(secsize, SPA_MINBLOCKSIZE)) - 1;

	/*
	 * Try to determine whether the disk has a preferred physical
	 * sector size even if it can emulate a smaller logical sector
	 * size with r/m/w cycles, e.g. a disk with 4096-byte sectors
	 * that for compatibility claims to support 512-byte ones.
	 */
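	/*
	 * dsa_alignment and dsa_firstaligned are expressed in logical
	 * sectors, hence the multiplication by secsize to obtain the
	 * physical sector size in bytes.
	 */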
	if (VOP_IOCTL(vp, DIOCGSECTORALIGN, &dsa, FREAD, NOCRED) == 0) {
		*pashift = highbit(dsa.dsa_alignment * secsize) - 1;
		if (dsa.dsa_firstaligned % dsa.dsa_alignment)
			printf("ZFS WARNING: vdev %s: sectors are misaligned"
			    " (alignment=%"PRIu32", firstaligned=%"PRIu32")\n",
			    vd->vdev_path,
			    dsa.dsa_alignment, dsa.dsa_firstaligned);
	} else {
		*pashift = *ashift;
	}

	vd->vdev_wholedisk = 0;
	if (getdiskinfo(vp, &dkw) == 0 &&
	    dkw.dkw_offset == 0 && dkw.dkw_size == numsecs)
		vd->vdev_wholedisk = 1;

	/*
	 * Clear the nowritecache bit, so that on a vdev_reopen() we will
	 * try again.
	 */
	vd->vdev_nowritecache = B_FALSE;

	return (0);
}

static void
vdev_disk_close(vdev_t *vd)
{
	vdev_disk_t *dvd = vd->vdev_tsd;

	if (vd->vdev_reopening || dvd == NULL)
		return;

#ifdef illumos
	if (dvd->vd_minor != NULL) {
		ddi_devid_str_free(dvd->vd_minor);
		dvd->vd_minor = NULL;
	}

	if (dvd->vd_devid != NULL) {
		ddi_devid_free(dvd->vd_devid);
		dvd->vd_devid = NULL;
	}

	if (dvd->vd_lh != NULL) {
		(void) ldi_close(dvd->vd_lh, spa_mode(vd->vdev_spa), kcred);
		dvd->vd_lh = NULL;
	}
#endif

#ifdef __NetBSD__
	if (dvd->vd_vp != NULL) {
		vn_close(dvd->vd_vp, FREAD|FWRITE, kcred);
		dvd->vd_vp = NULL;
	}
	if (dvd->vd_wq != NULL) {
		workqueue_destroy(dvd->vd_wq);
		dvd->vd_wq = NULL;
	}
#endif

	vd->vdev_delayed_close = B_FALSE;
#ifdef illumos
	/*
	 * If we closed the LDI handle due to an offline notify from LDI,
	 * don't free vd->vdev_tsd or unregister the callbacks here;
	 * the offline finalize callback or a reopen will take care of it.
	 */
	if (dvd->vd_ldi_offline)
		return;
#endif

	vdev_disk_free(vd);
}

int
vdev_disk_physio(vdev_t *vd, caddr_t data,
    size_t size, uint64_t offset, int flags, boolean_t isdump)
{
#ifdef illumos
	vdev_disk_t *dvd = vd->vdev_tsd;

	/*
	 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
	 * Nothing to be done here but return failure.
	 */
	if (dvd == NULL || (dvd->vd_ldi_offline && dvd->vd_lh == NULL))
		return (EIO);

	ASSERT(vd->vdev_ops == &vdev_disk_ops);

	/*
	 * If in the context of an active crash dump, use the ldi_dump(9F)
	 * call instead of ldi_strategy(9F) as usual.
	 */
	if (isdump) {
		ASSERT3P(dvd, !=, NULL);
		return (ldi_dump(dvd->vd_lh, data, lbtodb(offset),
		    lbtodb(size)));
	}

	return (vdev_disk_ldi_physio(dvd->vd_lh, data, size, offset, flags));
#endif
#ifdef __NetBSD__
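	/*
	 * XXXNETBSD There is no crash-dump path here; the ldi_dump()
	 * branch above is illumos-only, so simply fail the request.
	 */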
	return (EIO);
#endif
}

static void
vdev_disk_io_intr(buf_t *bp)
{
	zio_t *zio = bp->b_private;

	/*
	 * The rest of the zio stack only deals with EIO, ECKSUM, and ENXIO.
	 * Rather than teach the rest of the stack about other error
	 * possibilities (EFAULT, etc), we normalize the error value here.
	 */
	zio->io_error = (geterror(bp) != 0 ? SET_ERROR(EIO) : 0);

	if (zio->io_error == 0 && bp->b_resid != 0)
		zio->io_error = SET_ERROR(EIO);

	putiobuf(bp);
	zio_delay_interrupt(zio);
}

static void
vdev_disk_ioctl_free(zio_t *zio)
{
	kmem_free(zio->io_vsd, sizeof (struct dk_callback));
}

static const zio_vsd_ops_t vdev_disk_vsd_ops = {
	vdev_disk_ioctl_free,
	zio_vsd_default_cksum_report
};

static void
vdev_disk_ioctl_done(void *zio_arg, int error)
{
	zio_t *zio = zio_arg;

	zio->io_error = error;

	zio_interrupt(zio);
}

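/*
 * Start an I/O on behalf of ZFS: ioctl zios (currently only cache
 * flushes) are pushed onto the flush workqueue, while reads and writes
 * are translated into struct buf requests and issued via
 * VOP_STRATEGY(), split into vd_maxphys-sized chunks when necessary.
 */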
static void
vdev_disk_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_disk_t *dvd = vd->vdev_tsd;
	vnode_t *vp;
	buf_t *bp, *nbp;
	int error, size, off, resid;

	/*
	 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
	 * Nothing to be done here but return failure.
	 */
#ifdef illumos
	if (dvd == NULL || (dvd->vd_ldi_offline && dvd->vd_lh == NULL)) {
		zio->io_error = SET_ERROR(ENXIO);
		zio_interrupt(zio);
		return;
	}
#endif
#ifdef __NetBSD__
	if (dvd == NULL) {
		zio->io_error = SET_ERROR(ENXIO);
		zio_interrupt(zio);
		return;
	}
	ASSERT3U(dvd->vd_maxphys, >, 0);
	vp = dvd->vd_vp;
#endif

	if (zio->io_type == ZIO_TYPE_IOCTL) {
		/* XXPOLICY */
		if (!vdev_readable(vd)) {
			zio->io_error = SET_ERROR(ENXIO);
			zio_interrupt(zio);
			return;
		}

		switch (zio->io_cmd) {
		case DKIOCFLUSHWRITECACHE:

			if (zfs_nocacheflush)
				break;

			if (vd->vdev_nowritecache) {
				zio->io_error = ENOTSUP;
				break;
			}

			bp = getiobuf(vp, true);
			bp->b_private = zio;
			workqueue_enqueue(dvd->vd_wq, &bp->b_work, NULL);
			return;

		default:
			zio->io_error = SET_ERROR(ENOTSUP);
			break;
		}

		zio_execute(zio);
		return;
	}

	bp = getiobuf(vp, true);
	bp->b_flags = (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE);
	bp->b_cflags = BC_BUSY | BC_NOCACHE;
	bp->b_data = zio->io_data;
	bp->b_blkno = btodb(zio->io_offset);
	bp->b_bcount = zio->io_size;
	bp->b_resid = zio->io_size;
	bp->b_iodone = vdev_disk_io_intr;
	bp->b_private = zio;

	if (!(bp->b_flags & B_READ)) {
		mutex_enter(vp->v_interlock);
		vp->v_numoutput++;
		mutex_exit(vp->v_interlock);
	}

	if (bp->b_bcount <= dvd->vd_maxphys) {
		/* We can do this I/O in one pass. */
		(void)VOP_STRATEGY(vp, bp);
	} else {
		/*
		 * The I/O is larger than we can process in one pass.
		 * Split it into smaller pieces.
		 */
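		/*
		 * nestiobuf_setup() ties each chunk to the master bp; as
		 * the chunks complete, the master's b_resid is drained and
		 * its b_iodone (vdev_disk_io_intr) runs once the last
		 * chunk finishes.
		 */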
		resid = zio->io_size;
		off = 0;
		while (resid != 0) {
			size = uimin(resid, dvd->vd_maxphys);
			nbp = getiobuf(vp, true);
			nbp->b_blkno = btodb(zio->io_offset + off);
			/* Below call increments v_numoutput. */
			nestiobuf_setup(bp, nbp, off, size);
			(void)VOP_STRATEGY(vp, nbp);
			resid -= size;
			off += size;
		}
	}
}

static void
vdev_disk_io_done(zio_t *zio)
{
#ifdef illumos
	vdev_t *vd = zio->io_vd;

	/*
	 * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if
	 * the device has been removed.  If this is the case, then we trigger an
	 * asynchronous removal of the device. Otherwise, probe the device and
	 * make sure it's still accessible.
	 */
	if (zio->io_error == EIO && !vd->vdev_remove_wanted) {
		vdev_disk_t *dvd = vd->vdev_tsd;
		int state = DKIO_NONE;

		if (ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state,
		    FKIOCTL, kcred, NULL) == 0 && state != DKIO_INSERTED) {
			/*
			 * We post the resource as soon as possible, instead of
			 * when the async removal actually happens, because the
			 * DE is using this information to discard previous I/O
			 * errors.
			 */
			zfs_post_remove(zio->io_spa, vd);
			vd->vdev_remove_wanted = B_TRUE;
			spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
		} else if (!vd->vdev_delayed_close) {
			vd->vdev_delayed_close = B_TRUE;
		}
	}
#endif
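	/* Nothing to do on NetBSD; the removal handling above is illumos-only. */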
}

vdev_ops_t vdev_disk_ops = {
	vdev_disk_open,
	vdev_disk_close,
	vdev_default_asize,
	vdev_disk_io_start,
	vdev_disk_io_done,
	NULL,
	vdev_disk_hold,
	vdev_disk_rele,
	VDEV_TYPE_DISK,		/* name of this vdev type */
	B_TRUE			/* leaf vdev */
};

/*
 * Given the root disk device devid or pathname, read the label from
 * the device, and construct a configuration nvlist.
 */
int
vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config)
{
#ifdef __NetBSD__
	return (ENOTSUP);
#else
	ldi_handle_t vd_lh;
	vdev_label_t *label;
	uint64_t s, size;
	int l;
	ddi_devid_t tmpdevid;
	int error = -1;
	char *minor_name;

	/*
	 * Read the device label and build the nvlist.
	 */
	if (devid != NULL && ddi_devid_str_decode(devid, &tmpdevid,
	    &minor_name) == 0) {
		error = ldi_open_by_devid(tmpdevid, minor_name,
		    FREAD, kcred, &vd_lh, zfs_li);
		ddi_devid_free(tmpdevid);
		ddi_devid_str_free(minor_name);
	}

	if (error && (error = ldi_open_by_name(devpath, FREAD, kcred, &vd_lh,
	    zfs_li)))
		return (error);

	if (ldi_get_size(vd_lh, &s)) {
		(void) ldi_close(vd_lh, FREAD, kcred);
		return (SET_ERROR(EIO));
	}

	size = P2ALIGN_TYPED(s, sizeof (vdev_label_t), uint64_t);
	label = kmem_alloc(sizeof (vdev_label_t), KM_SLEEP);

	*config = NULL;
	for (l = 0; l < VDEV_LABELS; l++) {
		uint64_t offset, state, txg = 0;

		/* read vdev label */
		offset = vdev_label_offset(size, l, 0);
		if (vdev_disk_ldi_physio(vd_lh, (caddr_t)label,
		    VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, B_READ) != 0)
			continue;

		if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist,
		    sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0) {
			*config = NULL;
			continue;
		}

		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
		    &state) != 0 || state >= POOL_STATE_DESTROYED) {
			nvlist_free(*config);
			*config = NULL;
			continue;
		}

		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
		    &txg) != 0 || txg == 0) {
			nvlist_free(*config);
			*config = NULL;
			continue;
		}

		break;
	}

	kmem_free(label, sizeof (vdev_label_t));
	(void) ldi_close(vd_lh, FREAD, kcred);
	if (*config == NULL)
		error = SET_ERROR(EIDRM);

	return (error);
#endif
}
650