xref: /netbsd-src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_disk.c (revision 82d56013d7b633d116a93943de88e08335357a7c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
24  * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
25  * Copyright (c) 2013 Joyent, Inc.  All rights reserved.
26  */
27 
28 #include <sys/zfs_context.h>
29 #include <sys/spa.h>
30 #include <sys/refcount.h>
31 #include <sys/vdev_disk.h>
32 #include <sys/vdev_impl.h>
33 #include <sys/fs/zfs.h>
34 #include <sys/zio.h>
35 #include <sys/sunldi.h>
36 #include <sys/fm/fs/zfs.h>
37 #include <sys/disk.h>
38 #include <sys/dkio.h>
39 #include <sys/workqueue.h>
40 
#ifdef __NetBSD__
/*
 * Compatibility shim for the illumos/Solaris geterror(9F) routine:
 * return the error code recorded in a completed buf, or 0 if the I/O
 * succeeded.  On NetBSD the error lives directly in bp->b_error.
 */
static int
geterror(struct buf *bp)
{

	return (bp->b_error);
}
#endif
49 
50 /*
51  * Virtual device vector for disks.
52  */
53 
54 static void	vdev_disk_io_intr(buf_t *);
55 
56 static void
57 vdev_disk_alloc(vdev_t *vd)
58 {
59 	vdev_disk_t *dvd;
60 
61 	dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
62 
63 #ifdef illumos
64 	/*
65 	 * Create the LDI event callback list.
66 	 */
67 	list_create(&dvd->vd_ldi_cbs, sizeof (vdev_disk_ldi_cb_t),
68 	    offsetof(vdev_disk_ldi_cb_t, lcb_next));
69 #endif
70 }
71 
72 
73 static void
74 vdev_disk_free(vdev_t *vd)
75 {
76 	vdev_disk_t *dvd = vd->vdev_tsd;
77 #ifdef illumos
78 	vdev_disk_ldi_cb_t *lcb;
79 #endif
80 
81 	if (dvd == NULL)
82 		return;
83 
84 #ifdef illumos
85 	/*
86 	 * We have already closed the LDI handle. Clean up the LDI event
87 	 * callbacks and free vd->vdev_tsd.
88 	 */
89 	while ((lcb = list_head(&dvd->vd_ldi_cbs)) != NULL) {
90 		list_remove(&dvd->vd_ldi_cbs, lcb);
91 		(void) ldi_ev_remove_callbacks(lcb->lcb_id);
92 		kmem_free(lcb, sizeof (vdev_disk_ldi_cb_t));
93 	}
94 	list_destroy(&dvd->vd_ldi_cbs);
95 #endif
96 	kmem_free(dvd, sizeof (vdev_disk_t));
97 	vd->vdev_tsd = NULL;
98 }
99 
100 
101 /*
102  * It's not clear what these hold/rele functions are supposed to do.
103  */
104 static void
105 vdev_disk_hold(vdev_t *vd)
106 {
107 
108 	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
109 
110 }
111 
/*
 * Counterpart to vdev_disk_hold(); likewise only asserts that the
 * SCL_STATE config lock is write-held.
 */
static void
vdev_disk_rele(vdev_t *vd)
{

	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

}
119 
/*
 * Workqueue handler: flush the disk's write cache on behalf of a
 * DKIOCFLUSHWRITECACHE zio.
 *
 * The work item is the buf itself (queued by vdev_disk_io_start());
 * the originating zio is carried in bp->b_private and completed via
 * vdev_disk_io_intr().  DIOCCACHESYNC can sleep, which is why the
 * flush runs on a dedicated workqueue instead of the zio pipeline.
 */
static void
vdev_disk_flush(struct work *work, void *cookie)
{
	vdev_disk_t *dvd;
	int error, cmd;
	buf_t *bp;
	vnode_t *vp;

	bp = (struct buf *)work;
	vp = bp->b_vp;
	dvd = cookie;

	KASSERT(vp == dvd->vd_vp);

	/* Nonzero argument requests a forced sync -- see dkio(4). */
	cmd = 1;
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	error = VOP_IOCTL(vp, DIOCCACHESYNC, &cmd, FREAD|FWRITE, kcred);
	VOP_UNLOCK(vp, 0);
	/* Stash the ioctl result where geterror() will find it. */
	bp->b_error = error;
	vdev_disk_io_intr(bp);
}
141 
142 static int
143 vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
144     uint64_t *ashift, uint64_t *pashift)
145 {
146 	spa_t *spa = vd->vdev_spa;
147 	vdev_disk_t *dvd;
148 	vnode_t *vp;
149 	int error, cmd;
150 	uint64_t numsecs;
151 	unsigned secsize;
152 	struct disk *pdk;
153 	struct dkwedge_info dkw;
154 	struct disk_sectoralign dsa;
155 
156 	/*
157 	 * We must have a pathname, and it must be absolute.
158 	 */
159 	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
160 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
161 		return (SET_ERROR(EINVAL));
162 	}
163 
164 	/*
165 	 * Reopen the device if it's not currently open. Otherwise,
166 	 * just update the physical size of the device.
167 	 */
168 	if (vd->vdev_tsd != NULL) {
169 		ASSERT(vd->vdev_reopening);
170 		dvd = vd->vdev_tsd;
171 		vp = dvd->vd_vp;
172 		KASSERT(vp != NULL);
173 		goto skip_open;
174 	}
175 
176 	/*
177 	 * Create vd->vdev_tsd.
178 	 */
179 	vdev_disk_alloc(vd);
180 	dvd = vd->vdev_tsd;
181 
182 	/*
183 	 * When opening a disk device, we want to preserve the user's original
184 	 * intent.  We always want to open the device by the path the user gave
185 	 * us, even if it is one of multiple paths to the save device.  But we
186 	 * also want to be able to survive disks being removed/recabled.
187 	 * Therefore the sequence of opening devices is:
188 	 *
189 	 * 1. Try opening the device by path.  For legacy pools without the
190 	 *    'whole_disk' property, attempt to fix the path by appending 's0'.
191 	 *
192 	 * 2. If the devid of the device matches the stored value, return
193 	 *    success.
194 	 *
195 	 * 3. Otherwise, the device may have moved.  Try opening the device
196 	 *    by the devid instead.
197 	 */
198 	if (vd->vdev_devid != NULL) {
199 		/* XXXNETBSD wedges */
200 #ifdef illumos
201 		if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid,
202 		    &dvd->vd_minor) != 0) {
203 			vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
204 			return (SET_ERROR(EINVAL));
205 		}
206 #endif
207 	}
208 
209 	error = EINVAL;		/* presume failure */
210 
211 	error = vn_open(vd->vdev_path, UIO_SYSSPACE, FREAD|FWRITE, 0,
212 	    &vp, CRCREAT, 0);
213 	if (error != 0) {
214 		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
215 		return (SET_ERROR(error));
216 	}
217 	if (vp->v_type != VBLK) {
218 #ifdef __NetBSD__
219 		vn_close(vp, FREAD|FWRITE, kcred);
220 #else
221 		vrele(vp);
222 #endif
223 		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
224 		return (SET_ERROR(EINVAL));
225 	}
226 
227 	pdk = NULL;
228 	if (getdiskinfo(vp, &dkw) == 0)
229 		pdk = disk_find(dkw.dkw_devname);
230 
231 	/* XXXNETBSD Once tls-maxphys gets merged this block becomes:
232 		dvd->vd_maxphys = (pdk ? disk_maxphys(pdk) : MACHINE_MAXPHYS);
233 	*/
234 	{
235 		struct buf buf = {
236 			.b_dev = vp->v_rdev,
237 			.b_bcount = MAXPHYS,
238 		};
239 		if (pdk && pdk->dk_driver && pdk->dk_driver->d_minphys)
240 			(*pdk->dk_driver->d_minphys)(&buf);
241 		dvd->vd_maxphys = buf.b_bcount;
242 	}
243 
244 	/*
245 	 * XXXNETBSD Compare the devid to the stored value.
246 	 */
247 
248 	/*
249 	 * Create a workqueue to process cache-flushes concurrently.
250 	 */
251 	error = workqueue_create(&dvd->vd_wq, "vdevsync",
252 	    vdev_disk_flush, dvd, PRI_NONE, IPL_NONE, WQ_MPSAFE);
253 	if (error != 0) {
254 #ifdef __NetBSD__
255 		vn_close(vp, FREAD|FWRITE, kcred);
256 #else
257 		vrele(vp);
258 #endif
259 		return (SET_ERROR(error));
260 	}
261 
262 	dvd->vd_vp = vp;
263 
264 skip_open:
265 	error = getdisksize(vp, &numsecs, &secsize);
266 	if (error != 0) {
267 		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
268 		return (SET_ERROR(error));
269 	}
270 
271 	*psize = numsecs * secsize;
272 	*max_psize = *psize;
273 
274 	*ashift = highbit(MAX(secsize, SPA_MINBLOCKSIZE)) - 1;
275 
276 	/*
277 	 * Try to determine whether the disk has a preferred physical
278 	 * sector size even if it can emulate a smaller logical sector
279 	 * size with r/m/w cycles, e.g. a disk with 4096-byte sectors
280 	 * that for compatibility claims to support 512-byte ones.
281 	 */
282 	if (VOP_IOCTL(vp, DIOCGSECTORALIGN, &dsa, FREAD, NOCRED) == 0) {
283 		*pashift = highbit(dsa.dsa_alignment * secsize) - 1;
284 		if (dsa.dsa_firstaligned % dsa.dsa_alignment)
285 			printf("ZFS WARNING: vdev %s: sectors are misaligned"
286 			    " (alignment=%"PRIu32", firstaligned=%"PRIu32")\n",
287 			    vd->vdev_path,
288 			    dsa.dsa_alignment, dsa.dsa_firstaligned);
289 	} else {
290 		*pashift = *ashift;
291 	}
292 
293 	vd->vdev_wholedisk = 0;
294 	if (getdiskinfo(vp, &dkw) != 0 &&
295 	    dkw.dkw_offset == 0 && dkw.dkw_size == numsecs)
296 		vd->vdev_wholedisk = 1,
297 
298 	/*
299 	 * Clear the nowritecache bit, so that on a vdev_reopen() we will
300 	 * try again.
301 	 */
302 	vd->vdev_nowritecache = B_FALSE;
303 
304 	return (0);
305 }
306 
/*
 * Close the vdev's backing device and release the per-vdev state.
 * A reopen in progress (vd->vdev_reopening) keeps the device handle
 * alive, so in that case this is a no-op.
 */
static void
vdev_disk_close(vdev_t *vd)
{
	vdev_disk_t *dvd = vd->vdev_tsd;

	if (vd->vdev_reopening || dvd == NULL)
		return;

#ifdef illumos
	if (dvd->vd_minor != NULL) {
		ddi_devid_str_free(dvd->vd_minor);
		dvd->vd_minor = NULL;
	}

	if (dvd->vd_devid != NULL) {
		ddi_devid_free(dvd->vd_devid);
		dvd->vd_devid = NULL;
	}

	if (dvd->vd_lh != NULL) {
		(void) ldi_close(dvd->vd_lh, spa_mode(vd->vdev_spa), kcred);
		dvd->vd_lh = NULL;
	}
#endif

#ifdef __NetBSD__
	/* Close the device vnode and tear down the flush workqueue. */
	if (dvd->vd_vp != NULL) {
		vn_close(dvd->vd_vp, FREAD|FWRITE, kcred);
		dvd->vd_vp = NULL;
	}
	if (dvd->vd_wq != NULL) {
		workqueue_destroy(dvd->vd_wq);
		dvd->vd_wq = NULL;
	}
#endif

	vd->vdev_delayed_close = B_FALSE;
#ifdef illumos
	/*
	 * If we closed the LDI handle due to an offline notify from LDI,
	 * don't free vd->vdev_tsd or unregister the callbacks here;
	 * the offline finalize callback or a reopen will take care of it.
	 */
	if (dvd->vd_ldi_offline)
		return;
#endif

	vdev_disk_free(vd);
}
356 
/*
 * Synchronous physical I/O against a disk vdev, bypassing the zio
 * pipeline (used on illumos for crash dumps and similar contexts).
 * The NetBSD port does not implement this and always returns EIO.
 */
int
vdev_disk_physio(vdev_t *vd, caddr_t data,
    size_t size, uint64_t offset, int flags, boolean_t isdump)
{
#ifdef illumos
	vdev_disk_t *dvd = vd->vdev_tsd;

	/*
	 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
	 * Nothing to be done here but return failure.
	 */
	if (dvd == NULL || (dvd->vd_ldi_offline && dvd->vd_lh == NULL))
		return (EIO);

	ASSERT(vd->vdev_ops == &vdev_disk_ops);

	/*
	 * If in the context of an active crash dump, use the ldi_dump(9F)
	 * call instead of ldi_strategy(9F) as usual.
	 */
	if (isdump) {
		ASSERT3P(dvd, !=, NULL);
		return (ldi_dump(dvd->vd_lh, data, lbtodb(offset),
		    lbtodb(size)));
	}

	return (vdev_disk_ldi_physio(dvd->vd_lh, data, size, offset, flags));
#endif
#ifdef __NetBSD__
	return (EIO);
#endif
}
389 
390 static void
391 vdev_disk_io_intr(buf_t *bp)
392 {
393 	zio_t *zio = bp->b_private;
394 
395 	/*
396 	 * The rest of the zio stack only deals with EIO, ECKSUM, and ENXIO.
397 	 * Rather than teach the rest of the stack about other error
398 	 * possibilities (EFAULT, etc), we normalize the error value here.
399 	 */
400 	zio->io_error = (geterror(bp) != 0 ? SET_ERROR(EIO) : 0);
401 
402 	if (zio->io_error == 0 && bp->b_resid != 0)
403 		zio->io_error = SET_ERROR(EIO);
404 
405 	putiobuf(bp);
406 	zio_delay_interrupt(zio);
407 }
408 
/*
 * zio vsd free hook: release the dk_callback allocated for an ioctl zio.
 */
static void
vdev_disk_ioctl_free(zio_t *zio)
{
	kmem_free(zio->io_vsd, sizeof (struct dk_callback));
}
414 
/*
 * vsd ops for ioctl zios: free the dk_callback on completion and use the
 * default checksum-report handler.
 */
static const zio_vsd_ops_t vdev_disk_vsd_ops = {
	vdev_disk_ioctl_free,
	zio_vsd_default_cksum_report
};
419 
/*
 * Completion callback for asynchronous device ioctls: record the device's
 * error in the zio and resume the zio pipeline.
 */
static void
vdev_disk_ioctl_done(void *zio_arg, int error)
{
	zio_t *zio = zio_arg;

	zio->io_error = error;

	zio_interrupt(zio);
}
429 
/*
 * Initiate a zio against the backing device.
 *
 * Ioctl zios: only DKIOCFLUSHWRITECACHE is supported; the flush is
 * queued on the "vdevsync" workqueue (vdev_disk_flush) because the
 * cache-sync ioctl may sleep.  Read/write zios: translated into buf(9)
 * transfers issued with VOP_STRATEGY(); requests larger than the
 * device's maximum transfer size (dvd->vd_maxphys) are split into
 * nested bufs.  Completion is reported asynchronously through
 * vdev_disk_io_intr().
 */
static void
vdev_disk_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_disk_t *dvd = vd->vdev_tsd;
	vnode_t *vp;
	buf_t *bp, *nbp;
	int error, size, off, resid;

	/*
	 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
	 * Nothing to be done here but return failure.
	 */
#ifdef illumos
	if (dvd == NULL || (dvd->vd_ldi_offline && dvd->vd_lh == NULL)) {
		zio->io_error = SET_ERROR(ENXIO);
		zio_interrupt(zio);
		return;
	}
#endif
#ifdef __NetBSD__
	if (dvd == NULL) {
		zio->io_error = SET_ERROR(ENXIO);
		zio_interrupt(zio);
		return;
	}
	ASSERT3U(dvd->vd_maxphys, >, 0);
	vp = dvd->vd_vp;
#endif

	if (zio->io_type == ZIO_TYPE_IOCTL) {
		/* XXPOLICY */
		if (!vdev_readable(vd)) {
			zio->io_error = SET_ERROR(ENXIO);
			zio_interrupt(zio);
			return;
		}

		switch (zio->io_cmd) {
		case DKIOCFLUSHWRITECACHE:

			if (zfs_nocacheflush)
				break;

			if (vd->vdev_nowritecache) {
				zio->io_error = ENOTSUP;
				break;
			}

			/*
			 * Hand the flush to the workqueue; the buf doubles
			 * as the work item and carries the zio in b_private.
			 * Completion happens in vdev_disk_flush().
			 */
			bp = getiobuf(vp, true);
			bp->b_private = zio;
			workqueue_enqueue(dvd->vd_wq, &bp->b_work, NULL);
			return;

		default:
			zio->io_error = SET_ERROR(ENOTSUP);
			break;
		}

		zio_execute(zio);
		return;
	}

	/* Set up a buf describing the whole transfer. */
	bp = getiobuf(vp, true);
	bp->b_flags = (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE);
	bp->b_cflags = BC_BUSY | BC_NOCACHE;
	bp->b_data = zio->io_data;
	bp->b_blkno = btodb(zio->io_offset);
	bp->b_bcount = zio->io_size;
	bp->b_resid = zio->io_size;
	bp->b_iodone = vdev_disk_io_intr;
	bp->b_private = zio;

	if (!(bp->b_flags & B_READ)) {
		/* Account the pending write against the vnode. */
		mutex_enter(vp->v_interlock);
		vp->v_numoutput++;
		mutex_exit(vp->v_interlock);
	}

	if (bp->b_bcount <= dvd->vd_maxphys) {
		/* We can do this I/O in one pass. */
		(void)VOP_STRATEGY(vp, bp);
	} else {
		/*
		 * The I/O is larger than we can process in one pass.
		 * Split it into smaller pieces.
		 */
		resid = zio->io_size;
		off = 0;
		while (resid != 0) {
			size = uimin(resid, dvd->vd_maxphys);
			nbp = getiobuf(vp, true);
			nbp->b_blkno = btodb(zio->io_offset + off);
			/* Below call increments v_numoutput. */
			nestiobuf_setup(bp, nbp, off, size);
			(void)VOP_STRATEGY(vp, nbp);
			resid -= size;
			off += size;
		}
	}
}
531 
/*
 * Post-completion hook for disk zios.  On illumos an EIO triggers a
 * DKIOCSTATE probe to detect device removal and, if the device is gone,
 * requests an asynchronous vdev removal.  On NetBSD this is a no-op.
 */
static void
vdev_disk_io_done(zio_t *zio)
{
#ifdef illumos
	vdev_t *vd = zio->io_vd;

	/*
	 * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if
	 * the device has been removed.  If this is the case, then we trigger an
	 * asynchronous removal of the device. Otherwise, probe the device and
	 * make sure it's still accessible.
	 */
	if (zio->io_error == EIO && !vd->vdev_remove_wanted) {
		vdev_disk_t *dvd = vd->vdev_tsd;
		int state = DKIO_NONE;

		if (ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state,
		    FKIOCTL, kcred, NULL) == 0 && state != DKIO_INSERTED) {
			/*
			 * We post the resource as soon as possible, instead of
			 * when the async removal actually happens, because the
			 * DE is using this information to discard previous I/O
			 * errors.
			 */
			zfs_post_remove(zio->io_spa, vd);
			vd->vdev_remove_wanted = B_TRUE;
			spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
		} else if (!vd->vdev_delayed_close) {
			vd->vdev_delayed_close = B_TRUE;
		}
	}
#endif
}
565 
/*
 * Virtual device operations vector for physical disk vdevs.
 */
vdev_ops_t vdev_disk_ops = {
	vdev_disk_open,
	vdev_disk_close,
	vdev_default_asize,
	vdev_disk_io_start,
	vdev_disk_io_done,
	NULL,			/* no state-change callback */
	vdev_disk_hold,
	vdev_disk_rele,
	VDEV_TYPE_DISK,		/* name of this vdev type */
	B_TRUE			/* leaf vdev */
};
578 
579 /*
580  * Given the root disk device devid or pathname, read the label from
581  * the device, and construct a configuration nvlist.
582  */
583 int
584 vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config)
585 {
586 #ifdef __NetBSD__
587 	return (ENOTSUP);
588 #else
589 	ldi_handle_t vd_lh;
590 	vdev_label_t *label;
591 	uint64_t s, size;
592 	int l;
593 	ddi_devid_t tmpdevid;
594 	int error = -1;
595 	char *minor_name;
596 
597 	/*
598 	 * Read the device label and build the nvlist.
599 	 */
600 	if (devid != NULL && ddi_devid_str_decode(devid, &tmpdevid,
601 	    &minor_name) == 0) {
602 		error = ldi_open_by_devid(tmpdevid, minor_name,
603 		    FREAD, kcred, &vd_lh, zfs_li);
604 		ddi_devid_free(tmpdevid);
605 		ddi_devid_str_free(minor_name);
606 	}
607 
608 	if (error && (error = ldi_open_by_name(devpath, FREAD, kcred, &vd_lh,
609 	    zfs_li)))
610 		return (error);
611 
612 	if (ldi_get_size(vd_lh, &s)) {
613 		(void) ldi_close(vd_lh, FREAD, kcred);
614 		return (SET_ERROR(EIO));
615 	}
616 
617 	size = P2ALIGN_TYPED(s, sizeof (vdev_label_t), uint64_t);
618 	label = kmem_alloc(sizeof (vdev_label_t), KM_SLEEP);
619 
620 	*config = NULL;
621 	for (l = 0; l < VDEV_LABELS; l++) {
622 		uint64_t offset, state, txg = 0;
623 
624 		/* read vdev label */
625 		offset = vdev_label_offset(size, l, 0);
626 		if (vdev_disk_ldi_physio(vd_lh, (caddr_t)label,
627 		    VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, B_READ) != 0)
628 			continue;
629 
630 		if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist,
631 		    sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0) {
632 			*config = NULL;
633 			continue;
634 		}
635 
636 		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
637 		    &state) != 0 || state >= POOL_STATE_DESTROYED) {
638 			nvlist_free(*config);
639 			*config = NULL;
640 			continue;
641 		}
642 
643 		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
644 		    &txg) != 0 || txg == 0) {
645 			nvlist_free(*config);
646 			*config = NULL;
647 			continue;
648 		}
649 
650 		break;
651 	}
652 
653 	kmem_free(label, sizeof (vdev_label_t));
654 	(void) ldi_close(vd_lh, FREAD, kcred);
655 	if (*config == NULL)
656 		error = SET_ERROR(EIDRM);
657 
658 	return (error);
659 #endif
660 }
661