/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
 * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2013 Joyent, Inc. All rights reserved.
 */
27
#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/refcount.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_impl.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <sys/sunldi.h>
#include <sys/fm/fs/zfs.h>
#include <sys/disk.h>
#include <sys/dkio.h>
#include <sys/workqueue.h>
40
#ifdef __NetBSD__
/*
 * Return the error status of a completed buffer.  NetBSD has no
 * geterror() like illumos, so provide a trivial local equivalent.
 */
static int
geterror(struct buf *bp)
{

	return (bp->b_error);
}
#endif
49
50 /*
51 * Virtual device vector for disks.
52 */
53
54 static void vdev_disk_io_intr(buf_t *);
55
56 static void
vdev_disk_alloc(vdev_t * vd)57 vdev_disk_alloc(vdev_t *vd)
58 {
59 vdev_disk_t *dvd;
60
61 dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
62
63 #ifdef illumos
64 /*
65 * Create the LDI event callback list.
66 */
67 list_create(&dvd->vd_ldi_cbs, sizeof (vdev_disk_ldi_cb_t),
68 offsetof(vdev_disk_ldi_cb_t, lcb_next));
69 #endif
70 }
71
72
73 static void
vdev_disk_free(vdev_t * vd)74 vdev_disk_free(vdev_t *vd)
75 {
76 vdev_disk_t *dvd = vd->vdev_tsd;
77 #ifdef illumos
78 vdev_disk_ldi_cb_t *lcb;
79 #endif
80
81 if (dvd == NULL)
82 return;
83
84 #ifdef illumos
85 /*
86 * We have already closed the LDI handle. Clean up the LDI event
87 * callbacks and free vd->vdev_tsd.
88 */
89 while ((lcb = list_head(&dvd->vd_ldi_cbs)) != NULL) {
90 list_remove(&dvd->vd_ldi_cbs, lcb);
91 (void) ldi_ev_remove_callbacks(lcb->lcb_id);
92 kmem_free(lcb, sizeof (vdev_disk_ldi_cb_t));
93 }
94 list_destroy(&dvd->vd_ldi_cbs);
95 #endif
96 kmem_free(dvd, sizeof (vdev_disk_t));
97 vd->vdev_tsd = NULL;
98 }
99
100
101 /*
102 * It's not clear what these hold/rele functions are supposed to do.
103 */
104 static void
vdev_disk_hold(vdev_t * vd)105 vdev_disk_hold(vdev_t *vd)
106 {
107
108 ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
109
110 }
111
112 static void
vdev_disk_rele(vdev_t * vd)113 vdev_disk_rele(vdev_t *vd)
114 {
115
116 ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
117
118 }
119
120 static void
vdev_disk_flush(struct work * work,void * cookie)121 vdev_disk_flush(struct work *work, void *cookie)
122 {
123 vdev_disk_t *dvd;
124 int error, cmd;
125 buf_t *bp;
126 vnode_t *vp;
127
128 bp = (struct buf *)work;
129 vp = bp->b_vp;
130 dvd = cookie;
131
132 KASSERT(vp == dvd->vd_vp);
133
134 cmd = 1;
135 error = VOP_IOCTL(vp, DIOCCACHESYNC, &cmd, FREAD|FWRITE, kcred);
136 bp->b_error = error;
137 vdev_disk_io_intr(bp);
138 }
139
140 static int
vdev_disk_open(vdev_t * vd,uint64_t * psize,uint64_t * max_psize,uint64_t * ashift,uint64_t * pashift)141 vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
142 uint64_t *ashift, uint64_t *pashift)
143 {
144 spa_t *spa = vd->vdev_spa;
145 vdev_disk_t *dvd;
146 vnode_t *vp;
147 int error, cmd;
148 uint64_t numsecs;
149 unsigned secsize;
150 struct disk *pdk;
151 struct dkwedge_info dkw;
152 struct disk_sectoralign dsa;
153
154 /*
155 * We must have a pathname, and it must be absolute.
156 */
157 if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
158 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
159 return (SET_ERROR(EINVAL));
160 }
161
162 /*
163 * Reopen the device if it's not currently open. Otherwise,
164 * just update the physical size of the device.
165 */
166 if (vd->vdev_tsd != NULL) {
167 ASSERT(vd->vdev_reopening);
168 dvd = vd->vdev_tsd;
169 vp = dvd->vd_vp;
170 KASSERT(vp != NULL);
171 goto skip_open;
172 }
173
174 /*
175 * Create vd->vdev_tsd.
176 */
177 vdev_disk_alloc(vd);
178 dvd = vd->vdev_tsd;
179
180 /*
181 * When opening a disk device, we want to preserve the user's original
182 * intent. We always want to open the device by the path the user gave
183 * us, even if it is one of multiple paths to the save device. But we
184 * also want to be able to survive disks being removed/recabled.
185 * Therefore the sequence of opening devices is:
186 *
187 * 1. Try opening the device by path. For legacy pools without the
188 * 'whole_disk' property, attempt to fix the path by appending 's0'.
189 *
190 * 2. If the devid of the device matches the stored value, return
191 * success.
192 *
193 * 3. Otherwise, the device may have moved. Try opening the device
194 * by the devid instead.
195 */
196 if (vd->vdev_devid != NULL) {
197 /* XXXNETBSD wedges */
198 #ifdef illumos
199 if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid,
200 &dvd->vd_minor) != 0) {
201 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
202 return (SET_ERROR(EINVAL));
203 }
204 #endif
205 }
206
207 error = EINVAL; /* presume failure */
208
209 error = vn_open(vd->vdev_path, UIO_SYSSPACE, FREAD|FWRITE, 0,
210 &vp, CRCREAT, 0);
211 if (error != 0) {
212 vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
213 return (SET_ERROR(error));
214 }
215 if (vp->v_type != VBLK) {
216 #ifdef __NetBSD__
217 vn_close(vp, FREAD|FWRITE, kcred);
218 #else
219 vrele(vp);
220 #endif
221 vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
222 return (SET_ERROR(EINVAL));
223 }
224
225 pdk = NULL;
226 if (getdiskinfo(vp, &dkw) == 0)
227 pdk = disk_find(dkw.dkw_devname);
228
229 /* XXXNETBSD Once tls-maxphys gets merged this block becomes:
230 dvd->vd_maxphys = (pdk ? disk_maxphys(pdk) : MACHINE_MAXPHYS);
231 */
232 {
233 struct buf buf = {
234 .b_dev = vp->v_rdev,
235 .b_bcount = MAXPHYS,
236 };
237 if (pdk && pdk->dk_driver && pdk->dk_driver->d_minphys)
238 (*pdk->dk_driver->d_minphys)(&buf);
239 dvd->vd_maxphys = buf.b_bcount;
240 }
241
242 /*
243 * XXXNETBSD Compare the devid to the stored value.
244 */
245
246 /*
247 * Create a workqueue to process cache-flushes concurrently.
248 */
249 error = workqueue_create(&dvd->vd_wq, "vdevsync",
250 vdev_disk_flush, dvd, PRI_NONE, IPL_NONE, WQ_MPSAFE);
251 if (error != 0) {
252 #ifdef __NetBSD__
253 vn_close(vp, FREAD|FWRITE, kcred);
254 #else
255 vrele(vp);
256 #endif
257 return (SET_ERROR(error));
258 }
259
260 dvd->vd_vp = vp;
261
262 skip_open:
263 error = getdisksize(vp, &numsecs, &secsize);
264 if (error != 0) {
265 vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
266 return (SET_ERROR(error));
267 }
268
269 *psize = numsecs * secsize;
270 *max_psize = *psize;
271
272 *ashift = highbit(MAX(secsize, SPA_MINBLOCKSIZE)) - 1;
273
274 /*
275 * Try to determine whether the disk has a preferred physical
276 * sector size even if it can emulate a smaller logical sector
277 * size with r/m/w cycles, e.g. a disk with 4096-byte sectors
278 * that for compatibility claims to support 512-byte ones.
279 */
280 if (VOP_IOCTL(vp, DIOCGSECTORALIGN, &dsa, FREAD, NOCRED) == 0) {
281 *pashift = highbit(dsa.dsa_alignment * secsize) - 1;
282 if (dsa.dsa_firstaligned % dsa.dsa_alignment)
283 printf("ZFS WARNING: vdev %s: sectors are misaligned"
284 " (alignment=%"PRIu32", firstaligned=%"PRIu32")\n",
285 vd->vdev_path,
286 dsa.dsa_alignment, dsa.dsa_firstaligned);
287 } else {
288 *pashift = *ashift;
289 }
290
291 vd->vdev_wholedisk = 0;
292 if (getdiskinfo(vp, &dkw) != 0 &&
293 dkw.dkw_offset == 0 && dkw.dkw_size == numsecs)
294 vd->vdev_wholedisk = 1,
295
296 /*
297 * Clear the nowritecache bit, so that on a vdev_reopen() we will
298 * try again.
299 */
300 vd->vdev_nowritecache = B_FALSE;
301
302 return (0);
303 }
304
305 static void
vdev_disk_close(vdev_t * vd)306 vdev_disk_close(vdev_t *vd)
307 {
308 vdev_disk_t *dvd = vd->vdev_tsd;
309
310 if (vd->vdev_reopening || dvd == NULL)
311 return;
312
313 #ifdef illumos
314 if (dvd->vd_minor != NULL) {
315 ddi_devid_str_free(dvd->vd_minor);
316 dvd->vd_minor = NULL;
317 }
318
319 if (dvd->vd_devid != NULL) {
320 ddi_devid_free(dvd->vd_devid);
321 dvd->vd_devid = NULL;
322 }
323
324 if (dvd->vd_lh != NULL) {
325 (void) ldi_close(dvd->vd_lh, spa_mode(vd->vdev_spa), kcred);
326 dvd->vd_lh = NULL;
327 }
328 #endif
329
330 #ifdef __NetBSD__
331 if (dvd->vd_vp != NULL) {
332 vn_close(dvd->vd_vp, FREAD|FWRITE, kcred);
333 dvd->vd_vp = NULL;
334 }
335 if (dvd->vd_wq != NULL) {
336 workqueue_destroy(dvd->vd_wq);
337 dvd->vd_wq = NULL;
338 }
339 #endif
340
341 vd->vdev_delayed_close = B_FALSE;
342 #ifdef illumos
343 /*
344 * If we closed the LDI handle due to an offline notify from LDI,
345 * don't free vd->vdev_tsd or unregister the callbacks here;
346 * the offline finalize callback or a reopen will take care of it.
347 */
348 if (dvd->vd_ldi_offline)
349 return;
350 #endif
351
352 vdev_disk_free(vd);
353 }
354
355 int
vdev_disk_physio(vdev_t * vd,caddr_t data,size_t size,uint64_t offset,int flags,boolean_t isdump)356 vdev_disk_physio(vdev_t *vd, caddr_t data,
357 size_t size, uint64_t offset, int flags, boolean_t isdump)
358 {
359 #ifdef illumos
360 vdev_disk_t *dvd = vd->vdev_tsd;
361
362 /*
363 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
364 * Nothing to be done here but return failure.
365 */
366 if (dvd == NULL || (dvd->vd_ldi_offline && dvd->vd_lh == NULL))
367 return (EIO);
368
369 ASSERT(vd->vdev_ops == &vdev_disk_ops);
370
371 /*
372 * If in the context of an active crash dump, use the ldi_dump(9F)
373 * call instead of ldi_strategy(9F) as usual.
374 */
375 if (isdump) {
376 ASSERT3P(dvd, !=, NULL);
377 return (ldi_dump(dvd->vd_lh, data, lbtodb(offset),
378 lbtodb(size)));
379 }
380
381 return (vdev_disk_ldi_physio(dvd->vd_lh, data, size, offset, flags));
382 #endif
383 #ifdef __NetBSD__
384 return (EIO);
385 #endif
386 }
387
388 static void
vdev_disk_io_intr(buf_t * bp)389 vdev_disk_io_intr(buf_t *bp)
390 {
391 zio_t *zio = bp->b_private;
392
393 /*
394 * The rest of the zio stack only deals with EIO, ECKSUM, and ENXIO.
395 * Rather than teach the rest of the stack about other error
396 * possibilities (EFAULT, etc), we normalize the error value here.
397 */
398 zio->io_error = (geterror(bp) != 0 ? SET_ERROR(EIO) : 0);
399
400 if (zio->io_error == 0 && bp->b_resid != 0)
401 zio->io_error = SET_ERROR(EIO);
402
403 putiobuf(bp);
404 zio_delay_interrupt(zio);
405 }
406
407 static void
vdev_disk_ioctl_free(zio_t * zio)408 vdev_disk_ioctl_free(zio_t *zio)
409 {
410 kmem_free(zio->io_vsd, sizeof (struct dk_callback));
411 }
412
413 static const zio_vsd_ops_t vdev_disk_vsd_ops = {
414 vdev_disk_ioctl_free,
415 zio_vsd_default_cksum_report
416 };
417
418 static void
vdev_disk_ioctl_done(void * zio_arg,int error)419 vdev_disk_ioctl_done(void *zio_arg, int error)
420 {
421 zio_t *zio = zio_arg;
422
423 zio->io_error = error;
424
425 zio_interrupt(zio);
426 }
427
428 static void
vdev_disk_io_start(zio_t * zio)429 vdev_disk_io_start(zio_t *zio)
430 {
431 vdev_t *vd = zio->io_vd;
432 vdev_disk_t *dvd = vd->vdev_tsd;
433 vnode_t *vp;
434 buf_t *bp, *nbp;
435 int error, size, off, resid;
436
437 /*
438 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
439 * Nothing to be done here but return failure.
440 */
441 #ifdef illumos
442 if (dvd == NULL || (dvd->vd_ldi_offline && dvd->vd_lh == NULL)) {
443 zio->io_error = SET_ERROR(ENXIO);
444 zio_interrupt(zio);
445 return;
446 }
447 #endif
448 #ifdef __NetBSD__
449 if (dvd == NULL) {
450 zio->io_error = SET_ERROR(ENXIO);
451 zio_interrupt(zio);
452 return;
453 }
454 ASSERT3U(dvd->vd_maxphys, >, 0);
455 vp = dvd->vd_vp;
456 #endif
457
458 if (zio->io_type == ZIO_TYPE_IOCTL) {
459 /* XXPOLICY */
460 if (!vdev_readable(vd)) {
461 zio->io_error = SET_ERROR(ENXIO);
462 zio_interrupt(zio);
463 return;
464 }
465
466 switch (zio->io_cmd) {
467 case DKIOCFLUSHWRITECACHE:
468
469 if (zfs_nocacheflush)
470 break;
471
472 if (vd->vdev_nowritecache) {
473 zio->io_error = ENOTSUP;
474 break;
475 }
476
477 bp = getiobuf(vp, true);
478 bp->b_private = zio;
479 workqueue_enqueue(dvd->vd_wq, &bp->b_work, NULL);
480 return;
481
482 default:
483 zio->io_error = SET_ERROR(ENOTSUP);
484 break;
485 }
486
487 zio_execute(zio);
488 return;
489 }
490
491 bp = getiobuf(vp, true);
492 bp->b_flags = (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE);
493 bp->b_cflags = BC_BUSY | BC_NOCACHE;
494 bp->b_data = zio->io_data;
495 bp->b_blkno = btodb(zio->io_offset);
496 bp->b_bcount = zio->io_size;
497 bp->b_resid = zio->io_size;
498 bp->b_iodone = vdev_disk_io_intr;
499 bp->b_private = zio;
500
501 if (!(bp->b_flags & B_READ)) {
502 mutex_enter(vp->v_interlock);
503 vp->v_numoutput++;
504 mutex_exit(vp->v_interlock);
505 }
506
507 if (bp->b_bcount <= dvd->vd_maxphys) {
508 /* We can do this I/O in one pass. */
509 (void)VOP_STRATEGY(vp, bp);
510 } else {
511 /*
512 * The I/O is larger than we can process in one pass.
513 * Split it into smaller pieces.
514 */
515 resid = zio->io_size;
516 off = 0;
517 while (resid != 0) {
518 size = uimin(resid, dvd->vd_maxphys);
519 nbp = getiobuf(vp, true);
520 nbp->b_blkno = btodb(zio->io_offset + off);
521 /* Below call increments v_numoutput. */
522 nestiobuf_setup(bp, nbp, off, size);
523 (void)VOP_STRATEGY(vp, nbp);
524 resid -= size;
525 off += size;
526 }
527 }
528 }
529
530 static void
vdev_disk_io_done(zio_t * zio)531 vdev_disk_io_done(zio_t *zio)
532 {
533 #ifdef illumos
534 vdev_t *vd = zio->io_vd;
535
536 /*
537 * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if
538 * the device has been removed. If this is the case, then we trigger an
539 * asynchronous removal of the device. Otherwise, probe the device and
540 * make sure it's still accessible.
541 */
542 if (zio->io_error == EIO && !vd->vdev_remove_wanted) {
543 vdev_disk_t *dvd = vd->vdev_tsd;
544 int state = DKIO_NONE;
545
546 if (ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state,
547 FKIOCTL, kcred, NULL) == 0 && state != DKIO_INSERTED) {
548 /*
549 * We post the resource as soon as possible, instead of
550 * when the async removal actually happens, because the
551 * DE is using this information to discard previous I/O
552 * errors.
553 */
554 zfs_post_remove(zio->io_spa, vd);
555 vd->vdev_remove_wanted = B_TRUE;
556 spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
557 } else if (!vd->vdev_delayed_close) {
558 vd->vdev_delayed_close = B_TRUE;
559 }
560 }
561 #endif
562 }
563
564 vdev_ops_t vdev_disk_ops = {
565 vdev_disk_open,
566 vdev_disk_close,
567 vdev_default_asize,
568 vdev_disk_io_start,
569 vdev_disk_io_done,
570 NULL,
571 vdev_disk_hold,
572 vdev_disk_rele,
573 VDEV_TYPE_DISK, /* name of this vdev type */
574 B_TRUE /* leaf vdev */
575 };
576
577 /*
578 * Given the root disk device devid or pathname, read the label from
579 * the device, and construct a configuration nvlist.
580 */
581 int
vdev_disk_read_rootlabel(char * devpath,char * devid,nvlist_t ** config)582 vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config)
583 {
584 #ifdef __NetBSD__
585 return (ENOTSUP);
586 #else
587 ldi_handle_t vd_lh;
588 vdev_label_t *label;
589 uint64_t s, size;
590 int l;
591 ddi_devid_t tmpdevid;
592 int error = -1;
593 char *minor_name;
594
595 /*
596 * Read the device label and build the nvlist.
597 */
598 if (devid != NULL && ddi_devid_str_decode(devid, &tmpdevid,
599 &minor_name) == 0) {
600 error = ldi_open_by_devid(tmpdevid, minor_name,
601 FREAD, kcred, &vd_lh, zfs_li);
602 ddi_devid_free(tmpdevid);
603 ddi_devid_str_free(minor_name);
604 }
605
606 if (error && (error = ldi_open_by_name(devpath, FREAD, kcred, &vd_lh,
607 zfs_li)))
608 return (error);
609
610 if (ldi_get_size(vd_lh, &s)) {
611 (void) ldi_close(vd_lh, FREAD, kcred);
612 return (SET_ERROR(EIO));
613 }
614
615 size = P2ALIGN_TYPED(s, sizeof (vdev_label_t), uint64_t);
616 label = kmem_alloc(sizeof (vdev_label_t), KM_SLEEP);
617
618 *config = NULL;
619 for (l = 0; l < VDEV_LABELS; l++) {
620 uint64_t offset, state, txg = 0;
621
622 /* read vdev label */
623 offset = vdev_label_offset(size, l, 0);
624 if (vdev_disk_ldi_physio(vd_lh, (caddr_t)label,
625 VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, B_READ) != 0)
626 continue;
627
628 if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist,
629 sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0) {
630 *config = NULL;
631 continue;
632 }
633
634 if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
635 &state) != 0 || state >= POOL_STATE_DESTROYED) {
636 nvlist_free(*config);
637 *config = NULL;
638 continue;
639 }
640
641 if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
642 &txg) != 0 || txg == 0) {
643 nvlist_free(*config);
644 *config = NULL;
645 continue;
646 }
647
648 break;
649 }
650
651 kmem_free(label, sizeof (vdev_label_t));
652 (void) ldi_close(vd_lh, FREAD, kcred);
653 if (*config == NULL)
654 error = SET_ERROR(EIDRM);
655
656 return (error);
657 #endif
658 }
659