/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
 * All rights reserved.
 *
 * Portions Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>
 */

#include <sys/zfs_context.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/bio.h>
#include <sys/disk.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <geom/geom.h>
#include <geom/geom_int.h>

/*
 * Virtual device vector for GEOM.
 */

static g_attrchanged_t vdev_geom_attrchanged;
struct g_class zfs_vdev_class = {
	.name = "ZFS::VDEV",
	.version = G_VERSION,
	.attrchanged = vdev_geom_attrchanged,
};

DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev);

SYSCTL_DECL(_vfs_zfs_vdev);
/* Don't send BIO_FLUSH. */
static int vdev_geom_bio_flush_disable;
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_flush_disable, CTLFLAG_RWTUN,
    &vdev_geom_bio_flush_disable, 0, "Disable BIO_FLUSH");
/* Don't send BIO_DELETE. */
static int vdev_geom_bio_delete_disable;
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_delete_disable, CTLFLAG_RWTUN,
    &vdev_geom_bio_delete_disable, 0, "Disable BIO_DELETE");

/* Declare local functions */
static void vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read);

/*
 * Thread local storage used to indicate when a thread is probing geoms
 * for their guids.  If NULL, this thread is not tasting geoms.  If non NULL,
 * it is looking for a replacement for the vdev_t* that is its value.
 */
uint_t zfs_geom_probe_vdev_key;

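/*
 * Query the provider's reported rotation rate and cache it in the vdev,
 * falling back to VDEV_RATE_UNKNOWN if the attribute is unavailable.
 */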
static void
vdev_geom_set_rotation_rate(vdev_t *vd, struct g_consumer *cp)
{
	int error;
	uint16_t rate;

	error = g_getattr("GEOM::rotation_rate", cp, &rate);
	if (error == 0)
		vd->vdev_rotation_rate = rate;
	else
		vd->vdev_rotation_rate = VDEV_RATE_UNKNOWN;
}

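/*
 * Read the provider's GEOM::physpath attribute and record it in
 * vd->vdev_physpath.  If the path changed (or was previously unset and
 * do_null_update is set), request an async config update.
 */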
static void
vdev_geom_set_physpath(struct g_consumer *cp, boolean_t do_null_update)
{
	boolean_t needs_update = B_FALSE;
	vdev_t *vd;
	char *physpath;
	int error, physpath_len;

	if (g_access(cp, 1, 0, 0) != 0)
		return;

	vd = cp->private;
	physpath_len = MAXPATHLEN;
	physpath = g_malloc(physpath_len, M_WAITOK|M_ZERO);
	error = g_io_getattr("GEOM::physpath", cp, &physpath_len, physpath);
	g_access(cp, -1, 0, 0);
	if (error == 0) {
		char *old_physpath;

		/* g_topology lock ensures that vdev has not been closed */
		g_topology_assert();
		old_physpath = vd->vdev_physpath;
		vd->vdev_physpath = spa_strdup(physpath);

		if (old_physpath != NULL) {
			needs_update = (strcmp(old_physpath,
						vd->vdev_physpath) != 0);
			spa_strfree(old_physpath);
		} else
			needs_update = do_null_update;
	}
	g_free(physpath);

	/*
	 * If the physical path changed, update the config.
	 * Only request an update for previously unset physpaths if
	 * requested by the caller.
	 */
	if (needs_update)
		spa_async_request(vd->vdev_spa, SPA_ASYNC_CONFIG_UPDATE);

}

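/*
 * GEOM attribute-change callback: refresh the cached rotation rate or
 * physical path when the underlying provider reports a change.
 */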
static void
vdev_geom_attrchanged(struct g_consumer *cp, const char *attr)
{
	vdev_t *vd;
	char *old_physpath;
	int error;

	vd = cp->private;
	if (vd == NULL)
		return;

	if (strcmp(attr, "GEOM::rotation_rate") == 0) {
		vdev_geom_set_rotation_rate(vd, cp);
		return;
	}

	if (strcmp(attr, "GEOM::physpath") == 0) {
		vdev_geom_set_physpath(cp, /*do_null_update*/B_TRUE);
		return;
	}
}

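/*
 * Orphan callback, invoked from the GEOM event thread when the
 * underlying provider goes away.
 */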
static void
vdev_geom_orphan(struct g_consumer *cp)
{
	vdev_t *vd;

	g_topology_assert();

	vd = cp->private;
	if (vd == NULL) {
		/* Vdev close in progress.  Ignore the event. */
		return;
	}

	/*
	 * Orphan callbacks occur from the GEOM event thread.
	 * Concurrent with this call, new I/O requests may be
	 * working their way through GEOM about to find out
	 * (only once executed by the g_down thread) that we've
	 * been orphaned from our disk provider.  These I/Os
	 * must be retired before we can detach our consumer.
	 * This is most easily achieved by acquiring the
	 * SPA ZIO configuration lock as a writer, but doing
	 * so with the GEOM topology lock held would cause
	 * a lock order reversal.  Instead, rely on the SPA's
	 * async removal support to invoke a close on this
	 * vdev once it is safe to do so.
	 */
	vd->vdev_remove_wanted = B_TRUE;
	spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE);
}

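/*
 * Attach a GEOM consumer to the given provider, reusing the shared
 * "zfs::vdev" geom and any existing consumer for that provider, and
 * open it for reading.  Returns the consumer, or NULL on failure.
 */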
static struct g_consumer *
vdev_geom_attach(struct g_provider *pp, vdev_t *vd)
{
	struct g_geom *gp;
	struct g_consumer *cp;
	int error;

	g_topology_assert();

	ZFS_LOG(1, "Attaching to %s.", pp->name);

	if (pp->sectorsize > VDEV_PAD_SIZE || !ISP2(pp->sectorsize)) {
		ZFS_LOG(1, "Failing attach of %s. Incompatible sectorsize %d\n",
		    pp->name, pp->sectorsize);
		return (NULL);
	} else if (pp->mediasize < SPA_MINDEVSIZE) {
		ZFS_LOG(1, "Failing attach of %s. Incompatible mediasize %ju\n",
		    pp->name, pp->mediasize);
		return (NULL);
	}

	/* Do we have geom already? No? Create one. */
	LIST_FOREACH(gp, &zfs_vdev_class.geom, geom) {
		if (gp->flags & G_GEOM_WITHER)
			continue;
		if (strcmp(gp->name, "zfs::vdev") != 0)
			continue;
		break;
	}
	if (gp == NULL) {
		gp = g_new_geomf(&zfs_vdev_class, "zfs::vdev");
		gp->orphan = vdev_geom_orphan;
		gp->attrchanged = vdev_geom_attrchanged;
		cp = g_new_consumer(gp);
		error = g_attach(cp, pp);
		if (error != 0) {
			ZFS_LOG(1, "%s(%d): g_attach failed: %d\n", __func__,
			    __LINE__, error);
			vdev_geom_detach(cp, B_FALSE);
			return (NULL);
		}
		error = g_access(cp, 1, 0, 1);
		if (error != 0) {
			ZFS_LOG(1, "%s(%d): g_access failed: %d", __func__,
			       __LINE__, error);
			vdev_geom_detach(cp, B_FALSE);
			return (NULL);
		}
		ZFS_LOG(1, "Created geom and consumer for %s.", pp->name);
	} else {
		/* Check if we are already connected to this provider. */
		LIST_FOREACH(cp, &gp->consumer, consumer) {
			if (cp->provider == pp) {
				ZFS_LOG(1, "Found consumer for %s.", pp->name);
				break;
			}
		}
		if (cp == NULL) {
			cp = g_new_consumer(gp);
			error = g_attach(cp, pp);
			if (error != 0) {
				ZFS_LOG(1, "%s(%d): g_attach failed: %d\n",
				    __func__, __LINE__, error);
				vdev_geom_detach(cp, B_FALSE);
				return (NULL);
			}
			error = g_access(cp, 1, 0, 1);
			if (error != 0) {
				ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
				    __func__, __LINE__, error);
				vdev_geom_detach(cp, B_FALSE);
				return (NULL);
			}
			ZFS_LOG(1, "Created consumer for %s.", pp->name);
		} else {
			error = g_access(cp, 1, 0, 1);
			if (error != 0) {
				ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
				    __func__, __LINE__, error);
				return (NULL);
			}
			ZFS_LOG(1, "Used existing consumer for %s.", pp->name);
		}
	}

	/*
	 * BUG: cp may already belong to a vdev.  This could happen if:
	 * 1) That vdev is a shared spare, or
	 * 2) We are trying to reopen a missing vdev and we are scanning by
	 *    guid.  In that case, we'll ultimately fail to open this consumer,
	 *    but not until after setting the private field.
	 * The solution is to:
	 * 1) Don't set the private field until after the open succeeds, and
	 * 2) Set it to a linked list of vdevs, not just a single vdev
	 */
	cp->private = vd;
	if (vd != NULL) {
		vd->vdev_tsd = cp;
		vdev_geom_set_physpath(cp, /*do_null_update*/B_FALSE);
	}

	cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
	return (cp);
}

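/*
 * Release the access acquired in vdev_geom_attach() (if open_for_read)
 * and destroy the consumer on last close; destroy the shared geom once
 * it has no consumers left.
 */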
static void
vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read)
{
	struct g_geom *gp;
	vdev_t *vd;

	g_topology_assert();

	ZFS_LOG(1, "Detaching from %s.",
	    cp->provider && cp->provider->name ? cp->provider->name : "NULL");

	vd = cp->private;
	cp->private = NULL;

	gp = cp->geom;
	if (open_for_read)
		g_access(cp, -1, 0, -1);
	/* Destroy consumer on last close. */
	if (cp->acr == 0 && cp->ace == 0) {
		if (cp->acw > 0)
			g_access(cp, 0, -cp->acw, 0);
		if (cp->provider != NULL) {
			ZFS_LOG(1, "Destroying consumer for %s.",
			    cp->provider->name ? cp->provider->name : "NULL");
			g_detach(cp);
		}
		g_destroy_consumer(cp);
	}
	/* Destroy geom if there are no consumers left. */
	if (LIST_EMPTY(&gp->consumer)) {
		ZFS_LOG(1, "Destroyed geom %s.", gp->name);
		g_wither_geom(gp, ENXIO);
	}
}

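/*
 * Close the vdev's consumer.  Caller must hold the GEOM topology lock.
 */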
static void
vdev_geom_close_locked(vdev_t *vd)
{
	struct g_consumer *cp;

	g_topology_assert();

	cp = vd->vdev_tsd;
	vd->vdev_tsd = NULL;
	vd->vdev_delayed_close = B_FALSE;
	if (cp == NULL)
		return;

	ZFS_LOG(1, "Closing access to %s.", cp->provider->name);

	vdev_geom_detach(cp, B_TRUE);
}

/*
 * Issue one or more bios to the vdev in parallel.
 * cmds, datas, offsets, errors, and sizes are arrays of length ncmds.  Each IO
 * operation is described by parallel entries from each array.  There may be
 * more bios actually issued than entries in the array.
 */
static void
vdev_geom_io(struct g_consumer *cp, int *cmds, void **datas, off_t *offsets,
    off_t *sizes, int *errors, int ncmds)
{
	struct bio **bios;
	u_char *p;
	off_t off, maxio, s, end;
	int i, n_bios, j;
	size_t bios_size;

	maxio = MAXPHYS - (MAXPHYS % cp->provider->sectorsize);
	n_bios = 0;

	/* How many bios are required for all commands? */
	for (i = 0; i < ncmds; i++)
		n_bios += (sizes[i] + maxio - 1) / maxio;

	/* Allocate memory for the bios */
	bios_size = n_bios * sizeof(struct bio*);
	bios = kmem_zalloc(bios_size, KM_SLEEP);

	/* Prepare and issue all of the bios */
	for (i = j = 0; i < ncmds; i++) {
		off = offsets[i];
		p = datas[i];
		s = sizes[i];
		end = off + s;
		ASSERT((off % cp->provider->sectorsize) == 0);
		ASSERT((s % cp->provider->sectorsize) == 0);

		for (; off < end; off += maxio, p += maxio, s -= maxio, j++) {
			bios[j] = g_alloc_bio();
			bios[j]->bio_cmd = cmds[i];
			bios[j]->bio_done = NULL;
			bios[j]->bio_offset = off;
			bios[j]->bio_length = MIN(s, maxio);
			bios[j]->bio_data = p;
			g_io_request(bios[j], cp);
		}
	}
	ASSERT(j == n_bios);

	/* Wait for all of the bios to complete, and clean them up */
	for (i = j = 0; i < ncmds; i++) {
		off = offsets[i];
		s = sizes[i];
		end = off + s;

		for (; off < end; off += maxio, s -= maxio, j++) {
			errors[i] = biowait(bios[j], "vdev_geom_io") || errors[i];
			g_destroy_bio(bios[j]);
		}
	}
	kmem_free(bios, bios_size);
}

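/*
 * Read the vdev labels from the provider and return the first config
 * nvlist that passes basic validation.  Returns 0 on success or ENOENT
 * if no usable label was found.
 */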
static int
vdev_geom_read_config(struct g_consumer *cp, nvlist_t **config)
{
	struct g_provider *pp;
	vdev_phys_t *vdev_lists[VDEV_LABELS];
	char *p, *buf;
	size_t buflen;
	uint64_t psize, state, txg;
	off_t offsets[VDEV_LABELS];
	off_t size;
	off_t sizes[VDEV_LABELS];
	int cmds[VDEV_LABELS];
	int errors[VDEV_LABELS];
	int l, len;

	g_topology_assert_not();

	pp = cp->provider;
	ZFS_LOG(1, "Reading config from %s...", pp->name);

	psize = pp->mediasize;
	psize = P2ALIGN(psize, (uint64_t)sizeof(vdev_label_t));

	size = sizeof(*vdev_lists[0]) + pp->sectorsize -
	    ((sizeof(*vdev_lists[0]) - 1) % pp->sectorsize) - 1;

	buflen = sizeof(vdev_lists[0]->vp_nvlist);

	*config = NULL;
	/* Create all of the IO requests */
	for (l = 0; l < VDEV_LABELS; l++) {
		cmds[l] = BIO_READ;
		vdev_lists[l] = kmem_alloc(size, KM_SLEEP);
		offsets[l] = vdev_label_offset(psize, l, 0) + VDEV_SKIP_SIZE;
		sizes[l] = size;
		errors[l] = 0;
		ASSERT(offsets[l] % pp->sectorsize == 0);
	}

	/* Issue the IO requests */
	vdev_geom_io(cp, cmds, (void**)vdev_lists, offsets, sizes, errors,
	    VDEV_LABELS);

	/* Parse the labels */
	for (l = 0; l < VDEV_LABELS; l++) {
		if (errors[l] != 0)
			continue;

		buf = vdev_lists[l]->vp_nvlist;

		if (nvlist_unpack(buf, buflen, config, 0) != 0)
			continue;

		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
		    &state) != 0 || state > POOL_STATE_L2CACHE) {
			nvlist_free(*config);
			*config = NULL;
			continue;
		}

		if (state != POOL_STATE_SPARE &&
		    state != POOL_STATE_L2CACHE &&
		    (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
		    &txg) != 0 || txg == 0)) {
			nvlist_free(*config);
			*config = NULL;
			continue;
		}

		break;
	}

	/* Free the label storage */
	for (l = 0; l < VDEV_LABELS; l++)
		kmem_free(vdev_lists[l], size);

	return (*config == NULL ? ENOENT : 0);
}

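/*
 * Grow the configs array (if needed) so that index 'id' is valid,
 * preserving any existing entries.
 */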
static void
resize_configs(nvlist_t ***configs, uint64_t *count, uint64_t id)
{
	nvlist_t **new_configs;
	uint64_t i;

	if (id < *count)
		return;
	new_configs = kmem_zalloc((id + 1) * sizeof(nvlist_t *),
	    KM_SLEEP);
	for (i = 0; i < *count; i++)
		new_configs[i] = (*configs)[i];
	if (*configs != NULL)
		kmem_free(*configs, *count * sizeof(void *));
	*configs = new_configs;
	*count = id + 1;
}

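/*
 * Merge one just-read label config into the per-top-level-vdev configs
 * array, keeping only configs for the named pool and, for each vdev id,
 * the config with the highest txg.  Consumes cfg.
 */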
static void
process_vdev_config(nvlist_t ***configs, uint64_t *count, nvlist_t *cfg,
    const char *name, uint64_t* known_pool_guid)
{
	nvlist_t *vdev_tree;
	uint64_t pool_guid;
	uint64_t vdev_guid, known_guid;
	uint64_t id, txg, known_txg;
	char *pname;
	int i;

	if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &pname) != 0 ||
	    strcmp(pname, name) != 0)
		goto ignore;

	if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0)
		goto ignore;

	if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_TOP_GUID, &vdev_guid) != 0)
		goto ignore;

	if (nvlist_lookup_nvlist(cfg, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0)
		goto ignore;

	if (nvlist_lookup_uint64(vdev_tree, ZPOOL_CONFIG_ID, &id) != 0)
		goto ignore;

	VERIFY(nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);

	if (*known_pool_guid != 0) {
		if (pool_guid != *known_pool_guid)
			goto ignore;
	} else
		*known_pool_guid = pool_guid;

	resize_configs(configs, count, id);

	if ((*configs)[id] != NULL) {
		VERIFY(nvlist_lookup_uint64((*configs)[id],
		    ZPOOL_CONFIG_POOL_TXG, &known_txg) == 0);
		if (txg <= known_txg)
			goto ignore;
		nvlist_free((*configs)[id]);
	}

	(*configs)[id] = cfg;
	return;

ignore:
	nvlist_free(cfg);
}

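/*
 * Taste every GEOM provider in the system and collect the best label
 * config for each top-level vdev of the named pool.
 */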
int
vdev_geom_read_pool_label(const char *name,
    nvlist_t ***configs, uint64_t *count)
{
	struct g_class *mp;
	struct g_geom *gp;
	struct g_provider *pp;
	struct g_consumer *zcp;
	nvlist_t *vdev_cfg;
	uint64_t pool_guid;
	int error;

	DROP_GIANT();
	g_topology_lock();

	*configs = NULL;
	*count = 0;
	pool_guid = 0;
	LIST_FOREACH(mp, &g_classes, class) {
		if (mp == &zfs_vdev_class)
			continue;
		LIST_FOREACH(gp, &mp->geom, geom) {
			if (gp->flags & G_GEOM_WITHER)
				continue;
			LIST_FOREACH(pp, &gp->provider, provider) {
				if (pp->flags & G_PF_WITHER)
					continue;
				zcp = vdev_geom_attach(pp, NULL);
				if (zcp == NULL)
					continue;
				g_topology_unlock();
				error = vdev_geom_read_config(zcp, &vdev_cfg);
				g_topology_lock();
				vdev_geom_detach(zcp, B_TRUE);
				if (error)
					continue;
				ZFS_LOG(1, "successfully read vdev config");

				process_vdev_config(configs, count,
				    vdev_cfg, name, &pool_guid);
			}
		}
	}
	g_topology_unlock();
	PICKUP_GIANT();

	return (*count > 0 ? 0 : ENOENT);
}

enum match {
	NO_MATCH,
	TOP_MATCH,
	FULL_MATCH
};

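/*
 * Taste the provider's label and report how well it matches the vdev:
 * FULL_MATCH if the vdev guid matches, TOP_MATCH if only the top-level
 * vdev guid matches, NO_MATCH otherwise (including pool guid mismatch).
 */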
static enum match
vdev_attach_ok(vdev_t *vd, struct g_provider *pp)
{
	nvlist_t *config;
	uint64_t pool_guid, top_guid, vdev_guid;
	struct g_consumer *cp;

	cp = vdev_geom_attach(pp, NULL);
	if (cp == NULL) {
		ZFS_LOG(1, "Unable to attach tasting instance to %s.",
		    pp->name);
		return (NO_MATCH);
	}
	g_topology_unlock();
	if (vdev_geom_read_config(cp, &config) != 0) {
		g_topology_lock();
		vdev_geom_detach(cp, B_TRUE);
		ZFS_LOG(1, "Unable to read config from %s.", pp->name);
		return (NO_MATCH);
	}
	g_topology_lock();
	vdev_geom_detach(cp, B_TRUE);

	pool_guid = 0;
	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid);
	top_guid = 0;
	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TOP_GUID, &top_guid);
	vdev_guid = 0;
	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid);
	nvlist_free(config);

	/*
	 * Check that the label's pool guid matches the desired guid.
	 * Inactive spares and L2ARCs do not have any pool guid in the label.
	 */
	if (pool_guid != 0 && pool_guid != spa_guid(vd->vdev_spa)) {
		ZFS_LOG(1, "pool guid mismatch for provider %s: %ju != %ju.",
		    pp->name,
		    (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)pool_guid);
		return (NO_MATCH);
	}

	/*
	 * Check that the label's vdev guid matches the desired guid.
	 * The second condition handles possible race on vdev detach, when
	 * remaining vdev receives GUID of destroyed top level mirror vdev.
	 */
	if (vdev_guid == vd->vdev_guid) {
		ZFS_LOG(1, "guids match for provider %s.", pp->name);
		return (FULL_MATCH);
	} else if (top_guid == vd->vdev_guid && vd == vd->vdev_top) {
		ZFS_LOG(1, "top vdev guid match for provider %s.", pp->name);
		return (TOP_MATCH);
	}
	ZFS_LOG(1, "vdev guid mismatch for provider %s: %ju != %ju.",
	    pp->name, (uintmax_t)vd->vdev_guid, (uintmax_t)vdev_guid);
	return (NO_MATCH);
}

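/*
 * Walk all providers looking for one whose label matches the vdev's
 * guids, preferring a full match over a top-level-only match.
 */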
static struct g_consumer *
vdev_geom_attach_by_guids(vdev_t *vd)
{
	struct g_class *mp;
	struct g_geom *gp;
	struct g_provider *pp;
	struct g_consumer *cp;
	enum match m;

	g_topology_assert();

	cp = NULL;
	LIST_FOREACH(mp, &g_classes, class) {
		if (mp == &zfs_vdev_class)
			continue;
		LIST_FOREACH(gp, &mp->geom, geom) {
			if (gp->flags & G_GEOM_WITHER)
				continue;
			LIST_FOREACH(pp, &gp->provider, provider) {
				m = vdev_attach_ok(vd, pp);
				if (m == NO_MATCH)
					continue;
				if (cp != NULL) {
					if (m == FULL_MATCH)
						vdev_geom_detach(cp, B_TRUE);
					else
						continue;
				}
				cp = vdev_geom_attach(pp, vd);
				if (cp == NULL) {
					printf("ZFS WARNING: Unable to "
					    "attach to %s.\n", pp->name);
					continue;
				}
				if (m == FULL_MATCH)
					return (cp);
			}
		}
	}
	return (cp);
}

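/*
 * Open the vdev by searching all providers for matching guids; on
 * success rewrite vdev_path to point at the provider actually found.
 */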
static struct g_consumer *
vdev_geom_open_by_guids(vdev_t *vd)
{
	struct g_consumer *cp;
	char *buf;
	size_t len;

	g_topology_assert();

	ZFS_LOG(1, "Searching by guids [%ju:%ju].",
		(uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid);
	cp = vdev_geom_attach_by_guids(vd);
	if (cp != NULL) {
		len = strlen(cp->provider->name) + strlen("/dev/") + 1;
		buf = kmem_alloc(len, KM_SLEEP);

		snprintf(buf, len, "/dev/%s", cp->provider->name);
		spa_strfree(vd->vdev_path);
		vd->vdev_path = buf;

		ZFS_LOG(1, "Attach by guid [%ju:%ju] succeeded, provider %s.",
		    (uintmax_t)spa_guid(vd->vdev_spa),
		    (uintmax_t)vd->vdev_guid, vd->vdev_path);
	} else {
		ZFS_LOG(1, "Search by guid [%ju:%ju] failed.",
		    (uintmax_t)spa_guid(vd->vdev_spa),
		    (uintmax_t)vd->vdev_guid);
	}

	return (cp);
}

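/*
 * Open the vdev using its recorded /dev path, optionally verifying that
 * the label guids match before attaching.
 */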
static struct g_consumer *
vdev_geom_open_by_path(vdev_t *vd, int check_guid)
{
	struct g_provider *pp;
	struct g_consumer *cp;

	g_topology_assert();

	cp = NULL;
	pp = g_provider_by_name(vd->vdev_path + sizeof("/dev/") - 1);
	if (pp != NULL) {
		ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path);
		if (!check_guid || vdev_attach_ok(vd, pp) == FULL_MATCH)
			cp = vdev_geom_attach(pp, vd);
	}

	return (cp);
}

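/*
 * Open the vdev: locate a suitable GEOM provider (by path and/or guid),
 * attach to it, and report its size, ashift and rotation rate.
 */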
static int
vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
    uint64_t *logical_ashift, uint64_t *physical_ashift)
{
	struct g_provider *pp;
	struct g_consumer *cp;
	size_t bufsize;
	int error;

	/* Set the TLS to indicate downstack that we should not access zvols */
	VERIFY(tsd_set(zfs_geom_probe_vdev_key, vd) == 0);

	/*
	 * We must have a pathname, and it must be absolute.
	 */
	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (EINVAL);
	}

	/*
	 * Reopen the device if it's not currently open. Otherwise,
	 * just update the physical size of the device.
	 */
	if ((cp = vd->vdev_tsd) != NULL) {
		ASSERT(vd->vdev_reopening);
		goto skip_open;
	}

	DROP_GIANT();
	g_topology_lock();
	error = 0;

	if (vd->vdev_spa->spa_splitting_newspa ||
	    (vd->vdev_prevstate == VDEV_STATE_UNKNOWN &&
	     vd->vdev_spa->spa_load_state == SPA_LOAD_NONE ||
	     vd->vdev_spa->spa_load_state == SPA_LOAD_CREATE)) {
		/*
		 * We are dealing with a vdev that hasn't been previously
		 * opened (since boot), and we are not loading an
		 * existing pool configuration.  This looks like a
		 * vdev add operation to a new or existing pool.
		 * Assume the user knows what he/she is doing and find
		 * GEOM provider by its name, ignoring GUID mismatches.
		 *
		 * XXPOLICY: It would be safer to only allow a device
		 *           that is unlabeled or labeled but missing
		 *           GUID information to be opened in this fashion,
		 *           unless we are doing a split, in which case we
		 *           should allow any guid.
		 */
		cp = vdev_geom_open_by_path(vd, 0);
	} else {
		/*
		 * Try using the recorded path for this device, but only
		 * accept it if its label data contains the expected GUIDs.
		 */
		cp = vdev_geom_open_by_path(vd, 1);
		if (cp == NULL) {
			/*
			 * The device at vd->vdev_path doesn't have the
			 * expected GUIDs. The disks might have merely
			 * moved around so try all other GEOM providers
			 * to find one with the right GUIDs.
			 */
			cp = vdev_geom_open_by_guids(vd);
		}
	}

	/* Clear the TLS now that tasting is done */
	VERIFY(tsd_set(zfs_geom_probe_vdev_key, NULL) == 0);

	if (cp == NULL) {
		ZFS_LOG(1, "Provider %s not found.", vd->vdev_path);
		error = ENOENT;
	} else if (cp->provider->sectorsize > VDEV_PAD_SIZE ||
	    !ISP2(cp->provider->sectorsize)) {
		ZFS_LOG(1, "Provider %s has unsupported sectorsize.",
		    vd->vdev_path);

		vdev_geom_close_locked(vd);
		error = EINVAL;
		cp = NULL;
	} else if (cp->acw == 0 && (spa_mode(vd->vdev_spa) & FWRITE) != 0) {
		int i;

		for (i = 0; i < 5; i++) {
			error = g_access(cp, 0, 1, 0);
			if (error == 0)
				break;
			g_topology_unlock();
			tsleep(vd, 0, "vdev", hz / 2);
			g_topology_lock();
		}
		if (error != 0) {
			printf("ZFS WARNING: Unable to open %s for writing (error=%d).\n",
			    vd->vdev_path, error);
			vdev_geom_close_locked(vd);
			cp = NULL;
		}
	}

	/* Fetch initial physical path information for this device. */
	if (cp != NULL)
		vdev_geom_attrchanged(cp, "GEOM::physpath");

	g_topology_unlock();
	PICKUP_GIANT();
	if (cp == NULL) {
		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
		return (error);
	}
skip_open:
	pp = cp->provider;

	/*
	 * Determine the actual size of the device.
	 */
	*max_psize = *psize = pp->mediasize;

	/*
	 * Determine the device's minimum transfer size and preferred
	 * transfer size.
	 */
	*logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1;
	*physical_ashift = 0;
	if (pp->stripesize > (1 << *logical_ashift) && ISP2(pp->stripesize) &&
	    pp->stripesize <= (1 << SPA_MAXASHIFT) && pp->stripeoffset == 0)
		*physical_ashift = highbit(pp->stripesize) - 1;

	/*
	 * Clear the nowritecache settings, so that on a vdev_reopen()
	 * we will try again.
	 */
	vd->vdev_nowritecache = B_FALSE;

	/*
	 * Determine the device's rotation rate.
	 */
	vdev_geom_set_rotation_rate(vd, cp);

	return (0);
}

static void
vdev_geom_close(vdev_t *vd)
{

	if (vd->vdev_reopening)
		return;

	DROP_GIANT();
	g_topology_lock();
	vdev_geom_close_locked(vd);
	g_topology_unlock();
	PICKUP_GIANT();
}

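/*
 * I/O completion callback, run when a bio issued by vdev_geom_io_start()
 * finishes.
 */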
static void
vdev_geom_io_intr(struct bio *bp)
{
	vdev_t *vd;
	zio_t *zio;

	zio = bp->bio_caller1;
	vd = zio->io_vd;
	zio->io_error = bp->bio_error;
	if (zio->io_error == 0 && bp->bio_resid != 0)
		zio->io_error = SET_ERROR(EIO);

	switch(zio->io_error) {
	case ENOTSUP:
		/*
		 * If we get ENOTSUP for BIO_FLUSH or BIO_DELETE we know
		 * that future attempts will never succeed. In this case
		 * we set a persistent flag so that we don't bother with
		 * requests in the future.
		 */
		switch(bp->bio_cmd) {
		case BIO_FLUSH:
			vd->vdev_nowritecache = B_TRUE;
			break;
		case BIO_DELETE:
			vd->vdev_notrim = B_TRUE;
			break;
		}
		break;
	case ENXIO:
		if (!vd->vdev_remove_wanted) {
			/*
			 * If provider's error is set we assume it is being
			 * removed.
			 */
			if (bp->bio_to->error != 0) {
				vd->vdev_remove_wanted = B_TRUE;
				spa_async_request(zio->io_spa,
				    SPA_ASYNC_REMOVE);
			} else if (!vd->vdev_delayed_close) {
				vd->vdev_delayed_close = B_TRUE;
			}
		}
		break;
	}
	g_destroy_bio(bp);
	zio_delay_interrupt(zio);
}

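/*
 * Translate a ZIO into a GEOM bio (read, write, delete or flush) and
 * issue it to the vdev's consumer.
 */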
static void
vdev_geom_io_start(zio_t *zio)
{
	vdev_t *vd;
	struct g_consumer *cp;
	struct bio *bp;
	int error;

	vd = zio->io_vd;

	switch (zio->io_type) {
	case ZIO_TYPE_IOCTL:
		/* XXPOLICY */
		if (!vdev_readable(vd)) {
			zio->io_error = SET_ERROR(ENXIO);
			zio_interrupt(zio);
			return;
		} else {
			switch (zio->io_cmd) {
			case DKIOCFLUSHWRITECACHE:
				if (zfs_nocacheflush || vdev_geom_bio_flush_disable)
					break;
				if (vd->vdev_nowritecache) {
					zio->io_error = SET_ERROR(ENOTSUP);
					break;
				}
				goto sendreq;
			default:
				zio->io_error = SET_ERROR(ENOTSUP);
			}
		}

		zio_execute(zio);
		return;
	case ZIO_TYPE_FREE:
		if (vd->vdev_notrim) {
			zio->io_error = SET_ERROR(ENOTSUP);
		} else if (!vdev_geom_bio_delete_disable) {
			goto sendreq;
		}
		zio_execute(zio);
		return;
	}
sendreq:
	ASSERT(zio->io_type == ZIO_TYPE_READ ||
	    zio->io_type == ZIO_TYPE_WRITE ||
	    zio->io_type == ZIO_TYPE_FREE ||
	    zio->io_type == ZIO_TYPE_IOCTL);

	cp = vd->vdev_tsd;
	if (cp == NULL) {
		zio->io_error = SET_ERROR(ENXIO);
		zio_interrupt(zio);
		return;
	}
	bp = g_alloc_bio();
	bp->bio_caller1 = zio;
	switch (zio->io_type) {
	case ZIO_TYPE_READ:
	case ZIO_TYPE_WRITE:
		zio->io_target_timestamp = zio_handle_io_delay(zio);
		bp->bio_cmd = zio->io_type == ZIO_TYPE_READ ? BIO_READ : BIO_WRITE;
		bp->bio_data = zio->io_data;
		bp->bio_offset = zio->io_offset;
		bp->bio_length = zio->io_size;
		break;
	case ZIO_TYPE_FREE:
		bp->bio_cmd = BIO_DELETE;
		bp->bio_data = NULL;
		bp->bio_offset = zio->io_offset;
		bp->bio_length = zio->io_size;
		break;
	case ZIO_TYPE_IOCTL:
		bp->bio_cmd = BIO_FLUSH;
		bp->bio_flags |= BIO_ORDERED;
		bp->bio_data = NULL;
		bp->bio_offset = cp->provider->mediasize;
		bp->bio_length = 0;
		break;
	}
	bp->bio_done = vdev_geom_io_intr;

	g_io_request(bp, cp);
}

static void
vdev_geom_io_done(zio_t *zio)
{
}

static void
vdev_geom_hold(vdev_t *vd)
{
}

static void
vdev_geom_rele(vdev_t *vd)
{
}

vdev_ops_t vdev_geom_ops = {
	vdev_geom_open,
	vdev_geom_close,
	vdev_default_asize,
	vdev_geom_io_start,
	vdev_geom_io_done,
	NULL,
	vdev_geom_hold,
	vdev_geom_rele,
	VDEV_TYPE_DISK,		/* name of this vdev type */
	B_TRUE			/* leaf vdev */
};
1078