xref: /netbsd-src/external/cddl/osnet/dist/cmd/zpool/zpool_vdev.c (revision 48fb7bfab72acd4281a53bbee5ccf3f809019e75)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * Functions to convert between a list of vdevs and an nvlist representing the
29  * configuration.  Each entry in the list can be one of:
30  *
31  * 	Device vdevs
32  * 		disk=(path=..., devid=...)
33  * 		file=(path=...)
34  *
35  * 	Group vdevs
36  * 		raidz[1|2]=(...)
37  * 		mirror=(...)
38  *
39  * 	Hot spares
40  *
41  * While the underlying implementation supports it, group vdevs cannot contain
42  * other group vdevs.  All userland verification of devices is contained within
43  * this file.  If successful, the nvlist returned can be passed directly to the
44  * kernel; we've done as much verification as possible in userland.
45  *
46  * Hot spares are a special case, and passed down as an array of disk vdevs, at
47  * the same level as the root of the vdev tree.
48  *
49  * The only function exported by this file is 'make_root_vdev'.  The
50  * function performs several passes:
51  *
52  * 	1. Construct the vdev specification.  Performs syntax validation and
53  *         makes sure each device is valid.
 * 	2. Check for devices in use.  Using libdiskmgt, make sure that none
 *         of the devices are already in use.  Some checks can be overridden
 *         using the 'force' flag, others cannot.
57  * 	3. Check for replication errors if the 'force' flag is not specified.
58  *         validates that the replication level is consistent across the
59  *         entire pool.
60  * 	4. Call libzfs to label any whole disks with an EFI label.
61  */
62 
63 #include <assert.h>
64 #include <devid.h>
65 #include <errno.h>
66 #include <fcntl.h>
67 #include <libintl.h>
68 #include <libnvpair.h>
69 #include <limits.h>
70 #include <stdio.h>
71 #include <string.h>
72 #include <unistd.h>
73 #include <sys/efi_partition.h>
74 #include <sys/stat.h>
75 #include <sys/vtoc.h>
76 #include <sys/mntent.h>
77 
78 #include "zpool_util.h"
79 
80 #define	DISK_ROOT	"/dev/dsk"
81 #define	RDISK_ROOT	"/dev/rdsk"
82 #define	BACKUP_SLICE	"s2"
83 
84 /*
85  * For any given vdev specification, we can have multiple errors.  The
86  * vdev_error() function keeps track of whether we have seen an error yet, and
87  * prints out a header if its the first error we've seen.
88  */
89 boolean_t error_seen;
90 boolean_t is_force;
91 
92 /*PRINTFLIKE1*/
93 static void
94 vdev_error(const char *fmt, ...)
95 {
96 	va_list ap;
97 
98 	if (!error_seen) {
99 		(void) fprintf(stderr, gettext("invalid vdev specification\n"));
100 		if (!is_force)
101 			(void) fprintf(stderr, gettext("use '-f' to override "
102 			    "the following errors:\n"));
103 		else
104 			(void) fprintf(stderr, gettext("the following errors "
105 			    "must be manually repaired:\n"));
106 		error_seen = B_TRUE;
107 	}
108 
109 	va_start(ap, fmt);
110 	(void) vfprintf(stderr, fmt, ap);
111 	va_end(ap);
112 }
113 
static void
libdiskmgt_error(int error)
{
	/*
	 * ENXIO/ENODEV is a valid error message if the device doesn't live in
	 * /dev/dsk.  Don't bother printing an error message in this case.
	 */
	switch (error) {
	case ENXIO:
	case ENODEV:
		break;
	default:
		(void) fprintf(stderr, gettext("warning: device in use "
		    "checking failed: %s\n"), strerror(error));
		break;
	}
}
127 
128 /*
129  * Check that a file is valid.  All we can do in this case is check that it's
130  * not in use by another pool, and not in use by swap.
131  */
132 static int
133 check_file(const char *file, boolean_t force, boolean_t isspare)
134 {
135 	char  *name;
136 	int fd;
137 	int ret = 0;
138 	int err;
139 	pool_state_t state;
140 	boolean_t inuse;
141 
142 #ifndef __NetBSD__
143 	if (dm_inuse_swap(file, &err)) {
144 		if (err)
145 			libdiskmgt_error(err);
146 		else
147 			vdev_error(gettext("%s is currently used by swap. "
148 			    "Please see swap(1M).\n"), file);
149 		return (-1);
150 	}
151 #endif
152 
153 	if ((fd = open(file, O_RDONLY)) < 0)
154 		return (0);
155 
156 	if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) == 0 && inuse) {
157 		const char *desc;
158 
159 		switch (state) {
160 		case POOL_STATE_ACTIVE:
161 			desc = gettext("active");
162 			break;
163 
164 		case POOL_STATE_EXPORTED:
165 			desc = gettext("exported");
166 			break;
167 
168 		case POOL_STATE_POTENTIALLY_ACTIVE:
169 			desc = gettext("potentially active");
170 			break;
171 
172 		default:
173 			desc = gettext("unknown");
174 			break;
175 		}
176 
177 		/*
178 		 * Allow hot spares to be shared between pools.
179 		 */
180 		if (state == POOL_STATE_SPARE && isspare)
181 			return (0);
182 
183 		if (state == POOL_STATE_ACTIVE ||
184 		    state == POOL_STATE_SPARE || !force) {
185 			switch (state) {
186 			case POOL_STATE_SPARE:
187 				vdev_error(gettext("%s is reserved as a hot "
188 				    "spare for pool %s\n"), file, name);
189 				break;
190 			default:
191 				vdev_error(gettext("%s is part of %s pool "
192 				    "'%s'\n"), file, desc, name);
193 				break;
194 			}
195 			ret = -1;
196 		}
197 
198 		free(name);
199 	}
200 
201 	(void) close(fd);
202 	return (ret);
203 }
204 
205 
206 /*
207  * By "whole disk" we mean an entire physical disk (something we can
208  * label, toggle the write cache on, etc.) as opposed to the full
209  * capacity of a pseudo-device such as lofi or did.  We act as if we
210  * are labeling the disk, which should be a pretty good test of whether
211  * it's a viable device or not.  Returns B_TRUE if it is and B_FALSE if
212  * it isn't.
213  */
214 static boolean_t
215 is_whole_disk(const char *arg)
216 {
217 	struct dk_gpt *label;
218 	int	fd;
219 	char	path[MAXPATHLEN];
220 
221 	(void) snprintf(path, sizeof (path), "%s%s%s",
222 	    RDISK_ROOT, strrchr(arg, '/'), BACKUP_SLICE);
223 	if ((fd = open(path, O_RDWR | O_NDELAY)) < 0)
224 		return (B_FALSE);
225 	if (efi_alloc_and_init(fd, EFI_NUMPAR, &label) != 0) {
226 		(void) close(fd);
227 		return (B_FALSE);
228 	}
229 	efi_free(label);
230 	(void) close(fd);
231 	return (B_TRUE);
232 }
233 
234 /*
235  * Create a leaf vdev.  Determine if this is a file or a device.  If it's a
236  * device, fill in the device id to make a complete nvlist.  Valid forms for a
237  * leaf vdev are:
238  *
239  * 	/dev/dsk/xxx	Complete disk path
240  * 	/xxx		Full path to file
241  * 	xxx		Shorthand for /dev/dsk/xxx
242  */
243 static nvlist_t *
244 make_leaf_vdev(const char *arg, uint64_t is_log)
245 {
246 	char path[MAXPATHLEN];
247 	struct stat64 statbuf;
248 	nvlist_t *vdev = NULL;
249 	char *type = NULL;
250 	boolean_t wholedisk = B_FALSE;
251 
252 	/*
253 	 * Determine what type of vdev this is, and put the full path into
254 	 * 'path'.  We detect whether this is a device of file afterwards by
255 	 * checking the st_mode of the file.
256 	 */
257 	if (arg[0] == '/') {
258 		/*
259 		 * Complete device or file path.  Exact type is determined by
260 		 * examining the file descriptor afterwards.
261 		 */
262 		wholedisk = is_whole_disk(arg);
263 		if (!wholedisk && (stat64(arg, &statbuf) != 0)) {
264 			(void) fprintf(stderr,
265 			    gettext("cannot open '%s': %s\n"),
266 			    arg, strerror(errno));
267 			return (NULL);
268 		}
269 
270 		(void) strlcpy(path, arg, sizeof (path));
271 	} else {
272 		/*
273 		 * This may be a short path for a device, or it could be total
274 		 * gibberish.  Check to see if it's a known device in
275 		 * /dev/dsk/.  As part of this check, see if we've been given a
276 		 * an entire disk (minus the slice number).
277 		 */
278 		(void) snprintf(path, sizeof (path), "%s/%s", DISK_ROOT,
279 		    arg);
280 		wholedisk = is_whole_disk(path);
281 		if (!wholedisk && (stat64(path, &statbuf) != 0)) {
282 			/*
283 			 * If we got ENOENT, then the user gave us
284 			 * gibberish, so try to direct them with a
285 			 * reasonable error message.  Otherwise,
286 			 * regurgitate strerror() since it's the best we
287 			 * can do.
288 			 */
289 			if (errno == ENOENT) {
290 				(void) fprintf(stderr,
291 				    gettext("cannot open '%s': no such "
292 				    "device in %s\n"), arg, DISK_ROOT);
293 				(void) fprintf(stderr,
294 				    gettext("must be a full path or "
295 				    "shorthand device name\n"));
296 				return (NULL);
297 			} else {
298 				(void) fprintf(stderr,
299 				    gettext("cannot open '%s': %s\n"),
300 				    path, strerror(errno));
301 				return (NULL);
302 			}
303 		}
304 	}
305 
306 	/*
307 	 * Determine whether this is a device or a file.
308 	 */
309 	if (wholedisk || S_ISBLK(statbuf.st_mode)) {
310 		type = VDEV_TYPE_DISK;
311 	} else if (S_ISREG(statbuf.st_mode)) {
312 		type = VDEV_TYPE_FILE;
313 	} else {
314 		(void) fprintf(stderr, gettext("cannot use '%s': must be a "
315 		    "block device or regular file\n"), path);
316 		return (NULL);
317 	}
318 
319 	/*
320 	 * Finally, we have the complete device or file, and we know that it is
321 	 * acceptable to use.  Construct the nvlist to describe this vdev.  All
322 	 * vdevs have a 'path' element, and devices also have a 'devid' element.
323 	 */
324 	verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0);
325 	verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0);
326 	verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0);
327 	verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_LOG, is_log) == 0);
328 	if (strcmp(type, VDEV_TYPE_DISK) == 0)
329 		verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK,
330 		    (uint64_t)wholedisk) == 0);
331 
332 	/*
333 	 * For a whole disk, defer getting its devid until after labeling it.
334 	 */
335 	if (S_ISBLK(statbuf.st_mode) && !wholedisk) {
336 		/*
337 		 * Get the devid for the device.
338 		 */
339 		int fd;
340 		ddi_devid_t devid;
341 		char *minor = NULL, *devid_str = NULL;
342 
343 		if ((fd = open(path, O_RDONLY)) < 0) {
344 			(void) fprintf(stderr, gettext("cannot open '%s': "
345 			    "%s\n"), path, strerror(errno));
346 			nvlist_free(vdev);
347 			return (NULL);
348 		}
349 
350 		if (devid_get(fd, &devid) == 0) {
351 			if (devid_get_minor_name(fd, &minor) == 0 &&
352 			    (devid_str = devid_str_encode(devid, minor)) !=
353 			    NULL) {
354 				verify(nvlist_add_string(vdev,
355 				    ZPOOL_CONFIG_DEVID, devid_str) == 0);
356 			}
357 			if (devid_str != NULL)
358 				devid_str_free(devid_str);
359 			if (minor != NULL)
360 				devid_str_free(minor);
361 			devid_free(devid);
362 		}
363 
364 		(void) close(fd);
365 	}
366 
367 	return (vdev);
368 }
369 
/*
 * Go through and verify the replication level of the pool is consistent.
 * Performs the following checks:
 *
 * 	For the new spec, verifies that devices in mirrors and raidz are the
 * 	same size.
 *
 * 	If the current configuration already has inconsistent replication
 * 	levels, ignore any other potential problems in the new spec.
 *
 * 	Otherwise, make sure that the current spec (if there is one) and the new
 * 	spec have consistent replication levels.
 */
typedef struct replication_level {
	char *zprl_type;	/* vdev type, e.g. "disk", "file", "mirror" */
	uint64_t zprl_children;	/* number of children per toplevel vdev */
	uint64_t zprl_parity;	/* raidz parity; 0 for non-raidz types */
} replication_level_t;

/* Devices in one toplevel may differ in size by up to this much (~16MB). */
#define	ZPOOL_FUZZ	(16 * 1024 * 1024)

/*
 * Given a list of toplevel vdevs, return the current replication level.  If
 * the config is inconsistent, then NULL is returned.  If 'fatal' is set, then
 * an error message will be displayed for each self-inconsistent vdev.
 *
 * The returned structure is allocated with safe_malloc(); the caller is
 * responsible for freeing it.
 */
static replication_level_t *
get_replication(nvlist_t *nvroot, boolean_t fatal)
{
	nvlist_t **top;
	uint_t t, toplevels;
	nvlist_t **child;
	uint_t c, children;
	nvlist_t *nv;
	char *type;
	replication_level_t lastrep, rep, *ret;
	boolean_t dontreport;

	ret = safe_malloc(sizeof (replication_level_t));

	verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
	    &top, &toplevels) == 0);

	lastrep.zprl_type = NULL;
	for (t = 0; t < toplevels; t++) {
		uint64_t is_log = B_FALSE;

		nv = top[t];

		/*
		 * For separate logs we ignore the top level vdev replication
		 * constraints.
		 */
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &is_log);
		if (is_log)
			continue;

		verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE,
		    &type) == 0);
		if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
		    &child, &children) != 0) {
			/*
			 * This is a 'file' or 'disk' vdev.
			 */
			rep.zprl_type = type;
			rep.zprl_children = 1;
			rep.zprl_parity = 0;
		} else {
			uint64_t vdev_size;

			/*
			 * This is a mirror or RAID-Z vdev.  Go through and make
			 * sure the contents are all the same (files vs. disks),
			 * keeping track of the number of elements in the
			 * process.
			 *
			 * We also check that the size of each vdev (if it can
			 * be determined) is the same.
			 */
			rep.zprl_type = type;
			rep.zprl_children = 0;

			if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) {
				verify(nvlist_lookup_uint64(nv,
				    ZPOOL_CONFIG_NPARITY,
				    &rep.zprl_parity) == 0);
				assert(rep.zprl_parity != 0);
			} else {
				rep.zprl_parity = 0;
			}

			/*
			 * The 'dontreport' variable indicates that we've
			 * already reported an error for this spec, so don't
			 * bother doing it again.
			 */
			type = NULL;
			dontreport = 0;
			vdev_size = -1ULL;
			for (c = 0; c < children; c++) {
				nvlist_t *cnv = child[c];
				char *path;
				struct stat64 statbuf;
				uint64_t size = -1ULL;
				char *childtype;
				int fd, err;

				rep.zprl_children++;

				verify(nvlist_lookup_string(cnv,
				    ZPOOL_CONFIG_TYPE, &childtype) == 0);

				/*
				 * If this is a replacing or spare vdev, then
				 * get the real first child of the vdev.
				 */
				if (strcmp(childtype,
				    VDEV_TYPE_REPLACING) == 0 ||
				    strcmp(childtype, VDEV_TYPE_SPARE) == 0) {
					nvlist_t **rchild;
					uint_t rchildren;

					verify(nvlist_lookup_nvlist_array(cnv,
					    ZPOOL_CONFIG_CHILDREN, &rchild,
					    &rchildren) == 0);
					assert(rchildren == 2);
					cnv = rchild[0];

					verify(nvlist_lookup_string(cnv,
					    ZPOOL_CONFIG_TYPE,
					    &childtype) == 0);
				}

				verify(nvlist_lookup_string(cnv,
				    ZPOOL_CONFIG_PATH, &path) == 0);

				/*
				 * If we have a raidz/mirror that combines disks
				 * with files, report it as an error.
				 */
				if (!dontreport && type != NULL &&
				    strcmp(type, childtype) != 0) {
					if (ret != NULL)
						free(ret);
					ret = NULL;
					if (fatal)
						vdev_error(gettext(
						    "mismatched replication "
						    "level: %s contains both "
						    "files and devices\n"),
						    rep.zprl_type);
					else
						return (NULL);
					dontreport = B_TRUE;
				}

				/*
				 * According to stat(2), the value of 'st_size'
				 * is undefined for block devices and character
				 * devices.  But there is no effective way to
				 * determine the real size in userland.
				 *
				 * Instead, we'll take advantage of an
				 * implementation detail of spec_size().  If the
				 * device is currently open, then we (should)
				 * return a valid size.
				 *
				 * If we still don't get a valid size (indicated
				 * by a size of 0 or MAXOFFSET_T), then ignore
				 * this device altogether.
				 */
				if ((fd = open(path, O_RDONLY)) >= 0) {
					err = fstat64(fd, &statbuf);
					(void) close(fd);
				} else {
					err = stat64(path, &statbuf);
				}

				if (err != 0 ||
				    statbuf.st_size == 0 ||
				    statbuf.st_size == MAXOFFSET_T)
					continue;

				size = statbuf.st_size;

				/*
				 * Also make sure that devices and
				 * slices have a consistent size.  If
				 * they differ by a significant amount
				 * (~16MB) then report an error.
				 *
				 * NOTE(review): 'size' and 'vdev_size' are
				 * uint64_t but labs() takes a long, so the
				 * unsigned difference may be truncated on
				 * ILP32 platforms -- confirm whether this
				 * matters here.
				 */
				if (!dontreport &&
				    (vdev_size != -1ULL &&
				    (labs(size - vdev_size) >
				    ZPOOL_FUZZ))) {
					if (ret != NULL)
						free(ret);
					ret = NULL;
					if (fatal)
						vdev_error(gettext(
						    "%s contains devices of "
						    "different sizes\n"),
						    rep.zprl_type);
					else
						return (NULL);
					dontreport = B_TRUE;
				}

				type = childtype;
				vdev_size = size;
			}
		}

		/*
		 * At this point, we have the replication of the last toplevel
		 * vdev in 'rep'.  Compare it to 'lastrep' to see if its
		 * different.
		 */
		if (lastrep.zprl_type != NULL) {
			if (strcmp(lastrep.zprl_type, rep.zprl_type) != 0) {
				if (ret != NULL)
					free(ret);
				ret = NULL;
				if (fatal)
					vdev_error(gettext(
					    "mismatched replication level: "
					    "both %s and %s vdevs are "
					    "present\n"),
					    lastrep.zprl_type, rep.zprl_type);
				else
					return (NULL);
			} else if (lastrep.zprl_parity != rep.zprl_parity) {
				if (ret)
					free(ret);
				ret = NULL;
				if (fatal)
					vdev_error(gettext(
					    "mismatched replication level: "
					    "both %llu and %llu device parity "
					    "%s vdevs are present\n"),
					    lastrep.zprl_parity,
					    rep.zprl_parity,
					    rep.zprl_type);
				else
					return (NULL);
			} else if (lastrep.zprl_children != rep.zprl_children) {
				if (ret)
					free(ret);
				ret = NULL;
				if (fatal)
					vdev_error(gettext(
					    "mismatched replication level: "
					    "both %llu-way and %llu-way %s "
					    "vdevs are present\n"),
					    lastrep.zprl_children,
					    rep.zprl_children,
					    rep.zprl_type);
				else
					return (NULL);
			}
		}
		lastrep = rep;
	}

	/* 'rep' holds the (consistent) replication of the last toplevel. */
	if (ret != NULL)
		*ret = rep;

	return (ret);
}
639 
640 /*
641  * Check the replication level of the vdev spec against the current pool.  Calls
642  * get_replication() to make sure the new spec is self-consistent.  If the pool
643  * has a consistent replication level, then we ignore any errors.  Otherwise,
644  * report any difference between the two.
645  */
646 static int
647 check_replication(nvlist_t *config, nvlist_t *newroot)
648 {
649 	nvlist_t **child;
650 	uint_t	children;
651 	replication_level_t *current = NULL, *new;
652 	int ret;
653 
654 	/*
655 	 * If we have a current pool configuration, check to see if it's
656 	 * self-consistent.  If not, simply return success.
657 	 */
658 	if (config != NULL) {
659 		nvlist_t *nvroot;
660 
661 		verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
662 		    &nvroot) == 0);
663 		if ((current = get_replication(nvroot, B_FALSE)) == NULL)
664 			return (0);
665 	}
666 	/*
667 	 * for spares there may be no children, and therefore no
668 	 * replication level to check
669 	 */
670 	if ((nvlist_lookup_nvlist_array(newroot, ZPOOL_CONFIG_CHILDREN,
671 	    &child, &children) != 0) || (children == 0)) {
672 		free(current);
673 		return (0);
674 	}
675 
676 	/*
677 	 * If all we have is logs then there's no replication level to check.
678 	 */
679 	if (num_logs(newroot) == children) {
680 		free(current);
681 		return (0);
682 	}
683 
684 	/*
685 	 * Get the replication level of the new vdev spec, reporting any
686 	 * inconsistencies found.
687 	 */
688 	if ((new = get_replication(newroot, B_TRUE)) == NULL) {
689 		free(current);
690 		return (-1);
691 	}
692 
693 	/*
694 	 * Check to see if the new vdev spec matches the replication level of
695 	 * the current pool.
696 	 */
697 	ret = 0;
698 	if (current != NULL) {
699 		if (strcmp(current->zprl_type, new->zprl_type) != 0) {
700 			vdev_error(gettext(
701 			    "mismatched replication level: pool uses %s "
702 			    "and new vdev is %s\n"),
703 			    current->zprl_type, new->zprl_type);
704 			ret = -1;
705 		} else if (current->zprl_parity != new->zprl_parity) {
706 			vdev_error(gettext(
707 			    "mismatched replication level: pool uses %llu "
708 			    "device parity and new vdev uses %llu\n"),
709 			    current->zprl_parity, new->zprl_parity);
710 			ret = -1;
711 		} else if (current->zprl_children != new->zprl_children) {
712 			vdev_error(gettext(
713 			    "mismatched replication level: pool uses %llu-way "
714 			    "%s and new vdev uses %llu-way %s\n"),
715 			    current->zprl_children, current->zprl_type,
716 			    new->zprl_children, new->zprl_type);
717 			ret = -1;
718 		}
719 	}
720 
721 	free(new);
722 	if (current != NULL)
723 		free(current);
724 
725 	return (ret);
726 }
727 
/*
 * Go through and find any whole disks in the vdev specification, labelling them
 * as appropriate.  When constructing the vdev spec, we were unable to open this
 * device in order to provide a devid.  Now that we have labelled the disk and
 * know that slice 0 is valid, we can construct the devid now.
 *
 * If the disk was already labeled with an EFI label, we will have gotten the
 * devid already (because we were able to open the whole disk).  Otherwise, we
 * need to get the devid after we label the disk.
 *
 * Returns 0 on success, or -1 (propagated from the first failing child) on
 * failure.
 */
static int
make_disks(zpool_handle_t *zhp, nvlist_t *nv)
{
	nvlist_t **child;
	uint_t c, children;
	char *type, *path, *diskname;
	char buf[MAXPATHLEN];
	uint64_t wholedisk;
	int fd;
	int ret;
	ddi_devid_t devid;
	char *minor = NULL, *devid_str = NULL;

	verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) != 0) {

		/* Leaf vdev: only whole disks need labeling. */
		if (strcmp(type, VDEV_TYPE_DISK) != 0)
			return (0);

		/*
		 * We have a disk device.  Get the path to the device
		 * and see if it's a whole disk by appending the backup
		 * slice and stat()ing the device.
		 */
		verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0);
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
		    &wholedisk) != 0 || !wholedisk)
			return (0);

		/* Label the disk using just its base name (after '/'). */
		diskname = strrchr(path, '/');
		assert(diskname != NULL);
		diskname++;
		if (zpool_label_disk(g_zfs, zhp, diskname) == -1)
			return (-1);

		/*
		 * Fill in the devid, now that we've labeled the disk.
		 */
		(void) snprintf(buf, sizeof (buf), "%ss0", path);
		if ((fd = open(buf, O_RDONLY)) < 0) {
			(void) fprintf(stderr,
			    gettext("cannot open '%s': %s\n"),
			    buf, strerror(errno));
			return (-1);
		}

		if (devid_get(fd, &devid) == 0) {
			/* A devid is desirable but not mandatory. */
			if (devid_get_minor_name(fd, &minor) == 0 &&
			    (devid_str = devid_str_encode(devid, minor)) !=
			    NULL) {
				verify(nvlist_add_string(nv,
				    ZPOOL_CONFIG_DEVID, devid_str) == 0);
			}
			if (devid_str != NULL)
				devid_str_free(devid_str);
			if (minor != NULL)
				devid_str_free(minor);
			devid_free(devid);
		}

		/*
		 * Update the path to refer to the 's0' slice.  The presence of
		 * the 'whole_disk' field indicates to the CLI that we should
		 * chop off the slice number when displaying the device in
		 * future output.
		 */
		verify(nvlist_add_string(nv, ZPOOL_CONFIG_PATH, buf) == 0);

		(void) close(fd);

		return (0);
	}

	/* Interior vdev: recurse into children, then spares and cache. */
	for (c = 0; c < children; c++)
		if ((ret = make_disks(zhp, child[c])) != 0)
			return (ret);

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
	    &child, &children) == 0)
		for (c = 0; c < children; c++)
			if ((ret = make_disks(zhp, child[c])) != 0)
				return (ret);

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
	    &child, &children) == 0)
		for (c = 0; c < children; c++)
			if ((ret = make_disks(zhp, child[c])) != 0)
				return (ret);

	return (0);
}
831 
/*
 * Determine if the given path is a hot spare within the given configuration.
 *
 * Reads the device's on-disk label and compares its guid against the guids
 * of the spares listed in 'config'.
 */
static boolean_t
is_spare(nvlist_t *config, const char *path)
{
	int fd;
	pool_state_t state;
	char *name = NULL;
	nvlist_t *label;
	uint64_t guid, spareguid;
	nvlist_t *nvroot;
	nvlist_t **spares;
	uint_t i, nspares;
	boolean_t inuse;

	if ((fd = open(path, O_RDONLY)) < 0)
		return (B_FALSE);

	/*
	 * The device must be in use, in the spare state, and carry a
	 * readable label; otherwise it cannot be a hot spare.
	 */
	if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) != 0 ||
	    !inuse ||
	    state != POOL_STATE_SPARE ||
	    zpool_read_label(fd, &label) != 0) {
		free(name);
		(void) close(fd);
		return (B_FALSE);
	}
	free(name);

	(void) close(fd);
	verify(nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) == 0);
	nvlist_free(label);

	/* Search the config's spare list for a matching guid. */
	verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
	    &nvroot) == 0);
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) == 0) {
		for (i = 0; i < nspares; i++) {
			verify(nvlist_lookup_uint64(spares[i],
			    ZPOOL_CONFIG_GUID, &spareguid) == 0);
			if (spareguid == guid)
				return (B_TRUE);
		}
	}

	return (B_FALSE);
}
879 
880 /*
881  * Go through and find any devices that are in use.  We rely on libdiskmgt for
882  * the majority of this task.
883  */
884 static int
885 check_in_use(nvlist_t *config, nvlist_t *nv, int force, int isreplacing,
886     int isspare)
887 {
888 	nvlist_t **child;
889 	uint_t c, children;
890 	char *type, *path;
891 	int ret;
892 	char buf[MAXPATHLEN];
893 	uint64_t wholedisk;
894 
895 	verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
896 
897 	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
898 	    &child, &children) != 0) {
899 
900 		verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0);
901 
902 		/*
903 		 * As a generic check, we look to see if this is a replace of a
904 		 * hot spare within the same pool.  If so, we allow it
905 		 * regardless of what libdiskmgt or zpool_in_use() says.
906 		 */
907 		if (isreplacing) {
908 			if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
909 			    &wholedisk) == 0 && wholedisk)
910 				(void) snprintf(buf, sizeof (buf), "%ss0",
911 				    path);
912 			else
913 				(void) strlcpy(buf, path, sizeof (buf));
914 			if (is_spare(config, buf))
915 				return (0);
916 		}
917 
918 		if (strcmp(type, VDEV_TYPE_DISK) == 0 ||
919 		    strcmp(type, VDEV_TYPE_FILE) == 0)
920 			ret = check_file(path, force, isspare);
921 
922 		return (ret);
923 	}
924 
925 	for (c = 0; c < children; c++)
926 		if ((ret = check_in_use(config, child[c], force,
927 		    isreplacing, B_FALSE)) != 0)
928 			return (ret);
929 
930 	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
931 	    &child, &children) == 0)
932 		for (c = 0; c < children; c++)
933 			if ((ret = check_in_use(config, child[c], force,
934 			    isreplacing, B_TRUE)) != 0)
935 				return (ret);
936 
937 	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
938 	    &child, &children) == 0)
939 		for (c = 0; c < children; c++)
940 			if ((ret = check_in_use(config, child[c], force,
941 			    isreplacing, B_FALSE)) != 0)
942 				return (ret);
943 
944 	return (0);
945 }
946 
947 static const char *
948 is_grouping(const char *type, int *mindev, int *maxdev)
949 {
950 	if (strncmp(type, "raidz", 5) == 0) {
951 		const char *p = type + 5;
952 		char *end;
953 		long nparity;
954 
955 		if (*p == '\0') {
956 			nparity = 1;
957 		} else if (*p == '0') {
958 			return (NULL); /* no zero prefixes allowed */
959 		} else {
960 			errno = 0;
961 			nparity = strtol(p, &end, 10);
962 			if (errno != 0 || nparity < 1 || nparity >= 255 ||
963 			    *end != '\0')
964 				return (NULL);
965 		}
966 
967 		if (mindev != NULL)
968 			*mindev = nparity + 1;
969 		if (maxdev != NULL)
970 			*maxdev = 255;
971 		return (VDEV_TYPE_RAIDZ);
972 	}
973 
974 	if (maxdev != NULL)
975 		*maxdev = INT_MAX;
976 
977 	if (strcmp(type, "mirror") == 0) {
978 		if (mindev != NULL)
979 			*mindev = 2;
980 		return (VDEV_TYPE_MIRROR);
981 	}
982 
983 	if (strcmp(type, "spare") == 0) {
984 		if (mindev != NULL)
985 			*mindev = 1;
986 		return (VDEV_TYPE_SPARE);
987 	}
988 
989 	if (strcmp(type, "log") == 0) {
990 		if (mindev != NULL)
991 			*mindev = 1;
992 		return (VDEV_TYPE_LOG);
993 	}
994 
995 	if (strcmp(type, "cache") == 0) {
996 		if (mindev != NULL)
997 			*mindev = 1;
998 		return (VDEV_TYPE_L2CACHE);
999 	}
1000 
1001 	return (NULL);
1002 }
1003 
1004 /*
1005  * Construct a syntactically valid vdev specification,
1006  * and ensure that all devices and files exist and can be opened.
1007  * Note: we don't bother freeing anything in the error paths
1008  * because the program is just going to exit anyway.
1009  */
1010 nvlist_t *
1011 construct_spec(int argc, char **argv)
1012 {
1013 	nvlist_t *nvroot, *nv, **top, **spares, **l2cache;
1014 	int t, toplevels, mindev, maxdev, nspares, nlogs, nl2cache;
1015 	const char *type;
1016 	uint64_t is_log;
1017 	boolean_t seen_logs;
1018 
1019 	top = NULL;
1020 	toplevels = 0;
1021 	spares = NULL;
1022 	l2cache = NULL;
1023 	nspares = 0;
1024 	nlogs = 0;
1025 	nl2cache = 0;
1026 	is_log = B_FALSE;
1027 	seen_logs = B_FALSE;
1028 
1029 	while (argc > 0) {
1030 		nv = NULL;
1031 
1032 		/*
1033 		 * If it's a mirror or raidz, the subsequent arguments are
1034 		 * its leaves -- until we encounter the next mirror or raidz.
1035 		 */
1036 		if ((type = is_grouping(argv[0], &mindev, &maxdev)) != NULL) {
1037 			nvlist_t **child = NULL;
1038 			int c, children = 0;
1039 
1040 			if (strcmp(type, VDEV_TYPE_SPARE) == 0) {
1041 				if (spares != NULL) {
1042 					(void) fprintf(stderr,
1043 					    gettext("invalid vdev "
1044 					    "specification: 'spare' can be "
1045 					    "specified only once\n"));
1046 					return (NULL);
1047 				}
1048 				is_log = B_FALSE;
1049 			}
1050 
1051 			if (strcmp(type, VDEV_TYPE_LOG) == 0) {
1052 				if (seen_logs) {
1053 					(void) fprintf(stderr,
1054 					    gettext("invalid vdev "
1055 					    "specification: 'log' can be "
1056 					    "specified only once\n"));
1057 					return (NULL);
1058 				}
1059 				seen_logs = B_TRUE;
1060 				is_log = B_TRUE;
1061 				argc--;
1062 				argv++;
1063 				/*
1064 				 * A log is not a real grouping device.
1065 				 * We just set is_log and continue.
1066 				 */
1067 				continue;
1068 			}
1069 
1070 			if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) {
1071 				if (l2cache != NULL) {
1072 					(void) fprintf(stderr,
1073 					    gettext("invalid vdev "
1074 					    "specification: 'cache' can be "
1075 					    "specified only once\n"));
1076 					return (NULL);
1077 				}
1078 				is_log = B_FALSE;
1079 			}
1080 
1081 			if (is_log) {
1082 				if (strcmp(type, VDEV_TYPE_MIRROR) != 0) {
1083 					(void) fprintf(stderr,
1084 					    gettext("invalid vdev "
1085 					    "specification: unsupported 'log' "
1086 					    "device: %s\n"), type);
1087 					return (NULL);
1088 				}
1089 				nlogs++;
1090 			}
1091 
1092 			for (c = 1; c < argc; c++) {
1093 				if (is_grouping(argv[c], NULL, NULL) != NULL)
1094 					break;
1095 				children++;
1096 				child = realloc(child,
1097 				    children * sizeof (nvlist_t *));
1098 				if (child == NULL)
1099 					zpool_no_memory();
1100 				if ((nv = make_leaf_vdev(argv[c], B_FALSE))
1101 				    == NULL)
1102 					return (NULL);
1103 				child[children - 1] = nv;
1104 			}
1105 
1106 			if (children < mindev) {
1107 				(void) fprintf(stderr, gettext("invalid vdev "
1108 				    "specification: %s requires at least %d "
1109 				    "devices\n"), argv[0], mindev);
1110 				return (NULL);
1111 			}
1112 
1113 			if (children > maxdev) {
1114 				(void) fprintf(stderr, gettext("invalid vdev "
1115 				    "specification: %s supports no more than "
1116 				    "%d devices\n"), argv[0], maxdev);
1117 				return (NULL);
1118 			}
1119 
1120 			argc -= c;
1121 			argv += c;
1122 
1123 			if (strcmp(type, VDEV_TYPE_SPARE) == 0) {
1124 				spares = child;
1125 				nspares = children;
1126 				continue;
1127 			} else if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) {
1128 				l2cache = child;
1129 				nl2cache = children;
1130 				continue;
1131 			} else {
1132 				verify(nvlist_alloc(&nv, NV_UNIQUE_NAME,
1133 				    0) == 0);
1134 				verify(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE,
1135 				    type) == 0);
1136 				verify(nvlist_add_uint64(nv,
1137 				    ZPOOL_CONFIG_IS_LOG, is_log) == 0);
1138 				if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) {
1139 					verify(nvlist_add_uint64(nv,
1140 					    ZPOOL_CONFIG_NPARITY,
1141 					    mindev - 1) == 0);
1142 				}
1143 				verify(nvlist_add_nvlist_array(nv,
1144 				    ZPOOL_CONFIG_CHILDREN, child,
1145 				    children) == 0);
1146 
1147 				for (c = 0; c < children; c++)
1148 					nvlist_free(child[c]);
1149 				free(child);
1150 			}
1151 		} else {
1152 			/*
1153 			 * We have a device.  Pass off to make_leaf_vdev() to
1154 			 * construct the appropriate nvlist describing the vdev.
1155 			 */
1156 			if ((nv = make_leaf_vdev(argv[0], is_log)) == NULL)
1157 				return (NULL);
1158 			if (is_log)
1159 				nlogs++;
1160 			argc--;
1161 			argv++;
1162 		}
1163 
1164 		toplevels++;
1165 		top = realloc(top, toplevels * sizeof (nvlist_t *));
1166 		if (top == NULL)
1167 			zpool_no_memory();
1168 		top[toplevels - 1] = nv;
1169 	}
1170 
1171 	if (toplevels == 0 && nspares == 0 && nl2cache == 0) {
1172 		(void) fprintf(stderr, gettext("invalid vdev "
1173 		    "specification: at least one toplevel vdev must be "
1174 		    "specified\n"));
1175 		return (NULL);
1176 	}
1177 
1178 	if (seen_logs && nlogs == 0) {
1179 		(void) fprintf(stderr, gettext("invalid vdev specification: "
1180 		    "log requires at least 1 device\n"));
1181 		return (NULL);
1182 	}
1183 
1184 	/*
1185 	 * Finally, create nvroot and add all top-level vdevs to it.
1186 	 */
1187 	verify(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) == 0);
1188 	verify(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
1189 	    VDEV_TYPE_ROOT) == 0);
1190 	verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
1191 	    top, toplevels) == 0);
1192 	if (nspares != 0)
1193 		verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
1194 		    spares, nspares) == 0);
1195 	if (nl2cache != 0)
1196 		verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
1197 		    l2cache, nl2cache) == 0);
1198 
1199 	for (t = 0; t < toplevels; t++)
1200 		nvlist_free(top[t]);
1201 	for (t = 0; t < nspares; t++)
1202 		nvlist_free(spares[t]);
1203 	for (t = 0; t < nl2cache; t++)
1204 		nvlist_free(l2cache[t]);
1205 	if (spares)
1206 		free(spares);
1207 	if (l2cache)
1208 		free(l2cache);
1209 	free(top);
1210 
1211 	return (nvroot);
1212 }
1213 
1214 nvlist_t *
1215 split_mirror_vdev(zpool_handle_t *zhp, char *newname, nvlist_t *props,
1216     splitflags_t flags, int argc, char **argv)
1217 {
1218 	nvlist_t *newroot = NULL, **child;
1219 	uint_t c, children;
1220 
1221 	if (argc > 0) {
1222 		if ((newroot = construct_spec(argc, argv)) == NULL) {
1223 			(void) fprintf(stderr, gettext("Unable to build a "
1224 			    "pool from the specified devices\n"));
1225 			return (NULL);
1226 		}
1227 
1228 		if (!flags.dryrun && make_disks(zhp, newroot) != 0) {
1229 			nvlist_free(newroot);
1230 			return (NULL);
1231 		}
1232 
1233 		/* avoid any tricks in the spec */
1234 		verify(nvlist_lookup_nvlist_array(newroot,
1235 		    ZPOOL_CONFIG_CHILDREN, &child, &children) == 0);
1236 		for (c = 0; c < children; c++) {
1237 			char *path;
1238 			const char *type;
1239 			int min, max;
1240 
1241 			verify(nvlist_lookup_string(child[c],
1242 			    ZPOOL_CONFIG_PATH, &path) == 0);
1243 			if ((type = is_grouping(path, &min, &max)) != NULL) {
1244 				(void) fprintf(stderr, gettext("Cannot use "
1245 				    "'%s' as a device for splitting\n"), type);
1246 				nvlist_free(newroot);
1247 				return (NULL);
1248 			}
1249 		}
1250 	}
1251 
1252 	if (zpool_vdev_split(zhp, newname, &newroot, props, flags) != 0) {
1253 		if (newroot != NULL)
1254 			nvlist_free(newroot);
1255 		return (NULL);
1256 	}
1257 
1258 	return (newroot);
1259 }
1260 
1261 /*
1262  * Get and validate the contents of the given vdev specification.  This ensures
1263  * that the nvlist returned is well-formed, that all the devices exist, and that
1264  * they are not currently in use by any other known consumer.  The 'poolconfig'
1265  * parameter is the current configuration of the pool when adding devices
1266  * existing pool, and is used to perform additional checks, such as changing the
1267  * replication level of the pool.  It can be 'NULL' to indicate that this is a
1268  * new pool.  The 'force' flag controls whether devices should be forcefully
1269  * added, even if they appear in use.
1270  */
1271 nvlist_t *
1272 make_root_vdev(zpool_handle_t *zhp, int force, int check_rep,
1273     boolean_t isreplacing, boolean_t dryrun, int argc, char **argv)
1274 {
1275 	nvlist_t *newroot;
1276 	nvlist_t *poolconfig = NULL;
1277 	is_force = force;
1278 
1279 	/*
1280 	 * Construct the vdev specification.  If this is successful, we know
1281 	 * that we have a valid specification, and that all devices can be
1282 	 * opened.
1283 	 */
1284 	if ((newroot = construct_spec(argc, argv)) == NULL)
1285 		return (NULL);
1286 
1287 	if (zhp && ((poolconfig = zpool_get_config(zhp, NULL)) == NULL))
1288 		return (NULL);
1289 
1290 	/*
1291 	 * Validate each device to make sure that its not shared with another
1292 	 * subsystem.  We do this even if 'force' is set, because there are some
1293 	 * uses (such as a dedicated dump device) that even '-f' cannot
1294 	 * override.
1295 	 */
1296 	if (check_in_use(poolconfig, newroot, force, isreplacing,
1297 	    B_FALSE) != 0) {
1298 		nvlist_free(newroot);
1299 		return (NULL);
1300 	}
1301 
1302 	/*
1303 	 * Check the replication level of the given vdevs and report any errors
1304 	 * found.  We include the existing pool spec, if any, as we need to
1305 	 * catch changes against the existing replication level.
1306 	 */
1307 	if (check_rep && check_replication(poolconfig, newroot) != 0) {
1308 		nvlist_free(newroot);
1309 		return (NULL);
1310 	}
1311 
1312 	/*
1313 	 * Run through the vdev specification and label any whole disks found.
1314 	 */
1315 	if (!dryrun && make_disks(zhp, newroot) != 0) {
1316 		nvlist_free(newroot);
1317 		return (NULL);
1318 	}
1319 
1320 	return (newroot);
1321 }
1322