xref: /freebsd-src/sys/contrib/openzfs/cmd/zpool/os/linux/zpool_vdev_os.c (revision b985c9cafd2aedac5cf92428c0211485ea4ede24)
1eda14cbcSMatt Macy /*
2eda14cbcSMatt Macy  * CDDL HEADER START
3eda14cbcSMatt Macy  *
4eda14cbcSMatt Macy  * The contents of this file are subject to the terms of the
5eda14cbcSMatt Macy  * Common Development and Distribution License (the "License").
6eda14cbcSMatt Macy  * You may not use this file except in compliance with the License.
7eda14cbcSMatt Macy  *
8eda14cbcSMatt Macy  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9271171e0SMartin Matuska  * or https://opensource.org/licenses/CDDL-1.0.
10eda14cbcSMatt Macy  * See the License for the specific language governing permissions
11eda14cbcSMatt Macy  * and limitations under the License.
12eda14cbcSMatt Macy  *
13eda14cbcSMatt Macy  * When distributing Covered Code, include this CDDL HEADER in each
14eda14cbcSMatt Macy  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15eda14cbcSMatt Macy  * If applicable, add the following below this CDDL HEADER, with the
16eda14cbcSMatt Macy  * fields enclosed by brackets "[]" replaced with your own identifying
17eda14cbcSMatt Macy  * information: Portions Copyright [yyyy] [name of copyright owner]
18eda14cbcSMatt Macy  *
19eda14cbcSMatt Macy  * CDDL HEADER END
20eda14cbcSMatt Macy  */
21eda14cbcSMatt Macy 
22eda14cbcSMatt Macy /*
23eda14cbcSMatt Macy  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24eda14cbcSMatt Macy  * Copyright (c) 2013, 2018 by Delphix. All rights reserved.
25eda14cbcSMatt Macy  * Copyright (c) 2016, 2017 Intel Corporation.
26eda14cbcSMatt Macy  * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>.
27eda14cbcSMatt Macy  */
28eda14cbcSMatt Macy 
29eda14cbcSMatt Macy /*
30eda14cbcSMatt Macy  * Functions to convert between a list of vdevs and an nvlist representing the
31eda14cbcSMatt Macy  * configuration.  Each entry in the list can be one of:
32eda14cbcSMatt Macy  *
33eda14cbcSMatt Macy  * 	Device vdevs
34eda14cbcSMatt Macy  * 		disk=(path=..., devid=...)
35eda14cbcSMatt Macy  * 		file=(path=...)
36eda14cbcSMatt Macy  *
37eda14cbcSMatt Macy  * 	Group vdevs
38eda14cbcSMatt Macy  * 		raidz[1|2]=(...)
39eda14cbcSMatt Macy  * 		mirror=(...)
40eda14cbcSMatt Macy  *
41eda14cbcSMatt Macy  * 	Hot spares
42eda14cbcSMatt Macy  *
43eda14cbcSMatt Macy  * While the underlying implementation supports it, group vdevs cannot contain
44eda14cbcSMatt Macy  * other group vdevs.  All userland verification of devices is contained within
45eda14cbcSMatt Macy  * this file.  If successful, the nvlist returned can be passed directly to the
46eda14cbcSMatt Macy  * kernel; we've done as much verification as possible in userland.
47eda14cbcSMatt Macy  *
48eda14cbcSMatt Macy  * Hot spares are a special case, and passed down as an array of disk vdevs, at
49eda14cbcSMatt Macy  * the same level as the root of the vdev tree.
50eda14cbcSMatt Macy  *
51eda14cbcSMatt Macy  * The only function exported by this file is 'make_root_vdev'.  The
52eda14cbcSMatt Macy  * function performs several passes:
53eda14cbcSMatt Macy  *
54eda14cbcSMatt Macy  * 	1. Construct the vdev specification.  Performs syntax validation and
55eda14cbcSMatt Macy  *         makes sure each device is valid.
56eda14cbcSMatt Macy  * 	2. Check for devices in use.  Using libblkid to make sure that no
57eda14cbcSMatt Macy  *         devices are also in use.  Some can be overridden using the 'force'
58eda14cbcSMatt Macy  *         flag, others cannot.
 * 	3. Check for replication errors if the 'force' flag is not specified.
 *         Validates that the replication level is consistent across the
 *         entire pool.
62eda14cbcSMatt Macy  * 	4. Call libzfs to label any whole disks with an EFI label.
63eda14cbcSMatt Macy  */
64eda14cbcSMatt Macy 
65eda14cbcSMatt Macy #include <assert.h>
66eda14cbcSMatt Macy #include <ctype.h>
67eda14cbcSMatt Macy #include <errno.h>
68eda14cbcSMatt Macy #include <fcntl.h>
69eda14cbcSMatt Macy #include <libintl.h>
70eda14cbcSMatt Macy #include <libnvpair.h>
71eda14cbcSMatt Macy #include <libzutil.h>
72eda14cbcSMatt Macy #include <limits.h>
73eda14cbcSMatt Macy #include <sys/spa.h>
74eda14cbcSMatt Macy #include <stdio.h>
75eda14cbcSMatt Macy #include <string.h>
76eda14cbcSMatt Macy #include <unistd.h>
77eda14cbcSMatt Macy #include "zpool_util.h"
78eda14cbcSMatt Macy #include <sys/zfs_context.h>
79eda14cbcSMatt Macy 
80eda14cbcSMatt Macy #include <scsi/scsi.h>
81eda14cbcSMatt Macy #include <scsi/sg.h>
82eda14cbcSMatt Macy #include <sys/efi_partition.h>
83eda14cbcSMatt Macy #include <sys/stat.h>
84eda14cbcSMatt Macy #include <sys/mntent.h>
85eda14cbcSMatt Macy #include <uuid/uuid.h>
86eda14cbcSMatt Macy #include <blkid/blkid.h>
87eda14cbcSMatt Macy 
/*
 * One row of the sector size override table: an identification string and
 * the sector size to use for a device that reports that string.
 */
typedef struct vdev_disk_db_entry {
	char	id[24];		/* identification bytes, compared with memcmp */
	int	sector_size;	/* sector size reported for a matching device */
} vdev_disk_db_entry_t;
93eda14cbcSMatt Macy 
94eda14cbcSMatt Macy /*
95eda14cbcSMatt Macy  * Database of block devices that lie about physical sector sizes.  The
96eda14cbcSMatt Macy  * identification string must be precisely 24 characters to avoid false
97eda14cbcSMatt Macy  * negatives
98eda14cbcSMatt Macy  */
99eda14cbcSMatt Macy static vdev_disk_db_entry_t vdev_disk_database[] = {
100eda14cbcSMatt Macy 	{"ATA     ADATA SSD S396 3", 8192},
101eda14cbcSMatt Macy 	{"ATA     APPLE SSD SM128E", 8192},
102eda14cbcSMatt Macy 	{"ATA     APPLE SSD SM256E", 8192},
103eda14cbcSMatt Macy 	{"ATA     APPLE SSD SM512E", 8192},
104eda14cbcSMatt Macy 	{"ATA     APPLE SSD SM768E", 8192},
105eda14cbcSMatt Macy 	{"ATA     C400-MTFDDAC064M", 8192},
106eda14cbcSMatt Macy 	{"ATA     C400-MTFDDAC128M", 8192},
107eda14cbcSMatt Macy 	{"ATA     C400-MTFDDAC256M", 8192},
108eda14cbcSMatt Macy 	{"ATA     C400-MTFDDAC512M", 8192},
109eda14cbcSMatt Macy 	{"ATA     Corsair Force 3 ", 8192},
110eda14cbcSMatt Macy 	{"ATA     Corsair Force GS", 8192},
111eda14cbcSMatt Macy 	{"ATA     INTEL SSDSA2CT04", 8192},
112eda14cbcSMatt Macy 	{"ATA     INTEL SSDSA2BZ10", 8192},
113eda14cbcSMatt Macy 	{"ATA     INTEL SSDSA2BZ20", 8192},
114eda14cbcSMatt Macy 	{"ATA     INTEL SSDSA2BZ30", 8192},
115eda14cbcSMatt Macy 	{"ATA     INTEL SSDSA2CW04", 8192},
116eda14cbcSMatt Macy 	{"ATA     INTEL SSDSA2CW08", 8192},
117eda14cbcSMatt Macy 	{"ATA     INTEL SSDSA2CW12", 8192},
118eda14cbcSMatt Macy 	{"ATA     INTEL SSDSA2CW16", 8192},
119eda14cbcSMatt Macy 	{"ATA     INTEL SSDSA2CW30", 8192},
120eda14cbcSMatt Macy 	{"ATA     INTEL SSDSA2CW60", 8192},
121eda14cbcSMatt Macy 	{"ATA     INTEL SSDSC2CT06", 8192},
122eda14cbcSMatt Macy 	{"ATA     INTEL SSDSC2CT12", 8192},
123eda14cbcSMatt Macy 	{"ATA     INTEL SSDSC2CT18", 8192},
124eda14cbcSMatt Macy 	{"ATA     INTEL SSDSC2CT24", 8192},
125eda14cbcSMatt Macy 	{"ATA     INTEL SSDSC2CW06", 8192},
126eda14cbcSMatt Macy 	{"ATA     INTEL SSDSC2CW12", 8192},
127eda14cbcSMatt Macy 	{"ATA     INTEL SSDSC2CW18", 8192},
128eda14cbcSMatt Macy 	{"ATA     INTEL SSDSC2CW24", 8192},
129eda14cbcSMatt Macy 	{"ATA     INTEL SSDSC2CW48", 8192},
130eda14cbcSMatt Macy 	{"ATA     KINGSTON SH100S3", 8192},
131eda14cbcSMatt Macy 	{"ATA     KINGSTON SH103S3", 8192},
132eda14cbcSMatt Macy 	{"ATA     M4-CT064M4SSD2  ", 8192},
133eda14cbcSMatt Macy 	{"ATA     M4-CT128M4SSD2  ", 8192},
134eda14cbcSMatt Macy 	{"ATA     M4-CT256M4SSD2  ", 8192},
135eda14cbcSMatt Macy 	{"ATA     M4-CT512M4SSD2  ", 8192},
136eda14cbcSMatt Macy 	{"ATA     OCZ-AGILITY2    ", 8192},
137eda14cbcSMatt Macy 	{"ATA     OCZ-AGILITY3    ", 8192},
138eda14cbcSMatt Macy 	{"ATA     OCZ-VERTEX2 3.5 ", 8192},
139eda14cbcSMatt Macy 	{"ATA     OCZ-VERTEX3     ", 8192},
140eda14cbcSMatt Macy 	{"ATA     OCZ-VERTEX3 LT  ", 8192},
141eda14cbcSMatt Macy 	{"ATA     OCZ-VERTEX3 MI  ", 8192},
142eda14cbcSMatt Macy 	{"ATA     OCZ-VERTEX4     ", 8192},
143eda14cbcSMatt Macy 	{"ATA     SAMSUNG MZ7WD120", 8192},
144eda14cbcSMatt Macy 	{"ATA     SAMSUNG MZ7WD240", 8192},
145eda14cbcSMatt Macy 	{"ATA     SAMSUNG MZ7WD480", 8192},
146eda14cbcSMatt Macy 	{"ATA     SAMSUNG MZ7WD960", 8192},
147eda14cbcSMatt Macy 	{"ATA     SAMSUNG SSD 830 ", 8192},
148eda14cbcSMatt Macy 	{"ATA     Samsung SSD 840 ", 8192},
149eda14cbcSMatt Macy 	{"ATA     SanDisk SSD U100", 8192},
150eda14cbcSMatt Macy 	{"ATA     TOSHIBA THNSNH06", 8192},
151eda14cbcSMatt Macy 	{"ATA     TOSHIBA THNSNH12", 8192},
152eda14cbcSMatt Macy 	{"ATA     TOSHIBA THNSNH25", 8192},
153eda14cbcSMatt Macy 	{"ATA     TOSHIBA THNSNH51", 8192},
154eda14cbcSMatt Macy 	{"ATA     APPLE SSD TS064C", 4096},
155eda14cbcSMatt Macy 	{"ATA     APPLE SSD TS128C", 4096},
156eda14cbcSMatt Macy 	{"ATA     APPLE SSD TS256C", 4096},
157eda14cbcSMatt Macy 	{"ATA     APPLE SSD TS512C", 4096},
158eda14cbcSMatt Macy 	{"ATA     INTEL SSDSA2M040", 4096},
159eda14cbcSMatt Macy 	{"ATA     INTEL SSDSA2M080", 4096},
160eda14cbcSMatt Macy 	{"ATA     INTEL SSDSA2M160", 4096},
161eda14cbcSMatt Macy 	{"ATA     INTEL SSDSC2MH12", 4096},
162eda14cbcSMatt Macy 	{"ATA     INTEL SSDSC2MH25", 4096},
163eda14cbcSMatt Macy 	{"ATA     OCZ CORE_SSD    ", 4096},
164eda14cbcSMatt Macy 	{"ATA     OCZ-VERTEX      ", 4096},
165eda14cbcSMatt Macy 	{"ATA     SAMSUNG MCCOE32G", 4096},
166eda14cbcSMatt Macy 	{"ATA     SAMSUNG MCCOE64G", 4096},
167eda14cbcSMatt Macy 	{"ATA     SAMSUNG SSD PM80", 4096},
168eda14cbcSMatt Macy 	/* Flash drives optimized for 4KB IOs on larger pages */
169eda14cbcSMatt Macy 	{"ATA     INTEL SSDSC2BA10", 4096},
170eda14cbcSMatt Macy 	{"ATA     INTEL SSDSC2BA20", 4096},
171eda14cbcSMatt Macy 	{"ATA     INTEL SSDSC2BA40", 4096},
172eda14cbcSMatt Macy 	{"ATA     INTEL SSDSC2BA80", 4096},
173eda14cbcSMatt Macy 	{"ATA     INTEL SSDSC2BB08", 4096},
174eda14cbcSMatt Macy 	{"ATA     INTEL SSDSC2BB12", 4096},
175eda14cbcSMatt Macy 	{"ATA     INTEL SSDSC2BB16", 4096},
176eda14cbcSMatt Macy 	{"ATA     INTEL SSDSC2BB24", 4096},
177eda14cbcSMatt Macy 	{"ATA     INTEL SSDSC2BB30", 4096},
178eda14cbcSMatt Macy 	{"ATA     INTEL SSDSC2BB40", 4096},
179eda14cbcSMatt Macy 	{"ATA     INTEL SSDSC2BB48", 4096},
180eda14cbcSMatt Macy 	{"ATA     INTEL SSDSC2BB60", 4096},
181eda14cbcSMatt Macy 	{"ATA     INTEL SSDSC2BB80", 4096},
182eda14cbcSMatt Macy 	{"ATA     INTEL SSDSC2BW24", 4096},
183eda14cbcSMatt Macy 	{"ATA     INTEL SSDSC2BW48", 4096},
184eda14cbcSMatt Macy 	{"ATA     INTEL SSDSC2BP24", 4096},
185eda14cbcSMatt Macy 	{"ATA     INTEL SSDSC2BP48", 4096},
186eda14cbcSMatt Macy 	{"NA      SmrtStorSDLKAE9W", 4096},
187eda14cbcSMatt Macy 	{"NVMe    Amazon EC2 NVMe ", 4096},
188eda14cbcSMatt Macy 	/* Imported from Open Solaris */
189eda14cbcSMatt Macy 	{"ATA     MARVELL SD88SA02", 4096},
190eda14cbcSMatt Macy 	/* Advanced format Hard drives */
191eda14cbcSMatt Macy 	{"ATA     Hitachi HDS5C303", 4096},
192eda14cbcSMatt Macy 	{"ATA     SAMSUNG HD204UI ", 4096},
193eda14cbcSMatt Macy 	{"ATA     ST2000DL004 HD20", 4096},
194eda14cbcSMatt Macy 	{"ATA     WDC WD10EARS-00M", 4096},
195eda14cbcSMatt Macy 	{"ATA     WDC WD10EARS-00S", 4096},
196eda14cbcSMatt Macy 	{"ATA     WDC WD10EARS-00Z", 4096},
197eda14cbcSMatt Macy 	{"ATA     WDC WD15EARS-00M", 4096},
198eda14cbcSMatt Macy 	{"ATA     WDC WD15EARS-00S", 4096},
199eda14cbcSMatt Macy 	{"ATA     WDC WD15EARS-00Z", 4096},
200eda14cbcSMatt Macy 	{"ATA     WDC WD20EARS-00M", 4096},
201eda14cbcSMatt Macy 	{"ATA     WDC WD20EARS-00S", 4096},
202eda14cbcSMatt Macy 	{"ATA     WDC WD20EARS-00Z", 4096},
203eda14cbcSMatt Macy 	{"ATA     WDC WD1600BEVT-0", 4096},
204eda14cbcSMatt Macy 	{"ATA     WDC WD2500BEVT-0", 4096},
205eda14cbcSMatt Macy 	{"ATA     WDC WD3200BEVT-0", 4096},
206eda14cbcSMatt Macy 	{"ATA     WDC WD5000BEVT-0", 4096},
207eda14cbcSMatt Macy };
208eda14cbcSMatt Macy 
209eda14cbcSMatt Macy 
210eda14cbcSMatt Macy #define	INQ_REPLY_LEN	96
211eda14cbcSMatt Macy #define	INQ_CMD_LEN	6
212eda14cbcSMatt Macy 
213eda14cbcSMatt Macy static const int vdev_disk_database_size =
214eda14cbcSMatt Macy 	sizeof (vdev_disk_database) / sizeof (vdev_disk_database[0]);
215eda14cbcSMatt Macy 
216eda14cbcSMatt Macy boolean_t
check_sector_size_database(char * path,int * sector_size)217eda14cbcSMatt Macy check_sector_size_database(char *path, int *sector_size)
218eda14cbcSMatt Macy {
219eda14cbcSMatt Macy 	unsigned char inq_buff[INQ_REPLY_LEN];
220eda14cbcSMatt Macy 	unsigned char sense_buffer[32];
221eda14cbcSMatt Macy 	unsigned char inq_cmd_blk[INQ_CMD_LEN] =
222eda14cbcSMatt Macy 	    {INQUIRY, 0, 0, 0, INQ_REPLY_LEN, 0};
223eda14cbcSMatt Macy 	sg_io_hdr_t io_hdr;
224eda14cbcSMatt Macy 	int error;
225eda14cbcSMatt Macy 	int fd;
226eda14cbcSMatt Macy 	int i;
227eda14cbcSMatt Macy 
228eda14cbcSMatt Macy 	/* Prepare INQUIRY command */
229eda14cbcSMatt Macy 	memset(&io_hdr, 0, sizeof (sg_io_hdr_t));
230eda14cbcSMatt Macy 	io_hdr.interface_id = 'S';
231eda14cbcSMatt Macy 	io_hdr.cmd_len = sizeof (inq_cmd_blk);
232eda14cbcSMatt Macy 	io_hdr.mx_sb_len = sizeof (sense_buffer);
233eda14cbcSMatt Macy 	io_hdr.dxfer_direction = SG_DXFER_FROM_DEV;
234eda14cbcSMatt Macy 	io_hdr.dxfer_len = INQ_REPLY_LEN;
235eda14cbcSMatt Macy 	io_hdr.dxferp = inq_buff;
236eda14cbcSMatt Macy 	io_hdr.cmdp = inq_cmd_blk;
237eda14cbcSMatt Macy 	io_hdr.sbp = sense_buffer;
238eda14cbcSMatt Macy 	io_hdr.timeout = 10;		/* 10 milliseconds is ample time */
239eda14cbcSMatt Macy 
240eda14cbcSMatt Macy 	if ((fd = open(path, O_RDONLY|O_DIRECT)) < 0)
241eda14cbcSMatt Macy 		return (B_FALSE);
242eda14cbcSMatt Macy 
243eda14cbcSMatt Macy 	error = ioctl(fd, SG_IO, (unsigned long) &io_hdr);
244eda14cbcSMatt Macy 
245eda14cbcSMatt Macy 	(void) close(fd);
246eda14cbcSMatt Macy 
247eda14cbcSMatt Macy 	if (error < 0)
248eda14cbcSMatt Macy 		return (B_FALSE);
249eda14cbcSMatt Macy 
250eda14cbcSMatt Macy 	if ((io_hdr.info & SG_INFO_OK_MASK) != SG_INFO_OK)
251eda14cbcSMatt Macy 		return (B_FALSE);
252eda14cbcSMatt Macy 
253eda14cbcSMatt Macy 	for (i = 0; i < vdev_disk_database_size; i++) {
254eda14cbcSMatt Macy 		if (memcmp(inq_buff + 8, vdev_disk_database[i].id, 24))
255eda14cbcSMatt Macy 			continue;
256eda14cbcSMatt Macy 
257eda14cbcSMatt Macy 		*sector_size = vdev_disk_database[i].sector_size;
258eda14cbcSMatt Macy 		return (B_TRUE);
259eda14cbcSMatt Macy 	}
260eda14cbcSMatt Macy 
261eda14cbcSMatt Macy 	return (B_FALSE);
262eda14cbcSMatt Macy }
263eda14cbcSMatt Macy 
264eda14cbcSMatt Macy static int
check_slice(const char * path,blkid_cache cache,int force,boolean_t isspare)265eda14cbcSMatt Macy check_slice(const char *path, blkid_cache cache, int force, boolean_t isspare)
266eda14cbcSMatt Macy {
267eda14cbcSMatt Macy 	int err;
268eda14cbcSMatt Macy 	char *value;
269eda14cbcSMatt Macy 
270eda14cbcSMatt Macy 	/* No valid type detected device is safe to use */
271eda14cbcSMatt Macy 	value = blkid_get_tag_value(cache, "TYPE", path);
272eda14cbcSMatt Macy 	if (value == NULL)
273eda14cbcSMatt Macy 		return (0);
274eda14cbcSMatt Macy 
275eda14cbcSMatt Macy 	/*
276eda14cbcSMatt Macy 	 * If libblkid detects a ZFS device, we check the device
277eda14cbcSMatt Macy 	 * using check_file() to see if it's safe.  The one safe
278eda14cbcSMatt Macy 	 * case is a spare device shared between multiple pools.
279eda14cbcSMatt Macy 	 */
280eda14cbcSMatt Macy 	if (strcmp(value, "zfs_member") == 0) {
281eda14cbcSMatt Macy 		err = check_file(path, force, isspare);
282eda14cbcSMatt Macy 	} else {
283eda14cbcSMatt Macy 		if (force) {
284eda14cbcSMatt Macy 			err = 0;
285eda14cbcSMatt Macy 		} else {
286eda14cbcSMatt Macy 			err = -1;
287eda14cbcSMatt Macy 			vdev_error(gettext("%s contains a filesystem of "
288eda14cbcSMatt Macy 			    "type '%s'\n"), path, value);
289eda14cbcSMatt Macy 		}
290eda14cbcSMatt Macy 	}
291eda14cbcSMatt Macy 
292eda14cbcSMatt Macy 	free(value);
293eda14cbcSMatt Macy 
294eda14cbcSMatt Macy 	return (err);
295eda14cbcSMatt Macy }
296eda14cbcSMatt Macy 
297eda14cbcSMatt Macy /*
298eda14cbcSMatt Macy  * Validate that a disk including all partitions are safe to use.
299eda14cbcSMatt Macy  *
300eda14cbcSMatt Macy  * For EFI labeled disks this can done relatively easily with the libefi
301eda14cbcSMatt Macy  * library.  The partition numbers are extracted from the label and used
302eda14cbcSMatt Macy  * to generate the expected /dev/ paths.  Each partition can then be
303eda14cbcSMatt Macy  * checked for conflicts.
304eda14cbcSMatt Macy  *
305eda14cbcSMatt Macy  * For non-EFI labeled disks (MBR/EBR/etc) the same process is possible
306eda14cbcSMatt Macy  * but due to the lack of a readily available libraries this scanning is
307eda14cbcSMatt Macy  * not implemented.  Instead only the device path as given is checked.
308eda14cbcSMatt Macy  */
309eda14cbcSMatt Macy static int
check_disk(const char * path,blkid_cache cache,int force,boolean_t isspare,boolean_t iswholedisk)310eda14cbcSMatt Macy check_disk(const char *path, blkid_cache cache, int force,
311eda14cbcSMatt Macy     boolean_t isspare, boolean_t iswholedisk)
312eda14cbcSMatt Macy {
313eda14cbcSMatt Macy 	struct dk_gpt *vtoc;
314eda14cbcSMatt Macy 	char slice_path[MAXPATHLEN];
315eda14cbcSMatt Macy 	int err = 0;
316eda14cbcSMatt Macy 	int fd, i;
317eda14cbcSMatt Macy 	int flags = O_RDONLY|O_DIRECT;
318eda14cbcSMatt Macy 
319eda14cbcSMatt Macy 	if (!iswholedisk)
320eda14cbcSMatt Macy 		return (check_slice(path, cache, force, isspare));
321eda14cbcSMatt Macy 
322eda14cbcSMatt Macy 	/* only spares can be shared, other devices require exclusive access */
323eda14cbcSMatt Macy 	if (!isspare)
324eda14cbcSMatt Macy 		flags |= O_EXCL;
325eda14cbcSMatt Macy 
326eda14cbcSMatt Macy 	if ((fd = open(path, flags)) < 0) {
327eda14cbcSMatt Macy 		char *value = blkid_get_tag_value(cache, "TYPE", path);
328eda14cbcSMatt Macy 		(void) fprintf(stderr, gettext("%s is in use and contains "
329eda14cbcSMatt Macy 		    "a %s filesystem.\n"), path, value ? value : "unknown");
330eda14cbcSMatt Macy 		free(value);
331eda14cbcSMatt Macy 		return (-1);
332eda14cbcSMatt Macy 	}
333eda14cbcSMatt Macy 
334eda14cbcSMatt Macy 	/*
335eda14cbcSMatt Macy 	 * Expected to fail for non-EFI labeled disks.  Just check the device
336eda14cbcSMatt Macy 	 * as given and do not attempt to detect and scan partitions.
337eda14cbcSMatt Macy 	 */
338eda14cbcSMatt Macy 	err = efi_alloc_and_read(fd, &vtoc);
339eda14cbcSMatt Macy 	if (err) {
340eda14cbcSMatt Macy 		(void) close(fd);
341eda14cbcSMatt Macy 		return (check_slice(path, cache, force, isspare));
342eda14cbcSMatt Macy 	}
343eda14cbcSMatt Macy 
344eda14cbcSMatt Macy 	/*
345eda14cbcSMatt Macy 	 * The primary efi partition label is damaged however the secondary
346eda14cbcSMatt Macy 	 * label at the end of the device is intact.  Rather than use this
347eda14cbcSMatt Macy 	 * label we should play it safe and treat this as a non efi device.
348eda14cbcSMatt Macy 	 */
349eda14cbcSMatt Macy 	if (vtoc->efi_flags & EFI_GPT_PRIMARY_CORRUPT) {
350eda14cbcSMatt Macy 		efi_free(vtoc);
351eda14cbcSMatt Macy 		(void) close(fd);
352eda14cbcSMatt Macy 
353eda14cbcSMatt Macy 		if (force) {
354eda14cbcSMatt Macy 			/* Partitions will now be created using the backup */
355eda14cbcSMatt Macy 			return (0);
356eda14cbcSMatt Macy 		} else {
357eda14cbcSMatt Macy 			vdev_error(gettext("%s contains a corrupt primary "
358eda14cbcSMatt Macy 			    "EFI label.\n"), path);
359eda14cbcSMatt Macy 			return (-1);
360eda14cbcSMatt Macy 		}
361eda14cbcSMatt Macy 	}
362eda14cbcSMatt Macy 
363eda14cbcSMatt Macy 	for (i = 0; i < vtoc->efi_nparts; i++) {
364eda14cbcSMatt Macy 
365eda14cbcSMatt Macy 		if (vtoc->efi_parts[i].p_tag == V_UNASSIGNED ||
366eda14cbcSMatt Macy 		    uuid_is_null((uchar_t *)&vtoc->efi_parts[i].p_guid))
367eda14cbcSMatt Macy 			continue;
368eda14cbcSMatt Macy 
369eda14cbcSMatt Macy 		if (strncmp(path, UDISK_ROOT, strlen(UDISK_ROOT)) == 0)
370eda14cbcSMatt Macy 			(void) snprintf(slice_path, sizeof (slice_path),
371eda14cbcSMatt Macy 			    "%s%s%d", path, "-part", i+1);
372eda14cbcSMatt Macy 		else
373eda14cbcSMatt Macy 			(void) snprintf(slice_path, sizeof (slice_path),
374eda14cbcSMatt Macy 			    "%s%s%d", path, isdigit(path[strlen(path)-1]) ?
375eda14cbcSMatt Macy 			    "p" : "", i+1);
376eda14cbcSMatt Macy 
377eda14cbcSMatt Macy 		err = check_slice(slice_path, cache, force, isspare);
378eda14cbcSMatt Macy 		if (err)
379eda14cbcSMatt Macy 			break;
380eda14cbcSMatt Macy 	}
381eda14cbcSMatt Macy 
382eda14cbcSMatt Macy 	efi_free(vtoc);
383eda14cbcSMatt Macy 	(void) close(fd);
384eda14cbcSMatt Macy 
385eda14cbcSMatt Macy 	return (err);
386eda14cbcSMatt Macy }
387eda14cbcSMatt Macy 
388eda14cbcSMatt Macy int
check_device(const char * path,boolean_t force,boolean_t isspare,boolean_t iswholedisk)389eda14cbcSMatt Macy check_device(const char *path, boolean_t force,
390eda14cbcSMatt Macy     boolean_t isspare, boolean_t iswholedisk)
391eda14cbcSMatt Macy {
392eda14cbcSMatt Macy 	blkid_cache cache;
393eda14cbcSMatt Macy 	int error;
394eda14cbcSMatt Macy 
395eda14cbcSMatt Macy 	error = blkid_get_cache(&cache, NULL);
396eda14cbcSMatt Macy 	if (error != 0) {
397eda14cbcSMatt Macy 		(void) fprintf(stderr, gettext("unable to access the blkid "
398eda14cbcSMatt Macy 		    "cache.\n"));
399eda14cbcSMatt Macy 		return (-1);
400eda14cbcSMatt Macy 	}
401eda14cbcSMatt Macy 
402eda14cbcSMatt Macy 	error = check_disk(path, cache, force, isspare, iswholedisk);
403eda14cbcSMatt Macy 	blkid_put_cache(cache);
404eda14cbcSMatt Macy 
405eda14cbcSMatt Macy 	return (error);
406eda14cbcSMatt Macy }
40716038816SMartin Matuska 
40816038816SMartin Matuska void
after_zpool_upgrade(zpool_handle_t * zhp)40916038816SMartin Matuska after_zpool_upgrade(zpool_handle_t *zhp)
41016038816SMartin Matuska {
411e92ffd9bSMartin Matuska 	(void) zhp;
41216038816SMartin Matuska }
4131f88aa09SMartin Matuska 
4141f88aa09SMartin Matuska int
check_file(const char * file,boolean_t force,boolean_t isspare)4151f88aa09SMartin Matuska check_file(const char *file, boolean_t force, boolean_t isspare)
4161f88aa09SMartin Matuska {
4171f88aa09SMartin Matuska 	return (check_file_generic(file, force, isspare));
4181f88aa09SMartin Matuska }
419b356da80SMartin Matuska 
/*
 * Read the contents of a sysfs file into a newly allocated string.  A
 * single trailing newline, if present, is stripped.
 *
 * Returns the string on success (the caller must free() it), or NULL if
 * the file cannot be opened, stat'ed or read, or if allocation fails.
 */
static char *
zpool_sysfs_gets(char *path)
{
	struct stat st;
	char *str = NULL;
	ssize_t nread;
	int fd = open(path, O_RDONLY);

	if (fd < 0)
		return (NULL);

	/* One extra zeroed byte keeps the result NUL terminated */
	if (fstat(fd, &st) == 0)
		str = calloc(st.st_size + 1, sizeof (*str));

	if (str != NULL) {
		/*
		 * Note, we can read less bytes than st_size, and that's ok.
		 * Sysfs files will report their size is 4k even if they only
		 * return a small string.
		 */
		nread = read(fd, str, st.st_size);
		if (nread < 0) {
			/* read() failed, hand nothing back */
			free(str);
			str = NULL;
		} else if (nread > 0 && str[nread - 1] == '\n') {
			/* Remove trailing newline */
			str[nread - 1] = 0;
		}
	}

	close(fd);

	return (str);
}
468b356da80SMartin Matuska 
/*
 * Write a string to a sysfs file.
 *
 * Returns 0 on success, non-zero otherwise: -1 if the file cannot be opened
 * for writing, -2 if the string cannot be written out.
 */
static int
zpool_sysfs_puts(char *path, char *str)
{
	FILE *file;

	file = fopen(path, "w");
	if (file == NULL)
		return (-1);

	if (fputs(str, file) < 0) {
		(void) fclose(file);
		return (-2);
	}

	/*
	 * stdio buffers the data, so the write to the file normally happens
	 * when the stream is flushed by fclose().  A failure of that write is
	 * only visible in fclose()'s return value, which must therefore be
	 * checked to avoid reporting a failed write as a success.
	 */
	if (fclose(file) != 0)
		return (-2);

	return (0);
}
490b356da80SMartin Matuska 
491b356da80SMartin Matuska /* Given a vdev nvlist_t, rescan its enclosure sysfs path */
492b356da80SMartin Matuska static void
rescan_vdev_config_dev_sysfs_path(nvlist_t * vdev_nv)493b356da80SMartin Matuska rescan_vdev_config_dev_sysfs_path(nvlist_t *vdev_nv)
494b356da80SMartin Matuska {
495b356da80SMartin Matuska 	update_vdev_config_dev_sysfs_path(vdev_nv,
496b356da80SMartin Matuska 	    fnvlist_lookup_string(vdev_nv, ZPOOL_CONFIG_PATH),
497b356da80SMartin Matuska 	    ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH);
498b356da80SMartin Matuska }
499b356da80SMartin Matuska 
/*
 * Translate a power string into a number.  "off" and "0" give 0, "on" and
 * "1" give 1, and any other string gives -1.  The comparison is exact and
 * case sensitive.
 */
static int
zpool_power_parse_value(char *str)
{
	static const struct {
		const char *word;
		int state;
	} known[] = {
		{ "off", 0 },
		{ "0", 0 },
		{ "on", 1 },
		{ "1", 1 },
	};

	for (size_t i = 0; i < sizeof (known) / sizeof (known[0]); i++) {
		if (strcmp(str, known[i].word) == 0)
			return (known[i].state);
	}

	/* Unrecognized value */
	return (-1);
}
514b356da80SMartin Matuska 
515b356da80SMartin Matuska /*
516b356da80SMartin Matuska  * Given a vdev string return an allocated string containing the sysfs path to
517b356da80SMartin Matuska  * its power control file.  Also do a check if the power control file really
518b356da80SMartin Matuska  * exists and has correct permissions.
519b356da80SMartin Matuska  *
520b356da80SMartin Matuska  * Example returned strings:
521b356da80SMartin Matuska  *
522b356da80SMartin Matuska  * /sys/class/enclosure/0:0:122:0/10/power_status
523b356da80SMartin Matuska  * /sys/bus/pci/slots/10/power
524b356da80SMartin Matuska  *
525b356da80SMartin Matuska  * Returns allocated string on success (which must be freed), NULL on failure.
526b356da80SMartin Matuska  */
527b356da80SMartin Matuska static char *
zpool_power_sysfs_path(zpool_handle_t * zhp,char * vdev)528b356da80SMartin Matuska zpool_power_sysfs_path(zpool_handle_t *zhp, char *vdev)
529b356da80SMartin Matuska {
530b356da80SMartin Matuska 	const char *enc_sysfs_dir = NULL;
531b356da80SMartin Matuska 	char *path = NULL;
532b356da80SMartin Matuska 	nvlist_t *vdev_nv = zpool_find_vdev(zhp, vdev, NULL, NULL, NULL);
533b356da80SMartin Matuska 
534b356da80SMartin Matuska 	if (vdev_nv == NULL) {
535b356da80SMartin Matuska 		return (NULL);
536b356da80SMartin Matuska 	}
537b356da80SMartin Matuska 
538b356da80SMartin Matuska 	/* Make sure we're getting the updated enclosure sysfs path */
539b356da80SMartin Matuska 	rescan_vdev_config_dev_sysfs_path(vdev_nv);
540b356da80SMartin Matuska 
541b356da80SMartin Matuska 	if (nvlist_lookup_string(vdev_nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH,
542b356da80SMartin Matuska 	    &enc_sysfs_dir) != 0) {
543b356da80SMartin Matuska 		return (NULL);
544b356da80SMartin Matuska 	}
545b356da80SMartin Matuska 
546b356da80SMartin Matuska 	if (asprintf(&path, "%s/power_status", enc_sysfs_dir) == -1)
547b356da80SMartin Matuska 		return (NULL);
548b356da80SMartin Matuska 
549b356da80SMartin Matuska 	if (access(path, W_OK) != 0) {
550b356da80SMartin Matuska 		free(path);
551b356da80SMartin Matuska 		path = NULL;
552b356da80SMartin Matuska 		/* No HDD 'power_control' file, maybe it's NVMe? */
553b356da80SMartin Matuska 		if (asprintf(&path, "%s/power", enc_sysfs_dir) == -1) {
554b356da80SMartin Matuska 			return (NULL);
555b356da80SMartin Matuska 		}
556b356da80SMartin Matuska 
557b356da80SMartin Matuska 		if (access(path, R_OK | W_OK) != 0) {
558b356da80SMartin Matuska 			/* Not NVMe either */
559b356da80SMartin Matuska 			free(path);
560b356da80SMartin Matuska 			return (NULL);
561b356da80SMartin Matuska 		}
562b356da80SMartin Matuska 	}
563b356da80SMartin Matuska 
564b356da80SMartin Matuska 	return (path);
565b356da80SMartin Matuska }
566b356da80SMartin Matuska 
567b356da80SMartin Matuska /*
568b356da80SMartin Matuska  * Given a path to a sysfs power control file, return B_TRUE if you should use
569b356da80SMartin Matuska  * "on/off" words to control it, or B_FALSE otherwise ("0/1" to control).
570b356da80SMartin Matuska  */
571b356da80SMartin Matuska static boolean_t
zpool_power_use_word(char * sysfs_path)572b356da80SMartin Matuska zpool_power_use_word(char *sysfs_path)
573b356da80SMartin Matuska {
574b356da80SMartin Matuska 	if (strcmp(&sysfs_path[strlen(sysfs_path) - strlen("power_status")],
575b356da80SMartin Matuska 	    "power_status") == 0) {
576b356da80SMartin Matuska 		return (B_TRUE);
577b356da80SMartin Matuska 	}
578b356da80SMartin Matuska 	return (B_FALSE);
579b356da80SMartin Matuska }
580b356da80SMartin Matuska 
581b356da80SMartin Matuska /*
582b356da80SMartin Matuska  * Check the sysfs power control value for a vdev.
583b356da80SMartin Matuska  *
584b356da80SMartin Matuska  * Returns:
585b356da80SMartin Matuska  *  0 - Power is off
586b356da80SMartin Matuska  *  1 - Power is on
587b356da80SMartin Matuska  * -1 - Error or unsupported
588b356da80SMartin Matuska  */
589b356da80SMartin Matuska int
zpool_power_current_state(zpool_handle_t * zhp,char * vdev)590b356da80SMartin Matuska zpool_power_current_state(zpool_handle_t *zhp, char *vdev)
591b356da80SMartin Matuska {
592b356da80SMartin Matuska 	char *val;
593b356da80SMartin Matuska 	int rc;
594b356da80SMartin Matuska 
595b356da80SMartin Matuska 	char *path = zpool_power_sysfs_path(zhp, vdev);
596b356da80SMartin Matuska 	if (path == NULL)
597b356da80SMartin Matuska 		return (-1);
598b356da80SMartin Matuska 
599b356da80SMartin Matuska 	val = zpool_sysfs_gets(path);
600b356da80SMartin Matuska 	if (val == NULL) {
601b356da80SMartin Matuska 		free(path);
602b356da80SMartin Matuska 		return (-1);
603b356da80SMartin Matuska 	}
604b356da80SMartin Matuska 
605b356da80SMartin Matuska 	rc = zpool_power_parse_value(val);
606b356da80SMartin Matuska 	free(val);
607b356da80SMartin Matuska 	free(path);
608b356da80SMartin Matuska 	return (rc);
609b356da80SMartin Matuska }
610b356da80SMartin Matuska 
611b356da80SMartin Matuska /*
612b356da80SMartin Matuska  * Turn on or off the slot to a device
613b356da80SMartin Matuska  *
614b356da80SMartin Matuska  * Device path is the full path to the device (like /dev/sda or /dev/sda1).
615b356da80SMartin Matuska  *
616b356da80SMartin Matuska  * Return code:
617b356da80SMartin Matuska  * 0:		Success
618b356da80SMartin Matuska  * ENOTSUP:	Power control not supported for OS
619b356da80SMartin Matuska  * EBADSLT:	Couldn't read current power state
620b356da80SMartin Matuska  * ENOENT:	No sysfs path to power control
621b356da80SMartin Matuska  * EIO:	Couldn't write sysfs power value
622b356da80SMartin Matuska  * EBADE:	Sysfs power value didn't change
623b356da80SMartin Matuska  */
624b356da80SMartin Matuska int
zpool_power(zpool_handle_t * zhp,char * vdev,boolean_t turn_on)625b356da80SMartin Matuska zpool_power(zpool_handle_t *zhp, char *vdev, boolean_t turn_on)
626b356da80SMartin Matuska {
627b356da80SMartin Matuska 	char *sysfs_path;
628b356da80SMartin Matuska 	const char *val;
629b356da80SMartin Matuska 	int rc;
630b356da80SMartin Matuska 	int timeout_ms;
631b356da80SMartin Matuska 
632b356da80SMartin Matuska 	rc = zpool_power_current_state(zhp, vdev);
633b356da80SMartin Matuska 	if (rc == -1) {
634b356da80SMartin Matuska 		return (EBADSLT);
635b356da80SMartin Matuska 	}
636b356da80SMartin Matuska 
637b356da80SMartin Matuska 	/* Already correct value? */
638b356da80SMartin Matuska 	if (rc == (int)turn_on)
639b356da80SMartin Matuska 		return (0);
640b356da80SMartin Matuska 
641b356da80SMartin Matuska 	sysfs_path = zpool_power_sysfs_path(zhp, vdev);
642b356da80SMartin Matuska 	if (sysfs_path == NULL)
643b356da80SMartin Matuska 		return (ENOENT);
644b356da80SMartin Matuska 
645b356da80SMartin Matuska 	if (zpool_power_use_word(sysfs_path)) {
646b356da80SMartin Matuska 		val = turn_on ? "on" : "off";
647b356da80SMartin Matuska 	} else {
648b356da80SMartin Matuska 		val = turn_on ? "1" : "0";
649b356da80SMartin Matuska 	}
650b356da80SMartin Matuska 
651b356da80SMartin Matuska 	rc = zpool_sysfs_puts(sysfs_path, (char *)val);
652b356da80SMartin Matuska 
653b356da80SMartin Matuska 	free(sysfs_path);
654b356da80SMartin Matuska 	if (rc != 0) {
655b356da80SMartin Matuska 		return (EIO);
656b356da80SMartin Matuska 	}
657b356da80SMartin Matuska 
658b356da80SMartin Matuska 	/*
659b356da80SMartin Matuska 	 * Wait up to 30 seconds for sysfs power value to change after
660b356da80SMartin Matuska 	 * writing it.
661b356da80SMartin Matuska 	 */
662b356da80SMartin Matuska 	timeout_ms = zpool_getenv_int("ZPOOL_POWER_ON_SLOT_TIMEOUT_MS", 30000);
663b356da80SMartin Matuska 	for (int i = 0; i < MAX(1, timeout_ms / 200); i++) {
664b356da80SMartin Matuska 		rc = zpool_power_current_state(zhp, vdev);
665b356da80SMartin Matuska 		if (rc == (int)turn_on)
666b356da80SMartin Matuska 			return (0);	/* success */
667b356da80SMartin Matuska 
668b356da80SMartin Matuska 		fsleep(0.200);	/* 200ms */
669b356da80SMartin Matuska 	}
670b356da80SMartin Matuska 
671b356da80SMartin Matuska 	/* sysfs value never changed */
672b356da80SMartin Matuska 	return (EBADE);
673b356da80SMartin Matuska }
674