xref: /openbsd-src/sys/kern/subr_disk.c (revision 46035553bfdd96e63c94e32da0210227ec2e3cf1)
1 /*	$OpenBSD: subr_disk.c,v 1.237 2020/05/29 04:42:25 deraadt Exp $	*/
2 /*	$NetBSD: subr_disk.c,v 1.17 1996/03/16 23:17:08 christos Exp $	*/
3 
4 /*
5  * Copyright (c) 1995 Jason R. Thorpe.  All rights reserved.
6  * Copyright (c) 1982, 1986, 1988, 1993
7  *	The Regents of the University of California.  All rights reserved.
8  * (c) UNIX System Laboratories, Inc.
9  * All or some portions of this file are derived from material licensed
10  * to the University of California by American Telephone and Telegraph
11  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
12  * the permission of UNIX System Laboratories, Inc.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  * 3. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)ufs_disksubr.c	8.5 (Berkeley) 1/21/94
39  */
40 
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/kernel.h>
44 #include <sys/malloc.h>
45 #include <sys/fcntl.h>
46 #include <sys/buf.h>
47 #include <sys/stat.h>
48 #include <sys/syslog.h>
49 #include <sys/device.h>
50 #include <sys/time.h>
51 #include <sys/disklabel.h>
52 #include <sys/conf.h>
53 #include <sys/lock.h>
54 #include <sys/disk.h>
55 #include <sys/reboot.h>
56 #include <sys/dkio.h>
57 #include <sys/vnode.h>
58 #include <sys/task.h>
59 #include <sys/stdint.h>
60 
61 #include <sys/socket.h>
62 #include <sys/socketvar.h>
63 
64 #include <net/if.h>
65 
66 #include <dev/cons.h>
67 
68 #include <lib/libz/zlib.h>
69 
70 #include "softraid.h"
71 
/*
 * DPRINTF: debug-only tracing.  Forwards its arguments to printf() when
 * the kernel is built with DEBUG and expands to nothing otherwise.
 */
#ifndef DEBUG
#define DPRINTF(x...)
#else
#define DPRINTF(x...)	printf(x)
#endif
77 
78 /*
79  * A global list of all disks attached to the system.  May grow or
80  * shrink over time.
81  */
82 struct	disklist_head disklist;	/* TAILQ_HEAD */
83 int	disk_count;		/* number of drives in global disklist */
84 int	disk_change;		/* set if a disk has been attached/detached
85 				 * since last we looked at this variable. This
86 				 * is reset by hw_sysctl()
87 				 */
88 
89 #define DUID_SIZE 8
90 
91 u_char	bootduid[DUID_SIZE];	/* DUID of boot disk. */
92 u_char	rootduid[DUID_SIZE];	/* DUID of root disk. */
93 
94 /* softraid callback, do not use! */
95 void (*softraid_disk_attach)(struct disk *, int);
96 
97 void sr_map_root(void);
98 
99 struct disk_attach_task {
100 	struct task task;
101 	struct disk *dk;
102 };
103 
104 void disk_attach_callback(void *);
105 
106 int spoofgptlabel(struct buf *, void (*)(struct buf *), struct disklabel *);
107 
108 int gpt_chk_mbr(struct dos_partition *, u_int64_t);
109 int gpt_chk_hdr(struct gpt_header *, struct disklabel *);
110 int gpt_chk_parts(struct gpt_header *, struct gpt_partition *);
111 int gpt_get_fstype(struct uuid *);
112 
113 int duid_equal(u_char *, u_char *);
114 
115 /*
116  * Compute checksum for disk label.
117  */
118 u_int
119 dkcksum(struct disklabel *lp)
120 {
121 	u_int16_t *start, *end;
122 	u_int16_t sum = 0;
123 
124 	start = (u_int16_t *)lp;
125 	end = (u_int16_t *)&lp->d_partitions[lp->d_npartitions];
126 	while (start < end)
127 		sum ^= *start++;
128 	return (sum);
129 }
130 
131 int
132 initdisklabel(struct disklabel *lp)
133 {
134 	int i;
135 
136 	/* minimal requirements for archetypal disk label */
137 	if (lp->d_secsize < DEV_BSIZE)
138 		lp->d_secsize = DEV_BSIZE;
139 	if (DL_GETDSIZE(lp) == 0)
140 		DL_SETDSIZE(lp, MAXDISKSIZE);
141 	if (lp->d_secpercyl == 0)
142 		return (ERANGE);
143 	lp->d_npartitions = MAXPARTITIONS;
144 	for (i = 0; i < RAW_PART; i++) {
145 		DL_SETPSIZE(&lp->d_partitions[i], 0);
146 		DL_SETPOFFSET(&lp->d_partitions[i], 0);
147 	}
148 	if (DL_GETPSIZE(&lp->d_partitions[RAW_PART]) == 0)
149 		DL_SETPSIZE(&lp->d_partitions[RAW_PART], DL_GETDSIZE(lp));
150 	DL_SETPOFFSET(&lp->d_partitions[RAW_PART], 0);
151 	DL_SETBSTART(lp, 0);
152 	DL_SETBEND(lp, DL_GETDSIZE(lp));
153 	lp->d_version = 1;
154 	lp->d_bbsize = 8192;
155 	lp->d_sbsize = 64*1024;			/* XXX ? */
156 	return (0);
157 }
158 
159 /*
160  * Check an incoming block to make sure it is a disklabel, convert it to
161  * a newer version if needed, etc etc.
162  */
163 int
164 checkdisklabel(void *rlp, struct disklabel *lp, u_int64_t boundstart,
165     u_int64_t boundend)
166 {
167 	struct disklabel *dlp = rlp;
168 	struct __partitionv0 *v0pp;
169 	struct partition *pp;
170 	u_int64_t disksize;
171 	int error = 0;
172 	int i;
173 
174 	if (dlp->d_magic != DISKMAGIC || dlp->d_magic2 != DISKMAGIC)
175 		error = ENOENT;	/* no disk label */
176 	else if (dlp->d_npartitions > MAXPARTITIONS)
177 		error = E2BIG;	/* too many partitions */
178 	else if (dlp->d_secpercyl == 0)
179 		error = EINVAL;	/* invalid label */
180 	else if (dlp->d_secsize == 0)
181 		error = ENOSPC;	/* disk too small */
182 	else if (dkcksum(dlp) != 0)
183 		error = EINVAL;	/* incorrect checksum */
184 
185 	if (error) {
186 		u_int16_t *start, *end, sum = 0;
187 
188 		/* If it is byte-swapped, attempt to convert it */
189 		if (swap32(dlp->d_magic) != DISKMAGIC ||
190 		    swap32(dlp->d_magic2) != DISKMAGIC ||
191 		    swap16(dlp->d_npartitions) > MAXPARTITIONS)
192 			return (error);
193 
194 		/*
195 		 * Need a byte-swap aware dkcksum variant
196 		 * inlined, because dkcksum uses a sub-field
197 		 */
198 		start = (u_int16_t *)dlp;
199 		end = (u_int16_t *)&dlp->d_partitions[
200 		    swap16(dlp->d_npartitions)];
201 		while (start < end)
202 			sum ^= *start++;
203 		if (sum != 0)
204 			return (error);
205 
206 		dlp->d_magic = swap32(dlp->d_magic);
207 		dlp->d_type = swap16(dlp->d_type);
208 
209 		/* d_typename and d_packname are strings */
210 
211 		dlp->d_secsize = swap32(dlp->d_secsize);
212 		dlp->d_nsectors = swap32(dlp->d_nsectors);
213 		dlp->d_ntracks = swap32(dlp->d_ntracks);
214 		dlp->d_ncylinders = swap32(dlp->d_ncylinders);
215 		dlp->d_secpercyl = swap32(dlp->d_secpercyl);
216 		dlp->d_secperunit = swap32(dlp->d_secperunit);
217 
218 		/* d_uid is a string */
219 
220 		dlp->d_acylinders = swap32(dlp->d_acylinders);
221 
222 		dlp->d_flags = swap32(dlp->d_flags);
223 
224 		for (i = 0; i < NDDATA; i++)
225 			dlp->d_drivedata[i] = swap32(dlp->d_drivedata[i]);
226 
227 		dlp->d_secperunith = swap16(dlp->d_secperunith);
228 		dlp->d_version = swap16(dlp->d_version);
229 
230 		for (i = 0; i < NSPARE; i++)
231 			dlp->d_spare[i] = swap32(dlp->d_spare[i]);
232 
233 		dlp->d_magic2 = swap32(dlp->d_magic2);
234 
235 		dlp->d_npartitions = swap16(dlp->d_npartitions);
236 		dlp->d_bbsize = swap32(dlp->d_bbsize);
237 		dlp->d_sbsize = swap32(dlp->d_sbsize);
238 
239 		for (i = 0; i < MAXPARTITIONS; i++) {
240 			pp = &dlp->d_partitions[i];
241 			pp->p_size = swap32(pp->p_size);
242 			pp->p_offset = swap32(pp->p_offset);
243 			if (dlp->d_version == 0) {
244 				v0pp = (struct __partitionv0 *)pp;
245 				v0pp->p_fsize = swap32(v0pp->p_fsize);
246 			} else {
247 				pp->p_offseth = swap16(pp->p_offseth);
248 				pp->p_sizeh = swap16(pp->p_sizeh);
249 			}
250 			pp->p_cpg = swap16(pp->p_cpg);
251 		}
252 
253 		dlp->d_checksum = 0;
254 		dlp->d_checksum = dkcksum(dlp);
255 		error = 0;
256 	}
257 
258 	/* XXX should verify lots of other fields and whine a lot */
259 
260 	/* Initial passed in lp contains the real disk size. */
261 	disksize = DL_GETDSIZE(lp);
262 
263 	if (lp != dlp)
264 		*lp = *dlp;
265 
266 	if (lp->d_version == 0) {
267 		lp->d_version = 1;
268 		lp->d_secperunith = 0;
269 
270 		v0pp = (struct __partitionv0 *)lp->d_partitions;
271 		pp = lp->d_partitions;
272 		for (i = 0; i < lp->d_npartitions; i++, pp++, v0pp++) {
273 			pp->p_fragblock = DISKLABELV1_FFS_FRAGBLOCK(v0pp->
274 			    p_fsize, v0pp->p_frag);
275 			pp->p_offseth = 0;
276 			pp->p_sizeh = 0;
277 		}
278 	}
279 
280 #ifdef DEBUG
281 	if (DL_GETDSIZE(lp) != disksize)
282 		printf("on-disk disklabel has incorrect disksize (%llu)\n",
283 		    DL_GETDSIZE(lp));
284 	if (DL_GETPSIZE(&lp->d_partitions[RAW_PART]) != disksize)
285 		printf("on-disk disklabel RAW_PART has incorrect size (%llu)\n",
286 		    DL_GETPSIZE(&lp->d_partitions[RAW_PART]));
287 	if (DL_GETPOFFSET(&lp->d_partitions[RAW_PART]) != 0)
288 		printf("on-disk disklabel RAW_PART offset != 0 (%llu)\n",
289 		    DL_GETPOFFSET(&lp->d_partitions[RAW_PART]));
290 #endif
291 	DL_SETDSIZE(lp, disksize);
292 	DL_SETPSIZE(&lp->d_partitions[RAW_PART], disksize);
293 	DL_SETPOFFSET(&lp->d_partitions[RAW_PART], 0);
294 	DL_SETBSTART(lp, boundstart);
295 	DL_SETBEND(lp, boundend < DL_GETDSIZE(lp) ? boundend : DL_GETDSIZE(lp));
296 
297 	lp->d_checksum = 0;
298 	lp->d_checksum = dkcksum(lp);
299 	return (0);
300 }
301 
302 /*
303  * Read a disk sector.
304  */
305 int
306 readdisksector(struct buf *bp, void (*strat)(struct buf *),
307     struct disklabel *lp, u_int64_t sector)
308 {
309 	bp->b_blkno = DL_SECTOBLK(lp, sector);
310 	bp->b_bcount = lp->d_secsize;
311 	bp->b_error = 0;
312 	CLR(bp->b_flags, B_READ | B_WRITE | B_DONE | B_ERROR);
313 	SET(bp->b_flags, B_BUSY | B_READ | B_RAW);
314 
315 	(*strat)(bp);
316 
317 	return (biowait(bp));
318 }
319 
320 /*
321  * If dos partition table requested, attempt to load it and
322  * find disklabel inside a DOS partition. Return buffer
323  * for use in signalling errors if requested.
324  *
325  * We would like to check if each MBR has a valid BOOT_MAGIC, but
326  * we cannot because it doesn't always exist. So.. we assume the
327  * MBR is valid.
328  */
329 int
330 readdoslabel(struct buf *bp, void (*strat)(struct buf *),
331     struct disklabel *lp, daddr_t *partoffp, int spoofonly)
332 {
333 	struct disklabel *gptlp;
334 	u_int64_t dospartoff = 0, dospartend = DL_GETBEND(lp);
335 	int i, ourpart = -1, wander = 1, n = 0, loop = 0, offset;
336 	struct dos_partition dp[NDOSPART], *dp2;
337 	u_int64_t sector = DOSBBSECTOR;
338 	u_int32_t extoff = 0;
339 	int error;
340 
341 	if (lp->d_secpercyl == 0)
342 		return (EINVAL);	/* invalid label */
343 	if (lp->d_secsize == 0)
344 		return (ENOSPC);	/* disk too small */
345 
346 	/* do DOS partitions in the process of getting disklabel? */
347 
348 	/*
349 	 * Read dos partition table, follow extended partitions.
350 	 * Map the partitions to disklabel entries i-p
351 	 */
352 	while (wander && loop < DOS_MAXEBR) {
353 		loop++;
354 		wander = 0;
355 		if (sector < extoff)
356 			sector = extoff;
357 
358 		/* read MBR/EBR */
359 		error = readdisksector(bp, strat, lp, sector);
360 		if (error) {
361 /*wrong*/		if (partoffp)
362 /*wrong*/			*partoffp = -1;
363 			return (error);
364 		}
365 
366 		bcopy(bp->b_data + DOSPARTOFF, dp, sizeof(dp));
367 
368 		if (n == 0 && sector == DOSBBSECTOR) {
369 			u_int16_t mbrtest;
370 
371 			/* Check the end of sector marker. */
372 			mbrtest = ((bp->b_data[510] << 8) & 0xff00) |
373 			    (bp->b_data[511] & 0xff);
374 			if (mbrtest != 0x55aa)
375 				goto notmbr;
376 
377 			if (gpt_chk_mbr(dp, DL_GETDSIZE(lp)) != 0)
378 				goto notgpt;
379 
380 			gptlp = malloc(sizeof(struct disklabel), M_DEVBUF,
381 			    M_NOWAIT);
382 			if (gptlp == NULL)
383 				return (ENOMEM);
384 			*gptlp = *lp;
385 			error = spoofgptlabel(bp, strat, gptlp);
386 			if (error == 0) {
387 				dospartoff = DL_GETBSTART(gptlp);
388 				dospartend = DL_GETBEND(gptlp);
389 				if (partoffp == 0)
390 					*lp = *gptlp;
391 				free(gptlp, M_DEVBUF,
392 				    sizeof(struct disklabel));
393 				if (partoffp && dospartoff == 0)
394 					return (ENXIO);
395 				goto notfat;
396 			} else {
397 				free(gptlp, M_DEVBUF,
398 				    sizeof(struct disklabel));
399 				goto notmbr;
400 			}
401 		}
402 
403 notgpt:
404 		if (ourpart == -1) {
405 			/* Search for our MBR partition */
406 			for (dp2=dp, i=0; i < NDOSPART && ourpart == -1;
407 			    i++, dp2++)
408 				if (letoh32(dp2->dp_size) &&
409 				    dp2->dp_typ == DOSPTYP_OPENBSD)
410 					ourpart = i;
411 			if (ourpart == -1)
412 				goto donot;
413 			/*
414 			 * This is our MBR partition. need sector
415 			 * address for SCSI/IDE, cylinder for
416 			 * ESDI/ST506/RLL
417 			 */
418 			dp2 = &dp[ourpart];
419 			dospartoff = letoh32(dp2->dp_start) + sector;
420 			dospartend = dospartoff + letoh32(dp2->dp_size);
421 
422 			/*
423 			 * Record the OpenBSD partition's placement (in
424 			 * 512-byte blocks!) for the caller. No need to
425 			 * finish spoofing.
426 			 */
427 			if (partoffp) {
428 				*partoffp = DL_SECTOBLK(lp, dospartoff);
429 				return (0);
430 			}
431 
432 			if (lp->d_ntracks == 0)
433 				lp->d_ntracks = dp2->dp_ehd + 1;
434 			if (lp->d_nsectors == 0)
435 				lp->d_nsectors = DPSECT(dp2->dp_esect);
436 			if (lp->d_secpercyl == 0)
437 				lp->d_secpercyl = lp->d_ntracks *
438 				    lp->d_nsectors;
439 		}
440 donot:
441 		/*
442 		 * In case the disklabel read below fails, we want to
443 		 * provide a fake label in i-p.
444 		 */
445 		for (dp2=dp, i=0; i < NDOSPART; i++, dp2++) {
446 			struct partition *pp;
447 			u_int8_t fstype;
448 
449 			if (dp2->dp_typ == DOSPTYP_OPENBSD ||
450 			    dp2->dp_typ == DOSPTYP_EFI)
451 				continue;
452 			if (letoh32(dp2->dp_size) > DL_GETDSIZE(lp))
453 				continue;
454 			if (letoh32(dp2->dp_start) > DL_GETDSIZE(lp))
455 				continue;
456 			if (letoh32(dp2->dp_size) == 0)
457 				continue;
458 
459 			switch (dp2->dp_typ) {
460 			case DOSPTYP_UNUSED:
461 				fstype = FS_UNUSED;
462 				break;
463 
464 			case DOSPTYP_LINUX:
465 				fstype = FS_EXT2FS;
466 				break;
467 
468 			case DOSPTYP_NTFS:
469 				fstype = FS_NTFS;
470 				break;
471 
472 			case DOSPTYP_EFISYS:
473 			case DOSPTYP_FAT12:
474 			case DOSPTYP_FAT16S:
475 			case DOSPTYP_FAT16B:
476 			case DOSPTYP_FAT16L:
477 			case DOSPTYP_FAT32:
478 			case DOSPTYP_FAT32L:
479 				fstype = FS_MSDOS;
480 				break;
481 			case DOSPTYP_EXTEND:
482 			case DOSPTYP_EXTENDL:
483 				sector = letoh32(dp2->dp_start) + extoff;
484 				if (!extoff) {
485 					extoff = letoh32(dp2->dp_start);
486 					sector = 0;
487 				}
488 				wander = 1;
489 				continue;
490 				break;
491 			default:
492 				fstype = FS_OTHER;
493 				break;
494 			}
495 
496 			/*
497 			 * Don't set fstype/offset/size when just looking for
498 			 * the offset of the OpenBSD partition. It would
499 			 * invalidate the disklabel checksum!
500 			 *
501 			 * Don't try to spoof more than 8 partitions, i.e.
502 			 * 'i' -'p'.
503 			 */
504 			if (partoffp || n >= 8)
505 				continue;
506 
507 			pp = &lp->d_partitions[8+n];
508 			n++;
509 			pp->p_fstype = fstype;
510 			if (letoh32(dp2->dp_start))
511 				DL_SETPOFFSET(pp,
512 				    letoh32(dp2->dp_start) + sector);
513 			DL_SETPSIZE(pp, letoh32(dp2->dp_size));
514 		}
515 	}
516 
517 notmbr:
518 	if (n == 0 && sector == DOSBBSECTOR && ourpart == -1) {
519 		u_int16_t fattest;
520 
521 		/* Check for a valid initial jmp instruction. */
522 		switch ((u_int8_t)bp->b_data[0]) {
523 		case 0xeb:
524 			/*
525 			 * Two-byte jmp instruction. The 2nd byte is the number
526 			 * of bytes to jmp and the 3rd byte must be a NOP.
527 			 */
528 			if ((u_int8_t)bp->b_data[2] != 0x90)
529 				goto notfat;
530 			break;
531 		case 0xe9:
532 			/*
533 			 * Three-byte jmp instruction. The next two bytes are a
534 			 * little-endian 16 bit value.
535 			 */
536 			break;
537 		default:
538 			goto notfat;
539 			break;
540 		}
541 
542 		/* Check for a valid bytes per sector value. */
543 		fattest = ((bp->b_data[12] << 8) & 0xff00) |
544 		    (bp->b_data[11] & 0xff);
545 		if (fattest < 512 || fattest > 4096 || (fattest % 512 != 0))
546 			goto notfat;
547 
548 		if (partoffp)
549 			return (ENXIO);	/* No place for disklabel on FAT! */
550 
551 		DL_SETPSIZE(&lp->d_partitions['i' - 'a'],
552 		    DL_GETPSIZE(&lp->d_partitions[RAW_PART]));
553 		DL_SETPOFFSET(&lp->d_partitions['i' - 'a'], 0);
554 		lp->d_partitions['i' - 'a'].p_fstype = FS_MSDOS;
555 
556 		spoofonly = 1;	/* No disklabel to read from disk. */
557 	}
558 
559 notfat:
560 	/* record the OpenBSD partition's placement for the caller */
561 	if (partoffp)
562 		*partoffp = DL_SECTOBLK(lp, dospartoff);
563 	else {
564 		DL_SETBSTART(lp, dospartoff);
565 		DL_SETBEND(lp, (dospartend < DL_GETDSIZE(lp)) ? dospartend :
566 		    DL_GETDSIZE(lp));
567 	}
568 
569 	/* don't read the on-disk label if we are in spoofed-only mode */
570 	if (spoofonly)
571 		return (0);
572 
573 	error = readdisksector(bp, strat, lp, dospartoff +
574 	    DL_BLKTOSEC(lp, DOS_LABELSECTOR));
575 	if (error)
576 		return (bp->b_error);
577 
578 	offset = DL_BLKOFFSET(lp, DOS_LABELSECTOR);
579 	error = checkdisklabel(bp->b_data + offset, lp,
580 	    DL_GETBSTART((struct disklabel*)(bp->b_data+offset)),
581 	    DL_GETBEND((struct disklabel *)(bp->b_data+offset)));
582 
583 	return (error);
584 }
585 
586 /*
587  * Returns 0 if the MBR with the provided partition array is a GPT protective
588  * MBR, and returns 1 otherwise. A GPT protective MBR would have one and only
589  * one MBR partition, an EFI partition that either covers the whole disk or as
590  * much of it as is possible with a 32bit size field.
591  *
592  * NOTE: MS always uses a size of UINT32_MAX for the EFI partition!**
593  */
594 int
595 gpt_chk_mbr(struct dos_partition *dp, u_int64_t dsize)
596 {
597 	struct dos_partition *dp2;
598 	int efi, found, i;
599 	u_int32_t psize;
600 
601 	found = efi = 0;
602 	for (dp2=dp, i=0; i < NDOSPART; i++, dp2++) {
603 		if (dp2->dp_typ == DOSPTYP_UNUSED)
604 			continue;
605 		found++;
606 		if (dp2->dp_typ != DOSPTYP_EFI)
607 			continue;
608 		psize = letoh32(dp2->dp_size);
609 		if (psize == (dsize - 1) ||
610 		    psize == UINT32_MAX) {
611 			if (letoh32(dp2->dp_start) == 1)
612 				efi++;
613 		}
614 	}
615 	if (found == 1 && efi == 1)
616 		return (0);
617 
618 	return (1);
619 }
620 
621 int
622 gpt_chk_hdr(struct gpt_header *gh, struct disklabel *lp)
623 {
624 	uint64_t ghpartlba;
625 	uint64_t ghlbaend, ghlbastart;
626 	uint32_t orig_gh_csum;
627 	uint32_t ghsize, ghpartsize, ghpartspersec;
628 
629 	if (letoh64(gh->gh_sig) != GPTSIGNATURE)
630 		return (EINVAL);
631 
632 	if (letoh32(gh->gh_rev) != GPTREVISION)
633 		return (EINVAL);
634 
635 	ghsize = letoh32(gh->gh_size);
636 	ghpartsize = letoh32(gh->gh_part_size);
637 	ghpartspersec = lp->d_secsize / ghpartsize;
638 	ghpartlba = letoh64(gh->gh_part_lba);
639 	ghlbaend = letoh64(gh->gh_lba_end);
640 	ghlbastart = letoh64(gh->gh_lba_start);
641 
642 	if (ghsize < GPTMINHDRSIZE || ghsize > sizeof(struct gpt_header))
643 		return (EINVAL);
644 
645 	orig_gh_csum = gh->gh_csum;
646 	gh->gh_csum = 0;
647 	gh->gh_csum = crc32(0, (unsigned char *)gh, ghsize);
648 
649 	if (orig_gh_csum != gh->gh_csum)
650 		return (EINVAL);
651 
652 	if (ghlbastart >= DL_GETDSIZE(lp) ||
653 	    ghlbaend >= DL_GETDSIZE(lp) ||
654 	    ghpartlba >= DL_GETDSIZE(lp))
655 		return (EINVAL);
656 
657 	/*
658 	* Size per partition entry shall be 128*(2**n) with n >= 0.
659 	* We don't support partition entries larger than block size.
660 	*/
661 	if (ghpartsize % GPTMINPARTSIZE || ghpartsize > lp->d_secsize
662 	    || ghpartspersec == 0) {
663 		DPRINTF("invalid partition size\n");
664 		return (EINVAL);
665 	}
666 
667 	/* XXX: we don't support multiples of GPTMINPARTSIZE yet */
668 	if (ghpartsize != GPTMINPARTSIZE) {
669 		DPRINTF("partition sizes larger than %d bytes are not "
670 		    "supported", GPTMINPARTSIZE);
671 		return (EINVAL);
672 	}
673 
674 	if (letoh64(gh->gh_lba_alt) >= DL_GETDSIZE(lp)) {
675 		DPRINTF("alternate header's position is bogus\n");
676 		return (EINVAL);
677 	}
678 
679 	return 0;
680 }
681 
682 int
683 gpt_chk_parts(struct gpt_header *gh, struct gpt_partition *gp)
684 {
685 	u_int32_t checksum;
686 	checksum = crc32(0, (unsigned char *)gp,
687 	    letoh32(gh->gh_part_num) * letoh32(gh->gh_part_size));
688 
689 	if (checksum != gh->gh_part_csum)
690 		return (EINVAL);
691 
692 	return 0;
693 }
694 
695 int
696 gpt_get_fstype(struct uuid *uuid_part)
697 {
698 	static int init = 0;
699 	static struct uuid uuid_openbsd, uuid_msdos, uuid_chromefs,
700 	    uuid_linux, uuid_hfs, uuid_unused, uuid_efi_system;
701 	static const uint8_t gpt_uuid_openbsd[] = GPT_UUID_OPENBSD;
702 	static const uint8_t gpt_uuid_msdos[] = GPT_UUID_MSDOS;
703 	static const uint8_t gpt_uuid_chromerootfs[] = GPT_UUID_CHROMEROOTFS;
704 	static const uint8_t gpt_uuid_linux[] = GPT_UUID_LINUX;
705 	static const uint8_t gpt_uuid_hfs[] = GPT_UUID_APPLE_HFS;
706 	static const uint8_t gpt_uuid_unused[] = GPT_UUID_UNUSED;
707 	static const uint8_t gpt_uuid_efi_system[] = GPT_UUID_EFI_SYSTEM;
708 
709 	if (init == 0) {
710 		uuid_dec_be(gpt_uuid_openbsd, &uuid_openbsd);
711 		uuid_dec_be(gpt_uuid_msdos, &uuid_msdos);
712 		uuid_dec_be(gpt_uuid_chromerootfs, &uuid_chromefs);
713 		uuid_dec_be(gpt_uuid_linux, &uuid_linux);
714 		uuid_dec_be(gpt_uuid_hfs, &uuid_hfs);
715 		uuid_dec_be(gpt_uuid_unused, &uuid_unused);
716 		uuid_dec_be(gpt_uuid_efi_system, &uuid_efi_system);
717 		init = 1;
718 	}
719 
720 	if (!memcmp(uuid_part, &uuid_unused, sizeof(struct uuid)))
721 		return FS_UNUSED;
722 	else if (!memcmp(uuid_part, &uuid_openbsd, sizeof(struct uuid)))
723 		return FS_BSDFFS;
724 	else if (!memcmp(uuid_part, &uuid_msdos, sizeof(struct uuid)))
725 		return FS_MSDOS;
726 	else if (!memcmp(uuid_part, &uuid_chromefs, sizeof(struct uuid)))
727 		return FS_EXT2FS;
728 	else if (!memcmp(uuid_part, &uuid_linux, sizeof(struct uuid)))
729 		return FS_EXT2FS;
730 	else if (!memcmp(uuid_part, &uuid_hfs, sizeof(struct uuid)))
731 		return FS_HFS;
732 	else if (!memcmp(uuid_part, &uuid_efi_system, sizeof(struct uuid)))
733 		return FS_MSDOS;
734 	else
735 		return FS_OTHER;
736 }
737 
738 /*
739  * Spoof a disklabel based on the GPT information on the disk.
740  */
741 int
742 spoofgptlabel(struct buf *bp, void (*strat)(struct buf *),
743     struct disklabel *lp)
744 {
745 	static const u_int8_t gpt_uuid_openbsd[] = GPT_UUID_OPENBSD;
746 	struct gpt_header gh;
747 	struct uuid uuid_part, uuid_openbsd;
748 	struct gpt_partition *gp, *gp_tmp;
749 	struct partition *pp;
750 	size_t gpsz;
751 	u_int64_t ghlbaend, ghlbastart, gptpartoff, gptpartend, sector;
752 	u_int64_t start, end;
753 	int i, altheader = 0, error, n;
754 	uint32_t ghpartnum;
755 
756 	uuid_dec_be(gpt_uuid_openbsd, &uuid_openbsd);
757 
758 	for (sector = GPTSECTOR; ; sector = DL_GETDSIZE(lp)-1, altheader = 1) {
759 		uint64_t ghpartlba;
760 		uint32_t ghpartsize;
761 		uint32_t ghpartspersec;
762 
763 		error = readdisksector(bp, strat, lp, sector);
764 		if (error) {
765 			DPRINTF("error reading from disk\n");
766 			return (error);
767 		}
768 
769 		bcopy(bp->b_data, &gh, sizeof(gh));
770 
771 		if (gpt_chk_hdr(&gh, lp)) {
772 			if (altheader) {
773 				DPRINTF("alternate header also broken\n");
774 				return (EINVAL);
775 			}
776 			continue;
777 		}
778 
779 		ghpartsize = letoh32(gh.gh_part_size);
780 		ghpartspersec = lp->d_secsize / ghpartsize;
781 		ghpartnum = letoh32(gh.gh_part_num);
782 		ghpartlba = letoh64(gh.gh_part_lba);
783 		ghlbaend = letoh64(gh.gh_lba_end);
784 		ghlbastart = letoh64(gh.gh_lba_start);
785 
786 		/* read GPT partition entry array */
787 		gp = mallocarray(ghpartnum, sizeof(struct gpt_partition),
788 		    M_DEVBUF, M_NOWAIT|M_ZERO);
789 		if (gp == NULL)
790 			return (ENOMEM);
791 		gpsz = ghpartnum * sizeof(struct gpt_partition);
792 
793 		/*
794 		* XXX:	Fails if # of partition entries is not a multiple of
795 		*	ghpartspersec.
796 		*/
797 		sector = ghpartlba;
798 		for (i = 0; i < ghpartnum / ghpartspersec; i++, sector++) {
799 			error = readdisksector(bp, strat, lp, sector);
800 			if (error) {
801 				free(gp, M_DEVBUF, gpsz);
802 				return (error);
803 			}
804 
805 			bcopy(bp->b_data, gp + i * ghpartspersec,
806 			    ghpartspersec * sizeof(struct gpt_partition));
807 		}
808 
809 		if (gpt_chk_parts(&gh, gp)) {
810 			free(gp, M_DEVBUF, gpsz);
811 			if (altheader) {
812 				DPRINTF("alternate partition entries are also "
813 				    "broken\n");
814 				return (EINVAL);
815 			}
816 			continue;
817 		}
818 		break;
819 	}
820 
821 	/* Find OpenBSD partition and spoof others along the way. */
822 	n = 0;
823 	gptpartoff = 0;
824 	gptpartend = DL_GETBEND(lp);
825 	for (gp_tmp = gp, i = 0; i < ghpartnum; gp_tmp++, i++) {
826 		start = letoh64(gp_tmp->gp_lba_start);
827 		end = letoh64(gp_tmp->gp_lba_end);
828 		if (start > end || start < ghlbastart || end > ghlbaend)
829 			continue; /* entry invalid */
830 
831 		uuid_dec_le(&gp_tmp->gp_type, &uuid_part);
832 		if (!memcmp(&uuid_part, &uuid_openbsd, sizeof(struct uuid))) {
833 			if (gptpartoff == 0) {
834 				gptpartoff = start;
835 				gptpartend = end + 1;
836 			}
837 			continue; /* Do *NOT* spoof OpenBSD partitions! */
838 		}
839 
840 		 /*
841 		 * Don't try to spoof more than 8 partitions, i.e.
842 		 * 'i' -'p'.
843 		 */
844 		if (n >= 8)
845 			continue;
846 
847 		pp = &lp->d_partitions[8+n];
848 		n++;
849 		pp->p_fstype = gpt_get_fstype(&uuid_part);
850 		DL_SETPOFFSET(pp, start);
851 		DL_SETPSIZE(pp, end - start + 1);
852 	}
853 
854 	free(gp, M_DEVBUF, gpsz);
855 
856 	DL_SETBSTART(lp, gptpartoff);
857 	DL_SETBEND(lp, (gptpartend < DL_GETDSIZE(lp)) ? gptpartend :
858 	    DL_GETDSIZE(lp));
859 
860 	return (0);
861 }
862 
863 /*
864  * Check new disk label for sensibility before setting it.
865  */
866 int
867 setdisklabel(struct disklabel *olp, struct disklabel *nlp, u_int openmask)
868 {
869 	struct partition *opp, *npp;
870 	struct disk *dk;
871 	int i;
872 
873 	/* sanity clause */
874 	if (nlp->d_secpercyl == 0 || nlp->d_secsize == 0 ||
875 	    (nlp->d_secsize % DEV_BSIZE) != 0)
876 		return (EINVAL);
877 
878 	/* special case to allow disklabel to be invalidated */
879 	if (nlp->d_magic == 0xffffffff) {
880 		*olp = *nlp;
881 		return (0);
882 	}
883 
884 	if (nlp->d_magic != DISKMAGIC || nlp->d_magic2 != DISKMAGIC ||
885 	    dkcksum(nlp) != 0)
886 		return (EINVAL);
887 
888 	/* XXX missing check if other dos partitions will be overwritten */
889 
890 	for (i = 0; i < MAXPARTITIONS; i++) {
891 		opp = &olp->d_partitions[i];
892 		npp = &nlp->d_partitions[i];
893 		if ((openmask & (1 << i)) &&
894 		    (DL_GETPOFFSET(npp) != DL_GETPOFFSET(opp) ||
895 		    DL_GETPSIZE(npp) < DL_GETPSIZE(opp)))
896 			return (EBUSY);
897 		/*
898 		 * Copy internally-set partition information
899 		 * if new label doesn't include it.		XXX
900 		 */
901 		if (npp->p_fstype == FS_UNUSED && opp->p_fstype != FS_UNUSED) {
902 			npp->p_fragblock = opp->p_fragblock;
903 			npp->p_cpg = opp->p_cpg;
904 		}
905 	}
906 
907 	/* Generate a UID if the disklabel does not already have one. */
908 	if (duid_iszero(nlp->d_uid)) {
909 		do {
910 			arc4random_buf(nlp->d_uid, sizeof(nlp->d_uid));
911 			TAILQ_FOREACH(dk, &disklist, dk_link)
912 				if (dk->dk_label &&
913 				    duid_equal(dk->dk_label->d_uid, nlp->d_uid))
914 					break;
915 		} while (dk != NULL || duid_iszero(nlp->d_uid));
916 	}
917 
918 	/* Preserve the disk size and RAW_PART values. */
919 	DL_SETDSIZE(nlp, DL_GETDSIZE(olp));
920 	npp = &nlp->d_partitions[RAW_PART];
921 	DL_SETPOFFSET(npp, 0);
922 	DL_SETPSIZE(npp, DL_GETDSIZE(nlp));
923 
924 	nlp->d_checksum = 0;
925 	nlp->d_checksum = dkcksum(nlp);
926 	*olp = *nlp;
927 
928 	disk_change = 1;
929 
930 	return (0);
931 }
932 
933 /*
934  * Determine the size of the transfer, and make sure it is within the
935  * boundaries of the partition. Adjust transfer if needed, and signal errors or
936  * early completion.
937  */
938 int
939 bounds_check_with_label(struct buf *bp, struct disklabel *lp)
940 {
941 	struct partition *p = &lp->d_partitions[DISKPART(bp->b_dev)];
942 	daddr_t partblocks, sz;
943 
944 	/* Avoid division by zero, negative offsets, and negative sizes. */
945 	if (lp->d_secpercyl == 0 || bp->b_blkno < 0 || bp->b_bcount < 0)
946 		goto bad;
947 
948 	/* Ensure transfer is a whole number of aligned sectors. */
949 	if ((bp->b_blkno % DL_BLKSPERSEC(lp)) != 0 ||
950 	    (bp->b_bcount % lp->d_secsize) != 0)
951 		goto bad;
952 
953 	/* Ensure transfer starts within partition boundary. */
954 	partblocks = DL_SECTOBLK(lp, DL_GETPSIZE(p));
955 	if (bp->b_blkno > partblocks)
956 		goto bad;
957 
958 	/* If exactly at end of partition or null transfer, return EOF. */
959 	if (bp->b_blkno == partblocks || bp->b_bcount == 0)
960 		goto done;
961 
962 	/* Truncate request if it extends past the end of the partition. */
963 	sz = bp->b_bcount >> DEV_BSHIFT;
964 	if (sz > partblocks - bp->b_blkno) {
965 		sz = partblocks - bp->b_blkno;
966 		bp->b_bcount = sz << DEV_BSHIFT;
967 	}
968 
969 	return (0);
970 
971  bad:
972 	bp->b_error = EINVAL;
973 	bp->b_flags |= B_ERROR;
974  done:
975 	bp->b_resid = bp->b_bcount;
976 	return (-1);
977 }
978 
979 /*
980  * Disk error is the preface to plaintive error messages
981  * about failing disk transfers.  It prints messages of the form
982 
983 hp0g: hard error reading fsbn 12345 of 12344-12347 (hp0 bn %d cn %d tn %d sn %d)
984 
985  * if the offset of the error in the transfer and a disk label
986  * are both available.  blkdone should be -1 if the position of the error
987  * is unknown; the disklabel pointer may be null from drivers that have not
988  * been converted to use them.  The message is printed with printf
989  * if pri is LOG_PRINTF, otherwise it uses log at the specified priority.
990  * The message should be completed (with at least a newline) with printf
991  * or addlog, respectively.  There is no trailing space.
992  */
993 void
994 diskerr(struct buf *bp, char *dname, char *what, int pri, int blkdone,
995     struct disklabel *lp)
996 {
997 	int unit = DISKUNIT(bp->b_dev), part = DISKPART(bp->b_dev);
998 	int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2)));
999 	char partname = 'a' + part;
1000 	daddr_t sn;
1001 
1002 	if (pri != LOG_PRINTF) {
1003 		log(pri, "%s", "");
1004 		pr = addlog;
1005 	} else
1006 		pr = printf;
1007 	(*pr)("%s%d%c: %s %sing fsbn ", dname, unit, partname, what,
1008 	    bp->b_flags & B_READ ? "read" : "writ");
1009 	sn = bp->b_blkno;
1010 	if (bp->b_bcount <= DEV_BSIZE)
1011 		(*pr)("%lld", (long long)sn);
1012 	else {
1013 		if (blkdone >= 0) {
1014 			sn += blkdone;
1015 			(*pr)("%lld of ", (long long)sn);
1016 		}
1017 		(*pr)("%lld-%lld", (long long)bp->b_blkno,
1018 		    (long long)(bp->b_blkno + (bp->b_bcount - 1) / DEV_BSIZE));
1019 	}
1020 	if (lp && (blkdone >= 0 || bp->b_bcount <= lp->d_secsize)) {
1021 		sn += DL_SECTOBLK(lp, DL_GETPOFFSET(&lp->d_partitions[part]));
1022 		(*pr)(" (%s%d bn %lld; cn %lld", dname, unit, (long long)sn,
1023 		    (long long)(sn / DL_SECTOBLK(lp, lp->d_secpercyl)));
1024 		sn %= DL_SECTOBLK(lp, lp->d_secpercyl);
1025 		(*pr)(" tn %lld sn %lld)",
1026 		    (long long)(sn / DL_SECTOBLK(lp, lp->d_nsectors)),
1027 		    (long long)(sn % DL_SECTOBLK(lp, lp->d_nsectors)));
1028 	}
1029 }
1030 
1031 /*
1032  * Initialize the disklist.  Called by main() before autoconfiguration.
1033  */
1034 void
1035 disk_init(void)
1036 {
1037 
1038 	TAILQ_INIT(&disklist);
1039 	disk_count = disk_change = 0;
1040 }
1041 
1042 int
1043 disk_construct(struct disk *diskp)
1044 {
1045 	rw_init_flags(&diskp->dk_lock, "dklk", RWL_IS_VNODE);
1046 	mtx_init(&diskp->dk_mtx, IPL_BIO);
1047 
1048 	diskp->dk_flags |= DKF_CONSTRUCTED;
1049 
1050 	return (0);
1051 }
1052 
1053 /*
1054  * Attach a disk.
1055  */
1056 void
1057 disk_attach(struct device *dv, struct disk *diskp)
1058 {
1059 	int majdev;
1060 
1061 	if (!ISSET(diskp->dk_flags, DKF_CONSTRUCTED))
1062 		disk_construct(diskp);
1063 
1064 	/*
1065 	 * Allocate and initialize the disklabel structures.  Note that
1066 	 * it's not safe to sleep here, since we're probably going to be
1067 	 * called during autoconfiguration.
1068 	 */
1069 	diskp->dk_label = malloc(sizeof(struct disklabel), M_DEVBUF,
1070 	    M_NOWAIT|M_ZERO);
1071 	if (diskp->dk_label == NULL)
1072 		panic("disk_attach: can't allocate storage for disklabel");
1073 
1074 	/*
1075 	 * Set the attached timestamp.
1076 	 */
1077 	microuptime(&diskp->dk_attachtime);
1078 
1079 	/*
1080 	 * Link into the disklist.
1081 	 */
1082 	TAILQ_INSERT_TAIL(&disklist, diskp, dk_link);
1083 	++disk_count;
1084 	disk_change = 1;
1085 
1086 	/*
1087 	 * Store device structure and number for later use.
1088 	 */
1089 	diskp->dk_device = dv;
1090 	diskp->dk_devno = NODEV;
1091 	if (dv != NULL) {
1092 		majdev = findblkmajor(dv);
1093 		if (majdev >= 0)
1094 			diskp->dk_devno =
1095 			    MAKEDISKDEV(majdev, dv->dv_unit, RAW_PART);
1096 
1097 		if (diskp->dk_devno != NODEV) {
1098 			struct disk_attach_task *dat;
1099 
1100 			dat = malloc(sizeof(*dat), M_TEMP, M_WAITOK);
1101 
1102 			/* XXX: Assumes dk is part of the device softc. */
1103 			device_ref(dv);
1104 			dat->dk = diskp;
1105 
1106 			task_set(&dat->task, disk_attach_callback, dat);
1107 			task_add(systq, &dat->task);
1108 		}
1109 	}
1110 
1111 	if (softraid_disk_attach)
1112 		softraid_disk_attach(diskp, 1);
1113 }
1114 
1115 void
1116 disk_attach_callback(void *xdat)
1117 {
1118 	struct disk_attach_task *dat = xdat;
1119 	struct disk *dk = dat->dk;
1120 	struct disklabel dl;
1121 	char errbuf[100];
1122 
1123 	free(dat, M_TEMP, sizeof(*dat));
1124 
1125 	if (dk->dk_flags & (DKF_OPENED | DKF_NOLABELREAD))
1126 		goto done;
1127 
1128 	/* Read disklabel. */
1129 	if (disk_readlabel(&dl, dk->dk_devno, errbuf, sizeof(errbuf)) == NULL) {
1130 		enqueue_randomness(dl.d_checksum);
1131 		dk->dk_flags |= DKF_LABELVALID;
1132 	}
1133 
1134 done:
1135 	dk->dk_flags |= DKF_OPENED;
1136 	device_unref(dk->dk_device);
1137 	wakeup(dk);
1138 }
1139 
1140 /*
1141  * Detach a disk.
1142  */
1143 void
1144 disk_detach(struct disk *diskp)
1145 {
1146 
1147 	if (softraid_disk_attach)
1148 		softraid_disk_attach(diskp, -1);
1149 
1150 	/*
1151 	 * Free the space used by the disklabel structures.
1152 	 */
1153 	free(diskp->dk_label, M_DEVBUF, sizeof(*diskp->dk_label));
1154 
1155 	/*
1156 	 * Remove from the disklist.
1157 	 */
1158 	TAILQ_REMOVE(&disklist, diskp, dk_link);
1159 	disk_change = 1;
1160 	if (--disk_count < 0)
1161 		panic("disk_detach: disk_count < 0");
1162 }
1163 
1164 int
1165 disk_openpart(struct disk *dk, int part, int fmt, int haslabel)
1166 {
1167 	KASSERT(part >= 0 && part < MAXPARTITIONS);
1168 
1169 	/* Unless opening the raw partition, check that the partition exists. */
1170 	if (part != RAW_PART && (!haslabel ||
1171 	    part >= dk->dk_label->d_npartitions ||
1172 	    dk->dk_label->d_partitions[part].p_fstype == FS_UNUSED))
1173 		return (ENXIO);
1174 
1175 	/* Ensure the partition doesn't get changed under our feet. */
1176 	switch (fmt) {
1177 	case S_IFCHR:
1178 		dk->dk_copenmask |= (1 << part);
1179 		break;
1180 	case S_IFBLK:
1181 		dk->dk_bopenmask |= (1 << part);
1182 		break;
1183 	}
1184 	dk->dk_openmask = dk->dk_copenmask | dk->dk_bopenmask;
1185 
1186 	return (0);
1187 }
1188 
1189 void
1190 disk_closepart(struct disk *dk, int part, int fmt)
1191 {
1192 	KASSERT(part >= 0 && part < MAXPARTITIONS);
1193 
1194 	switch (fmt) {
1195 	case S_IFCHR:
1196 		dk->dk_copenmask &= ~(1 << part);
1197 		break;
1198 	case S_IFBLK:
1199 		dk->dk_bopenmask &= ~(1 << part);
1200 		break;
1201 	}
1202 	dk->dk_openmask = dk->dk_copenmask | dk->dk_bopenmask;
1203 }
1204 
1205 void
1206 disk_gone(int (*open)(dev_t, int, int, struct proc *), int unit)
1207 {
1208 	int bmaj, cmaj, mn;
1209 
1210 	/* Locate the lowest minor number to be detached. */
1211 	mn = DISKMINOR(unit, 0);
1212 
1213 	for (bmaj = 0; bmaj < nblkdev; bmaj++)
1214 		if (bdevsw[bmaj].d_open == open)
1215 			vdevgone(bmaj, mn, mn + MAXPARTITIONS - 1, VBLK);
1216 	for (cmaj = 0; cmaj < nchrdev; cmaj++)
1217 		if (cdevsw[cmaj].d_open == open)
1218 			vdevgone(cmaj, mn, mn + MAXPARTITIONS - 1, VCHR);
1219 }
1220 
1221 /*
1222  * Increment a disk's busy counter.  If the counter is going from
1223  * 0 to 1, set the timestamp.
1224  */
1225 void
1226 disk_busy(struct disk *diskp)
1227 {
1228 
1229 	/*
1230 	 * XXX We'd like to use something as accurate as microtime(),
1231 	 * but that doesn't depend on the system TOD clock.
1232 	 */
1233 	mtx_enter(&diskp->dk_mtx);
1234 	if (diskp->dk_busy++ == 0)
1235 		microuptime(&diskp->dk_timestamp);
1236 	mtx_leave(&diskp->dk_mtx);
1237 }
1238 
1239 /*
1240  * Decrement a disk's busy counter, increment the byte count, total busy
1241  * time, and reset the timestamp.
1242  */
1243 void
1244 disk_unbusy(struct disk *diskp, long bcount, daddr_t blkno, int read)
1245 {
1246 	struct timeval dv_time, diff_time;
1247 
1248 	mtx_enter(&diskp->dk_mtx);
1249 
1250 	if (diskp->dk_busy-- == 0)
1251 		printf("disk_unbusy: %s: dk_busy < 0\n", diskp->dk_name);
1252 
1253 	microuptime(&dv_time);
1254 
1255 	timersub(&dv_time, &diskp->dk_timestamp, &diff_time);
1256 	timeradd(&diskp->dk_time, &diff_time, &diskp->dk_time);
1257 
1258 	diskp->dk_timestamp = dv_time;
1259 	if (bcount > 0) {
1260 		if (read) {
1261 			diskp->dk_rbytes += bcount;
1262 			diskp->dk_rxfer++;
1263 		} else {
1264 			diskp->dk_wbytes += bcount;
1265 			diskp->dk_wxfer++;
1266 		}
1267 	} else
1268 		diskp->dk_seek++;
1269 
1270 	mtx_leave(&diskp->dk_mtx);
1271 
1272 	enqueue_randomness(bcount ^ diff_time.tv_usec ^
1273 	    (blkno >> 32) ^ (blkno & 0xffffffff));
1274 }
1275 
1276 int
1277 disk_lock(struct disk *dk)
1278 {
1279 	return (rw_enter(&dk->dk_lock, RW_WRITE|RW_INTR));
1280 }
1281 
1282 void
1283 disk_lock_nointr(struct disk *dk)
1284 {
1285 	rw_enter_write(&dk->dk_lock);
1286 }
1287 
1288 void
1289 disk_unlock(struct disk *dk)
1290 {
1291 	rw_exit_write(&dk->dk_lock);
1292 }
1293 
1294 int
1295 dk_mountroot(void)
1296 {
1297 	char errbuf[100];
1298 	int part = DISKPART(rootdev);
1299 	int (*mountrootfn)(void);
1300 	struct disklabel dl;
1301 	char *error;
1302 
1303 	error = disk_readlabel(&dl, rootdev, errbuf, sizeof(errbuf));
1304 	if (error)
1305 		panic("%s", error);
1306 
1307 	if (DL_GETPSIZE(&dl.d_partitions[part]) == 0)
1308 		panic("root filesystem has size 0");
1309 	switch (dl.d_partitions[part].p_fstype) {
1310 #ifdef EXT2FS
1311 	case FS_EXT2FS:
1312 		{
1313 		extern int ext2fs_mountroot(void);
1314 		mountrootfn = ext2fs_mountroot;
1315 		}
1316 		break;
1317 #endif
1318 #ifdef FFS
1319 	case FS_BSDFFS:
1320 		{
1321 		extern int ffs_mountroot(void);
1322 		mountrootfn = ffs_mountroot;
1323 		}
1324 		break;
1325 #endif
1326 #ifdef CD9660
1327 	case FS_ISO9660:
1328 		{
1329 		extern int cd9660_mountroot(void);
1330 		mountrootfn = cd9660_mountroot;
1331 		}
1332 		break;
1333 #endif
1334 	default:
1335 #ifdef FFS
1336 		{
1337 		extern int ffs_mountroot(void);
1338 
1339 		printf("filesystem type %d not known.. assuming ffs\n",
1340 		    dl.d_partitions[part].p_fstype);
1341 		mountrootfn = ffs_mountroot;
1342 		}
1343 #else
1344 		panic("disk 0x%x filesystem type %d not known",
1345 		    rootdev, dl.d_partitions[part].p_fstype);
1346 #endif
1347 	}
1348 	return (*mountrootfn)();
1349 }
1350 
1351 struct device *
1352 getdisk(char *str, int len, int defpart, dev_t *devp)
1353 {
1354 	struct device *dv;
1355 
1356 	if ((dv = parsedisk(str, len, defpart, devp)) == NULL) {
1357 		printf("use one of: exit");
1358 		TAILQ_FOREACH(dv, &alldevs, dv_list) {
1359 			if (dv->dv_class == DV_DISK)
1360 				printf(" %s[a-p]", dv->dv_xname);
1361 #if defined(NFSCLIENT)
1362 			if (dv->dv_class == DV_IFNET)
1363 				printf(" %s", dv->dv_xname);
1364 #endif
1365 		}
1366 		printf("\n");
1367 	}
1368 	return (dv);
1369 }
1370 
1371 struct device *
1372 parsedisk(char *str, int len, int defpart, dev_t *devp)
1373 {
1374 	struct device *dv;
1375 	int majdev, part = defpart;
1376 	char c;
1377 
1378 	if (len == 0)
1379 		return (NULL);
1380 	c = str[len-1];
1381 	if (c >= 'a' && (c - 'a') < MAXPARTITIONS) {
1382 		part = c - 'a';
1383 		len -= 1;
1384 	}
1385 
1386 	TAILQ_FOREACH(dv, &alldevs, dv_list) {
1387 		if (dv->dv_class == DV_DISK &&
1388 		    strncmp(str, dv->dv_xname, len) == 0 &&
1389 		    dv->dv_xname[len] == '\0') {
1390 			majdev = findblkmajor(dv);
1391 			if (majdev < 0)
1392 				return NULL;
1393 			*devp = MAKEDISKDEV(majdev, dv->dv_unit, part);
1394 			break;
1395 		}
1396 #if defined(NFSCLIENT)
1397 		if (dv->dv_class == DV_IFNET &&
1398 		    strncmp(str, dv->dv_xname, len) == 0 &&
1399 		    dv->dv_xname[len] == '\0') {
1400 			*devp = NODEV;
1401 			break;
1402 		}
1403 #endif
1404 	}
1405 
1406 	return (dv);
1407 }
1408 
1409 void
1410 setroot(struct device *bootdv, int part, int exitflags)
1411 {
1412 	int majdev, unit, len, s, slept = 0;
1413 	struct swdevt *swp;
1414 	struct device *rootdv, *dv;
1415 	dev_t nrootdev, nswapdev = NODEV, temp = NODEV;
1416 	struct ifnet *ifp = NULL;
1417 	struct disk *dk;
1418 	char buf[128];
1419 #if defined(NFSCLIENT)
1420 	extern char *nfsbootdevname;
1421 #endif
1422 
1423 	/* Ensure that all disk attach callbacks have completed. */
1424 	do {
1425 		TAILQ_FOREACH(dk, &disklist, dk_link) {
1426 			if (dk->dk_devno != NODEV &&
1427 			    (dk->dk_flags & DKF_OPENED) == 0) {
1428 				tsleep_nsec(dk, 0, "dkopen", SEC_TO_NSEC(1));
1429 				slept++;
1430 				break;
1431 			}
1432 		}
1433 	} while (dk != NULL && slept < 5);
1434 
1435 	if (slept == 5) {
1436 		printf("disklabels not read:");
1437 		TAILQ_FOREACH(dk, &disklist, dk_link)
1438 			if (dk->dk_devno != NODEV &&
1439 			    (dk->dk_flags & DKF_OPENED) == 0)
1440 				printf(" %s", dk->dk_name);
1441 		printf("\n");
1442 	}
1443 
1444 	if (duid_iszero(bootduid)) {
1445 		/* Locate DUID for boot disk since it was not provided. */
1446 		TAILQ_FOREACH(dk, &disklist, dk_link)
1447 			if (dk->dk_device == bootdv)
1448 				break;
1449 		if (dk && (dk->dk_flags & DKF_LABELVALID))
1450 			bcopy(dk->dk_label->d_uid, bootduid, sizeof(bootduid));
1451 	} else if (bootdv == NULL) {
1452 		/* Locate boot disk based on the provided DUID. */
1453 		TAILQ_FOREACH(dk, &disklist, dk_link)
1454 			if (duid_equal(dk->dk_label->d_uid, bootduid))
1455 				break;
1456 		if (dk && (dk->dk_flags & DKF_LABELVALID))
1457 			bootdv = dk->dk_device;
1458 	}
1459 	bcopy(bootduid, rootduid, sizeof(rootduid));
1460 
1461 #if NSOFTRAID > 0
1462 	sr_map_root();
1463 #endif
1464 
1465 	/*
1466 	 * If `swap generic' and we couldn't determine boot device,
1467 	 * ask the user.
1468 	 */
1469 	dk = NULL;
1470 	if (mountroot == NULL && bootdv == NULL)
1471 		boothowto |= RB_ASKNAME;
1472 	if (boothowto & RB_ASKNAME) {
1473 		while (1) {
1474 			printf("root device");
1475 			if (bootdv != NULL) {
1476 				printf(" (default %s", bootdv->dv_xname);
1477 				if (bootdv->dv_class == DV_DISK)
1478 					printf("%c", 'a' + part);
1479 				printf(")");
1480 			}
1481 			printf(": ");
1482 			s = splhigh();
1483 			cnpollc(1);
1484 			len = getsn(buf, sizeof(buf));
1485 			cnpollc(0);
1486 			splx(s);
1487 			if (strcmp(buf, "exit") == 0)
1488 				reboot(exitflags);
1489 			if (len == 0 && bootdv != NULL) {
1490 				strlcpy(buf, bootdv->dv_xname, sizeof buf);
1491 				len = strlen(buf);
1492 			}
1493 			if (len > 0 && buf[len - 1] == '*') {
1494 				buf[--len] = '\0';
1495 				dv = getdisk(buf, len, part, &nrootdev);
1496 				if (dv != NULL) {
1497 					rootdv = dv;
1498 					nswapdev = nrootdev;
1499 					goto gotswap;
1500 				}
1501 			}
1502 			dv = getdisk(buf, len, part, &nrootdev);
1503 			if (dv != NULL) {
1504 				rootdv = dv;
1505 				break;
1506 			}
1507 		}
1508 
1509 		if (rootdv->dv_class == DV_IFNET)
1510 			goto gotswap;
1511 
1512 		/* try to build swap device out of new root device */
1513 		while (1) {
1514 			printf("swap device");
1515 			if (rootdv != NULL)
1516 				printf(" (default %s%s)", rootdv->dv_xname,
1517 				    rootdv->dv_class == DV_DISK ? "b" : "");
1518 			printf(": ");
1519 			s = splhigh();
1520 			cnpollc(1);
1521 			len = getsn(buf, sizeof(buf));
1522 			cnpollc(0);
1523 			splx(s);
1524 			if (strcmp(buf, "exit") == 0)
1525 				reboot(exitflags);
1526 			if (len == 0 && rootdv != NULL) {
1527 				switch (rootdv->dv_class) {
1528 				case DV_IFNET:
1529 					nswapdev = NODEV;
1530 					break;
1531 				case DV_DISK:
1532 					nswapdev = MAKEDISKDEV(major(nrootdev),
1533 					    DISKUNIT(nrootdev), 1);
1534 					if (nswapdev == nrootdev)
1535 						continue;
1536 					break;
1537 				default:
1538 					break;
1539 				}
1540 				break;
1541 			}
1542 			dv = getdisk(buf, len, 1, &nswapdev);
1543 			if (dv) {
1544 				if (dv->dv_class == DV_IFNET)
1545 					nswapdev = NODEV;
1546 				if (nswapdev == nrootdev)
1547 					continue;
1548 				break;
1549 			}
1550 		}
1551 gotswap:
1552 		rootdev = nrootdev;
1553 		dumpdev = nswapdev;
1554 		swdevt[0].sw_dev = nswapdev;
1555 		swdevt[1].sw_dev = NODEV;
1556 #if defined(NFSCLIENT)
1557 	} else if (mountroot == nfs_mountroot) {
1558 		rootdv = bootdv;
1559 		rootdev = dumpdev = swapdev = NODEV;
1560 #endif
1561 	} else if (mountroot == NULL && rootdev == NODEV) {
1562 		/*
1563 		 * `swap generic'
1564 		 */
1565 		rootdv = bootdv;
1566 
1567 		if (bootdv->dv_class == DV_DISK) {
1568 			if (!duid_iszero(rootduid)) {
1569 				TAILQ_FOREACH(dk, &disklist, dk_link)
1570 					if ((dk->dk_flags & DKF_LABELVALID) &&
1571 					    dk->dk_label && duid_equal(
1572 					    dk->dk_label->d_uid, rootduid))
1573 						break;
1574 				if (dk == NULL)
1575 					panic("root device (%s) not found",
1576 					    duid_format(rootduid));
1577 				rootdv = dk->dk_device;
1578 			}
1579 		}
1580 
1581 		majdev = findblkmajor(rootdv);
1582 		if (majdev >= 0) {
1583 			/*
1584 			 * Root and swap are on the disk.
1585 			 * Assume swap is on partition b.
1586 			 */
1587 			rootdev = MAKEDISKDEV(majdev, rootdv->dv_unit, part);
1588 			nswapdev = MAKEDISKDEV(majdev, rootdv->dv_unit, 1);
1589 		} else {
1590 			/*
1591 			 * Root and swap are on a net.
1592 			 */
1593 			nswapdev = NODEV;
1594 		}
1595 		dumpdev = nswapdev;
1596 		swdevt[0].sw_dev = nswapdev;
1597 		/* swdevt[1].sw_dev = NODEV; */
1598 	} else {
1599 		/* Completely pre-configured, but we want rootdv .. */
1600 		majdev = major(rootdev);
1601 		if (findblkname(majdev) == NULL)
1602 			return;
1603 		unit = DISKUNIT(rootdev);
1604 		part = DISKPART(rootdev);
1605 		snprintf(buf, sizeof buf, "%s%d%c",
1606 		    findblkname(majdev), unit, 'a' + part);
1607 		rootdv = parsedisk(buf, strlen(buf), 0, &nrootdev);
1608 		if (rootdv == NULL)
1609 			panic("root device (%s) not found", buf);
1610 	}
1611 
1612 	if (bootdv != NULL && bootdv->dv_class == DV_IFNET)
1613 		ifp = ifunit(bootdv->dv_xname);
1614 
1615 	if (ifp)
1616 		if_addgroup(ifp, "netboot");
1617 
1618 	switch (rootdv->dv_class) {
1619 #if defined(NFSCLIENT)
1620 	case DV_IFNET:
1621 		mountroot = nfs_mountroot;
1622 		nfsbootdevname = rootdv->dv_xname;
1623 		return;
1624 #endif
1625 	case DV_DISK:
1626 		mountroot = dk_mountroot;
1627 		part = DISKPART(rootdev);
1628 		break;
1629 	default:
1630 		printf("can't figure root, hope your kernel is right\n");
1631 		return;
1632 	}
1633 
1634 	printf("root on %s%c", rootdv->dv_xname, 'a' + part);
1635 
1636 	if (dk && dk->dk_device == rootdv)
1637 		printf(" (%s.%c)", duid_format(rootduid), 'a' + part);
1638 
1639 	/*
1640 	 * Make the swap partition on the root drive the primary swap.
1641 	 */
1642 	for (swp = swdevt; swp->sw_dev != NODEV; swp++) {
1643 		if (major(rootdev) == major(swp->sw_dev) &&
1644 		    DISKUNIT(rootdev) == DISKUNIT(swp->sw_dev)) {
1645 			temp = swdevt[0].sw_dev;
1646 			swdevt[0].sw_dev = swp->sw_dev;
1647 			swp->sw_dev = temp;
1648 			break;
1649 		}
1650 	}
1651 	if (swp->sw_dev != NODEV) {
1652 		/*
1653 		 * If dumpdev was the same as the old primary swap device,
1654 		 * move it to the new primary swap device.
1655 		 */
1656 		if (temp == dumpdev)
1657 			dumpdev = swdevt[0].sw_dev;
1658 	}
1659 	if (swdevt[0].sw_dev != NODEV)
1660 		printf(" swap on %s%d%c", findblkname(major(swdevt[0].sw_dev)),
1661 		    DISKUNIT(swdevt[0].sw_dev),
1662 		    'a' + DISKPART(swdevt[0].sw_dev));
1663 	if (dumpdev != NODEV)
1664 		printf(" dump on %s%d%c", findblkname(major(dumpdev)),
1665 		    DISKUNIT(dumpdev), 'a' + DISKPART(dumpdev));
1666 	printf("\n");
1667 }
1668 
1669 extern struct nam2blk nam2blk[];
1670 
1671 int
1672 findblkmajor(struct device *dv)
1673 {
1674 	char buf[16], *p;
1675 	int i;
1676 
1677 	if (strlcpy(buf, dv->dv_xname, sizeof buf) >= sizeof buf)
1678 		return (-1);
1679 	for (p = buf; *p; p++)
1680 		if (*p >= '0' && *p <= '9')
1681 			*p = '\0';
1682 
1683 	for (i = 0; nam2blk[i].name; i++)
1684 		if (!strcmp(buf, nam2blk[i].name))
1685 			return (nam2blk[i].maj);
1686 	return (-1);
1687 }
1688 
1689 char *
1690 findblkname(int maj)
1691 {
1692 	int i;
1693 
1694 	for (i = 0; nam2blk[i].name; i++)
1695 		if (nam2blk[i].maj == maj)
1696 			return (nam2blk[i].name);
1697 	return (NULL);
1698 }
1699 
1700 char *
1701 disk_readlabel(struct disklabel *dl, dev_t dev, char *errbuf, size_t errsize)
1702 {
1703 	struct vnode *vn;
1704 	dev_t chrdev, rawdev;
1705 	int error;
1706 
1707 	chrdev = blktochr(dev);
1708 	rawdev = MAKEDISKDEV(major(chrdev), DISKUNIT(chrdev), RAW_PART);
1709 
1710 #ifdef DEBUG
1711 	printf("dev=0x%x chrdev=0x%x rawdev=0x%x\n", dev, chrdev, rawdev);
1712 #endif
1713 
1714 	if (cdevvp(rawdev, &vn)) {
1715 		snprintf(errbuf, errsize,
1716 		    "cannot obtain vnode for 0x%x/0x%x", dev, rawdev);
1717 		return (errbuf);
1718 	}
1719 
1720 	error = VOP_OPEN(vn, FREAD, NOCRED, curproc);
1721 	if (error) {
1722 		snprintf(errbuf, errsize,
1723 		    "cannot open disk, 0x%x/0x%x, error %d",
1724 		    dev, rawdev, error);
1725 		goto done;
1726 	}
1727 
1728 	error = VOP_IOCTL(vn, DIOCGDINFO, (caddr_t)dl, FREAD, NOCRED, curproc);
1729 	if (error) {
1730 		snprintf(errbuf, errsize,
1731 		    "cannot read disk label, 0x%x/0x%x, error %d",
1732 		    dev, rawdev, error);
1733 	}
1734 done:
1735 	VOP_CLOSE(vn, FREAD, NOCRED, curproc);
1736 	vput(vn);
1737 	if (error)
1738 		return (errbuf);
1739 	return (NULL);
1740 }
1741 
1742 int
1743 disk_map(char *path, char *mappath, int size, int flags)
1744 {
1745 	struct disk *dk, *mdk;
1746 	u_char uid[8];
1747 	char c, part;
1748 	int i;
1749 
1750 	/*
1751 	 * Attempt to map a request for a disklabel UID to the correct device.
1752 	 * We should be supplied with a disklabel UID which has the following
1753 	 * format:
1754 	 *
1755 	 * [disklabel uid] . [partition]
1756 	 *
1757 	 * Alternatively, if the DM_OPENPART flag is set the disklabel UID can
1758 	 * based passed on its own.
1759 	 */
1760 
1761 	if (strchr(path, '/') != NULL)
1762 		return -1;
1763 
1764 	/* Verify that the device name is properly formed. */
1765 	if (!((strlen(path) == 16 && (flags & DM_OPENPART)) ||
1766 	    (strlen(path) == 18 && path[16] == '.')))
1767 		return -1;
1768 
1769 	/* Get partition. */
1770 	if (flags & DM_OPENPART)
1771 		part = 'a' + RAW_PART;
1772 	else
1773 		part = path[17];
1774 
1775 	if (part < 'a' || part >= 'a' + MAXPARTITIONS)
1776 		return -1;
1777 
1778 	/* Derive label UID. */
1779 	memset(uid, 0, sizeof(uid));
1780 	for (i = 0; i < 16; i++) {
1781 		c = path[i];
1782 		if (c >= '0' && c <= '9')
1783 			c -= '0';
1784 		else if (c >= 'a' && c <= 'f')
1785 			c -= ('a' - 10);
1786 		else
1787 			return -1;
1788 
1789 		uid[i / 2] <<= 4;
1790 		uid[i / 2] |= c & 0xf;
1791 	}
1792 
1793 	mdk = NULL;
1794 	TAILQ_FOREACH(dk, &disklist, dk_link) {
1795 		if ((dk->dk_flags & DKF_LABELVALID) && dk->dk_label &&
1796 		    memcmp(dk->dk_label->d_uid, uid,
1797 		    sizeof(dk->dk_label->d_uid)) == 0) {
1798 			/* Fail if there are duplicate UIDs! */
1799 			if (mdk != NULL)
1800 				return -1;
1801 			mdk = dk;
1802 		}
1803 	}
1804 
1805 	if (mdk == NULL || mdk->dk_name == NULL)
1806 		return -1;
1807 
1808 	snprintf(mappath, size, "/dev/%s%s%c",
1809 	    (flags & DM_OPENBLCK) ? "" : "r", mdk->dk_name, part);
1810 
1811 	return 0;
1812 }
1813 
1814 /*
1815  * Lookup a disk device and verify that it has completed attaching.
1816  */
1817 struct device *
1818 disk_lookup(struct cfdriver *cd, int unit)
1819 {
1820 	struct device *dv;
1821 	struct disk *dk;
1822 
1823 	dv = device_lookup(cd, unit);
1824 	if (dv == NULL)
1825 		return (NULL);
1826 
1827 	TAILQ_FOREACH(dk, &disklist, dk_link)
1828 		if (dk->dk_device == dv)
1829 			break;
1830 
1831 	if (dk == NULL) {
1832 		device_unref(dv);
1833 		return (NULL);
1834 	}
1835 
1836 	return (dv);
1837 }
1838 
1839 int
1840 duid_equal(u_char *duid1, u_char *duid2)
1841 {
1842 	return (memcmp(duid1, duid2, DUID_SIZE) == 0);
1843 }
1844 
1845 int
1846 duid_iszero(u_char *duid)
1847 {
1848 	u_char zeroduid[DUID_SIZE];
1849 
1850 	memset(zeroduid, 0, sizeof(zeroduid));
1851 
1852 	return (duid_equal(duid, zeroduid));
1853 }
1854 
1855 const char *
1856 duid_format(u_char *duid)
1857 {
1858 	static char duid_str[17];
1859 
1860 	snprintf(duid_str, sizeof(duid_str),
1861 	    "%02x%02x%02x%02x%02x%02x%02x%02x",
1862 	    duid[0], duid[1], duid[2], duid[3],
1863 	    duid[4], duid[5], duid[6], duid[7]);
1864 
1865 	return (duid_str);
1866 }
1867