xref: /openbsd-src/sys/kern/subr_disk.c (revision ae3cb403620ab940fbaabb3055fac045a63d56b7)
1 /*	$OpenBSD: subr_disk.c,v 1.232 2017/08/07 11:50:58 kettenis Exp $	*/
2 /*	$NetBSD: subr_disk.c,v 1.17 1996/03/16 23:17:08 christos Exp $	*/
3 
4 /*
5  * Copyright (c) 1995 Jason R. Thorpe.  All rights reserved.
6  * Copyright (c) 1982, 1986, 1988, 1993
7  *	The Regents of the University of California.  All rights reserved.
8  * (c) UNIX System Laboratories, Inc.
9  * All or some portions of this file are derived from material licensed
10  * to the University of California by American Telephone and Telegraph
11  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
12  * the permission of UNIX System Laboratories, Inc.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  * 3. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)ufs_disksubr.c	8.5 (Berkeley) 1/21/94
39  */
40 
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/kernel.h>
44 #include <sys/malloc.h>
45 #include <sys/fcntl.h>
46 #include <sys/buf.h>
47 #include <sys/stat.h>
48 #include <sys/syslog.h>
49 #include <sys/device.h>
50 #include <sys/time.h>
51 #include <sys/disklabel.h>
52 #include <sys/conf.h>
53 #include <sys/lock.h>
54 #include <sys/disk.h>
55 #include <sys/reboot.h>
56 #include <sys/dkio.h>
57 #include <sys/vnode.h>
58 #include <sys/task.h>
59 #include <sys/stdint.h>
60 
61 #include <sys/socket.h>
62 #include <sys/socketvar.h>
63 
64 #include <net/if.h>
65 
66 #include <dev/rndvar.h>
67 #include <dev/cons.h>
68 
69 #include <lib/libz/zlib.h>
70 
71 #include "softraid.h"
72 
/*
 * Debug tracing helper: DPRINTF() passes its arguments straight to
 * printf() when the kernel is built with DEBUG, and expands to nothing
 * otherwise.
 */
#ifdef DEBUG
#define DPRINTF(x...)	printf(x)
#else
#define DPRINTF(x...)
#endif
78 
/*
 * A global list of all disks attached to the system.  May grow or
 * shrink over time.
 */
struct	disklist_head disklist;	/* TAILQ_HEAD */
int	disk_count;		/* number of drives in global disklist */
int	disk_change;		/* set if a disk has been attached/detached
				 * since last we looked at this variable. This
				 * is reset by hw_sysctl().  Also set by
				 * setdisklabel() when a new label is
				 * installed.
				 */

/* Size in bytes of the DUID buffers declared below. */
#define DUID_SIZE 8

u_char	bootduid[DUID_SIZE];	/* DUID of boot disk. */
u_char	rootduid[DUID_SIZE];	/* DUID of root disk. */

/* softraid callback, do not use! */
void (*softraid_disk_attach)(struct disk *, int);

void sr_map_root(void);

/*
 * A task paired with the disk it concerns.
 * NOTE(review): presumably the argument handed to disk_attach_callback();
 * confirm against disk_attach().
 */
struct disk_attach_task {
	struct task task;
	struct disk *dk;
};

void disk_attach_callback(void *);

/* Build a spoofed disklabel from the GPT found on the disk. */
int spoofgptlabel(struct buf *, void (*)(struct buf *), struct disklabel *);

/* GPT validation helpers; each check returns 0 when the check passes. */
int gpt_chk_mbr(struct dos_partition *, u_int64_t);
int gpt_chk_hdr(struct gpt_header *, struct disklabel *);
int gpt_chk_parts(struct gpt_header *, struct gpt_partition *);
/* Map a GPT partition type UUID to an FS_* filesystem type. */
int gpt_get_fstype(struct uuid *);

int duid_equal(u_char *, u_char *);
115 
116 /*
117  * Compute checksum for disk label.
118  */
119 u_int
120 dkcksum(struct disklabel *lp)
121 {
122 	u_int16_t *start, *end;
123 	u_int16_t sum = 0;
124 
125 	start = (u_int16_t *)lp;
126 	end = (u_int16_t *)&lp->d_partitions[lp->d_npartitions];
127 	while (start < end)
128 		sum ^= *start++;
129 	return (sum);
130 }
131 
132 int
133 initdisklabel(struct disklabel *lp)
134 {
135 	int i;
136 
137 	/* minimal requirements for archetypal disk label */
138 	if (lp->d_secsize < DEV_BSIZE)
139 		lp->d_secsize = DEV_BSIZE;
140 	if (DL_GETDSIZE(lp) == 0)
141 		DL_SETDSIZE(lp, MAXDISKSIZE);
142 	if (lp->d_secpercyl == 0)
143 		return (ERANGE);
144 	lp->d_npartitions = MAXPARTITIONS;
145 	for (i = 0; i < RAW_PART; i++) {
146 		DL_SETPSIZE(&lp->d_partitions[i], 0);
147 		DL_SETPOFFSET(&lp->d_partitions[i], 0);
148 	}
149 	if (DL_GETPSIZE(&lp->d_partitions[RAW_PART]) == 0)
150 		DL_SETPSIZE(&lp->d_partitions[RAW_PART], DL_GETDSIZE(lp));
151 	DL_SETPOFFSET(&lp->d_partitions[RAW_PART], 0);
152 	DL_SETBSTART(lp, 0);
153 	DL_SETBEND(lp, DL_GETDSIZE(lp));
154 	lp->d_version = 1;
155 	lp->d_bbsize = 8192;
156 	lp->d_sbsize = 64*1024;			/* XXX ? */
157 	return (0);
158 }
159 
160 /*
161  * Check an incoming block to make sure it is a disklabel, convert it to
162  * a newer version if needed, etc etc.
163  */
164 int
165 checkdisklabel(void *rlp, struct disklabel *lp, u_int64_t boundstart,
166     u_int64_t boundend)
167 {
168 	struct disklabel *dlp = rlp;
169 	struct __partitionv0 *v0pp;
170 	struct partition *pp;
171 	u_int64_t disksize;
172 	int error = 0;
173 	int i;
174 
175 	if (dlp->d_magic != DISKMAGIC || dlp->d_magic2 != DISKMAGIC)
176 		error = ENOENT;	/* no disk label */
177 	else if (dlp->d_npartitions > MAXPARTITIONS)
178 		error = E2BIG;	/* too many partitions */
179 	else if (dlp->d_secpercyl == 0)
180 		error = EINVAL;	/* invalid label */
181 	else if (dlp->d_secsize == 0)
182 		error = ENOSPC;	/* disk too small */
183 	else if (dkcksum(dlp) != 0)
184 		error = EINVAL;	/* incorrect checksum */
185 
186 	if (error) {
187 		u_int16_t *start, *end, sum = 0;
188 
189 		/* If it is byte-swapped, attempt to convert it */
190 		if (swap32(dlp->d_magic) != DISKMAGIC ||
191 		    swap32(dlp->d_magic2) != DISKMAGIC ||
192 		    swap16(dlp->d_npartitions) > MAXPARTITIONS)
193 			return (error);
194 
195 		/*
196 		 * Need a byte-swap aware dkcksum variant
197 		 * inlined, because dkcksum uses a sub-field
198 		 */
199 		start = (u_int16_t *)dlp;
200 		end = (u_int16_t *)&dlp->d_partitions[
201 		    swap16(dlp->d_npartitions)];
202 		while (start < end)
203 			sum ^= *start++;
204 		if (sum != 0)
205 			return (error);
206 
207 		dlp->d_magic = swap32(dlp->d_magic);
208 		dlp->d_type = swap16(dlp->d_type);
209 
210 		/* d_typename and d_packname are strings */
211 
212 		dlp->d_secsize = swap32(dlp->d_secsize);
213 		dlp->d_nsectors = swap32(dlp->d_nsectors);
214 		dlp->d_ntracks = swap32(dlp->d_ntracks);
215 		dlp->d_ncylinders = swap32(dlp->d_ncylinders);
216 		dlp->d_secpercyl = swap32(dlp->d_secpercyl);
217 		dlp->d_secperunit = swap32(dlp->d_secperunit);
218 
219 		/* d_uid is a string */
220 
221 		dlp->d_acylinders = swap32(dlp->d_acylinders);
222 
223 		dlp->d_flags = swap32(dlp->d_flags);
224 
225 		for (i = 0; i < NDDATA; i++)
226 			dlp->d_drivedata[i] = swap32(dlp->d_drivedata[i]);
227 
228 		dlp->d_secperunith = swap16(dlp->d_secperunith);
229 		dlp->d_version = swap16(dlp->d_version);
230 
231 		for (i = 0; i < NSPARE; i++)
232 			dlp->d_spare[i] = swap32(dlp->d_spare[i]);
233 
234 		dlp->d_magic2 = swap32(dlp->d_magic2);
235 
236 		dlp->d_npartitions = swap16(dlp->d_npartitions);
237 		dlp->d_bbsize = swap32(dlp->d_bbsize);
238 		dlp->d_sbsize = swap32(dlp->d_sbsize);
239 
240 		for (i = 0; i < MAXPARTITIONS; i++) {
241 			pp = &dlp->d_partitions[i];
242 			pp->p_size = swap32(pp->p_size);
243 			pp->p_offset = swap32(pp->p_offset);
244 			if (dlp->d_version == 0) {
245 				v0pp = (struct __partitionv0 *)pp;
246 				v0pp->p_fsize = swap32(v0pp->p_fsize);
247 			} else {
248 				pp->p_offseth = swap16(pp->p_offseth);
249 				pp->p_sizeh = swap16(pp->p_sizeh);
250 			}
251 			pp->p_cpg = swap16(pp->p_cpg);
252 		}
253 
254 		dlp->d_checksum = 0;
255 		dlp->d_checksum = dkcksum(dlp);
256 		error = 0;
257 	}
258 
259 	/* XXX should verify lots of other fields and whine a lot */
260 
261 	/* Initial passed in lp contains the real disk size. */
262 	disksize = DL_GETDSIZE(lp);
263 
264 	if (lp != dlp)
265 		*lp = *dlp;
266 
267 	if (lp->d_version == 0) {
268 		lp->d_version = 1;
269 		lp->d_secperunith = 0;
270 
271 		v0pp = (struct __partitionv0 *)lp->d_partitions;
272 		pp = lp->d_partitions;
273 		for (i = 0; i < lp->d_npartitions; i++, pp++, v0pp++) {
274 			pp->p_fragblock = DISKLABELV1_FFS_FRAGBLOCK(v0pp->
275 			    p_fsize, v0pp->p_frag);
276 			pp->p_offseth = 0;
277 			pp->p_sizeh = 0;
278 		}
279 	}
280 
281 #ifdef DEBUG
282 	if (DL_GETDSIZE(lp) != disksize)
283 		printf("on-disk disklabel has incorrect disksize (%llu)\n",
284 		    DL_GETDSIZE(lp));
285 	if (DL_GETPSIZE(&lp->d_partitions[RAW_PART]) != disksize)
286 		printf("on-disk disklabel RAW_PART has incorrect size (%llu)\n",
287 		    DL_GETPSIZE(&lp->d_partitions[RAW_PART]));
288 	if (DL_GETPOFFSET(&lp->d_partitions[RAW_PART]) != 0)
289 		printf("on-disk disklabel RAW_PART offset != 0 (%llu)\n",
290 		    DL_GETPOFFSET(&lp->d_partitions[RAW_PART]));
291 #endif
292 	DL_SETDSIZE(lp, disksize);
293 	DL_SETPSIZE(&lp->d_partitions[RAW_PART], disksize);
294 	DL_SETPOFFSET(&lp->d_partitions[RAW_PART], 0);
295 	DL_SETBSTART(lp, boundstart);
296 	DL_SETBEND(lp, boundend < DL_GETDSIZE(lp) ? boundend : DL_GETDSIZE(lp));
297 
298 	lp->d_checksum = 0;
299 	lp->d_checksum = dkcksum(lp);
300 	return (0);
301 }
302 
303 /*
304  * Read a disk sector.
305  */
306 int
307 readdisksector(struct buf *bp, void (*strat)(struct buf *),
308     struct disklabel *lp, u_int64_t sector)
309 {
310 	bp->b_blkno = DL_SECTOBLK(lp, sector);
311 	bp->b_bcount = lp->d_secsize;
312 	bp->b_error = 0;
313 	CLR(bp->b_flags, B_READ | B_WRITE | B_DONE | B_ERROR);
314 	SET(bp->b_flags, B_BUSY | B_READ | B_RAW);
315 
316 	(*strat)(bp);
317 
318 	return (biowait(bp));
319 }
320 
321 /*
322  * If dos partition table requested, attempt to load it and
323  * find disklabel inside a DOS partition. Return buffer
324  * for use in signalling errors if requested.
325  *
326  * We would like to check if each MBR has a valid BOOT_MAGIC, but
327  * we cannot because it doesn't always exist. So.. we assume the
328  * MBR is valid.
329  */
330 int
331 readdoslabel(struct buf *bp, void (*strat)(struct buf *),
332     struct disklabel *lp, daddr_t *partoffp, int spoofonly)
333 {
334 	struct disklabel *gptlp;
335 	u_int64_t dospartoff = 0, dospartend = DL_GETBEND(lp);
336 	int i, ourpart = -1, wander = 1, n = 0, loop = 0, offset;
337 	struct dos_partition dp[NDOSPART], *dp2;
338 	u_int64_t sector = DOSBBSECTOR;
339 	u_int32_t extoff = 0;
340 	int error;
341 
342 	if (lp->d_secpercyl == 0)
343 		return (EINVAL);	/* invalid label */
344 	if (lp->d_secsize == 0)
345 		return (ENOSPC);	/* disk too small */
346 
347 	/* do DOS partitions in the process of getting disklabel? */
348 
349 	/*
350 	 * Read dos partition table, follow extended partitions.
351 	 * Map the partitions to disklabel entries i-p
352 	 */
353 	while (wander && loop < DOS_MAXEBR) {
354 		loop++;
355 		wander = 0;
356 		if (sector < extoff)
357 			sector = extoff;
358 
359 		/* read MBR/EBR */
360 		error = readdisksector(bp, strat, lp, sector);
361 		if (error) {
362 /*wrong*/		if (partoffp)
363 /*wrong*/			*partoffp = -1;
364 			return (error);
365 		}
366 
367 		bcopy(bp->b_data + DOSPARTOFF, dp, sizeof(dp));
368 
369 		if (n == 0 && sector == DOSBBSECTOR) {
370 			u_int16_t mbrtest;
371 
372 			/* Check the end of sector marker. */
373 			mbrtest = ((bp->b_data[510] << 8) & 0xff00) |
374 			    (bp->b_data[511] & 0xff);
375 			if (mbrtest != 0x55aa)
376 				goto notmbr;
377 
378 			if (gpt_chk_mbr(dp, DL_GETDSIZE(lp)) != 0)
379 				goto notgpt;
380 
381 			gptlp = malloc(sizeof(struct disklabel), M_DEVBUF,
382 			    M_NOWAIT);
383 			if (gptlp == NULL)
384 				return (ENOMEM);
385 			*gptlp = *lp;
386 			error = spoofgptlabel(bp, strat, gptlp);
387 			if (error == 0) {
388 				dospartoff = DL_GETBSTART(gptlp);
389 				dospartend = DL_GETBEND(gptlp);
390 				if (partoffp) {
391 					if (dospartoff == 0)
392 						return (ENXIO);
393 					else
394 						goto notfat;
395 				}
396 				*lp = *gptlp;
397 				free(gptlp, M_DEVBUF,
398 				    sizeof(struct disklabel));
399 				goto notfat;
400 			} else {
401 				free(gptlp, M_DEVBUF,
402 				    sizeof(struct disklabel));
403 				goto notmbr;
404 			}
405 		}
406 
407 notgpt:
408 		if (ourpart == -1) {
409 			/* Search for our MBR partition */
410 			for (dp2=dp, i=0; i < NDOSPART && ourpart == -1;
411 			    i++, dp2++)
412 				if (letoh32(dp2->dp_size) &&
413 				    dp2->dp_typ == DOSPTYP_OPENBSD)
414 					ourpart = i;
415 			if (ourpart == -1)
416 				goto donot;
417 			/*
418 			 * This is our MBR partition. need sector
419 			 * address for SCSI/IDE, cylinder for
420 			 * ESDI/ST506/RLL
421 			 */
422 			dp2 = &dp[ourpart];
423 			dospartoff = letoh32(dp2->dp_start) + sector;
424 			dospartend = dospartoff + letoh32(dp2->dp_size);
425 
426 			/*
427 			 * Record the OpenBSD partition's placement (in
428 			 * 512-byte blocks!) for the caller. No need to
429 			 * finish spoofing.
430 			 */
431 			if (partoffp) {
432 				*partoffp = DL_SECTOBLK(lp, dospartoff);
433 				return (0);
434 			}
435 
436 			if (lp->d_ntracks == 0)
437 				lp->d_ntracks = dp2->dp_ehd + 1;
438 			if (lp->d_nsectors == 0)
439 				lp->d_nsectors = DPSECT(dp2->dp_esect);
440 			if (lp->d_secpercyl == 0)
441 				lp->d_secpercyl = lp->d_ntracks *
442 				    lp->d_nsectors;
443 		}
444 donot:
445 		/*
446 		 * In case the disklabel read below fails, we want to
447 		 * provide a fake label in i-p.
448 		 */
449 		for (dp2=dp, i=0; i < NDOSPART; i++, dp2++) {
450 			struct partition *pp;
451 			u_int8_t fstype;
452 
453 			if (dp2->dp_typ == DOSPTYP_OPENBSD ||
454 			    dp2->dp_typ == DOSPTYP_EFI)
455 				continue;
456 			if (letoh32(dp2->dp_size) > DL_GETDSIZE(lp))
457 				continue;
458 			if (letoh32(dp2->dp_start) > DL_GETDSIZE(lp))
459 				continue;
460 			if (letoh32(dp2->dp_size) == 0)
461 				continue;
462 
463 			switch (dp2->dp_typ) {
464 			case DOSPTYP_UNUSED:
465 				fstype = FS_UNUSED;
466 				break;
467 
468 			case DOSPTYP_LINUX:
469 				fstype = FS_EXT2FS;
470 				break;
471 
472 			case DOSPTYP_NTFS:
473 				fstype = FS_NTFS;
474 				break;
475 
476 			case DOSPTYP_EFISYS:
477 			case DOSPTYP_FAT12:
478 			case DOSPTYP_FAT16S:
479 			case DOSPTYP_FAT16B:
480 			case DOSPTYP_FAT16L:
481 			case DOSPTYP_FAT32:
482 			case DOSPTYP_FAT32L:
483 				fstype = FS_MSDOS;
484 				break;
485 			case DOSPTYP_EXTEND:
486 			case DOSPTYP_EXTENDL:
487 				sector = letoh32(dp2->dp_start) + extoff;
488 				if (!extoff) {
489 					extoff = letoh32(dp2->dp_start);
490 					sector = 0;
491 				}
492 				wander = 1;
493 				continue;
494 				break;
495 			default:
496 				fstype = FS_OTHER;
497 				break;
498 			}
499 
500 			/*
501 			 * Don't set fstype/offset/size when just looking for
502 			 * the offset of the OpenBSD partition. It would
503 			 * invalidate the disklabel checksum!
504 			 *
505 			 * Don't try to spoof more than 8 partitions, i.e.
506 			 * 'i' -'p'.
507 			 */
508 			if (partoffp || n >= 8)
509 				continue;
510 
511 			pp = &lp->d_partitions[8+n];
512 			n++;
513 			pp->p_fstype = fstype;
514 			if (letoh32(dp2->dp_start))
515 				DL_SETPOFFSET(pp,
516 				    letoh32(dp2->dp_start) + sector);
517 			DL_SETPSIZE(pp, letoh32(dp2->dp_size));
518 		}
519 	}
520 
521 notmbr:
522 	if (n == 0 && sector == DOSBBSECTOR && ourpart == -1) {
523 		u_int16_t fattest;
524 
525 		/* Check for a valid initial jmp instruction. */
526 		switch ((u_int8_t)bp->b_data[0]) {
527 		case 0xeb:
528 			/*
529 			 * Two-byte jmp instruction. The 2nd byte is the number
530 			 * of bytes to jmp and the 3rd byte must be a NOP.
531 			 */
532 			if ((u_int8_t)bp->b_data[2] != 0x90)
533 				goto notfat;
534 			break;
535 		case 0xe9:
536 			/*
537 			 * Three-byte jmp instruction. The next two bytes are a
538 			 * little-endian 16 bit value.
539 			 */
540 			break;
541 		default:
542 			goto notfat;
543 			break;
544 		}
545 
546 		/* Check for a valid bytes per sector value. */
547 		fattest = ((bp->b_data[12] << 8) & 0xff00) |
548 		    (bp->b_data[11] & 0xff);
549 		if (fattest < 512 || fattest > 4096 || (fattest % 512 != 0))
550 			goto notfat;
551 
552 		if (partoffp)
553 			return (ENXIO);	/* No place for disklabel on FAT! */
554 
555 		DL_SETPSIZE(&lp->d_partitions['i' - 'a'],
556 		    DL_GETPSIZE(&lp->d_partitions[RAW_PART]));
557 		DL_SETPOFFSET(&lp->d_partitions['i' - 'a'], 0);
558 		lp->d_partitions['i' - 'a'].p_fstype = FS_MSDOS;
559 
560 		spoofonly = 1;	/* No disklabel to read from disk. */
561 	}
562 
563 notfat:
564 	/* record the OpenBSD partition's placement for the caller */
565 	if (partoffp)
566 		*partoffp = DL_SECTOBLK(lp, dospartoff);
567 	else {
568 		DL_SETBSTART(lp, dospartoff);
569 		DL_SETBEND(lp, (dospartend < DL_GETDSIZE(lp)) ? dospartend :
570 		    DL_GETDSIZE(lp));
571 	}
572 
573 	/* don't read the on-disk label if we are in spoofed-only mode */
574 	if (spoofonly)
575 		return (0);
576 
577 	error = readdisksector(bp, strat, lp, dospartoff +
578 	    DL_BLKTOSEC(lp, DOS_LABELSECTOR));
579 	if (error)
580 		return (bp->b_error);
581 
582 	offset = DL_BLKOFFSET(lp, DOS_LABELSECTOR);
583 	error = checkdisklabel(bp->b_data + offset, lp,
584 	    DL_GETBSTART((struct disklabel*)(bp->b_data+offset)),
585 	    DL_GETBEND((struct disklabel *)(bp->b_data+offset)));
586 
587 	return (error);
588 }
589 
590 /*
591  * Returns 0 if the MBR with the provided partition array is a GPT protective
592  * MBR, and returns 1 otherwise. A GPT protective MBR would have one and only
593  * one MBR partition, an EFI partition that either covers the whole disk or as
594  * much of it as is possible with a 32bit size field.
595  *
596  * NOTE: MS always uses a size of UINT32_MAX for the EFI partition!**
597  */
598 int
599 gpt_chk_mbr(struct dos_partition *dp, u_int64_t dsize)
600 {
601 	struct dos_partition *dp2;
602 	int efi, found, i;
603 	u_int32_t psize;
604 
605 	found = efi = 0;
606 	for (dp2=dp, i=0; i < NDOSPART; i++, dp2++) {
607 		if (dp2->dp_typ == DOSPTYP_UNUSED)
608 			continue;
609 		found++;
610 		if (dp2->dp_typ != DOSPTYP_EFI)
611 			continue;
612 		psize = letoh32(dp2->dp_size);
613 		if (psize == (dsize - 1) ||
614 		    psize == UINT32_MAX) {
615 			if (letoh32(dp2->dp_start) == 1)
616 				efi++;
617 		}
618 	}
619 	if (found == 1 && efi == 1)
620 		return (0);
621 
622 	return (1);
623 }
624 
625 int
626 gpt_chk_hdr(struct gpt_header *gh, struct disklabel *lp)
627 {
628 	uint64_t ghpartlba;
629 	uint64_t ghlbaend, ghlbastart;
630 	uint32_t orig_gh_csum;
631 	uint32_t ghsize, ghpartsize, ghpartspersec;
632 
633 	if (letoh64(gh->gh_sig) != GPTSIGNATURE)
634 		return (EINVAL);
635 
636 	if (letoh32(gh->gh_rev) != GPTREVISION)
637 		return (EINVAL);
638 
639 	ghsize = letoh32(gh->gh_size);
640 	ghpartsize = letoh32(gh->gh_part_size);
641 	ghpartspersec = lp->d_secsize / ghpartsize;
642 	ghpartlba = letoh64(gh->gh_part_lba);
643 	ghlbaend = letoh64(gh->gh_lba_end);
644 	ghlbastart = letoh64(gh->gh_lba_start);
645 
646 	if (ghsize < GPTMINHDRSIZE || ghsize > sizeof(struct gpt_header))
647 		return (EINVAL);
648 
649 	orig_gh_csum = gh->gh_csum;
650 	gh->gh_csum = 0;
651 	gh->gh_csum = crc32(0, (unsigned char *)gh, ghsize);
652 
653 	if (orig_gh_csum != gh->gh_csum)
654 		return (EINVAL);
655 
656 	if (ghlbastart >= DL_GETDSIZE(lp) ||
657 	    ghlbaend >= DL_GETDSIZE(lp) ||
658 	    ghpartlba >= DL_GETDSIZE(lp))
659 		return (EINVAL);
660 
661 	/*
662 	* Size per partition entry shall be 128*(2**n) with n >= 0.
663 	* We don't support partition entries larger than block size.
664 	*/
665 	if (ghpartsize % GPTMINPARTSIZE || ghpartsize > lp->d_secsize
666 	    || ghpartspersec == 0) {
667 		DPRINTF("invalid partition size\n");
668 		return (EINVAL);
669 	}
670 
671 	/* XXX: we don't support multiples of GPTMINPARTSIZE yet */
672 	if (ghpartsize != GPTMINPARTSIZE) {
673 		DPRINTF("partition sizes larger than %d bytes are not "
674 		    "supported", GPTMINPARTSIZE);
675 		return (EINVAL);
676 	}
677 
678 	if (letoh64(gh->gh_lba_alt) >= DL_GETDSIZE(lp)) {
679 		DPRINTF("alternate header's position is bogus\n");
680 		return (EINVAL);
681 	}
682 
683 	return 0;
684 }
685 
686 int
687 gpt_chk_parts(struct gpt_header *gh, struct gpt_partition *gp)
688 {
689 	u_int32_t checksum;
690 	checksum = crc32(0, (unsigned char *)gp,
691 	    letoh32(gh->gh_part_num) * letoh32(gh->gh_part_size));
692 
693 	if (checksum != gh->gh_part_csum)
694 		return (EINVAL);
695 
696 	return 0;
697 }
698 
699 int
700 gpt_get_fstype(struct uuid *uuid_part)
701 {
702 	static int init = 0;
703 	static struct uuid uuid_openbsd, uuid_msdos, uuid_chromefs,
704 	    uuid_linux, uuid_hfs, uuid_unused, uuid_efi_system;
705 	static const uint8_t gpt_uuid_openbsd[] = GPT_UUID_OPENBSD;
706 	static const uint8_t gpt_uuid_msdos[] = GPT_UUID_MSDOS;
707 	static const uint8_t gpt_uuid_chromerootfs[] = GPT_UUID_CHROMEROOTFS;
708 	static const uint8_t gpt_uuid_linux[] = GPT_UUID_LINUX;
709 	static const uint8_t gpt_uuid_hfs[] = GPT_UUID_APPLE_HFS;
710 	static const uint8_t gpt_uuid_unused[] = GPT_UUID_UNUSED;
711 	static const uint8_t gpt_uuid_efi_system[] = GPT_UUID_EFI_SYSTEM;
712 
713 	if (init == 0) {
714 		uuid_dec_be(gpt_uuid_openbsd, &uuid_openbsd);
715 		uuid_dec_be(gpt_uuid_msdos, &uuid_msdos);
716 		uuid_dec_be(gpt_uuid_chromerootfs, &uuid_chromefs);
717 		uuid_dec_be(gpt_uuid_linux, &uuid_linux);
718 		uuid_dec_be(gpt_uuid_hfs, &uuid_hfs);
719 		uuid_dec_be(gpt_uuid_unused, &uuid_unused);
720 		uuid_dec_be(gpt_uuid_efi_system, &uuid_efi_system);
721 		init = 1;
722 	}
723 
724 	if (!memcmp(uuid_part, &uuid_unused, sizeof(struct uuid)))
725 		return FS_UNUSED;
726 	else if (!memcmp(uuid_part, &uuid_openbsd, sizeof(struct uuid)))
727 		return FS_BSDFFS;
728 	else if (!memcmp(uuid_part, &uuid_msdos, sizeof(struct uuid)))
729 		return FS_MSDOS;
730 	else if (!memcmp(uuid_part, &uuid_chromefs, sizeof(struct uuid)))
731 		return FS_EXT2FS;
732 	else if (!memcmp(uuid_part, &uuid_linux, sizeof(struct uuid)))
733 		return FS_EXT2FS;
734 	else if (!memcmp(uuid_part, &uuid_hfs, sizeof(struct uuid)))
735 		return FS_HFS;
736 	else if (!memcmp(uuid_part, &uuid_efi_system, sizeof(struct uuid)))
737 		return FS_MSDOS;
738 	else
739 		return FS_OTHER;
740 }
741 
742 /*
743  * Spoof a disklabel based on the GPT information on the disk.
744  */
745 int
746 spoofgptlabel(struct buf *bp, void (*strat)(struct buf *),
747     struct disklabel *lp)
748 {
749 	static const u_int8_t gpt_uuid_openbsd[] = GPT_UUID_OPENBSD;
750 	struct gpt_header gh;
751 	struct uuid uuid_part, uuid_openbsd;
752 	struct gpt_partition *gp, *gp_tmp;
753 	struct partition *pp;
754 	size_t gpsz;
755 	u_int64_t ghlbaend, ghlbastart, gptpartoff, gptpartend, sector;
756 	u_int64_t start, end;
757 	int i, altheader = 0, error, n;
758 	uint32_t ghpartnum;
759 
760 	uuid_dec_be(gpt_uuid_openbsd, &uuid_openbsd);
761 
762 	for (sector = GPTSECTOR; ; sector = DL_GETDSIZE(lp)-1, altheader = 1) {
763 		uint64_t ghpartlba;
764 		uint32_t ghpartsize;
765 		uint32_t ghpartspersec;
766 
767 		error = readdisksector(bp, strat, lp, sector);
768 		if (error) {
769 			DPRINTF("error reading from disk\n");
770 			return (error);
771 		}
772 
773 		bcopy(bp->b_data, &gh, sizeof(gh));
774 
775 		if (gpt_chk_hdr(&gh, lp)) {
776 			if (altheader) {
777 				DPRINTF("alternate header also broken\n");
778 				return (EINVAL);
779 			}
780 			continue;
781 		}
782 
783 		ghpartsize = letoh32(gh.gh_part_size);
784 		ghpartspersec = lp->d_secsize / ghpartsize;
785 		ghpartnum = letoh32(gh.gh_part_num);
786 		ghpartlba = letoh64(gh.gh_part_lba);
787 		ghlbaend = letoh64(gh.gh_lba_end);
788 		ghlbastart = letoh64(gh.gh_lba_start);
789 
790 		/* read GPT partition entry array */
791 		gp = mallocarray(ghpartnum, sizeof(struct gpt_partition),
792 		    M_DEVBUF, M_NOWAIT|M_ZERO);
793 		if (gp == NULL)
794 			return (ENOMEM);
795 		gpsz = ghpartnum * sizeof(struct gpt_partition);
796 
797 		/*
798 		* XXX:	Fails if # of partition entries is not a multiple of
799 		*	ghpartspersec.
800 		*/
801 		sector = ghpartlba;
802 		for (i = 0; i < ghpartnum / ghpartspersec; i++, sector++) {
803 			error = readdisksector(bp, strat, lp, sector);
804 			if (error) {
805 				free(gp, M_DEVBUF, gpsz);
806 				return (error);
807 			}
808 
809 			bcopy(bp->b_data, gp + i * ghpartspersec,
810 			    ghpartspersec * sizeof(struct gpt_partition));
811 		}
812 
813 		if (gpt_chk_parts(&gh, gp)) {
814 			free(gp, M_DEVBUF, gpsz);
815 			if (altheader) {
816 				DPRINTF("alternate partition entries are also "
817 				    "broken\n");
818 				return (EINVAL);
819 			}
820 			continue;
821 		}
822 		break;
823 	}
824 
825 	/* Find OpenBSD partition and spoof others along the way. */
826 	n = 0;
827 	gptpartoff = 0;
828 	gptpartend = DL_GETBEND(lp);
829 	for (gp_tmp = gp, i = 0; i < ghpartnum; gp_tmp++, i++) {
830 		start = letoh64(gp_tmp->gp_lba_start);
831 		end = letoh64(gp_tmp->gp_lba_end);
832 		if (start > end || start < ghlbastart || end > ghlbaend)
833 			continue; /* entry invalid */
834 
835 		uuid_dec_le(&gp_tmp->gp_type, &uuid_part);
836 		if (!memcmp(&uuid_part, &uuid_openbsd, sizeof(struct uuid))) {
837 			if (gptpartoff == 0) {
838 				gptpartoff = start;
839 				gptpartend = end + 1;
840 			}
841 			continue; /* Do *NOT* spoof OpenBSD partitions! */
842 		}
843 
844 		 /*
845 		 * Don't try to spoof more than 8 partitions, i.e.
846 		 * 'i' -'p'.
847 		 */
848 		if (n >= 8)
849 			continue;
850 
851 		pp = &lp->d_partitions[8+n];
852 		n++;
853 		pp->p_fstype = gpt_get_fstype(&uuid_part);
854 		DL_SETPOFFSET(pp, start);
855 		DL_SETPSIZE(pp, end - start + 1);
856 	}
857 
858 	free(gp, M_DEVBUF, gpsz);
859 
860 	DL_SETBSTART(lp, gptpartoff);
861 	DL_SETBEND(lp, (gptpartend < DL_GETDSIZE(lp)) ? gptpartend :
862 	    DL_GETDSIZE(lp));
863 
864 	return (0);
865 }
866 
867 /*
868  * Check new disk label for sensibility before setting it.
869  */
870 int
871 setdisklabel(struct disklabel *olp, struct disklabel *nlp, u_int openmask)
872 {
873 	struct partition *opp, *npp;
874 	struct disk *dk;
875 	int i;
876 
877 	/* sanity clause */
878 	if (nlp->d_secpercyl == 0 || nlp->d_secsize == 0 ||
879 	    (nlp->d_secsize % DEV_BSIZE) != 0)
880 		return (EINVAL);
881 
882 	/* special case to allow disklabel to be invalidated */
883 	if (nlp->d_magic == 0xffffffff) {
884 		*olp = *nlp;
885 		return (0);
886 	}
887 
888 	if (nlp->d_magic != DISKMAGIC || nlp->d_magic2 != DISKMAGIC ||
889 	    dkcksum(nlp) != 0)
890 		return (EINVAL);
891 
892 	/* XXX missing check if other dos partitions will be overwritten */
893 
894 	for (i = 0; i < MAXPARTITIONS; i++) {
895 		opp = &olp->d_partitions[i];
896 		npp = &nlp->d_partitions[i];
897 		if ((openmask & (1 << i)) &&
898 		    (DL_GETPOFFSET(npp) != DL_GETPOFFSET(opp) ||
899 		    DL_GETPSIZE(npp) < DL_GETPSIZE(opp)))
900 			return (EBUSY);
901 		/*
902 		 * Copy internally-set partition information
903 		 * if new label doesn't include it.		XXX
904 		 */
905 		if (npp->p_fstype == FS_UNUSED && opp->p_fstype != FS_UNUSED) {
906 			npp->p_fragblock = opp->p_fragblock;
907 			npp->p_cpg = opp->p_cpg;
908 		}
909 	}
910 
911 	/* Generate a UID if the disklabel does not already have one. */
912 	if (duid_iszero(nlp->d_uid)) {
913 		do {
914 			arc4random_buf(nlp->d_uid, sizeof(nlp->d_uid));
915 			TAILQ_FOREACH(dk, &disklist, dk_link)
916 				if (dk->dk_label &&
917 				    duid_equal(dk->dk_label->d_uid, nlp->d_uid))
918 					break;
919 		} while (dk != NULL || duid_iszero(nlp->d_uid));
920 	}
921 
922 	/* Preserve the disk size and RAW_PART values. */
923 	DL_SETDSIZE(nlp, DL_GETDSIZE(olp));
924 	npp = &nlp->d_partitions[RAW_PART];
925 	DL_SETPOFFSET(npp, 0);
926 	DL_SETPSIZE(npp, DL_GETDSIZE(nlp));
927 
928 	nlp->d_checksum = 0;
929 	nlp->d_checksum = dkcksum(nlp);
930 	*olp = *nlp;
931 
932 	disk_change = 1;
933 
934 	return (0);
935 }
936 
937 /*
938  * Determine the size of the transfer, and make sure it is within the
939  * boundaries of the partition. Adjust transfer if needed, and signal errors or
940  * early completion.
941  */
942 int
943 bounds_check_with_label(struct buf *bp, struct disklabel *lp)
944 {
945 	struct partition *p = &lp->d_partitions[DISKPART(bp->b_dev)];
946 	daddr_t partblocks, sz;
947 
948 	/* Avoid division by zero, negative offsets, and negative sizes. */
949 	if (lp->d_secpercyl == 0 || bp->b_blkno < 0 || bp->b_bcount < 0)
950 		goto bad;
951 
952 	/* Ensure transfer is a whole number of aligned sectors. */
953 	if ((bp->b_blkno % DL_BLKSPERSEC(lp)) != 0 ||
954 	    (bp->b_bcount % lp->d_secsize) != 0)
955 		goto bad;
956 
957 	/* Ensure transfer starts within partition boundary. */
958 	partblocks = DL_SECTOBLK(lp, DL_GETPSIZE(p));
959 	if (bp->b_blkno > partblocks)
960 		goto bad;
961 
962 	/* If exactly at end of partition or null transfer, return EOF. */
963 	if (bp->b_blkno == partblocks || bp->b_bcount == 0)
964 		goto done;
965 
966 	/* Truncate request if it extends past the end of the partition. */
967 	sz = bp->b_bcount >> DEV_BSHIFT;
968 	if (sz > partblocks - bp->b_blkno) {
969 		sz = partblocks - bp->b_blkno;
970 		bp->b_bcount = sz << DEV_BSHIFT;
971 	}
972 
973 	return (0);
974 
975  bad:
976 	bp->b_error = EINVAL;
977 	bp->b_flags |= B_ERROR;
978  done:
979 	bp->b_resid = bp->b_bcount;
980 	return (-1);
981 }
982 
983 /*
984  * Disk error is the preface to plaintive error messages
985  * about failing disk transfers.  It prints messages of the form
986 
987 hp0g: hard error reading fsbn 12345 of 12344-12347 (hp0 bn %d cn %d tn %d sn %d)
988 
989  * if the offset of the error in the transfer and a disk label
990  * are both available.  blkdone should be -1 if the position of the error
991  * is unknown; the disklabel pointer may be null from drivers that have not
992  * been converted to use them.  The message is printed with printf
993  * if pri is LOG_PRINTF, otherwise it uses log at the specified priority.
994  * The message should be completed (with at least a newline) with printf
995  * or addlog, respectively.  There is no trailing space.
996  */
997 void
998 diskerr(struct buf *bp, char *dname, char *what, int pri, int blkdone,
999     struct disklabel *lp)
1000 {
1001 	int unit = DISKUNIT(bp->b_dev), part = DISKPART(bp->b_dev);
1002 	int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2)));
1003 	char partname = 'a' + part;
1004 	daddr_t sn;
1005 
1006 	if (pri != LOG_PRINTF) {
1007 		log(pri, "%s", "");
1008 		pr = addlog;
1009 	} else
1010 		pr = printf;
1011 	(*pr)("%s%d%c: %s %sing fsbn ", dname, unit, partname, what,
1012 	    bp->b_flags & B_READ ? "read" : "writ");
1013 	sn = bp->b_blkno;
1014 	if (bp->b_bcount <= DEV_BSIZE)
1015 		(*pr)("%lld", (long long)sn);
1016 	else {
1017 		if (blkdone >= 0) {
1018 			sn += blkdone;
1019 			(*pr)("%lld of ", (long long)sn);
1020 		}
1021 		(*pr)("%lld-%lld", (long long)bp->b_blkno,
1022 		    (long long)(bp->b_blkno + (bp->b_bcount - 1) / DEV_BSIZE));
1023 	}
1024 	if (lp && (blkdone >= 0 || bp->b_bcount <= lp->d_secsize)) {
1025 		sn += DL_SECTOBLK(lp, DL_GETPOFFSET(&lp->d_partitions[part]));
1026 		(*pr)(" (%s%d bn %lld; cn %lld", dname, unit, (long long)sn,
1027 		    (long long)(sn / DL_SECTOBLK(lp, lp->d_secpercyl)));
1028 		sn %= DL_SECTOBLK(lp, lp->d_secpercyl);
1029 		(*pr)(" tn %lld sn %lld)",
1030 		    (long long)(sn / DL_SECTOBLK(lp, lp->d_nsectors)),
1031 		    (long long)(sn % DL_SECTOBLK(lp, lp->d_nsectors)));
1032 	}
1033 }
1034 
1035 /*
1036  * Initialize the disklist.  Called by main() before autoconfiguration.
1037  */
1038 void
1039 disk_init(void)
1040 {
1041 
1042 	TAILQ_INIT(&disklist);
1043 	disk_count = disk_change = 0;
1044 }
1045 
1046 int
1047 disk_construct(struct disk *diskp)
1048 {
1049 	rw_init_flags(&diskp->dk_lock, "dklk", RWL_IS_VNODE);
1050 	mtx_init(&diskp->dk_mtx, IPL_BIO);
1051 
1052 	diskp->dk_flags |= DKF_CONSTRUCTED;
1053 
1054 	return (0);
1055 }
1056 
1057 /*
1058  * Attach a disk.
1059  */
1060 void
1061 disk_attach(struct device *dv, struct disk *diskp)
1062 {
1063 	int majdev;
1064 
1065 	if (!ISSET(diskp->dk_flags, DKF_CONSTRUCTED))
1066 		disk_construct(diskp);
1067 
1068 	/*
1069 	 * Allocate and initialize the disklabel structures.  Note that
1070 	 * it's not safe to sleep here, since we're probably going to be
1071 	 * called during autoconfiguration.
1072 	 */
1073 	diskp->dk_label = malloc(sizeof(struct disklabel), M_DEVBUF,
1074 	    M_NOWAIT|M_ZERO);
1075 	if (diskp->dk_label == NULL)
1076 		panic("disk_attach: can't allocate storage for disklabel");
1077 
1078 	/*
1079 	 * Set the attached timestamp.
1080 	 */
1081 	microuptime(&diskp->dk_attachtime);
1082 
1083 	/*
1084 	 * Link into the disklist.
1085 	 */
1086 	TAILQ_INSERT_TAIL(&disklist, diskp, dk_link);
1087 	++disk_count;
1088 	disk_change = 1;
1089 
1090 	/*
1091 	 * Store device structure and number for later use.
1092 	 */
1093 	diskp->dk_device = dv;
1094 	diskp->dk_devno = NODEV;
1095 	if (dv != NULL) {
1096 		majdev = findblkmajor(dv);
1097 		if (majdev >= 0)
1098 			diskp->dk_devno =
1099 			    MAKEDISKDEV(majdev, dv->dv_unit, RAW_PART);
1100 
1101 		if (diskp->dk_devno != NODEV) {
1102 			struct disk_attach_task *dat;
1103 
1104 			dat = malloc(sizeof(*dat), M_TEMP, M_WAITOK);
1105 
1106 			/* XXX: Assumes dk is part of the device softc. */
1107 			device_ref(dv);
1108 			dat->dk = diskp;
1109 
1110 			task_set(&dat->task, disk_attach_callback, dat);
1111 			task_add(systq, &dat->task);
1112 		}
1113 	}
1114 
1115 	if (softraid_disk_attach)
1116 		softraid_disk_attach(diskp, 1);
1117 }
1118 
1119 void
1120 disk_attach_callback(void *xdat)
1121 {
1122 	struct disk_attach_task *dat = xdat;
1123 	struct disk *dk = dat->dk;
1124 	struct disklabel dl;
1125 	char errbuf[100];
1126 
1127 	free(dat, M_TEMP, sizeof(*dat));
1128 
1129 	if (dk->dk_flags & (DKF_OPENED | DKF_NOLABELREAD))
1130 		goto done;
1131 
1132 	/* Read disklabel. */
1133 	if (disk_readlabel(&dl, dk->dk_devno, errbuf, sizeof(errbuf)) == NULL) {
1134 		add_timer_randomness(dl.d_checksum);
1135 		dk->dk_flags |= DKF_LABELVALID;
1136 	}
1137 
1138 done:
1139 	dk->dk_flags |= DKF_OPENED;
1140 	device_unref(dk->dk_device);
1141 	wakeup(dk);
1142 }
1143 
1144 /*
1145  * Detach a disk.
1146  */
1147 void
1148 disk_detach(struct disk *diskp)
1149 {
1150 
1151 	if (softraid_disk_attach)
1152 		softraid_disk_attach(diskp, -1);
1153 
1154 	/*
1155 	 * Free the space used by the disklabel structures.
1156 	 */
1157 	free(diskp->dk_label, M_DEVBUF, sizeof(*diskp->dk_label));
1158 
1159 	/*
1160 	 * Remove from the disklist.
1161 	 */
1162 	TAILQ_REMOVE(&disklist, diskp, dk_link);
1163 	disk_change = 1;
1164 	if (--disk_count < 0)
1165 		panic("disk_detach: disk_count < 0");
1166 }
1167 
1168 int
1169 disk_openpart(struct disk *dk, int part, int fmt, int haslabel)
1170 {
1171 	KASSERT(part >= 0 && part < MAXPARTITIONS);
1172 
1173 	/* Unless opening the raw partition, check that the partition exists. */
1174 	if (part != RAW_PART && (!haslabel ||
1175 	    part >= dk->dk_label->d_npartitions ||
1176 	    dk->dk_label->d_partitions[part].p_fstype == FS_UNUSED))
1177 		return (ENXIO);
1178 
1179 	/* Ensure the partition doesn't get changed under our feet. */
1180 	switch (fmt) {
1181 	case S_IFCHR:
1182 		dk->dk_copenmask |= (1 << part);
1183 		break;
1184 	case S_IFBLK:
1185 		dk->dk_bopenmask |= (1 << part);
1186 		break;
1187 	}
1188 	dk->dk_openmask = dk->dk_copenmask | dk->dk_bopenmask;
1189 
1190 	return (0);
1191 }
1192 
1193 void
1194 disk_closepart(struct disk *dk, int part, int fmt)
1195 {
1196 	KASSERT(part >= 0 && part < MAXPARTITIONS);
1197 
1198 	switch (fmt) {
1199 	case S_IFCHR:
1200 		dk->dk_copenmask &= ~(1 << part);
1201 		break;
1202 	case S_IFBLK:
1203 		dk->dk_bopenmask &= ~(1 << part);
1204 		break;
1205 	}
1206 	dk->dk_openmask = dk->dk_copenmask | dk->dk_bopenmask;
1207 }
1208 
1209 void
1210 disk_gone(int (*open)(dev_t, int, int, struct proc *), int unit)
1211 {
1212 	int bmaj, cmaj, mn;
1213 
1214 	/* Locate the lowest minor number to be detached. */
1215 	mn = DISKMINOR(unit, 0);
1216 
1217 	for (bmaj = 0; bmaj < nblkdev; bmaj++)
1218 		if (bdevsw[bmaj].d_open == open)
1219 			vdevgone(bmaj, mn, mn + MAXPARTITIONS - 1, VBLK);
1220 	for (cmaj = 0; cmaj < nchrdev; cmaj++)
1221 		if (cdevsw[cmaj].d_open == open)
1222 			vdevgone(cmaj, mn, mn + MAXPARTITIONS - 1, VCHR);
1223 }
1224 
1225 /*
1226  * Increment a disk's busy counter.  If the counter is going from
1227  * 0 to 1, set the timestamp.
1228  */
1229 void
1230 disk_busy(struct disk *diskp)
1231 {
1232 
1233 	/*
1234 	 * XXX We'd like to use something as accurate as microtime(),
1235 	 * but that doesn't depend on the system TOD clock.
1236 	 */
1237 	mtx_enter(&diskp->dk_mtx);
1238 	if (diskp->dk_busy++ == 0)
1239 		microuptime(&diskp->dk_timestamp);
1240 	mtx_leave(&diskp->dk_mtx);
1241 }
1242 
1243 /*
1244  * Decrement a disk's busy counter, increment the byte count, total busy
1245  * time, and reset the timestamp.
1246  */
1247 void
1248 disk_unbusy(struct disk *diskp, long bcount, daddr_t blkno, int read)
1249 {
1250 	struct timeval dv_time, diff_time;
1251 
1252 	mtx_enter(&diskp->dk_mtx);
1253 
1254 	if (diskp->dk_busy-- == 0)
1255 		printf("disk_unbusy: %s: dk_busy < 0\n", diskp->dk_name);
1256 
1257 	microuptime(&dv_time);
1258 
1259 	timersub(&dv_time, &diskp->dk_timestamp, &diff_time);
1260 	timeradd(&diskp->dk_time, &diff_time, &diskp->dk_time);
1261 
1262 	diskp->dk_timestamp = dv_time;
1263 	if (bcount > 0) {
1264 		if (read) {
1265 			diskp->dk_rbytes += bcount;
1266 			diskp->dk_rxfer++;
1267 		} else {
1268 			diskp->dk_wbytes += bcount;
1269 			diskp->dk_wxfer++;
1270 		}
1271 	} else
1272 		diskp->dk_seek++;
1273 
1274 	mtx_leave(&diskp->dk_mtx);
1275 
1276 	add_disk_randomness(bcount ^ diff_time.tv_usec ^
1277 	    (blkno >> 32) ^ (blkno & 0xffffffff));
1278 }
1279 
1280 int
1281 disk_lock(struct disk *dk)
1282 {
1283 	return (rw_enter(&dk->dk_lock, RW_WRITE|RW_INTR));
1284 }
1285 
1286 void
1287 disk_lock_nointr(struct disk *dk)
1288 {
1289 	rw_enter_write(&dk->dk_lock);
1290 }
1291 
1292 void
1293 disk_unlock(struct disk *dk)
1294 {
1295 	rw_exit_write(&dk->dk_lock);
1296 }
1297 
1298 int
1299 dk_mountroot(void)
1300 {
1301 	char errbuf[100];
1302 	int part = DISKPART(rootdev);
1303 	int (*mountrootfn)(void);
1304 	struct disklabel dl;
1305 	char *error;
1306 
1307 	error = disk_readlabel(&dl, rootdev, errbuf, sizeof(errbuf));
1308 	if (error)
1309 		panic("%s", error);
1310 
1311 	if (DL_GETPSIZE(&dl.d_partitions[part]) == 0)
1312 		panic("root filesystem has size 0");
1313 	switch (dl.d_partitions[part].p_fstype) {
1314 #ifdef EXT2FS
1315 	case FS_EXT2FS:
1316 		{
1317 		extern int ext2fs_mountroot(void);
1318 		mountrootfn = ext2fs_mountroot;
1319 		}
1320 		break;
1321 #endif
1322 #ifdef FFS
1323 	case FS_BSDFFS:
1324 		{
1325 		extern int ffs_mountroot(void);
1326 		mountrootfn = ffs_mountroot;
1327 		}
1328 		break;
1329 #endif
1330 #ifdef CD9660
1331 	case FS_ISO9660:
1332 		{
1333 		extern int cd9660_mountroot(void);
1334 		mountrootfn = cd9660_mountroot;
1335 		}
1336 		break;
1337 #endif
1338 	default:
1339 #ifdef FFS
1340 		{
1341 		extern int ffs_mountroot(void);
1342 
1343 		printf("filesystem type %d not known.. assuming ffs\n",
1344 		    dl.d_partitions[part].p_fstype);
1345 		mountrootfn = ffs_mountroot;
1346 		}
1347 #else
1348 		panic("disk 0x%x filesystem type %d not known",
1349 		    rootdev, dl.d_partitions[part].p_fstype);
1350 #endif
1351 	}
1352 	return (*mountrootfn)();
1353 }
1354 
1355 struct device *
1356 getdisk(char *str, int len, int defpart, dev_t *devp)
1357 {
1358 	struct device *dv;
1359 
1360 	if ((dv = parsedisk(str, len, defpart, devp)) == NULL) {
1361 		printf("use one of: exit");
1362 		TAILQ_FOREACH(dv, &alldevs, dv_list) {
1363 			if (dv->dv_class == DV_DISK)
1364 				printf(" %s[a-p]", dv->dv_xname);
1365 #if defined(NFSCLIENT)
1366 			if (dv->dv_class == DV_IFNET)
1367 				printf(" %s", dv->dv_xname);
1368 #endif
1369 		}
1370 		printf("\n");
1371 	}
1372 	return (dv);
1373 }
1374 
1375 struct device *
1376 parsedisk(char *str, int len, int defpart, dev_t *devp)
1377 {
1378 	struct device *dv;
1379 	int majdev, part = defpart;
1380 	char c;
1381 
1382 	if (len == 0)
1383 		return (NULL);
1384 	c = str[len-1];
1385 	if (c >= 'a' && (c - 'a') < MAXPARTITIONS) {
1386 		part = c - 'a';
1387 		len -= 1;
1388 	}
1389 
1390 	TAILQ_FOREACH(dv, &alldevs, dv_list) {
1391 		if (dv->dv_class == DV_DISK &&
1392 		    strncmp(str, dv->dv_xname, len) == 0 &&
1393 		    dv->dv_xname[len] == '\0') {
1394 			majdev = findblkmajor(dv);
1395 			if (majdev < 0)
1396 				return NULL;
1397 			*devp = MAKEDISKDEV(majdev, dv->dv_unit, part);
1398 			break;
1399 		}
1400 #if defined(NFSCLIENT)
1401 		if (dv->dv_class == DV_IFNET &&
1402 		    strncmp(str, dv->dv_xname, len) == 0 &&
1403 		    dv->dv_xname[len] == '\0') {
1404 			*devp = NODEV;
1405 			break;
1406 		}
1407 #endif
1408 	}
1409 
1410 	return (dv);
1411 }
1412 
/*
 * Work out the root, swap and dump devices and the mountroot routine.
 *
 * bootdv is the candidate boot device (may be NULL), part the default
 * root partition and exitflags the argument given to reboot() when
 * "exit" is typed at one of the prompts.
 *
 * After waiting for pending disk attach callbacks and settling the boot
 * and root DUIDs, one of four paths is taken:
 *  - RB_ASKNAME set in boothowto: prompt for the root and swap devices;
 *  - mountroot is nfs_mountroot (NFSCLIENT only): no root/swap/dump devices;
 *  - mountroot is NULL and rootdev is NODEV: `swap generic', derive the
 *    devices from the boot device or the root DUID;
 *  - otherwise: the configuration is fixed, only look up the root device.
 * Finally mountroot is set according to the class of the root device and
 * the chosen devices are printed.
 */
void
setroot(struct device *bootdv, int part, int exitflags)
{
	int majdev, unit, len, s, slept = 0;
	struct swdevt *swp;
	struct device *rootdv, *dv;
	dev_t nrootdev, nswapdev = NODEV, temp = NODEV;
	struct ifnet *ifp = NULL;
	struct disk *dk;
	char buf[128];
#if defined(NFSCLIENT)
	extern char *nfsbootdevname;
#endif

	/*
	 * Ensure that all disk attach callbacks have completed.
	 * Each pass sleeps (timeout hz) on the first disk that has a device
	 * number but is not yet DKF_OPENED; give up after five sleeps.
	 */
	do {
		TAILQ_FOREACH(dk, &disklist, dk_link) {
			if (dk->dk_devno != NODEV &&
			    (dk->dk_flags & DKF_OPENED) == 0) {
				tsleep(dk, 0, "dkopen", hz);
				slept++;
				break;
			}
		}
	} while (dk != NULL && slept < 5);

	/* Name the disks that still have not been opened. */
	if (slept == 5) {
		printf("disklabels not read:");
		TAILQ_FOREACH(dk, &disklist, dk_link)
			if (dk->dk_devno != NODEV &&
			    (dk->dk_flags & DKF_OPENED) == 0)
				printf(" %s", dk->dk_name);
		printf("\n");
	}

	if (duid_iszero(bootduid)) {
		/* Locate DUID for boot disk since it was not provided. */
		TAILQ_FOREACH(dk, &disklist, dk_link)
			if (dk->dk_device == bootdv)
				break;
		if (dk && (dk->dk_flags & DKF_LABELVALID))
			bcopy(dk->dk_label->d_uid, bootduid, sizeof(bootduid));
	} else if (bootdv == NULL) {
		/*
		 * Locate boot disk based on the provided DUID.
		 * NOTE(review): the search stops at the first label whose
		 * d_uid matches, and DKF_LABELVALID is only checked after
		 * that; confirm a matching but invalid label cannot hide a
		 * valid one further down the list.
		 */
		TAILQ_FOREACH(dk, &disklist, dk_link)
			if (duid_equal(dk->dk_label->d_uid, bootduid))
				break;
		if (dk && (dk->dk_flags & DKF_LABELVALID))
			bootdv = dk->dk_device;
	}
	bcopy(bootduid, rootduid, sizeof(rootduid));

#if NSOFTRAID > 0
	sr_map_root();
#endif

	/*
	 * If `swap generic' and we couldn't determine boot device,
	 * ask the user.
	 */
	dk = NULL;
	if (mountroot == NULL && bootdv == NULL)
		boothowto |= RB_ASKNAME;
	if (boothowto & RB_ASKNAME) {
		/* Prompt until a usable root device has been entered. */
		while (1) {
			printf("root device");
			if (bootdv != NULL) {
				printf(" (default %s", bootdv->dv_xname);
				if (bootdv->dv_class == DV_DISK)
					printf("%c", 'a' + part);
				printf(")");
			}
			printf(": ");
			s = splhigh();
			cnpollc(1);
			len = getsn(buf, sizeof(buf));
			cnpollc(0);
			splx(s);
			if (strcmp(buf, "exit") == 0)
				reboot(exitflags);
			/* An empty answer selects the default, if there is one. */
			if (len == 0 && bootdv != NULL) {
				strlcpy(buf, bootdv->dv_xname, sizeof buf);
				len = strlen(buf);
			}
			/*
			 * A trailing '*' is stripped; if the remaining name
			 * resolves, the same device is used for swap and the
			 * swap prompt is skipped.
			 */
			if (len > 0 && buf[len - 1] == '*') {
				buf[--len] = '\0';
				dv = getdisk(buf, len, part, &nrootdev);
				if (dv != NULL) {
					rootdv = dv;
					nswapdev = nrootdev;
					goto gotswap;
				}
			}
			dv = getdisk(buf, len, part, &nrootdev);
			if (dv != NULL) {
				rootdv = dv;
				break;
			}
		}

		/* No swap prompt when root is on a network interface. */
		if (rootdv->dv_class == DV_IFNET)
			goto gotswap;

		/* try to build swap device out of new root device */
		while (1) {
			printf("swap device");
			if (rootdv != NULL)
				printf(" (default %s%s)", rootdv->dv_xname,
				    rootdv->dv_class == DV_DISK ? "b" : "");
			printf(": ");
			s = splhigh();
			cnpollc(1);
			len = getsn(buf, sizeof(buf));
			cnpollc(0);
			splx(s);
			if (strcmp(buf, "exit") == 0)
				reboot(exitflags);
			if (len == 0 && rootdv != NULL) {
				switch (rootdv->dv_class) {
				case DV_IFNET:
					nswapdev = NODEV;
					break;
				case DV_DISK:
					/* Default: partition 1 ('b') of the root disk. */
					nswapdev = MAKEDISKDEV(major(nrootdev),
					    DISKUNIT(nrootdev), 1);
					/* Root and swap may not coincide; ask again. */
					if (nswapdev == nrootdev)
						continue;
					break;
				default:
					break;
				}
				break;
			}
			dv = getdisk(buf, len, 1, &nswapdev);
			if (dv) {
				if (dv->dv_class == DV_IFNET)
					nswapdev = NODEV;
				if (nswapdev == nrootdev)
					continue;
				break;
			}
		}
gotswap:
		rootdev = nrootdev;
		dumpdev = nswapdev;
		swdevt[0].sw_dev = nswapdev;
		swdevt[1].sw_dev = NODEV;
#if defined(NFSCLIENT)
	} else if (mountroot == nfs_mountroot) {
		rootdv = bootdv;
		rootdev = dumpdev = swapdev = NODEV;
#endif
	} else if (mountroot == NULL && rootdev == NODEV) {
		/*
		 * `swap generic'
		 */
		rootdv = bootdv;

		/* With a non-zero root DUID, the disk carrying it wins. */
		if (bootdv->dv_class == DV_DISK) {
			if (!duid_iszero(rootduid)) {
				TAILQ_FOREACH(dk, &disklist, dk_link)
					if ((dk->dk_flags & DKF_LABELVALID) &&
					    dk->dk_label && duid_equal(
					    dk->dk_label->d_uid, rootduid))
						break;
				if (dk == NULL)
					panic("root device (%s) not found",
					    duid_format(rootduid));
				rootdv = dk->dk_device;
			}
		}

		majdev = findblkmajor(rootdv);
		if (majdev >= 0) {
			/*
			 * Root and swap are on the disk.
			 * Assume swap is on partition b.
			 */
			rootdev = MAKEDISKDEV(majdev, rootdv->dv_unit, part);
			nswapdev = MAKEDISKDEV(majdev, rootdv->dv_unit, 1);
		} else {
			/*
			 * Root and swap are on a net.
			 */
			nswapdev = NODEV;
		}
		dumpdev = nswapdev;
		swdevt[0].sw_dev = nswapdev;
		/* swdevt[1].sw_dev = NODEV; */
	} else {
		/* Completely pre-configured, but we want rootdv .. */
		majdev = major(rootdev);
		if (findblkname(majdev) == NULL)
			return;
		unit = DISKUNIT(rootdev);
		part = DISKPART(rootdev);
		/* Rebuild the device name from rootdev and look it up. */
		snprintf(buf, sizeof buf, "%s%d%c",
		    findblkname(majdev), unit, 'a' + part);
		rootdv = parsedisk(buf, strlen(buf), 0, &nrootdev);
		if (rootdv == NULL)
			panic("root device (%s) not found", buf);
	}

	/* A network interface used for booting joins the "netboot" group. */
	if (rootdv && rootdv == bootdv && rootdv->dv_class == DV_IFNET)
		ifp = ifunit(rootdv->dv_xname);
	else if (bootdv && bootdv->dv_class == DV_IFNET)
		ifp = ifunit(bootdv->dv_xname);

	if (ifp)
		if_addgroup(ifp, "netboot");

	/*
	 * NOTE(review): rootdv is dereferenced here without a NULL check,
	 * although the test above allows for it being NULL (the NFS path
	 * copies bootdv, which may be NULL); confirm this cannot happen.
	 */
	switch (rootdv->dv_class) {
#if defined(NFSCLIENT)
	case DV_IFNET:
		mountroot = nfs_mountroot;
		nfsbootdevname = rootdv->dv_xname;
		return;
#endif
	case DV_DISK:
		mountroot = dk_mountroot;
		part = DISKPART(rootdev);
		break;
	default:
		printf("can't figure root, hope your kernel is right\n");
		return;
	}

	printf("root on %s%c", rootdv->dv_xname, 'a' + part);

	/* dk is only non-NULL here if the root DUID search above found it. */
	if (dk && dk->dk_device == rootdv)
		printf(" (%s.%c)", duid_format(rootduid), 'a' + part);

	/*
	 * Make the swap partition on the root drive the primary swap.
	 */
	for (swp = swdevt; swp->sw_dev != NODEV; swp++) {
		if (major(rootdev) == major(swp->sw_dev) &&
		    DISKUNIT(rootdev) == DISKUNIT(swp->sw_dev)) {
			temp = swdevt[0].sw_dev;
			swdevt[0].sw_dev = swp->sw_dev;
			swp->sw_dev = temp;
			break;
		}
	}
	if (swp->sw_dev != NODEV) {
		/*
		 * If dumpdev was the same as the old primary swap device,
		 * move it to the new primary swap device.
		 */
		if (temp == dumpdev)
			dumpdev = swdevt[0].sw_dev;
	}
	if (swdevt[0].sw_dev != NODEV)
		printf(" swap on %s%d%c", findblkname(major(swdevt[0].sw_dev)),
		    DISKUNIT(swdevt[0].sw_dev),
		    'a' + DISKPART(swdevt[0].sw_dev));
	if (dumpdev != NODEV)
		printf(" dump on %s%d%c", findblkname(major(dumpdev)),
		    DISKUNIT(dumpdev), 'a' + DISKPART(dumpdev));
	printf("\n");
}
1674 
1675 extern struct nam2blk nam2blk[];
1676 
1677 int
1678 findblkmajor(struct device *dv)
1679 {
1680 	char buf[16], *p;
1681 	int i;
1682 
1683 	if (strlcpy(buf, dv->dv_xname, sizeof buf) >= sizeof buf)
1684 		return (-1);
1685 	for (p = buf; *p; p++)
1686 		if (*p >= '0' && *p <= '9')
1687 			*p = '\0';
1688 
1689 	for (i = 0; nam2blk[i].name; i++)
1690 		if (!strcmp(buf, nam2blk[i].name))
1691 			return (nam2blk[i].maj);
1692 	return (-1);
1693 }
1694 
1695 char *
1696 findblkname(int maj)
1697 {
1698 	int i;
1699 
1700 	for (i = 0; nam2blk[i].name; i++)
1701 		if (nam2blk[i].maj == maj)
1702 			return (nam2blk[i].name);
1703 	return (NULL);
1704 }
1705 
1706 char *
1707 disk_readlabel(struct disklabel *dl, dev_t dev, char *errbuf, size_t errsize)
1708 {
1709 	struct vnode *vn;
1710 	dev_t chrdev, rawdev;
1711 	int error;
1712 
1713 	chrdev = blktochr(dev);
1714 	rawdev = MAKEDISKDEV(major(chrdev), DISKUNIT(chrdev), RAW_PART);
1715 
1716 #ifdef DEBUG
1717 	printf("dev=0x%x chrdev=0x%x rawdev=0x%x\n", dev, chrdev, rawdev);
1718 #endif
1719 
1720 	if (cdevvp(rawdev, &vn)) {
1721 		snprintf(errbuf, errsize,
1722 		    "cannot obtain vnode for 0x%x/0x%x", dev, rawdev);
1723 		return (errbuf);
1724 	}
1725 
1726 	error = VOP_OPEN(vn, FREAD, NOCRED, curproc);
1727 	if (error) {
1728 		snprintf(errbuf, errsize,
1729 		    "cannot open disk, 0x%x/0x%x, error %d",
1730 		    dev, rawdev, error);
1731 		goto done;
1732 	}
1733 
1734 	error = VOP_IOCTL(vn, DIOCGDINFO, (caddr_t)dl, FREAD, NOCRED, curproc);
1735 	if (error) {
1736 		snprintf(errbuf, errsize,
1737 		    "cannot read disk label, 0x%x/0x%x, error %d",
1738 		    dev, rawdev, error);
1739 	}
1740 done:
1741 	VOP_CLOSE(vn, FREAD, NOCRED, curproc);
1742 	vput(vn);
1743 	if (error)
1744 		return (errbuf);
1745 	return (NULL);
1746 }
1747 
1748 int
1749 disk_map(char *path, char *mappath, int size, int flags)
1750 {
1751 	struct disk *dk, *mdk;
1752 	u_char uid[8];
1753 	char c, part;
1754 	int i;
1755 
1756 	/*
1757 	 * Attempt to map a request for a disklabel UID to the correct device.
1758 	 * We should be supplied with a disklabel UID which has the following
1759 	 * format:
1760 	 *
1761 	 * [disklabel uid] . [partition]
1762 	 *
1763 	 * Alternatively, if the DM_OPENPART flag is set the disklabel UID can
1764 	 * based passed on its own.
1765 	 */
1766 
1767 	if (strchr(path, '/') != NULL)
1768 		return -1;
1769 
1770 	/* Verify that the device name is properly formed. */
1771 	if (!((strlen(path) == 16 && (flags & DM_OPENPART)) ||
1772 	    (strlen(path) == 18 && path[16] == '.')))
1773 		return -1;
1774 
1775 	/* Get partition. */
1776 	if (flags & DM_OPENPART)
1777 		part = 'a' + RAW_PART;
1778 	else
1779 		part = path[17];
1780 
1781 	if (part < 'a' || part >= 'a' + MAXPARTITIONS)
1782 		return -1;
1783 
1784 	/* Derive label UID. */
1785 	memset(uid, 0, sizeof(uid));
1786 	for (i = 0; i < 16; i++) {
1787 		c = path[i];
1788 		if (c >= '0' && c <= '9')
1789 			c -= '0';
1790 		else if (c >= 'a' && c <= 'f')
1791 			c -= ('a' - 10);
1792 		else
1793 			return -1;
1794 
1795 		uid[i / 2] <<= 4;
1796 		uid[i / 2] |= c & 0xf;
1797 	}
1798 
1799 	mdk = NULL;
1800 	TAILQ_FOREACH(dk, &disklist, dk_link) {
1801 		if ((dk->dk_flags & DKF_LABELVALID) && dk->dk_label &&
1802 		    memcmp(dk->dk_label->d_uid, uid,
1803 		    sizeof(dk->dk_label->d_uid)) == 0) {
1804 			/* Fail if there are duplicate UIDs! */
1805 			if (mdk != NULL)
1806 				return -1;
1807 			mdk = dk;
1808 		}
1809 	}
1810 
1811 	if (mdk == NULL || mdk->dk_name == NULL)
1812 		return -1;
1813 
1814 	snprintf(mappath, size, "/dev/%s%s%c",
1815 	    (flags & DM_OPENBLCK) ? "" : "r", mdk->dk_name, part);
1816 
1817 	return 0;
1818 }
1819 
1820 /*
1821  * Lookup a disk device and verify that it has completed attaching.
1822  */
1823 struct device *
1824 disk_lookup(struct cfdriver *cd, int unit)
1825 {
1826 	struct device *dv;
1827 	struct disk *dk;
1828 
1829 	dv = device_lookup(cd, unit);
1830 	if (dv == NULL)
1831 		return (NULL);
1832 
1833 	TAILQ_FOREACH(dk, &disklist, dk_link)
1834 		if (dk->dk_device == dv)
1835 			break;
1836 
1837 	if (dk == NULL) {
1838 		device_unref(dv);
1839 		return (NULL);
1840 	}
1841 
1842 	return (dv);
1843 }
1844 
1845 int
1846 duid_equal(u_char *duid1, u_char *duid2)
1847 {
1848 	return (memcmp(duid1, duid2, DUID_SIZE) == 0);
1849 }
1850 
1851 int
1852 duid_iszero(u_char *duid)
1853 {
1854 	u_char zeroduid[DUID_SIZE];
1855 
1856 	memset(zeroduid, 0, sizeof(zeroduid));
1857 
1858 	return (duid_equal(duid, zeroduid));
1859 }
1860 
1861 const char *
1862 duid_format(u_char *duid)
1863 {
1864 	static char duid_str[17];
1865 
1866 	snprintf(duid_str, sizeof(duid_str),
1867 	    "%02x%02x%02x%02x%02x%02x%02x%02x",
1868 	    duid[0], duid[1], duid[2], duid[3],
1869 	    duid[4], duid[5], duid[6], duid[7]);
1870 
1871 	return (duid_str);
1872 }
1873