xref: /openbsd-src/sys/kern/subr_disk.c (revision 5b859c19fe53bbea08f5c342e0a4470e99f883e1)
1 /*	$OpenBSD: subr_disk.c,v 1.173 2014/11/03 21:00:27 tedu Exp $	*/
2 /*	$NetBSD: subr_disk.c,v 1.17 1996/03/16 23:17:08 christos Exp $	*/
3 
4 /*
5  * Copyright (c) 1995 Jason R. Thorpe.  All rights reserved.
6  * Copyright (c) 1982, 1986, 1988, 1993
7  *	The Regents of the University of California.  All rights reserved.
8  * (c) UNIX System Laboratories, Inc.
9  * All or some portions of this file are derived from material licensed
10  * to the University of California by American Telephone and Telegraph
11  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
12  * the permission of UNIX System Laboratories, Inc.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  * 3. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)ufs_disksubr.c	8.5 (Berkeley) 1/21/94
39  */
40 
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/kernel.h>
44 #include <sys/malloc.h>
45 #include <sys/fcntl.h>
46 #include <sys/buf.h>
47 #include <sys/stat.h>
48 #include <sys/syslog.h>
49 #include <sys/device.h>
50 #include <sys/time.h>
51 #include <sys/disklabel.h>
52 #include <sys/conf.h>
53 #include <sys/lock.h>
54 #include <sys/disk.h>
55 #include <sys/reboot.h>
56 #include <sys/dkio.h>
57 #include <sys/vnode.h>
58 #include <sys/workq.h>
59 
60 #include <sys/socket.h>
61 #include <sys/socketvar.h>
62 
63 #include <net/if.h>
64 
65 #include <dev/rndvar.h>
66 #include <dev/cons.h>
67 
68 #include <lib/libz/zlib.h>
69 
70 #include "softraid.h"
71 
/*
 * Debug tracing helper: DPRINTF() forwards its arguments to printf() when
 * DEBUG is defined and expands to nothing otherwise.
 */
#ifndef DEBUG
#define DPRINTF(...)
#else
#define DPRINTF(...)	printf(__VA_ARGS__)
#endif
77 
78 /*
79  * A global list of all disks attached to the system.  May grow or
80  * shrink over time.
81  */
82 struct	disklist_head disklist;	/* TAILQ_HEAD */
83 int	disk_count;		/* number of drives in global disklist */
84 int	disk_change;		/* set if a disk has been attached/detached
85 				 * since last we looked at this variable. This
86 				 * is reset by hw_sysctl()
87 				 */
88 
89 u_char	bootduid[8];		/* DUID of boot disk. */
90 u_char	rootduid[8];		/* DUID of root disk. */
91 
92 /* softraid callback, do not use! */
93 void (*softraid_disk_attach)(struct disk *, int);
94 
95 void sr_map_root(void);
96 
97 void disk_attach_callback(void *, void *);
98 
99 /*
100  * Compute checksum for disk label.
101  */
102 u_int
103 dkcksum(struct disklabel *lp)
104 {
105 	u_int16_t *start, *end;
106 	u_int16_t sum = 0;
107 
108 	start = (u_int16_t *)lp;
109 	end = (u_int16_t *)&lp->d_partitions[lp->d_npartitions];
110 	while (start < end)
111 		sum ^= *start++;
112 	return (sum);
113 }
114 
115 int
116 initdisklabel(struct disklabel *lp)
117 {
118 	int i;
119 
120 	/* minimal requirements for archetypal disk label */
121 	if (lp->d_secsize < DEV_BSIZE)
122 		lp->d_secsize = DEV_BSIZE;
123 	if (DL_GETDSIZE(lp) == 0)
124 		DL_SETDSIZE(lp, MAXDISKSIZE);
125 	if (lp->d_secpercyl == 0)
126 		return (ERANGE);
127 	lp->d_npartitions = MAXPARTITIONS;
128 	for (i = 0; i < RAW_PART; i++) {
129 		DL_SETPSIZE(&lp->d_partitions[i], 0);
130 		DL_SETPOFFSET(&lp->d_partitions[i], 0);
131 	}
132 	if (DL_GETPSIZE(&lp->d_partitions[RAW_PART]) == 0)
133 		DL_SETPSIZE(&lp->d_partitions[RAW_PART], DL_GETDSIZE(lp));
134 	DL_SETPOFFSET(&lp->d_partitions[RAW_PART], 0);
135 	DL_SETBSTART(lp, 0);
136 	DL_SETBEND(lp, DL_GETDSIZE(lp));
137 	lp->d_version = 1;
138 	lp->d_bbsize = 8192;
139 	lp->d_sbsize = 64*1024;			/* XXX ? */
140 	return (0);
141 }
142 
143 /*
144  * Check an incoming block to make sure it is a disklabel, convert it to
145  * a newer version if needed, etc etc.
146  */
147 int
148 checkdisklabel(void *rlp, struct disklabel *lp, u_int64_t boundstart,
149     u_int64_t boundend)
150 {
151 	struct disklabel *dlp = rlp;
152 	struct __partitionv0 *v0pp;
153 	struct partition *pp;
154 	u_int64_t disksize;
155 	int error = 0;
156 	int i;
157 
158 	if (dlp->d_magic != DISKMAGIC || dlp->d_magic2 != DISKMAGIC)
159 		error = ENOENT;	/* no disk label */
160 	else if (dlp->d_npartitions > MAXPARTITIONS)
161 		error = E2BIG;	/* too many partitions */
162 	else if (dlp->d_secpercyl == 0)
163 		error = EINVAL;	/* invalid label */
164 	else if (dlp->d_secsize == 0)
165 		error = ENOSPC;	/* disk too small */
166 	else if (dkcksum(dlp) != 0)
167 		error = EINVAL;	/* incorrect checksum */
168 
169 	if (error) {
170 		u_int16_t *start, *end, sum = 0;
171 
172 		/* If it is byte-swapped, attempt to convert it */
173 		if (swap32(dlp->d_magic) != DISKMAGIC ||
174 		    swap32(dlp->d_magic2) != DISKMAGIC ||
175 		    swap16(dlp->d_npartitions) > MAXPARTITIONS)
176 			return (error);
177 
178 		/*
179 		 * Need a byte-swap aware dkcksum variant
180 		 * inlined, because dkcksum uses a sub-field
181 		 */
182 		start = (u_int16_t *)dlp;
183 		end = (u_int16_t *)&dlp->d_partitions[
184 		    swap16(dlp->d_npartitions)];
185 		while (start < end)
186 			sum ^= *start++;
187 		if (sum != 0)
188 			return (error);
189 
190 		dlp->d_magic = swap32(dlp->d_magic);
191 		dlp->d_type = swap16(dlp->d_type);
192 		dlp->d_subtype = swap16(dlp->d_subtype);
193 
194 		/* d_typename and d_packname are strings */
195 
196 		dlp->d_secsize = swap32(dlp->d_secsize);
197 		dlp->d_nsectors = swap32(dlp->d_nsectors);
198 		dlp->d_ntracks = swap32(dlp->d_ntracks);
199 		dlp->d_ncylinders = swap32(dlp->d_ncylinders);
200 		dlp->d_secpercyl = swap32(dlp->d_secpercyl);
201 		dlp->d_secperunit = swap32(dlp->d_secperunit);
202 
203 		/* d_uid is a string */
204 
205 		dlp->d_acylinders = swap32(dlp->d_acylinders);
206 
207 		dlp->d_flags = swap32(dlp->d_flags);
208 
209 		for (i = 0; i < NDDATA; i++)
210 			dlp->d_drivedata[i] = swap32(dlp->d_drivedata[i]);
211 
212 		dlp->d_secperunith = swap16(dlp->d_secperunith);
213 		dlp->d_version = swap16(dlp->d_version);
214 
215 		for (i = 0; i < NSPARE; i++)
216 			dlp->d_spare[i] = swap32(dlp->d_spare[i]);
217 
218 		dlp->d_magic2 = swap32(dlp->d_magic2);
219 
220 		dlp->d_npartitions = swap16(dlp->d_npartitions);
221 		dlp->d_bbsize = swap32(dlp->d_bbsize);
222 		dlp->d_sbsize = swap32(dlp->d_sbsize);
223 
224 		for (i = 0; i < MAXPARTITIONS; i++) {
225 			pp = &dlp->d_partitions[i];
226 			pp->p_size = swap32(pp->p_size);
227 			pp->p_offset = swap32(pp->p_offset);
228 			if (dlp->d_version == 0) {
229 				v0pp = (struct __partitionv0 *)pp;
230 				v0pp->p_fsize = swap32(v0pp->p_fsize);
231 			} else {
232 				pp->p_offseth = swap16(pp->p_offseth);
233 				pp->p_sizeh = swap16(pp->p_sizeh);
234 			}
235 			pp->p_cpg = swap16(pp->p_cpg);
236 		}
237 
238 		dlp->d_checksum = 0;
239 		dlp->d_checksum = dkcksum(dlp);
240 		error = 0;
241 	}
242 
243 	/* XXX should verify lots of other fields and whine a lot */
244 
245 	/* Initial passed in lp contains the real disk size. */
246 	disksize = DL_GETDSIZE(lp);
247 
248 	if (lp != dlp)
249 		*lp = *dlp;
250 
251 	if (lp->d_version == 0) {
252 		lp->d_version = 1;
253 		lp->d_secperunith = 0;
254 
255 		v0pp = (struct __partitionv0 *)lp->d_partitions;
256 		pp = lp->d_partitions;
257 		for (i = 0; i < lp->d_npartitions; i++, pp++, v0pp++) {
258 			pp->p_fragblock = DISKLABELV1_FFS_FRAGBLOCK(v0pp->
259 			    p_fsize, v0pp->p_frag);
260 			pp->p_offseth = 0;
261 			pp->p_sizeh = 0;
262 		}
263 	}
264 
265 #ifdef DEBUG
266 	if (DL_GETDSIZE(lp) != disksize)
267 		printf("on-disk disklabel has incorrect disksize (%llu)\n",
268 		    DL_GETDSIZE(lp));
269 	if (DL_GETPSIZE(&lp->d_partitions[RAW_PART]) != disksize)
270 		printf("on-disk disklabel RAW_PART has incorrect size (%llu)\n",
271 		    DL_GETPSIZE(&lp->d_partitions[RAW_PART]));
272 	if (DL_GETPOFFSET(&lp->d_partitions[RAW_PART]) != 0)
273 		printf("on-disk disklabel RAW_PART offset != 0 (%llu)\n",
274 		    DL_GETPOFFSET(&lp->d_partitions[RAW_PART]));
275 #endif
276 	DL_SETDSIZE(lp, disksize);
277 	DL_SETPSIZE(&lp->d_partitions[RAW_PART], disksize);
278 	DL_SETPOFFSET(&lp->d_partitions[RAW_PART], 0);
279 	DL_SETBSTART(lp, boundstart);
280 	DL_SETBEND(lp, boundend < DL_GETDSIZE(lp) ? boundend : DL_GETDSIZE(lp));
281 
282 	lp->d_checksum = 0;
283 	lp->d_checksum = dkcksum(lp);
284 	return (0);
285 }
286 
287 /*
288  * If dos partition table requested, attempt to load it and
289  * find disklabel inside a DOS partition. Return buffer
290  * for use in signalling errors if requested.
291  *
292  * We would like to check if each MBR has a valid BOOT_MAGIC, but
293  * we cannot because it doesn't always exist. So.. we assume the
294  * MBR is valid.
295  */
296 int
297 readdoslabel(struct buf *bp, void (*strat)(struct buf *),
298     struct disklabel *lp, daddr_t *partoffp, int spoofonly)
299 {
300 	u_int64_t dospartoff = 0, dospartend = DL_GETBEND(lp);
301 	int i, ourpart = -1, wander = 1, n = 0, loop = 0, offset;
302 	struct dos_partition dp[NDOSPART], *dp2;
303 	daddr_t part_blkno = DOSBBSECTOR;
304 	u_int32_t extoff = 0;
305 	int error;
306 
307 	if (lp->d_secpercyl == 0)
308 		return (EINVAL);	/* invalid label */
309 	if (lp->d_secsize == 0)
310 		return (ENOSPC);	/* disk too small */
311 
312 	/* do DOS partitions in the process of getting disklabel? */
313 
314 	/*
315 	 * Read dos partition table, follow extended partitions.
316 	 * Map the partitions to disklabel entries i-p
317 	 */
318 	while (wander && loop < DOS_MAXEBR) {
319 		loop++;
320 		wander = 0;
321 		if (part_blkno < extoff)
322 			part_blkno = extoff;
323 
324 		/* read MBR/EBR */
325 		bp->b_blkno = DL_SECTOBLK(lp, part_blkno);
326 		bp->b_bcount = lp->d_secsize;
327 		bp->b_error = 0; /* B_ERROR and b_error may have stale data. */
328 		CLR(bp->b_flags, B_READ | B_WRITE | B_DONE | B_ERROR);
329 		SET(bp->b_flags, B_BUSY | B_READ | B_RAW);
330 		(*strat)(bp);
331 		error = biowait(bp);
332 		if (error) {
333 /*wrong*/		if (partoffp)
334 /*wrong*/			*partoffp = -1;
335 			return (error);
336 		}
337 
338 		bcopy(bp->b_data + DOSPARTOFF, dp, sizeof(dp));
339 
340 		if (n == 0 && part_blkno == DOSBBSECTOR) {
341 			u_int16_t mbrtest;
342 
343 			/* Check the end of sector marker. */
344 			mbrtest = ((bp->b_data[510] << 8) & 0xff00) |
345 			    (bp->b_data[511] & 0xff);
346 			if (mbrtest != 0x55aa)
347 				goto notmbr;
348 		}
349 
350 		if (ourpart == -1) {
351 			/* Search for our MBR partition */
352 			for (dp2=dp, i=0; i < NDOSPART && ourpart == -1;
353 			    i++, dp2++)
354 				if (letoh32(dp2->dp_size) &&
355 				    dp2->dp_typ == DOSPTYP_OPENBSD)
356 					ourpart = i;
357 			if (ourpart == -1)
358 				goto donot;
359 			/*
360 			 * This is our MBR partition. need sector
361 			 * address for SCSI/IDE, cylinder for
362 			 * ESDI/ST506/RLL
363 			 */
364 			dp2 = &dp[ourpart];
365 			dospartoff = letoh32(dp2->dp_start) + part_blkno;
366 			dospartend = dospartoff + letoh32(dp2->dp_size);
367 
368 			/*
369 			 * Record the OpenBSD partition's placement (in
370 			 * 512-byte blocks!) for the caller. No need to
371 			 * finish spoofing.
372 			 */
373 			if (partoffp) {
374 				*partoffp = DL_SECTOBLK(lp, dospartoff);
375 				return (0);
376 			}
377 
378 			if (lp->d_ntracks == 0)
379 				lp->d_ntracks = dp2->dp_ehd + 1;
380 			if (lp->d_nsectors == 0)
381 				lp->d_nsectors = DPSECT(dp2->dp_esect);
382 			if (lp->d_secpercyl == 0)
383 				lp->d_secpercyl = lp->d_ntracks *
384 				    lp->d_nsectors;
385 		}
386 donot:
387 		/*
388 		 * In case the disklabel read below fails, we want to
389 		 * provide a fake label in i-p.
390 		 */
391 		for (dp2=dp, i=0; i < NDOSPART; i++, dp2++) {
392 			struct partition *pp;
393 			u_int8_t fstype;
394 
395 			if (dp2->dp_typ == DOSPTYP_OPENBSD ||
396 			    dp2->dp_typ == DOSPTYP_EFI)
397 				continue;
398 			if (letoh32(dp2->dp_size) > DL_GETDSIZE(lp))
399 				continue;
400 			if (letoh32(dp2->dp_start) > DL_GETDSIZE(lp))
401 				continue;
402 			if (letoh32(dp2->dp_size) == 0)
403 				continue;
404 
405 			switch (dp2->dp_typ) {
406 			case DOSPTYP_UNUSED:
407 				fstype = FS_UNUSED;
408 				break;
409 
410 			case DOSPTYP_LINUX:
411 				fstype = FS_EXT2FS;
412 				break;
413 
414 			case DOSPTYP_NTFS:
415 				fstype = FS_NTFS;
416 				break;
417 
418 			case DOSPTYP_EFISYS:
419 			case DOSPTYP_FAT12:
420 			case DOSPTYP_FAT16S:
421 			case DOSPTYP_FAT16B:
422 			case DOSPTYP_FAT16L:
423 			case DOSPTYP_FAT32:
424 			case DOSPTYP_FAT32L:
425 				fstype = FS_MSDOS;
426 				break;
427 			case DOSPTYP_EXTEND:
428 			case DOSPTYP_EXTENDL:
429 				part_blkno = letoh32(dp2->dp_start) + extoff;
430 				if (!extoff) {
431 					extoff = letoh32(dp2->dp_start);
432 					part_blkno = 0;
433 				}
434 				wander = 1;
435 				continue;
436 				break;
437 			default:
438 				fstype = FS_OTHER;
439 				break;
440 			}
441 
442 			/*
443 			 * Don't set fstype/offset/size when just looking for
444 			 * the offset of the OpenBSD partition. It would
445 			 * invalidate the disklabel checksum!
446 			 *
447 			 * Don't try to spoof more than 8 partitions, i.e.
448 			 * 'i' -'p'.
449 			 */
450 			if (partoffp || n >= 8)
451 				continue;
452 
453 			pp = &lp->d_partitions[8+n];
454 			n++;
455 			pp->p_fstype = fstype;
456 			if (letoh32(dp2->dp_start))
457 				DL_SETPOFFSET(pp,
458 				    letoh32(dp2->dp_start) + part_blkno);
459 			DL_SETPSIZE(pp, letoh32(dp2->dp_size));
460 		}
461 	}
462 
463 notmbr:
464 	if (partoffp == NULL)
465 		/* Must not modify *lp when partoffp is set. */
466 		lp->d_npartitions = MAXPARTITIONS;
467 
468 	if (n == 0 && part_blkno == DOSBBSECTOR && ourpart == -1) {
469 		u_int16_t fattest;
470 
471 		/* Check for a valid initial jmp instruction. */
472 		switch ((u_int8_t)bp->b_data[0]) {
473 		case 0xeb:
474 			/*
475 			 * Two-byte jmp instruction. The 2nd byte is the number
476 			 * of bytes to jmp and the 3rd byte must be a NOP.
477 			 */
478 			if ((u_int8_t)bp->b_data[2] != 0x90)
479 				goto notfat;
480 			break;
481 		case 0xe9:
482 			/*
483 			 * Three-byte jmp instruction. The next two bytes are a
484 			 * little-endian 16 bit value.
485 			 */
486 			break;
487 		default:
488 			goto notfat;
489 			break;
490 		}
491 
492 		/* Check for a valid bytes per sector value. */
493 		fattest = ((bp->b_data[12] << 8) & 0xff00) |
494 		    (bp->b_data[11] & 0xff);
495 		if (fattest < 512 || fattest > 4096 || (fattest % 512 != 0))
496 			goto notfat;
497 
498 		if (partoffp)
499 			return (ENXIO);	/* No place for disklabel on FAT! */
500 
501 		DL_SETPSIZE(&lp->d_partitions['i' - 'a'],
502 		    DL_GETPSIZE(&lp->d_partitions[RAW_PART]));
503 		DL_SETPOFFSET(&lp->d_partitions['i' - 'a'], 0);
504 		lp->d_partitions['i' - 'a'].p_fstype = FS_MSDOS;
505 
506 		spoofonly = 1;	/* No disklabel to read from disk. */
507 	}
508 
509 notfat:
510 	/* record the OpenBSD partition's placement for the caller */
511 	if (partoffp)
512 		*partoffp = DL_SECTOBLK(lp, dospartoff);
513 	else {
514 		DL_SETBSTART(lp, dospartoff);
515 		DL_SETBEND(lp, (dospartend < DL_GETDSIZE(lp)) ? dospartend :
516 		    DL_GETDSIZE(lp));
517 	}
518 
519 	/* don't read the on-disk label if we are in spoofed-only mode */
520 	if (spoofonly)
521 		return (0);
522 
523 	bp->b_blkno = DL_BLKTOSEC(lp, DL_SECTOBLK(lp, dospartoff) +
524 	    DOS_LABELSECTOR) * DL_BLKSPERSEC(lp);
525 	offset = DL_BLKOFFSET(lp, DL_SECTOBLK(lp, dospartoff) +
526 	    DOS_LABELSECTOR);
527 	bp->b_bcount = lp->d_secsize;
528 	CLR(bp->b_flags, B_READ | B_WRITE | B_DONE);
529 	SET(bp->b_flags, B_BUSY | B_READ | B_RAW);
530 	(*strat)(bp);
531 	if (biowait(bp))
532 		return (bp->b_error);
533 
534 
535 	error = checkdisklabel(bp->b_data + offset, lp,
536 	    DL_GETBSTART((struct disklabel*)(bp->b_data+offset)),
537 	    DL_GETBEND((struct disklabel *)(bp->b_data+offset)));
538 
539 	return (error);
540 }
541 
542 #ifdef GPT
543 
544 int gpt_chk_hdr(struct gpt_header *);
545 int gpt_chk_parts(struct gpt_header *, struct gpt_partition *);
546 int get_fstype(struct uuid *);
547 
548 int
549 gpt_chk_hdr(struct gpt_header *gh)
550 {
551 	u_int32_t orig_gh_csum = gh->gh_csum;
552 	gh->gh_csum = 0;
553 	gh->gh_csum = crc32(0, (unsigned char *)gh, gh->gh_size);
554 
555 	if (orig_gh_csum != gh->gh_csum)
556 		return (EINVAL);
557 
558 	return 0;
559 }
560 
561 int
562 gpt_chk_parts(struct gpt_header *gh, struct gpt_partition *gp)
563 {
564 	u_int32_t checksum;
565 	checksum = crc32(0, (unsigned char *)gp,
566 	    gh->gh_part_num * gh->gh_part_size);
567 
568 	if (checksum != gh->gh_part_csum)
569 		return (EINVAL);
570 
571 	return 0;
572 }
573 
574 int
575 get_fstype(struct uuid *uuid_part)
576 {
577 	static int init = 0;
578 	static struct uuid uuid_openbsd, uuid_msdos, uuid_chromefs,
579 	    uuid_linux, uuid_hfs, uuid_unused;
580 	static const uint8_t gpt_uuid_openbsd[] = GPT_UUID_OPENBSD;
581 	static const uint8_t gpt_uuid_msdos[] = GPT_UUID_MSDOS;
582 	static const uint8_t gpt_uuid_chromerootfs[] = GPT_UUID_CHROMEROOTFS;
583 	static const uint8_t gpt_uuid_linux[] = GPT_UUID_LINUX;
584 	static const uint8_t gpt_uuid_hfs[] = GPT_UUID_APPLE_HFS;
585 	static const uint8_t gpt_uuid_unused[] = GPT_UUID_UNUSED;
586 
587 	if (init == 0) {
588 		uuid_dec_be(gpt_uuid_openbsd, &uuid_openbsd);
589 		uuid_dec_be(gpt_uuid_msdos, &uuid_msdos);
590 		uuid_dec_be(gpt_uuid_chromerootfs, &uuid_chromefs);
591 		uuid_dec_be(gpt_uuid_linux, &uuid_linux);
592 		uuid_dec_be(gpt_uuid_hfs, &uuid_hfs);
593 		uuid_dec_be(gpt_uuid_unused, &uuid_unused);
594 		init = 1;
595 	}
596 
597 	if (!memcmp(uuid_part, &uuid_unused, sizeof(struct uuid)))
598 		return FS_UNUSED;
599 	else if (!memcmp(uuid_part, &uuid_openbsd, sizeof(struct uuid)))
600 		return FS_BSDFFS;
601 	else if (!memcmp(uuid_part, &uuid_msdos, sizeof(struct uuid)))
602 		return FS_MSDOS;
603 	else if (!memcmp(uuid_part, &uuid_chromefs, sizeof(struct uuid)))
604 		return FS_EXT2FS;
605 	else if (!memcmp(uuid_part, &uuid_linux, sizeof(struct uuid)))
606 		return FS_EXT2FS;
607 	else if (!memcmp(uuid_part, &uuid_hfs, sizeof(struct uuid)))
608 		return FS_HFS;
609 	else
610 		return FS_OTHER;
611 }
612 
613 /*
614  * If gpt partition table requested, attempt to load it and
615  * find disklabel inside a GPT partition. Return buffer
616  * for use in signalling errors if requested.
617  *
618  * XXX: readgptlabel() is based on readdoslabel(), so they should be merged
619  */
620 int
621 readgptlabel(struct buf *bp, void (*strat)(struct buf *),
622     struct disklabel *lp, daddr_t *partoffp, int spoofonly)
623 {
624 	struct gpt_header gh;
625 	struct gpt_partition *gp, *gp_tmp;
626 	size_t gpsz;
627 	struct uuid uuid_part, uuid_openbsd;
628 	struct partition *pp;
629 
630 	daddr_t part_blkno;
631 	u_int64_t gptpartoff = 0, gptpartend = DL_GETBEND(lp);
632 	int i, altheader = 0, error, n=0, ourpart = -1, offset;
633 
634 	static const u_int8_t gpt_uuid_openbsd[] = GPT_UUID_OPENBSD;
635 	u_int8_t fstype;
636 
637 	uuid_dec_be(gpt_uuid_openbsd, &uuid_openbsd);
638 
639 	if (lp->d_secpercyl == 0)
640 		return (EINVAL);	/* invalid label */
641 	if (lp->d_secsize == 0)
642 		return (ENOSPC);	/* disk too small */
643 
644 	/*
645 	 * XXX: We should not trust the primary header and instead
646 	 * use the last LBA of the disk, as defined in the standard.
647 	 */
648 	for (part_blkno = GPTSECTOR; ; part_blkno = gh.gh_lba_alt,
649 	    altheader = 1) {
650 		uint32_t ghsize;
651 		uint32_t ghpartsize;
652 		uint32_t ghpartnum;
653 		size_t gpsz;
654 
655 		/* read header record */
656 		bp->b_blkno = DL_BLKTOSEC(lp, part_blkno) * DL_BLKSPERSEC(lp);
657 		offset = DL_BLKOFFSET(lp, part_blkno);
658 		bp->b_bcount = lp->d_secsize;
659 		bp->b_error = 0; /* B_ERROR and b_error may have stale data. */
660 		CLR(bp->b_flags, B_READ | B_WRITE | B_DONE | B_ERROR);
661 		SET(bp->b_flags, B_BUSY | B_READ | B_RAW);
662 		(*strat)(bp);
663 		error = biowait(bp);
664 
665 		if (error) {
666 			DPRINTF("error reading from disk\n");
667 	/*wrong*/	if (partoffp)
668 	/*wrong*/		*partoffp = -1;
669 			return (error);
670 		}
671 
672 		bcopy(bp->b_data + offset, &gh, sizeof(gh));
673 		ghsize = letoh32(gh.gh_size);
674 		ghpartsize = letoh32(gh.gh_part_size);
675 		ghpartnum = letoh32(gh.gh_part_num);
676 
677 
678 		if (letoh64(gh.gh_sig) != GPTSIGNATURE)
679 			return (EINVAL);
680 
681 		/* we only support version 1.0 */
682 		if (letoh32(gh.gh_rev) != GPTREVISION)
683 			return (EINVAL);
684 
685 		if (gpt_chk_hdr(&gh)) {
686 			/* header broken, using alternate header */
687 			if (altheader) {
688 				DPRINTF("alternate header also broken\n");
689 				return (EINVAL);
690 			}
691 
692 			if (gh.gh_lba_alt >= DL_GETDSIZE(lp)) {
693 				DPRINTF("alternate header's position is "
694 				    "bogous\n");
695 				return (EINVAL);
696 			}
697 
698 			continue;
699 		}
700 
701 		/*
702 		 * Header size must be greater than or equal to 92 and less
703 		 * than or equal to the logical block size.
704 		 */
705 		if (ghsize < GPTMINHDRSIZE || ghsize > DEV_BSIZE)
706 			return (EINVAL);
707 
708 		if (letoh64(gh.gh_lba_start) >= DL_GETDSIZE(lp) ||
709 		    letoh64(gh.gh_lba_end) >= DL_GETDSIZE(lp) ||
710 		    letoh64(gh.gh_part_lba) >= DL_GETDSIZE(lp))
711 			return (EINVAL);
712 
713 		/*
714 		* Size per partition entry shall be 128*(2**n) with n >= 0.
715 		* We don't support partition entries larger than block size.
716 		*/
717 		if (ghpartsize % GPTMINPARTSIZE
718 		    || ghpartsize > DEV_BSIZE
719 		    || GPT_PARTSPERSEC(&gh) == 0) {
720 			DPRINTF("invalid partition size\n");
721 			return (EINVAL);
722 		}
723 
724 		/* XXX: we don't support multiples of GPTMINPARTSIZE yet */
725 		if (letoh32(gh.gh_part_size) != GPTMINPARTSIZE) {
726 			DPRINTF("partition sizes larger than %d bytes are not "
727 			    "supported", GPTMINPARTSIZE);
728 			return (EINVAL);
729 		}
730 
731 		/* read GPT partition entry array */
732 		gp = mallocarray(ghpartnum, sizeof(struct gpt_partition), M_DEVBUF, M_NOWAIT|M_ZERO);
733 		if (gp == NULL)
734 			return (ENOMEM);
735 		gpsz = ghpartnum * sizeof(struct gpt_partition);
736 
737 		/*
738 		* XXX: Fails if # of partition entries is no multiple of
739 		* GPT_PARTSPERSEC(&gh)
740 		*/
741 		for (i = 0; i < ghpartnum / GPT_PARTSPERSEC(&gh);
742 		     i++) {
743 			part_blkno = letoh64(gh.gh_part_lba) + i;
744 			/* read partition record */
745 			bp->b_blkno = DL_BLKTOSEC(lp, part_blkno) *
746 			    DL_BLKSPERSEC(lp);
747 			offset = DL_BLKOFFSET(lp, part_blkno);
748 			bp->b_bcount = lp->d_secsize;
749 			/* B_ERROR and b_error may have stale data. */
750 			bp->b_error = 0;
751 			CLR(bp->b_flags, B_READ | B_WRITE | B_DONE | B_ERROR);
752 			SET(bp->b_flags, B_BUSY | B_READ | B_RAW);
753 			(*strat)(bp);
754 			error = biowait(bp);
755 			if (error) {
756 	/*wrong*/		if (partoffp)
757 	/*wrong*/			*partoffp = -1;
758 				free(gp, M_DEVBUF, gpsz);
759 				return (error);
760 			}
761 
762 			bcopy(bp->b_data + offset, gp +
763 			    i * GPT_PARTSPERSEC(&gh), GPT_PARTSPERSEC(&gh) *
764 			    sizeof(struct gpt_partition));
765 		}
766 
767 		if (gpt_chk_parts(&gh, gp)) {
768 			DPRINTF("partition entries broken, using alternate "
769 			    "header\n");
770 			free(gp, M_DEVBUF, gpsz);
771 
772 			if (altheader) {
773 				DPRINTF("alternate partition entries are also "
774 				    "broken\n");
775 				return (EINVAL);
776 			}
777 
778 			continue;
779 		}
780 		break;
781 	}
782 
783 	/* find OpenBSD partition */
784 	for (gp_tmp = gp, i = 0; i < letoh32(gh.gh_part_num) && ourpart == -1;
785 	    gp_tmp++, i++) {
786 		if (letoh64(gp_tmp->gp_lba_start) > letoh64(gp_tmp->gp_lba_end)
787 		    || letoh64(gp_tmp->gp_lba_start) < letoh64(gh.gh_lba_start)
788 		    || letoh64(gp_tmp->gp_lba_end) > letoh64(gh.gh_lba_end))
789 			continue; /* entry invalid */
790 
791 		uuid_dec_le(&gp_tmp->gp_type, &uuid_part);
792 		if (!memcmp(&uuid_part, &uuid_openbsd, sizeof(struct uuid))) {
793 			ourpart = i; /* found it */
794 		}
795 
796 		/*
797 		 * In case the disklabel read below fails, we want to
798 		 * provide a fake label in i-p.
799 		 */
800 		fstype = get_fstype(&uuid_part);
801 
802 		/*
803 		 * Don't set fstype/offset/size when just looking for
804 		 * the offset of the OpenBSD partition. It would
805 		 * invalidate the disklabel checksum!
806 		 *
807 		 * Don't try to spoof more than 8 partitions, i.e.
808 		 * 'i' -'p'.
809 		 */
810 		if (partoffp || n >= 8)
811 			continue;
812 
813 		pp = &lp->d_partitions[8+n];
814 		n++;
815 		pp->p_fstype = fstype;
816 		DL_SETPOFFSET(pp, letoh64(gp_tmp->gp_lba_start));
817 		DL_SETPSIZE(pp, letoh64(gp_tmp->gp_lba_end)
818 		    - letoh64(gp_tmp->gp_lba_start) + 1);
819 	}
820 
821 	if (ourpart != -1) {
822 		/* found our OpenBSD partition, so use it */
823 		gp_tmp = &gp[ourpart];
824 		gptpartoff = letoh64(gp_tmp->gp_lba_start);
825 		gptpartend = letoh64(gp_tmp->gp_lba_end) + 1;
826 	} else
827 		spoofonly = 1;	/* No disklabel to read from disk. */
828 
829 	if (!partoffp)
830 		/* Must not modify *lp when partoffp is set. */
831 		lp->d_npartitions = MAXPARTITIONS;
832 
833 	free(gp, M_DEVBUF, gpsz);
834 
835 	/* record the OpenBSD partition's placement for the caller */
836 	if (partoffp)
837 		*partoffp = gptpartoff;
838 	else {
839 		DL_SETBSTART(lp, gptpartoff);
840 		DL_SETBEND(lp, (gptpartend < DL_GETDSIZE(lp)) ? gptpartend :
841 		    DL_GETDSIZE(lp));
842 	}
843 
844 	/* don't read the on-disk label if we are in spoofed-only mode */
845 	if (spoofonly)
846 		return (0);
847 
848 	bp->b_blkno = DL_BLKTOSEC(lp, gptpartoff + DOS_LABELSECTOR) *
849 	    DL_BLKSPERSEC(lp);
850 	offset = DL_BLKOFFSET(lp, gptpartoff + DOS_LABELSECTOR);
851 	bp->b_bcount = lp->d_secsize;
852 	CLR(bp->b_flags, B_READ | B_WRITE | B_DONE);
853 	SET(bp->b_flags, B_BUSY | B_READ | B_RAW);
854 	(*strat)(bp);
855 	if (biowait(bp))
856 		return (bp->b_error);
857 
858 	/* sub-GPT disklabels are always at a LABELOFFSET of 0 */
859 	return checkdisklabel(bp->b_data + offset, lp, gptpartoff, gptpartend);
860 }
861 
862 #endif
863 
864 /*
865  * Check new disk label for sensibility before setting it.
866  */
867 int
868 setdisklabel(struct disklabel *olp, struct disklabel *nlp, u_int openmask)
869 {
870 	struct partition *opp, *npp;
871 	struct disk *dk;
872 	u_int64_t uid;
873 	int i;
874 
875 	/* sanity clause */
876 	if (nlp->d_secpercyl == 0 || nlp->d_secsize == 0 ||
877 	    (nlp->d_secsize % DEV_BSIZE) != 0)
878 		return (EINVAL);
879 
880 	/* special case to allow disklabel to be invalidated */
881 	if (nlp->d_magic == 0xffffffff) {
882 		*olp = *nlp;
883 		return (0);
884 	}
885 
886 	if (nlp->d_magic != DISKMAGIC || nlp->d_magic2 != DISKMAGIC ||
887 	    dkcksum(nlp) != 0)
888 		return (EINVAL);
889 
890 	/* XXX missing check if other dos partitions will be overwritten */
891 
892 	for (i = 0; i < MAXPARTITIONS; i++) {
893 		opp = &olp->d_partitions[i];
894 		npp = &nlp->d_partitions[i];
895 		if ((openmask & (1 << i)) &&
896 		    (DL_GETPOFFSET(npp) != DL_GETPOFFSET(opp) ||
897 		    DL_GETPSIZE(npp) < DL_GETPSIZE(opp)))
898 			return (EBUSY);
899 		/*
900 		 * Copy internally-set partition information
901 		 * if new label doesn't include it.		XXX
902 		 */
903 		if (npp->p_fstype == FS_UNUSED && opp->p_fstype != FS_UNUSED) {
904 			npp->p_fragblock = opp->p_fragblock;
905 			npp->p_cpg = opp->p_cpg;
906 		}
907 	}
908 
909 	/* Generate a UID if the disklabel does not already have one. */
910 	uid = 0;
911 	if (memcmp(nlp->d_uid, &uid, sizeof(nlp->d_uid)) == 0) {
912 		do {
913 			arc4random_buf(nlp->d_uid, sizeof(nlp->d_uid));
914 			TAILQ_FOREACH(dk, &disklist, dk_link)
915 				if (dk->dk_label && memcmp(dk->dk_label->d_uid,
916 				    nlp->d_uid, sizeof(nlp->d_uid)) == 0)
917 					break;
918 		} while (dk != NULL &&
919 		    memcmp(nlp->d_uid, &uid, sizeof(nlp->d_uid)) == 0);
920 	}
921 
922 	nlp->d_checksum = 0;
923 	nlp->d_checksum = dkcksum(nlp);
924 	*olp = *nlp;
925 
926 	disk_change = 1;
927 
928 	return (0);
929 }
930 
931 /*
932  * Determine the size of the transfer, and make sure it is within the
933  * boundaries of the partition. Adjust transfer if needed, and signal errors or
934  * early completion.
935  */
936 int
937 bounds_check_with_label(struct buf *bp, struct disklabel *lp)
938 {
939 	struct partition *p = &lp->d_partitions[DISKPART(bp->b_dev)];
940 	daddr_t partblocks, sz;
941 
942 	/* Avoid division by zero, negative offsets, and negative sizes. */
943 	if (lp->d_secpercyl == 0 || bp->b_blkno < 0 || bp->b_bcount < 0)
944 		goto bad;
945 
946 	/* Ensure transfer is a whole number of aligned sectors. */
947 	if ((bp->b_blkno % DL_BLKSPERSEC(lp)) != 0 ||
948 	    (bp->b_bcount % lp->d_secsize) != 0)
949 		goto bad;
950 
951 	/* Ensure transfer starts within partition boundary. */
952 	partblocks = DL_SECTOBLK(lp, DL_GETPSIZE(p));
953 	if (bp->b_blkno > partblocks)
954 		goto bad;
955 
956 	/* If exactly at end of partition or null transfer, return EOF. */
957 	if (bp->b_blkno == partblocks || bp->b_bcount == 0)
958 		goto done;
959 
960 	/* Truncate request if it exceeds past the end of the partition. */
961 	sz = bp->b_bcount >> DEV_BSHIFT;
962 	if (sz > partblocks - bp->b_blkno) {
963 		sz = partblocks - bp->b_blkno;
964 		bp->b_bcount = sz << DEV_BSHIFT;
965 	}
966 
967 	return (0);
968 
969  bad:
970 	bp->b_error = EINVAL;
971 	bp->b_flags |= B_ERROR;
972  done:
973 	bp->b_resid = bp->b_bcount;
974 	return (-1);
975 }
976 
977 /*
978  * Disk error is the preface to plaintive error messages
979  * about failing disk transfers.  It prints messages of the form
980 
981 hp0g: hard error reading fsbn 12345 of 12344-12347 (hp0 bn %d cn %d tn %d sn %d)
982 
983  * if the offset of the error in the transfer and a disk label
984  * are both available.  blkdone should be -1 if the position of the error
985  * is unknown; the disklabel pointer may be null from drivers that have not
986  * been converted to use them.  The message is printed with printf
987  * if pri is LOG_PRINTF, otherwise it uses log at the specified priority.
988  * The message should be completed (with at least a newline) with printf
989  * or addlog, respectively.  There is no trailing space.
990  */
991 void
992 diskerr(struct buf *bp, char *dname, char *what, int pri, int blkdone,
993     struct disklabel *lp)
994 {
995 	int unit = DISKUNIT(bp->b_dev), part = DISKPART(bp->b_dev);
996     	int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2)));
997 	char partname = 'a' + part;
998 	daddr_t sn;
999 
1000 	if (pri != LOG_PRINTF) {
1001 		log(pri, "%s", "");
1002 		pr = addlog;
1003 	} else
1004 		pr = printf;
1005 	(*pr)("%s%d%c: %s %sing fsbn ", dname, unit, partname, what,
1006 	    bp->b_flags & B_READ ? "read" : "writ");
1007 	sn = bp->b_blkno;
1008 	if (bp->b_bcount <= DEV_BSIZE)
1009 		(*pr)("%lld", (long long)sn);
1010 	else {
1011 		if (blkdone >= 0) {
1012 			sn += blkdone;
1013 			(*pr)("%lld of ", (long long)sn);
1014 		}
1015 		(*pr)("%lld-%lld", (long long)bp->b_blkno,
1016 		    (long long)(bp->b_blkno + (bp->b_bcount - 1) / DEV_BSIZE));
1017 	}
1018 	if (lp && (blkdone >= 0 || bp->b_bcount <= lp->d_secsize)) {
1019 		sn += DL_SECTOBLK(lp, DL_GETPOFFSET(&lp->d_partitions[part]));
1020 		(*pr)(" (%s%d bn %lld; cn %lld", dname, unit, (long long)sn,
1021 		    (long long)(sn / DL_SECTOBLK(lp, lp->d_secpercyl)));
1022 		sn %= DL_SECTOBLK(lp, lp->d_secpercyl);
1023 		(*pr)(" tn %lld sn %lld)",
1024 		    (long long)(sn / DL_SECTOBLK(lp, lp->d_nsectors)),
1025 		    (long long)(sn % DL_SECTOBLK(lp, lp->d_nsectors)));
1026 	}
1027 }
1028 
1029 /*
1030  * Initialize the disklist.  Called by main() before autoconfiguration.
1031  */
1032 void
1033 disk_init(void)
1034 {
1035 
1036 	TAILQ_INIT(&disklist);
1037 	disk_count = disk_change = 0;
1038 }
1039 
1040 int
1041 disk_construct(struct disk *diskp)
1042 {
1043 	rw_init(&diskp->dk_lock, "dklk");
1044 	mtx_init(&diskp->dk_mtx, IPL_BIO);
1045 
1046 	diskp->dk_flags |= DKF_CONSTRUCTED;
1047 
1048 	return (0);
1049 }
1050 
1051 /*
1052  * Attach a disk.
1053  */
1054 void
1055 disk_attach(struct device *dv, struct disk *diskp)
1056 {
1057 	int majdev;
1058 
1059 	if (!ISSET(diskp->dk_flags, DKF_CONSTRUCTED))
1060 		disk_construct(diskp);
1061 
1062 	/*
1063 	 * Allocate and initialize the disklabel structures.  Note that
1064 	 * it's not safe to sleep here, since we're probably going to be
1065 	 * called during autoconfiguration.
1066 	 */
1067 	diskp->dk_label = malloc(sizeof(struct disklabel), M_DEVBUF,
1068 	    M_NOWAIT|M_ZERO);
1069 	if (diskp->dk_label == NULL)
1070 		panic("disk_attach: can't allocate storage for disklabel");
1071 
1072 	/*
1073 	 * Set the attached timestamp.
1074 	 */
1075 	microuptime(&diskp->dk_attachtime);
1076 
1077 	/*
1078 	 * Link into the disklist.
1079 	 */
1080 	TAILQ_INSERT_TAIL(&disklist, diskp, dk_link);
1081 	++disk_count;
1082 	disk_change = 1;
1083 
1084 	/*
1085 	 * Store device structure and number for later use.
1086 	 */
1087 	diskp->dk_device = dv;
1088 	diskp->dk_devno = NODEV;
1089 	if (dv != NULL) {
1090 		majdev = findblkmajor(dv);
1091 		if (majdev >= 0)
1092 			diskp->dk_devno =
1093 			    MAKEDISKDEV(majdev, dv->dv_unit, RAW_PART);
1094 	}
1095 	if (diskp->dk_devno != NODEV)
1096 		workq_add_task(NULL, 0, disk_attach_callback,
1097 		    (void *)(long)(diskp->dk_devno), NULL);
1098 
1099 	if (softraid_disk_attach)
1100 		softraid_disk_attach(diskp, 1);
1101 }
1102 
1103 void
1104 disk_attach_callback(void *arg1, void *arg2)
1105 {
1106 	char errbuf[100];
1107 	struct disklabel dl;
1108 	struct disk *dk;
1109 	dev_t dev = (dev_t)(long)arg1;
1110 
1111 	/* Locate disk associated with device no. */
1112 	TAILQ_FOREACH(dk, &disklist, dk_link) {
1113 		if (dk->dk_devno == dev)
1114 			break;
1115 	}
1116 	if (dk == NULL)
1117 		return;
1118 
1119 	/* XXX: Assumes dk is part of the device softc. */
1120 	device_ref(dk->dk_device);
1121 
1122 	if (dk->dk_flags & (DKF_OPENED | DKF_NOLABELREAD))
1123 		goto done;
1124 
1125 	/* Read disklabel. */
1126 	if (disk_readlabel(&dl, dev, errbuf, sizeof(errbuf)) == NULL) {
1127 		add_timer_randomness(dl.d_checksum);
1128 		dk->dk_flags |= DKF_LABELVALID;
1129 	}
1130 
1131 done:
1132 	dk->dk_flags |= DKF_OPENED;
1133 	device_unref(dk->dk_device);
1134 	wakeup(dk);
1135 }
1136 
1137 /*
1138  * Detach a disk.
1139  */
1140 void
1141 disk_detach(struct disk *diskp)
1142 {
1143 
1144 	if (softraid_disk_attach)
1145 		softraid_disk_attach(diskp, -1);
1146 
1147 	/*
1148 	 * Free the space used by the disklabel structures.
1149 	 */
1150 	free(diskp->dk_label, M_DEVBUF, sizeof(*diskp->dk_label));
1151 
1152 	/*
1153 	 * Remove from the disklist.
1154 	 */
1155 	TAILQ_REMOVE(&disklist, diskp, dk_link);
1156 	disk_change = 1;
1157 	if (--disk_count < 0)
1158 		panic("disk_detach: disk_count < 0");
1159 }
1160 
1161 int
1162 disk_openpart(struct disk *dk, int part, int fmt, int haslabel)
1163 {
1164 	KASSERT(part >= 0 && part < MAXPARTITIONS);
1165 
1166 	/* Unless opening the raw partition, check that the partition exists. */
1167 	if (part != RAW_PART && (!haslabel ||
1168 	    part >= dk->dk_label->d_npartitions ||
1169 	    dk->dk_label->d_partitions[part].p_fstype == FS_UNUSED))
1170 		return (ENXIO);
1171 
1172 	/* Ensure the partition doesn't get changed under our feet. */
1173 	switch (fmt) {
1174 	case S_IFCHR:
1175 		dk->dk_copenmask |= (1 << part);
1176 		break;
1177 	case S_IFBLK:
1178 		dk->dk_bopenmask |= (1 << part);
1179 		break;
1180 	}
1181 	dk->dk_openmask = dk->dk_copenmask | dk->dk_bopenmask;
1182 
1183 	return (0);
1184 }
1185 
1186 void
1187 disk_closepart(struct disk *dk, int part, int fmt)
1188 {
1189 	KASSERT(part >= 0 && part < MAXPARTITIONS);
1190 
1191 	switch (fmt) {
1192 	case S_IFCHR:
1193 		dk->dk_copenmask &= ~(1 << part);
1194 		break;
1195 	case S_IFBLK:
1196 		dk->dk_bopenmask &= ~(1 << part);
1197 		break;
1198 	}
1199 	dk->dk_openmask = dk->dk_copenmask | dk->dk_bopenmask;
1200 }
1201 
1202 void
1203 disk_gone(int (*open)(dev_t, int, int, struct proc *), int unit)
1204 {
1205 	int bmaj, cmaj, mn;
1206 
1207 	/* Locate the lowest minor number to be detached. */
1208 	mn = DISKMINOR(unit, 0);
1209 
1210 	for (bmaj = 0; bmaj < nblkdev; bmaj++)
1211 		if (bdevsw[bmaj].d_open == open)
1212 			vdevgone(bmaj, mn, mn + MAXPARTITIONS - 1, VBLK);
1213 	for (cmaj = 0; cmaj < nchrdev; cmaj++)
1214 		if (cdevsw[cmaj].d_open == open)
1215 			vdevgone(cmaj, mn, mn + MAXPARTITIONS - 1, VCHR);
1216 }
1217 
1218 /*
1219  * Increment a disk's busy counter.  If the counter is going from
1220  * 0 to 1, set the timestamp.
1221  */
1222 void
1223 disk_busy(struct disk *diskp)
1224 {
1225 
1226 	/*
1227 	 * XXX We'd like to use something as accurate as microtime(),
1228 	 * but that doesn't depend on the system TOD clock.
1229 	 */
1230 	mtx_enter(&diskp->dk_mtx);
1231 	if (diskp->dk_busy++ == 0)
1232 		microuptime(&diskp->dk_timestamp);
1233 	mtx_leave(&diskp->dk_mtx);
1234 }
1235 
1236 /*
1237  * Decrement a disk's busy counter, increment the byte count, total busy
1238  * time, and reset the timestamp.
1239  */
1240 void
1241 disk_unbusy(struct disk *diskp, long bcount, int read)
1242 {
1243 	struct timeval dv_time, diff_time;
1244 
1245 	mtx_enter(&diskp->dk_mtx);
1246 
1247 	if (diskp->dk_busy-- == 0)
1248 		printf("disk_unbusy: %s: dk_busy < 0\n", diskp->dk_name);
1249 
1250 	microuptime(&dv_time);
1251 
1252 	timersub(&dv_time, &diskp->dk_timestamp, &diff_time);
1253 	timeradd(&diskp->dk_time, &diff_time, &diskp->dk_time);
1254 
1255 	diskp->dk_timestamp = dv_time;
1256 	if (bcount > 0) {
1257 		if (read) {
1258 			diskp->dk_rbytes += bcount;
1259 			diskp->dk_rxfer++;
1260 		} else {
1261 			diskp->dk_wbytes += bcount;
1262 			diskp->dk_wxfer++;
1263 		}
1264 	} else
1265 		diskp->dk_seek++;
1266 
1267 	mtx_leave(&diskp->dk_mtx);
1268 
1269 	add_disk_randomness(bcount ^ diff_time.tv_usec);
1270 }
1271 
1272 int
1273 disk_lock(struct disk *dk)
1274 {
1275 	return (rw_enter(&dk->dk_lock, RW_WRITE|RW_INTR));
1276 }
1277 
1278 void
1279 disk_lock_nointr(struct disk *dk)
1280 {
1281 	rw_enter_write(&dk->dk_lock);
1282 }
1283 
1284 void
1285 disk_unlock(struct disk *dk)
1286 {
1287 	rw_exit_write(&dk->dk_lock);
1288 }
1289 
1290 int
1291 dk_mountroot(void)
1292 {
1293 	char errbuf[100];
1294 	int part = DISKPART(rootdev);
1295 	int (*mountrootfn)(void);
1296 	struct disklabel dl;
1297 	char *error;
1298 
1299 	error = disk_readlabel(&dl, rootdev, errbuf, sizeof(errbuf));
1300 	if (error)
1301 		panic("%s", error);
1302 
1303 	if (DL_GETPSIZE(&dl.d_partitions[part]) == 0)
1304 		panic("root filesystem has size 0");
1305 	switch (dl.d_partitions[part].p_fstype) {
1306 #ifdef EXT2FS
1307 	case FS_EXT2FS:
1308 		{
1309 		extern int ext2fs_mountroot(void);
1310 		mountrootfn = ext2fs_mountroot;
1311 		}
1312 		break;
1313 #endif
1314 #ifdef FFS
1315 	case FS_BSDFFS:
1316 		{
1317 		extern int ffs_mountroot(void);
1318 		mountrootfn = ffs_mountroot;
1319 		}
1320 		break;
1321 #endif
1322 #ifdef CD9660
1323 	case FS_ISO9660:
1324 		{
1325 		extern int cd9660_mountroot(void);
1326 		mountrootfn = cd9660_mountroot;
1327 		}
1328 		break;
1329 #endif
1330 	default:
1331 #ifdef FFS
1332 		{
1333 		extern int ffs_mountroot(void);
1334 
1335 		printf("filesystem type %d not known.. assuming ffs\n",
1336 		    dl.d_partitions[part].p_fstype);
1337 		mountrootfn = ffs_mountroot;
1338 		}
1339 #else
1340 		panic("disk 0x%x filesystem type %d not known",
1341 		    rootdev, dl.d_partitions[part].p_fstype);
1342 #endif
1343 	}
1344 	return (*mountrootfn)();
1345 }
1346 
1347 struct device *
1348 getdisk(char *str, int len, int defpart, dev_t *devp)
1349 {
1350 	struct device *dv;
1351 
1352 	if ((dv = parsedisk(str, len, defpart, devp)) == NULL) {
1353 		printf("use one of: exit");
1354 		TAILQ_FOREACH(dv, &alldevs, dv_list) {
1355 			if (dv->dv_class == DV_DISK)
1356 				printf(" %s[a-p]", dv->dv_xname);
1357 #if defined(NFSCLIENT)
1358 			if (dv->dv_class == DV_IFNET)
1359 				printf(" %s", dv->dv_xname);
1360 #endif
1361 		}
1362 		printf("\n");
1363 	}
1364 	return (dv);
1365 }
1366 
1367 struct device *
1368 parsedisk(char *str, int len, int defpart, dev_t *devp)
1369 {
1370 	struct device *dv;
1371 	int majdev, part = defpart;
1372 	char c;
1373 
1374 	if (len == 0)
1375 		return (NULL);
1376 	c = str[len-1];
1377 	if (c >= 'a' && (c - 'a') < MAXPARTITIONS) {
1378 		part = c - 'a';
1379 		len -= 1;
1380 	}
1381 
1382 	TAILQ_FOREACH(dv, &alldevs, dv_list) {
1383 		if (dv->dv_class == DV_DISK &&
1384 		    strncmp(str, dv->dv_xname, len) == 0 &&
1385 		    dv->dv_xname[len] == '\0') {
1386 			majdev = findblkmajor(dv);
1387 			if (majdev < 0)
1388 				return NULL;
1389 			*devp = MAKEDISKDEV(majdev, dv->dv_unit, part);
1390 			break;
1391 		}
1392 #if defined(NFSCLIENT)
1393 		if (dv->dv_class == DV_IFNET &&
1394 		    strncmp(str, dv->dv_xname, len) == 0 &&
1395 		    dv->dv_xname[len] == '\0') {
1396 			*devp = NODEV;
1397 			break;
1398 		}
1399 #endif
1400 	}
1401 
1402 	return (dv);
1403 }
1404 
1405 void
1406 setroot(struct device *bootdv, int part, int exitflags)
1407 {
1408 	int majdev, unit, len, s, slept = 0;
1409 	struct swdevt *swp;
1410 	struct device *rootdv, *dv;
1411 	dev_t nrootdev, nswapdev = NODEV, temp = NODEV;
1412 	struct ifnet *ifp = NULL;
1413 	struct disk *dk;
1414 	u_char duid[8];
1415 	char buf[128];
1416 #if defined(NFSCLIENT)
1417 	extern char *nfsbootdevname;
1418 #endif
1419 
1420 	/* Ensure that all disk attach callbacks have completed. */
1421 	do {
1422 		TAILQ_FOREACH(dk, &disklist, dk_link) {
1423 			if (dk->dk_devno != NODEV &&
1424 			    (dk->dk_flags & DKF_OPENED) == 0) {
1425 				tsleep(dk, 0, "dkopen", hz);
1426 				slept++;
1427 				break;
1428 			}
1429 		}
1430 	} while (dk != NULL && slept < 5);
1431 
1432 	if (slept == 5) {
1433 		printf("disklabels not read:");
1434 		TAILQ_FOREACH(dk, &disklist, dk_link)
1435 			if (dk->dk_devno != NODEV &&
1436 			    (dk->dk_flags & DKF_OPENED) == 0)
1437 				printf(" %s", dk->dk_name);
1438 		printf("\n");
1439 	}
1440 
1441 	/* Locate DUID for boot disk if not already provided. */
1442 	memset(duid, 0, sizeof(duid));
1443 	if (memcmp(bootduid, duid, sizeof(bootduid)) == 0) {
1444 		TAILQ_FOREACH(dk, &disklist, dk_link)
1445 			if (dk->dk_device == bootdv)
1446 				break;
1447 		if (dk && (dk->dk_flags & DKF_LABELVALID))
1448 			bcopy(dk->dk_label->d_uid, bootduid, sizeof(bootduid));
1449 	}
1450 	bcopy(bootduid, rootduid, sizeof(rootduid));
1451 
1452 #if NSOFTRAID > 0
1453 	sr_map_root();
1454 #endif
1455 
1456 	/*
1457 	 * If `swap generic' and we couldn't determine boot device,
1458 	 * ask the user.
1459 	 */
1460 	dk = NULL;
1461 	if (mountroot == NULL && bootdv == NULL)
1462 		boothowto |= RB_ASKNAME;
1463 	if (boothowto & RB_ASKNAME) {
1464 		while (1) {
1465 			printf("root device");
1466 			if (bootdv != NULL) {
1467 				printf(" (default %s", bootdv->dv_xname);
1468 				if (bootdv->dv_class == DV_DISK)
1469 					printf("%c", 'a' + part);
1470 				printf(")");
1471 			}
1472 			printf(": ");
1473 			s = splhigh();
1474 			cnpollc(TRUE);
1475 			len = getsn(buf, sizeof(buf));
1476 			cnpollc(FALSE);
1477 			splx(s);
1478 			if (strcmp(buf, "exit") == 0)
1479 				reboot(exitflags);
1480 			if (len == 0 && bootdv != NULL) {
1481 				strlcpy(buf, bootdv->dv_xname, sizeof buf);
1482 				len = strlen(buf);
1483 			}
1484 			if (len > 0 && buf[len - 1] == '*') {
1485 				buf[--len] = '\0';
1486 				dv = getdisk(buf, len, part, &nrootdev);
1487 				if (dv != NULL) {
1488 					rootdv = dv;
1489 					nswapdev = nrootdev;
1490 					goto gotswap;
1491 				}
1492 			}
1493 			dv = getdisk(buf, len, part, &nrootdev);
1494 			if (dv != NULL) {
1495 				rootdv = dv;
1496 				break;
1497 			}
1498 		}
1499 
1500 		if (rootdv->dv_class == DV_IFNET)
1501 			goto gotswap;
1502 
1503 		/* try to build swap device out of new root device */
1504 		while (1) {
1505 			printf("swap device");
1506 			if (rootdv != NULL)
1507 				printf(" (default %s%s)", rootdv->dv_xname,
1508 				    rootdv->dv_class == DV_DISK ? "b" : "");
1509 			printf(": ");
1510 			s = splhigh();
1511 			cnpollc(TRUE);
1512 			len = getsn(buf, sizeof(buf));
1513 			cnpollc(FALSE);
1514 			splx(s);
1515 			if (strcmp(buf, "exit") == 0)
1516 				reboot(exitflags);
1517 			if (len == 0 && rootdv != NULL) {
1518 				switch (rootdv->dv_class) {
1519 				case DV_IFNET:
1520 					nswapdev = NODEV;
1521 					break;
1522 				case DV_DISK:
1523 					nswapdev = MAKEDISKDEV(major(nrootdev),
1524 					    DISKUNIT(nrootdev), 1);
1525 					if (nswapdev == nrootdev)
1526 						continue;
1527 					break;
1528 				default:
1529 					break;
1530 				}
1531 				break;
1532 			}
1533 			dv = getdisk(buf, len, 1, &nswapdev);
1534 			if (dv) {
1535 				if (dv->dv_class == DV_IFNET)
1536 					nswapdev = NODEV;
1537 				if (nswapdev == nrootdev)
1538 					continue;
1539 				break;
1540 			}
1541 		}
1542 gotswap:
1543 		rootdev = nrootdev;
1544 		dumpdev = nswapdev;
1545 		swdevt[0].sw_dev = nswapdev;
1546 		swdevt[1].sw_dev = NODEV;
1547 #if defined(NFSCLIENT)
1548 	} else if (mountroot == nfs_mountroot) {
1549 		rootdv = bootdv;
1550 		rootdev = dumpdev = swapdev = NODEV;
1551 #endif
1552 	} else if (mountroot == NULL && rootdev == NODEV) {
1553 		/*
1554 		 * `swap generic'
1555 		 */
1556 		rootdv = bootdv;
1557 
1558 		if (bootdv->dv_class == DV_DISK) {
1559 			memset(&duid, 0, sizeof(duid));
1560 			if (memcmp(rootduid, &duid, sizeof(rootduid)) != 0) {
1561 				TAILQ_FOREACH(dk, &disklist, dk_link)
1562 					if ((dk->dk_flags & DKF_LABELVALID) &&
1563 					    dk->dk_label && memcmp(dk->dk_label->d_uid,
1564 					    &rootduid, sizeof(rootduid)) == 0)
1565 						break;
1566 				if (dk == NULL)
1567 					panic("root device (%02hx%02hx%02hx%02hx"
1568 					    "%02hx%02hx%02hx%02hx) not found",
1569 					    rootduid[0], rootduid[1], rootduid[2],
1570 					    rootduid[3], rootduid[4], rootduid[5],
1571 					    rootduid[6], rootduid[7]);
1572 				rootdv = dk->dk_device;
1573 			}
1574 		}
1575 
1576 		majdev = findblkmajor(rootdv);
1577 		if (majdev >= 0) {
1578 			/*
1579 			 * Root and swap are on the disk.
1580 			 * Assume swap is on partition b.
1581 			 */
1582 			rootdev = MAKEDISKDEV(majdev, rootdv->dv_unit, part);
1583 			nswapdev = MAKEDISKDEV(majdev, rootdv->dv_unit, 1);
1584 		} else {
1585 			/*
1586 			 * Root and swap are on a net.
1587 			 */
1588 			nswapdev = NODEV;
1589 		}
1590 		dumpdev = nswapdev;
1591 		swdevt[0].sw_dev = nswapdev;
1592 		/* swdevt[1].sw_dev = NODEV; */
1593 	} else {
1594 		/* Completely pre-configured, but we want rootdv .. */
1595 		majdev = major(rootdev);
1596 		if (findblkname(majdev) == NULL)
1597 			return;
1598 		unit = DISKUNIT(rootdev);
1599 		part = DISKPART(rootdev);
1600 		snprintf(buf, sizeof buf, "%s%d%c",
1601 		    findblkname(majdev), unit, 'a' + part);
1602 		rootdv = parsedisk(buf, strlen(buf), 0, &nrootdev);
1603 		if (rootdv == NULL)
1604 			panic("root device (%s) not found", buf);
1605 	}
1606 
1607 	if (rootdv && rootdv == bootdv && rootdv->dv_class == DV_IFNET)
1608 		ifp = ifunit(rootdv->dv_xname);
1609 	else if (bootdv && bootdv->dv_class == DV_IFNET)
1610 		ifp = ifunit(bootdv->dv_xname);
1611 
1612 	if (ifp)
1613 		if_addgroup(ifp, "netboot");
1614 
1615 	switch (rootdv->dv_class) {
1616 #if defined(NFSCLIENT)
1617 	case DV_IFNET:
1618 		mountroot = nfs_mountroot;
1619 		nfsbootdevname = rootdv->dv_xname;
1620 		return;
1621 #endif
1622 	case DV_DISK:
1623 		mountroot = dk_mountroot;
1624 		part = DISKPART(rootdev);
1625 		break;
1626 	default:
1627 		printf("can't figure root, hope your kernel is right\n");
1628 		return;
1629 	}
1630 
1631 	printf("root on %s%c", rootdv->dv_xname, 'a' + part);
1632 
1633 	if (dk && dk->dk_device == rootdv)
1634 		printf(" (%02hx%02hx%02hx%02hx%02hx%02hx%02hx%02hx.%c)",
1635 		    rootduid[0], rootduid[1], rootduid[2], rootduid[3],
1636 		    rootduid[4], rootduid[5], rootduid[6], rootduid[7],
1637 		    'a' + part);
1638 
1639 	/*
1640 	 * Make the swap partition on the root drive the primary swap.
1641 	 */
1642 	for (swp = swdevt; swp->sw_dev != NODEV; swp++) {
1643 		if (major(rootdev) == major(swp->sw_dev) &&
1644 		    DISKUNIT(rootdev) == DISKUNIT(swp->sw_dev)) {
1645 			temp = swdevt[0].sw_dev;
1646 			swdevt[0].sw_dev = swp->sw_dev;
1647 			swp->sw_dev = temp;
1648 			break;
1649 		}
1650 	}
1651 	if (swp->sw_dev != NODEV) {
1652 		/*
1653 		 * If dumpdev was the same as the old primary swap device,
1654 		 * move it to the new primary swap device.
1655 		 */
1656 		if (temp == dumpdev)
1657 			dumpdev = swdevt[0].sw_dev;
1658 	}
1659 	if (swdevt[0].sw_dev != NODEV)
1660 		printf(" swap on %s%d%c", findblkname(major(swdevt[0].sw_dev)),
1661 		    DISKUNIT(swdevt[0].sw_dev),
1662 		    'a' + DISKPART(swdevt[0].sw_dev));
1663 	if (dumpdev != NODEV)
1664 		printf(" dump on %s%d%c", findblkname(major(dumpdev)),
1665 		    DISKUNIT(dumpdev), 'a' + DISKPART(dumpdev));
1666 	printf("\n");
1667 }
1668 
1669 extern struct nam2blk nam2blk[];
1670 
1671 int
1672 findblkmajor(struct device *dv)
1673 {
1674 	char buf[16], *p;
1675 	int i;
1676 
1677 	if (strlcpy(buf, dv->dv_xname, sizeof buf) >= sizeof buf)
1678 		return (-1);
1679 	for (p = buf; *p; p++)
1680 		if (*p >= '0' && *p <= '9')
1681 			*p = '\0';
1682 
1683 	for (i = 0; nam2blk[i].name; i++)
1684 		if (!strcmp(buf, nam2blk[i].name))
1685 			return (nam2blk[i].maj);
1686 	return (-1);
1687 }
1688 
1689 char *
1690 findblkname(int maj)
1691 {
1692 	int i;
1693 
1694 	for (i = 0; nam2blk[i].name; i++)
1695 		if (nam2blk[i].maj == maj)
1696 			return (nam2blk[i].name);
1697 	return (NULL);
1698 }
1699 
1700 char *
1701 disk_readlabel(struct disklabel *dl, dev_t dev, char *errbuf, size_t errsize)
1702 {
1703 	struct vnode *vn;
1704 	dev_t chrdev, rawdev;
1705 	int error;
1706 
1707 	chrdev = blktochr(dev);
1708 	rawdev = MAKEDISKDEV(major(chrdev), DISKUNIT(chrdev), RAW_PART);
1709 
1710 #ifdef DEBUG
1711 	printf("dev=0x%x chrdev=0x%x rawdev=0x%x\n", dev, chrdev, rawdev);
1712 #endif
1713 
1714 	if (cdevvp(rawdev, &vn)) {
1715 		snprintf(errbuf, errsize,
1716 		    "cannot obtain vnode for 0x%x/0x%x", dev, rawdev);
1717 		return (errbuf);
1718 	}
1719 
1720 	error = VOP_OPEN(vn, FREAD, NOCRED, curproc);
1721 	if (error) {
1722 		snprintf(errbuf, errsize,
1723 		    "cannot open disk, 0x%x/0x%x, error %d",
1724 		    dev, rawdev, error);
1725 		goto done;
1726 	}
1727 
1728 	error = VOP_IOCTL(vn, DIOCGDINFO, (caddr_t)dl, FREAD, NOCRED, curproc);
1729 	if (error) {
1730 		snprintf(errbuf, errsize,
1731 		    "cannot read disk label, 0x%x/0x%x, error %d",
1732 		    dev, rawdev, error);
1733 	}
1734 done:
1735 	VOP_CLOSE(vn, FREAD, NOCRED, curproc);
1736 	vput(vn);
1737 	if (error)
1738 		return (errbuf);
1739 	return (NULL);
1740 }
1741 
1742 int
1743 disk_map(char *path, char *mappath, int size, int flags)
1744 {
1745 	struct disk *dk, *mdk;
1746 	u_char uid[8];
1747 	char c, part;
1748 	int i;
1749 
1750 	/*
1751 	 * Attempt to map a request for a disklabel UID to the correct device.
1752 	 * We should be supplied with a disklabel UID which has the following
1753 	 * format:
1754 	 *
1755 	 * [disklabel uid] . [partition]
1756 	 *
1757 	 * Alternatively, if the DM_OPENPART flag is set the disklabel UID can
1758 	 * based passed on its own.
1759 	 */
1760 
1761 	if (strchr(path, '/') != NULL)
1762 		return -1;
1763 
1764 	/* Verify that the device name is properly formed. */
1765 	if (!((strlen(path) == 16 && (flags & DM_OPENPART)) ||
1766 	    (strlen(path) == 18 && path[16] == '.')))
1767 		return -1;
1768 
1769 	/* Get partition. */
1770 	if (flags & DM_OPENPART)
1771 		part = 'a' + RAW_PART;
1772 	else
1773 		part = path[17];
1774 
1775 	if (part < 'a' || part >= 'a' + MAXPARTITIONS)
1776 		return -1;
1777 
1778 	/* Derive label UID. */
1779 	memset(uid, 0, sizeof(uid));
1780 	for (i = 0; i < 16; i++) {
1781 		c = path[i];
1782 		if (c >= '0' && c <= '9')
1783 			c -= '0';
1784 		else if (c >= 'a' && c <= 'f')
1785 			c -= ('a' - 10);
1786                 else
1787 			return -1;
1788 
1789 		uid[i / 2] <<= 4;
1790 		uid[i / 2] |= c & 0xf;
1791 	}
1792 
1793 	mdk = NULL;
1794 	TAILQ_FOREACH(dk, &disklist, dk_link) {
1795 		if ((dk->dk_flags & DKF_LABELVALID) && dk->dk_label &&
1796 		    memcmp(dk->dk_label->d_uid, uid,
1797 		    sizeof(dk->dk_label->d_uid)) == 0) {
1798 			/* Fail if there are duplicate UIDs! */
1799 			if (mdk != NULL)
1800 				return -1;
1801 			mdk = dk;
1802 		}
1803 	}
1804 
1805 	if (mdk == NULL || mdk->dk_name == NULL)
1806 		return -1;
1807 
1808 	snprintf(mappath, size, "/dev/%s%s%c",
1809 	    (flags & DM_OPENBLCK) ? "" : "r", mdk->dk_name, part);
1810 
1811 	return 0;
1812 }
1813 
1814 /*
1815  * Lookup a disk device and verify that it has completed attaching.
1816  */
1817 struct device *
1818 disk_lookup(struct cfdriver *cd, int unit)
1819 {
1820 	struct device *dv;
1821 	struct disk *dk;
1822 
1823 	dv = device_lookup(cd, unit);
1824 	if (dv == NULL)
1825 		return (NULL);
1826 
1827 	TAILQ_FOREACH(dk, &disklist, dk_link)
1828 		if (dk->dk_device == dv)
1829 			break;
1830 
1831 	if (dk == NULL) {
1832 		device_unref(dv);
1833 		return (NULL);
1834 	}
1835 
1836 	return (dv);
1837 }
1838