xref: /openbsd-src/sys/kern/subr_disk.c (revision 11efff7f3ac2b3cfeff0c0cddc14294d9b3aca4f)
1 /*	$OpenBSD: subr_disk.c,v 1.29 2004/12/26 21:22:13 miod Exp $	*/
2 /*	$NetBSD: subr_disk.c,v 1.17 1996/03/16 23:17:08 christos Exp $	*/
3 
4 /*
5  * Copyright (c) 1995 Jason R. Thorpe.  All rights reserved.
6  * Copyright (c) 1982, 1986, 1988, 1993
7  *	The Regents of the University of California.  All rights reserved.
8  * (c) UNIX System Laboratories, Inc.
9  * All or some portions of this file are derived from material licensed
10  * to the University of California by American Telephone and Telegraph
11  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
12  * the permission of UNIX System Laboratories, Inc.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  * 3. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)ufs_disksubr.c	8.5 (Berkeley) 1/21/94
39  */
40 
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/kernel.h>
44 #include <sys/malloc.h>
45 #include <sys/fcntl.h>
46 #include <sys/buf.h>
47 #include <sys/stat.h>
48 #include <sys/syslog.h>
49 #include <sys/time.h>
50 #include <sys/disklabel.h>
51 #include <sys/conf.h>
52 #include <sys/lock.h>
53 #include <sys/disk.h>
54 #include <sys/dkio.h>
55 #include <sys/dkstat.h>		/* XXX */
56 #include <sys/proc.h>
57 
58 #include <dev/rndvar.h>
59 
60 /*
61  * A global list of all disks attached to the system.  May grow or
62  * shrink over time.
63  */
/*
 * disklist is initialized by disk_init(); disk_attach() and disk_detach()
 * add and remove entries and keep disk_count and disk_change up to date.
 */
struct	disklist_head disklist;	/* TAILQ_HEAD */
int	disk_count;		/* number of drives in global disklist */
int	disk_change;		/* set if a disk has been attached/detached
				 * since last we looked at this variable. This
				 * is reset by hw_sysctl()
				 */
70 
71 /*
72  * Seek sort for disks.  We depend on the driver which calls us using b_resid
73  * as the current cylinder number.
74  *
75  * The argument ap structure holds a b_actf activity chain pointer on which we
76  * keep two queues, sorted in ascending cylinder order.  The first queue holds
77  * those requests which are positioned after the current cylinder (in the first
78  * request); the second holds requests which came in after their cylinder number
79  * was passed.  Thus we implement a one way scan, retracting after reaching the
80  * end of the drive to the first request on the second queue, at which time it
81  * becomes the first queue.
82  *
83  * A one-way scan is natural because of the way UNIX read-ahead blocks are
84  * allocated.
85  */
86 
87 void
88 disksort(ap, bp)
89 	register struct buf *ap, *bp;
90 {
91 	register struct buf *bq;
92 
93 	/* If the queue is empty, then it's easy. */
94 	if (ap->b_actf == NULL) {
95 		bp->b_actf = NULL;
96 		ap->b_actf = bp;
97 		return;
98 	}
99 
100 	/*
101 	 * If we lie after the first (currently active) request, then we
102 	 * must locate the second request list and add ourselves to it.
103 	 */
104 	bq = ap->b_actf;
105 	if (bp->b_cylinder < bq->b_cylinder) {
106 		while (bq->b_actf) {
107 			/*
108 			 * Check for an ``inversion'' in the normally ascending
109 			 * cylinder numbers, indicating the start of the second
110 			 * request list.
111 			 */
112 			if (bq->b_actf->b_cylinder < bq->b_cylinder) {
113 				/*
114 				 * Search the second request list for the first
115 				 * request at a larger cylinder number.  We go
116 				 * before that; if there is no such request, we
117 				 * go at end.
118 				 */
119 				do {
120 					if (bp->b_cylinder <
121 					    bq->b_actf->b_cylinder)
122 						goto insert;
123 					if (bp->b_cylinder ==
124 					    bq->b_actf->b_cylinder &&
125 					    bp->b_blkno < bq->b_actf->b_blkno)
126 						goto insert;
127 					bq = bq->b_actf;
128 				} while (bq->b_actf);
129 				goto insert;		/* after last */
130 			}
131 			bq = bq->b_actf;
132 		}
133 		/*
134 		 * No inversions... we will go after the last, and
135 		 * be the first request in the second request list.
136 		 */
137 		goto insert;
138 	}
139 	/*
140 	 * Request is at/after the current request...
141 	 * sort in the first request list.
142 	 */
143 	while (bq->b_actf) {
144 		/*
145 		 * We want to go after the current request if there is an
146 		 * inversion after it (i.e. it is the end of the first
147 		 * request list), or if the next request is a larger cylinder
148 		 * than our request.
149 		 */
150 		if (bq->b_actf->b_cylinder < bq->b_cylinder ||
151 		    bp->b_cylinder < bq->b_actf->b_cylinder ||
152 		    (bp->b_cylinder == bq->b_actf->b_cylinder &&
153 		    bp->b_blkno < bq->b_actf->b_blkno))
154 			goto insert;
155 		bq = bq->b_actf;
156 	}
157 	/*
158 	 * Neither a second list nor a larger request... we go at the end of
159 	 * the first list, which is the same as the end of the whole schebang.
160 	 */
161 insert:	bp->b_actf = bq->b_actf;
162 	bq->b_actf = bp;
163 }
164 
165 /*
166  * Compute checksum for disk label.
167  */
168 u_int
169 dkcksum(lp)
170 	register struct disklabel *lp;
171 {
172 	register u_int16_t *start, *end;
173 	register u_int16_t sum = 0;
174 
175 	start = (u_int16_t *)lp;
176 	end = (u_int16_t *)&lp->d_partitions[lp->d_npartitions];
177 	while (start < end)
178 		sum ^= *start++;
179 	return (sum);
180 }
181 
182 /*
183  * Disk error is the preface to plaintive error messages
184  * about failing disk transfers.  It prints messages of the form
185 
hp0g: hard error reading fsbn 12345 of 12344-12347 (hp0 bn %d; cn %d tn %d sn %d)
187 
188  * if the offset of the error in the transfer and a disk label
189  * are both available.  blkdone should be -1 if the position of the error
190  * is unknown; the disklabel pointer may be null from drivers that have not
191  * been converted to use them.  The message is printed with printf
192  * if pri is LOG_PRINTF, otherwise it uses log at the specified priority.
193  * The message should be completed (with at least a newline) with printf
194  * or addlog, respectively.  There is no trailing space.
195  */
196 void
197 diskerr(bp, dname, what, pri, blkdone, lp)
198 	register struct buf *bp;
199 	char *dname, *what;
200 	int pri, blkdone;
201 	register struct disklabel *lp;
202 {
203 	int unit = DISKUNIT(bp->b_dev), part = DISKPART(bp->b_dev);
204 	register int (*pr)(const char *, ...);
205 	char partname = 'a' + part;
206 	int sn;
207 
208 	if (pri != LOG_PRINTF) {
209 		static const char fmt[] = "";
210 		log(pri, fmt);
211 		pr = addlog;
212 	} else
213 		pr = printf;
214 	(*pr)("%s%d%c: %s %sing fsbn ", dname, unit, partname, what,
215 	    bp->b_flags & B_READ ? "read" : "writ");
216 	sn = bp->b_blkno;
217 	if (bp->b_bcount <= DEV_BSIZE)
218 		(*pr)("%d", sn);
219 	else {
220 		if (blkdone >= 0) {
221 			sn += blkdone;
222 			(*pr)("%d of ", sn);
223 		}
224 		(*pr)("%d-%d", bp->b_blkno,
225 		    bp->b_blkno + (bp->b_bcount - 1) / DEV_BSIZE);
226 	}
227 	if (lp && (blkdone >= 0 || bp->b_bcount <= lp->d_secsize)) {
228 		sn += lp->d_partitions[part].p_offset;
229 		(*pr)(" (%s%d bn %d; cn %d", dname, unit, sn,
230 		    sn / lp->d_secpercyl);
231 		sn %= lp->d_secpercyl;
232 		(*pr)(" tn %d sn %d)", sn / lp->d_nsectors, sn % lp->d_nsectors);
233 	}
234 }
235 
236 /*
237  * Initialize the disklist.  Called by main() before autoconfiguration.
238  */
239 void
240 disk_init()
241 {
242 
243 	TAILQ_INIT(&disklist);
244 	disk_count = disk_change = 0;
245 }
246 
247 /*
248  * Searches the disklist for the disk corresponding to the
249  * name provided.
250  */
251 struct disk *
252 disk_find(name)
253 	char *name;
254 {
255 	struct disk *diskp;
256 
257 	if ((name == NULL) || (disk_count <= 0))
258 		return (NULL);
259 
260 	TAILQ_FOREACH(diskp, &disklist, dk_link)
261 		if (strcmp(diskp->dk_name, name) == 0)
262 			return (diskp);
263 
264 	return (NULL);
265 }
266 
267 int
268 disk_construct(diskp, lockname)
269 	struct disk *diskp;
270 	char *lockname;
271 {
272 	lockinit(&diskp->dk_lock, PRIBIO | PCATCH, lockname,
273 		 0, LK_CANRECURSE);
274 
275 	diskp->dk_flags |= DKF_CONSTRUCTED;
276 
277 	return (0);
278 }
279 
280 /*
281  * Attach a disk.
282  */
283 void
284 disk_attach(diskp)
285 	struct disk *diskp;
286 {
287 
288 	if (!diskp->dk_flags & DKF_CONSTRUCTED)
289 		disk_construct(diskp, diskp->dk_name);
290 
291 	/*
292 	 * Allocate and initialize the disklabel structures.  Note that
293 	 * it's not safe to sleep here, since we're probably going to be
294 	 * called during autoconfiguration.
295 	 */
296 	diskp->dk_label = malloc(sizeof(struct disklabel), M_DEVBUF, M_NOWAIT);
297 	diskp->dk_cpulabel = malloc(sizeof(struct cpu_disklabel), M_DEVBUF,
298 	    M_NOWAIT);
299 	if ((diskp->dk_label == NULL) || (diskp->dk_cpulabel == NULL))
300 		panic("disk_attach: can't allocate storage for disklabel");
301 
302 	bzero(diskp->dk_label, sizeof(struct disklabel));
303 	bzero(diskp->dk_cpulabel, sizeof(struct cpu_disklabel));
304 
305 	/*
306 	 * Set the attached timestamp.
307 	 */
308 	microuptime(&diskp->dk_attachtime);
309 
310 	/*
311 	 * Link into the disklist.
312 	 */
313 	TAILQ_INSERT_TAIL(&disklist, diskp, dk_link);
314 	++disk_count;
315 	disk_change = 1;
316 }
317 
318 /*
319  * Detach a disk.
320  */
321 void
322 disk_detach(diskp)
323 	struct disk *diskp;
324 {
325 
326 	/*
327 	 * Free the space used by the disklabel structures.
328 	 */
329 	free(diskp->dk_label, M_DEVBUF);
330 	free(diskp->dk_cpulabel, M_DEVBUF);
331 
332 	/*
333 	 * Remove from the disklist.
334 	 */
335 	TAILQ_REMOVE(&disklist, diskp, dk_link);
336 	disk_change = 1;
337 	if (--disk_count < 0)
338 		panic("disk_detach: disk_count < 0");
339 }
340 
341 /*
342  * Increment a disk's busy counter.  If the counter is going from
343  * 0 to 1, set the timestamp.
344  */
345 void
346 disk_busy(diskp)
347 	struct disk *diskp;
348 {
349 
350 	/*
351 	 * XXX We'd like to use something as accurate as microtime(),
352 	 * but that doesn't depend on the system TOD clock.
353 	 */
354 	if (diskp->dk_busy++ == 0) {
355 		microuptime(&diskp->dk_timestamp);
356 	}
357 }
358 
359 /*
360  * Decrement a disk's busy counter, increment the byte count, total busy
361  * time, and reset the timestamp.
362  */
363 void
364 disk_unbusy(diskp, bcount, read)
365 	struct disk *diskp;
366 	long bcount;
367 	int read;
368 {
369 	struct timeval dv_time, diff_time;
370 
371 	if (diskp->dk_busy-- == 0)
372 		printf("disk_unbusy: %s: dk_busy < 0\n", diskp->dk_name);
373 
374 	microuptime(&dv_time);
375 
376 	timersub(&dv_time, &diskp->dk_timestamp, &diff_time);
377 	timeradd(&diskp->dk_time, &diff_time, &diskp->dk_time);
378 
379 	diskp->dk_timestamp = dv_time;
380 	if (bcount > 0) {
381 		if (read) {
382 			diskp->dk_rbytes += bcount;
383 			diskp->dk_rxfer++;
384 		} else {
385 			diskp->dk_wbytes += bcount;
386 			diskp->dk_wxfer++;
387 		}
388 	} else
389 		diskp->dk_seek++;
390 
391 	add_disk_randomness(bcount ^ diff_time.tv_usec);
392 }
393 
394 
395 int
396 disk_lock(dk)
397 	struct disk *dk;
398 {
399 	int error;
400 
401 	error = lockmgr(&dk->dk_lock, LK_EXCLUSIVE, 0, curproc);
402 
403 	return (error);
404 }
405 
406 void
407 disk_unlock(dk)
408 	struct disk *dk;
409 {
410 	lockmgr(&dk->dk_lock, LK_RELEASE, 0, curproc);
411 }
412 
413 
/*
 * Reset the metrics counters on the given disk.  Note that we cannot
 * reset the busy counter, as that may cause the count to go negative
 * in disk_unbusy().  We also must avoid playing with the timestamp
 * information, as it may skew any pending transfer results.
 */
419  */
420 void
421 disk_resetstat(diskp)
422 	struct disk *diskp;
423 {
424 	int s = splbio();
425 
426 	diskp->dk_rxfer = 0;
427 	diskp->dk_rbytes = 0;
428 	diskp->dk_wxfer = 0;
429 	diskp->dk_wbytes = 0;
430 	diskp->dk_seek = 0;
431 
432 	microuptime(&diskp->dk_attachtime);
433 
434 	timerclear(&diskp->dk_time);
435 
436 	splx(s);
437 }
438 
439 
440 int
441 dk_mountroot()
442 {
443 	dev_t rawdev, rrootdev;
444 	int part = DISKPART(rootdev);
445 	int (*mountrootfn)(void);
446 	struct disklabel dl;
447 	int error;
448 
449 	rrootdev = blktochr(rootdev);
450 	rawdev = MAKEDISKDEV(major(rrootdev), DISKUNIT(rootdev), RAW_PART);
451 	printf("rootdev=0x%x rrootdev=0x%x rawdev=0x%x\n", rootdev,
452 	    rrootdev, rawdev);
453 
454 	/*
455 	 * open device, ioctl for the disklabel, and close it.
456 	 */
457 	error = (cdevsw[major(rrootdev)].d_open)(rawdev, FREAD,
458 	    S_IFCHR, curproc);
459 	if (error)
460 		panic("cannot open disk, 0x%x/0x%x, error %d",
461 		    rootdev, rrootdev, error);
462 	error = (cdevsw[major(rrootdev)].d_ioctl)(rawdev, DIOCGDINFO,
463 	    (caddr_t)&dl, FREAD, curproc);
464 	if (error)
465 		panic("cannot read disk label, 0x%x/0x%x, error %d",
466 		    rootdev, rrootdev, error);
467 	(void) (cdevsw[major(rrootdev)].d_close)(rawdev, FREAD,
468 	    S_IFCHR, curproc);
469 
470 	if (dl.d_partitions[part].p_size == 0)
471 		panic("root filesystem has size 0");
472 	switch (dl.d_partitions[part].p_fstype) {
473 #ifdef EXT2FS
474 	case FS_EXT2FS:
475 		{
476 		extern int ext2fs_mountroot(void);
477 		mountrootfn = ext2fs_mountroot;
478 		}
479 		break;
480 #endif
481 #ifdef FFS
482 	case FS_BSDFFS:
483 		{
484 		extern int ffs_mountroot(void);
485 		mountrootfn = ffs_mountroot;
486 		}
487 		break;
488 #endif
489 #ifdef LFS
490 	case FS_BSDLFS:
491 		{
492 		extern int lfs_mountroot(void);
493 		mountrootfn = lfs_mountroot;
494 		}
495 		break;
496 #endif
497 #ifdef CD9660
498 	case FS_ISO9660:
499 		{
500 		extern int cd9660_mountroot(void);
501 		mountrootfn = cd9660_mountroot;
502 		}
503 		break;
504 #endif
505 	default:
506 #ifdef FFS
507 		{
508 		extern int ffs_mountroot(void);
509 
510 		printf("filesystem type %d not known.. assuming ffs\n",
511 		    dl.d_partitions[part].p_fstype);
512 		mountrootfn = ffs_mountroot;
513 		}
514 #else
515 		panic("disk 0x%x/0x%x filesystem type %d not known",
516 		    rootdev, rrootdev, dl.d_partitions[part].p_fstype);
517 #endif
518 	}
519 	return (*mountrootfn)();
520 }
521 
522 struct bufq *
523 bufq_default_alloc(void)
524 {
525 	struct bufq_default *bq;
526 
527 	bq = malloc(sizeof(*bq), M_DEVBUF, M_NOWAIT);
528 	if (bq == NULL)
529 		panic("bufq_default_alloc: no memory");
530 
531 	memset(bq, 0, sizeof(*bq));
532 	bq->bufq.bufq_free = bufq_default_free;
533 	bq->bufq.bufq_add = bufq_default_add;
534 	bq->bufq.bufq_get = bufq_default_get;
535 
536 	return ((struct bufq *)bq);
537 }
538 
539 void
540 bufq_default_free(struct bufq *bq)
541 {
542 	free(bq, M_DEVBUF);
543 }
544 
545 void
546 bufq_default_add(struct bufq *bq, struct buf *bp)
547 {
548 	struct bufq_default *bufq = (struct bufq_default *)bq;
549 	struct proc *p = bp->b_proc;
550 	struct buf *head;
551 
552 	if (p == NULL || p->p_nice < NZERO)
553 		head = &bufq->bufq_head[0];
554 	else if (p->p_nice == NZERO)
555 		head = &bufq->bufq_head[1];
556 	else
557 		head = &bufq->bufq_head[2];
558 
559 	disksort(head, bp);
560 }
561 
562 struct buf *
563 bufq_default_get(struct bufq *bq)
564 {
565 	struct bufq_default *bufq = (struct bufq_default *)bq;
566 	struct buf *bp, *head;
567 	int i;
568 
569 	for (i = 0; i < 3; i++) {
570 		head = &bufq->bufq_head[i];
571 		if ((bp = head->b_actf))
572 			break;
573 	}
574 	if (bp == NULL)
575 		return (NULL);
576 	head->b_actf = bp->b_actf;
577 	return (bp);
578 }
579