xref: /openbsd-src/sys/kern/subr_disk.c (revision 0eea0d082377cb9c3ec583313dc4d52b7b6a4d6d)
1 /*	$OpenBSD: subr_disk.c,v 1.26 2004/06/24 19:35:24 tholo Exp $	*/
2 /*	$NetBSD: subr_disk.c,v 1.17 1996/03/16 23:17:08 christos Exp $	*/
3 
4 /*
5  * Copyright (c) 1995 Jason R. Thorpe.  All rights reserved.
6  * Copyright (c) 1982, 1986, 1988, 1993
7  *	The Regents of the University of California.  All rights reserved.
8  * (c) UNIX System Laboratories, Inc.
9  * All or some portions of this file are derived from material licensed
10  * to the University of California by American Telephone and Telegraph
11  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
12  * the permission of UNIX System Laboratories, Inc.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  * 3. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)ufs_disksubr.c	8.5 (Berkeley) 1/21/94
39  */
40 
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/kernel.h>
44 #include <sys/malloc.h>
45 #include <sys/fcntl.h>
46 #include <sys/buf.h>
47 #include <sys/stat.h>
48 #include <sys/syslog.h>
49 #include <sys/time.h>
50 #include <sys/disklabel.h>
51 #include <sys/conf.h>
52 #include <sys/lock.h>
53 #include <sys/disk.h>
54 #include <sys/dkio.h>
55 #include <sys/dkstat.h>		/* XXX */
56 #include <sys/proc.h>
57 
58 #include <dev/rndvar.h>
59 
60 /*
61  * A global list of all disks attached to the system.  May grow or
62  * shrink over time.
63  */
64 struct	disklist_head disklist;	/* TAILQ_HEAD */
65 int	disk_count;		/* number of drives in global disklist */
66 int	disk_change;		/* set if a disk has been attached/detached
67 				 * since last we looked at this variable. This
68 				 * is reset by hw_sysctl()
69 				 */
70 
71 /*
72  * Seek sort for disks.  We depend on the driver which calls us using b_resid
73  * as the current cylinder number.
74  *
75  * The argument ap structure holds a b_actf activity chain pointer on which we
76  * keep two queues, sorted in ascending cylinder order.  The first queue holds
77  * those requests which are positioned after the current cylinder (in the first
78  * request); the second holds requests which came in after their cylinder number
79  * was passed.  Thus we implement a one way scan, retracting after reaching the
80  * end of the drive to the first request on the second queue, at which time it
81  * becomes the first queue.
82  *
83  * A one-way scan is natural because of the way UNIX read-ahead blocks are
84  * allocated.
85  */
86 
87 void
88 disksort(ap, bp)
89 	register struct buf *ap, *bp;
90 {
91 	register struct buf *bq;
92 
93 	/* If the queue is empty, then it's easy. */
94 	if (ap->b_actf == NULL) {
95 		bp->b_actf = NULL;
96 		ap->b_actf = bp;
97 		return;
98 	}
99 
100 	/*
101 	 * If we lie after the first (currently active) request, then we
102 	 * must locate the second request list and add ourselves to it.
103 	 */
104 	bq = ap->b_actf;
105 	if (bp->b_cylinder < bq->b_cylinder) {
106 		while (bq->b_actf) {
107 			/*
108 			 * Check for an ``inversion'' in the normally ascending
109 			 * cylinder numbers, indicating the start of the second
110 			 * request list.
111 			 */
112 			if (bq->b_actf->b_cylinder < bq->b_cylinder) {
113 				/*
114 				 * Search the second request list for the first
115 				 * request at a larger cylinder number.  We go
116 				 * before that; if there is no such request, we
117 				 * go at end.
118 				 */
119 				do {
120 					if (bp->b_cylinder <
121 					    bq->b_actf->b_cylinder)
122 						goto insert;
123 					if (bp->b_cylinder ==
124 					    bq->b_actf->b_cylinder &&
125 					    bp->b_blkno < bq->b_actf->b_blkno)
126 						goto insert;
127 					bq = bq->b_actf;
128 				} while (bq->b_actf);
129 				goto insert;		/* after last */
130 			}
131 			bq = bq->b_actf;
132 		}
133 		/*
134 		 * No inversions... we will go after the last, and
135 		 * be the first request in the second request list.
136 		 */
137 		goto insert;
138 	}
139 	/*
140 	 * Request is at/after the current request...
141 	 * sort in the first request list.
142 	 */
143 	while (bq->b_actf) {
144 		/*
145 		 * We want to go after the current request if there is an
146 		 * inversion after it (i.e. it is the end of the first
147 		 * request list), or if the next request is a larger cylinder
148 		 * than our request.
149 		 */
150 		if (bq->b_actf->b_cylinder < bq->b_cylinder ||
151 		    bp->b_cylinder < bq->b_actf->b_cylinder ||
152 		    (bp->b_cylinder == bq->b_actf->b_cylinder &&
153 		    bp->b_blkno < bq->b_actf->b_blkno))
154 			goto insert;
155 		bq = bq->b_actf;
156 	}
157 	/*
158 	 * Neither a second list nor a larger request... we go at the end of
159 	 * the first list, which is the same as the end of the whole schebang.
160 	 */
161 insert:	bp->b_actf = bq->b_actf;
162 	bq->b_actf = bp;
163 }
164 
165 /*
166  * Compute checksum for disk label.
167  */
168 u_int
169 dkcksum(lp)
170 	register struct disklabel *lp;
171 {
172 	register u_int16_t *start, *end;
173 	register u_int16_t sum = 0;
174 
175 	start = (u_int16_t *)lp;
176 	end = (u_int16_t *)&lp->d_partitions[lp->d_npartitions];
177 	while (start < end)
178 		sum ^= *start++;
179 	return (sum);
180 }
181 
182 /*
183  * Disk error is the preface to plaintive error messages
184  * about failing disk transfers.  It prints messages of the form
185 
186 hp0g: hard error reading fsbn 12345 of 12344-12347 (hp0 bn %d cn %d tn %d sn %d)
187 
188  * if the offset of the error in the transfer and a disk label
189  * are both available.  blkdone should be -1 if the position of the error
190  * is unknown; the disklabel pointer may be null from drivers that have not
191  * been converted to use them.  The message is printed with printf
192  * if pri is LOG_PRINTF, otherwise it uses log at the specified priority.
193  * The message should be completed (with at least a newline) with printf
194  * or addlog, respectively.  There is no trailing space.
195  */
196 void
197 diskerr(bp, dname, what, pri, blkdone, lp)
198 	register struct buf *bp;
199 	char *dname, *what;
200 	int pri, blkdone;
201 	register struct disklabel *lp;
202 {
203 	int unit = DISKUNIT(bp->b_dev), part = DISKPART(bp->b_dev);
204 	register int (*pr)(const char *, ...);
205 	char partname = 'a' + part;
206 	int sn;
207 
208 	if (pri != LOG_PRINTF) {
209 		static const char fmt[] = "";
210 		log(pri, fmt);
211 		pr = addlog;
212 	} else
213 		pr = printf;
214 	(*pr)("%s%d%c: %s %sing fsbn ", dname, unit, partname, what,
215 	    bp->b_flags & B_READ ? "read" : "writ");
216 	sn = bp->b_blkno;
217 	if (bp->b_bcount <= DEV_BSIZE)
218 		(*pr)("%d", sn);
219 	else {
220 		if (blkdone >= 0) {
221 			sn += blkdone;
222 			(*pr)("%d of ", sn);
223 		}
224 		(*pr)("%d-%d", bp->b_blkno,
225 		    bp->b_blkno + (bp->b_bcount - 1) / DEV_BSIZE);
226 	}
227 	if (lp && (blkdone >= 0 || bp->b_bcount <= lp->d_secsize)) {
228 #ifdef tahoe
229 		sn *= DEV_BSIZE / lp->d_secsize;		/* XXX */
230 #endif
231 		sn += lp->d_partitions[part].p_offset;
232 		(*pr)(" (%s%d bn %d; cn %d", dname, unit, sn,
233 		    sn / lp->d_secpercyl);
234 		sn %= lp->d_secpercyl;
235 		(*pr)(" tn %d sn %d)", sn / lp->d_nsectors, sn % lp->d_nsectors);
236 	}
237 }
238 
239 /*
240  * Initialize the disklist.  Called by main() before autoconfiguration.
241  */
242 void
243 disk_init()
244 {
245 
246 	TAILQ_INIT(&disklist);
247 	disk_count = disk_change = 0;
248 }
249 
250 /*
251  * Searches the disklist for the disk corresponding to the
252  * name provided.
253  */
254 struct disk *
255 disk_find(name)
256 	char *name;
257 {
258 	struct disk *diskp;
259 
260 	if ((name == NULL) || (disk_count <= 0))
261 		return (NULL);
262 
263 	for (diskp = disklist.tqh_first; diskp != NULL;
264 	    diskp = diskp->dk_link.tqe_next)
265 		if (strcmp(diskp->dk_name, name) == 0)
266 			return (diskp);
267 
268 	return (NULL);
269 }
270 
271 int
272 disk_construct(diskp, lockname)
273 	struct disk *diskp;
274 	char *lockname;
275 {
276 	lockinit(&diskp->dk_lock, PRIBIO | PCATCH, lockname,
277 		 0, LK_CANRECURSE);
278 
279 	diskp->dk_flags |= DKF_CONSTRUCTED;
280 
281 	return (0);
282 }
283 
284 /*
285  * Attach a disk.
286  */
287 void
288 disk_attach(diskp)
289 	struct disk *diskp;
290 {
291 
292 	if (!diskp->dk_flags & DKF_CONSTRUCTED)
293 		disk_construct(diskp, diskp->dk_name);
294 
295 	/*
296 	 * Allocate and initialize the disklabel structures.  Note that
297 	 * it's not safe to sleep here, since we're probably going to be
298 	 * called during autoconfiguration.
299 	 */
300 	diskp->dk_label = malloc(sizeof(struct disklabel), M_DEVBUF, M_NOWAIT);
301 	diskp->dk_cpulabel = malloc(sizeof(struct cpu_disklabel), M_DEVBUF,
302 	    M_NOWAIT);
303 	if ((diskp->dk_label == NULL) || (diskp->dk_cpulabel == NULL))
304 		panic("disk_attach: can't allocate storage for disklabel");
305 
306 	bzero(diskp->dk_label, sizeof(struct disklabel));
307 	bzero(diskp->dk_cpulabel, sizeof(struct cpu_disklabel));
308 
309 	/*
310 	 * Set the attached timestamp.
311 	 */
312 	microuptime(&diskp->dk_attachtime);
313 
314 	/*
315 	 * Link into the disklist.
316 	 */
317 	TAILQ_INSERT_TAIL(&disklist, diskp, dk_link);
318 	++disk_count;
319 	disk_change = 1;
320 }
321 
322 /*
323  * Detach a disk.
324  */
325 void
326 disk_detach(diskp)
327 	struct disk *diskp;
328 {
329 
330 	/*
331 	 * Free the space used by the disklabel structures.
332 	 */
333 	free(diskp->dk_label, M_DEVBUF);
334 	free(diskp->dk_cpulabel, M_DEVBUF);
335 
336 	/*
337 	 * Remove from the disklist.
338 	 */
339 	TAILQ_REMOVE(&disklist, diskp, dk_link);
340 	disk_change = 1;
341 	if (--disk_count < 0)
342 		panic("disk_detach: disk_count < 0");
343 }
344 
345 /*
346  * Increment a disk's busy counter.  If the counter is going from
347  * 0 to 1, set the timestamp.
348  */
349 void
350 disk_busy(diskp)
351 	struct disk *diskp;
352 {
353 
354 	/*
355 	 * XXX We'd like to use something as accurate as microtime(),
356 	 * but that doesn't depend on the system TOD clock.
357 	 */
358 	if (diskp->dk_busy++ == 0) {
359 		microuptime(&diskp->dk_timestamp);
360 	}
361 }
362 
363 /*
364  * Decrement a disk's busy counter, increment the byte count, total busy
365  * time, and reset the timestamp.
366  */
367 void
368 disk_unbusy(diskp, bcount, read)
369 	struct disk *diskp;
370 	long bcount;
371 	int read;
372 {
373 	struct timeval dv_time, diff_time;
374 
375 	if (diskp->dk_busy-- == 0)
376 		printf("disk_unbusy: %s: dk_busy < 0\n", diskp->dk_name);
377 
378 	microuptime(&dv_time);
379 
380 	timersub(&dv_time, &diskp->dk_timestamp, &diff_time);
381 	timeradd(&diskp->dk_time, &diff_time, &diskp->dk_time);
382 
383 	diskp->dk_timestamp = dv_time;
384 	if (bcount > 0) {
385 		if (read) {
386 			diskp->dk_rbytes += bcount;
387 			diskp->dk_rxfer++;
388 		} else {
389 			diskp->dk_wbytes += bcount;
390 			diskp->dk_wxfer++;
391 		}
392 	} else
393 		diskp->dk_seek++;
394 
395 	add_disk_randomness(bcount ^ diff_time.tv_usec);
396 }
397 
398 
399 int
400 disk_lock(dk)
401 	struct disk *dk;
402 {
403 	int error;
404 
405 	error = lockmgr(&dk->dk_lock, LK_EXCLUSIVE, 0, curproc);
406 
407 	return (error);
408 }
409 
410 void
411 disk_unlock(dk)
412 	struct disk *dk;
413 {
414 	lockmgr(&dk->dk_lock, LK_RELEASE, 0, curproc);
415 }
416 
417 
418 /*
419  * Reset the metrics counters on the given disk.  Note that we cannot
420  * reset the busy counter, as it may case a panic in disk_unbusy().
421  * We also must avoid playing with the timestamp information, as it
422  * may skew any pending transfer results.
423  */
424 void
425 disk_resetstat(diskp)
426 	struct disk *diskp;
427 {
428 	int s = splbio();
429 
430 	diskp->dk_rxfer = 0;
431 	diskp->dk_rbytes = 0;
432 	diskp->dk_wxfer = 0;
433 	diskp->dk_wbytes = 0;
434 	diskp->dk_seek = 0;
435 
436 	microuptime(&diskp->dk_attachtime);
437 
438 	timerclear(&diskp->dk_time);
439 
440 	splx(s);
441 }
442 
443 
444 int
445 dk_mountroot()
446 {
447 	dev_t rawdev, rrootdev;
448 	int part = DISKPART(rootdev);
449 	int (*mountrootfn)(void);
450 	struct disklabel dl;
451 	int error;
452 
453 	rrootdev = blktochr(rootdev);
454 	rawdev = MAKEDISKDEV(major(rrootdev), DISKUNIT(rootdev), RAW_PART);
455 	printf("rootdev=0x%x rrootdev=0x%x rawdev=0x%x\n", rootdev,
456 	    rrootdev, rawdev);
457 
458 	/*
459 	 * open device, ioctl for the disklabel, and close it.
460 	 */
461 	error = (cdevsw[major(rrootdev)].d_open)(rawdev, FREAD,
462 	    S_IFCHR, curproc);
463 	if (error)
464 		panic("cannot open disk, 0x%x/0x%x, error %d",
465 		    rootdev, rrootdev, error);
466 	error = (cdevsw[major(rrootdev)].d_ioctl)(rawdev, DIOCGDINFO,
467 	    (caddr_t)&dl, FREAD, curproc);
468 	if (error)
469 		panic("cannot read disk label, 0x%x/0x%x, error %d",
470 		    rootdev, rrootdev, error);
471 	(void) (cdevsw[major(rrootdev)].d_close)(rawdev, FREAD,
472 	    S_IFCHR, curproc);
473 
474 	if (dl.d_partitions[part].p_size == 0)
475 		panic("root filesystem has size 0");
476 	switch (dl.d_partitions[part].p_fstype) {
477 #ifdef EXT2FS
478 	case FS_EXT2FS:
479 		{
480 		extern int ext2fs_mountroot(void);
481 		mountrootfn = ext2fs_mountroot;
482 		}
483 		break;
484 #endif
485 #ifdef FFS
486 	case FS_BSDFFS:
487 		{
488 		extern int ffs_mountroot(void);
489 		mountrootfn = ffs_mountroot;
490 		}
491 		break;
492 #endif
493 #ifdef LFS
494 	case FS_BSDLFS:
495 		{
496 		extern int lfs_mountroot(void);
497 		mountrootfn = lfs_mountroot;
498 		}
499 		break;
500 #endif
501 #ifdef CD9660
502 	case FS_ISO9660:
503 		{
504 		extern int cd9660_mountroot(void);
505 		mountrootfn = cd9660_mountroot;
506 		}
507 		break;
508 #endif
509 	default:
510 #ifdef FFS
511 		{
512 		extern int ffs_mountroot(void);
513 
514 		printf("filesystem type %d not known.. assuming ffs\n",
515 		    dl.d_partitions[part].p_fstype);
516 		mountrootfn = ffs_mountroot;
517 		}
518 #else
519 		panic("disk 0x%x/0x%x filesystem type %d not known",
520 		    rootdev, rrootdev, dl.d_partitions[part].p_fstype);
521 #endif
522 	}
523 	return (*mountrootfn)();
524 }
525 
526 struct bufq *
527 bufq_default_alloc(void)
528 {
529 	struct bufq_default *bq;
530 
531 	bq = malloc(sizeof(*bq), M_DEVBUF, M_NOWAIT);
532 	memset(bq, 0, sizeof(*bq));
533 	bq->bufq.bufq_free = bufq_default_free;
534 	bq->bufq.bufq_add = bufq_default_add;
535 	bq->bufq.bufq_get = bufq_default_get;
536 
537 	return ((struct bufq *)bq);
538 }
539 
540 void
541 bufq_default_free(struct bufq *bq)
542 {
543 	free(bq, M_DEVBUF);
544 }
545 
546 void
547 bufq_default_add(struct bufq *bq, struct buf *bp)
548 {
549 	struct bufq_default *bufq = (struct bufq_default *)bq;
550 	struct proc *p = bp->b_proc;
551 	struct buf *head;
552 
553 	if (p == NULL || p->p_nice < NZERO)
554 		head = &bufq->bufq_head[0];
555 	else if (p->p_nice == NZERO)
556 		head = &bufq->bufq_head[1];
557 	else
558 		head = &bufq->bufq_head[2];
559 
560 	disksort(head, bp);
561 }
562 
563 struct buf *
564 bufq_default_get(struct bufq *bq)
565 {
566 	struct bufq_default *bufq = (struct bufq_default *)bq;
567 	struct buf *bp, *head;
568 	int i;
569 
570 	for (i = 0; i < 3; i++) {
571 		head = &bufq->bufq_head[i];
572 		if ((bp = head->b_actf))
573 			break;
574 	}
575 	if (bp == NULL)
576 		return (NULL);
577 	head->b_actf = bp->b_actf;
578 	return (bp);
579 }
580