xref: /openbsd-src/sys/kern/subr_disk.c (revision daf88648c0e349d5c02e1504293082072c981640)
1 /*	$OpenBSD: subr_disk.c,v 1.32 2006/05/11 18:58:59 miod Exp $	*/
2 /*	$NetBSD: subr_disk.c,v 1.17 1996/03/16 23:17:08 christos Exp $	*/
3 
4 /*
5  * Copyright (c) 1995 Jason R. Thorpe.  All rights reserved.
6  * Copyright (c) 1982, 1986, 1988, 1993
7  *	The Regents of the University of California.  All rights reserved.
8  * (c) UNIX System Laboratories, Inc.
9  * All or some portions of this file are derived from material licensed
10  * to the University of California by American Telephone and Telegraph
11  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
12  * the permission of UNIX System Laboratories, Inc.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  * 3. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)ufs_disksubr.c	8.5 (Berkeley) 1/21/94
39  */
40 
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/kernel.h>
44 #include <sys/malloc.h>
45 #include <sys/fcntl.h>
46 #include <sys/buf.h>
47 #include <sys/stat.h>
48 #include <sys/syslog.h>
49 #include <sys/time.h>
50 #include <sys/disklabel.h>
51 #include <sys/conf.h>
52 #include <sys/lock.h>
53 #include <sys/disk.h>
54 #include <sys/dkio.h>
55 #include <sys/dkstat.h>		/* XXX */
56 #include <sys/proc.h>
57 
58 #include <dev/rndvar.h>
59 
/*
 * A global list of all disks attached to the system.  May grow or
 * shrink over time: disk_attach() appends to it and disk_detach()
 * removes from it.
 */
struct	disklist_head disklist;	/* TAILQ_HEAD; entries linked via dk_link */
int	disk_count;		/* number of drives in global disklist */
int	disk_change;		/* set if a disk has been attached/detached
				 * since last we looked at this variable
				 * (disk_attach() and disk_detach() both
				 * store 1 here). This is reset by
				 * hw_sysctl()
				 */
70 
71 /*
72  * Seek sort for disks.  We depend on the driver which calls us using b_resid
73  * as the current cylinder number.
74  *
75  * The argument ap structure holds a b_actf activity chain pointer on which we
76  * keep two queues, sorted in ascending cylinder order.  The first queue holds
77  * those requests which are positioned after the current cylinder (in the first
78  * request); the second holds requests which came in after their cylinder number
79  * was passed.  Thus we implement a one way scan, retracting after reaching the
80  * end of the drive to the first request on the second queue, at which time it
81  * becomes the first queue.
82  *
83  * A one-way scan is natural because of the way UNIX read-ahead blocks are
84  * allocated.
85  */
86 
87 void
88 disksort(struct buf *ap, struct buf *bp)
89 {
90 	struct buf *bq;
91 
92 	/* If the queue is empty, then it's easy. */
93 	if (ap->b_actf == NULL) {
94 		bp->b_actf = NULL;
95 		ap->b_actf = bp;
96 		return;
97 	}
98 
99 	/*
100 	 * If we lie after the first (currently active) request, then we
101 	 * must locate the second request list and add ourselves to it.
102 	 */
103 	bq = ap->b_actf;
104 	if (bp->b_cylinder < bq->b_cylinder) {
105 		while (bq->b_actf) {
106 			/*
107 			 * Check for an ``inversion'' in the normally ascending
108 			 * cylinder numbers, indicating the start of the second
109 			 * request list.
110 			 */
111 			if (bq->b_actf->b_cylinder < bq->b_cylinder) {
112 				/*
113 				 * Search the second request list for the first
114 				 * request at a larger cylinder number.  We go
115 				 * before that; if there is no such request, we
116 				 * go at end.
117 				 */
118 				do {
119 					if (bp->b_cylinder <
120 					    bq->b_actf->b_cylinder)
121 						goto insert;
122 					if (bp->b_cylinder ==
123 					    bq->b_actf->b_cylinder &&
124 					    bp->b_blkno < bq->b_actf->b_blkno)
125 						goto insert;
126 					bq = bq->b_actf;
127 				} while (bq->b_actf);
128 				goto insert;		/* after last */
129 			}
130 			bq = bq->b_actf;
131 		}
132 		/*
133 		 * No inversions... we will go after the last, and
134 		 * be the first request in the second request list.
135 		 */
136 		goto insert;
137 	}
138 	/*
139 	 * Request is at/after the current request...
140 	 * sort in the first request list.
141 	 */
142 	while (bq->b_actf) {
143 		/*
144 		 * We want to go after the current request if there is an
145 		 * inversion after it (i.e. it is the end of the first
146 		 * request list), or if the next request is a larger cylinder
147 		 * than our request.
148 		 */
149 		if (bq->b_actf->b_cylinder < bq->b_cylinder ||
150 		    bp->b_cylinder < bq->b_actf->b_cylinder ||
151 		    (bp->b_cylinder == bq->b_actf->b_cylinder &&
152 		    bp->b_blkno < bq->b_actf->b_blkno))
153 			goto insert;
154 		bq = bq->b_actf;
155 	}
156 	/*
157 	 * Neither a second list nor a larger request... we go at the end of
158 	 * the first list, which is the same as the end of the whole schebang.
159 	 */
160 insert:	bp->b_actf = bq->b_actf;
161 	bq->b_actf = bp;
162 }
163 
164 /*
165  * Compute checksum for disk label.
166  */
167 u_int
168 dkcksum(struct disklabel *lp)
169 {
170 	u_int16_t *start, *end;
171 	u_int16_t sum = 0;
172 
173 	start = (u_int16_t *)lp;
174 	end = (u_int16_t *)&lp->d_partitions[lp->d_npartitions];
175 	while (start < end)
176 		sum ^= *start++;
177 	return (sum);
178 }
179 
180 /*
181  * Disk error is the preface to plaintive error messages
182  * about failing disk transfers.  It prints messages of the form
183 
184 hp0g: hard error reading fsbn 12345 of 12344-12347 (hp0 bn %d cn %d tn %d sn %d)
185 
186  * if the offset of the error in the transfer and a disk label
187  * are both available.  blkdone should be -1 if the position of the error
188  * is unknown; the disklabel pointer may be null from drivers that have not
189  * been converted to use them.  The message is printed with printf
190  * if pri is LOG_PRINTF, otherwise it uses log at the specified priority.
191  * The message should be completed (with at least a newline) with printf
192  * or addlog, respectively.  There is no trailing space.
193  */
194 void
195 diskerr(struct buf *bp, char *dname, char *what, int pri, int blkdone,
196     struct disklabel *lp)
197 {
198 	int unit = DISKUNIT(bp->b_dev), part = DISKPART(bp->b_dev);
199 	int (*pr)(const char *, ...);
200 	char partname = 'a' + part;
201 	int sn;
202 
203 	if (pri != LOG_PRINTF) {
204 		static const char fmt[] = "";
205 		log(pri, fmt);
206 		pr = addlog;
207 	} else
208 		pr = printf;
209 	(*pr)("%s%d%c: %s %sing fsbn ", dname, unit, partname, what,
210 	    bp->b_flags & B_READ ? "read" : "writ");
211 	sn = bp->b_blkno;
212 	if (bp->b_bcount <= DEV_BSIZE)
213 		(*pr)("%d", sn);
214 	else {
215 		if (blkdone >= 0) {
216 			sn += blkdone;
217 			(*pr)("%d of ", sn);
218 		}
219 		(*pr)("%d-%d", bp->b_blkno,
220 		    bp->b_blkno + (bp->b_bcount - 1) / DEV_BSIZE);
221 	}
222 	if (lp && (blkdone >= 0 || bp->b_bcount <= lp->d_secsize)) {
223 		sn += lp->d_partitions[part].p_offset;
224 		(*pr)(" (%s%d bn %d; cn %d", dname, unit, sn,
225 		    sn / lp->d_secpercyl);
226 		sn %= lp->d_secpercyl;
227 		(*pr)(" tn %d sn %d)", sn / lp->d_nsectors, sn % lp->d_nsectors);
228 	}
229 }
230 
231 /*
232  * Initialize the disklist.  Called by main() before autoconfiguration.
233  */
234 void
235 disk_init(void)
236 {
237 
238 	TAILQ_INIT(&disklist);
239 	disk_count = disk_change = 0;
240 }
241 
242 /*
243  * Searches the disklist for the disk corresponding to the
244  * name provided.
245  */
246 struct disk *
247 disk_find(char *name)
248 {
249 	struct disk *diskp;
250 
251 	if ((name == NULL) || (disk_count <= 0))
252 		return (NULL);
253 
254 	TAILQ_FOREACH(diskp, &disklist, dk_link)
255 		if (strcmp(diskp->dk_name, name) == 0)
256 			return (diskp);
257 
258 	return (NULL);
259 }
260 
261 int
262 disk_construct(struct disk *diskp, char *lockname)
263 {
264 	lockinit(&diskp->dk_lock, PRIBIO | PCATCH, lockname,
265 		 0, LK_CANRECURSE);
266 
267 	diskp->dk_flags |= DKF_CONSTRUCTED;
268 
269 	return (0);
270 }
271 
272 /*
273  * Attach a disk.
274  */
275 void
276 disk_attach(struct disk *diskp)
277 {
278 
279 	if (!ISSET(diskp->dk_flags, DKF_CONSTRUCTED))
280 		disk_construct(diskp, diskp->dk_name);
281 
282 	/*
283 	 * Allocate and initialize the disklabel structures.  Note that
284 	 * it's not safe to sleep here, since we're probably going to be
285 	 * called during autoconfiguration.
286 	 */
287 	diskp->dk_label = malloc(sizeof(struct disklabel), M_DEVBUF, M_NOWAIT);
288 	diskp->dk_cpulabel = malloc(sizeof(struct cpu_disklabel), M_DEVBUF,
289 	    M_NOWAIT);
290 	if ((diskp->dk_label == NULL) || (diskp->dk_cpulabel == NULL))
291 		panic("disk_attach: can't allocate storage for disklabel");
292 
293 	bzero(diskp->dk_label, sizeof(struct disklabel));
294 	bzero(diskp->dk_cpulabel, sizeof(struct cpu_disklabel));
295 
296 	/*
297 	 * Set the attached timestamp.
298 	 */
299 	microuptime(&diskp->dk_attachtime);
300 
301 	/*
302 	 * Link into the disklist.
303 	 */
304 	TAILQ_INSERT_TAIL(&disklist, diskp, dk_link);
305 	++disk_count;
306 	disk_change = 1;
307 }
308 
309 /*
310  * Detach a disk.
311  */
312 void
313 disk_detach(struct disk *diskp)
314 {
315 
316 	/*
317 	 * Free the space used by the disklabel structures.
318 	 */
319 	free(diskp->dk_label, M_DEVBUF);
320 	free(diskp->dk_cpulabel, M_DEVBUF);
321 
322 	/*
323 	 * Remove from the disklist.
324 	 */
325 	TAILQ_REMOVE(&disklist, diskp, dk_link);
326 	disk_change = 1;
327 	if (--disk_count < 0)
328 		panic("disk_detach: disk_count < 0");
329 }
330 
331 /*
332  * Increment a disk's busy counter.  If the counter is going from
333  * 0 to 1, set the timestamp.
334  */
335 void
336 disk_busy(struct disk *diskp)
337 {
338 
339 	/*
340 	 * XXX We'd like to use something as accurate as microtime(),
341 	 * but that doesn't depend on the system TOD clock.
342 	 */
343 	if (diskp->dk_busy++ == 0) {
344 		microuptime(&diskp->dk_timestamp);
345 	}
346 }
347 
348 /*
349  * Decrement a disk's busy counter, increment the byte count, total busy
350  * time, and reset the timestamp.
351  */
352 void
353 disk_unbusy(struct disk *diskp, long bcount, int read)
354 {
355 	struct timeval dv_time, diff_time;
356 
357 	if (diskp->dk_busy-- == 0)
358 		printf("disk_unbusy: %s: dk_busy < 0\n", diskp->dk_name);
359 
360 	microuptime(&dv_time);
361 
362 	timersub(&dv_time, &diskp->dk_timestamp, &diff_time);
363 	timeradd(&diskp->dk_time, &diff_time, &diskp->dk_time);
364 
365 	diskp->dk_timestamp = dv_time;
366 	if (bcount > 0) {
367 		if (read) {
368 			diskp->dk_rbytes += bcount;
369 			diskp->dk_rxfer++;
370 		} else {
371 			diskp->dk_wbytes += bcount;
372 			diskp->dk_wxfer++;
373 		}
374 	} else
375 		diskp->dk_seek++;
376 
377 	add_disk_randomness(bcount ^ diff_time.tv_usec);
378 }
379 
380 int
381 disk_lock(struct disk *dk)
382 {
383 	int error;
384 
385 	error = lockmgr(&dk->dk_lock, LK_EXCLUSIVE, NULL);
386 
387 	return (error);
388 }
389 
390 void
391 disk_unlock(struct disk *dk)
392 {
393 	lockmgr(&dk->dk_lock, LK_RELEASE, NULL);
394 }
395 
396 /*
397  * Reset the metrics counters on the given disk.  Note that we cannot
398  * reset the busy counter, as it may case a panic in disk_unbusy().
399  * We also must avoid playing with the timestamp information, as it
400  * may skew any pending transfer results.
401  */
402 void
403 disk_resetstat(struct disk *diskp)
404 {
405 	int s = splbio();
406 
407 	diskp->dk_rxfer = 0;
408 	diskp->dk_rbytes = 0;
409 	diskp->dk_wxfer = 0;
410 	diskp->dk_wbytes = 0;
411 	diskp->dk_seek = 0;
412 
413 	microuptime(&diskp->dk_attachtime);
414 
415 	timerclear(&diskp->dk_time);
416 
417 	splx(s);
418 }
419 
420 
421 int
422 dk_mountroot(void)
423 {
424 	dev_t rawdev, rrootdev;
425 	int part = DISKPART(rootdev);
426 	int (*mountrootfn)(void);
427 	struct disklabel dl;
428 	int error;
429 
430 	rrootdev = blktochr(rootdev);
431 	rawdev = MAKEDISKDEV(major(rrootdev), DISKUNIT(rootdev), RAW_PART);
432 	printf("rootdev=0x%x rrootdev=0x%x rawdev=0x%x\n", rootdev,
433 	    rrootdev, rawdev);
434 
435 	/*
436 	 * open device, ioctl for the disklabel, and close it.
437 	 */
438 	error = (cdevsw[major(rrootdev)].d_open)(rawdev, FREAD,
439 	    S_IFCHR, curproc);
440 	if (error)
441 		panic("cannot open disk, 0x%x/0x%x, error %d",
442 		    rootdev, rrootdev, error);
443 	error = (cdevsw[major(rrootdev)].d_ioctl)(rawdev, DIOCGDINFO,
444 	    (caddr_t)&dl, FREAD, curproc);
445 	if (error)
446 		panic("cannot read disk label, 0x%x/0x%x, error %d",
447 		    rootdev, rrootdev, error);
448 	(void) (cdevsw[major(rrootdev)].d_close)(rawdev, FREAD,
449 	    S_IFCHR, curproc);
450 
451 	if (dl.d_partitions[part].p_size == 0)
452 		panic("root filesystem has size 0");
453 	switch (dl.d_partitions[part].p_fstype) {
454 #ifdef EXT2FS
455 	case FS_EXT2FS:
456 		{
457 		extern int ext2fs_mountroot(void);
458 		mountrootfn = ext2fs_mountroot;
459 		}
460 		break;
461 #endif
462 #ifdef FFS
463 	case FS_BSDFFS:
464 		{
465 		extern int ffs_mountroot(void);
466 		mountrootfn = ffs_mountroot;
467 		}
468 		break;
469 #endif
470 #ifdef LFS
471 	case FS_BSDLFS:
472 		{
473 		extern int lfs_mountroot(void);
474 		mountrootfn = lfs_mountroot;
475 		}
476 		break;
477 #endif
478 #ifdef CD9660
479 	case FS_ISO9660:
480 		{
481 		extern int cd9660_mountroot(void);
482 		mountrootfn = cd9660_mountroot;
483 		}
484 		break;
485 #endif
486 	default:
487 #ifdef FFS
488 		{
489 		extern int ffs_mountroot(void);
490 
491 		printf("filesystem type %d not known.. assuming ffs\n",
492 		    dl.d_partitions[part].p_fstype);
493 		mountrootfn = ffs_mountroot;
494 		}
495 #else
496 		panic("disk 0x%x/0x%x filesystem type %d not known",
497 		    rootdev, rrootdev, dl.d_partitions[part].p_fstype);
498 #endif
499 	}
500 	return (*mountrootfn)();
501 }
502 
503 struct bufq *
504 bufq_default_alloc(void)
505 {
506 	struct bufq_default *bq;
507 
508 	bq = malloc(sizeof(*bq), M_DEVBUF, M_NOWAIT);
509 	if (bq == NULL)
510 		panic("bufq_default_alloc: no memory");
511 
512 	memset(bq, 0, sizeof(*bq));
513 	bq->bufq.bufq_free = bufq_default_free;
514 	bq->bufq.bufq_add = bufq_default_add;
515 	bq->bufq.bufq_get = bufq_default_get;
516 
517 	return ((struct bufq *)bq);
518 }
519 
520 void
521 bufq_default_free(struct bufq *bq)
522 {
523 	free(bq, M_DEVBUF);
524 }
525 
526 void
527 bufq_default_add(struct bufq *bq, struct buf *bp)
528 {
529 	struct bufq_default *bufq = (struct bufq_default *)bq;
530 	struct proc *p = bp->b_proc;
531 	struct buf *head;
532 
533 	if (p == NULL || p->p_nice < NZERO)
534 		head = &bufq->bufq_head[0];
535 	else if (p->p_nice == NZERO)
536 		head = &bufq->bufq_head[1];
537 	else
538 		head = &bufq->bufq_head[2];
539 
540 	disksort(head, bp);
541 }
542 
543 struct buf *
544 bufq_default_get(struct bufq *bq)
545 {
546 	struct bufq_default *bufq = (struct bufq_default *)bq;
547 	struct buf *bp, *head;
548 	int i;
549 
550 	for (i = 0; i < 3; i++) {
551 		head = &bufq->bufq_head[i];
552 		if ((bp = head->b_actf))
553 			break;
554 	}
555 	if (bp == NULL)
556 		return (NULL);
557 	head->b_actf = bp->b_actf;
558 	return (bp);
559 }
560