/*
 * ----------------------------------------------------------------------------
 * "THE BEER-WARE LICENSE" (Revision 42):
 * <phk@FreeBSD.ORG> wrote this file.  As long as you retain this notice you
 * can do whatever you want with this stuff. If we meet some day, and you think
 * this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
 * ----------------------------------------------------------------------------
 *
 * $FreeBSD$
 *
 */

#include "opt_geom.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/stdint.h>
#include <sys/bio.h>
#include <sys/conf.h>
#include <sys/disk.h>
#include <sys/diskslice.h>
#include <sys/disklabel.h>
#ifdef NO_GEOM
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <machine/md_var.h>
#include <sys/ctype.h>

static MALLOC_DEFINE(M_DISK, "disk", "disk data");

static d_strategy_t diskstrategy;
static d_open_t diskopen;
static d_close_t diskclose;
static d_ioctl_t diskioctl;
static d_psize_t diskpsize;

static LIST_HEAD(, disk) disklist = LIST_HEAD_INITIALIZER(&disklist);

void disk_dev_synth(dev_t dev);

void
disk_dev_synth(dev_t dev)
{
	struct disk *dp;
	int u, s, p;
	dev_t pdev;

	if (dksparebits(dev))
		return;
	LIST_FOREACH(dp, &disklist, d_list) {
		if (major(dev) != dp->d_devsw->d_maj)
			continue;
		u = dkunit(dev);
		p = RAW_PART;
		s = WHOLE_DISK_SLICE;
		pdev = makedev(dp->d_devsw->d_maj, dkmakeminor(u, s, p));
		if (pdev->si_devsw == NULL)
			return;		/* Probably a unit we don't have */
		s = dkslice(dev);
		p = dkpart(dev);
		if (s == WHOLE_DISK_SLICE && p == RAW_PART) {
			/* XXX: actually should not happen */
			dev = make_dev(pdev->si_devsw, dkmakeminor(u, s, p),
			    UID_ROOT, GID_OPERATOR, 0640, "%s%d",
			    dp->d_devsw->d_name, u);
			dev_depends(pdev, dev);
			return;
		}
		if (s == COMPATIBILITY_SLICE) {
			dev = make_dev(pdev->si_devsw, dkmakeminor(u, s, p),
			    UID_ROOT, GID_OPERATOR, 0640, "%s%d%c",
			    dp->d_devsw->d_name, u, 'a' + p);
			dev_depends(pdev, dev);
			return;
		}
		if (p != RAW_PART) {
			dev = make_dev(pdev->si_devsw, dkmakeminor(u, s, p),
			    UID_ROOT, GID_OPERATOR, 0640, "%s%ds%d%c",
			    dp->d_devsw->d_name, u, s - BASE_SLICE + 1,
			    'a' + p);
		} else {
			dev = make_dev(pdev->si_devsw, dkmakeminor(u, s, p),
			    UID_ROOT, GID_OPERATOR, 0640, "%s%ds%d",
			    dp->d_devsw->d_name, u, s - BASE_SLICE + 1);
			make_dev_alias(dev, "%s%ds%dc",
			    dp->d_devsw->d_name, u, s - BASE_SLICE + 1);
		}
		dev_depends(pdev, dev);
		return;
	}
}

static void
disk_clone(void *arg, char *name, int namelen, dev_t *dev)
{
	struct disk *dp;
	char const *d;
	char *e;
	int j, u, s, p;
	dev_t pdev;

	if (*dev != NODEV)
		return;

	LIST_FOREACH(dp, &disklist, d_list) {
		d = dp->d_devsw->d_name;
		j = dev_stdclone(name, &e, d, &u);
		if (j == 0)
			continue;
		if (u > DKMAXUNIT)
			continue;
		p = RAW_PART;
		s = WHOLE_DISK_SLICE;
		pdev = makedev(dp->d_devsw->d_maj, dkmakeminor(u, s, p));
		if (pdev->si_disk == NULL)
			continue;
		if (*e != '\0') {
			j = dev_stdclone(e, &e, "s", &s);
			if (j == 0)
				s = COMPATIBILITY_SLICE;
			else if (j == 1 || j == 2)
				s += BASE_SLICE - 1;
			if (!*e)
				;		/* ad0s1 case */
			else if (e[1] != '\0')
				return;		/* can never be a disk name */
			else if (*e < 'a' || *e > 'h')
				return;		/* can never be a disk name */
			else
				p = *e - 'a';
		}
		if (s == WHOLE_DISK_SLICE && p == RAW_PART) {
			return;
		} else if (s >= BASE_SLICE && p != RAW_PART) {
			*dev = make_dev(pdev->si_devsw, dkmakeminor(u, s, p),
			    UID_ROOT, GID_OPERATOR, 0640, "%s%ds%d%c",
			    pdev->si_devsw->d_name, u, s - BASE_SLICE + 1,
			    p + 'a');
		} else if (s >= BASE_SLICE) {
			*dev = make_dev(pdev->si_devsw, dkmakeminor(u, s, p),
			    UID_ROOT, GID_OPERATOR, 0640, "%s%ds%d",
			    pdev->si_devsw->d_name, u, s - BASE_SLICE + 1);
			make_dev_alias(*dev, "%s%ds%dc",
			    pdev->si_devsw->d_name, u, s - BASE_SLICE + 1);
		} else {
			*dev = make_dev(pdev->si_devsw, dkmakeminor(u, s, p),
			    UID_ROOT, GID_OPERATOR, 0640, "%s%d%c",
			    pdev->si_devsw->d_name, u, p + 'a');
		}
		dev_depends(pdev, *dev);
		return;
	}
}
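
/*
 * Illustrative example: how disk_clone() decomposes a cloned name for a
 * driver whose cdevsw name is "ad" (the driver name is only an example):
 *
 *	"ad0"     whole disk (WHOLE_DISK_SLICE, RAW_PART); that node was
 *	          already created by disk_create(), so the handler returns
 *	"ad0a"    unit 0, COMPATIBILITY_SLICE, partition 0 ('a')
 *	"ad0s1"   unit 0, slice BASE_SLICE ("s1"), RAW_PART, plus an
 *	          "ad0s1c" alias
 *	"ad0s1e"  unit 0, slice BASE_SLICE, partition 4 ('e')
 *
 * dev_stdclone() strips the driver name and unit; an "sN" suffix selects
 * slice N - 1 + BASE_SLICE, and a trailing 'a'..'h' selects the partition.
 */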

static void
inherit_raw(dev_t pdev, dev_t dev)
{
	dev->si_disk = pdev->si_disk;
	dev->si_drv1 = pdev->si_drv1;
	dev->si_drv2 = pdev->si_drv2;
	dev->si_iosize_max = pdev->si_iosize_max;
	dev->si_bsize_phys = pdev->si_bsize_phys;
	dev->si_bsize_best = pdev->si_bsize_best;
}

dev_t
disk_create(int unit, struct disk *dp, int flags, struct cdevsw *cdevsw, struct cdevsw *proto)
{
	static int once;
	dev_t dev;

	if (!once) {
		EVENTHANDLER_REGISTER(dev_clone, disk_clone, 0, 1000);
		once++;
	}

	bzero(dp, sizeof(*dp));
	dp->d_label = malloc(sizeof *dp->d_label, M_DEVBUF, M_WAITOK|M_ZERO);

	if (proto->d_open != diskopen) {
		*proto = *cdevsw;
		proto->d_open = diskopen;
		proto->d_close = diskclose;
		proto->d_ioctl = diskioctl;
		proto->d_strategy = diskstrategy;
		proto->d_psize = diskpsize;
	}

	if (bootverbose)
		printf("Creating DISK %s%d\n", cdevsw->d_name, unit);
	dev = make_dev(proto, dkmakeminor(unit, WHOLE_DISK_SLICE, RAW_PART),
	    UID_ROOT, GID_OPERATOR, 0640, "%s%d", cdevsw->d_name, unit);

	dev->si_disk = dp;
	dp->d_dev = dev;
	dp->d_dsflags = flags;
	dp->d_devsw = cdevsw;
	LIST_INSERT_HEAD(&disklist, dp, d_list);

	return (dev);
}
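
/*
 * Illustrative sketch of the usual calling pattern for disk_create() in a
 * driver of this vintage: the driver passes its own cdevsw plus a second,
 * initially zeroed cdevsw ("proto") that disk_create() populates with the
 * diskopen()/diskclose()/diskioctl()/diskstrategy()/diskpsize() wrappers.
 * The mydisk_* names and struct mydisk_softc below are hypothetical.
 */
#if 0
static struct cdevsw mydisk_cdevsw;	/* driver's own entry points */
static struct cdevsw mydiskdisk_cdevsw;	/* filled in by disk_create() */

static void
mydisk_attach(struct mydisk_softc *sc, int unit)
{
	dev_t dev;

	dev = disk_create(unit, &sc->disk, 0, &mydisk_cdevsw,
	    &mydiskdisk_cdevsw);
	dev->si_drv1 = sc;			/* per-unit softc */
	dev->si_iosize_max = DFLTPHYS;		/* largest I/O we accept */
}
#endif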

static int
diskdumpconf(u_int onoff, dev_t dev, struct disk *dp)
{
	struct dumperinfo di;
	struct disklabel *dl;

	if (!onoff)
		return(set_dumper(NULL));
	dl = dsgetlabel(dev, dp->d_slice);
	if (!dl)
		return (ENXIO);
	bzero(&di, sizeof di);
	di.dumper = (dumper_t *)dp->d_devsw->d_dump;
	di.priv = dp->d_dev;
	di.blocksize = dl->d_secsize;
	di.mediaoffset = (off_t)(dl->d_partitions[dkpart(dev)].p_offset +
	    dp->d_slice->dss_slices[dkslice(dev)].ds_offset) * DEV_BSIZE;
	di.mediasize =
	    (off_t)(dl->d_partitions[dkpart(dev)].p_size) * DEV_BSIZE;
	if (di.mediasize == 0)
		return (EINVAL);
	return(set_dumper(&di));
}
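
/*
 * Worked example of the arithmetic above: with a slice starting at
 * absolute sector 63 (ds_offset == 63) and a partition at relative offset
 * 1048576 spanning 2097152 sectors,
 *
 *	di.mediaoffset = (1048576 + 63) * DEV_BSIZE = 536903168 bytes
 *	di.mediasize   = 2097152 * DEV_BSIZE        = 1073741824 bytes (1 GB)
 *
 * i.e. the offset is measured in bytes from the start of the underlying
 * device and the size is the dumpable extent in bytes, which is what
 * set_dumper() expects.
 */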

void
disk_invalidate(struct disk *disk)
{
	if (disk->d_slice)
		dsgone(&disk->d_slice);
}

void
disk_destroy(dev_t dev)
{
	LIST_REMOVE(dev->si_disk, d_list);
	free(dev->si_disk->d_label, M_DEVBUF);
	bzero(dev->si_disk, sizeof(*dev->si_disk));
	dev->si_disk = NULL;
	destroy_dev(dev);
	return;
}

struct disk *
disk_enumerate(struct disk *disk)
{
	if (!disk)
		return (LIST_FIRST(&disklist));
	else
		return (LIST_NEXT(disk, d_list));
}

static int
sysctl_disks(SYSCTL_HANDLER_ARGS)
{
	struct disk *disk;
	int error, first;

	disk = NULL;
	first = 1;

	while ((disk = disk_enumerate(disk))) {
		if (!first) {
			error = SYSCTL_OUT(req, " ", 1);
			if (error)
				return error;
		} else {
			first = 0;
		}
		error = SYSCTL_OUT(req, disk->d_dev->si_name, strlen(disk->d_dev->si_name));
		if (error)
			return error;
	}
	error = SYSCTL_OUT(req, "", 1);
	return error;
}

SYSCTL_PROC(_kern, OID_AUTO, disks, CTLTYPE_STRING | CTLFLAG_RD, 0, 0,
    sysctl_disks, "A", "names of available disks");

/*
 * The cdevsw functions
 */

static int
diskopen(dev_t dev, int oflags, int devtype, struct thread *td)
{
	dev_t pdev;
	struct disk *dp;
	int error;

	error = 0;
	pdev = dkmodpart(dkmodslice(dev, WHOLE_DISK_SLICE), RAW_PART);

	dp = pdev->si_disk;
	if (!dp)
		return (ENXIO);

	while (dp->d_flags & DISKFLAG_LOCK) {
		dp->d_flags |= DISKFLAG_WANTED;
		error = tsleep(dp, PRIBIO | PCATCH, "diskopen", hz);
		if (error)
			return (error);
	}
	dp->d_flags |= DISKFLAG_LOCK;

	if (!dsisopen(dp->d_slice)) {
		if (!pdev->si_iosize_max)
			pdev->si_iosize_max = dev->si_iosize_max;
		error = dp->d_devsw->d_open(pdev, oflags, devtype, td);
		dp->d_label->d_secsize = dp->d_sectorsize;
		dp->d_label->d_secperunit = dp->d_mediasize / dp->d_sectorsize;
		dp->d_label->d_nsectors = dp->d_fwsectors;
		dp->d_label->d_ntracks = dp->d_fwheads;
	}

	/* Inherit properties from the whole/raw dev_t */
	inherit_raw(pdev, dev);

	if (error)
		goto out;

	error = dsopen(dev, devtype, dp->d_dsflags, &dp->d_slice, dp->d_label);

	if (!dsisopen(dp->d_slice))
		dp->d_devsw->d_close(pdev, oflags, devtype, td);
out:
	dp->d_flags &= ~DISKFLAG_LOCK;
	if (dp->d_flags & DISKFLAG_WANTED) {
		dp->d_flags &= ~DISKFLAG_WANTED;
		wakeup(dp);
	}

	return(error);
}

static int
diskclose(dev_t dev, int fflag, int devtype, struct thread *td)
{
	struct disk *dp;
	int error;
	dev_t pdev;

	error = 0;
	pdev = dkmodpart(dkmodslice(dev, WHOLE_DISK_SLICE), RAW_PART);
	dp = pdev->si_disk;
	if (!dp)
		return (ENXIO);
	dsclose(dev, devtype, dp->d_slice);
	if (!dsisopen(dp->d_slice))
		error = dp->d_devsw->d_close(dp->d_dev, fflag, devtype, td);
	return (error);
}

static void
diskstrategy(struct bio *bp)
{
	dev_t pdev;
	struct disk *dp;

	pdev = dkmodpart(dkmodslice(bp->bio_dev, WHOLE_DISK_SLICE), RAW_PART);
	dp = pdev->si_disk;
	bp->bio_resid = bp->bio_bcount;
	if (dp != bp->bio_dev->si_disk)
		inherit_raw(pdev, bp->bio_dev);

	if (!dp) {
		biofinish(bp, NULL, ENXIO);
		return;
	}

	if (dscheck(bp, dp->d_slice) <= 0) {
		biodone(bp);
		return;
	}

	if (bp->bio_bcount == 0) {
		biodone(bp);
		return;
	}

	KASSERT(dp->d_devsw != NULL, ("NULL devsw"));
	KASSERT(dp->d_devsw->d_strategy != NULL, ("NULL d_strategy"));
	dp->d_devsw->d_strategy(bp);
	return;
}

static int
diskioctl(dev_t dev, u_long cmd, caddr_t data, int fflag, struct thread *td)
{
	struct disk *dp;
	int error;
	u_int u;
	dev_t pdev;

	pdev = dkmodpart(dkmodslice(dev, WHOLE_DISK_SLICE), RAW_PART);
	dp = pdev->si_disk;
	if (!dp)
		return (ENXIO);
	if (cmd == DIOCSKERNELDUMP) {
		u = *(u_int *)data;
		return (diskdumpconf(u, dev, dp));
	}
	if (cmd == DIOCGFRONTSTUFF) {
		*(off_t *)data = 8192;	/* XXX: crude but enough */
		return (0);
	}
	error = dsioctl(dev, cmd, data, fflag, &dp->d_slice);
	if (error == ENOIOCTL)
		error = dp->d_devsw->d_ioctl(dev, cmd, data, fflag, td);
	return (error);
}

static int
diskpsize(dev_t dev)
{
	struct disk *dp;
	dev_t pdev;

	pdev = dkmodpart(dkmodslice(dev, WHOLE_DISK_SLICE), RAW_PART);
	dp = pdev->si_disk;
	if (!dp)
		return (-1);
	if (dp != dev->si_disk) {
		dev->si_drv1 = pdev->si_drv1;
		dev->si_drv2 = pdev->si_drv2;
		/* XXX: don't set bp->b_dev->si_disk (?) */
	}
	return (dssize(dev, &dp->d_slice));
}

SYSCTL_INT(_debug_sizeof, OID_AUTO, disklabel, CTLFLAG_RD,
    0, sizeof(struct disklabel), "sizeof(struct disklabel)");

SYSCTL_INT(_debug_sizeof, OID_AUTO, diskslices, CTLFLAG_RD,
    0, sizeof(struct diskslices), "sizeof(struct diskslices)");

SYSCTL_INT(_debug_sizeof, OID_AUTO, disk, CTLFLAG_RD,
    0, sizeof(struct disk), "sizeof(struct disk)");

#endif /* NO_GEOM */

/*-
 * disk_err() is the preface to plaintive error messages about failing
 * disk transfers.  It prints messages of the form
 *	"hp0g: <what> cmd=read fsbn 12345 of 12344-12347"
 * where <what> is the caller-supplied description string.  blkdone should
 * be -1 if the position of the error within the transfer is unknown.
 * The message is printed with printf.
 */
void
disk_err(struct bio *bp, const char *what, int blkdone, int nl)
{
	daddr_t sn;

	printf("%s: %s ", devtoname(bp->bio_dev), what);
	switch(bp->bio_cmd) {
	case BIO_READ:		printf("cmd=read "); break;
	case BIO_WRITE:		printf("cmd=write "); break;
	case BIO_DELETE:	printf("cmd=delete "); break;
	case BIO_GETATTR:	printf("cmd=getattr "); break;
	case BIO_SETATTR:	printf("cmd=setattr "); break;
	default:		printf("cmd=%x ", bp->bio_cmd); break;
	}
	sn = bp->bio_blkno;
	if (bp->bio_bcount <= DEV_BSIZE) {
		printf("fsbn %jd%s", (intmax_t)sn, nl ? "\n" : "");
		return;
	}
	if (blkdone >= 0) {
		sn += blkdone;
		printf("fsbn %jd of ", (intmax_t)sn);
	}
	printf("%jd-%jd", (intmax_t)bp->bio_blkno,
	    (intmax_t)(bp->bio_blkno + (bp->bio_bcount - 1) / DEV_BSIZE));
	if (nl)
		printf("\n");
}
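
/*
 * Example: for a 16384-byte read starting at block 12344 that failed at
 * its third sector (blkdone == 2), a call such as
 *
 *	disk_err(bp, "hard error", 2, 1);
 *
 * prints a line of the form
 *
 *	ad0s1a: hard error cmd=read fsbn 12346 of 12344-12375
 *
 * where 12375 == 12344 + (16384 - 1) / DEV_BSIZE and "ad0s1a" stands for
 * whatever devtoname() reports for bp->bio_dev.
 */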

#ifdef notquite
/*
 * Mutex to use when delaying niced I/O bound processes in bioq_disksort().
 */
static struct mtx dksort_mtx;
static void
dksort_init(void)
{

	mtx_init(&dksort_mtx, "dksort", NULL, MTX_DEF);
}
SYSINIT(dksort, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, dksort_init, NULL)
#endif

/*
 * Seek sort for disks.
 *
 * The bio_queue_head keeps two queues, sorted in ascending block order.
 * The first queue holds those requests which are positioned after the
 * current block (in the first request); the second, which starts at
 * queue->switch_point, holds requests which came in after their block
 * number was passed.  Thus we implement a one-way scan, retracting after
 * reaching the end of the drive to the first request on the second queue,
 * at which time it becomes the first queue.
 *
 * A one-way scan is natural because of the way UNIX read-ahead blocks are
 * allocated.
 */
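
/*
 * Worked example: suppose the head last served block 500
 * (last_pblkno == 500) and the queue currently holds, in ascending order,
 *
 *	600, 800 | 100, 300		('|' marks switch_point, here 100)
 *
 * A new request for block 700 sorts into the first list between 600 and
 * 800.  A request for block 200 lies behind the current position, so it
 * goes into the second list between 100 and 300.  A request for block 50
 * is inserted before 100 and becomes the new switch_point.
 */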

void
bioq_disksort(struct bio_queue_head *bioq, struct bio *bp)
{
	struct bio *bq;
	struct bio *bn;
	struct bio *be;

#ifdef notquite
	struct thread *td = curthread;

	if (td && td->td_ksegrp->kg_nice > 0) {
		TAILQ_FOREACH(bn, &bioq->queue, bio_queue)
			if (BIOTOBUF(bp)->b_vp != BIOTOBUF(bn)->b_vp)
				break;
		if (bn != NULL) {
			mtx_lock(&dksort_mtx);
			msleep(&dksort_mtx, &dksort_mtx,
			    PPAUSE | PCATCH | PDROP, "ioslow",
			    td->td_ksegrp->kg_nice);
		}
	}
#endif
	if (!atomic_cmpset_int(&bioq->busy, 0, 1))
		panic("Recursing in bioq_disksort()");
	be = TAILQ_LAST(&bioq->queue, bio_queue);
	/*
	 * If the queue is empty or we are an
	 * ordered transaction, then it's easy.
	 */
	if ((bq = bioq_first(bioq)) == NULL) {
		bioq_insert_tail(bioq, bp);
		bioq->busy = 0;
		return;
	} else if (bioq->insert_point != NULL) {

		/*
		 * A certain portion of the list is
		 * "locked" to preserve ordering, so
		 * we can only insert after the insert
		 * point.
		 */
		bq = bioq->insert_point;
	} else {

		/*
		 * If we lie before the last removed (currently active)
		 * request, and are not inserting ourselves into the
		 * "locked" portion of the list, then we must add ourselves
		 * to the second request list.
		 */
		if (bp->bio_pblkno < bioq->last_pblkno) {

			bq = bioq->switch_point;
			/*
			 * If we are starting a new secondary list,
			 * then it's easy.
			 */
			if (bq == NULL) {
				bioq->switch_point = bp;
				bioq_insert_tail(bioq, bp);
				bioq->busy = 0;
				return;
			}
			/*
			 * If we lie ahead of the current switch point,
			 * insert us before the switch point and move
			 * the switch point.
			 */
			if (bp->bio_pblkno < bq->bio_pblkno) {
				bioq->switch_point = bp;
				TAILQ_INSERT_BEFORE(bq, bp, bio_queue);
				bioq->busy = 0;
				return;
			}
		} else {
			if (bioq->switch_point != NULL)
				be = TAILQ_PREV(bioq->switch_point,
						bio_queue, bio_queue);
			/*
			 * If we lie between last_pblkno and bq,
			 * insert before bq.
			 */
			if (bp->bio_pblkno < bq->bio_pblkno) {
				TAILQ_INSERT_BEFORE(bq, bp, bio_queue);
				bioq->busy = 0;
				return;
			}
		}
	}

	/*
	 * Request is at/after our current position in the list.
	 * Optimize for sequential I/O by seeing if we go at the tail.
	 */
	if (bp->bio_pblkno > be->bio_pblkno) {
		TAILQ_INSERT_AFTER(&bioq->queue, be, bp, bio_queue);
		bioq->busy = 0;
		return;
	}

	/* Otherwise, insertion sort */
	while ((bn = TAILQ_NEXT(bq, bio_queue)) != NULL) {

		/*
		 * We want to go after the current request if it is the end
		 * of the first request list, or if the next request starts
		 * at a higher block number than our request.
		 */
		if (bn == bioq->switch_point
		 || bp->bio_pblkno < bn->bio_pblkno)
			break;
		bq = bn;
	}
	TAILQ_INSERT_AFTER(&bioq->queue, bq, bp, bio_queue);
	bioq->busy = 0;
}