xref: /netbsd-src/sys/rump/librump/rumpvfs/rumpblk.c (revision b7b7574d3bf8eeb51a1fa3977b59142ec6434a55)
1 /*	$NetBSD: rumpblk.c,v 1.55 2014/03/16 05:20:30 dholland Exp $	*/
2 
3 /*
4  * Copyright (c) 2009 Antti Kantee.  All Rights Reserved.
5  *
6  * Development of this software was supported by the
7  * Finnish Cultural Foundation.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
19  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21  * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  */
30 
31 /*
32  * Block device emulation.  Presents a block device interface and
33  * uses rumpuser system calls to satisfy I/O requests.
34  *
35  * We provide fault injection.  The driver can be made to fail
36  * I/O occasionally.
37  */
38 
39 #include <sys/cdefs.h>
40 __KERNEL_RCSID(0, "$NetBSD: rumpblk.c,v 1.55 2014/03/16 05:20:30 dholland Exp $");
41 
42 #include <sys/param.h>
43 #include <sys/buf.h>
44 #include <sys/conf.h>
45 #include <sys/condvar.h>
46 #include <sys/disklabel.h>
47 #include <sys/evcnt.h>
48 #include <sys/fcntl.h>
49 #include <sys/kmem.h>
50 #include <sys/malloc.h>
51 #include <sys/queue.h>
52 #include <sys/stat.h>
53 #include <sys/cprng.h>
54 
55 #include <rump/rumpuser.h>
56 
57 #include "rump_private.h"
58 #include "rump_vfs_private.h"
59 
60 #if 0
61 #define DPRINTF(x) printf x
62 #else
63 #define DPRINTF(x)
64 #endif
65 
66 #define RUMPBLK_SIZE 16
67 static struct rblkdev {
68 	char *rblk_path;
69 	int rblk_fd;
70 	int rblk_mode;
71 
72 	uint64_t rblk_size;
73 	uint64_t rblk_hostoffset;
74 	uint64_t rblk_hostsize;
75 	int rblk_ftype;
76 
77 	struct disklabel rblk_label;
78 } minors[RUMPBLK_SIZE];
79 
80 static struct evcnt ev_io_total;
81 static struct evcnt ev_io_async;
82 
83 static struct evcnt ev_bwrite_total;
84 static struct evcnt ev_bwrite_async;
85 static struct evcnt ev_bread_total;
86 
87 dev_type_open(rumpblk_open);
88 dev_type_close(rumpblk_close);
89 dev_type_read(rumpblk_read);
90 dev_type_write(rumpblk_write);
91 dev_type_ioctl(rumpblk_ioctl);
92 dev_type_strategy(rumpblk_strategy);
93 dev_type_strategy(rumpblk_strategy_fail);
94 dev_type_dump(rumpblk_dump);
95 dev_type_size(rumpblk_size);
96 
97 static const struct bdevsw rumpblk_bdevsw = {
98 	.d_open = rumpblk_open,
99 	.d_close = rumpblk_close,
100 	.d_strategy = rumpblk_strategy,
101 	.d_ioctl = rumpblk_ioctl,
102 	.d_dump = nodump,
103 	.d_psize = nosize,
104 	.d_flag = D_DISK
105 };
106 
107 static const struct bdevsw rumpblk_bdevsw_fail = {
108 	.d_open = rumpblk_open,
109 	.d_close = rumpblk_close,
110 	.d_strategy = rumpblk_strategy_fail,
111 	.d_ioctl = rumpblk_ioctl,
112 	.d_dump = nodump,
113 	.d_psize = nosize,
114 	.d_flag = D_DISK
115 };
116 
117 static const struct cdevsw rumpblk_cdevsw = {
118 	.d_open = rumpblk_open,
119 	.d_close = rumpblk_close,
120 	.d_read = rumpblk_read,
121 	.d_write = rumpblk_write,
122 	.d_ioctl = rumpblk_ioctl,
123 	.d_stop = nostop,
124 	.d_tty = notty,
125 	.d_poll = nopoll,
126 	.d_mmap = nommap,
127 	.d_kqfilter = nokqfilter,
128 	.d_flag = D_DISK
129 };
130 
131 static int backend_open(struct rblkdev *, const char *);
132 static int backend_close(struct rblkdev *);
133 
134 /* fail every n out of BLKFAIL_MAX */
135 #define BLKFAIL_MAX 10000
136 static int blkfail;
137 static unsigned randstate;
138 static kmutex_t rumpblk_lock;
139 static int sectshift = DEV_BSHIFT;
140 
141 static void
142 makedefaultlabel(struct disklabel *lp, off_t size, int part)
143 {
144 	int i;
145 
146 	memset(lp, 0, sizeof(*lp));
147 
148 	lp->d_secperunit = size;
149 	lp->d_secsize = 1 << sectshift;
150 	lp->d_nsectors = size >> sectshift;
151 	lp->d_ntracks = 1;
152 	lp->d_ncylinders = 1;
153 	lp->d_secpercyl = lp->d_nsectors;
154 
155 	/* oh dear oh dear */
156 	strncpy(lp->d_typename, "rumpd", sizeof(lp->d_typename));
157 	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
158 
159 	lp->d_type = DTYPE_RUMPD;
160 	lp->d_rpm = 11;
161 	lp->d_interleave = 1;
162 	lp->d_flags = 0;
163 
164 	/* XXX: RAW_PART handling? */
165 	for (i = 0; i < part; i++) {
166 		lp->d_partitions[i].p_fstype = FS_UNUSED;
167 	}
168 	lp->d_partitions[part].p_size = size >> sectshift;
169 	lp->d_npartitions = part+1;
170 	/* XXX: file system type? */
171 
172 	lp->d_magic = DISKMAGIC;
173 	lp->d_magic2 = DISKMAGIC;
174 	lp->d_checksum = 0; /* XXX */
175 }
176 
177 int
178 rumpblk_init(void)
179 {
180 	char buf[64];
181 	devmajor_t rumpblkmaj = RUMPBLK_DEVMAJOR;
182 	unsigned tmp;
183 	int i;
184 
185 	mutex_init(&rumpblk_lock, MUTEX_DEFAULT, IPL_NONE);
186 
187 	if (rumpuser_getparam("RUMP_BLKFAIL", buf, sizeof(buf)) == 0) {
188 		blkfail = strtoul(buf, NULL, 10);
189 		/* fail everything */
190 		if (blkfail > BLKFAIL_MAX)
191 			blkfail = BLKFAIL_MAX;
192 		if (rumpuser_getparam("RUMP_BLKFAIL_SEED",
193 		    buf, sizeof(buf)) == 0) {
194 			randstate = strtoul(buf, NULL, 10);
195 		} else {
196 			randstate = cprng_fast32();
197 		}
198 		printf("rumpblk: FAULT INJECTION ACTIVE! fail %d/%d. "
199 		    "seed %u\n", blkfail, BLKFAIL_MAX, randstate);
200 	} else {
201 		blkfail = 0;
202 	}
203 
204 	if (rumpuser_getparam("RUMP_BLKSECTSHIFT", buf, sizeof(buf)) == 0) {
205 		printf("rumpblk: ");
206 		tmp = strtoul(buf, NULL, 10);
207 		if (tmp >= DEV_BSHIFT)
208 			sectshift = tmp;
209 		else
210 			printf("RUMP_BLKSECTSHIFT must be least %d (now %d), ",
211 			   DEV_BSHIFT, tmp);
212 		printf("using %d for sector shift (size %d)\n",
213 		    sectshift, 1<<sectshift);
214 	}
215 
216 	memset(minors, 0, sizeof(minors));
217 	for (i = 0; i < RUMPBLK_SIZE; i++) {
218 		minors[i].rblk_fd = -1;
219 	}
220 
221 	evcnt_attach_dynamic(&ev_io_total, EVCNT_TYPE_MISC, NULL,
222 	    "rumpblk", "I/O reqs");
223 	evcnt_attach_dynamic(&ev_io_async, EVCNT_TYPE_MISC, NULL,
224 	    "rumpblk", "async I/O");
225 
226 	evcnt_attach_dynamic(&ev_bread_total, EVCNT_TYPE_MISC, NULL,
227 	    "rumpblk", "bytes read");
228 	evcnt_attach_dynamic(&ev_bwrite_total, EVCNT_TYPE_MISC, NULL,
229 	    "rumpblk", "bytes written");
230 	evcnt_attach_dynamic(&ev_bwrite_async, EVCNT_TYPE_MISC, NULL,
231 	    "rumpblk", "bytes written async");
232 
233 	if (blkfail) {
234 		return devsw_attach("rumpblk",
235 		    &rumpblk_bdevsw_fail, &rumpblkmaj,
236 		    &rumpblk_cdevsw, &rumpblkmaj);
237 	} else {
238 		return devsw_attach("rumpblk",
239 		    &rumpblk_bdevsw, &rumpblkmaj,
240 		    &rumpblk_cdevsw, &rumpblkmaj);
241 	}
242 }
243 
/*
 * Register a host file or device as a rumpblk minor device.
 *
 * path:   host pathname of the backing file/device
 * dmin:   out-parameter, receives the allocated minor number
 * offset: byte offset into the host file at which the device starts
 * size:   device size in bytes, or RUMPBLK_SIZENOTSET to use
 *         (host file size - offset)
 *
 * Returns 0 and sets *dmin on success (including when the path is
 * already registered); EINVAL for unsupported host file types; EBUSY
 * when all RUMPBLK_SIZE minors are taken; or an error from the
 * host-side lookup/open.
 */
int
rumpblk_register(const char *path, devminor_t *dmin,
	uint64_t offset, uint64_t size)
{
	struct rblkdev *rblk;
	uint64_t flen;
	size_t len;
	int ftype, error, i;

	/* devices might not report correct size unless they're open */
	if ((error = rumpuser_getfileinfo(path, &flen, &ftype)) != 0)
		return error;

	/* verify host file is of supported type */
	if (!(ftype == RUMPUSER_FT_REG
	   || ftype == RUMPUSER_FT_BLK
	   || ftype == RUMPUSER_FT_CHR))
		return EINVAL;

	/* already registered?  return the existing minor. */
	mutex_enter(&rumpblk_lock);
	for (i = 0; i < RUMPBLK_SIZE; i++) {
		if (minors[i].rblk_path&&strcmp(minors[i].rblk_path, path)==0) {
			mutex_exit(&rumpblk_lock);
			*dmin = i;
			return 0;
		}
	}

	/* find a free slot */
	for (i = 0; i < RUMPBLK_SIZE; i++)
		if (minors[i].rblk_path == NULL)
			break;
	if (i == RUMPBLK_SIZE) {
		mutex_exit(&rumpblk_lock);
		return EBUSY;
	}

	/*
	 * Reserve the slot with a placeholder path so we can drop the
	 * lock before doing the (potentially slow) allocation and open.
	 */
	rblk = &minors[i];
	rblk->rblk_path = __UNCONST("taken");
	mutex_exit(&rumpblk_lock);

	len = strlen(path);
	rblk->rblk_path = malloc(len + 1, M_TEMP, M_WAITOK);
	strcpy(rblk->rblk_path, path);
	rblk->rblk_hostoffset = offset;
	/* explicit size must fit inside the host file; otherwise derive it */
	if (size != RUMPBLK_SIZENOTSET) {
		KASSERT(size + offset <= flen);
		rblk->rblk_size = size;
	} else {
		KASSERT(offset < flen);
		rblk->rblk_size = flen - offset;
	}
	rblk->rblk_hostsize = flen;
	rblk->rblk_ftype = ftype;
	makedefaultlabel(&rblk->rblk_label, rblk->rblk_size, i);

	/* open the host file; on failure release the slot again */
	if ((error = backend_open(rblk, path)) != 0) {
		memset(&rblk->rblk_label, 0, sizeof(rblk->rblk_label));
		free(rblk->rblk_path, M_TEMP);
		rblk->rblk_path = NULL;
		return error;
	}

	*dmin = i;
	return 0;
}
309 
310 /*
311  * Unregister rumpblk.  It's the callers responsibility to make
312  * sure it's no longer in use.
313  */
314 int
315 rumpblk_deregister(const char *path)
316 {
317 	struct rblkdev *rblk;
318 	int i;
319 
320 	mutex_enter(&rumpblk_lock);
321 	for (i = 0; i < RUMPBLK_SIZE; i++) {
322 		if (minors[i].rblk_path&&strcmp(minors[i].rblk_path, path)==0) {
323 			break;
324 		}
325 	}
326 	mutex_exit(&rumpblk_lock);
327 
328 	if (i == RUMPBLK_SIZE)
329 		return ENOENT;
330 
331 	rblk = &minors[i];
332 	backend_close(rblk);
333 
334 	free(rblk->rblk_path, M_TEMP);
335 	memset(&rblk->rblk_label, 0, sizeof(rblk->rblk_label));
336 	rblk->rblk_path = NULL;
337 
338 	return 0;
339 }
340 
341 static int
342 backend_open(struct rblkdev *rblk, const char *path)
343 {
344 	int error, fd;
345 
346 	KASSERT(rblk->rblk_fd == -1);
347 	error = rumpuser_open(path,
348 	    RUMPUSER_OPEN_RDWR | RUMPUSER_OPEN_BIO, &fd);
349 	if (error) {
350 		error = rumpuser_open(path,
351 		    RUMPUSER_OPEN_RDONLY | RUMPUSER_OPEN_BIO, &fd);
352 		if (error)
353 			return error;
354 		rblk->rblk_mode = FREAD;
355 	} else {
356 		rblk->rblk_mode = FREAD|FWRITE;
357 	}
358 
359 	rblk->rblk_fd = fd;
360 	KASSERT(rblk->rblk_fd != -1);
361 	return 0;
362 }
363 
/*
 * Close the host backing file and mark the minor as unconfigured
 * (rblk_fd == -1).  Always returns 0.
 */
static int
backend_close(struct rblkdev *rblk)
{

	rumpuser_close(rblk->rblk_fd);
	rblk->rblk_fd = -1;

	return 0;
}
373 
374 int
375 rumpblk_open(dev_t dev, int flag, int fmt, struct lwp *l)
376 {
377 	struct rblkdev *rblk = &minors[minor(dev)];
378 
379 	if (rblk->rblk_fd == -1)
380 		return ENXIO;
381 
382 	if (((flag & (FREAD|FWRITE)) & ~rblk->rblk_mode) != 0) {
383 		return EACCES;
384 	}
385 
386 	return 0;
387 }
388 
/*
 * Device close.  Nothing to do: the host fd stays open until
 * rumpblk_deregister().
 */
int
rumpblk_close(dev_t dev, int flag, int fmt, struct lwp *l)
{

	return 0;
}
395 
396 int
397 rumpblk_ioctl(dev_t dev, u_long xfer, void *addr, int flag, struct lwp *l)
398 {
399 	devminor_t dmin = minor(dev);
400 	struct rblkdev *rblk = &minors[dmin];
401 	struct partinfo *pi;
402 	int error = 0;
403 
404 	/* well, me should support a few more, but we don't for now */
405 	switch (xfer) {
406 	case DIOCGDINFO:
407 		*(struct disklabel *)addr = rblk->rblk_label;
408 		break;
409 
410 	case DIOCGPART:
411 		pi = addr;
412 		pi->part = &rblk->rblk_label.d_partitions[DISKPART(dmin)];
413 		pi->disklab = &rblk->rblk_label;
414 		break;
415 
416 	/* it's synced enough along the write path */
417 	case DIOCCACHESYNC:
418 		break;
419 
420 	default:
421 		error = ENOTTY;
422 		break;
423 	}
424 
425 	return error;
426 }
427 
428 static int
429 do_physio(dev_t dev, struct uio *uio, int which)
430 {
431 	void (*strat)(struct buf *);
432 
433 	if (blkfail)
434 		strat = rumpblk_strategy_fail;
435 	else
436 		strat = rumpblk_strategy;
437 
438 	return physio(strat, NULL, dev, which, minphys, uio);
439 }
440 
/* Character device read: raw I/O via physio(). */
int
rumpblk_read(dev_t dev, struct uio *uio, int flags)
{

	return do_physio(dev, uio, B_READ);
}
447 
/* Character device write: raw I/O via physio(). */
int
rumpblk_write(dev_t dev, struct uio *uio, int flags)
{

	return do_physio(dev, uio, B_WRITE);
}
454 
/*
 * Core strategy routine: validate and bounds-check the request, then
 * hand it to the host via rumpuser_bio(), which calls rump_biodone()
 * on completion.
 */
static void
dostrategy(struct buf *bp)
{
	struct rblkdev *rblk = &minors[minor(bp->b_dev)];
	off_t off;
	int async = bp->b_flags & B_ASYNC;
	int op;

	/* reject transfers that are not a multiple of the sector size */
	if (bp->b_bcount % (1<<sectshift) != 0) {
		rump_biodone(bp, 0, EINVAL);
		return;
	}

	/* collect statistics */
	ev_io_total.ev_count++;
	if (async)
		ev_io_async.ev_count++;
	if (BUF_ISWRITE(bp)) {
		ev_bwrite_total.ev_count += bp->b_bcount;
		if (async)
			ev_bwrite_async.ev_count += bp->b_bcount;
	} else {
		ev_bread_total.ev_count++;
	}

	/*
	 * b_blkno is always in terms of DEV_BSIZE, and since we need
	 * to translate to a byte offset for the host read, this
	 * calculation does not need sectshift.
	 */
	off = bp->b_blkno << DEV_BSHIFT;

	/*
	 * Do bounds checking if we're working on a file.  Otherwise
	 * invalid file systems might attempt to read beyond EOF.  This
	 * is bad(tm) especially on mmapped images.  This is essentially
	 * the kernel bounds_check() routines.
	 */
	/* NOTE: checked against the device size, before rblk_hostoffset
	 * is applied below. */
	if (off + bp->b_bcount > rblk->rblk_size) {
		int64_t sz = rblk->rblk_size - off;

		/* EOF */
		if (sz == 0) {
			rump_biodone(bp, 0, 0);
			return;
		}
		/* beyond EOF ==> error */
		if (sz < 0) {
			rump_biodone(bp, 0, EINVAL);
			return;
		}

		/* truncate to device size */
		bp->b_bcount = sz;
	}

	/* translate device-relative offset to host-file offset */
	off += rblk->rblk_hostoffset;
	DPRINTF(("rumpblk_strategy: 0x%x bytes %s off 0x%" PRIx64
	    " (0x%" PRIx64 " - 0x%" PRIx64 "), %ssync\n",
	    bp->b_bcount, BUF_ISREAD(bp) ? "READ" : "WRITE",
	    off, off, (off + bp->b_bcount), async ? "a" : ""));

	/* synchronous writes are flagged so the host flushes them */
	op = BUF_ISREAD(bp) ? RUMPUSER_BIO_READ : RUMPUSER_BIO_WRITE;
	if (BUF_ISWRITE(bp) && !async)
		op |= RUMPUSER_BIO_SYNC;

	rumpuser_bio(rblk->rblk_fd, op, bp->b_data, bp->b_bcount, off,
	    rump_biodone, bp);
}
524 
/* Normal (non-fault-injecting) strategy entry point. */
void
rumpblk_strategy(struct buf *bp)
{

	dostrategy(bp);
}
531 
532 /*
533  * Simple random number generator.  This is private so that we can
534  * very repeatedly control which blocks will fail.
535  *
536  * <mlelstv> pooka, rand()
537  * <mlelstv> [paste]
538  */
539 static unsigned
540 gimmerand(void)
541 {
542 
543 	return (randstate = randstate * 1103515245 + 12345) % (0x80000000L);
544 }
545 
546 /*
547  * Block device with very simple fault injection.  Fails every
548  * n out of BLKFAIL_MAX I/O with EIO.  n is determined by the env
549  * variable RUMP_BLKFAIL.
550  */
551 void
552 rumpblk_strategy_fail(struct buf *bp)
553 {
554 
555 	if (gimmerand() % BLKFAIL_MAX >= blkfail) {
556 		dostrategy(bp);
557 	} else {
558 		printf("block fault injection: failing I/O on block %lld\n",
559 		    (long long)bp->b_blkno);
560 		bp->b_error = EIO;
561 		biodone(bp);
562 	}
563 }
564