xref: /dflybsd-src/sys/dev/virtual/virtio/block/virtio_blk.c (revision f1a18162c172a78b14072792f1c4fa71be013701)
1 /*-
2  * Copyright (c) 2011, Bryan Venteicher <bryanv@FreeBSD.org>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice unmodified, this list of conditions, and the following
10  *    disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
19  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  *
26  * $FreeBSD: head/sys/dev/virtio/block/virtio_blk.c 252707 2013-07-04 17:57:26Z bryanv $
27  */
28 
29 /* Driver for VirtIO block devices. */
30 
31 #include <sys/param.h>
32 #include <sys/systm.h>
33 #include <sys/kernel.h>
34 #include <sys/bio.h>
35 #include <sys/malloc.h>
36 #include <sys/module.h>
37 #include <sys/sglist.h>
38 #include <sys/sysctl.h>
39 #include <sys/queue.h>
40 #include <sys/serialize.h>
41 #include <sys/buf2.h>
42 #include <sys/rman.h>
43 #include <sys/disk.h>
44 #include <sys/devicestat.h>
45 
46 #include <dev/virtual/virtio/virtio/virtio.h>
47 #include <dev/virtual/virtio/virtio/virtqueue.h>
48 #include "virtio_blk.h"
49 
/*
 * One in-flight (or free) block request.  The header and the ack byte
 * are handed to the host as the first and last scatter/gather segments
 * of every request (see vtblk_execute_request()).
 */
struct vtblk_request {
	/* Request header (type, ioprio, sector) sent to the host. */
	struct virtio_blk_outhdr	 vbr_hdr __aligned(16);
	/* The bio this request is servicing. */
	struct bio			*vbr_bio;
	/* Status byte written back by the host (VIRTIO_BLK_S_*). */
	uint8_t				 vbr_ack;

	/* Linkage on the softc's free-request list. */
	SLIST_ENTRY(vtblk_request)	 vbr_link;
};
57 
/*
 * Write cache modes.  The numeric values match what the writecache_mode
 * sysctl/tunable accepts: writethrough (0) or writeback (1).
 * VTBLK_CACHE_MAX is used as the exclusive upper bound when validating.
 */
enum vtblk_cache_mode {
	VTBLK_CACHE_WRITETHROUGH,
	VTBLK_CACHE_WRITEBACK,
	VTBLK_CACHE_MAX
};
63 
/*
 * Per-device driver state.
 */
struct vtblk_softc {
	/* Owning device; set in vtblk_attach(). */
	device_t		 vtblk_dev;
	/* Serializer taken around bio queue and virtqueue manipulation. */
	struct lwkt_serialize	 vtblk_slz;
	/* Feature set returned by virtio_negotiate_features(). */
	uint64_t		 vtblk_features;
	/* VTBLK_FLAG_* bits below. */
	uint32_t		 vtblk_flags;
	/* Set when VIRTIO_RING_F_INDIRECT_DESC was negotiated. */
#define VTBLK_FLAG_INDIRECT	0x0001
	/* Set when VIRTIO_BLK_F_RO was negotiated. */
#define VTBLK_FLAG_READONLY	0x0002
	/* Set by vtblk_detach(); new I/O is refused with ENXIO. */
#define VTBLK_FLAG_DETACH	0x0004
	/* Set by vtblk_suspend(); vtblk_startio() becomes a no-op. */
#define VTBLK_FLAG_SUSPEND	0x0008
	/* Set once vtblk_dump() has prepared the device for dumping. */
#define VTBLK_FLAG_DUMPING	0x0010
	/* Set when VIRTIO_BLK_F_CONFIG_WCE was negotiated. */
#define VTBLK_FLAG_WC_CONFIG	0x0020

	/* The single request virtqueue. */
	struct virtqueue	*vtblk_vq;
	/* Working scatter/gather list, reused for every request. */
	struct sglist		*vtblk_sglist;
	/* Disk layer state and the cdev returned by disk_create(). */
	struct disk		 vtblk_disk;
	cdev_t			 cdev;
	/* devstat accounting for this disk. */
	struct devstat		 stats;

	/* Bios waiting to be handed to the virtqueue. */
	struct bio_queue_head	 vtblk_bioq;
	/* Preallocated requests that are currently unused. */
	SLIST_HEAD(, vtblk_request)
				 vtblk_req_free;

	/* Host-reported block size, or 512 without VIRTIO_BLK_F_BLK_SIZE. */
	int			 vtblk_sector_size;
	/* Maximum segments per request, see vtblk_maximum_segments(). */
	int			 vtblk_max_nsegs;
	/* Number of preallocated requests not yet freed. */
	int			 vtblk_request_count;
	/* Current write cache mode. */
	enum vtblk_cache_mode	 vtblk_write_cache;

	/* Dedicated request used by the polled dump path. */
	struct vtblk_request	 vtblk_dump_request;
};
93 
/*
 * Feature bit to name table, registered with the VirtIO layer through
 * virtio_set_feature_desc() in vtblk_attach().  Terminated by a
 * { 0, NULL } entry.
 */
static struct virtio_feature_desc vtblk_feature_desc[] = {
	{ VIRTIO_BLK_F_BARRIER,		"HostBarrier"	},
	{ VIRTIO_BLK_F_SIZE_MAX,	"MaxSegSize"	},
	{ VIRTIO_BLK_F_SEG_MAX,		"MaxNumSegs"	},
	{ VIRTIO_BLK_F_GEOMETRY,	"DiskGeometry"	},
	{ VIRTIO_BLK_F_RO,		"ReadOnly"	},
	{ VIRTIO_BLK_F_BLK_SIZE,	"BlockSize"	},
	{ VIRTIO_BLK_F_SCSI,		"SCSICmds"	},
	{ VIRTIO_BLK_F_WCE,		"WriteCache"	},
	{ VIRTIO_BLK_F_TOPOLOGY,	"Topology"	},
	{ VIRTIO_BLK_F_CONFIG_WCE,	"ConfigWCE"	},

	{ 0, NULL }
};
108 
109 static int	vtblk_probe(device_t);
110 static int	vtblk_attach(device_t);
111 static int	vtblk_detach(device_t);
112 static int	vtblk_suspend(device_t);
113 static int	vtblk_resume(device_t);
114 static int	vtblk_shutdown(device_t);
115 
116 static void	vtblk_negotiate_features(struct vtblk_softc *);
117 static int	vtblk_alloc_intr(struct vtblk_softc *);
118 static int	vtblk_maximum_segments(struct vtblk_softc *,
119 		    struct virtio_blk_config *);
120 static int	vtblk_alloc_virtqueue(struct vtblk_softc *);
121 static void	vtblk_set_write_cache(struct vtblk_softc *, int);
122 static int	vtblk_write_cache_enabled(struct vtblk_softc *,
123 		    struct virtio_blk_config *);
124 static int	vtblk_write_cache_sysctl(SYSCTL_HANDLER_ARGS);
125 static void	vtblk_alloc_disk(struct vtblk_softc *,
126 		    struct virtio_blk_config *);
127 /*
128  * Interface to the device switch.
129  */
130 static d_open_t		vtblk_open;
131 static d_strategy_t	vtblk_strategy;
132 static d_dump_t		vtblk_dump;
133 
/*
 * Device switch entry points for the "vbd" disk device.  Passed to
 * disk_create() in vtblk_alloc_disk().  Open, strategy and dump are
 * implemented by this driver; the remaining entries use the generic
 * nullclose/physread/physwrite handlers.
 */
static struct dev_ops vbd_disk_ops = {
	{ "vbd", 200, D_DISK | D_MPSAFE },
	.d_open		= vtblk_open,
	.d_close	= nullclose,
	.d_read		= physread,
	.d_write	= physwrite,
	.d_strategy	= vtblk_strategy,
	.d_dump		= vtblk_dump,
};
143 
144 static void	vtblk_startio(struct vtblk_softc *);
145 static struct vtblk_request * vtblk_bio_request(struct vtblk_softc *);
146 static int	vtblk_execute_request(struct vtblk_softc *,
147 		    struct vtblk_request *);
148 static void	vtblk_vq_intr(void *);
149 
150 static void	vtblk_stop(struct vtblk_softc *);
151 
152 static void	vtblk_prepare_dump(struct vtblk_softc *);
153 static int	vtblk_write_dump(struct vtblk_softc *, void *, off_t, size_t);
154 static int	vtblk_flush_dump(struct vtblk_softc *);
155 static int	vtblk_poll_request(struct vtblk_softc *,
156 		    struct vtblk_request *);
157 
158 static void	vtblk_drain_vq(struct vtblk_softc *, int);
159 static void	vtblk_drain(struct vtblk_softc *);
160 
161 static int	vtblk_alloc_requests(struct vtblk_softc *);
162 static void	vtblk_free_requests(struct vtblk_softc *);
163 static struct vtblk_request * vtblk_dequeue_request(struct vtblk_softc *);
164 static void	vtblk_enqueue_request(struct vtblk_softc *,
165 		    struct vtblk_request *);
166 
167 static int	vtblk_request_error(struct vtblk_request *);
168 static void	vtblk_finish_bio(struct bio *, int);
169 
170 static void	vtblk_setup_sysctl(struct vtblk_softc *);
171 static int	vtblk_tunable_int(struct vtblk_softc *, const char *, int);
172 
/*
 * Tunables.
 *
 * hw.vtblk.writecache_mode is the global default for the write cache
 * mode; vtblk_tunable_int() lets a per-unit hw.vtblk.<unit>.writecache_mode
 * override it.  The initial value -1 is outside the valid range, in which
 * case vtblk_write_cache_enabled() falls back to the host's setting.
 */
static int vtblk_writecache_mode = -1;
TUNABLE_INT("hw.vtblk.writecache_mode", &vtblk_writecache_mode);
176 
/*
 * Features desired/implemented by this driver.  This mask is what
 * vtblk_negotiate_features() offers to virtio_negotiate_features().
 */
#define VTBLK_FEATURES \
    (VIRTIO_BLK_F_SIZE_MAX		| \
     VIRTIO_BLK_F_SEG_MAX		| \
     VIRTIO_BLK_F_GEOMETRY		| \
     VIRTIO_BLK_F_RO			| \
     VIRTIO_BLK_F_BLK_SIZE		| \
     VIRTIO_BLK_F_WCE			| \
     VIRTIO_BLK_F_CONFIG_WCE		| \
     VIRTIO_RING_F_INDIRECT_DESC)
187 
/*
 * Each block request uses at least two segments - one for the header
 * and one for the status.  Data segments, if any, sit between the two.
 */
#define VTBLK_MIN_SEGMENTS	2
193 
/*
 * Device method table wiring the newbus entry points to this driver's
 * implementations.
 */
static device_method_t vtblk_methods[] = {
	/* Device methods. */
	DEVMETHOD(device_probe,		vtblk_probe),
	DEVMETHOD(device_attach,	vtblk_attach),
	DEVMETHOD(device_detach,	vtblk_detach),
	DEVMETHOD(device_suspend,	vtblk_suspend),
	DEVMETHOD(device_resume,	vtblk_resume),
	DEVMETHOD(device_shutdown,	vtblk_shutdown),

	DEVMETHOD_END
};
205 
/*
 * Driver description: name, method table and per-device softc size.
 */
static driver_t vtblk_driver = {
	"vtblk",
	vtblk_methods,
	sizeof(struct vtblk_softc)
};
static devclass_t vtblk_devclass;

/*
 * Register the driver as module "virtio_blk" (version 1) against
 * "virtio_pci", and declare a dependency on version 1 of the "virtio"
 * module.
 */
DRIVER_MODULE(virtio_blk, virtio_pci, vtblk_driver, vtblk_devclass, NULL, NULL);
MODULE_VERSION(virtio_blk, 1);
MODULE_DEPEND(virtio_blk, virtio, 1, 1, 1);
216 
217 static int
218 vtblk_probe(device_t dev)
219 {
220 
221 	if (virtio_get_device_type(dev) != VIRTIO_ID_BLOCK)
222 		return (ENXIO);
223 
224 	device_set_desc(dev, "VirtIO Block Adapter");
225 
226 	return (BUS_PROBE_DEFAULT);
227 }
228 
229 static int
230 vtblk_attach(device_t dev)
231 {
232 	struct vtblk_softc *sc;
233 	struct virtio_blk_config blkcfg;
234 	int error;
235 
236 	sc = device_get_softc(dev);
237 	sc->vtblk_dev = dev;
238 
239 	lwkt_serialize_init(&sc->vtblk_slz);
240 
241 	bioq_init(&sc->vtblk_bioq);
242 	SLIST_INIT(&sc->vtblk_req_free);
243 
244 	virtio_set_feature_desc(dev, vtblk_feature_desc);
245 	vtblk_negotiate_features(sc);
246 
247 	if (virtio_with_feature(dev, VIRTIO_RING_F_INDIRECT_DESC))
248 		sc->vtblk_flags |= VTBLK_FLAG_INDIRECT;
249 	if (virtio_with_feature(dev, VIRTIO_BLK_F_RO))
250 		sc->vtblk_flags |= VTBLK_FLAG_READONLY;
251 	if (virtio_with_feature(dev, VIRTIO_BLK_F_CONFIG_WCE))
252 		sc->vtblk_flags |= VTBLK_FLAG_WC_CONFIG;
253 
254 	vtblk_setup_sysctl(sc);
255 
256 	/* Get local copy of config. */
257 	virtio_read_device_config(dev, 0, &blkcfg,
258 				  sizeof(struct virtio_blk_config));
259 
260 	/*
261 	 * With the current sglist(9) implementation, it is not easy
262 	 * for us to support a maximum segment size as adjacent
263 	 * segments are coalesced. For now, just make sure it's larger
264 	 * than the maximum supported transfer size.
265 	 */
266 	if (virtio_with_feature(dev, VIRTIO_BLK_F_SIZE_MAX)) {
267 		if (blkcfg.size_max < MAXPHYS) {
268 			error = ENOTSUP;
269 			device_printf(dev, "host requires unsupported "
270 			    "maximum segment size feature\n");
271 			goto fail;
272 		}
273 	}
274 
275 	sc->vtblk_max_nsegs = vtblk_maximum_segments(sc, &blkcfg);
276 	if (sc->vtblk_max_nsegs <= VTBLK_MIN_SEGMENTS) {
277 		error = EINVAL;
278 		device_printf(dev, "fewer than minimum number of segments "
279 		    "allowed: %d\n", sc->vtblk_max_nsegs);
280 		goto fail;
281 	}
282 
283 	/*
284 	 * Allocate working sglist. The number of segments may be too
285 	 * large to safely store on the stack.
286 	 */
287 	sc->vtblk_sglist = sglist_alloc(sc->vtblk_max_nsegs, M_INTWAIT);
288 	if (sc->vtblk_sglist == NULL) {
289 		error = ENOMEM;
290 		device_printf(dev, "cannot allocate sglist\n");
291 		goto fail;
292 	}
293 
294 	error = vtblk_alloc_intr(sc);
295 	if (error) {
296 		device_printf(dev, "cannot allocate interrupt\n");
297 		goto fail;
298 	}
299 
300 	error = vtblk_alloc_virtqueue(sc);
301 	if (error) {
302 		device_printf(dev, "cannot allocate virtqueue\n");
303 		goto fail;
304 	}
305 
306 	error = virtio_bind_intr(sc->vtblk_dev, 0, 0, vtblk_vq_intr, sc);
307 	if (error) {
308 		device_printf(dev, "cannot assign virtqueue to interrupt\n");
309 		goto fail;
310 	}
311 
312 	error = vtblk_alloc_requests(sc);
313 	if (error) {
314 		device_printf(dev, "cannot preallocate requests\n");
315 		goto fail;
316 	}
317 
318 	error = virtio_setup_intr(dev, 0, &sc->vtblk_slz);
319 	if (error) {
320 		device_printf(dev, "cannot setup virtqueue interrupt\n");
321 		goto fail;
322 	}
323 
324 	virtqueue_enable_intr(sc->vtblk_vq);
325 
326 	vtblk_alloc_disk(sc, &blkcfg);
327 
328 fail:
329 	if (error)
330 		vtblk_detach(dev);
331 
332 	return (error);
333 }
334 
335 static int
336 vtblk_detach(device_t dev)
337 {
338 	struct vtblk_softc *sc;
339 
340 	sc = device_get_softc(dev);
341 
342 	virtio_teardown_intr(dev, 0);
343 
344 	lwkt_serialize_enter(&sc->vtblk_slz);
345 	sc->vtblk_flags |= VTBLK_FLAG_DETACH;
346 	if (device_is_attached(dev))
347 		vtblk_stop(sc);
348 	lwkt_serialize_exit(&sc->vtblk_slz);
349 
350 	vtblk_drain(sc);
351 
352 	if (sc->cdev != NULL) {
353 		disk_destroy(&sc->vtblk_disk);
354 		sc->cdev = NULL;
355 	}
356 
357 	if (sc->vtblk_sglist != NULL) {
358 		sglist_free(sc->vtblk_sglist);
359 		sc->vtblk_sglist = NULL;
360 	}
361 
362 	return (0);
363 }
364 
365 static int
366 vtblk_suspend(device_t dev)
367 {
368 	struct vtblk_softc *sc;
369 
370 	sc = device_get_softc(dev);
371 
372 	lwkt_serialize_enter(&sc->vtblk_slz);
373 	sc->vtblk_flags |= VTBLK_FLAG_SUSPEND;
374 	/* XXX BMV: virtio_stop(), etc needed here? */
375 	lwkt_serialize_exit(&sc->vtblk_slz);
376 
377 	return (0);
378 }
379 
380 static int
381 vtblk_resume(device_t dev)
382 {
383 	struct vtblk_softc *sc;
384 
385 	sc = device_get_softc(dev);
386 
387 	lwkt_serialize_enter(&sc->vtblk_slz);
388 	/* XXX BMV: virtio_reinit(), etc needed here? */
389 	sc->vtblk_flags &= ~VTBLK_FLAG_SUSPEND;
390 #if 0 /* XXX Resume IO? */
391 	vtblk_startio(sc);
392 #endif
393 	lwkt_serialize_exit(&sc->vtblk_slz);
394 
395 	return (0);
396 }
397 
398 static int
399 vtblk_shutdown(device_t dev)
400 {
401 
402 	return (0);
403 }
404 
405 static int
406 vtblk_open(struct dev_open_args *ap)
407 {
408 	struct vtblk_softc *sc;
409 	cdev_t dev = ap->a_head.a_dev;
410 	sc = dev->si_drv1;
411 	if (sc == NULL)
412 		return (ENXIO);
413 
414 	return (sc->vtblk_flags & VTBLK_FLAG_DETACH ? ENXIO : 0);
415 }
416 
417 static int
418 vtblk_dump(struct dev_dump_args *ap)
419 {
420 	struct vtblk_softc *sc;
421 	cdev_t dev = ap->a_head.a_dev;
422         uint64_t buf_start, buf_len;
423         int error;
424 
425 	sc = dev->si_drv1;
426 	if (sc == NULL)
427 		return (ENXIO);
428 
429         buf_start = ap->a_offset;
430         buf_len = ap->a_length;
431 
432 //	lwkt_serialize_enter(&sc->vtblk_slz);
433 
434 	if ((sc->vtblk_flags & VTBLK_FLAG_DUMPING) == 0) {
435 		vtblk_prepare_dump(sc);
436 		sc->vtblk_flags |= VTBLK_FLAG_DUMPING;
437 	}
438 
439 	if (buf_len > 0)
440 		error = vtblk_write_dump(sc, ap->a_virtual, buf_start,
441 		    buf_len);
442 	else if (buf_len == 0)
443 		error = vtblk_flush_dump(sc);
444 	else {
445 		error = EINVAL;
446 		sc->vtblk_flags &= ~VTBLK_FLAG_DUMPING;
447 	}
448 
449 //	lwkt_serialize_exit(&sc->vtblk_slz);
450 
451 	return (error);
452 }
453 
454 static int
455 vtblk_strategy(struct dev_strategy_args *ap)
456 {
457 	struct vtblk_softc *sc;
458 	cdev_t dev = ap->a_head.a_dev;
459 	sc = dev->si_drv1;
460 	struct bio *bio = ap->a_bio;
461 	struct buf *bp = bio->bio_buf;
462 
463 	if (sc == NULL) {
464 		vtblk_finish_bio(bio, EINVAL);
465 		return EINVAL;
466 	}
467 
468 	/*
469 	 * Fail any write if RO. Unfortunately, there does not seem to
470 	 * be a better way to report our readonly'ness to GEOM above.
471 	 *
472 	 * XXX: Is that true in DFly?
473 	 */
474 	if (sc->vtblk_flags & VTBLK_FLAG_READONLY &&
475 	    (bp->b_cmd == BUF_CMD_READ || bp->b_cmd == BUF_CMD_FLUSH)) {
476 		vtblk_finish_bio(bio, EROFS);
477 		return (EINVAL);
478 	}
479 
480 	lwkt_serialize_enter(&sc->vtblk_slz);
481 	if ((sc->vtblk_flags & VTBLK_FLAG_DETACH) == 0) {
482 		bioqdisksort(&sc->vtblk_bioq, bio);
483 		vtblk_startio(sc);
484 		lwkt_serialize_exit(&sc->vtblk_slz);
485 	} else {
486 		lwkt_serialize_exit(&sc->vtblk_slz);
487 		vtblk_finish_bio(bio, ENXIO);
488 	}
489 	return 0;
490 }
491 
492 static void
493 vtblk_negotiate_features(struct vtblk_softc *sc)
494 {
495 	device_t dev;
496 	uint64_t features;
497 
498 	dev = sc->vtblk_dev;
499 	features = VTBLK_FEATURES;
500 
501 	sc->vtblk_features = virtio_negotiate_features(dev, features);
502 }
503 
504 /*
505  * Calculate the maximum number of DMA segment supported.  Note
506  * that the in/out header is encoded in the segment list.  We
507  * assume that VTBLK_MIN_SEGMENTS covers that part of it so
508  * we add it into the desired total.  If the SEG_MAX feature
509  * is not specified we have to just assume that the host can
510  * handle the maximum number of segments required for a MAXPHYS
511  * sized request.
512  *
513  * The additional + 1 is in case a MAXPHYS-sized buffer crosses
514  * a page boundary.
515  */
516 static int
517 vtblk_maximum_segments(struct vtblk_softc *sc,
518     struct virtio_blk_config *blkcfg)
519 {
520 	device_t dev;
521 	int nsegs;
522 
523 	dev = sc->vtblk_dev;
524 	nsegs = VTBLK_MIN_SEGMENTS;
525 
526 	if (virtio_with_feature(dev, VIRTIO_BLK_F_SEG_MAX)) {
527 		nsegs = MIN(blkcfg->seg_max, MAXPHYS / PAGE_SIZE + 1 + nsegs);
528 	} else {
529 		nsegs = MAXPHYS / PAGE_SIZE + 1 + nsegs;
530 	}
531 	if (sc->vtblk_flags & VTBLK_FLAG_INDIRECT)
532 		nsegs = MIN(nsegs, VIRTIO_MAX_INDIRECT);
533 
534 	return (nsegs);
535 }
536 
537 static int
538 vtblk_alloc_intr(struct vtblk_softc *sc)
539 {
540 	int cnt = 1;
541 	int error;
542 
543 	error = virtio_intr_alloc(sc->vtblk_dev, &cnt, 0, NULL);
544 	if (error != 0)
545 		return (error);
546 	else if (cnt != 1)
547 		return (ENXIO);
548 
549 	return (0);
550 }
551 
552 static int
553 vtblk_alloc_virtqueue(struct vtblk_softc *sc)
554 {
555 	device_t dev;
556 	struct vq_alloc_info vq_info;
557 
558 	dev = sc->vtblk_dev;
559 
560 	VQ_ALLOC_INFO_INIT(&vq_info, sc->vtblk_max_nsegs,
561 	    &sc->vtblk_vq, "%s request", device_get_nameunit(dev));
562 
563 	return (virtio_alloc_virtqueues(dev, 1, &vq_info));
564 }
565 
566 static void
567 vtblk_set_write_cache(struct vtblk_softc *sc, int wc)
568 {
569 
570 	/* Set either writeback (1) or writethrough (0) mode. */
571 	virtio_write_dev_config_1(sc->vtblk_dev,
572 	    offsetof(struct virtio_blk_config, writeback), wc);
573 }
574 
575 static int
576 vtblk_write_cache_enabled(struct vtblk_softc *sc,
577     struct virtio_blk_config *blkcfg)
578 {
579 	int wc;
580 
581 	if (sc->vtblk_flags & VTBLK_FLAG_WC_CONFIG) {
582 		wc = vtblk_tunable_int(sc, "writecache_mode",
583 		    vtblk_writecache_mode);
584 		if (wc >= 0 && wc < VTBLK_CACHE_MAX)
585 			vtblk_set_write_cache(sc, wc);
586 		else
587 			wc = blkcfg->writeback;
588 	} else
589 		wc = virtio_with_feature(sc->vtblk_dev, VIRTIO_BLK_F_WCE);
590 
591 	return (wc);
592 }
593 
594 static int
595 vtblk_write_cache_sysctl(SYSCTL_HANDLER_ARGS)
596 {
597 	struct vtblk_softc *sc;
598 	int wc, error;
599 
600 	sc = oidp->oid_arg1;
601 	wc = sc->vtblk_write_cache;
602 
603 	error = sysctl_handle_int(oidp, &wc, 0, req);
604 	if (error || req->newptr == NULL)
605 		return (error);
606 	if ((sc->vtblk_flags & VTBLK_FLAG_WC_CONFIG) == 0)
607 		return (EPERM);
608 	if (wc < 0 || wc >= VTBLK_CACHE_MAX)
609 		return (EINVAL);
610 
611 	lwkt_serialize_enter(&sc->vtblk_slz);
612 	sc->vtblk_write_cache = wc;
613 	vtblk_set_write_cache(sc, sc->vtblk_write_cache);
614 	lwkt_serialize_exit(&sc->vtblk_slz);
615 
616 	return (0);
617 }
618 
619 static void
620 vtblk_alloc_disk(struct vtblk_softc *sc, struct virtio_blk_config *blkcfg)
621 {
622 	struct disk_info info;
623 
624 	/* construct the disk_info */
625 	bzero(&info, sizeof(info));
626 
627 	if (virtio_with_feature(sc->vtblk_dev, VIRTIO_BLK_F_BLK_SIZE))
628 		sc->vtblk_sector_size = blkcfg->blk_size;
629 	else
630 		sc->vtblk_sector_size = 512;
631 
632 	/* blkcfg->capacity is always expressed in 512 byte sectors. */
633 	info.d_media_blksize = 512;
634 	info.d_media_blocks = blkcfg->capacity;
635 
636 	if (virtio_with_feature(sc->vtblk_dev, VIRTIO_BLK_F_GEOMETRY)) {
637 		info.d_ncylinders = blkcfg->geometry.cylinders;
638 		info.d_nheads = blkcfg->geometry.heads;
639 		info.d_secpertrack = blkcfg->geometry.sectors;
640 		info.d_secpercyl = info.d_secpertrack * info.d_nheads;
641 	} else {
642 		/* Fabricate a geometry */
643 		info.d_secpertrack = 1024;
644 		info.d_nheads = 1;
645 		info.d_secpercyl = info.d_secpertrack * info.d_nheads;
646 		info.d_ncylinders =
647 		    (u_int)(info.d_media_blocks / info.d_secpercyl);
648 	}
649 
650 	if (vtblk_write_cache_enabled(sc, blkcfg) != 0)
651 		sc->vtblk_write_cache = VTBLK_CACHE_WRITEBACK;
652 	else
653 		sc->vtblk_write_cache = VTBLK_CACHE_WRITETHROUGH;
654 
655 	devstat_add_entry(&sc->stats, "vbd", device_get_unit(sc->vtblk_dev),
656 			  DEV_BSIZE, DEVSTAT_ALL_SUPPORTED,
657 			  DEVSTAT_TYPE_DIRECT | DEVSTAT_TYPE_IF_OTHER,
658 			  DEVSTAT_PRIORITY_DISK);
659 
660 	/* attach a generic disk device to ourselves */
661 	sc->cdev = disk_create(device_get_unit(sc->vtblk_dev), &sc->vtblk_disk,
662 			       &vbd_disk_ops);
663 
664 	sc->cdev->si_drv1 = sc;
665 	sc->cdev->si_iosize_max = MAXPHYS;
666 	disk_setdiskinfo(&sc->vtblk_disk, &info);
667 	if (virtio_with_feature(sc->vtblk_dev, VIRTIO_BLK_F_BLK_SIZE)) {
668 		device_printf(sc->vtblk_dev, "Block size: %u\n",
669 		    sc->vtblk_sector_size);
670 	}
671 	device_printf(sc->vtblk_dev,
672 	    "%juMB (%ju 512 byte sectors: %dH %dS/T %dC)\n",
673 	    ((uintmax_t)blkcfg->capacity * 512) / (1024*1024),
674 	    (uintmax_t)blkcfg->capacity, blkcfg->geometry.heads,
675 	    blkcfg->geometry.sectors, blkcfg->geometry.cylinders);
676 }
677 
678 static void
679 vtblk_startio(struct vtblk_softc *sc)
680 {
681 	struct virtqueue *vq;
682 	struct vtblk_request *req;
683 	int enq;
684 
685 	vq = sc->vtblk_vq;
686 	enq = 0;
687 
688 	ASSERT_SERIALIZED(&sc->vtblk_slz);
689 
690 	if (sc->vtblk_flags & VTBLK_FLAG_SUSPEND)
691 		return;
692 
693 	while (!virtqueue_full(vq)) {
694 		req = vtblk_bio_request(sc);
695 		if (req == NULL)
696 			break;
697 
698 		if (vtblk_execute_request(sc, req) != 0) {
699 			bioqdisksort(&sc->vtblk_bioq, req->vbr_bio);
700 			vtblk_enqueue_request(sc, req);
701 			break;
702 		}
703 		devstat_start_transaction(&sc->stats);
704 
705 		enq++;
706 	}
707 
708 	if (enq > 0)
709 		virtqueue_notify(vq, &sc->vtblk_slz);
710 }
711 
712 static struct vtblk_request *
713 vtblk_bio_request(struct vtblk_softc *sc)
714 {
715 	struct bio_queue_head *bioq;
716 	struct vtblk_request *req;
717 	struct bio *bio;
718 	struct buf *bp;
719 
720 	bioq = &sc->vtblk_bioq;
721 
722 	if (bioq_first(bioq) == NULL)
723 		return (NULL);
724 
725 	req = vtblk_dequeue_request(sc);
726 	if (req == NULL)
727 		return (NULL);
728 
729 	bio = bioq_takefirst(bioq);
730 	req->vbr_bio = bio;
731 	req->vbr_ack = -1;
732 	req->vbr_hdr.ioprio = 1;
733 	bp = bio->bio_buf;
734 
735 	switch (bp->b_cmd) {
736 	case BUF_CMD_FLUSH:
737 		req->vbr_hdr.type = VIRTIO_BLK_T_FLUSH;
738 		break;
739 	case BUF_CMD_READ:
740 		req->vbr_hdr.type = VIRTIO_BLK_T_IN;
741 		req->vbr_hdr.sector = bio->bio_offset / DEV_BSIZE;
742 		break;
743 	case BUF_CMD_WRITE:
744 		req->vbr_hdr.type = VIRTIO_BLK_T_OUT;
745 		req->vbr_hdr.sector = bio->bio_offset / DEV_BSIZE;
746 		break;
747 	default:
748 		KASSERT(0, ("bio with unhandled cmd: %d", bp->b_cmd));
749 		req->vbr_hdr.type = -1;
750 		break;
751 	}
752 
753 	return (req);
754 }
755 
756 static int
757 vtblk_execute_request(struct vtblk_softc *sc, struct vtblk_request *req)
758 {
759 	struct sglist *sg;
760 	struct bio *bio;
761 	struct buf *bp;
762 	int writable, error;
763 
764 	sg = sc->vtblk_sglist;
765 	bio = req->vbr_bio;
766 	bp = bio->bio_buf;
767 	writable = 0;
768 
769 	/*
770 	 * sglist is live throughout this subroutine.
771 	 */
772 	error = sglist_append(sg, &req->vbr_hdr,
773 			      sizeof(struct virtio_blk_outhdr));
774 	KASSERT(error == 0, ("error adding header to sglist"));
775 	KASSERT(sg->sg_nseg == 1,
776 	    ("header spanned multiple segments: %d", sg->sg_nseg));
777 
778 	if (bp->b_cmd == BUF_CMD_READ || bp->b_cmd == BUF_CMD_WRITE) {
779 		error = sglist_append(sg, bp->b_data, bp->b_bcount);
780 		KASSERT(error == 0, ("error adding buffer to sglist"));
781 
782 		/* BUF_CMD_READ means the host writes into our buffer. */
783 		if (bp->b_cmd == BUF_CMD_READ)
784 			writable += sg->sg_nseg - 1;
785 	}
786 
787 	error = sglist_append(sg, &req->vbr_ack, sizeof(uint8_t));
788 	KASSERT(error == 0, ("error adding ack to sglist"));
789 	writable++;
790 
791 	KASSERT(sg->sg_nseg >= VTBLK_MIN_SEGMENTS,
792 	    ("fewer than min segments: %d", sg->sg_nseg));
793 
794 	error = virtqueue_enqueue(sc->vtblk_vq, req, sg,
795 				  sg->sg_nseg - writable, writable);
796 
797 	sglist_reset(sg);
798 
799 	return (error);
800 }
801 
802 static void
803 vtblk_vq_intr(void *arg)
804 {
805 	struct vtblk_softc *sc = arg;
806 	struct virtqueue *vq = sc->vtblk_vq;
807 	struct vtblk_request *req;
808 	struct bio *bio;
809 	struct buf *bp;
810 
811 	ASSERT_SERIALIZED(&sc->vtblk_slz);
812 
813 	if (!virtqueue_pending(vq))
814 		return;
815 
816 	lwkt_serialize_handler_disable(&sc->vtblk_slz);
817 	virtqueue_disable_intr(sc->vtblk_vq);
818 
819 retry:
820 	if (sc->vtblk_flags & VTBLK_FLAG_DETACH)
821 		return;
822 
823 	while ((req = virtqueue_dequeue(vq, NULL)) != NULL) {
824 		bio = req->vbr_bio;
825 		bp = bio->bio_buf;
826 
827 		if (req->vbr_ack == VIRTIO_BLK_S_OK) {
828 			bp->b_resid = 0;
829 		} else {
830 			bp->b_flags |= B_ERROR;
831 			if (req->vbr_ack == VIRTIO_BLK_S_UNSUPP) {
832 				bp->b_error = ENOTSUP;
833 			} else {
834 				bp->b_error = EIO;
835 			}
836 		}
837 
838 		devstat_end_transaction_buf(&sc->stats, bio->bio_buf);
839 
840 		lwkt_serialize_exit(&sc->vtblk_slz);
841 		/*
842 		 * Unlocking the controller around biodone() does not allow
843 		 * processing further device interrupts; when we queued
844 		 * vtblk_vq_intr, we disabled interrupts. It will allow
845 		 * concurrent vtblk_strategy/_startio command dispatches.
846 		 */
847 		biodone(bio);
848 		lwkt_serialize_enter(&sc->vtblk_slz);
849 
850 		vtblk_enqueue_request(sc, req);
851 	}
852 
853 	vtblk_startio(sc);
854 
855 	if (virtqueue_enable_intr(vq) != 0) {
856 		/*
857 		 * If new virtqueue entries appeared immediately after
858 		 * enabling interrupts, process them now. Release and
859 		 * retake softcontroller lock to try to avoid blocking
860 		 * I/O dispatch for too long.
861 		 */
862 		virtqueue_disable_intr(vq);
863 		goto retry;
864 	}
865 	lwkt_serialize_handler_enable(&sc->vtblk_slz);
866 }
867 
868 static void
869 vtblk_stop(struct vtblk_softc *sc)
870 {
871 
872 	virtqueue_disable_intr(sc->vtblk_vq);
873 	virtio_stop(sc->vtblk_dev);
874 }
875 
876 static void
877 vtblk_prepare_dump(struct vtblk_softc *sc)
878 {
879 	device_t dev;
880 	struct virtqueue *vq;
881 
882 	dev = sc->vtblk_dev;
883 	vq = sc->vtblk_vq;
884 
885 	vtblk_stop(sc);
886 
887 	/*
888 	 * Drain all requests caught in-flight in the virtqueue,
889 	 * skipping biodone(). When dumping, only one request is
890 	 * outstanding at a time, and we just poll the virtqueue
891 	 * for the response.
892 	 */
893 	vtblk_drain_vq(sc, 1);
894 
895 	if (virtio_reinit(dev, sc->vtblk_features) != 0) {
896 		panic("%s: cannot reinit VirtIO block device during dump",
897 		    device_get_nameunit(dev));
898 	}
899 
900 	virtqueue_disable_intr(vq);
901 	virtio_reinit_complete(dev);
902 }
903 
904 static int
905 vtblk_write_dump(struct vtblk_softc *sc, void *virtual, off_t offset,
906     size_t length)
907 {
908 	struct bio bio;
909 	struct buf bp;
910 	struct vtblk_request *req;
911 
912 	req = &sc->vtblk_dump_request;
913 	req->vbr_ack = -1;
914 	req->vbr_hdr.type = VIRTIO_BLK_T_OUT;
915 	req->vbr_hdr.ioprio = 1;
916 	req->vbr_hdr.sector = offset / 512;
917 
918 	req->vbr_bio = &bio;
919 	bzero(&bio, sizeof(struct bio));
920 	bzero(&bp, sizeof(struct buf));
921 
922 	bio.bio_buf = &bp;
923 	bp.b_cmd = BUF_CMD_WRITE;
924 	bp.b_data = virtual;
925 	bp.b_bcount = length;
926 
927 	return (vtblk_poll_request(sc, req));
928 }
929 
930 static int
931 vtblk_flush_dump(struct vtblk_softc *sc)
932 {
933 	struct bio bio;
934 	struct buf bp;
935 	struct vtblk_request *req;
936 
937 	req = &sc->vtblk_dump_request;
938 	req->vbr_ack = -1;
939 	req->vbr_hdr.type = VIRTIO_BLK_T_FLUSH;
940 	req->vbr_hdr.ioprio = 1;
941 	req->vbr_hdr.sector = 0;
942 
943 	req->vbr_bio = &bio;
944 	bzero(&bio, sizeof(struct bio));
945 	bzero(&bp, sizeof(struct buf));
946 
947 	bio.bio_buf = &bp;
948 	bp.b_cmd = BUF_CMD_FLUSH;
949 
950 	return (vtblk_poll_request(sc, req));
951 }
952 
953 static int
954 vtblk_poll_request(struct vtblk_softc *sc, struct vtblk_request *req)
955 {
956 	struct virtqueue *vq;
957 	int error;
958 
959 	vq = sc->vtblk_vq;
960 
961 	if (!virtqueue_empty(vq))
962 		return (EBUSY);
963 
964 	error = vtblk_execute_request(sc, req);
965 	if (error)
966 		return (error);
967 
968 	virtqueue_notify(vq, NULL);
969 	virtqueue_poll(vq, NULL);
970 
971 	error = vtblk_request_error(req);
972 	if (error && bootverbose) {
973 		device_printf(sc->vtblk_dev,
974 		    "%s: IO error: %d\n", __func__, error);
975 	}
976 
977 	return (error);
978 }
979 
980 static void
981 vtblk_drain_vq(struct vtblk_softc *sc, int skip_done)
982 {
983 	struct virtqueue *vq;
984 	struct vtblk_request *req;
985 	int last;
986 
987 	vq = sc->vtblk_vq;
988 	last = 0;
989 
990 	while ((req = virtqueue_drain(vq, &last)) != NULL) {
991 		if (!skip_done)
992 			vtblk_finish_bio(req->vbr_bio, ENXIO);
993 
994 		vtblk_enqueue_request(sc, req);
995 	}
996 
997 	KASSERT(virtqueue_empty(vq), ("virtqueue not empty"));
998 }
999 
1000 static void
1001 vtblk_drain(struct vtblk_softc *sc)
1002 {
1003 	struct bio_queue_head *bioq;
1004 	struct bio *bio;
1005 
1006 	bioq = &sc->vtblk_bioq;
1007 
1008 	if (sc->vtblk_vq != NULL)
1009 		vtblk_drain_vq(sc, 0);
1010 
1011 	while (bioq_first(bioq) != NULL) {
1012 		bio = bioq_takefirst(bioq);
1013 		vtblk_finish_bio(bio, ENXIO);
1014 	}
1015 
1016 	vtblk_free_requests(sc);
1017 }
1018 
1019 static int
1020 vtblk_alloc_requests(struct vtblk_softc *sc)
1021 {
1022 	struct vtblk_request *req;
1023 	int i, nreqs;
1024 
1025 	nreqs = virtqueue_size(sc->vtblk_vq);
1026 
1027 	/*
1028 	 * Preallocate sufficient requests to keep the virtqueue full. Each
1029 	 * request consumes VTBLK_MIN_SEGMENTS or more descriptors so reduce
1030 	 * the number allocated when indirect descriptors are not available.
1031 	 */
1032 	if ((sc->vtblk_flags & VTBLK_FLAG_INDIRECT) == 0)
1033 		nreqs /= VTBLK_MIN_SEGMENTS;
1034 
1035 	for (i = 0; i < nreqs; i++) {
1036 		req = contigmalloc(sizeof(struct vtblk_request), M_DEVBUF,
1037 		    M_WAITOK, 0, BUS_SPACE_MAXADDR, 16, 0);
1038 		if (req == NULL)
1039 			return (ENOMEM);
1040 
1041 		KKASSERT(sglist_count(&req->vbr_hdr, sizeof(req->vbr_hdr))
1042 		    == 1);
1043 		KKASSERT(sglist_count(&req->vbr_ack, sizeof(req->vbr_ack))
1044 		    == 1);
1045 
1046 		sc->vtblk_request_count++;
1047 		vtblk_enqueue_request(sc, req);
1048 	}
1049 
1050 	return (0);
1051 }
1052 
1053 static void
1054 vtblk_free_requests(struct vtblk_softc *sc)
1055 {
1056 	struct vtblk_request *req;
1057 
1058 	while ((req = vtblk_dequeue_request(sc)) != NULL) {
1059 		sc->vtblk_request_count--;
1060 		contigfree(req, sizeof(struct vtblk_request), M_DEVBUF);
1061 	}
1062 
1063 	KASSERT(sc->vtblk_request_count == 0, ("leaked requests"));
1064 }
1065 
1066 static struct vtblk_request *
1067 vtblk_dequeue_request(struct vtblk_softc *sc)
1068 {
1069 	struct vtblk_request *req;
1070 
1071 	req = SLIST_FIRST(&sc->vtblk_req_free);
1072 	if (req != NULL)
1073 		SLIST_REMOVE_HEAD(&sc->vtblk_req_free, vbr_link);
1074 
1075 	return (req);
1076 }
1077 
1078 static void
1079 vtblk_enqueue_request(struct vtblk_softc *sc, struct vtblk_request *req)
1080 {
1081 
1082 	bzero(req, sizeof(struct vtblk_request));
1083 	SLIST_INSERT_HEAD(&sc->vtblk_req_free, req, vbr_link);
1084 }
1085 
1086 static int
1087 vtblk_request_error(struct vtblk_request *req)
1088 {
1089 	int error;
1090 
1091 	switch (req->vbr_ack) {
1092 	case VIRTIO_BLK_S_OK:
1093 		error = 0;
1094 		break;
1095 	case VIRTIO_BLK_S_UNSUPP:
1096 		error = ENOTSUP;
1097 		break;
1098 	default:
1099 		error = EIO;
1100 		break;
1101 	}
1102 
1103 	return (error);
1104 }
1105 
1106 static void
1107 vtblk_finish_bio(struct bio *bio, int error)
1108 {
1109 
1110 	biodone(bio);
1111 }
1112 
1113 static void
1114 vtblk_setup_sysctl(struct vtblk_softc *sc)
1115 {
1116 	device_t dev;
1117 	struct sysctl_ctx_list *ctx;
1118 	struct sysctl_oid *tree;
1119 	struct sysctl_oid_list *child;
1120 
1121 	dev = sc->vtblk_dev;
1122 	ctx = device_get_sysctl_ctx(dev);
1123 	tree = device_get_sysctl_tree(dev);
1124 	child = SYSCTL_CHILDREN(tree);
1125 
1126 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "writecache_mode",
1127 	    CTLTYPE_INT | CTLFLAG_RW, sc, 0, vtblk_write_cache_sysctl,
1128 	    "I", "Write cache mode (writethrough (0) or writeback (1))");
1129 }
1130 
1131 static int
1132 vtblk_tunable_int(struct vtblk_softc *sc, const char *knob, int def)
1133 {
1134 	char path[64];
1135 
1136 	ksnprintf(path, sizeof(path),
1137 	    "hw.vtblk.%d.%s", device_get_unit(sc->vtblk_dev), knob);
1138 	TUNABLE_INT_FETCH(path, &def);
1139 
1140 	return (def);
1141 }
1142