1 /*      $NetBSD: xbd_xenbus.c,v 1.134 2023/07/25 16:15:50 bouyer Exp $      */
2 
3 /*
4  * Copyright (c) 2006 Manuel Bouyer.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
19  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  *
26  */
27 
28 /*
29  * This file contains the xbd frontend code required for block-level
30  * communications (similar to hard disks) between two Xen domains.
31  *
32  * We are not supposed to receive unsolicited requests from the backend. The
33  * protocol is therefore fairly simple and uses only one ring to communicate
34  * with the backend: the frontend posts requests to the ring and then waits
35  * for their replies asynchronously.
36  *
37  * xbd follows NetBSD's disk(9) convention. At any time, an LWP can schedule
38  * an operation request for the device (be it open(), read(), write(), ...).
39  * Calls are typically processed this way:
40  * - initiate request: xbdread/write/open/ioctl/..
41  * - depending on the operation, it is handled directly by the disk(9)
42  *   subsystem or goes through physio(9) first.
43  * - the request is ultimately processed by xbd_diskstart(), which prepares
44  *   the xbd requests, posts them on the I/O ring, then signals the backend.
45  *
46  * When a response is available in the queue, the backend signals the frontend
47  * via its event channel. This triggers xbd_handler(), which will link back
48  * the response to its request through the request ID, and mark the I/O as
49  * completed.
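 *
 * Transfers larger than XBD_MAX_CHUNK are either split across two ring
 * requests or, when the backend advertises feature-max-indirect-segments,
 * sent as a single BLKIF_OP_INDIRECT request whose segment list lives in a
 * separately granted indirect page.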
50  */
51 
52 #include <sys/cdefs.h>
53 __KERNEL_RCSID(0, "$NetBSD: xbd_xenbus.c,v 1.134 2023/07/25 16:15:50 bouyer Exp $");
54 
55 #include "opt_xen.h"
56 
57 
58 #include <sys/param.h>
59 #include <sys/buf.h>
60 #include <sys/bufq.h>
61 #include <sys/device.h>
62 #include <sys/disk.h>
63 #include <sys/disklabel.h>
64 #include <sys/conf.h>
65 #include <sys/fcntl.h>
66 #include <sys/kernel.h>
67 #include <sys/proc.h>
68 #include <sys/systm.h>
69 #include <sys/stat.h>
70 #include <sys/vnode.h>
71 #include <sys/mutex.h>
72 
73 #include <dev/dkvar.h>
74 
75 #include <uvm/uvm.h>
76 
77 #include <xen/intr.h>
78 #include <xen/hypervisor.h>
79 #include <xen/evtchn.h>
80 #include <xen/granttables.h>
81 #include <xen/include/public/io/blkif.h>
82 #include <xen/include/public/io/protocols.h>
83 
84 #include <xen/xenbus.h>
85 #include "locators.h"
86 
87 #undef XBD_DEBUG
88 #ifdef XBD_DEBUG
89 #define DPRINTF(x) printf x;
90 #else
91 #define DPRINTF(x)
92 #endif
93 
94 #define GRANT_INVALID_REF -1
95 
96 #define XBD_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE)
97 #define XBD_MAX_XFER (PAGE_SIZE * BLKIF_MAX_SEGMENTS_PER_REQUEST)
98 #define XBD_MAX_CHUNK	32*1024		/* max I/O size we process in 1 req */
99 #define XBD_XFER_LIMIT	(2*XBD_MAX_XFER)
100 
101 #define XEN_BSHIFT      9               /* log2(XEN_BSIZE) */
102 #define XEN_BSIZE       (1 << XEN_BSHIFT)
103 
104 CTASSERT((MAXPHYS <= 2*XBD_MAX_CHUNK));
105 CTASSERT(XEN_BSIZE == DEV_BSIZE);
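/*
 * Each regular ring request carries at most XBD_MAX_CHUNK bytes of data, so
 * the assertions above guarantee that a MAXPHYS-sized transfer fits in two
 * ring requests and that Xen's 512-byte sector unit matches DEV_BSIZE.
 */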
106 
107 struct xbd_indirect {
108 	SLIST_ENTRY(xbd_indirect) in_next;
109 	struct blkif_request_segment *in_addr;
110 	grant_ref_t in_gntref;
111 };
112 
113 struct xbd_req {
114 	SLIST_ENTRY(xbd_req) req_next;
115 	uint16_t req_id; /* ID passed to backend */
116 	bus_dmamap_t req_dmamap;
117 	struct xbd_req *req_parent, *req_child;
118 	bool req_parent_done;
119 	union {
120 	    struct {
121 		grant_ref_t req_gntref[XBD_XFER_LIMIT >> PAGE_SHIFT];
122 		struct buf *req_bp; /* buffer associated with this request */
123 		void *req_data; /* pointer to the data buffer */
124 		struct xbd_indirect *req_indirect;	/* indirect page */
125 	    } req_rw;
126 	    struct {
127 		int s_error;
128 		int s_done;
129 	    } req_sync;
130 	} u;
131 };
132 #define req_gntref	u.req_rw.req_gntref
133 #define req_bp		u.req_rw.req_bp
134 #define req_data	u.req_rw.req_data
135 #define req_indirect	u.req_rw.req_indirect
136 #define req_sync	u.req_sync
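
/*
 * u.req_rw is used by read/write requests; u.req_sync only by the synchronous
 * BLKIF_OP_FLUSH_DISKCACHE request issued from xbdioctl(DIOCCACHESYNC).
 */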
137 
138 struct xbd_xenbus_softc {
139 	struct dk_softc sc_dksc;	/* Must be first in this struct */
140 	struct xenbus_device *sc_xbusd;
141 	unsigned int sc_evtchn;
142 
143 	struct intrhand *sc_ih; /* Interrupt handler for this instance. */
144 	kmutex_t sc_lock;
145 	kcondvar_t sc_cache_flush_cv;
146 	kcondvar_t sc_req_cv;
147 	kcondvar_t sc_detach_cv;
148 	kcondvar_t sc_suspend_cv;
149 
150 	blkif_front_ring_t sc_ring;
151 	grant_ref_t sc_ring_gntref;
152 
153 	struct xbd_req sc_reqs[XBD_RING_SIZE];
154 	SLIST_HEAD(,xbd_req) sc_xbdreq_head; /* list of free requests */
155 
156 	struct xbd_indirect sc_indirect[XBD_RING_SIZE];
157 	SLIST_HEAD(,xbd_indirect) sc_indirect_head;
158 
159 	vmem_addr_t sc_unalign_buffer;
160 	void *sc_unalign_used;
161 
162 	int sc_backend_status; /* our status with backend */
163 #define BLKIF_STATE_DISCONNECTED 0
164 #define BLKIF_STATE_CONNECTED    1
165 #define BLKIF_STATE_SUSPENDED    2
166 
167 	int sc_shutdown;
168 #define BLKIF_SHUTDOWN_RUN    0 /* no shutdown */
169 #define BLKIF_SHUTDOWN_REMOTE 1 /* backend-initiated shutdown in progress */
170 #define BLKIF_SHUTDOWN_LOCAL  2 /* locally-initiated shutdown in progress */
171 
172 	uint64_t sc_sectors; /* number of sc_secsize sectors for this device */
173 	u_long sc_secsize; /* sector size */
174 	uint64_t sc_xbdsize; /* size of disk in DEV_BSIZE */
175 	u_long sc_info; /* VDISK_* */
176 	u_long sc_handle; /* from backend */
177 	int sc_features;
178 #define BLKIF_FEATURE_CACHE_FLUSH	0x1
179 #define BLKIF_FEATURE_BARRIER		0x2
180 #define BLKIF_FEATURE_PERSISTENT	0x4
181 #define BLKIF_FEATURE_INDIRECT		0x8
182 #define BLKIF_FEATURE_BITS		\
183 	"\20\1CACHE-FLUSH\2BARRIER\3PERSISTENT\4INDIRECT"
184 	struct evcnt sc_cnt_map_unalign;
185 	struct evcnt sc_cnt_unalign_busy;
186 	struct evcnt sc_cnt_queue_full;
187 	struct evcnt sc_cnt_indirect;
188 };
189 
190 static int  xbd_xenbus_match(device_t, cfdata_t, void *);
191 static void xbd_xenbus_attach(device_t, device_t, void *);
192 static int  xbd_xenbus_detach(device_t, int);
193 
194 static bool xbd_xenbus_suspend(device_t, const pmf_qual_t *);
195 static bool xbd_xenbus_resume(device_t, const pmf_qual_t *);
196 
197 static int  xbd_handler(void *);
198 static int  xbd_diskstart(device_t, struct buf *);
199 static void xbd_iosize(device_t, int *);
200 static void xbd_backend_changed(void *, XenbusState);
201 static void xbd_connect(struct xbd_xenbus_softc *);
202 static void xbd_features(struct xbd_xenbus_softc *);
203 
204 static void xbd_diskstart_submit(struct xbd_xenbus_softc *, int,
205 	struct buf *bp, int, bus_dmamap_t, grant_ref_t *);
206 static void xbd_diskstart_submit_indirect(struct xbd_xenbus_softc *,
207 	struct xbd_req *, struct buf *bp);
208 static int  xbd_map_align(struct xbd_xenbus_softc *, struct xbd_req *);
209 static void xbd_unmap_align(struct xbd_xenbus_softc *, struct xbd_req *,
210 	struct buf *);
211 
212 static void xbdminphys(struct buf *);
213 
214 CFATTACH_DECL3_NEW(xbd, sizeof(struct xbd_xenbus_softc),
215     xbd_xenbus_match, xbd_xenbus_attach, xbd_xenbus_detach, NULL, NULL, NULL,
216     DVF_DETACH_SHUTDOWN);
217 
218 static dev_type_open(xbdopen);
219 static dev_type_close(xbdclose);
220 static dev_type_read(xbdread);
221 static dev_type_write(xbdwrite);
222 static dev_type_ioctl(xbdioctl);
223 static dev_type_strategy(xbdstrategy);
224 static dev_type_dump(xbddump);
225 static dev_type_size(xbdsize);
226 
227 const struct bdevsw xbd_bdevsw = {
228 	.d_open = xbdopen,
229 	.d_close = xbdclose,
230 	.d_strategy = xbdstrategy,
231 	.d_ioctl = xbdioctl,
232 	.d_dump = xbddump,
233 	.d_psize = xbdsize,
234 	.d_discard = nodiscard,
235 	.d_flag = D_DISK | D_MPSAFE
236 };
237 
238 const struct cdevsw xbd_cdevsw = {
239 	.d_open = xbdopen,
240 	.d_close = xbdclose,
241 	.d_read = xbdread,
242 	.d_write = xbdwrite,
243 	.d_ioctl = xbdioctl,
244 	.d_stop = nostop,
245 	.d_tty = notty,
246 	.d_poll = nopoll,
247 	.d_mmap = nommap,
248 	.d_kqfilter = nokqfilter,
249 	.d_discard = nodiscard,
250 	.d_flag = D_DISK | D_MPSAFE
251 };
252 
253 extern struct cfdriver xbd_cd;
254 
255 static const struct dkdriver xbddkdriver = {
256         .d_strategy = xbdstrategy,
257 	.d_minphys = xbdminphys,
258 	.d_open = xbdopen,
259 	.d_close = xbdclose,
260 	.d_diskstart = xbd_diskstart,
261 	.d_iosize = xbd_iosize,
262 };
263 
264 static int
265 xbd_xenbus_match(device_t parent, cfdata_t match, void *aux)
266 {
267 	struct xenbusdev_attach_args *xa = aux;
268 
269 	if (strcmp(xa->xa_type, "vbd") != 0)
270 		return 0;
271 
272 	if (match->cf_loc[XENBUSCF_ID] != XENBUSCF_ID_DEFAULT &&
273 	    match->cf_loc[XENBUSCF_ID] != xa->xa_id)
274 		return 0;
275 
276 	return 1;
277 }
278 
279 static void
280 xbd_xenbus_attach(device_t parent, device_t self, void *aux)
281 {
282 	struct xbd_xenbus_softc *sc = device_private(self);
283 	struct xenbusdev_attach_args *xa = aux;
284 	blkif_sring_t *ring;
285 	RING_IDX i;
286 
287 	config_pending_incr(self);
288 	aprint_normal(": Xen Virtual Block Device Interface\n");
289 
290 	dk_init(&sc->sc_dksc, self, DKTYPE_ESDI);
291 	disk_init(&sc->sc_dksc.sc_dkdev, device_xname(self), &xbddkdriver);
292 
293 	sc->sc_xbusd = xa->xa_xbusd;
294 	sc->sc_xbusd->xbusd_otherend_changed = xbd_backend_changed;
295 
296 	mutex_init(&sc->sc_lock, MUTEX_DEFAULT, IPL_BIO);
297 	cv_init(&sc->sc_cache_flush_cv, "xbdsync");
298 	cv_init(&sc->sc_req_cv, "xbdreq");
299 	cv_init(&sc->sc_detach_cv, "xbddetach");
300 	cv_init(&sc->sc_suspend_cv, "xbdsuspend");
301 
302 	xbd_features(sc);
303 
304 	/* initialize free requests list */
305 	SLIST_INIT(&sc->sc_xbdreq_head);
306 	for (i = 0; i < XBD_RING_SIZE; i++) {
307 		sc->sc_reqs[i].req_id = i;
308 		SLIST_INSERT_HEAD(&sc->sc_xbdreq_head, &sc->sc_reqs[i],
309 		    req_next);
310 	}
311 
312 	if (sc->sc_features & BLKIF_FEATURE_INDIRECT) {
313 		/* initialize indirect page list */
314 		for (i = 0; i < XBD_RING_SIZE; i++) {
315 			vmem_addr_t va;
316 			if (uvm_km_kmem_alloc(kmem_va_arena,
317 			    PAGE_SIZE, VM_SLEEP | VM_INSTANTFIT, &va) != 0) {
318 				aprint_error_dev(self,
319 				    "can't alloc indirect pages\n");
320 				return;
321 			}
322 			sc->sc_indirect[i].in_addr = (void *)va;
323 			SLIST_INSERT_HEAD(&sc->sc_indirect_head,
324 			    &sc->sc_indirect[i], in_next);
325 		}
326 	}
327 
328 	sc->sc_backend_status = BLKIF_STATE_DISCONNECTED;
329 	sc->sc_shutdown = BLKIF_SHUTDOWN_REMOTE;
330 
331 	ring = (void *)uvm_km_alloc(kernel_map, PAGE_SIZE, 0, UVM_KMF_WIRED);
332 	if (ring == NULL)
333 		panic("%s: can't alloc ring", device_xname(self));
334 	sc->sc_ring.sring = ring;
335 
336 	evcnt_attach_dynamic(&sc->sc_cnt_map_unalign, EVCNT_TYPE_MISC,
337 	    NULL, device_xname(self), "map unaligned");
338 	evcnt_attach_dynamic(&sc->sc_cnt_unalign_busy, EVCNT_TYPE_MISC,
339 	    NULL, device_xname(self), "map unaligned busy");
340 	evcnt_attach_dynamic(&sc->sc_cnt_queue_full, EVCNT_TYPE_MISC,
341 	    NULL, device_xname(self), "queue full");
342 	evcnt_attach_dynamic(&sc->sc_cnt_indirect, EVCNT_TYPE_MISC,
343 	    NULL, device_xname(self), "indirect segment");
344 
345 	for (i = 0; i < XBD_RING_SIZE; i++) {
346 		if (bus_dmamap_create(sc->sc_xbusd->xbusd_dmat,
347 		    MAXPHYS, XBD_XFER_LIMIT >> PAGE_SHIFT,
348 		    PAGE_SIZE, PAGE_SIZE, BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW,
349 		    &sc->sc_reqs[i].req_dmamap) != 0) {
350 			aprint_error_dev(self, "can't alloc dma maps\n");
351 			return;
352 		}
353 	}
354 
355 	if (uvm_km_kmem_alloc(kmem_va_arena,
356 	    MAXPHYS, VM_SLEEP | VM_INSTANTFIT, &sc->sc_unalign_buffer) != 0) {
357 		aprint_error_dev(self, "can't alloc align buffer\n");
358 		return;
359 	}
360 
361 	/* resume shared structures and tell backend that we are ready */
362 	if (xbd_xenbus_resume(self, PMF_Q_NONE) == false) {
363 		uvm_km_free(kernel_map, (vaddr_t)ring, PAGE_SIZE,
364 		    UVM_KMF_WIRED);
365 		return;
366 	}
367 
368 	if (!pmf_device_register(self, xbd_xenbus_suspend, xbd_xenbus_resume))
369 		aprint_error_dev(self, "couldn't establish power handler\n");
370 }
371 
372 static int
373 xbd_xenbus_detach(device_t dev, int flags)
374 {
375 	struct xbd_xenbus_softc *sc = device_private(dev);
376 	int bmaj, cmaj, i, mn, rc;
377 
378 	DPRINTF(("%s: xbd_detach\n", device_xname(dev)));
379 
380 	rc = disk_begindetach(&sc->sc_dksc.sc_dkdev, NULL, dev, flags);
381 	if (rc != 0)
382 		return rc;
383 
384 	mutex_enter(&sc->sc_lock);
385 	if (sc->sc_shutdown == BLKIF_SHUTDOWN_RUN) {
386 		sc->sc_shutdown = BLKIF_SHUTDOWN_LOCAL;
387 
388 		/* wait for requests to complete */
389 		while (sc->sc_backend_status == BLKIF_STATE_CONNECTED &&
390 		    disk_isbusy(&sc->sc_dksc.sc_dkdev)) {
391 			cv_timedwait(&sc->sc_detach_cv, &sc->sc_lock, hz/2);
392 		}
393 		mutex_exit(&sc->sc_lock);
394 
395 		/* Trigger state transition with backend */
396 		xenbus_switch_state(sc->sc_xbusd, NULL, XenbusStateClosing);
397 
398 		mutex_enter(&sc->sc_lock);
399 	}
400 	if ((flags & DETACH_FORCE) == 0) {
401 		/* xbd_xenbus_detach already in progress */
402 		cv_broadcast(&sc->sc_detach_cv);
403 		mutex_exit(&sc->sc_lock);
404 		return EALREADY;
405 	}
406 	mutex_exit(&sc->sc_lock);
407 	while (xenbus_read_driver_state(sc->sc_xbusd->xbusd_otherend)
408 	    != XenbusStateClosed) {
409 		mutex_enter(&sc->sc_lock);
410 		cv_timedwait(&sc->sc_detach_cv, &sc->sc_lock, hz/2);
411 		mutex_exit(&sc->sc_lock);
412 	}
413 
414 	/* locate the major number */
415 	bmaj = bdevsw_lookup_major(&xbd_bdevsw);
416 	cmaj = cdevsw_lookup_major(&xbd_cdevsw);
417 
418 	/* Nuke the vnodes for any open instances. */
419 	for (i = 0; i < MAXPARTITIONS; i++) {
420 		mn = DISKMINOR(device_unit(dev), i);
421 		vdevgone(bmaj, mn, mn, VBLK);
422 		vdevgone(cmaj, mn, mn, VCHR);
423 	}
424 
425 	if (sc->sc_backend_status == BLKIF_STATE_CONNECTED) {
426 		/* Delete all of our wedges. */
427 		dkwedge_delall(&sc->sc_dksc.sc_dkdev);
428 
429 		/* Kill off any queued buffers. */
430 		dk_drain(&sc->sc_dksc);
431 		bufq_free(sc->sc_dksc.sc_bufq);
432 
433 		/* detach disk */
434 		disk_detach(&sc->sc_dksc.sc_dkdev);
435 		disk_destroy(&sc->sc_dksc.sc_dkdev);
436 		dk_detach(&sc->sc_dksc);
437 	}
438 
439 	hypervisor_mask_event(sc->sc_evtchn);
440 	if (sc->sc_ih != NULL) {
441 		xen_intr_disestablish(sc->sc_ih);
442 		sc->sc_ih = NULL;
443 	}
444 
445 	mutex_enter(&sc->sc_lock);
446 	while (xengnt_status(sc->sc_ring_gntref))
447 		cv_timedwait(&sc->sc_detach_cv, &sc->sc_lock, hz/2);
448 	mutex_exit(&sc->sc_lock);
449 
450 	xengnt_revoke_access(sc->sc_ring_gntref);
451 	uvm_km_free(kernel_map, (vaddr_t)sc->sc_ring.sring,
452 	    PAGE_SIZE, UVM_KMF_WIRED);
453 
454 	for (i = 0; i < XBD_RING_SIZE; i++) {
455 		if (sc->sc_reqs[i].req_dmamap != NULL) {
456 			bus_dmamap_destroy(sc->sc_xbusd->xbusd_dmat,
457 			    sc->sc_reqs[i].req_dmamap);
458 			sc->sc_reqs[i].req_dmamap = NULL;
459 		}
460 	}
461 
462 	if (sc->sc_unalign_buffer != 0) {
463 		uvm_km_kmem_free(kmem_va_arena, sc->sc_unalign_buffer, MAXPHYS);
464 		sc->sc_unalign_buffer = 0;
465 	}
466 
467 	mutex_destroy(&sc->sc_lock);
468 
469 	evcnt_detach(&sc->sc_cnt_map_unalign);
470 	evcnt_detach(&sc->sc_cnt_unalign_busy);
471 	evcnt_detach(&sc->sc_cnt_queue_full);
472 	evcnt_detach(&sc->sc_cnt_indirect);
473 
474 	pmf_device_deregister(dev);
475 
476 	return 0;
477 }
478 
479 static bool
480 xbd_xenbus_suspend(device_t dev, const pmf_qual_t *qual) {
481 
482 	struct xbd_xenbus_softc *sc;
483 
484 	sc = device_private(dev);
485 
486 	mutex_enter(&sc->sc_lock);
487 	/* wait for requests to complete, then suspend device */
488 	while (sc->sc_backend_status == BLKIF_STATE_CONNECTED &&
489 	    disk_isbusy(&sc->sc_dksc.sc_dkdev)) {
490 		cv_timedwait(&sc->sc_suspend_cv, &sc->sc_lock, hz/2);
491 	}
492 
493 	hypervisor_mask_event(sc->sc_evtchn);
494 	sc->sc_backend_status = BLKIF_STATE_SUSPENDED;
495 
496 #ifdef DIAGNOSTIC
497 	/* Check that all requests are finished and device ready for resume */
498 	int reqcnt = 0;
499 	struct xbd_req *req;
500 	SLIST_FOREACH(req, &sc->sc_xbdreq_head, req_next)
501 		reqcnt++;
502 	KASSERT(reqcnt == __arraycount(sc->sc_reqs));
503 
504 	int incnt = 0;
505 	struct xbd_indirect *in;
506 	SLIST_FOREACH(in, &sc->sc_indirect_head, in_next)
507 		incnt++;
508 	KASSERT(incnt == __arraycount(sc->sc_indirect));
509 #endif
510 
511 	mutex_exit(&sc->sc_lock);
512 
513 	xenbus_device_suspend(sc->sc_xbusd);
514 	aprint_verbose_dev(dev, "removed event channel %d\n", sc->sc_evtchn);
515 
516 	return true;
517 }
518 
519 static bool
520 xbd_xenbus_resume(device_t dev, const pmf_qual_t *qual)
521 {
522 	struct xbd_xenbus_softc *sc;
523 	struct xenbus_transaction *xbt;
524 	int error;
525 	blkif_sring_t *ring;
526 	paddr_t ma;
527 	const char *errmsg;
528 
529 	sc = device_private(dev);
530 
531 	/* All grants were removed during suspend */
532 	sc->sc_ring_gntref = GRANT_INVALID_REF;
533 
534 	/* Initialize ring */
535 	ring = sc->sc_ring.sring;
536 	memset(ring, 0, PAGE_SIZE);
537 	SHARED_RING_INIT(ring);
538 	FRONT_RING_INIT(&sc->sc_ring, ring, PAGE_SIZE);
539 
540 	/*
541 	 * get MA address of the ring, and use it to set up the grant entry
542 	 * for the block device
543 	 */
544 	(void)pmap_extract_ma(pmap_kernel(), (vaddr_t)ring, &ma);
545 	error = xenbus_grant_ring(sc->sc_xbusd, ma, &sc->sc_ring_gntref);
546 	if (error)
547 		goto abort_resume;
548 
549 	if (sc->sc_features & BLKIF_FEATURE_INDIRECT) {
550 		for (int i = 0; i < XBD_RING_SIZE; i++) {
551 			vaddr_t va = (vaddr_t)sc->sc_indirect[i].in_addr;
552 			KASSERT(va != 0);
553 			KASSERT((va & PAGE_MASK) == 0);
554 			(void)pmap_extract_ma(pmap_kernel(), va, &ma);
555 			if (xengnt_grant_access(
556 			    sc->sc_xbusd->xbusd_otherend_id,
557 			    ma, true, &sc->sc_indirect[i].in_gntref)) {
558 				aprint_error_dev(dev,
559 				    "indirect page grant failed\n");
560 				goto abort_resume;
561 			}
562 		}
563 	}
564 
565 	error = xenbus_alloc_evtchn(sc->sc_xbusd, &sc->sc_evtchn);
566 	if (error)
567 		goto abort_resume;
568 
569 	if (sc->sc_ih != NULL) {
570 		xen_intr_disestablish(sc->sc_ih);
571 		sc->sc_ih = NULL;
572 	}
573 	aprint_verbose_dev(dev, "using event channel %d\n",
574 	    sc->sc_evtchn);
575 	sc->sc_ih = xen_intr_establish_xname(-1, &xen_pic, sc->sc_evtchn,
576 	    IST_LEVEL, IPL_BIO, &xbd_handler, sc, true, device_xname(dev));
577 	KASSERT(sc->sc_ih != NULL);
578 
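	/* the xenstore transaction below must be retried if it ends with EAGAIN */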
579 again:
580 	xbt = xenbus_transaction_start();
581 	if (xbt == NULL)
582 		return false;
583 	error = xenbus_printf(xbt, sc->sc_xbusd->xbusd_path,
584 	    "ring-ref","%u", sc->sc_ring_gntref);
585 	if (error) {
586 		errmsg = "writing ring-ref";
587 		goto abort_transaction;
588 	}
589 	error = xenbus_printf(xbt, sc->sc_xbusd->xbusd_path,
590 	    "event-channel", "%u", sc->sc_evtchn);
591 	if (error) {
592 		errmsg = "writing event channel";
593 		goto abort_transaction;
594 	}
595 	error = xenbus_printf(xbt, sc->sc_xbusd->xbusd_path,
596 	    "protocol", "%s", XEN_IO_PROTO_ABI_NATIVE);
597 	if (error) {
598 		errmsg = "writing protocol";
599 		goto abort_transaction;
600 	}
601 	error = xenbus_transaction_end(xbt, 0);
602 	if (error == EAGAIN)
603 		goto again;
604 	if (error != 0) {
605 		xenbus_dev_fatal(sc->sc_xbusd, error,
606 		    "completing transaction");
607 		return false;
608 	}
609 
610 	xenbus_switch_state(sc->sc_xbusd, NULL, XenbusStateInitialised);
611 
612 	if (sc->sc_backend_status == BLKIF_STATE_SUSPENDED) {
613 		/*
614 		 * device was suspended, softc structures are
615 		 * already initialized - we use a shortcut
616 		 */
617 		sc->sc_backend_status = BLKIF_STATE_CONNECTED;
618 		xenbus_device_resume(sc->sc_xbusd);
619 		hypervisor_unmask_event(sc->sc_evtchn);
620 		xenbus_switch_state(sc->sc_xbusd, NULL, XenbusStateConnected);
621 	}
622 
623 	return true;
624 
625 abort_resume:
626 	xenbus_dev_fatal(sc->sc_xbusd, error, "resuming device");
627 	return false;
628 
629 abort_transaction:
630 	xenbus_transaction_end(xbt, 1);
631 	xenbus_dev_fatal(sc->sc_xbusd, error, "%s", errmsg);
632 	return false;
633 }
634 
635 static void
636 xbd_backend_changed(void *arg, XenbusState new_state)
637 {
638 	struct xbd_xenbus_softc *sc = device_private((device_t)arg);
639 	struct disk_geom *dg;
640 
641 	char buf[64];
642 	DPRINTF(("%s: new backend state %d\n",
643 	    device_xname(sc->sc_dksc.sc_dev), new_state));
644 
645 	switch (new_state) {
646 	case XenbusStateUnknown:
647 	case XenbusStateInitialising:
648 	case XenbusStateInitWait:
649 	case XenbusStateInitialised:
650 		break;
651 	case XenbusStateClosing:
652 		mutex_enter(&sc->sc_lock);
653 		if (sc->sc_shutdown == BLKIF_SHUTDOWN_RUN)
654 			sc->sc_shutdown = BLKIF_SHUTDOWN_REMOTE;
655 		/* wait for requests to complete */
656 		while (sc->sc_backend_status == BLKIF_STATE_CONNECTED &&
657 		    disk_isbusy(&sc->sc_dksc.sc_dkdev)) {
658 			cv_timedwait(&sc->sc_detach_cv, &sc->sc_lock, hz/2);
659 		}
660 		mutex_exit(&sc->sc_lock);
661 		xenbus_switch_state(sc->sc_xbusd, NULL, XenbusStateClosed);
662 		break;
663 	case XenbusStateConnected:
664 		/*
665 		 * note that xbd_backend_changed() can only be called by
666 		 * the xenbus thread.
667 		 */
668 
669 		if (sc->sc_backend_status == BLKIF_STATE_CONNECTED ||
670 		    sc->sc_backend_status == BLKIF_STATE_SUSPENDED)
671 			/* already connected */
672 			return;
673 
674 		xbd_connect(sc);
675 		sc->sc_shutdown = BLKIF_SHUTDOWN_RUN;
676 		sc->sc_xbdsize =
677 		    sc->sc_sectors * (uint64_t)sc->sc_secsize / DEV_BSIZE;
678 		dg = &sc->sc_dksc.sc_dkdev.dk_geom;
679 		memset(dg, 0, sizeof(*dg));
680 
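		/* fabricate a geometry: a single track per cylinder, 1MB per cylinder */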
681 		dg->dg_secperunit = sc->sc_sectors;
682 		dg->dg_secsize = sc->sc_secsize;
683 		dg->dg_ntracks = 1;
684 		dg->dg_nsectors = (1024 * 1024) / dg->dg_secsize;
685 		dg->dg_ncylinders = dg->dg_secperunit / dg->dg_nsectors;
686 
687 		bufq_alloc(&sc->sc_dksc.sc_bufq, "fcfs", 0);
688 		dk_attach(&sc->sc_dksc);
689 		disk_attach(&sc->sc_dksc.sc_dkdev);
690 
691 		sc->sc_backend_status = BLKIF_STATE_CONNECTED;
692 		hypervisor_unmask_event(sc->sc_evtchn);
693 
694 		format_bytes(buf, uimin(9, sizeof(buf)),
695 		    sc->sc_sectors * dg->dg_secsize);
696 		aprint_normal_dev(sc->sc_dksc.sc_dev,
697 				"%s, %d bytes/sect x %" PRIu64 " sectors\n",
698 				buf, (int)dg->dg_secsize, sc->sc_sectors);
699 		snprintb(buf, sizeof(buf), BLKIF_FEATURE_BITS,
700 		    sc->sc_features);
701 		aprint_normal_dev(sc->sc_dksc.sc_dev,
702 		    "backend features %s\n", buf);
703 
704 		/* Discover wedges on this disk. */
705 		dkwedge_discover(&sc->sc_dksc.sc_dkdev);
706 
707 		disk_set_info(sc->sc_dksc.sc_dev, &sc->sc_dksc.sc_dkdev, NULL);
708 
709 		/* the disk should be working now */
710 		config_pending_decr(sc->sc_dksc.sc_dev);
711 		break;
712 	default:
713 		panic("bad backend state %d", new_state);
714 	}
715 }
716 
717 static void
718 xbd_connect(struct xbd_xenbus_softc *sc)
719 {
720 	int err;
721 	unsigned long long sectors;
722 	u_long val;
723 
724 	/*
725 	 * feature-persistent must be read here (and not in xbd_features()), as
726 	 * e.g. a Linux Dom0 only writes it together with the device info.
727 	 */
728 	err = xenbus_read_ul(NULL, sc->sc_xbusd->xbusd_otherend,
729 	    "feature-persistent", &val, 10);
730 	if (err)
731 		val = 0;
732 	if (val > 0)
733 		sc->sc_features |= BLKIF_FEATURE_PERSISTENT;
734 
735 	err = xenbus_read_ul(NULL,
736 	    sc->sc_xbusd->xbusd_path, "virtual-device", &sc->sc_handle, 10);
737 	if (err)
738 		panic("%s: can't read number from %s/virtual-device\n",
739 		    device_xname(sc->sc_dksc.sc_dev),
740 		    sc->sc_xbusd->xbusd_path);
741 	err = xenbus_read_ul(NULL,
742 	    sc->sc_xbusd->xbusd_otherend, "info", &sc->sc_info, 10);
743 	if (err)
744 		panic("%s: can't read number from %s/info\n",
745 		    device_xname(sc->sc_dksc.sc_dev),
746 		    sc->sc_xbusd->xbusd_otherend);
747 	err = xenbus_read_ul(NULL,
748 	    sc->sc_xbusd->xbusd_otherend, "sector-size", &sc->sc_secsize, 10);
749 	if (err)
750 		panic("%s: can't read number from %s/sector-size\n",
751 		    device_xname(sc->sc_dksc.sc_dev),
752 		    sc->sc_xbusd->xbusd_otherend);
753 
754 	err = xenbus_read_ull(NULL,
755 	    sc->sc_xbusd->xbusd_otherend, "sectors", &sectors, 10);
756 	if (err)
757 		panic("%s: can't read number from %s/sectors\n",
758 		    device_xname(sc->sc_dksc.sc_dev),
759 		    sc->sc_xbusd->xbusd_otherend);
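	/* "sectors" is in XEN_BSIZE (512-byte) units; convert to sc_secsize sectors */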
760 	sc->sc_sectors = sectors * (uint64_t)XEN_BSIZE / sc->sc_secsize;
761 
762 	xenbus_switch_state(sc->sc_xbusd, NULL, XenbusStateConnected);
763 }
764 
765 static void
766 xbd_features(struct xbd_xenbus_softc *sc)
767 {
768 	int err;
769 	u_long val;
770 
771 	err = xenbus_read_ul(NULL, sc->sc_xbusd->xbusd_otherend,
772 	    "feature-flush-cache", &val, 10);
773 	if (err)
774 		val = 0;
775 	if (val > 0)
776 		sc->sc_features |= BLKIF_FEATURE_CACHE_FLUSH;
777 
778 	err = xenbus_read_ul(NULL, sc->sc_xbusd->xbusd_otherend,
779 	    "feature-barrier", &val, 10);
780 	if (err)
781 		val = 0;
782 	if (val > 0)
783 		sc->sc_features |= BLKIF_FEATURE_BARRIER;
784 
785 	err = xenbus_read_ul(NULL, sc->sc_xbusd->xbusd_otherend,
786 	    "feature-max-indirect-segments", &val, 10);
787 	if (err)
788 		val = 0;
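	/*
	 * An unaligned MAXPHYS transfer can span (MAXPHYS >> PAGE_SHIFT) + 1
	 * pages, hence the "+ 1" in the check below.
	 */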
789 	if (val >= (MAXPHYS >> PAGE_SHIFT) + 1) {
790 		/* We can use indirect segments, the limit is big enough */
791 		sc->sc_features |= BLKIF_FEATURE_INDIRECT;
792 	}
793 }
794 
795 static int
796 xbd_handler(void *arg)
797 {
798 	struct xbd_xenbus_softc *sc = arg;
799 	struct buf *bp;
800 	RING_IDX resp_prod, i;
801 	int more_to_do;
802 	int seg;
803 	grant_ref_t gntref;
804 
805 	DPRINTF(("xbd_handler(%s)\n", device_xname(sc->sc_dksc.sc_dev)));
806 
807 	if (__predict_false(sc->sc_backend_status != BLKIF_STATE_CONNECTED))
808 		return 0;
809 
810 	mutex_enter(&sc->sc_lock);
811 again:
812 	resp_prod = sc->sc_ring.sring->rsp_prod;
813 	xen_rmb(); /* ensure we see replies up to resp_prod */
814 	for (i = sc->sc_ring.rsp_cons; i != resp_prod; i++) {
815 		blkif_response_t *rep = RING_GET_RESPONSE(&sc->sc_ring, i);
816 		struct xbd_req *xbdreq = &sc->sc_reqs[rep->id];
817 
818 		if (rep->operation == BLKIF_OP_FLUSH_DISKCACHE) {
819 			KASSERT(xbdreq->req_bp == NULL);
820 			xbdreq->req_sync.s_error = rep->status;
821 			xbdreq->req_sync.s_done = 1;
822 			cv_broadcast(&sc->sc_cache_flush_cv);
823 			/* caller will free the req */
824 			continue;
825 		}
826 
827 		if (rep->operation != BLKIF_OP_READ &&
828 		    rep->operation != BLKIF_OP_WRITE) {
829 			aprint_error_dev(sc->sc_dksc.sc_dev,
830 			    "bad operation %d from backend\n", rep->operation);
831 			continue;
832 		}
833 
834 		bp = xbdreq->req_bp;
835 		xbdreq->req_bp = NULL;
836 		KASSERT(bp != NULL && bp->b_data != NULL);
837 		DPRINTF(("%s(%p): b_bcount = %ld\n", __func__,
838 		    bp, (long)bp->b_bcount));
839 
840 		if (bp->b_error != 0 || rep->status != BLKIF_RSP_OKAY) {
841 			DPRINTF(("%s: error %d status %d\n", __func__,
842 			    bp->b_error, rep->status));
843 			bp->b_error = EIO;
844 			bp->b_resid = bp->b_bcount;
845 		}
846 
847 		if (xbdreq->req_parent) {
848 			struct xbd_req *req_parent = xbdreq->req_parent;
849 
850 			/* Unhook and recycle child */
851 			xbdreq->req_parent = NULL;
852 			req_parent->req_child = NULL;
853 			SLIST_INSERT_HEAD(&sc->sc_xbdreq_head, xbdreq,
854 				    req_next);
855 
856 			if (!req_parent->req_parent_done) {
857 				/* Finished before parent, nothing else to do */
858 				continue;
859 			}
860 
861 			/* Must do the cleanup now */
862 			xbdreq = req_parent;
863 		}
864 		if (xbdreq->req_child) {
865 			/* Finished before child, child will cleanup */
866 			xbdreq->req_parent_done = true;
867 			continue;
868 		}
869 
870 		if (bp->b_error == 0)
871 			bp->b_resid = 0;
872 
873 		KASSERT(xbdreq->req_dmamap->dm_nsegs > 0);
874 		for (seg = 0; seg < xbdreq->req_dmamap->dm_nsegs; seg++) {
875 			/*
876 			 * We are not allowing persistent mappings, so
877 			 * expect the backend to release the grant
878 			 * immediately.
879 			 */
880 			if (xbdreq->req_indirect) {
881 				gntref =
882 				    xbdreq->req_indirect->in_addr[seg].gref;
883 			} else
884 				gntref = xbdreq->req_gntref[seg];
885 			KASSERT(xengnt_status(gntref) == 0);
886 			xengnt_revoke_access(gntref);
887 		}
888 
889 		bus_dmamap_unload(sc->sc_xbusd->xbusd_dmat, xbdreq->req_dmamap);
890 
891 		if (__predict_false(bp->b_data != xbdreq->req_data))
892 			xbd_unmap_align(sc, xbdreq, bp);
893 		xbdreq->req_data = NULL;
894 
895 		dk_done(&sc->sc_dksc, bp);
896 
897 		if (xbdreq->req_indirect) {
898 			/* No persistent mappings, so check that
899 			 * backend unmapped the indirect segment grant too.
900 			 */
901 			KASSERT(xengnt_status(xbdreq->req_indirect->in_gntref)
902 			    == 0);
903 			SLIST_INSERT_HEAD(&sc->sc_indirect_head,
904 			    xbdreq->req_indirect, in_next);
905 			xbdreq->req_indirect = NULL;
906 		}
907 		SLIST_INSERT_HEAD(&sc->sc_xbdreq_head, xbdreq, req_next);
908 	}
909 	sc->sc_ring.rsp_cons = i;
910 
911 	xen_wmb();
912 	RING_FINAL_CHECK_FOR_RESPONSES(&sc->sc_ring, more_to_do);
913 	if (more_to_do)
914 		goto again;
915 
916 	cv_signal(&sc->sc_req_cv);
917 	mutex_exit(&sc->sc_lock);
918 
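	/* restart I/O processing that may have been waiting for a free request */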
919 	dk_start(&sc->sc_dksc, NULL);
920 
921 	return 1;
922 }
923 
924 static void
925 xbdminphys(struct buf *bp)
926 {
927 	if (bp->b_bcount > XBD_XFER_LIMIT) {
928 		bp->b_bcount = XBD_XFER_LIMIT;
929 	}
930 	minphys(bp);
931 }
932 
933 static void
934 xbd_iosize(device_t dev, int *maxxfer)
935 {
936 	/*
937 	 * Always restrict dumps to XBD_MAX_XFER to avoid indirect segments,
938 	 * so that it uses as little memory as possible.
939 	 */
940 	if (*maxxfer > XBD_MAX_XFER)
941 		*maxxfer = XBD_MAX_XFER;
942 }
943 
944 static int
945 xbdopen(dev_t dev, int flags, int fmt, struct lwp *l)
946 {
947 	struct	xbd_xenbus_softc *sc;
948 
949 	sc = device_lookup_private(&xbd_cd, DISKUNIT(dev));
950 	if (sc == NULL)
951 		return (ENXIO);
952 	if ((flags & FWRITE) && (sc->sc_info & VDISK_READONLY))
953 		return EROFS;
954 
955 	DPRINTF(("xbdopen(%" PRIx64 ", %d)\n", dev, flags));
956 	return dk_open(&sc->sc_dksc, dev, flags, fmt, l);
957 }
958 
959 static int
960 xbdclose(dev_t dev, int flags, int fmt, struct lwp *l)
961 {
962 	struct xbd_xenbus_softc *sc;
963 
964 	sc = device_lookup_private(&xbd_cd, DISKUNIT(dev));
965 
966 	DPRINTF(("xbdclose(%" PRIx64 ", %d)\n", dev, flags));
967 	return dk_close(&sc->sc_dksc, dev, flags, fmt, l);
968 }
969 
970 static void
971 xbdstrategy(struct buf *bp)
972 {
973 	struct xbd_xenbus_softc *sc;
974 
975 	sc = device_lookup_private(&xbd_cd, DISKUNIT(bp->b_dev));
976 
977 	DPRINTF(("xbdstrategy(%p): b_bcount = %ld\n", bp,
978 	    (long)bp->b_bcount));
979 
980 	if (sc == NULL || sc->sc_shutdown != BLKIF_SHUTDOWN_RUN) {
981 		bp->b_error = EIO;
982 		biodone(bp);
983 		return;
984 	}
985 	if (__predict_false((sc->sc_info & VDISK_READONLY) &&
986 	    (bp->b_flags & B_READ) == 0)) {
987 		bp->b_error = EROFS;
988 		biodone(bp);
989 		return;
990 	}
991 
992 	dk_strategy(&sc->sc_dksc, bp);
993 	return;
994 }
995 
996 static int
xbdsize(dev_t dev)997 xbdsize(dev_t dev)
998 {
999 	struct	xbd_xenbus_softc *sc;
1000 
1001 	DPRINTF(("xbdsize(%" PRIx64 ")\n", dev));
1002 
1003 	sc = device_lookup_private(&xbd_cd, DISKUNIT(dev));
1004 	if (sc == NULL || sc->sc_shutdown != BLKIF_SHUTDOWN_RUN)
1005 		return -1;
1006 	return dk_size(&sc->sc_dksc, dev);
1007 }
1008 
1009 static int
1010 xbdread(dev_t dev, struct uio *uio, int flags)
1011 {
1012 	struct xbd_xenbus_softc *sc =
1013 	    device_lookup_private(&xbd_cd, DISKUNIT(dev));
1014 	struct  dk_softc *dksc = &sc->sc_dksc;
1015 
1016 	if (!DK_ATTACHED(dksc))
1017 		return ENXIO;
1018 	return physio(xbdstrategy, NULL, dev, B_READ, xbdminphys, uio);
1019 }
1020 
1021 static int
1022 xbdwrite(dev_t dev, struct uio *uio, int flags)
1023 {
1024 	struct xbd_xenbus_softc *sc =
1025 	    device_lookup_private(&xbd_cd, DISKUNIT(dev));
1026 	struct  dk_softc *dksc = &sc->sc_dksc;
1027 
1028 	if (!DK_ATTACHED(dksc))
1029 		return ENXIO;
1030 	if (__predict_false(sc->sc_info & VDISK_READONLY))
1031 		return EROFS;
1032 	return physio(xbdstrategy, NULL, dev, B_WRITE, xbdminphys, uio);
1033 }
1034 
1035 static int
1036 xbdioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
1037 {
1038 	struct xbd_xenbus_softc *sc =
1039 	    device_lookup_private(&xbd_cd, DISKUNIT(dev));
1040 	struct	dk_softc *dksc;
1041 	int	error;
1042 	struct xbd_req *xbdreq;
1043 	blkif_request_t *req;
1044 	int notify;
1045 
1046 	DPRINTF(("xbdioctl(%" PRIx64 ", %08lx, %p, %d, %p)\n",
1047 	    dev, cmd, data, flag, l));
1048 	dksc = &sc->sc_dksc;
1049 
1050 	switch (cmd) {
1051 	case DIOCGCACHE:
1052 	    {
1053 		/* Assume there is a write cache if cache-flush is supported */
1054 		int *bitsp = (int *)data;
1055 		*bitsp = 0;
1056 		if (sc->sc_features & BLKIF_FEATURE_CACHE_FLUSH)
1057 			*bitsp |= DKCACHE_WRITE;
1058 		error = 0;
1059 		break;
1060 	    }
1061 	case DIOCCACHESYNC:
1062 		if ((sc->sc_features & BLKIF_FEATURE_CACHE_FLUSH) == 0)
1063 			return EOPNOTSUPP;
1064 
1065 		mutex_enter(&sc->sc_lock);
1066 		while ((xbdreq = SLIST_FIRST(&sc->sc_xbdreq_head)) == NULL)
1067 			cv_wait(&sc->sc_req_cv, &sc->sc_lock);
1068 		KASSERT(!RING_FULL(&sc->sc_ring));
1069 
1070 		SLIST_REMOVE_HEAD(&sc->sc_xbdreq_head, req_next);
1071 		req = RING_GET_REQUEST(&sc->sc_ring,
1072 		    sc->sc_ring.req_prod_pvt);
1073 		req->id = xbdreq->req_id;
1074 		req->operation = BLKIF_OP_FLUSH_DISKCACHE;
1075 		req->handle = sc->sc_handle;
1076 		xbdreq->req_sync.s_done = 0;
1077 		sc->sc_ring.req_prod_pvt++;
1078 		RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&sc->sc_ring, notify);
1079 		if (notify)
1080 			hypervisor_notify_via_evtchn(sc->sc_evtchn);
1081 		/* request sent, now wait for completion */
1082 		while (xbdreq->req_sync.s_done == 0)
1083 			cv_wait(&sc->sc_cache_flush_cv, &sc->sc_lock);
1084 
1085 		if (xbdreq->req_sync.s_error == BLKIF_RSP_EOPNOTSUPP)
1086 			error = EOPNOTSUPP;
1087 		else if (xbdreq->req_sync.s_error == BLKIF_RSP_OKAY)
1088 			error = 0;
1089 		else
1090 			error = EIO;
1091 		SLIST_INSERT_HEAD(&sc->sc_xbdreq_head, xbdreq, req_next);
1092 		cv_signal(&sc->sc_req_cv);
1093 		mutex_exit(&sc->sc_lock);
1094 
1095 		/* Restart I/O if it was waiting for req */
1096 		dk_start(&sc->sc_dksc, NULL);
1097 		break;
1098 
1099 	default:
1100 		error = dk_ioctl(dksc, dev, cmd, data, flag, l);
1101 		break;
1102 	}
1103 
1104 	return error;
1105 }
1106 
1107 static int
1108 xbddump(dev_t dev, daddr_t blkno, void *va, size_t size)
1109 {
1110 	struct xbd_xenbus_softc *sc;
1111 
1112 	sc  = device_lookup_private(&xbd_cd, DISKUNIT(dev));
1113 	if (sc == NULL)
1114 		return (ENXIO);
1115 
1116 	DPRINTF(("xbddump(%" PRIx64 ", %" PRId64 ", %p, %lu)\n", dev, blkno, va,
1117 	    (unsigned long)size));
1118 	return dk_dump(&sc->sc_dksc, dev, blkno, va, size, 0);
1119 }
1120 
1121 static int
1122 xbd_diskstart(device_t self, struct buf *bp)
1123 {
1124 	struct xbd_xenbus_softc *sc = device_private(self);
1125 	struct xbd_req *xbdreq;
1126 	int error = 0;
1127 	int notify;
1128 
1129 	KASSERT(bp->b_bcount <= MAXPHYS);
1130 
1131 	DPRINTF(("xbd_diskstart(%p): b_bcount = %ld\n",
1132 	    bp, (long)bp->b_bcount));
1133 
1134 	mutex_enter(&sc->sc_lock);
1135 
1136 	if (sc->sc_shutdown != BLKIF_SHUTDOWN_RUN) {
1137 		error = EIO;
1138 		goto out;
1139 	}
1140 
1141 	if (bp->b_rawblkno < 0 || bp->b_rawblkno > sc->sc_sectors) {
1142 		/* invalid block number */
1143 		error = EINVAL;
1144 		goto out;
1145 	}
1146 
1147 	if (__predict_false(
1148 	    sc->sc_backend_status == BLKIF_STATE_SUSPENDED)) {
1149 		/* device is suspended, do not consume buffer */
1150 		DPRINTF(("%s: (xbd_diskstart) device suspended\n",
1151 		    sc->sc_dksc.sc_xname));
1152 		error = EAGAIN;
1153 		goto out;
1154 	}
1155 
1156 	xbdreq = SLIST_FIRST(&sc->sc_xbdreq_head);
1157 	if (__predict_false(xbdreq == NULL)) {
1158 		sc->sc_cnt_queue_full.ev_count++;
1159 		DPRINTF(("xbd_diskstart: no req\n"));
1160 		error = EAGAIN;
1161 		goto out;
1162 	}
1163 	KASSERT(!RING_FULL(&sc->sc_ring));
1164 
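	/*
	 * Without indirect segments, a transfer larger than XBD_MAX_CHUNK is
	 * split into two ring requests, so make sure a second free request
	 * will be available before committing to this transfer.
	 */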
1165 	if ((sc->sc_features & BLKIF_FEATURE_INDIRECT) == 0
1166 	    && bp->b_bcount > XBD_MAX_CHUNK) {
1167 		if (!SLIST_NEXT(xbdreq, req_next)) {
1168 			DPRINTF(("%s: need extra req\n", __func__));
1169 			error = EAGAIN;
1170 			goto out;
1171 		}
1172 	}
1173 
1174 	bp->b_resid = bp->b_bcount;
1175 	xbdreq->req_bp = bp;
1176 	xbdreq->req_data = bp->b_data;
1177 	if (__predict_false((vaddr_t)bp->b_data & (sc->sc_secsize - 1))) {
1178 		if (__predict_false(xbd_map_align(sc, xbdreq) != 0)) {
1179 			DPRINTF(("xbd_diskstart: no align\n"));
1180 			error = EAGAIN;
1181 			goto out;
1182 		}
1183 	}
1184 
1185 	if (__predict_false(bus_dmamap_load(sc->sc_xbusd->xbusd_dmat,
1186 	    xbdreq->req_dmamap, xbdreq->req_data, bp->b_bcount, NULL,
1187 	    BUS_DMA_NOWAIT) != 0)) {
1188 		printf("%s: %s: bus_dmamap_load failed\n",
1189 		    device_xname(sc->sc_dksc.sc_dev), __func__);
1190 		if (__predict_false(bp->b_data != xbdreq->req_data))
1191 			xbd_unmap_align(sc, xbdreq, NULL);
1192 		error = EINVAL;
1193 		goto out;
1194 	}
1195 	KASSERTMSG(xbdreq->req_dmamap->dm_nsegs > 0,
1196 	    "dm_nsegs == 0 with bcount %d", bp->b_bcount);
1197 
1198 	for (int seg = 0; seg < xbdreq->req_dmamap->dm_nsegs; seg++) {
1199 		KASSERT(seg < __arraycount(xbdreq->req_gntref));
1200 
1201 		paddr_t ma = xbdreq->req_dmamap->dm_segs[seg].ds_addr;
1202 		if (__predict_false(xengnt_grant_access(
1203 		    sc->sc_xbusd->xbusd_otherend_id,
1204 		    (ma & ~PAGE_MASK), (bp->b_flags & B_READ) == 0,
1205 		    &xbdreq->req_gntref[seg]))) {
1206 			printf("%s: %s: xengnt_grant_access failed\n",
1207 			    device_xname(sc->sc_dksc.sc_dev), __func__);
1208 			if (seg > 0) {
1209 				for (; --seg >= 0; ) {
1210 					xengnt_revoke_access(
1211 					    xbdreq->req_gntref[seg]);
1212 				}
1213 			}
1214 			bus_dmamap_unload(sc->sc_xbusd->xbusd_dmat,
1215 			    xbdreq->req_dmamap);
1216 			if (__predict_false(bp->b_data != xbdreq->req_data))
1217 				xbd_unmap_align(sc, xbdreq, NULL);
1218 			error = EAGAIN;
1219 			goto out;
1220 		}
1221 	}
1222 
1223 	KASSERT(xbdreq->req_parent == NULL);
1224 	KASSERT(xbdreq->req_child == NULL);
1225 
1226 	/* We are now committed to the transfer */
1227 	SLIST_REMOVE_HEAD(&sc->sc_xbdreq_head, req_next);
1228 
1229 	if ((sc->sc_features & BLKIF_FEATURE_INDIRECT) != 0 &&
1230 	    bp->b_bcount > XBD_MAX_CHUNK) {
1231 		xbd_diskstart_submit_indirect(sc, xbdreq, bp);
1232 		goto push;
1233 	}
1234 
1235 	xbd_diskstart_submit(sc, xbdreq->req_id,
1236 	    bp, 0, xbdreq->req_dmamap, xbdreq->req_gntref);
1237 
1238 	if (bp->b_bcount > XBD_MAX_CHUNK) {
1239 		KASSERT(!RING_FULL(&sc->sc_ring));
1240 		struct xbd_req *xbdreq2 = SLIST_FIRST(&sc->sc_xbdreq_head);
1241 		KASSERT(xbdreq2 != NULL); /* Checked earlier */
1242 		SLIST_REMOVE_HEAD(&sc->sc_xbdreq_head, req_next);
1243 		xbdreq->req_child = xbdreq2;
1244 		xbdreq->req_parent_done = false;
1245 		xbdreq2->req_parent = xbdreq;
1246 		xbdreq2->req_bp = bp;
1247 		xbdreq2->req_data = xbdreq->req_data;
1248 		xbd_diskstart_submit(sc, xbdreq2->req_id,
1249 		    bp, XBD_MAX_CHUNK, xbdreq->req_dmamap,
1250 		    xbdreq->req_gntref);
1251 	}
1252 
1253 push:
1254 	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&sc->sc_ring, notify);
1255 	if (notify)
1256 		hypervisor_notify_via_evtchn(sc->sc_evtchn);
1257 out:
1258 	mutex_exit(&sc->sc_lock);
1259 	return error;
1260 }
1261 
1262 static void
1263 xbd_diskstart_submit(struct xbd_xenbus_softc *sc,
1264     int req_id, struct buf *bp, int start, bus_dmamap_t dmamap,
1265     grant_ref_t *gntref)
1266 {
1267 	blkif_request_t *req;
1268 	paddr_t ma;
1269 	int nsects, nbytes, dmaseg, first_sect, size, segidx = 0;
1270 	struct blkif_request_segment *reqseg;
1271 
1272 	KASSERT(mutex_owned(&sc->sc_lock));
1273 
1274 	req = RING_GET_REQUEST(&sc->sc_ring, sc->sc_ring.req_prod_pvt);
1275 	req->id = req_id;
1276 	req->operation =
1277 	    bp->b_flags & B_READ ? BLKIF_OP_READ : BLKIF_OP_WRITE;
1278 	req->sector_number = (bp->b_rawblkno * sc->sc_secsize / XEN_BSIZE) +
1279 	    (start >> XEN_BSHIFT);
1280 	req->handle = sc->sc_handle;
1281 	DPRINTF(("%s: id %" PRIu64 " op %d sn %" PRIu64 " handle %d\n",
1282 	    __func__, req->id, req->operation, req->sector_number,
1283 	    req->handle));
1284 
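	/*
	 * Walk the DMA segments, skipping the first 'start' bytes (already
	 * covered by a previous request) and submitting at most XBD_MAX_CHUNK
	 * bytes from this request.
	 */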
1285 	size = uimin(bp->b_bcount - start, XBD_MAX_CHUNK);
1286 	for (dmaseg = 0; dmaseg < dmamap->dm_nsegs && size > 0; dmaseg++) {
1287 		bus_dma_segment_t *ds = &dmamap->dm_segs[dmaseg];
1288 
1289 		ma = ds->ds_addr;
1290 		nbytes = ds->ds_len;
1291 
1292 		if (start > 0) {
1293 			if (start >= nbytes) {
1294 				start -= nbytes;
1295 				continue;
1296 			}
1297 			ma += start;
1298 			nbytes -= start;
1299 			start = 0;
1300 		}
1301 		size -= nbytes;
1302 
1303 		KASSERT(((ma & PAGE_MASK) & (sc->sc_secsize - 1)) == 0);
1304 		KASSERT((nbytes & (sc->sc_secsize - 1)) == 0);
1305 		KASSERT((size & (sc->sc_secsize - 1)) == 0);
1306 		first_sect = (ma & PAGE_MASK) >> XEN_BSHIFT;
1307 		nsects = nbytes >> XEN_BSHIFT;
1308 
1309 		reqseg = &req->seg[segidx++];
1310 		reqseg->first_sect = first_sect;
1311 		reqseg->last_sect = first_sect + nsects - 1;
1312 		KASSERT(reqseg->first_sect <= reqseg->last_sect);
1313 		KASSERT(reqseg->last_sect < (PAGE_SIZE / XEN_BSIZE));
1314 		DPRINTF(("%s: seg %d fs %d ls %d\n", __func__, segidx,
1315 		    reqseg->first_sect, reqseg->last_sect));
1316 
1317 		reqseg->gref = gntref[dmaseg];
1318 	}
1319 	KASSERT(segidx > 0);
1320 	req->nr_segments = segidx;
1321 	sc->sc_ring.req_prod_pvt++;
1322 }
1323 
1324 static void
1325 xbd_diskstart_submit_indirect(struct xbd_xenbus_softc *sc,
1326     struct xbd_req *xbdreq, struct buf *bp)
1327 {
1328 	blkif_request_indirect_t *req;
1329 	paddr_t ma;
1330 	int nsects, nbytes, dmaseg, first_sect;
1331 	struct blkif_request_segment *reqseg;
1332 
1333 	KASSERT(mutex_owned(&sc->sc_lock));
1334 
1335 	req = (blkif_request_indirect_t *)RING_GET_REQUEST(&sc->sc_ring,
1336 	    sc->sc_ring.req_prod_pvt);
1337 	req->id = xbdreq->req_id;
1338 	req->operation = BLKIF_OP_INDIRECT;
1339 	req->indirect_op =
1340 	    bp->b_flags & B_READ ? BLKIF_OP_READ : BLKIF_OP_WRITE;
1341 	req->sector_number = bp->b_rawblkno * sc->sc_secsize / XEN_BSIZE;
1342 	req->handle = sc->sc_handle;
1343 	DPRINTF(("%s: id %" PRIu64 " op %d sn %" PRIu64 " handle %d\n",
1344 	    __func__, req->id, req->indirect_op, req->sector_number,
1345 	    req->handle));
1346 
1347 	xbdreq->req_indirect = SLIST_FIRST(&sc->sc_indirect_head);
1348 	KASSERT(xbdreq->req_indirect != NULL);	/* always as many as reqs */
1349 	SLIST_REMOVE_HEAD(&sc->sc_indirect_head, in_next);
1350 	req->indirect_grefs[0] = xbdreq->req_indirect->in_gntref;
1351 
1352 	reqseg = xbdreq->req_indirect->in_addr;
1353 	for (dmaseg = 0; dmaseg < xbdreq->req_dmamap->dm_nsegs; dmaseg++) {
1354 		bus_dma_segment_t *ds = &xbdreq->req_dmamap->dm_segs[dmaseg];
1355 
1356 		ma = ds->ds_addr;
1357 		nbytes = ds->ds_len;
1358 
1359 		KASSERT(((ma & PAGE_MASK) & (sc->sc_secsize - 1)) == 0);
1360 		KASSERT((nbytes & (sc->sc_secsize - 1)) == 0);
1361 
1362 		first_sect = (ma & PAGE_MASK) >> XEN_BSHIFT;
1363 		nsects = nbytes >> XEN_BSHIFT;
1364 
1365 		reqseg->first_sect = first_sect;
1366 		reqseg->last_sect = first_sect + nsects - 1;
1367 		reqseg->gref = xbdreq->req_gntref[dmaseg];
1368 		DPRINTF(("%s: seg %d fs %d ls %d\n", __func__, dmaseg,
1369 		    reqseg->first_sect, reqseg->last_sect));
1370 
1371 		KASSERT(reqseg->first_sect <= reqseg->last_sect);
1372 		KASSERT(reqseg->last_sect < (PAGE_SIZE / XEN_BSIZE));
1373 
1374 		reqseg++;
1375 	}
1376 	req->nr_segments = dmaseg;
1377 	sc->sc_ring.req_prod_pvt++;
1378 
1379 	sc->sc_cnt_indirect.ev_count++;
1380 }
1381 
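/*
 * A single MAXPHYS-sized bounce buffer (sc_unalign_buffer) is shared by all
 * transfers whose data is not sector-aligned; only one such transfer can be
 * in flight at a time, later ones fail with EAGAIN and are retried once the
 * buffer is free again.
 */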
1382 static int
1383 xbd_map_align(struct xbd_xenbus_softc *sc, struct xbd_req *req)
1384 {
1385 	sc->sc_cnt_map_unalign.ev_count++;
1386 
1387 	if (sc->sc_unalign_used) {
1388 		sc->sc_cnt_unalign_busy.ev_count++;
1389 		return EAGAIN;
1390 	}
1391 	sc->sc_unalign_used = req->req_bp;
1392 
1393 	KASSERT(req->req_bp->b_bcount <= MAXPHYS);
1394 	req->req_data = (void *)sc->sc_unalign_buffer;
1395 	if ((req->req_bp->b_flags & B_READ) == 0)
1396 		memcpy(req->req_data, req->req_bp->b_data,
1397 		    req->req_bp->b_bcount);
1398 	return 0;
1399 }
1400 
1401 static void
1402 xbd_unmap_align(struct xbd_xenbus_softc *sc, struct xbd_req *req,
1403     struct buf *bp)
1404 {
1405 	KASSERT(!bp || sc->sc_unalign_used == bp);
1406 	if (bp && bp->b_flags & B_READ)
1407 		memcpy(bp->b_data, req->req_data, bp->b_bcount);
1408 	sc->sc_unalign_used = NULL;
1409 }
1410