xref: /netbsd-src/sys/arch/xen/xen/xbdback_xenbus.c (revision d55161bb9f08bdbecc38fc10cede7a9a4fcc9f7d)
1 /*      $NetBSD: xbdback_xenbus.c,v 1.107 2024/06/20 15:17:27 bouyer Exp $      */
2 
3 /*
4  * Copyright (c) 2006,2024 Manuel Bouyer.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
19  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  *
26  */
27 
28 #include <sys/cdefs.h>
29 __KERNEL_RCSID(0, "$NetBSD: xbdback_xenbus.c,v 1.107 2024/06/20 15:17:27 bouyer Exp $");
30 
31 #include <sys/buf.h>
32 #include <sys/condvar.h>
33 #include <sys/conf.h>
34 #include <sys/disk.h>
35 #include <sys/device.h>
36 #include <sys/fcntl.h>
37 #include <sys/kauth.h>
38 #include <sys/kernel.h>
39 #include <sys/kmem.h>
40 #include <sys/kthread.h>
41 #include <sys/mutex.h>
42 #include <sys/param.h>
43 #include <sys/queue.h>
44 #include <sys/systm.h>
45 #include <sys/time.h>
46 #include <sys/types.h>
47 #include <sys/vnode.h>
48 
49 #include <xen/intr.h>
50 #include <xen/hypervisor.h>
51 #include <xen/xen.h>
52 #include <xen/xen_shm.h>
53 #include <xen/evtchn.h>
54 #include <xen/xenbus.h>
55 #include <xen/xenring.h>
56 #include <xen/include/public/io/protocols.h>
57 
58 /* #define XENDEBUG_VBD */
59 #ifdef XENDEBUG_VBD
60 #define XENPRINTF(x) printf x
61 #else
62 #define XENPRINTF(x)
63 #endif
64 
65 #define BLKIF_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE)
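/*
 * Number of slots in the shared ring; with the usual 4 KB page size
 * this works out to 32.  The per-instance xbdi_io and xbdi_va arrays
 * below are sized from it.
 */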
66 
67 /*
68  * Backend block device driver for Xen
69  */
70 
71 /* Values are expressed in 512-byte sectors */
72 #define VBD_BSIZE 512
73 #define VBD_MAXSECT ((PAGE_SIZE / VBD_BSIZE) - 1)
74 
75 #define VBD_VA_SIZE			MAXPHYS
76 #define VBD_MAX_INDIRECT_SEGMENTS	(VBD_VA_SIZE >> PAGE_SHIFT)
77 
78 CTASSERT(XENSHM_MAX_PAGES_PER_REQUEST >= VBD_MAX_INDIRECT_SEGMENTS);
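/*
 * With the usual values (PAGE_SIZE 4 KB, MAXPHYS 64 KB) this gives
 * VBD_MAXSECT = 7 (sectors 0-7 within one page) and
 * VBD_MAX_INDIRECT_SEGMENTS = 16, i.e. 16 pages or one MAXPHYS-sized
 * transfer per indirect request.
 */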
79 
80 struct xbdback_instance;
81 
82 /*
83  * status of a xbdback instance:
84  * WAITING: xbdback instance is connected, waiting for requests
85  * RUN: xbdi thread must be woken up, I/Os have to be processed
86  * DISCONNECTING: the instance is closing, no more I/Os can be scheduled
87  * DISCONNECTED: no I/Os, no ring, the thread should terminate.
88  */
89 typedef enum {WAITING, RUN, DISCONNECTING, DISCONNECTED} xbdback_state_t;
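/*
 * Transitions: xbdback_wakeup_thread() moves WAITING -> RUN when an event
 * or iodone arrives; the thread resets RUN -> WAITING before processing;
 * xbdback_disconnect() requests DISCONNECTING, and the thread switches to
 * DISCONNECTED once all pending I/O has drained.
 */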
90 
91 /*
92  * Each xbdback instance is managed by a single thread that handles all
93  * the I/O processing. As there are a variety of conditions that can block,
94  * everything will be done in a sort of continuation-passing style.
95  *
96  * When the execution has to block to delay processing, for example to
97  * allow system to recover because of memory shortage (via shared memory
98  * callback), the return value of a continuation can be set to NULL. In that
99  * case, the thread will go back to sleeping and wait for the proper
100  * condition before it starts processing requests again from where it left.
101  * Continuation state is "stored" in the xbdback instance (xbdi_cont),
102  * and should only be manipulated by the instance thread.
103  * If a continuation has to be restarted from a specific point,
104  * the callback and argument can be stored in xbdi_cont_restart and
105  * xbdi_cont_restart_obj
106  *
107  *
108  * As xbdback(4) has to handle different sorts of asynchronous events (Xen
109  * event channels, biointr() soft interrupts, xenbus commands), the xbdi_lock
110  * mutex is used to protect specific elements of the xbdback instance from
111  * concurrent access: thread status and ring access (when pushing responses).
112  *
113  * Here's how the call graph is supposed to be for a single I/O:
114  *
115  * xbdback_co_main()
116  *        |               --> xbdback_co_cache_flush()
117  *        |               |    |
118  *        |               |    -> xbdback_co_do_io() or NULL
119  * xbdback_co_main_loop()-|
120  *        |               |-> xbdback_co_main_done2() or NULL
121  *        |               |
122  *        |               --> xbdback_co_main_incr() -> xbdback_co_main_loop()
123  *        |
124  *     xbdback_co_io() -> xbdback_co_main_incr() -> xbdback_co_main_loop()
125  *        |
126  *     xbdback_co_io_gotio() -> xbdback_co_main_incr() -> xbdback_co_main_loop()
127  *        |
128  *     xbdback_co_do_io()
129  *        |
130  *     xbdback_co_main_incr() -> xbdback_co_main_loop()
131  */
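/*
 * Illustrative sketch (not part of the driver) of the shape of a
 * continuation as driven by xbdback_trampoline() below: each step records
 * the next callback in xbdi_cont and returns a non-NULL object to keep the
 * trampoline iterating, or returns NULL to yield back to the thread loop
 * until it is woken up again:
 *
 *	static void *
 *	example_cont(struct xbdback_instance *xbdi, void *obj)
 *	{
 *		if (must_wait_for_resources) {
 *			xbdi->xbdi_cont = NULL;	// stall; retried on wakeup
 *			return NULL;
 *		}
 *		xbdi->xbdi_cont = example_next_cont;	// schedule next step
 *		return obj;				// keep the trampoline going
 *	}
 */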
132 typedef void *(* xbdback_cont_t)(struct xbdback_instance *, void *);
133 
134 enum xbdi_proto {
135 	XBDIP_NATIVE,
136 	XBDIP_32,
137 	XBDIP_64
138 };
139 
140 struct xbdback_va {
141 	SLIST_ENTRY(xbdback_va) xv_next;
142 	vaddr_t xv_vaddr;
143 };
144 
145 /*
146  * For each I/O operation associated with a ring request, an
147  * xbdback_io is allocated from a pool.  It may correspond to multiple
148  * Xen disk requests, or parts of them, if several arrive at once that
149  * can be coalesced.
150  */
151 struct xbdback_io {
152 	SLIST_ENTRY(xbdback_io) xio_next;
153 	/* The instance pointer is duplicated for convenience. */
154 	struct xbdback_instance *xio_xbdi; /* our xbd instance */
155 	/* _request state: track requests fetched from ring */
156 	blkif_request_t xio_xen_req;
157 	/* array of segments[VBD_MAX_INDIRECT_SEGMENTS] allocated separately */
158 	struct blkif_request_segment *xio_seg;
159 	bus_dmamap_t xio_seg_dmamap;
160 	/* internal states */
161 	union {
162 		struct {
163 			struct buf xio_buf; /* our I/O */
164 			/* the virtual address to map the request at */
165 			vaddr_t xio_vaddr;
166 			struct xbdback_va *xio_xv;
167 			vaddr_t xio_start_offset;	/* I/O start offset */
168 			/* grants to map */
169 			grant_ref_t xio_gref[VBD_MAX_INDIRECT_SEGMENTS];
170 			/* grant handles, used to unmap the grants */
171 			grant_handle_t xio_gh[VBD_MAX_INDIRECT_SEGMENTS];
172 			bool xio_need_bounce; /* request is not contiguous */
173 		} xio_rw;
174 	} u;
175 };
176 #define xio_buf		u.xio_rw.xio_buf
177 #define xio_vaddr	u.xio_rw.xio_vaddr
178 #define xio_start_offset	u.xio_rw.xio_start_offset
179 #define xio_xv		u.xio_rw.xio_xv
180 #define xio_gref	u.xio_rw.xio_gref
181 #define xio_gh		u.xio_rw.xio_gh
182 #define xio_need_bounce	u.xio_rw.xio_need_bounce
183 
184 /* we keep the xbdback instances in a linked list */
185 struct xbdback_instance {
186 	SLIST_ENTRY(xbdback_instance) next;
187 	struct xenbus_device *xbdi_xbusd; /* our xenstore entry */
188 	struct xenbus_watch xbdi_watch; /* to watch our store */
189 	domid_t xbdi_domid;	/* attached to this domain */
190 	uint32_t xbdi_handle;	/* domain-specific handle */
191 	char xbdi_name[16];	/* name of this instance */
192 	/* mutex that protects concurrent access to the xbdback instance */
193 	kmutex_t xbdi_lock;
194 	kcondvar_t xbdi_cv;	/* wait channel for thread work */
195 	xbdback_state_t xbdi_status; /* thread's status */
196 	/* context and KVA for mapping transfers */
197 	struct xbdback_io xbdi_io[BLKIF_RING_SIZE];
198 	SLIST_HEAD(, xbdback_io) xbdi_io_free;
199 	struct xbdback_va xbdi_va[BLKIF_RING_SIZE];
200 	SLIST_HEAD(, xbdback_va) xbdi_va_free;
201 	/* segments structure allocated in page-aligned chunks */
202 	struct blkif_request_segment *xbdi_segs;
203 	/* bounce buffer in case a transfer is not contiguous */
204 	vaddr_t xbdi_bouncebuf;
205 	int xbdi_bouncebuf_use; /* is bounce buffer in use? */
206 	/* backing device parameters */
207 	dev_t xbdi_dev;
208 	const struct bdevsw *xbdi_bdevsw; /* pointer to the device's bdevsw */
209 	struct vnode *xbdi_vp;
210 	uint64_t xbdi_size;
211 	bool xbdi_ro; /* is device read-only ? */
212 	/* parameters for the communication */
213 	unsigned int xbdi_evtchn;
214 	struct intrhand *xbdi_ih;
215 	/* private parameters for communication */
216 	blkif_back_ring_proto_t xbdi_ring;
217 	enum xbdi_proto xbdi_proto;
218 	grant_handle_t xbdi_ring_handle; /* to unmap the ring */
219 	vaddr_t xbdi_ring_va; /* to unmap the ring */
220 	/* disconnection must be postponed until all I/O is done */
221 	int xbdi_refcnt;
222 	/*
223 	 * State for I/O processing/coalescing follows; this has to
224 	 * live here instead of on the stack because of the
225 	 * continuation-ness (see above).
226 	 */
227 	RING_IDX xbdi_req_prod; /* limit on request indices */
228 	xbdback_cont_t xbdi_cont;
229 	/* if not NULL, will restart here after thread wakes up */
230 	xbdback_cont_t xbdi_cont_restart;
231 	void *xbdi_cont_restart_obj;
232 	/* other state */
233 	uint xbdi_pendingreqs; /* number of I/Os in flight */
234 	struct timeval xbdi_lasterr_time;    /* error time tracking */
235 };
236 /* Manipulation of the above reference count. */
237 #define xbdi_get(xbdip) 					\
238 do {								\
239 	KASSERT(mutex_owned(&xbdip->xbdi_lock));		\
240 	(xbdip)->xbdi_refcnt++;					\
241 } while (0)
242 
243 #define xbdi_put(xbdip)						\
244 do {								\
245 	KASSERT(mutex_owned(&xbdip->xbdi_lock));		\
246 	if (--((xbdip)->xbdi_refcnt) == 0)  			\
247                xbdback_finish_disconnect(xbdip);		\
248 } while (0)
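/*
 * In this file a reference is taken when the instance is created and for
 * each I/O accepted from the ring (including cache flushes); it is dropped
 * when that I/O completes and by the thread when disconnecting, so
 * xbdback_finish_disconnect() only runs once all pending I/O has completed.
 */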
249 
250 static SLIST_HEAD(, xbdback_instance) xbdback_instances;
251 static kmutex_t xbdback_lock;
252 
253 /* Interval between reports of I/O errors from frontend */
254 static const struct timeval xbdback_err_intvl = { 1, 0 };
255 
256        void xbdbackattach(int);
257 static int  xbdback_xenbus_create(struct xenbus_device *);
258 static int  xbdback_xenbus_destroy(void *);
259 static void xbdback_frontend_changed(void *, XenbusState);
260 static void xbdback_backend_changed(struct xenbus_watch *,
261     const char **, unsigned int);
262 static int  xbdback_evthandler(void *);
263 
264 static int  xbdback_connect(struct xbdback_instance *);
265 static void xbdback_disconnect(struct xbdback_instance *);
266 static void xbdback_finish_disconnect(struct xbdback_instance *);
267 
268 static bool xbdif_lookup(domid_t, uint32_t);
269 
270 static void *xbdback_co_main(struct xbdback_instance *, void *);
271 static void *xbdback_co_main_loop(struct xbdback_instance *, void *);
272 static void *xbdback_co_main_incr(struct xbdback_instance *, void *);
273 static void *xbdback_co_main_done2(struct xbdback_instance *, void *);
274 
275 static void *xbdback_co_cache_flush(struct xbdback_instance *, void *);
276 
277 static void *xbdback_co_io(struct xbdback_instance *, void *);
278 static void *xbdback_co_io_gotio(struct xbdback_instance *, void *);
279 
280 static void *xbdback_co_do_io(struct xbdback_instance *, void *);
281 
282 static void xbdback_io_error(struct xbdback_io *, int);
283 static void xbdback_iodone(struct buf *);
284 static void xbdback_iodone_locked(struct xbdback_instance *,
285 		struct xbdback_io *, struct buf *);
286 static void xbdback_send_reply(struct xbdback_instance *, uint64_t, int, int);
287 
288 static int  xbdback_map_shm(struct xbdback_io *);
289 static void xbdback_unmap_shm(struct xbdback_io *);
290 
291 static struct xbdback_io *xbdback_io_get(struct xbdback_instance *);
292 static void xbdback_io_put(struct xbdback_instance *, struct xbdback_io *);
293 static void xbdback_thread(void *);
294 static void xbdback_wakeup_thread(struct xbdback_instance *);
295 static void xbdback_trampoline(struct xbdback_instance *, void *);
296 
297 static struct xenbus_backend_driver xbd_backend_driver = {
298 	.xbakd_create = xbdback_xenbus_create,
299 	.xbakd_type = "vbd"
300 };
301 
302 void
303 xbdbackattach(int n)
304 {
305 	XENPRINTF(("xbdbackattach\n"));
306 
307 	/*
308 	 * initialize the backend driver, register the control message handler
309 	 * and send driver up message.
310 	 */
311 	SLIST_INIT(&xbdback_instances);
312 	mutex_init(&xbdback_lock, MUTEX_DEFAULT, IPL_NONE);
313 
314 	xenbus_backend_register(&xbd_backend_driver);
315 }
316 
317 static int
318 xbdback_xenbus_create(struct xenbus_device *xbusd)
319 {
320 	struct xbdback_instance *xbdi;
321 	long domid, handle;
322 	int error, i;
323 	int segalloc = 0;
324 	char *ep;
325 
326 	if ((error = xenbus_read_ul(NULL, xbusd->xbusd_path,
327 	    "frontend-id", &domid, 10)) != 0) {
328 		aprint_error("xbdback: can't read %s/frontend-id: %d\n",
329 		    xbusd->xbusd_path, error);
330 		return error;
331 	}
332 
333 	/*
334 	 * get handle: this is the last component of the path; which is
335 	 * a decimal number. $path/dev contains the device name, which is not
336 	 * appropriate.
337 	 */
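	/*
	 * e.g. for a backend path like .../backend/vbd/<domid>/<handle>
	 * (say ".../vbd/1/768") the handle parsed below would be 768.
	 */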
338 	for (i = strlen(xbusd->xbusd_path); i > 0; i--) {
339 		if (xbusd->xbusd_path[i] == '/')
340 			break;
341 	}
342 	if (i == 0) {
343 		aprint_error("xbdback: can't parse %s\n",
344 		    xbusd->xbusd_path);
345 		return EFTYPE;
346 	}
347 	handle = strtoul(&xbusd->xbusd_path[i+1], &ep, 10);
348 	if (*ep != '\0') {
349 		aprint_error("xbdback: can't parse %s\n",
350 		    xbusd->xbusd_path);
351 		return EFTYPE;
352 	}
353 
354 	xbdi = kmem_zalloc(sizeof(*xbdi), KM_SLEEP);
355 
356 	xbdi->xbdi_domid = domid;
357 	xbdi->xbdi_handle = handle;
358 	snprintf(xbdi->xbdi_name, sizeof(xbdi->xbdi_name), "xbdb%di%d",
359 	    xbdi->xbdi_domid, xbdi->xbdi_handle);
360 
361 	mutex_enter(&xbdback_lock);
362 	if (xbdif_lookup(domid, handle)) {
363 		mutex_exit(&xbdback_lock);
364 		kmem_free(xbdi, sizeof(*xbdi));
365 		return EEXIST;
366 	}
367 	SLIST_INSERT_HEAD(&xbdback_instances, xbdi, next);
368 	mutex_exit(&xbdback_lock);
369 
370 	/* initialize status and reference counter */
371 	xbdi->xbdi_status = DISCONNECTED;
372 
373 	mutex_init(&xbdi->xbdi_lock, MUTEX_DEFAULT, IPL_BIO);
374 	cv_init(&xbdi->xbdi_cv, xbdi->xbdi_name);
375 
376 	mutex_enter(&xbdi->xbdi_lock);
377 	xbdi_get(xbdi);
378 	mutex_exit(&xbdi->xbdi_lock);
379 
380 	xbusd->xbusd_u.b.b_cookie = xbdi;
381 	xbusd->xbusd_u.b.b_detach = xbdback_xenbus_destroy;
382 	xbusd->xbusd_otherend_changed = xbdback_frontend_changed;
383 	xbdi->xbdi_xbusd = xbusd;
384 
385 	SLIST_INIT(&xbdi->xbdi_va_free);
386 	for (i = 0; i < BLKIF_RING_SIZE; i++) {
387 		xbdi->xbdi_va[i].xv_vaddr = uvm_km_alloc(kernel_map,
388 		    VBD_VA_SIZE, 0, UVM_KMF_VAONLY|UVM_KMF_WAITVA);
389 		SLIST_INSERT_HEAD(&xbdi->xbdi_va_free, &xbdi->xbdi_va[i],
390 		    xv_next);
391 	}
392 
393 	/*
394 	 * allocate page-aligned memory for segments, so that for each
395 	 * xbdback_io its segments are in a single page.
396 	 * sizeof(struct blkif_request_segment) * VBD_MAX_INDIRECT_SEGMENTS
397 	 * is 128, so this helps us avoid a page boundary within a
398 	 * block of VBD_MAX_INDIRECT_SEGMENTS segments.
399 	 */
400 	CTASSERT(sizeof(struct blkif_request_segment) * VBD_MAX_INDIRECT_SEGMENTS == 128);
401 	xbdi->xbdi_segs = (void *)uvm_km_alloc(kernel_map, round_page(
402 	    sizeof(struct blkif_request_segment) * VBD_MAX_INDIRECT_SEGMENTS * BLKIF_RING_SIZE),
403 	    PAGE_SIZE, UVM_KMF_WIRED | UVM_KMF_WAITVA);
404 
405 	SLIST_INIT(&xbdi->xbdi_io_free);
406 	for (i = 0; i < BLKIF_RING_SIZE; i++) {
407 		struct xbdback_io *xbd_io = &xbdi->xbdi_io[i];
408 		xbd_io->xio_seg =
409 		    &xbdi->xbdi_segs[i * VBD_MAX_INDIRECT_SEGMENTS];
410 		error = bus_dmamap_create(xbdi->xbdi_xbusd->xbusd_dmat,
411 		    PAGE_SIZE, 1, PAGE_SIZE, PAGE_SIZE,
412 		    BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW,
413 		    &xbd_io->xio_seg_dmamap);
414 		if (error != 0) {
415 			printf("%s: can't create dma map for indirect segments %d\n",
416 			    xbdi->xbdi_name, i);
417 			goto fail;
418 		}
419 		error = bus_dmamap_load(xbdi->xbdi_xbusd->xbusd_dmat,
420 		    xbd_io->xio_seg_dmamap, xbd_io->xio_seg,
421 		    sizeof(struct blkif_request_segment) * VBD_MAX_INDIRECT_SEGMENTS,
422 		    NULL, BUS_DMA_WAITOK);
423 		if (error != 0) {
424 			printf("%s: can't load dma map for indirect segments %d @%p (%d, %zu)\n",
425 			    xbdi->xbdi_name, i, xbd_io->xio_seg, error, sizeof(xbd_io->xio_seg));
426 			bus_dmamap_destroy(xbdi->xbdi_xbusd->xbusd_dmat,
427 			    xbd_io->xio_seg_dmamap);
428 			goto fail;
429 		}
430 		KASSERT(xbd_io->xio_seg_dmamap->dm_nsegs == 1);
431 		segalloc = i;
432 		SLIST_INSERT_HEAD(&xbdi->xbdi_io_free, xbd_io, xio_next);
433 	}
434 
435 	error = xenbus_watch_path2(xbusd, xbusd->xbusd_path, "physical-device",
436 	    &xbdi->xbdi_watch, xbdback_backend_changed);
437 	if (error) {
438 		printf("failed to watch on %s/physical-device: %d\n",
439 		    xbusd->xbusd_path, error);
440 		goto fail;
441 	}
442 	xbdi->xbdi_watch.xbw_dev = xbusd;
443 	error = xenbus_switch_state(xbusd, NULL, XenbusStateInitWait);
444 	if (error) {
445 		printf("failed to switch state on %s: %d\n",
446 		    xbusd->xbusd_path, error);
447 		goto fail2;
448 	}
449 
450 	xbdi->xbdi_bouncebuf = uvm_km_alloc(kernel_map, MAXPHYS, PAGE_SIZE,
451 	    UVM_KMF_WIRED | UVM_KMF_WAITVA);
452 	return 0;
453 fail2:
454 	unregister_xenbus_watch(&xbdi->xbdi_watch);
455 fail:
456 	for (i = 0; i < segalloc; i++) {
457 		struct xbdback_io *xbd_io = &xbdi->xbdi_io[i];
458 		bus_dmamap_unload(xbdi->xbdi_xbusd->xbusd_dmat,
459 		    xbd_io->xio_seg_dmamap);
460 		bus_dmamap_destroy(xbdi->xbdi_xbusd->xbusd_dmat,
461 		    xbd_io->xio_seg_dmamap);
462 	}
463 	mutex_enter(&xbdback_lock);
464 	SLIST_REMOVE(&xbdback_instances, xbdi, xbdback_instance, next);
465 	mutex_exit(&xbdback_lock);
466 	kmem_free(xbdi, sizeof(*xbdi));
467 	return error;
468 }
469 
470 static int
471 xbdback_xenbus_destroy(void *arg)
472 {
473 	struct xbdback_instance *xbdi = arg;
474 
475 	XENPRINTF(("xbdback_xenbus_destroy state %d\n", xbdi->xbdi_status));
476 
477 	xbdback_disconnect(xbdi);
478 
479 	/* unregister watch */
480 	if (xbdi->xbdi_watch.node)
481 		xenbus_unwatch_path(&xbdi->xbdi_watch);
482 	/* unmap ring */
483 	if (xbdi->xbdi_ring_handle) {
484 		xen_shm_unmap(xbdi->xbdi_ring_va, 1, &xbdi->xbdi_ring_handle);
485 	}
486 
487 	if (xbdi->xbdi_ring_va != 0) {
488 		uvm_km_free(kernel_map, xbdi->xbdi_ring_va,
489 		    PAGE_SIZE, UVM_KMF_VAONLY);
490 	}
491 
492 	/* close device */
493 	if (xbdi->xbdi_size) {
494 		const char *name;
495 		struct dkwedge_info wi;
496 		if (getdiskinfo(xbdi->xbdi_vp, &wi) == 0)
497 			name = wi.dkw_devname;
498 		else
499 			name = "*unknown*";
500 		printf("xbd backend: detach device %s for domain %d\n",
501 		    name, xbdi->xbdi_domid);
502 		vn_close(xbdi->xbdi_vp, FREAD, NOCRED);
503 	}
504 	mutex_enter(&xbdback_lock);
505 	SLIST_REMOVE(&xbdback_instances, xbdi, xbdback_instance, next);
506 	mutex_exit(&xbdback_lock);
507 
508 	for (int i = 0; i < BLKIF_RING_SIZE; i++) {
509 		struct xbdback_io *xbd_io = &xbdi->xbdi_io[i];
510 		bus_dmamap_unload(xbdi->xbdi_xbusd->xbusd_dmat,
511 		    xbd_io->xio_seg_dmamap);
512 		bus_dmamap_destroy(xbdi->xbdi_xbusd->xbusd_dmat,
513 		    xbd_io->xio_seg_dmamap);
514 		if (xbdi->xbdi_va[i].xv_vaddr != 0) {
515 			uvm_km_free(kernel_map, xbdi->xbdi_va[i].xv_vaddr,
516 			    VBD_VA_SIZE, UVM_KMF_VAONLY);
517 			xbdi->xbdi_va[i].xv_vaddr = 0;
518 		}
519 	}
520 
521 
522 	mutex_destroy(&xbdi->xbdi_lock);
523 	cv_destroy(&xbdi->xbdi_cv);
524 	kmem_free(xbdi, sizeof(*xbdi));
525 	return 0;
526 }
527 
528 static int
529 xbdback_connect(struct xbdback_instance *xbdi)
530 {
531 	int err;
532 	evtchn_op_t evop;
533 	grant_ref_t gring_ref;
534 	u_long ring_ref, revtchn;
535 	char xsproto[32];
536 	const char *proto;
537 	struct xenbus_device *xbusd = xbdi->xbdi_xbusd;
538 
539 	XENPRINTF(("xbdback %s: connect\n", xbusd->xbusd_path));
540 	/* read communication information */
541 	err = xenbus_read_ul(NULL, xbusd->xbusd_otherend,
542 	    "ring-ref", &ring_ref, 10);
543 	if (err) {
544 		xenbus_dev_fatal(xbusd, err, "reading %s/ring-ref",
545 		    xbusd->xbusd_otherend);
546 		return -1;
547 	}
548 	XENPRINTF(("xbdback %s: connect ring-ref %lu\n", xbusd->xbusd_path, ring_ref));
549 	err = xenbus_read_ul(NULL, xbusd->xbusd_otherend,
550 	    "event-channel", &revtchn, 10);
551 	if (err) {
552 		xenbus_dev_fatal(xbusd, err, "reading %s/event-channel",
553 		    xbusd->xbusd_otherend);
554 		return -1;
555 	}
556 	XENPRINTF(("xbdback %s: connect revtchn %lu\n", xbusd->xbusd_path, revtchn));
557 	err = xenbus_read(NULL, xbusd->xbusd_otherend, "protocol",
558 	    xsproto, sizeof(xsproto));
559 	if (err) {
560 		xbdi->xbdi_proto = XBDIP_NATIVE;
561 		proto = "unspecified";
562 		XENPRINTF(("xbdback %s: connect no xsproto\n", xbusd->xbusd_path));
563 	} else {
564 		XENPRINTF(("xbdback %s: connect xsproto %s\n", xbusd->xbusd_path, xsproto));
565 		if (strcmp(xsproto, XEN_IO_PROTO_ABI_NATIVE) == 0) {
566 			xbdi->xbdi_proto = XBDIP_NATIVE;
567 			proto = XEN_IO_PROTO_ABI_NATIVE;
568 		} else if (strcmp(xsproto, XEN_IO_PROTO_ABI_X86_32) == 0) {
569 			xbdi->xbdi_proto = XBDIP_32;
570 			proto = XEN_IO_PROTO_ABI_X86_32;
571 		} else if (strcmp(xsproto, XEN_IO_PROTO_ABI_X86_64) == 0) {
572 			xbdi->xbdi_proto = XBDIP_64;
573 			proto = XEN_IO_PROTO_ABI_X86_64;
574 		} else {
575 			aprint_error("xbd domain %d: unknown proto %s\n",
576 			    xbdi->xbdi_domid, xsproto);
577 			return -1;
578 		}
579 	}
580 
581 	/* allocate VA space and map rings */
582 	xbdi->xbdi_ring_va = uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
583 	    UVM_KMF_VAONLY);
584 	if (xbdi->xbdi_ring_va == 0) {
585 		xenbus_dev_fatal(xbusd, ENOMEM,
586 		    "can't get VA for ring", xbusd->xbusd_otherend);
587 		return -1;
588 	}
589 	XENPRINTF(("xbdback %s: connect va 0x%" PRIxVADDR "\n", xbusd->xbusd_path, xbdi->xbdi_ring_va));
590 
591 	gring_ref = ring_ref;
592 	if (xen_shm_map(1, xbdi->xbdi_domid, &gring_ref, xbdi->xbdi_ring_va,
593 	    &xbdi->xbdi_ring_handle, 0) != 0) {
594 		aprint_error("xbdback %s: can't map grant ref\n",
595 		    xbusd->xbusd_path);
596 		xenbus_dev_fatal(xbusd, EINVAL,
597 		    "can't map ring", xbusd->xbusd_otherend);
598 		goto err1;
599 	}
600 	XENPRINTF(("xbdback %s: connect grhandle %d\n", xbusd->xbusd_path, xbdi->xbdi_ring_handle));
601 
602 	switch(xbdi->xbdi_proto) {
603 	case XBDIP_NATIVE:
604 	{
605 		blkif_sring_t *sring = (void *)xbdi->xbdi_ring_va;
606 		BACK_RING_INIT(&xbdi->xbdi_ring.ring_n, sring, PAGE_SIZE);
607 		break;
608 	}
609 	case XBDIP_32:
610 	{
611 		blkif_x86_32_sring_t *sring = (void *)xbdi->xbdi_ring_va;
612 		BACK_RING_INIT(&xbdi->xbdi_ring.ring_32, sring, PAGE_SIZE);
613 		break;
614 	}
615 	case XBDIP_64:
616 	{
617 		blkif_x86_64_sring_t *sring = (void *)xbdi->xbdi_ring_va;
618 		BACK_RING_INIT(&xbdi->xbdi_ring.ring_64, sring, PAGE_SIZE);
619 		break;
620 	}
621 	}
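	/*
	 * Whatever the frontend ABI, the back-ring bookkeeping fields
	 * (req_cons, rsp_prod_pvt, ...) have the same layout in all members
	 * of the xbdi_ring union, so the rest of the driver reads and
	 * updates them through ring_n regardless of xbdi_proto.
	 */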
622 
623 	evop.cmd = EVTCHNOP_bind_interdomain;
624 	evop.u.bind_interdomain.remote_dom = xbdi->xbdi_domid;
625 	evop.u.bind_interdomain.remote_port = revtchn;
626 	err = HYPERVISOR_event_channel_op(&evop);
627 	if (err) {
628 		aprint_error("blkback %s: "
629 		    "can't get event channel: %d\n",
630 		    xbusd->xbusd_otherend, err);
631 		xenbus_dev_fatal(xbusd, err,
632 		    "can't bind event channel", xbusd->xbusd_otherend);
633 		goto err2;
634 	}
635 	xbdi->xbdi_evtchn = evop.u.bind_interdomain.local_port;
636 	XENPRINTF(("xbdback %s: connect evchannel %d\n", xbusd->xbusd_path, xbdi->xbdi_evtchn));
637 
638 	xbdi->xbdi_ih = xen_intr_establish_xname(-1, &xen_pic,
639 	    xbdi->xbdi_evtchn, IST_LEVEL, IPL_BIO, xbdback_evthandler, xbdi,
640 	    true, xbdi->xbdi_name);
641 	KASSERT(xbdi->xbdi_ih != NULL);
642 	aprint_verbose("xbd backend domain %d handle %#x (%d) "
643 	    "using event channel %d, protocol %s\n", xbdi->xbdi_domid,
644 	    xbdi->xbdi_handle, xbdi->xbdi_handle, xbdi->xbdi_evtchn, proto);
645 
646 	/* enable the xbdback event handler machinery */
647 	xbdi->xbdi_status = WAITING;
648 	hypervisor_unmask_event(xbdi->xbdi_evtchn);
649 	hypervisor_notify_via_evtchn(xbdi->xbdi_evtchn);
650 
651 	if (kthread_create(PRI_NONE, KTHREAD_MPSAFE, NULL,
652 	    xbdback_thread, xbdi, NULL, "%s", xbdi->xbdi_name) == 0)
653 		return 0;
654 
655 err2:
656 	/* unmap ring */
657 	xen_shm_unmap(xbdi->xbdi_ring_va, 1, &xbdi->xbdi_ring_handle);
658 err1:
659 	/* free ring VA space */
660 	uvm_km_free(kernel_map, xbdi->xbdi_ring_va, PAGE_SIZE, UVM_KMF_VAONLY);
661 	return -1;
662 }
663 
664 /*
665  * Signal a xbdback thread to disconnect. Done in 'xenwatch' thread context.
666  */
667 static void
668 xbdback_disconnect(struct xbdback_instance *xbdi)
669 {
670 
671 	mutex_enter(&xbdi->xbdi_lock);
672 	if (xbdi->xbdi_status == DISCONNECTED) {
673 		mutex_exit(&xbdi->xbdi_lock);
674 		return;
675 	}
676 	hypervisor_mask_event(xbdi->xbdi_evtchn);
677 
678 	/* signal thread that we want to disconnect, then wait for it */
679 	xbdi->xbdi_status = DISCONNECTING;
680 	cv_signal(&xbdi->xbdi_cv);
681 
682 	while (xbdi->xbdi_status != DISCONNECTED)
683 		cv_wait(&xbdi->xbdi_cv, &xbdi->xbdi_lock);
684 
685 	mutex_exit(&xbdi->xbdi_lock);
686 	xen_intr_disestablish(xbdi->xbdi_ih);
687 
688 	xenbus_switch_state(xbdi->xbdi_xbusd, NULL, XenbusStateClosing);
689 }
690 
691 static void
692 xbdback_frontend_changed(void *arg, XenbusState new_state)
693 {
694 	struct xbdback_instance *xbdi = arg;
695 	struct xenbus_device *xbusd = xbdi->xbdi_xbusd;
696 
697 	XENPRINTF(("xbdback %s: new state %d\n", xbusd->xbusd_path, new_state));
698 	switch(new_state) {
699 	case XenbusStateInitialising:
700 		break;
701 	case XenbusStateInitialised:
702 	case XenbusStateConnected:
703 		if (xbdi->xbdi_status == WAITING || xbdi->xbdi_status == RUN)
704 			break;
705 		xbdback_connect(xbdi);
706 		break;
707 	case XenbusStateClosing:
708 		xbdback_disconnect(xbdi);
709 		break;
710 	case XenbusStateClosed:
711 		/* otherend_changed() should handle it for us */
712 		panic("xbdback_frontend_changed: closed\n");
713 	case XenbusStateUnknown:
714 	case XenbusStateInitWait:
715 	default:
716 		aprint_error("xbdback %s: invalid frontend state %d\n",
717 		    xbusd->xbusd_path, new_state);
718 	}
719 	return;
720 }
721 
722 static void
723 xbdback_backend_changed(struct xenbus_watch *watch,
724     const char **vec, unsigned int len)
725 {
726 	struct xenbus_device *xbusd = watch->xbw_dev;
727 	struct xbdback_instance *xbdi = xbusd->xbusd_u.b.b_cookie;
728 	int err;
729 	long dev;
730 	char mode[32];
731 	struct xenbus_transaction *xbt;
732 	const char *devname;
733 	int major;
734 
735 	err = xenbus_read_ul(NULL, xbusd->xbusd_path, "physical-device",
736 	    &dev, 10);
737 	/*
738 	 * An error can occur as the watch can fire up just after being
739 	 * registered, so we have to ignore the error.
740 	 */
741 	if (err)
742 		return;
743 	/*
744 	 * we can also fire up after having opened the device, don't try
745 	 * to do it twice.
746 	 */
747 	if (xbdi->xbdi_vp != NULL) {
748 		if (xbdi->xbdi_status == WAITING || xbdi->xbdi_status == RUN) {
749 			if (xbdi->xbdi_dev != dev) {
750 				printf("xbdback %s: changing physical device "
751 				    "from %#"PRIx64" to %#lx not supported\n",
752 				    xbusd->xbusd_path, xbdi->xbdi_dev, dev);
753 			}
754 		}
755 		return;
756 	}
757 	xbdi->xbdi_dev = dev;
758 	err = xenbus_read(NULL, xbusd->xbusd_path, "mode", mode, sizeof(mode));
759 	if (err) {
760 		printf("xbdback: failed to read %s/mode: %d\n",
761 		    xbusd->xbusd_path, err);
762 		return;
763 	}
764 	if (mode[0] == 'w')
765 		xbdi->xbdi_ro = false;
766 	else
767 		xbdi->xbdi_ro = true;
768 	major = major(xbdi->xbdi_dev);
769 	devname = devsw_blk2name(major);
770 	if (devname == NULL) {
771 		printf("xbdback %s: unknown device 0x%"PRIx64"\n",
772 		    xbusd->xbusd_path, xbdi->xbdi_dev);
773 		return;
774 	}
775 	xbdi->xbdi_bdevsw = bdevsw_lookup(xbdi->xbdi_dev);
776 	if (xbdi->xbdi_bdevsw == NULL) {
777 		printf("xbdback %s: no bdevsw for device 0x%"PRIx64"\n",
778 		    xbusd->xbusd_path, xbdi->xbdi_dev);
779 		return;
780 	}
781 	err = bdevvp(xbdi->xbdi_dev, &xbdi->xbdi_vp);
782 	if (err) {
783 		printf("xbdback %s: can't open device 0x%"PRIx64": %d\n",
784 		    xbusd->xbusd_path, xbdi->xbdi_dev, err);
785 		return;
786 	}
787 	err = vn_lock(xbdi->xbdi_vp, LK_EXCLUSIVE | LK_RETRY);
788 	if (err) {
789 		printf("xbdback %s: can't vn_lock device 0x%"PRIx64": %d\n",
790 		    xbusd->xbusd_path, xbdi->xbdi_dev, err);
791 		vrele(xbdi->xbdi_vp);
792 		return;
793 	}
794 	err  = VOP_OPEN(xbdi->xbdi_vp, FREAD, NOCRED);
795 	if (err) {
796 		printf("xbdback %s: can't VOP_OPEN device 0x%"PRIx64": %d\n",
797 		    xbusd->xbusd_path, xbdi->xbdi_dev, err);
798 		vput(xbdi->xbdi_vp);
799 		return;
800 	}
801 	VOP_UNLOCK(xbdi->xbdi_vp);
802 
803 	/* dk device; get wedge data */
804 	struct dkwedge_info wi;
805 	if ((err = getdiskinfo(xbdi->xbdi_vp, &wi)) == 0) {
806 		xbdi->xbdi_size = wi.dkw_size;
807 		printf("xbd backend: attach device %s (size %" PRIu64 ") "
808 		    "for domain %d\n", wi.dkw_devname, xbdi->xbdi_size,
809 		    xbdi->xbdi_domid);
810 	} else {
811 		/* getdiskinfo() failed; set device size to 0 and return */
812 		printf("xbdback %s: can't DIOCGWEDGEINFO device "
813 		    "0x%"PRIx64": %d\n", xbusd->xbusd_path,
814 		    xbdi->xbdi_dev, err);
815 		xbdi->xbdi_size = xbdi->xbdi_dev = 0;
816 		vn_close(xbdi->xbdi_vp, FREAD, NOCRED);
817 		xbdi->xbdi_vp = NULL;
818 		return;
819 	}
820 again:
821 	xbt = xenbus_transaction_start();
822 	if (xbt == NULL) {
823 		printf("xbdback %s: can't start transaction\n",
824 		    xbusd->xbusd_path);
825 		    return;
826 	}
827 	err = xenbus_printf(xbt, xbusd->xbusd_path, "sectors", "%" PRIu64 ,
828 	    xbdi->xbdi_size);
829 	if (err) {
830 		printf("xbdback: failed to write %s/sectors: %d\n",
831 		    xbusd->xbusd_path, err);
832 		goto abort;
833 	}
834 	err = xenbus_printf(xbt, xbusd->xbusd_path, "info", "%u",
835 	    xbdi->xbdi_ro ? VDISK_READONLY : 0);
836 	if (err) {
837 		printf("xbdback: failed to write %s/info: %d\n",
838 		    xbusd->xbusd_path, err);
839 		goto abort;
840 	}
841 	err = xenbus_printf(xbt, xbusd->xbusd_path, "sector-size", "%lu",
842 	    (u_long)DEV_BSIZE);
843 	if (err) {
844 		printf("xbdback: failed to write %s/sector-size: %d\n",
845 		    xbusd->xbusd_path, err);
846 		goto abort;
847 	}
848 	err = xenbus_printf(xbt, xbusd->xbusd_path, "feature-flush-cache",
849 	    "%u", 1);
850 	if (err) {
851 		printf("xbdback: failed to write %s/feature-flush-cache: %d\n",
852 		    xbusd->xbusd_path, err);
853 		goto abort;
854 	}
855 	err = xenbus_printf(xbt, xbusd->xbusd_path,
856 	    "feature-max-indirect-segments", "%u", VBD_MAX_INDIRECT_SEGMENTS);
857 	if (err) {
858 		printf("xbdback: failed to write %s/feature-indirect: %d\n",
859 		    xbusd->xbusd_path, err);
860 		goto abort;
861 	}
862 	err = xenbus_transaction_end(xbt, 0);
863 	if (err == EAGAIN)
864 		goto again;
865 	if (err) {
866 		printf("xbdback %s: can't end transaction: %d\n",
867 		    xbusd->xbusd_path, err);
868 	}
869 	err = xenbus_switch_state(xbusd, NULL, XenbusStateConnected);
870 	if (err) {
871 		printf("xbdback %s: can't switch state: %d\n",
872 		    xbusd->xbusd_path, err);
873 	}
874 	return;
875 abort:
876 	xenbus_transaction_end(xbt, 1);
877 }
878 
879 /*
880  * Used by a xbdi thread to signal that it is now disconnected.
881  */
882 static void
883 xbdback_finish_disconnect(struct xbdback_instance *xbdi)
884 {
885 	KASSERT(mutex_owned(&xbdi->xbdi_lock));
886 	KASSERT(xbdi->xbdi_status == DISCONNECTING);
887 
888 	xbdi->xbdi_status = DISCONNECTED;
889 
890 	cv_broadcast(&xbdi->xbdi_cv);
891 }
892 
893 static bool
894 xbdif_lookup(domid_t dom, uint32_t handle)
895 {
896 	struct xbdback_instance *xbdi;
897 	bool found = false;
898 
899 	KASSERT(mutex_owned(&xbdback_lock));
900 
901 	SLIST_FOREACH(xbdi, &xbdback_instances, next) {
902 		if (xbdi->xbdi_domid == dom && xbdi->xbdi_handle == handle) {
903 			found = true;
904 			break;
905 		}
906 	}
907 
908 	return found;
909 }
910 
911 static int
912 xbdback_evthandler(void *arg)
913 {
914 	struct xbdback_instance *xbdi = arg;
915 
916 	XENPRINTF(("xbdback_evthandler domain %d: cont %p\n",
917 	    xbdi->xbdi_domid, xbdi->xbdi_cont));
918 
919 	mutex_enter(&xbdi->xbdi_lock);
920 	xbdback_wakeup_thread(xbdi);
921 	mutex_exit(&xbdi->xbdi_lock);
922 
923 	return 1;
924 }
925 
926 /*
927  * Main thread routine for one xbdback instance. Woken up by
928  * xbdback_evthandler() when a domain has I/O work scheduled in an I/O ring.
929  */
930 static void
931 xbdback_thread(void *arg)
932 {
933 	struct xbdback_instance *xbdi = arg;
934 	void *obj;
935 
936 	mutex_enter(&xbdi->xbdi_lock);
937 	for (;;) {
938 		switch (xbdi->xbdi_status) {
939 		case WAITING:
940 			cv_wait(&xbdi->xbdi_cv, &xbdi->xbdi_lock);
941 			break;
942 		case RUN:
943 			xbdi->xbdi_status = WAITING; /* reset state */
944 			obj = xbdi;
945 			if (xbdi->xbdi_cont_restart != NULL) {
946 				KASSERT(xbdi->xbdi_cont == NULL);
947 				xbdi->xbdi_cont = xbdi->xbdi_cont_restart;
948 				obj = xbdi->xbdi_cont_restart_obj;
949 				xbdi->xbdi_cont_restart = NULL;
950 				xbdi->xbdi_cont_restart_obj = NULL;
951 			}
952 			if (xbdi->xbdi_cont == NULL) {
953 				xbdi->xbdi_cont = xbdback_co_main;
954 			}
955 
956 			xbdback_trampoline(xbdi, obj);
957 			break;
958 		case DISCONNECTING:
959 			if (xbdi->xbdi_pendingreqs > 0) {
960 				/* there are pending I/Os. Wait for them. */
961 				cv_wait(&xbdi->xbdi_cv, &xbdi->xbdi_lock);
962 				continue;
963 			}
964 
965 			/* All I/Os should have been processed by now,
966 			 * xbdi_refcnt should drop to 0 */
967 			xbdi_put(xbdi);
968 			KASSERT(xbdi->xbdi_refcnt == 0);
969 			goto out;
970 			/* NOTREACHED */
971 		default:
972 			panic("%s: invalid state %d",
973 			    xbdi->xbdi_name, xbdi->xbdi_status);
974 		}
975 	}
976 out:
977 	mutex_exit(&xbdi->xbdi_lock);
978 
979 	kthread_exit(0);
980 }
981 
982 static void *
983 xbdback_co_main(struct xbdback_instance *xbdi, void *obj)
984 {
985 	(void)obj;
986 
987 	xbdi->xbdi_req_prod = xbdi->xbdi_ring.ring_n.sring->req_prod;
988 	xen_rmb(); /* ensure we see all requests up to req_prod */
989 	/*
990 	 * note that we'll eventually get a full ring of requests.
991 	 * in this case, MASK_BLKIF_IDX(req_cons) == MASK_BLKIF_IDX(req_prod)
992 	 */
993 	xbdi->xbdi_cont = xbdback_co_main_loop;
994 	return xbdi;
995 }
996 
997 /*
998  * Fetch a blkif request from the ring, and pass control to the appropriate
999  * continuation.
1000  * If someone asked for disconnection, do not fetch any more request from
1001  * the ring.
1002  */
1003 static void *
1004 xbdback_co_main_loop(struct xbdback_instance *xbdi, void *obj __unused)
1005 {
1006 	blkif_request_t *req, *reqn;
1007 	blkif_x86_32_request_t *req32;
1008 	blkif_x86_64_request_t *req64;
1009 	blkif_request_indirect_t *rinn;
1010 	blkif_x86_32_request_indirect_t *rin32;
1011 	blkif_x86_64_request_indirect_t *rin64;
1012 
1013 	if (xbdi->xbdi_ring.ring_n.req_cons != xbdi->xbdi_req_prod) {
1014 		struct xbdback_io *xbd_io = xbdback_io_get(xbdi);
1015 		uint8_t real_op = 0xff;
1016 
1017 		if (xbd_io == NULL) {
1018 			/* retry after iodone */
1019 			xbdi->xbdi_cont = NULL;
1020 			return NULL;
1021 		}
1022 		memset(&xbd_io->u, 0, sizeof(xbd_io->u));
1023 
1024 		buf_init(&xbd_io->xio_buf);
1025 		xbd_io->xio_xbdi = xbdi;
1026 
1027 		req = &xbd_io->xio_xen_req;
1028 		memset(req, 0, sizeof(*req));
1029 
1030 		switch(xbdi->xbdi_proto) {
1031 		case XBDIP_NATIVE:
1032 			reqn = RING_GET_REQUEST(&xbdi->xbdi_ring.ring_n,
1033 			    xbdi->xbdi_ring.ring_n.req_cons);
1034 			real_op = req->operation = reqn->operation;
1035 			if (real_op == BLKIF_OP_INDIRECT) {
1036 				rinn = (blkif_request_indirect_t *)reqn;
1037 				real_op = rinn->indirect_op;
1038 			}
1039 			req->id = reqn->id;
1040 			break;
1041 		case XBDIP_32:
1042 			req32 = RING_GET_REQUEST(&xbdi->xbdi_ring.ring_32,
1043 			    xbdi->xbdi_ring.ring_n.req_cons);
1044 			real_op = req->operation = req32->operation;
1045 			if (real_op == BLKIF_OP_INDIRECT) {
1046 				rin32 = (blkif_x86_32_request_indirect_t*)req32;
1047 				real_op = rin32->indirect_op;
1048 			}
1049 			req->id = req32->id;
1050 			break;
1051 		case XBDIP_64:
1052 			req64 = RING_GET_REQUEST(&xbdi->xbdi_ring.ring_64,
1053 			    xbdi->xbdi_ring.ring_n.req_cons);
1054 			real_op = req->operation = req64->operation;
1055 			if (real_op == BLKIF_OP_INDIRECT) {
1056 				rin64 = (blkif_x86_64_request_indirect_t*)req64;
1057 				real_op = rin64->indirect_op;
1058 			}
1059 			req->id = req64->id;
1060 			break;
1061 		}
1062 		__insn_barrier();
1063 		XENPRINTF(("xbdback op %d req_cons 0x%x req_prod 0x%x "
1064 		    "resp_prod 0x%x id %" PRIu64 "\n", req->operation,
1065 			xbdi->xbdi_ring.ring_n.req_cons,
1066 			xbdi->xbdi_req_prod,
1067 			xbdi->xbdi_ring.ring_n.rsp_prod_pvt,
1068 			req->id));
1069 		switch (req->operation) {
1070 		case BLKIF_OP_INDIRECT:
1071 			/* just check indirect_op, rest is handled later */
1072 			if (real_op != BLKIF_OP_READ &&
1073 			    real_op != BLKIF_OP_WRITE) {
1074 				if (ratecheck(&xbdi->xbdi_lasterr_time,
1075 				    &xbdback_err_intvl)) {
1076 					printf("%s: unknown ind operation %d\n",
1077 					    xbdi->xbdi_name,
1078 					    real_op);
1079 				}
1080 				goto fail;
1081 			}
1082 			/* FALLTHROUGH */
1083 		case BLKIF_OP_READ:
1084 		case BLKIF_OP_WRITE:
1085 			xbdi->xbdi_cont = xbdback_co_io;
1086 			return xbd_io;
1087 		case BLKIF_OP_FLUSH_DISKCACHE:
1088 			xbdi->xbdi_cont = xbdback_co_cache_flush;
1089 			return xbd_io;
1090 		default:
1091 			if (ratecheck(&xbdi->xbdi_lasterr_time,
1092 			    &xbdback_err_intvl)) {
1093 				printf("%s: unknown operation %d\n",
1094 				    xbdi->xbdi_name, req->operation);
1095 			}
1096 fail:
1097 			xbdback_send_reply(xbdi, req->id, real_op,
1098 			    BLKIF_RSP_ERROR);
1099 			xbdi->xbdi_cont = xbdback_co_main_incr;
1100 			return xbdi;
1101 		}
1102 	} else {
1103 		xbdi->xbdi_cont = xbdback_co_main_done2;
1104 		return xbdi;
1105 	}
1106 }
1107 
1108 /*
1109  * Increment consumer index and move on to the next request. In case
1110  * we want to disconnect, leave continuation now.
1111  */
1112 static void *
1113 xbdback_co_main_incr(struct xbdback_instance *xbdi, void *obj __unused)
1114 {
1115 	KASSERT(mutex_owned(&xbdi->xbdi_lock));
1116 
1117 	blkif_back_ring_t *ring = &xbdi->xbdi_ring.ring_n;
1118 
1119 	ring->req_cons++;
1120 
1121 	if (xbdi->xbdi_status == DISCONNECTING)
1122 		xbdi->xbdi_cont = NULL;
1123 	else
1124 		xbdi->xbdi_cont = xbdback_co_main_loop;
1125 
1126 	return xbdi;
1127 }
1128 
1129 /*
1130  * Check for requests in the instance's ring. In case there are, start again
1131  * from the beginning. If not, stall.
1132  */
1133 static void *
1134 xbdback_co_main_done2(struct xbdback_instance *xbdi, void *obj)
1135 {
1136 	int work_to_do;
1137 
1138 	xen_wmb();
1139 	RING_FINAL_CHECK_FOR_REQUESTS(&xbdi->xbdi_ring.ring_n, work_to_do);
1140 	if (work_to_do)
1141 		xbdi->xbdi_cont = xbdback_co_main;
1142 	else
1143 		xbdi->xbdi_cont = NULL;
1144 
1145 	return xbdi;
1146 }
1147 
1148 /*
1149  * Frontend requested a cache flush operation.
1150  */
1151 static void *
1152 xbdback_co_cache_flush(struct xbdback_instance *xbdi, void *obj)
1153 {
1154 	struct xbdback_io *xbd_io = obj;
1155 	KASSERT(xbd_io->xio_xen_req.operation == BLKIF_OP_FLUSH_DISKCACHE);
1156 	if (xbdi->xbdi_pendingreqs > 0) {
1157 		/*
1158 		 * There are pending requests.
1159 		 * Event or iodone() will restart processing
1160 		 */
1161 		xbdi->xbdi_cont_restart = xbdback_co_cache_flush;
1162 		xbdi->xbdi_cont_restart_obj = xbd_io;
1163 		xbdi->xbdi_cont = NULL;
1164 		return NULL;
1165 	}
1166 	xbdi_get(xbdi);
1167 	xbdi->xbdi_cont = xbdback_co_do_io;
1168 	return xbd_io;
1169 }
1170 
1171 /*
1172  * A read or write I/O request must be processed. Do some checks first,
1173  * then get the segment information directly from the ring request.
1174  */
1175 static void *
1176 xbdback_co_io(struct xbdback_instance *xbdi, void *obj)
1177 {
1178 	int i, error;
1179 	blkif_request_t *req, *reqn;
1180 	blkif_x86_32_request_t *req32;
1181 	blkif_x86_64_request_t *req64;
1182 	blkif_request_indirect_t *rinn;
1183 	blkif_x86_32_request_indirect_t *rin32;
1184 	blkif_x86_64_request_indirect_t *rin64;
1185 	const char *errstr;
1186 	struct xbdback_io *xbd_io = obj;
1187 	grant_ref_t in_gntref = 0;
1188 
1189 	req = &xbd_io->xio_xen_req;
1190 
1191 	/* some sanity checks */
1192 	KASSERT(req->operation == BLKIF_OP_READ ||
1193 	    req->operation == BLKIF_OP_WRITE ||
1194 	    req->operation == BLKIF_OP_INDIRECT);
1195 
1196 	/* copy request segments */
1197 	switch (xbdi->xbdi_proto) {
1198 	case XBDIP_NATIVE:
1199 		reqn = RING_GET_REQUEST(&xbdi->xbdi_ring.ring_n,
1200 		    xbdi->xbdi_ring.ring_n.req_cons);
1201 		req->handle = reqn->handle;
1202 		req->sector_number = reqn->sector_number;
1203 		if (reqn->operation == BLKIF_OP_INDIRECT) {
1204 			rinn = (blkif_request_indirect_t *)reqn;
1205 			req->operation = rinn->indirect_op;
1206 			req->nr_segments = (uint8_t)rinn->nr_segments;
1207 			if (req->nr_segments > VBD_MAX_INDIRECT_SEGMENTS) {
1208 				errstr = "too many indirect segments";
1209 				goto bad_segments;
1210 			}
1211 			in_gntref = rinn->indirect_grefs[0];
1212 			/* first_sect and segment grefs fetched later */
1213 		} else {
1214 			req->nr_segments = reqn->nr_segments;
1215 			if (req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST) {
1216 				errstr = "too many segments";
1217 				goto bad_segments;
1218 			}
1219 			for (i = 0; i < req->nr_segments; i++)
1220 				xbd_io->xio_seg[i] = reqn->seg[i];
1221 		}
1222 		break;
1223 	case XBDIP_32:
1224 		req32 = RING_GET_REQUEST(&xbdi->xbdi_ring.ring_32,
1225 		    xbdi->xbdi_ring.ring_n.req_cons);
1226 		req->handle = req32->handle;
1227 		req->sector_number = req32->sector_number;
1228 		if (req32->operation == BLKIF_OP_INDIRECT) {
1229 			rin32 = (blkif_x86_32_request_indirect_t *)req32;
1230 			req->operation = rin32->indirect_op;
1231 			req->nr_segments = (uint8_t)rin32->nr_segments;
1232 			if (req->nr_segments > VBD_MAX_INDIRECT_SEGMENTS) {
1233 				errstr = "too many indirect segments";
1234 				goto bad_segments;
1235 			}
1236 			in_gntref = rin32->indirect_grefs[0];
1237 			/* first_sect and segment grefs fetched later */
1238 		} else {
1239 			req->nr_segments = req32->nr_segments;
1240 			if (req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST) {
1241 				errstr = "too many segments";
1242 				goto bad_segments;
1243 			}
1244 			for (i = 0; i < req->nr_segments; i++)
1245 				xbd_io->xio_seg[i] = req32->seg[i];
1246 		}
1247 		break;
1248 	case XBDIP_64:
1249 		req64 = RING_GET_REQUEST(&xbdi->xbdi_ring.ring_64,
1250 		    xbdi->xbdi_ring.ring_n.req_cons);
1251 		req->handle = req64->handle;
1252 		req->sector_number = req64->sector_number;
1253 		if (req64->operation == BLKIF_OP_INDIRECT) {
1254 			rin64 = (blkif_x86_64_request_indirect_t *)req64;
			req->operation = rin64->indirect_op;
1255 			req->nr_segments = (uint8_t)rin64->nr_segments;
1256 			if (req->nr_segments > VBD_MAX_INDIRECT_SEGMENTS) {
1257 				errstr = "too many indirect segments";
1258 				goto bad_segments;
1259 			}
1260 			in_gntref = rin64->indirect_grefs[0];
1261 			/* first_sect and segment grefs fetched later */
1262 		} else {
1263 			req->nr_segments = req64->nr_segments;
1264 			if (req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST) {
1265 				errstr = "too many segments";
1266 				goto bad_segments;
1267 			}
1268 			for (i = 0; i < req->nr_segments; i++)
1269 				xbd_io->xio_seg[i] = req64->seg[i];
1270 		}
1271 		break;
1272 	}
1273 
1274 	if (req->operation == BLKIF_OP_WRITE) {
1275 		if (xbdi->xbdi_ro) {
1276 			error = EROFS;
1277 			goto end;
1278 		}
1279 	}
1280 
1281 	/* Maximum value was checked earlier */
1282 	if (req->nr_segments < 1) {
1283 		errstr = "invalid number of segments";
1284 		goto bad_segments;
1285 	}
1286 
1287 	/* If segments are on an indirect page, copy them now */
1288 	if (in_gntref) {
1289 		gnttab_copy_t gop;
1290 		paddr_t ma;
1291 
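		/*
		 * Destination of the copy is the wired page backing
		 * xio_seg; its bus address was obtained once at attach
		 * time when xio_seg_dmamap was loaded, so no per-request
		 * mapping is needed here.
		 */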
1292 		gop.flags = GNTCOPY_source_gref;
1293 		gop.len = req->nr_segments
1294 		    * sizeof(struct blkif_request_segment);
1295 
1296 		gop.source.u.ref = in_gntref;
1297 		gop.source.offset = 0;
1298 		gop.source.domid = xbdi->xbdi_domid;
1299 
1300 		ma = xbd_io->xio_seg_dmamap->dm_segs[0].ds_addr;
1301 		gop.dest.offset = ma & PAGE_MASK;
1302 		gop.dest.domid = DOMID_SELF;
1303 		gop.dest.u.gmfn = ma >> PAGE_SHIFT;
1304 
1305 		if (HYPERVISOR_grant_table_op(GNTTABOP_copy, &gop, 1) != 0) {
1306 			errstr = "GNTTABOP_copy failed";
1307 			goto bad_segments;
1308 		}
1309 	}
1310 
1311 	xbdi_get(xbdi);
1312 	xbdi->xbdi_cont = xbdback_co_io_gotio;
1313 	return xbd_io;
1314 
1315  bad_segments:
1316 	if (ratecheck(&xbdi->xbdi_lasterr_time, &xbdback_err_intvl)) {
1317 		printf("%s: %s\n", xbdi->xbdi_name, errstr);
1318 	}
1319 	error = EINVAL;
1320 	/* FALLTHROUGH */
1321 
1322  end:
1323 	xbdback_send_reply(xbdi, req->id, req->operation,
1324 	    (error == EROFS) ? BLKIF_RSP_EOPNOTSUPP : BLKIF_RSP_ERROR);
1325 	xbdi->xbdi_cont = xbdback_co_main_incr;
1326 	return xbdi;
1327 }
1328 
1329 /* Prepare an I/O buffer for a xbdback instance */
1330 static void *
1331 xbdback_co_io_gotio(struct xbdback_instance *xbdi, void *obj)
1332 {
1333 	struct xbdback_io *xbd_io = obj;
1334 	int buf_flags;
1335 	size_t bcount;
1336 	blkif_request_t *req = &xbd_io->xio_xen_req;
1337 	uint8_t last_sect;
1338 	int error;
1339 
1340 	KASSERT(mutex_owned(&xbdi->xbdi_lock));
1341 	KASSERT(xbdi->xbdi_refcnt > 0);
1342 
1343 	/* Process segments */
1344 	bcount = 0;
1345 	for (int i = 0; i < req->nr_segments; i++) {
1346 		struct blkif_request_segment *seg = &xbd_io->xio_seg[i];
1347 		if (seg->last_sect > VBD_MAXSECT ||
1348 		    seg->first_sect > VBD_MAXSECT) {
1349 			if (ratecheck(&xbdi->xbdi_lasterr_time,
1350 			    &xbdback_err_intvl)) {
1351 				printf("%s: invalid segment sectors %d %d\n",
1352 				    xbdi->xbdi_name,
1353 				    seg->first_sect, seg->last_sect);
1354 			}
1355 			xbdi->xbdi_pendingreqs++; /* xbdback_io_error() will decrement it */
1356 			xbdback_io_error(xbd_io, EINVAL);
1357 			/* do not retry */
1358 			xbdi->xbdi_cont = xbdback_co_main_incr;
1359 			return xbdi;
1360 		}
1361 
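		/*
		 * Grants are mapped one page per segment at xio_vaddr.
		 * If a segment other than the last stops short of the end
		 * of its page, or one other than the first does not start
		 * at sector 0, the data is not contiguous in that mapping
		 * and has to go through the bounce buffer.
		 */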
1362 		if (i > 0) {
1363 			if (last_sect != VBD_MAXSECT ||
1364 			    seg->first_sect != 0) {
1365 				xbd_io->xio_need_bounce = 1;
1366 			}
1367 		}
1368 		last_sect = seg->last_sect;
1369 		xbd_io->xio_gref[i] = seg->gref;
1370 		bcount += (seg->last_sect - seg->first_sect + 1)
1371 			* VBD_BSIZE;
1372 	}
1373 	xbd_io->xio_start_offset = xbd_io->xio_seg[0].first_sect * VBD_BSIZE;
1374 
1375 	KASSERT(bcount <= MAXPHYS);
1376 	KASSERT(xbd_io->xio_start_offset < PAGE_SIZE);
1377 	KASSERT(bcount + xbd_io->xio_start_offset <= VBD_VA_SIZE);
1378 
1379 	/* Fill-in the buf */
1380 	if (req->operation == BLKIF_OP_WRITE) {
1381 		buf_flags = B_WRITE;
1382 	} else {
1383 		buf_flags = B_READ;
1384 	}
1385 
1386 	xbd_io->xio_buf.b_flags = buf_flags;
1387 	xbd_io->xio_buf.b_cflags = 0;
1388 	xbd_io->xio_buf.b_oflags = 0;
1389 	xbd_io->xio_buf.b_iodone = xbdback_iodone;
1390 	xbd_io->xio_buf.b_proc = NULL;
1391 	xbd_io->xio_buf.b_vp = xbdi->xbdi_vp;
1392 	xbd_io->xio_buf.b_objlock = xbdi->xbdi_vp->v_interlock;
1393 	xbd_io->xio_buf.b_dev = xbdi->xbdi_dev;
1394 	xbd_io->xio_buf.b_blkno = req->sector_number;
1395 	xbd_io->xio_buf.b_bcount = bcount;
1396 	if (__predict_false(xbd_io->xio_need_bounce)) {
1397 		if (__predict_false(xbdi->xbdi_bouncebuf_use)) {
1398 			KASSERT(xbdi->xbdi_pendingreqs > 1);
1399 			/* retry later */
1400 			xbdi->xbdi_cont_restart = xbdback_co_io_gotio;
1401 			xbdi->xbdi_cont_restart_obj = xbd_io;
1402 			xbdi->xbdi_cont = NULL;
1403 			return NULL;
1404 		}
1405 		xbdi->xbdi_bouncebuf_use++;
1406 		KASSERT(xbdi->xbdi_bouncebuf_use == 1);
1407 		xbd_io->xio_buf.b_data = (void *)xbdi->xbdi_bouncebuf;
1408 	}
1409 	xbdi->xbdi_pendingreqs++;
1410 	if ((error = xbdback_map_shm(xbd_io)) != 0) {
1411 		xbdback_io_error(xbd_io, error);
1412 		/* do not retry */
1413 		xbdi->xbdi_cont = xbdback_co_main_incr;
1414 		return xbdi;
1415 	}
1416 	if (__predict_true(xbd_io->xio_need_bounce == 0)) {
1417 		xbd_io->xio_buf.b_data = (void *)
1418 		    (xbd_io->xio_vaddr + xbd_io->xio_start_offset);
1419 	}
1420 
1421 
1422 	xbd_io->xio_buf.b_private = xbd_io;
1423 
1424 	xbdi->xbdi_cont = xbdback_co_do_io;
1425 	return xbd_io;
1426 }
1427 
1428 static void
1429 xbdback_io_error(struct xbdback_io *xbd_io, int error)
1430 {
1431 	KASSERT(mutex_owned(&xbd_io->xio_xbdi->xbdi_lock));
1432 
1433 	struct buf *bp = &xbd_io->xio_buf;
1434 
1435 	bp->b_error = error;
1436 	xbdback_iodone_locked(xbd_io->xio_xbdi, xbd_io, bp);
1437 }
1438 
1439 /*
1440  * Main xbdback I/O routine. It can either perform a flush operation or
1441  * schedule a read/write operation.
1442  */
1443 static void *
1444 xbdback_co_do_io(struct xbdback_instance *xbdi, void *obj)
1445 {
1446 	struct xbdback_io *xbd_io = obj;
1447 	blkif_request_t *req = &xbd_io->xio_xen_req;
1448 
1449 	KASSERT(xbdi->xbdi_refcnt > 0);
1450 
1451 	switch (req->operation) {
1452 	case BLKIF_OP_FLUSH_DISKCACHE:
1453 	{
1454 		int error;
1455 		int force = 1;
1456 
1457 		KASSERT(mutex_owned(&xbdi->xbdi_lock));
1458 		mutex_exit(&xbdi->xbdi_lock);
1459 		error = VOP_IOCTL(xbdi->xbdi_vp, DIOCCACHESYNC, &force, FWRITE,
1460 		    kauth_cred_get());
1461 		mutex_enter(&xbdi->xbdi_lock);
1462 		if (error) {
1463 			aprint_error("xbdback %s: DIOCCACHESYNC returned %d\n",
1464 			    xbdi->xbdi_xbusd->xbusd_path, error);
1465 			 if (error == EOPNOTSUPP || error == ENOTTY)
1466 				error = BLKIF_RSP_EOPNOTSUPP;
1467 			 else
1468 				error = BLKIF_RSP_ERROR;
1469 		} else
1470 			error = BLKIF_RSP_OKAY;
1471 		xbdback_send_reply(xbdi, req->id, req->operation, error);
1472 		xbdback_io_put(xbdi, xbd_io);
1473 		xbdi_put(xbdi);
1474 		xbdi->xbdi_cont = xbdback_co_main_incr;
1475 		return xbdi;
1476 	}
1477 	case BLKIF_OP_READ:
1478 	case BLKIF_OP_WRITE:
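		/*
		 * Bounced write: gather the per-page segments from the
		 * mapped grant pages into the contiguous bounce buffer
		 * before handing the buf to the driver.
		 */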
1479 		if (__predict_false(xbd_io->xio_need_bounce) &&
1480 		    req->operation == BLKIF_OP_WRITE) {
1481 			vaddr_t boffset = 0;
1482 			for (int i = 0; i < req->nr_segments; i++) {
1483 				struct blkif_request_segment *seg =
1484 				    &xbd_io->xio_seg[i];
1485 				vaddr_t segoffset = seg->first_sect * VBD_BSIZE;
1486 				size_t segbcount =
1487 				   (seg->last_sect - seg->first_sect + 1) *
1488 				    VBD_BSIZE;
1489 				KASSERT(segoffset + segbcount <= PAGE_SIZE);
1490 				KASSERT(boffset + segbcount < MAXPHYS);
1491 				segoffset += PAGE_SIZE * i;
1492 				memcpy(
1493 				    (void *)(xbdi->xbdi_bouncebuf + boffset),
1494 				    (void *)(xbd_io->xio_vaddr + segoffset),
1495 				    segbcount);
1496 				boffset += segbcount;
1497 			}
1498 		}
1499 		KASSERT(mutex_owned(&xbdi->xbdi_lock));
1500 		mutex_exit(&xbdi->xbdi_lock);
1501 		if ((xbd_io->xio_buf.b_flags & B_READ) == 0) {
1502 			mutex_enter(xbd_io->xio_buf.b_vp->v_interlock);
1503 			xbd_io->xio_buf.b_vp->v_numoutput++;
1504 			mutex_exit(xbd_io->xio_buf.b_vp->v_interlock);
1505 		}
1506 		/* will call xbdback_iodone() asynchronously when done */
1507 		bdev_strategy(&xbd_io->xio_buf);
1508 		mutex_enter(&xbdi->xbdi_lock);
1509 		xbdi->xbdi_cont = xbdback_co_main_incr;
1510 		return xbdi;
1511 	default:
1512 		/* Should never happen */
1513 		panic("xbdback_co_do_io: unsupported operation %d",
1514 		    req->operation);
1515 	}
1516 }
1517 
1518 /*
1519  * Called from softint(9) context when an I/O is done: for each request, send
1520  * back the associated reply to the domain.
1521  */
1522 static void
1523 xbdback_iodone(struct buf *bp)
1524 {
1525 	struct xbdback_io *xbd_io;
1526 	struct xbdback_instance *xbdi;
1527 
1528 	xbd_io = bp->b_private;
1529 	KASSERT(bp == &xbd_io->xio_buf);
1530 	xbdi = xbd_io->xio_xbdi;
1531 
1532 	mutex_enter(&xbdi->xbdi_lock);
1533 	xbdback_iodone_locked(xbdi, xbd_io, bp);
1534 	mutex_exit(&xbdi->xbdi_lock);
1535 }
1536 
1537 /*
1538  * This gets reused by xbdback_io_error to report errors from other sources.
1539  */
1540 static void
1541 xbdback_iodone_locked(struct xbdback_instance *xbdi, struct xbdback_io *xbd_io,
1542     struct buf *bp)
1543 {
1544 	int status;
1545 	blkif_request_t *req = &xbd_io->xio_xen_req;
1546 
1547 	XENPRINTF(("xbdback_io domain %d: iodone ptr 0x%lx\n",
1548 		   xbdi->xbdi_domid, (long)xbd_io));
1549 
1550 	KASSERT(mutex_owned(&xbdi->xbdi_lock));
1551 
1552 	KASSERT(bp->b_error != 0 || xbd_io->xio_xv != NULL);
1553 	if (__predict_false(xbd_io->xio_need_bounce)) {
1554 		KASSERT(xbd_io->xio_buf.b_data == (void *)xbdi->xbdi_bouncebuf);
1555 
1556 		KASSERT(req->operation == BLKIF_OP_WRITE ||
1557 		    req->operation == BLKIF_OP_READ);
1558 
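		/*
		 * Bounced read: scatter the data from the contiguous
		 * bounce buffer back into the per-page mapped grant
		 * pages before replying to the frontend.
		 */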
1559 		if (req->operation == BLKIF_OP_READ && bp->b_error == 0) {
1560 			vaddr_t boffset = 0;
1561 			for (int i = 0; i < req->nr_segments; i++) {
1562 				struct blkif_request_segment *seg =
1563 				    &xbd_io->xio_seg[i];
1564 				vaddr_t segoffset = seg->first_sect * VBD_BSIZE;
1565 				size_t segbcount =
1566 				   (seg->last_sect - seg->first_sect + 1) *
1567 				    VBD_BSIZE;
1568 				KASSERT(segoffset + segbcount <= PAGE_SIZE);
1569 				KASSERT(boffset + segbcount < MAXPHYS);
1570 				segoffset += PAGE_SIZE * i;
1571 				memcpy(
1572 				    (void *)(xbd_io->xio_vaddr + segoffset),
1573 				    (void *)(xbdi->xbdi_bouncebuf + boffset),
1574 				    segbcount);
1575 				boffset += segbcount;
1576 			}
1577 		}
1578 		KASSERT(xbdi->xbdi_bouncebuf_use == 1);
1579 		xbdi->xbdi_bouncebuf_use--;
1580 	}
1581 	if (xbd_io->xio_xv != NULL)
1582 		xbdback_unmap_shm(xbd_io);
1583 
1584 	if (bp->b_error != 0) {
1585 		printf("xbd IO domain %d: error %d\n",
1586 		       xbdi->xbdi_domid, bp->b_error);
1587 		status = BLKIF_RSP_ERROR;
1588 	} else
1589 		status = BLKIF_RSP_OKAY;
1590 
1591 	xbdback_send_reply(xbdi, req->id, req->operation, status);
1592 
1593 	xbdi_put(xbdi);
1594 	KASSERT(xbdi->xbdi_pendingreqs > 0);
1595 	xbdi->xbdi_pendingreqs--;
1596 	buf_destroy(&xbd_io->xio_buf);
1597 	xbdback_io_put(xbdi, xbd_io);
1598 
1599 	xbdback_wakeup_thread(xbdi);
1600 }
1601 
1602 /*
1603  * Wake up the per xbdback instance thread.
1604  */
1605 static void
1606 xbdback_wakeup_thread(struct xbdback_instance *xbdi)
1607 {
1608 	KASSERT(mutex_owned(&xbdi->xbdi_lock));
1609 
1610 	/* only set RUN state when we are WAITING for work */
1611 	if (xbdi->xbdi_status == WAITING)
1612 	       xbdi->xbdi_status = RUN;
1613 	cv_signal(&xbdi->xbdi_cv);
1614 }
1615 
1616 /*
1617  * Called once a request has completed: place the reply in the ring and
1618  * notify the guest OS.
1619  */
1620 static void
1621 xbdback_send_reply(struct xbdback_instance *xbdi, uint64_t id,
1622     int op, int status)
1623 {
1624 	blkif_response_t *resp_n;
1625 	blkif_x86_32_response_t *resp32;
1626 	blkif_x86_64_response_t *resp64;
1627 	int notify;
1628 
1629 	KASSERT(mutex_owned(&xbdi->xbdi_lock));
1630 
1631 	/*
1632 	 * The ring can be accessed by the xbdback thread, xbdback_iodone()
1633 	 * handler, or any handler that triggered the shm callback. So
1634 	 * protect ring access via the xbdi_lock mutex.
1635 	 */
1636 	switch (xbdi->xbdi_proto) {
1637 	case XBDIP_NATIVE:
1638 		resp_n = RING_GET_RESPONSE(&xbdi->xbdi_ring.ring_n,
1639 		    xbdi->xbdi_ring.ring_n.rsp_prod_pvt);
1640 		resp_n->id        = id;
1641 		resp_n->operation = op;
1642 		resp_n->status    = status;
1643 		break;
1644 	case XBDIP_32:
1645 		resp32 = RING_GET_RESPONSE(&xbdi->xbdi_ring.ring_32,
1646 		    xbdi->xbdi_ring.ring_n.rsp_prod_pvt);
1647 		resp32->id        = id;
1648 		resp32->operation = op;
1649 		resp32->status    = status;
1650 		break;
1651 	case XBDIP_64:
1652 		resp64 = RING_GET_RESPONSE(&xbdi->xbdi_ring.ring_64,
1653 		    xbdi->xbdi_ring.ring_n.rsp_prod_pvt);
1654 		resp64->id        = id;
1655 		resp64->operation = op;
1656 		resp64->status    = status;
1657 		break;
1658 	}
1659 	xbdi->xbdi_ring.ring_n.rsp_prod_pvt++;
1660 	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xbdi->xbdi_ring.ring_n, notify);
1661 
1662 	if (notify) {
1663 		XENPRINTF(("xbdback_send_reply notify %d\n", xbdi->xbdi_domid));
1664 		hypervisor_notify_via_evtchn(xbdi->xbdi_evtchn);
1665 	}
1666 }
1667 
1668 /*
1669  * Map multiple entries of an I/O request into backend's VA space.
1670  * The xbd_io->xio_gref array has to be filled out by the caller.
1671  */
1672 static int
1673 xbdback_map_shm(struct xbdback_io *xbd_io)
1674 {
1675 	struct xbdback_instance *xbdi = xbd_io->xio_xbdi;
1676 	blkif_request_t *req = &xbd_io->xio_xen_req;
1677 	int error;
1678 
1679 #ifdef XENDEBUG_VBD
1680 	int i;
1681 	printf("xbdback_map_shm map grant ");
1682 	for (i = 0; i < req->nr_segments; i++) {
1683 		printf("%u ", (u_int)xbd_io->xio_gref[i]);
1684 	}
1685 #endif
1686 
1687 	KASSERT(mutex_owned(&xbdi->xbdi_lock));
1688 	KASSERT(xbd_io->xio_xv == NULL);
1689 
1690 	xbd_io->xio_xv = SLIST_FIRST(&xbdi->xbdi_va_free);
1691 	KASSERT(xbd_io->xio_xv != NULL);
1692 	SLIST_REMOVE_HEAD(&xbdi->xbdi_va_free, xv_next);
1693 	xbd_io->xio_vaddr = xbd_io->xio_xv->xv_vaddr;
1694 
1695 	error = xen_shm_map(req->nr_segments, xbdi->xbdi_domid,
1696 	    xbd_io->xio_gref, xbd_io->xio_vaddr, xbd_io->xio_gh,
1697 	    (req->operation == BLKIF_OP_WRITE) ? XSHM_RO : 0);
1698 
1699 	switch(error) {
1700 	case 0:
1701 #ifdef XENDEBUG_VBD
1702 		printf("handle");
1703 		for (i = 0; i < req->nr_segments; i++) {
1704 			printf(" %u ", (u_int)xbd_io->xio_gh[i]);
1705 		}
1706 		printf("\n");
1707 #endif
1708 		return 0;
1709 	default:
1710 		/* reset xio_xv so error handling won't try to unmap it */
1711 		SLIST_INSERT_HEAD(&xbdi->xbdi_va_free, xbd_io->xio_xv, xv_next);
1712 		xbd_io->xio_xv = NULL;
1713 		return error;
1714 	}
1715 }
1716 
1717 /* unmap a request from our virtual address space (request is done) */
1718 static void
1719 xbdback_unmap_shm(struct xbdback_io *xbd_io)
1720 {
1721 	struct xbdback_instance *xbdi = xbd_io->xio_xbdi;
1722 	blkif_request_t *req = &xbd_io->xio_xen_req;
1723 
1724 #ifdef XENDEBUG_VBD
1725 	int i;
1726 	printf("xbdback_unmap_shm handle ");
1727 	for (i = 0; i < req->nr_segments; i++) {
1728 		printf("%u ", (u_int)xbd_io->xio_gh[i]);
1729 	}
1730 	printf("\n");
1731 #endif
1732 
1733 	KASSERT(xbd_io->xio_xv != NULL);
1734 	xen_shm_unmap(xbd_io->xio_vaddr, req->nr_segments,
1735 	    xbd_io->xio_gh);
1736 	SLIST_INSERT_HEAD(&xbdi->xbdi_va_free, xbd_io->xio_xv, xv_next);
1737 	xbd_io->xio_xv = NULL;
1738 	xbd_io->xio_vaddr = -1;
1739 }
1740 
1741 /* Obtain an I/O descriptor from the instance's free list */
1742 static struct xbdback_io *
1743 xbdback_io_get(struct xbdback_instance *xbdi)
1744 {
1745 	struct xbdback_io *xbd_io = SLIST_FIRST(&xbdi->xbdi_io_free);
1746 	if (xbd_io != NULL)	/* list may be empty; caller checks for NULL */
		SLIST_REMOVE_HEAD(&xbdi->xbdi_io_free, xio_next);
1747 	return xbd_io;
1748 }
1749 
1750 /* Return an I/O descriptor to the instance's free list */
1751 static void
1752 xbdback_io_put(struct xbdback_instance *xbdi, struct xbdback_io *xbd_io)
1753 {
1754 	KASSERT(xbd_io != NULL);
1755 	KASSERT(xbd_io->xio_xv == NULL);
1756 	SLIST_INSERT_HEAD(&xbdi->xbdi_io_free, xbd_io, xio_next);
1757 }
1758 
1759 /*
1760  * Trampoline routine. Calls continuations in a loop and only exits when
1761  * either the returned object or the next callback is NULL.
1762  */
1763 static void
1764 xbdback_trampoline(struct xbdback_instance *xbdi, void *obj)
1765 {
1766 	xbdback_cont_t cont;
1767 
1768 	while(obj != NULL && xbdi->xbdi_cont != NULL) {
1769 		KASSERT(xbdi->xbdi_cont_restart == NULL);
1770 		KASSERT(xbdi->xbdi_cont_restart_obj == NULL);
1771 		cont = xbdi->xbdi_cont;
1772 #ifdef DIAGNOSTIC
1773 		xbdi->xbdi_cont = (xbdback_cont_t)0xDEADBEEF;
1774 #endif
1775 		obj = (*cont)(xbdi, obj);
1776 #ifdef DIAGNOSTIC
1777 		if (xbdi->xbdi_cont == (xbdback_cont_t)0xDEADBEEF) {
1778 			printf("xbdback_trampoline: 0x%lx didn't set "
1779 			       "xbdi->xbdi_cont!\n", (long)cont);
1780 			panic("xbdback_trampoline: bad continuation");
1781 		}
1782 		if (xbdi->xbdi_cont_restart != NULL ||
1783 		    xbdi->xbdi_cont_restart_obj != NULL) {
1784 			KASSERT(xbdi->xbdi_cont_restart != NULL);
1785 			KASSERT(xbdi->xbdi_cont_restart_obj != NULL);
1786 			KASSERT(xbdi->xbdi_cont == NULL);
1787 			KASSERT(obj == NULL);
1788 		}
1789 #endif
1790 	}
1791 }
1792