xref: /netbsd-src/sys/arch/xen/xen/xbdback_xenbus.c (revision f82d7874c259b2a6cc59b714f844919f32bf7b51)
1 /*      $NetBSD: xbdback_xenbus.c,v 1.16 2008/03/22 14:21:56 ad Exp $      */
2 
3 /*
4  * Copyright (c) 2006 Manuel Bouyer.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  * 3. All advertising materials mentioning features or use of this software
15  *    must display the following acknowledgement:
16  *      This product includes software developed by Manuel Bouyer.
17  * 4. The name of the author may not be used to endorse or promote products
18  *    derived from this software without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
21  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
22  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
23  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
24  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
25  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
29  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30  *
31  */
32 
33 #include <sys/cdefs.h>
34 __KERNEL_RCSID(0, "$NetBSD: xbdback_xenbus.c,v 1.16 2008/03/22 14:21:56 ad Exp $");
35 
36 #include <sys/types.h>
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/malloc.h>
40 #include <sys/queue.h>
41 #include <sys/kernel.h>
42 #include <sys/conf.h>
43 #include <sys/disk.h>
44 #include <sys/disklabel.h>
45 #include <sys/fcntl.h>
46 #include <sys/vnode.h>
47 #include <sys/kauth.h>
48 
49 #include <xen/xen.h>
50 #include <xen/xen_shm.h>
51 #include <xen/evtchn.h>
52 #include <xen/xenbus.h>
53 #include <xen/xen3-public/io/protocols.h>
54 
55 /* #define XENDEBUG_VBD */
56 #ifdef XENDEBUG_VBD
57 #define XENPRINTF(x) printf x
58 #else
59 #define XENPRINTF(x)
60 #endif
61 
62 #define BLKIF_RING_SIZE __RING_SIZE((blkif_sring_t *)0, PAGE_SIZE)
63 
64 /*
65  * Backend block device driver for Xen
66  */
67 
68 /* Max number of pages per request. The request may not be page aligned */
69 #define BLKIF_MAX_PAGES_PER_REQUEST (BLKIF_MAX_SEGMENTS_PER_REQUEST + 1)
70 
71 /* Values are expressed in 512-byte sectors */
72 #define VBD_BSIZE 512
73 #define VBD_MAXSECT ((PAGE_SIZE / VBD_BSIZE) - 1)
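/*
 * Worked example (assuming the usual 4096-byte PAGE_SIZE): VBD_MAXSECT is
 * then 7, so a segment's first_sect/last_sect values address 512-byte
 * sectors 0..7 within a single guest page.
 */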
74 
75 struct xbdback_request;
76 struct xbdback_io;
77 struct xbdback_fragment;
78 struct xbdback_instance;
79 
80 /* state of a xbdback instance */
81 typedef enum {CONNECTED, DISCONNECTING, DISCONNECTED} xbdback_state_t;
82 
83 /*
84  * A variety of conditions can block our I/O processing, and it is not
85  * allowed to suspend its thread's execution, so such work is done in a
86  * sort of continuation-passing style.
87  *
88  * A NULL return value indicates that execution has blocked; if the work
89  * is finished, set xbdi->xbdi_cont (see below) to NULL and the return
90  * value is ignored.  Otherwise the return value is passed as the second
91  * parameter to the new value of xbdi->xbdi_cont.
92  */
93 typedef void *(* xbdback_cont_t)(struct xbdback_instance *, void *);
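/*
 * Illustrative sketch of the convention (this mirrors what the
 * xbdback_co_* functions below actually do): a continuation that needs an
 * object from a pool sets xbdi_cont to the consumer and returns the
 * allocation; on pool exhaustion xbdback_pool_get() returns NULL, the
 * trampoline stops, and xbdback_pool_put() later restarts it with the
 * freed object.
 *
 *	xbdi->xbdi_cont = xbdback_co_io_gotreq;
 *	return xbdback_pool_get(&xbdback_request_pool, xbdi);
 */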
94 
95 enum xbdi_proto {
96 	XBDIP_NATIVE,
97 	XBDIP_32,
98 	XBDIP_64
99 };
100 
101 
102 /* we keep the xbdback instances in a linked list */
103 struct xbdback_instance {
104 	SLIST_ENTRY(xbdback_instance) next;
105 	struct xenbus_device *xbdi_xbusd; /* our xenstore entry */
106 	struct xenbus_watch xbdi_watch; /* to watch our store */
107 	domid_t xbdi_domid;		/* attached to this domain */
108 	uint32_t xbdi_handle;	/* domain-specific handle */
109 	xbdback_state_t xbdi_status;
110 	/* backing device parameters */
111 	dev_t xbdi_dev;
112 	const struct bdevsw *xbdi_bdevsw; /* pointer to the device's bdevsw */
113 	struct vnode *xbdi_vp;
114 	uint64_t xbdi_size;
115 	int xbdi_ro; /* is the device read-only? */
116 	/* parameters for the communication */
117 	unsigned int xbdi_evtchn;
118 	/* private parameters for communication */
119 	blkif_back_ring_proto_t xbdi_ring;
120 	enum xbdi_proto xbdi_proto;
121 	grant_handle_t xbdi_ring_handle; /* to unmap the ring */
122 	vaddr_t xbdi_ring_va; /* to unmap the ring */
123 	/* disconnection must be postponed until all I/O is done */
124 	volatile unsigned xbdi_refcnt;
125 	/*
126 	 * State for I/O processing/coalescing follows; this has to
127 	 * live here instead of on the stack because of the
128 	 * continuation-ness (see above).
129 	 */
130 	RING_IDX xbdi_req_prod; /* limit on request indices */
131 	xbdback_cont_t xbdi_cont, xbdi_cont_aux;
132 	SIMPLEQ_ENTRY(xbdback_instance) xbdi_on_hold; /* waiting on resources */
133 	/* _request state */
134 	struct xbdback_request *xbdi_req; /* if NULL, ignore following */
135 	blkif_request_t xbdi_xen_req;
136 	int xbdi_segno;
137 	/* _io state */
138 	struct xbdback_io *xbdi_io; /* if NULL, ignore next field */
139 	daddr_t xbdi_next_sector;
140 	uint8_t xbdi_last_fs, xbdi_this_fs; /* first sectors */
141 	uint8_t xbdi_last_ls, xbdi_this_ls; /* last sectors */
142 	grant_ref_t xbdi_thisgrt, xbdi_lastgrt; /* grants */
143 	/* other state */
144 	int xbdi_same_page; /* are we merging two segments on the same page? */
145 };
146 /* Manipulation of the above reference count. */
147 /* XXXjld@panix.com: not MP-safe, and move the i386 asm elsewhere. */
148 #define xbdi_get(xbdip) (++(xbdip)->xbdi_refcnt)
149 #define xbdi_put(xbdip)                                      \
150 do {                                                         \
151 	__asm volatile("decl %0"                           \
152 	    : "=m"((xbdip)->xbdi_refcnt) : "m"((xbdip)->xbdi_refcnt)); \
153 	if (0 == (xbdip)->xbdi_refcnt)                            \
154                xbdback_finish_disconnect(xbdip);             \
155 } while (/* CONSTCOND */ 0)
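/*
 * Reference-count usage, as sketched from the code below: an instance
 * starts with xbdi_refcnt == 1 (xbdback_xenbus_create); each in-flight
 * xbdback_io takes an extra reference in xbdback_co_io_gotio() and drops
 * it in xbdback_iodone().  Disconnection drops the initial reference, so
 * xbdback_finish_disconnect() only runs once the last outstanding I/O
 * has completed.
 */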
156 
157 SLIST_HEAD(, xbdback_instance) xbdback_instances;
158 
159 /*
160  * For each request from a guest, an xbdback_request is allocated from
161  * a pool.  This will describe the request until completion.  The
162  * request may require multiple IO operations to perform, so the
163  * per-IO information is not stored here.
164  */
165 struct xbdback_request {
166 	struct xbdback_instance *rq_xbdi; /* our xbd instance */
167 	uint64_t rq_id;
168 	int rq_iocount; /* reference count; or, number of outstanding I/O's */
169 	int rq_ioerrs;
170 	uint8_t rq_operation;
171 };
172 
173 /*
174  * For each I/O operation associated with one of those requests, an
175  * xbdback_io is allocated from a pool.  It may correspond to multiple
176  * Xen disk requests, or parts of them, if several arrive at once that
177  * can be coalesced.
178  */
179 struct xbdback_io {
180 	struct buf xio_buf; /* our I/O */
181 	/* The instance pointer is duplicated for convenience. */
182 	struct xbdback_instance *xio_xbdi; /* our xbd instance */
183 	SLIST_HEAD(, xbdback_fragment) xio_rq; /* xbd requests involved */
184 	vaddr_t xio_vaddr; /* the virtual address to map the request at */
185 	grant_ref_t xio_gref[XENSHM_MAX_PAGES_PER_REQUEST]; /* grants to map */
186 	grant_handle_t xio_gh[XENSHM_MAX_PAGES_PER_REQUEST]; /* to release grants */
187 	uint16_t xio_nrma; /* number of guest pages */
188 	uint16_t xio_mapped; /* set to 1 while the pages are mapped */
189 };
190 
191 /*
192  * Rather than have the xbdback_io keep an array of the xbdback_requests
193  * involved (the actual number will probably be small, but might be as
194  * large as BLKIF_RING_SIZE), use a list.  The list would ideally be
195  * threaded through struct xbdback_request itself, but a request might
196  * be part of multiple I/Os, alas.
197  */
198 struct xbdback_fragment {
199 	struct xbdback_request *car;
200 	SLIST_ENTRY(xbdback_fragment) cdr;
201 };
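/*
 * The Lisp-style car/cdr names reflect the cons-cell shape of that list:
 * xio_rq is a chain of fragments, one per Xen request contributing
 * segments to an I/O; xbdback_iodone() walks it to drop each request's
 * rq_iocount and send the reply once a request's last I/O finishes.
 */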
202 
203 /*
204  * Wrap our pools with a chain of xbdback_instances whose I/O
205  * processing has blocked for want of memory from that pool.
206  */
207 struct xbdback_pool {
208 	struct pool p;
209 	SIMPLEQ_HEAD(xbdback_iqueue, xbdback_instance) q;
210 	struct timeval last_warning;
211 } xbdback_request_pool, xbdback_io_pool, xbdback_fragment_pool;
212 static struct xbdback_iqueue xbdback_shmq;
213 static int xbdback_shmcb; /* have we already registered a callback? */
214 
215 struct timeval xbdback_poolsleep_intvl = { 5, 0 };
216 #ifdef DEBUG
217 struct timeval xbdback_fragio_intvl = { 60, 0 };
218 #endif
219        void xbdbackattach(int);
220 static int  xbdback_xenbus_create(struct xenbus_device *);
221 static int  xbdback_xenbus_destroy(void *);
222 static void xbdback_frontend_changed(void *, XenbusState);
223 static void xbdback_backend_changed(struct xenbus_watch *,
224     const char **, unsigned int);
225 static int  xbdback_evthandler(void *);
226 static void xbdback_finish_disconnect(struct xbdback_instance *);
227 
228 static struct xbdback_instance *xbdif_lookup(domid_t, uint32_t);
229 
230 static void *xbdback_co_main(struct xbdback_instance *, void *);
231 static void *xbdback_co_main_loop(struct xbdback_instance *, void *);
232 static void *xbdback_co_main_incr(struct xbdback_instance *, void *);
233 static void *xbdback_co_main_done(struct xbdback_instance *, void *);
234 static void *xbdback_co_main_done2(struct xbdback_instance *, void *);
235 
236 static void *xbdback_co_io(struct xbdback_instance *, void *);
237 static void *xbdback_co_io_gotreq(struct xbdback_instance *, void *);
238 static void *xbdback_co_io_loop(struct xbdback_instance *, void *);
239 static void *xbdback_co_io_gotio(struct xbdback_instance *, void *);
240 static void *xbdback_co_io_gotio2(struct xbdback_instance *, void *);
241 static void *xbdback_co_io_gotfrag(struct xbdback_instance *, void *);
242 static void *xbdback_co_io_gotfrag2(struct xbdback_instance *, void *);
243 
244 static void *xbdback_co_flush(struct xbdback_instance *, void *);
245 static void *xbdback_co_flush_done(struct xbdback_instance *, void *);
246 
247 static int  xbdback_shm_callback(void *);
248 static void xbdback_io_error(struct xbdback_io *, int);
249 static void xbdback_do_io(struct xbdback_io *);
250 static void xbdback_iodone(struct buf *);
251 static void xbdback_send_reply(struct xbdback_instance *, uint64_t, int, int);
252 
253 static void *xbdback_map_shm(struct xbdback_io *);
254 static void xbdback_unmap_shm(struct xbdback_io *);
255 
256 static void *xbdback_pool_get(struct xbdback_pool *,
257 			      struct xbdback_instance *);
258 static void xbdback_pool_put(struct xbdback_pool *, void *);
259 static void xbdback_trampoline(struct xbdback_instance *, void *);
260 
261 static struct xenbus_backend_driver xbd_backend_driver = {
262 	.xbakd_create = xbdback_xenbus_create,
263 	.xbakd_type = "vbd"
264 };
265 
266 void
267 xbdbackattach(int n)
268 {
269 	XENPRINTF(("xbdbackattach\n"));
270 
271 	/*
272 	 * initialize the backend driver, register the control message handler
273 	 * and send driver up message.
274 	 */
275 	SLIST_INIT(&xbdback_instances);
276 	SIMPLEQ_INIT(&xbdback_shmq);
277 	xbdback_shmcb = 0;
278 	pool_init(&xbdback_request_pool.p, sizeof(struct xbdback_request),
279 	    0, 0, 0, "xbbrp", NULL, IPL_BIO);
280 	SIMPLEQ_INIT(&xbdback_request_pool.q);
281 	pool_init(&xbdback_io_pool.p, sizeof(struct xbdback_io),
282 	    0, 0, 0, "xbbip", NULL, IPL_BIO);
283 	SIMPLEQ_INIT(&xbdback_io_pool.q);
284 	pool_init(&xbdback_fragment_pool.p, sizeof(struct xbdback_fragment),
285 	    0, 0, 0, "xbbfp", NULL, IPL_BIO);
286 	SIMPLEQ_INIT(&xbdback_fragment_pool.q);
287 	/* we allocate enough to handle a whole ring at once */
288 	if (pool_prime(&xbdback_request_pool.p, BLKIF_RING_SIZE) != 0)
289 		printf("xbdback: failed to prime request pool\n");
290 	if (pool_prime(&xbdback_io_pool.p, BLKIF_RING_SIZE) != 0)
291 		printf("xbdback: failed to prime io pool\n");
292 	if (pool_prime(&xbdback_fragment_pool.p,
293             BLKIF_MAX_SEGMENTS_PER_REQUEST * BLKIF_RING_SIZE) != 0)
294 		printf("xbdback: failed to prime fragment pool\n");
295 
296 	xenbus_backend_register(&xbd_backend_driver);
297 }
298 
299 static int
300 xbdback_xenbus_create(struct xenbus_device *xbusd)
301 {
302 	struct xbdback_instance *xbdi;
303 	long domid, handle;
304 	int error, i;
305 	char *ep;
306 
307 	if ((error = xenbus_read_ul(NULL, xbusd->xbusd_path,
308 	    "frontend-id", &domid, 10)) != 0) {
309 		aprint_error("xbdback: can't read %s/frontend-id: %d\n",
310 		    xbusd->xbusd_path, error);
311 		return error;
312 	}
313 
314 	/*
315 	 * get handle: this is the last component of the path, which is
316 	 * a decimal number. $path/dev contains the device name, which is not
317 	 * appropriate.
318 	 */
319 	for (i = strlen(xbusd->xbusd_path); i > 0; i--) {
320 		if (xbusd->xbusd_path[i] == '/')
321 			break;
322 	}
323 	if (i == 0) {
324 		aprint_error("xbdback: can't parse %s\n",
325 		    xbusd->xbusd_path);
326 		return EFTYPE;
327 	}
328 	handle = strtoul(&xbusd->xbusd_path[i+1], &ep, 10);
329 	if (*ep != '\0') {
330 		aprint_error("xbdback: can't parse %s\n",
331 		    xbusd->xbusd_path);
332 		return EFTYPE;
333 	}
334 
335 	if (xbdif_lookup(domid, handle) != NULL) {
336 		return EEXIST;
337 	}
338 	xbdi = malloc(sizeof(struct xbdback_instance), M_DEVBUF,
339 	    M_NOWAIT | M_ZERO);
340 	if (xbdi == NULL) {
341 		return ENOMEM;
342 	}
343 	xbdi->xbdi_domid = domid;
344 	xbdi->xbdi_handle = handle;
345 	xbdi->xbdi_status = DISCONNECTED;
346 	xbdi->xbdi_refcnt = 1;
347 	SLIST_INSERT_HEAD(&xbdback_instances, xbdi, next);
348 	xbusd->xbusd_u.b.b_cookie = xbdi;
349 	xbusd->xbusd_u.b.b_detach = xbdback_xenbus_destroy;
350 	xbusd->xbusd_otherend_changed = xbdback_frontend_changed;
351 	xbdi->xbdi_xbusd = xbusd;
352 
353 	error = xenbus_watch_path2(xbusd, xbusd->xbusd_path, "physical-device",
354 	    &xbdi->xbdi_watch,  xbdback_backend_changed);
355 	if (error) {
356 		printf("failed to watch on %s/physical-device: %d\n",
357 		    xbusd->xbusd_path, error);
358 		goto fail;
359 	}
360 	xbdi->xbdi_watch.xbw_dev = xbusd;
361 	error = xenbus_switch_state(xbusd, NULL, XenbusStateInitWait);
362 	if (error) {
363 		printf("failed to switch state on %s: %d\n",
364 		    xbusd->xbusd_path, error);
365 		goto fail2;
366 	}
367 	return 0;
368 fail2:
369 	unregister_xenbus_watch(&xbdi->xbdi_watch);
370 fail:
371 	free(xbdi, M_DEVBUF);
372 	return error;
373 }
374 
375 static int
376 xbdback_xenbus_destroy(void *arg)
377 {
378 	struct xbdback_instance *xbdi = arg;
379 	struct xenbus_device *xbusd = xbdi->xbdi_xbusd;
380 	struct gnttab_unmap_grant_ref ungrop;
381 	int err, s;
382 
383 	XENPRINTF(("xbdback_xenbus_destroy state %d\n", xbdi->xbdi_status));
384 
385 	if (xbdi->xbdi_status != DISCONNECTED) {
386 		hypervisor_mask_event(xbdi->xbdi_evtchn);
387 		event_remove_handler(xbdi->xbdi_evtchn, xbdback_evthandler,
388 		    xbdi);
389 		xbdi->xbdi_status = DISCONNECTING;
390 		s = splbio();
391 		xbdi_put(xbdi);
392 		while (xbdi->xbdi_status != DISCONNECTED) {
393 			tsleep(&xbdi->xbdi_status, PRIBIO, "xbddis", 0);
394 		}
395 		splx(s);
396 	}
397 	/* unregister watch */
398 	if (xbdi->xbdi_watch.node) {
399 		unregister_xenbus_watch(&xbdi->xbdi_watch);
400 		free(xbdi->xbdi_watch.node, M_DEVBUF);
401 		xbdi->xbdi_watch.node = NULL;
402 	}
403 	/* unmap ring */
404 	if (xbdi->xbdi_ring_va != 0) {
405 		ungrop.host_addr = xbdi->xbdi_ring_va;
406 		ungrop.handle = xbdi->xbdi_ring_handle;
407 		ungrop.dev_bus_addr = 0;
408 		err = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
409 		    &ungrop, 1);
410 		if (err)
411 		    printf("xbdback %s: unmap_grant_ref failed: %d\n",
412 			xbusd->xbusd_otherend, err);
413 		uvm_km_free(kernel_map, xbdi->xbdi_ring_va,
414 		    PAGE_SIZE, UVM_KMF_VAONLY);
415 	}
416 	/* close device */
417 	if (xbdi->xbdi_size) {
418 		printf("xbd backend: detach device %s%d%c for domain %d\n",
419 		    devsw_blk2name(major(xbdi->xbdi_dev)),
420 		    DISKUNIT(xbdi->xbdi_dev), DISKPART(xbdi->xbdi_dev) + 'a',
421 		    xbdi->xbdi_domid);
422 		vn_close(xbdi->xbdi_vp, FREAD, NOCRED);
423 	}
424 	SLIST_REMOVE(&xbdback_instances, xbdi, xbdback_instance, next);
425 	free(xbdi, M_DEVBUF);
426 	return 0;
427 }
428 
429 static void
430 xbdback_frontend_changed(void *arg, XenbusState new_state)
431 {
432 	struct xbdback_instance *xbdi = arg;
433 	struct xenbus_device *xbusd = xbdi->xbdi_xbusd;
434 	u_long ring_ref, revtchn;
435 	struct gnttab_map_grant_ref grop;
436 	struct gnttab_unmap_grant_ref ungrop;
437 	evtchn_op_t evop;
438 	char evname[16];
439 	const char *proto;
440 	char *xsproto;
441 	int len;
442 	int err, s;
443 
444 	XENPRINTF(("xbdback %s: new state %d\n", xbusd->xbusd_path, new_state));
445 	switch(new_state) {
446 	case XenbusStateInitialising:
447 		break;
448 	case XenbusStateInitialised:
449 	case XenbusStateConnected:
450 		if (xbdi->xbdi_status == CONNECTED)
451 			break;
452 		/* read communication information */
453 		err = xenbus_read_ul(NULL, xbusd->xbusd_otherend,
454 		    "ring-ref", &ring_ref, 10);
455 		if (err) {
456 			xenbus_dev_fatal(xbusd, err, "reading %s/ring-ref",
457 			    xbusd->xbusd_otherend);
458 			break;
459 		}
460 		err = xenbus_read_ul(NULL, xbusd->xbusd_otherend,
461 		    "event-channel", &revtchn, 10);
462 		if (err) {
463 			xenbus_dev_fatal(xbusd, err, "reading %s/event-channel",
464 			    xbusd->xbusd_otherend);
465 			break;
466 		}
467 		err = xenbus_read(NULL, xbusd->xbusd_otherend, "protocol",
468 		    &len, &xsproto);
469 		if (err) {
470 			proto = "unspecified";
471 			xbdi->xbdi_proto = XBDIP_NATIVE;
472 		} else {
473 			if(strcmp(xsproto, XEN_IO_PROTO_ABI_NATIVE) == 0) {
474 				xbdi->xbdi_proto = XBDIP_NATIVE;
475 				proto = XEN_IO_PROTO_ABI_NATIVE;
476 			} else if(strcmp(xsproto, XEN_IO_PROTO_ABI_X86_32) == 0) {
477 				xbdi->xbdi_proto = XBDIP_32;
478 				proto = XEN_IO_PROTO_ABI_X86_32;
479 			} else if(strcmp(xsproto, XEN_IO_PROTO_ABI_X86_64) == 0) {
480 				xbdi->xbdi_proto = XBDIP_64;
481 				proto = XEN_IO_PROTO_ABI_X86_64;
482 			} else {
483 				printf("xbd domain %d: unknown proto %s\n",
484 				    xbdi->xbdi_domid, xsproto);
485 				free(xsproto, M_DEVBUF);
486 				return;
487 			}
488 			free(xsproto, M_DEVBUF);
489 		}
490 		/* allocate VA space and map rings */
491 		xbdi->xbdi_ring_va = uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
492 		    UVM_KMF_VAONLY);
493 		if (xbdi->xbdi_ring_va == 0) {
494 			xenbus_dev_fatal(xbusd, ENOMEM,
495 			    "can't get VA for ring", xbusd->xbusd_otherend);
496 			break;
497 		}
498 		grop.host_addr = xbdi->xbdi_ring_va;
499 		grop.flags = GNTMAP_host_map;
500 		grop.ref = ring_ref;
501 		grop.dom = xbdi->xbdi_domid;
502 		err = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
503 		    &grop, 1);
504 		if (err || grop.status) {
505 			printf("xbdback %s: can't map grant ref: %d/%d\n",
506 			    xbusd->xbusd_path, err, grop.status);
507 			xenbus_dev_fatal(xbusd, EINVAL,
508 			    "can't map ring", xbusd->xbusd_otherend);
509 			goto err;
510 		}
511 		xbdi->xbdi_ring_handle = grop.handle;
512 		switch(xbdi->xbdi_proto) {
513 		case XBDIP_NATIVE:
514 		{
515 			blkif_sring_t *sring = (void *)xbdi->xbdi_ring_va;
516 			BACK_RING_INIT(&xbdi->xbdi_ring.ring_n,
517 			    sring, PAGE_SIZE);
518 			break;
519 		}
520 		case XBDIP_32:
521 		{
522 			blkif_x86_32_sring_t *sring =
523 			    (void *)xbdi->xbdi_ring_va;
524 			BACK_RING_INIT(&xbdi->xbdi_ring.ring_32,
525 			    sring, PAGE_SIZE);
526 			break;
527 		}
528 		case XBDIP_64:
529 		{
530 			blkif_x86_64_sring_t *sring =
531 			    (void *)xbdi->xbdi_ring_va;
532 			BACK_RING_INIT(&xbdi->xbdi_ring.ring_64,
533 			    sring, PAGE_SIZE);
534 			break;
535 		}
536 		}
537 		evop.cmd = EVTCHNOP_bind_interdomain;
538 		evop.u.bind_interdomain.remote_dom = xbdi->xbdi_domid;
539 		evop.u.bind_interdomain.remote_port = revtchn;
540 		err = HYPERVISOR_event_channel_op(&evop);
541 		if (err) {
542 			printf("blkback %s: can't get event channel: %d\n",
543 			    xbusd->xbusd_otherend, err);
544 			xenbus_dev_fatal(xbusd, err,
545 			    "can't bind event channel", xbusd->xbusd_otherend);
546 			goto err2;
547 		}
548 		xbdi->xbdi_evtchn = evop.u.bind_interdomain.local_port;
549 		snprintf(evname, sizeof(evname), "xbd%d.%d",
550 		    xbdi->xbdi_domid, xbdi->xbdi_handle);
551 		event_set_handler(xbdi->xbdi_evtchn, xbdback_evthandler,
552 		    xbdi, IPL_BIO, evname);
553 		printf("xbd backend 0x%x for domain %d "
554 		    "using event channel %d, protocol %s\n", xbdi->xbdi_handle,
555 		    xbdi->xbdi_domid, xbdi->xbdi_evtchn, proto);
556 		hypervisor_enable_event(xbdi->xbdi_evtchn);
557 		hypervisor_notify_via_evtchn(xbdi->xbdi_evtchn);
558 		xbdi->xbdi_status = CONNECTED;
559 		break;
560 	case XenbusStateClosing:
561 		hypervisor_mask_event(xbdi->xbdi_evtchn);
562 		event_remove_handler(xbdi->xbdi_evtchn, xbdback_evthandler,
563 		    xbdi);
564 		xbdi->xbdi_status = DISCONNECTING;
565 		s = splbio();
566 		xbdi_put(xbdi);
567 		while (xbdi->xbdi_status != DISCONNECTED) {
568 			tsleep(&xbdi->xbdi_status, PRIBIO, "xbddis", 0);
569 		}
570 		splx(s);
571 		xenbus_switch_state(xbusd, NULL, XenbusStateClosing);
572 		break;
573 	case XenbusStateClosed:
574 		/* otherend_changed() should handle it for us */
575 		panic("xbdback_frontend_changed: closed\n");
576 	case XenbusStateUnknown:
577 	case XenbusStateInitWait:
578 	default:
579 		aprint_error("xbdback %s: invalid frontend state %d\n",
580 		    xbusd->xbusd_path, new_state);
581 	}
582 	return;
583 err2:
584 	/* unmap ring */
585 	ungrop.host_addr = xbdi->xbdi_ring_va;
586 	ungrop.handle = xbdi->xbdi_ring_handle;
587 	ungrop.dev_bus_addr = 0;
588 	err = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
589 	    &ungrop, 1);
590 	if (err)
591 	    printf("xbdback %s: unmap_grant_ref failed: %d\n",
592 		xbusd->xbusd_path, err);
593 err:
594 	uvm_km_free(kernel_map, xbdi->xbdi_ring_va, PAGE_SIZE, UVM_KMF_VAONLY);
595 	return;
596 }
597 
598 static void
599 xbdback_backend_changed(struct xenbus_watch *watch,
600     const char **vec, unsigned int len)
601 {
602 	struct xenbus_device *xbusd = watch->xbw_dev;
603 	struct xbdback_instance *xbdi = xbusd->xbusd_u.b.b_cookie;
604 	int err;
605 	long dev;
606 	char *mode;
607 	struct xenbus_transaction *xbt;
608 	const char *devname;
609 	int major;
610 
611 	err = xenbus_read_ul(NULL, xbusd->xbusd_path, "physical-device",
612 	    &dev, 10);
613 	/*
614 	 * An error can occur here because the watch can fire just after
615 	 * being registered, so we have to ignore it :(
616 	 */
617 	if (err)
618 		return;
619 	if (xbdi->xbdi_status == CONNECTED && xbdi->xbdi_dev != dev) {
620 		printf("xbdback %s: changing physical device from 0x%x to "
621 		    "0x%lx not supported\n", xbusd->xbusd_path, xbdi->xbdi_dev,
622 		    dev);
623 		return;
624 	}
625 	xbdi->xbdi_dev = dev;
626 	err = xenbus_read(NULL, xbusd->xbusd_path, "mode", NULL, &mode);
627 	if (err) {
628 		printf("xbdback: failed to read %s/mode: %d\n",
629 		    xbusd->xbusd_path, err);
630 		return;
631 	}
632 	if (mode[0] == 'w')
633 		xbdi->xbdi_ro = 0;
634 	else
635 		xbdi->xbdi_ro = 1;
636 	major = major(xbdi->xbdi_dev);
637 	devname = devsw_blk2name(major);
638 	if (devname == NULL) {
639 		printf("xbdback %s: unknown device 0x%x\n", xbusd->xbusd_path,
640 		    xbdi->xbdi_dev);
641 		return;
642 	}
643 	xbdi->xbdi_bdevsw = bdevsw_lookup(xbdi->xbdi_dev);
644 	if (xbdi->xbdi_bdevsw == NULL) {
645 		printf("xbdback %s: no bdevsw for device 0x%x\n",
646 		    xbusd->xbusd_path, xbdi->xbdi_dev);
647 		return;
648 	}
649 	err = bdevvp(xbdi->xbdi_dev, &xbdi->xbdi_vp);
650 	if (err) {
651 		printf("xbdback %s: can't open device 0x%x: %d\n",
652 		    xbusd->xbusd_path, xbdi->xbdi_dev, err);
653 		return;
654 	}
655 	err = vn_lock(xbdi->xbdi_vp, LK_EXCLUSIVE | LK_RETRY);
656 	if (err) {
657 		printf("xbdback %s: can't vn_lock device 0x%x: %d\n",
658 		    xbusd->xbusd_path, xbdi->xbdi_dev, err);
659 		vrele(xbdi->xbdi_vp);
660 		return;
661 	}
662 	err  = VOP_OPEN(xbdi->xbdi_vp, FREAD, NOCRED);
663 	if (err) {
664 		printf("xbdback %s: can't VOP_OPEN device 0x%x: %d\n",
665 		    xbusd->xbusd_path, xbdi->xbdi_dev, err);
666 		vput(xbdi->xbdi_vp);
667 		return;
668 	}
669 	VOP_UNLOCK(xbdi->xbdi_vp, 0);
670 	if (strcmp(devname, "dk") == 0) {
671 		/* dk device; get wedge data */
672 		struct dkwedge_info wi;
673 		err = VOP_IOCTL(xbdi->xbdi_vp, DIOCGWEDGEINFO, &wi,
674 		    FREAD, NOCRED);
675 		if (err) {
676 			printf("xbdback %s: can't DIOCGWEDGEINFO device "
677 			    "0x%x: %d\n", xbusd->xbusd_path,
678 			    xbdi->xbdi_dev, err);
679 			xbdi->xbdi_size = xbdi->xbdi_dev = 0;
680 			vn_close(xbdi->xbdi_vp, FREAD, NOCRED);
681 			xbdi->xbdi_vp = NULL;
682 			return;
683 		}
684 		xbdi->xbdi_size = wi.dkw_size;
685 		printf("xbd backend: attach device %s (size %" PRIu64 ") "
686 		    "for domain %d\n", wi.dkw_devname, xbdi->xbdi_size,
687 		    xbdi->xbdi_domid);
688 	} else {
689 		/* disk device, get partition data */
690 		struct partinfo dpart;
691 		err = VOP_IOCTL(xbdi->xbdi_vp, DIOCGPART, &dpart, FREAD, 0);
692 		if (err) {
693 			printf("xbdback %s: can't DIOCGPART device 0x%x: %d\n",
694 			    xbusd->xbusd_path, xbdi->xbdi_dev, err);
695 			xbdi->xbdi_size = xbdi->xbdi_dev = 0;
696 			vn_close(xbdi->xbdi_vp, FREAD, NOCRED);
697 			xbdi->xbdi_vp = NULL;
698 			return;
699 		}
700 		xbdi->xbdi_size = dpart.part->p_size;
701 		printf("xbd backend: attach device %s%d%c (size %" PRIu64 ") "
702 		    "for domain %d\n", devname, DISKUNIT(xbdi->xbdi_dev),
703 		    DISKPART(xbdi->xbdi_dev) + 'a', xbdi->xbdi_size,
704 		    xbdi->xbdi_domid);
705 	}
706 again:
707 	xbt = xenbus_transaction_start();
708 	if (xbt == NULL) {
709 		printf("xbdback %s: can't start transaction\n",
710 		    xbusd->xbusd_path);
711 		    return;
712 	}
713 	err = xenbus_printf(xbt, xbusd->xbusd_path, "sectors", "%" PRIu64 ,
714 	    xbdi->xbdi_size);
715 	if (err) {
716 		printf("xbdback: failed to write %s/sectors: %d\n",
717 		    xbusd->xbusd_path, err);
718 		goto abort;
719 	}
720 	err = xenbus_printf(xbt, xbusd->xbusd_path, "info", "%u",
721 	    xbdi->xbdi_ro ? VDISK_READONLY : 0);
722 	if (err) {
723 		printf("xbdback: failed to write %s/info: %d\n",
724 		    xbusd->xbusd_path, err);
725 		goto abort;
726 	}
727 	err = xenbus_printf(xbt, xbusd->xbusd_path, "sector-size", "%lu",
728 	    (u_long)DEV_BSIZE);
729 	if (err) {
730 		printf("xbdback: failed to write %s/sector-size: %d\n",
731 		    xbusd->xbusd_path, err);
732 		goto abort;
733 	}
734 	err = xenbus_transaction_end(xbt, 0);
735 	if (err == EAGAIN)
736 		goto again;
737 	if (err) {
738 		printf("xbdback %s: can't end transaction: %d\n",
739 		    xbusd->xbusd_path, err);
740 	}
741 	err = xenbus_switch_state(xbusd, NULL, XenbusStateConnected);
742 	if (err) {
743 		printf("xbdback %s: can't switch state: %d\n",
744 		    xbusd->xbusd_path, err);
745 	}
746 	return;
747 abort:
748 	xenbus_transaction_end(xbt, 1);
749 }
750 
751 
752 static void xbdback_finish_disconnect(struct xbdback_instance *xbdi)
753 {
754 	KASSERT(xbdi->xbdi_status == DISCONNECTING);
755 
756 	xbdi->xbdi_status = DISCONNECTED;
757 	wakeup(&xbdi->xbdi_status);
758 
759 }
760 
761 static struct xbdback_instance *
762 xbdif_lookup(domid_t dom , uint32_t handle)
763 {
764 	struct xbdback_instance *xbdi;
765 
766 	SLIST_FOREACH(xbdi, &xbdback_instances, next) {
767 		if (xbdi->xbdi_domid == dom && xbdi->xbdi_handle == handle)
768 			return xbdi;
769 	}
770 	return NULL;
771 }
772 
773 static int
774 xbdback_evthandler(void *arg)
775 {
776 	struct xbdback_instance *xbdi = arg;
777 
778 	XENPRINTF(("xbdback_evthandler domain %d: cont %p\n",
779 	    xbdi->xbdi_domid, xbdi->xbdi_cont));
780 
781 	if (xbdi->xbdi_cont == NULL) {
782 		xbdi->xbdi_cont = xbdback_co_main;
783 		xbdback_trampoline(xbdi, xbdi);
784 	}
785 	return 1;
786 }
787 
788 static void *
789 xbdback_co_main(struct xbdback_instance *xbdi, void *obj)
790 {
791 	(void)obj;
792 	xbdi->xbdi_req_prod = xbdi->xbdi_ring.ring_n.sring->req_prod;
793 	x86_lfence(); /* ensure we see all requests up to req_prod */
794 	/*
795 	 * Note that we may eventually see a full ring of requests;
796 	 * in that case MASK_BLKIF_IDX(req_cons) == MASK_BLKIF_IDX(req_prod).
797 	 */
798 	xbdi->xbdi_cont = xbdback_co_main_loop;
799 	return xbdi;
800 }
801 
802 static void *
803 xbdback_co_main_loop(struct xbdback_instance *xbdi, void *obj)
804 {
805 	blkif_request_t *req = &xbdi->xbdi_xen_req;
806 	blkif_x86_32_request_t *req32;
807 	blkif_x86_64_request_t *req64;
808 	int i;
809 
810 	(void)obj;
811 	if (xbdi->xbdi_ring.ring_n.req_cons != xbdi->xbdi_req_prod) {
812 		switch(xbdi->xbdi_proto) {
813 		case XBDIP_NATIVE:
814 			memcpy(req, RING_GET_REQUEST(&xbdi->xbdi_ring.ring_n,
815 			    xbdi->xbdi_ring.ring_n.req_cons),
816 			    sizeof(blkif_request_t));
817 			break;
818 		case XBDIP_32:
819 			req32 = RING_GET_REQUEST(&xbdi->xbdi_ring.ring_32,
820 			    xbdi->xbdi_ring.ring_n.req_cons);
821 			req->operation = req32->operation;
822 			req->nr_segments = req32->nr_segments;
823 			req->handle = req32->handle;
824 			req->id = req32->id;
825 			req->sector_number = req32->sector_number;
826 			for (i = 0; i < req->nr_segments; i++)
827 				req->seg[i] = req32->seg[i];
828 			break;
829 
830 		case XBDIP_64:
831 			req64 = RING_GET_REQUEST(&xbdi->xbdi_ring.ring_64,
832 			    xbdi->xbdi_ring.ring_n.req_cons);
833 			req->operation = req64->operation;
834 			req->nr_segments = req64->nr_segments;
835 			req->handle = req64->handle;
836 			req->id = req64->id;
837 			req->sector_number = req64->sector_number;
838 			for (i = 0; i < req->nr_segments; i++)
839 				req->seg[i] = req64->seg[i];
840 			break;
841 		}
842 		XENPRINTF(("xbdback op %d req_cons 0x%x req_prod 0x%x "
843 		    "resp_prod 0x%x id %" PRIu64 "\n", req->operation,
844 			xbdi->xbdi_ring.ring_n.req_cons,
845 			xbdi->xbdi_req_prod,
846 			xbdi->xbdi_ring.ring_n.rsp_prod_pvt,
847 			req->id));
848 		switch(req->operation) {
849 		case BLKIF_OP_READ:
850 		case BLKIF_OP_WRITE:
851 			xbdi->xbdi_cont = xbdback_co_io;
852 			break;
853 		default:
854 			printf("xbdback_evthandler domain %d: unknown "
855 			    "operation %d\n", xbdi->xbdi_domid, req->operation);
856 			xbdback_send_reply(xbdi, req->id, req->operation,
857 			    BLKIF_RSP_ERROR);
858 			xbdi->xbdi_cont = xbdback_co_main_incr;
859 			break;
860 		}
861 	} else {
862 		xbdi->xbdi_cont = xbdback_co_main_done;
863 	}
864 	return xbdi;
865 }
866 
867 static void *
868 xbdback_co_main_incr(struct xbdback_instance *xbdi, void *obj)
869 {
870 	(void)obj;
871 	xbdi->xbdi_ring.ring_n.req_cons++;
872 	xbdi->xbdi_cont = xbdback_co_main_loop;
873 	return xbdi;
874 }
875 
876 static void *
877 xbdback_co_main_done(struct xbdback_instance *xbdi, void *obj)
878 {
879 	(void)obj;
880 	if (xbdi->xbdi_io != NULL) {
881 		xbdi->xbdi_cont = xbdback_co_flush;
882 		xbdi->xbdi_cont_aux = xbdback_co_main_done2;
883 	} else {
884 		xbdi->xbdi_cont = xbdback_co_main_done2;
885 	}
886 	return xbdi;
887 }
888 
889 static void *
890 xbdback_co_main_done2(struct xbdback_instance *xbdi, void *obj)
891 {
892 	int work_to_do;
893 
894 	RING_FINAL_CHECK_FOR_REQUESTS(&xbdi->xbdi_ring.ring_n, work_to_do);
895 	if (work_to_do)
896 		xbdi->xbdi_cont = xbdback_co_main;
897 	else
898 		xbdi->xbdi_cont = NULL;
899 	return xbdi;
900 }
901 
902 static void *
903 xbdback_co_io(struct xbdback_instance *xbdi, void *obj)
904 {
905 	int error;
906 
907 	(void)obj;
908 	if (xbdi->xbdi_xen_req.nr_segments < 1 ||
909 	    xbdi->xbdi_xen_req.nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST ) {
910 		printf("xbdback_io domain %d: %d segments\n",
911 		       xbdi->xbdi_domid, xbdi->xbdi_xen_req.nr_segments);
912 		error = EINVAL;
913 		goto end;
914 	}
915 	if (xbdi->xbdi_xen_req.operation == BLKIF_OP_WRITE) {
916 		if (xbdi->xbdi_ro) {
917 			error = EROFS;
918 			goto end;
919 		}
920 	}
921 
922 	xbdi->xbdi_segno = 0;
923 
924 	xbdi->xbdi_cont = xbdback_co_io_gotreq;
925 	return xbdback_pool_get(&xbdback_request_pool, xbdi);
926  end:
927 	xbdback_send_reply(xbdi, xbdi->xbdi_xen_req.id,
928 	    xbdi->xbdi_xen_req.operation, error);
929 	xbdi->xbdi_cont = xbdback_co_main_incr;
930 	return xbdi;
931 }
932 
933 static void *
934 xbdback_co_io_gotreq(struct xbdback_instance *xbdi, void *obj)
935 {
936 	struct xbdback_request *xrq;
937 
938 	xrq = xbdi->xbdi_req = obj;
939 
940 	xrq->rq_xbdi = xbdi;
941 	xrq->rq_iocount = 0;
942 	xrq->rq_ioerrs = 0;
943 	xrq->rq_id = xbdi->xbdi_xen_req.id;
944 	xrq->rq_operation = xbdi->xbdi_xen_req.operation;
945 
946 	/*
947 	 * Request-level reasons not to coalesce: different device,
948 	 * different op, or noncontiguous disk sectors (vs. previous
949 	 * request handed to us).
950 	 */
951 	xbdi->xbdi_cont = xbdback_co_io_loop;
952 	if (xbdi->xbdi_io != NULL) {
953 		struct xbdback_request *last_req;
954 		last_req = SLIST_FIRST(&xbdi->xbdi_io->xio_rq)->car;
955 		XENPRINTF(("xbdback_io domain %d: hoping for sector %" PRIu64
956 		    "; got %" PRIu64 "\n", xbdi->xbdi_domid,
957 		    xbdi->xbdi_next_sector,
958 		    xbdi->xbdi_xen_req.sector_number));
959 		if ((xrq->rq_operation != last_req->rq_operation)
960 		    || (xbdi->xbdi_xen_req.sector_number !=
961 		    xbdi->xbdi_next_sector)) {
962 			XENPRINTF(("xbdback_io domain %d: segment break\n",
963 			    xbdi->xbdi_domid));
964 			xbdi->xbdi_next_sector =
965 			    xbdi->xbdi_xen_req.sector_number;
966 			xbdi->xbdi_cont_aux = xbdi->xbdi_cont;
967 			xbdi->xbdi_cont = xbdback_co_flush;
968 		}
969 	} else {
970 		xbdi->xbdi_next_sector = xbdi->xbdi_xen_req.sector_number;
971 	}
972 	return xbdi;
973 }
974 
975 
976 static void *
977 xbdback_co_io_loop(struct xbdback_instance *xbdi, void *obj)
978 {
979 	struct xbdback_io *xio;
980 
981 	(void)obj;
982 	if (xbdi->xbdi_segno < xbdi->xbdi_xen_req.nr_segments) {
983 		uint8_t this_fs, this_ls, last_fs, last_ls;
984 		grant_ref_t thisgrt, lastgrt;
985 		/*
986 		 * Segment-level reason to coalesce: handling full
987 		 * pages, or adjacent sector ranges from the same page
988 		 * (and yes, this latter does happen).  But not if the
989 		 * array of client pseudo-physical pages is full.
990 		 */
991 		this_fs = xbdi->xbdi_xen_req.seg[xbdi->xbdi_segno].first_sect;
992 		this_ls = xbdi->xbdi_xen_req.seg[xbdi->xbdi_segno].last_sect;
993 		thisgrt = xbdi->xbdi_xen_req.seg[xbdi->xbdi_segno].gref;
994 		XENPRINTF(("xbdback_io domain %d: "
995 			   "first,last_sect[%d]=0%o,0%o\n",
996 			   xbdi->xbdi_domid, xbdi->xbdi_segno,
997 			   this_fs, this_ls));
998 		last_fs = xbdi->xbdi_last_fs = xbdi->xbdi_this_fs;
999 		last_ls = xbdi->xbdi_last_ls = xbdi->xbdi_this_ls;
1000 		lastgrt = xbdi->xbdi_lastgrt = xbdi->xbdi_thisgrt;
1001 		xbdi->xbdi_this_fs = this_fs;
1002 		xbdi->xbdi_this_ls = this_ls;
1003 		xbdi->xbdi_thisgrt = thisgrt;
1004 		if (xbdi->xbdi_io != NULL) {
1005 			if (last_ls == VBD_MAXSECT
1006 			    && this_fs == 0
1007 			    && xbdi->xbdi_io->xio_nrma
1008 			    < XENSHM_MAX_PAGES_PER_REQUEST) {
1009 				xbdi->xbdi_same_page = 0;
1010 			} else if (last_ls + 1
1011 				       == this_fs
1012 #ifdef notyet
1013 				   && (last_fas & ~PAGE_MASK)
1014 				       == (this_fas & ~PAGE_MASK)
1015 #else
1016 				  && 0 /* can't know frame number yet */
1017 #endif
1018 			    ) {
1019 #ifdef DEBUG
1020 				static struct timeval gluetimer;
1021 				if (ratecheck(&gluetimer,
1022 					      &xbdback_fragio_intvl))
1023 					printf("xbdback: domain %d sending"
1024 					    " excessively fragmented I/O\n",
1025 					    xbdi->xbdi_domid);
1026 #endif
1027 				printf("xbdback_io: would maybe glue same page sec %d (%d->%d)\n", xbdi->xbdi_segno, this_fs, this_ls);
1028 				panic("notyet!");
1029 				XENPRINTF(("xbdback_io domain %d: glue same "
1030 				    "page", xbdi->xbdi_domid));
1031 				xbdi->xbdi_same_page = 1;
1032 			} else {
1033 				xbdi->xbdi_cont_aux = xbdback_co_io_loop;
1034 				xbdi->xbdi_cont = xbdback_co_flush;
1035 				return xbdi;
1036 			}
1037 		} else
1038 			xbdi->xbdi_same_page = 0;
1039 
1040 		if (xbdi->xbdi_io == NULL) {
1041 			xbdi->xbdi_cont = xbdback_co_io_gotio;
1042 			xio = xbdback_pool_get(&xbdback_io_pool, xbdi);
1043 			buf_init(&xio->xio_buf);
1044 			return xio;
1045 		} else {
1046 			xbdi->xbdi_cont = xbdback_co_io_gotio2;
1047 		}
1048 	} else {
1049 		/* done with the loop over segments; get next request */
1050 		xbdi->xbdi_cont = xbdback_co_main_incr;
1051 	}
1052 	return xbdi;
1053 }
1054 
1055 
1056 static void *
1057 xbdback_co_io_gotio(struct xbdback_instance *xbdi, void *obj)
1058 
1059 {
1060 	struct xbdback_io *xbd_io;
1061 	vaddr_t start_offset; /* start offset in vm area */
1062 	int buf_flags;
1063 
1064 	xbdi_get(xbdi);
1065 
1066 	xbd_io = xbdi->xbdi_io = obj;
1067 	xbd_io->xio_xbdi = xbdi;
1068 	SLIST_INIT(&xbd_io->xio_rq);
1069 	xbd_io->xio_nrma = 0;
1070 	xbd_io->xio_mapped = 0;
1071 
1072 	start_offset = xbdi->xbdi_this_fs * VBD_BSIZE;
1073 
1074 	if (xbdi->xbdi_xen_req.operation == BLKIF_OP_WRITE) {
1075 		buf_flags = B_WRITE;
1076 	} else {
1077 		buf_flags = B_READ;
1078 	}
1079 
1080 	xbd_io->xio_buf.b_flags = buf_flags;
1081 	xbd_io->xio_buf.b_cflags = 0;
1082 	xbd_io->xio_buf.b_oflags = 0;
1083 	xbd_io->xio_buf.b_iodone = xbdback_iodone;
1084 	xbd_io->xio_buf.b_proc = NULL;
1085 	xbd_io->xio_buf.b_vp = xbdi->xbdi_vp;
1086 	xbd_io->xio_buf.b_objlock = &xbdi->xbdi_vp->v_interlock;
1087 	xbd_io->xio_buf.b_dev = xbdi->xbdi_dev;
1088 	xbd_io->xio_buf.b_blkno = xbdi->xbdi_next_sector;
1089 	xbd_io->xio_buf.b_bcount = 0;
1090 	xbd_io->xio_buf.b_data = (void *)start_offset;
1091 	xbd_io->xio_buf.b_private = xbd_io;
1092 
1093 	xbdi->xbdi_cont = xbdback_co_io_gotio2;
1094 	return xbdi;
1095 }
1096 
1097 
1098 static void *
1099 xbdback_co_io_gotio2(struct xbdback_instance *xbdi, void *obj)
1100 {
1101 	(void)obj;
1102 	if (xbdi->xbdi_segno == 0 || SLIST_EMPTY(&xbdi->xbdi_io->xio_rq)) {
1103 		/* if this is the first segment of a new request */
1104 		/* or if it's the first segment of the io */
1105 		xbdi->xbdi_cont = xbdback_co_io_gotfrag;
1106 		return xbdback_pool_get(&xbdback_fragment_pool, xbdi);
1107 	}
1108 	xbdi->xbdi_cont = xbdback_co_io_gotfrag2;
1109 	return xbdi;
1110 }
1111 
1112 
1113 static void *
1114 xbdback_co_io_gotfrag(struct xbdback_instance *xbdi, void *obj)
1115 {
1116 	struct xbdback_fragment *xbd_fr;
1117 
1118 	xbd_fr = obj;
1119 	xbd_fr->car = xbdi->xbdi_req;
1120 	SLIST_INSERT_HEAD(&xbdi->xbdi_io->xio_rq, xbd_fr, cdr);
1121 	++xbdi->xbdi_req->rq_iocount;
1122 
1123 	xbdi->xbdi_cont = xbdback_co_io_gotfrag2;
1124 	return xbdi;
1125 }
1126 
1127 static void *
1128 xbdback_co_io_gotfrag2(struct xbdback_instance *xbdi, void *obj)
1129 {
1130 	struct xbdback_io *xbd_io;
1131 	int seg_size;
1132 	uint8_t this_fs, this_ls;
1133 
1134 	this_fs = xbdi->xbdi_this_fs;
1135 	this_ls = xbdi->xbdi_this_ls;
1136 	xbd_io = xbdi->xbdi_io;
1137 	seg_size = this_ls - this_fs + 1;
1138 
1139 	if (seg_size < 0) {
1140 		printf("xbdback_io domain %d: negative-size request (%d %d)\n",
1141 		       xbdi->xbdi_domid, this_ls, this_fs);
1142 		xbdback_io_error(xbdi->xbdi_io, EINVAL);
1143 		xbdi->xbdi_io = NULL;
1144 		xbdi->xbdi_cont = xbdback_co_main_incr;
1145 		return xbdi;
1146 	}
1147 
1148 	if (!xbdi->xbdi_same_page) {
1149 		XENPRINTF(("xbdback_io domain %d: appending grant %u\n",
1150 			   xbdi->xbdi_domid, (u_int)xbdi->xbdi_thisgrt));
1151 		xbd_io->xio_gref[xbd_io->xio_nrma++] = xbdi->xbdi_thisgrt;
1152 	}
1153 
1154 	xbd_io->xio_buf.b_bcount += (daddr_t)(seg_size * VBD_BSIZE);
1155 	XENPRINTF(("xbdback_io domain %d: start sect %d size %d\n",
1156 	    xbdi->xbdi_domid, (int)xbdi->xbdi_next_sector, seg_size));
1157 
1158 	/* Finally, the end of the segment loop! */
1159 	xbdi->xbdi_next_sector += seg_size;
1160 	++xbdi->xbdi_segno;
1161 	xbdi->xbdi_cont = xbdback_co_io_loop;
1162 	return xbdi;
1163 }
1164 
1165 
1166 static void *
1167 xbdback_co_flush(struct xbdback_instance *xbdi, void *obj)
1168 {
1169 	(void)obj;
1170 	XENPRINTF(("xbdback_io domain %d: flush sect %ld size %d ptr 0x%lx\n",
1171 	    xbdi->xbdi_domid, (long)xbdi->xbdi_io->xio_buf.b_blkno,
1172 	    (int)xbdi->xbdi_io->xio_buf.b_bcount, (long)xbdi->xbdi_io));
1173 	xbdi->xbdi_cont = xbdback_co_flush_done;
1174 	return xbdback_map_shm(xbdi->xbdi_io);
1175 }
1176 
1177 static void *
1178 xbdback_co_flush_done(struct xbdback_instance *xbdi, void *obj)
1179 {
1180 	(void)obj;
1181 	xbdback_do_io(xbdi->xbdi_io);
1182 	xbdi->xbdi_io = NULL;
1183 	xbdi->xbdi_cont = xbdi->xbdi_cont_aux;
1184 	return xbdi;
1185 }
1186 
1187 static void
1188 xbdback_io_error(struct xbdback_io *xbd_io, int error)
1189 {
1190 	xbd_io->xio_buf.b_error = error;
1191 	xbdback_iodone(&xbd_io->xio_buf);
1192 }
1193 
1194 static void
1195 xbdback_do_io(struct xbdback_io *xbd_io)
1196 {
1197 	xbd_io->xio_buf.b_data =
1198 	    (void *)((vaddr_t)xbd_io->xio_buf.b_data + xbd_io->xio_vaddr);
1199 #ifdef DIAGNOSTIC
1200 	{
1201 	vaddr_t bdata = (vaddr_t)xbd_io->xio_buf.b_data;
1202 	int nsegs =
1203 	    ((((bdata + xbd_io->xio_buf.b_bcount - 1) & ~PAGE_MASK) -
1204 	    (bdata & ~PAGE_MASK)) >> PAGE_SHIFT) + 1;
1205 	if ((bdata & ~PAGE_MASK) != (xbd_io->xio_vaddr & ~PAGE_MASK)) {
1206 		printf("xbdback_do_io vaddr 0x%lx bdata 0x%lx\n",
1207 		    xbd_io->xio_vaddr, bdata);
1208 		panic("xbdback_do_io: bdata page change");
1209 	}
1210 	if (nsegs > xbd_io->xio_nrma) {
1211 		printf("xbdback_do_io vaddr 0x%lx bcount 0x%x doesn't fit in "
1212 		    "%d pages\n", bdata, xbd_io->xio_buf.b_bcount,
1213 		    xbd_io->xio_nrma);
1214 		panic("xbdback_do_io: not enough pages");
1215 	}
1216 	}
1217 #endif
1218 	if ((xbd_io->xio_buf.b_flags & B_READ) == 0)
1219 		xbd_io->xio_buf.b_vp->v_numoutput++;
1220 	bdev_strategy(&xbd_io->xio_buf);
1221 }
1222 
1223 /* This gets reused by xbdback_io_error to report errors from other sources. */
1224 static void
1225 xbdback_iodone(struct buf *bp)
1226 {
1227 	struct xbdback_io *xbd_io;
1228 	struct xbdback_instance *xbdi;
1229 	int errp;
1230 
1231 	xbd_io = bp->b_private;
1232 	xbdi = xbd_io->xio_xbdi;
1233 
1234 	XENPRINTF(("xbdback_io domain %d: iodone ptr 0x%lx\n",
1235 		   xbdi->xbdi_domid, (long)xbd_io));
1236 
1237 	if (xbd_io->xio_mapped)
1238 		xbdback_unmap_shm(xbd_io);
1239 
1240 	if (bp->b_error != 0) {
1241 		printf("xbd IO domain %d: error %d\n",
1242 		       xbdi->xbdi_domid, bp->b_error);
1243 		errp = 1;
1244 	} else
1245 		errp = 0;
1246 
1247 
1248 	/* for each constituent xbd request */
1249 	while(!SLIST_EMPTY(&xbd_io->xio_rq)) {
1250 		struct xbdback_fragment *xbd_fr;
1251 		struct xbdback_request *xbd_req;
1252 		struct xbdback_instance *rxbdi;
1253 		int error;
1254 
1255 		xbd_fr = SLIST_FIRST(&xbd_io->xio_rq);
1256 		xbd_req = xbd_fr->car;
1257 		SLIST_REMOVE_HEAD(&xbd_io->xio_rq, cdr);
1258 		xbdback_pool_put(&xbdback_fragment_pool, xbd_fr);
1259 
1260 		if (errp)
1261 			++xbd_req->rq_ioerrs;
1262 
1263 		/* finalize it only if this was its last I/O */
1264 		if (--xbd_req->rq_iocount > 0)
1265 			continue;
1266 
1267 		rxbdi = xbd_req->rq_xbdi;
1268 		KASSERT(xbdi == rxbdi);
1269 
1270 		error = xbd_req->rq_ioerrs > 0
1271 		    ? BLKIF_RSP_ERROR
1272 		    : BLKIF_RSP_OKAY;
1273 
1274 		XENPRINTF(("xbdback_io domain %d: end request %" PRIu64 " error=%d\n",
1275 		    xbdi->xbdi_domid, xbd_req->rq_id, error));
1276 		xbdback_send_reply(xbdi, xbd_req->rq_id,
1277 		    xbd_req->rq_operation, error);
1278 		xbdback_pool_put(&xbdback_request_pool, xbd_req);
1279 	}
1280 	xbdi_put(xbdi);
1281 	buf_destroy(&xbd_io->xio_buf);
1282 	xbdback_pool_put(&xbdback_io_pool, xbd_io);
1283 }
1284 
1285 /*
1286  * Called once a request has completed: place the reply in the ring and
1287  * notify the guest OS.
1288  */
1289 static void
1290 xbdback_send_reply(struct xbdback_instance *xbdi, uint64_t id,
1291     int op, int status)
1292 {
1293 	blkif_response_t *resp_n;
1294 	blkif_x86_32_response_t *resp32;
1295 	blkif_x86_64_response_t *resp64;
1296 	int notify;
1297 
1298 	switch(xbdi->xbdi_proto) {
1299 	case XBDIP_NATIVE:
1300 		resp_n = RING_GET_RESPONSE(&xbdi->xbdi_ring.ring_n,
1301 		    xbdi->xbdi_ring.ring_n.rsp_prod_pvt);
1302 		resp_n->id        = id;
1303 		resp_n->operation = op;
1304 		resp_n->status    = status;
1305 		break;
1306 	case XBDIP_32:
1307 		resp32 = RING_GET_RESPONSE(&xbdi->xbdi_ring.ring_32,
1308 		    xbdi->xbdi_ring.ring_n.rsp_prod_pvt);
1309 		resp32->id        = id;
1310 		resp32->operation = op;
1311 		resp32->status    = status;
1312 		break;
1313 	case XBDIP_64:
1314 		resp64 = RING_GET_RESPONSE(&xbdi->xbdi_ring.ring_64,
1315 		    xbdi->xbdi_ring.ring_n.rsp_prod_pvt);
1316 		resp64->id        = id;
1317 		resp64->operation = op;
1318 		resp64->status    = status;
1319 		break;
1320 	}
1321 	xbdi->xbdi_ring.ring_n.rsp_prod_pvt++;
1322 	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xbdi->xbdi_ring.ring_n, notify);
1323 	if (notify) {
1324 		XENPRINTF(("xbdback_send_reply notify %d\n", xbdi->xbdi_domid));
1325 		hypervisor_notify_via_evtchn(xbdi->xbdi_evtchn);
1326 	}
1327 }
1328 
1329 /*
1330  * Map a request into our virtual address space.  The xbd_io->xio_gref
1331  * array is to be filled in by the caller.
1332  */
1333 static void *
1334 xbdback_map_shm(struct xbdback_io *xbd_io)
1335 {
1336 	struct xbdback_instance *xbdi;
1337 	struct xbdback_request *xbd_rq;
1338 	int error, s;
1339 
1340 #ifdef XENDEBUG_VBD
1341 	int i;
1342 	printf("xbdback_map_shm map grant ");
1343 	for (i = 0; i < xbd_io->xio_nrma; i++) {
1344 		printf("%u ", (u_int)xbd_io->xio_gref[i]);
1345 	}
1346 #endif
1347 
1348 	KASSERT(xbd_io->xio_mapped == 0);
1349 
1350 	xbdi = xbd_io->xio_xbdi;
1351 	xbd_rq = SLIST_FIRST(&xbd_io->xio_rq)->car;
1352 	error = xen_shm_map(xbd_io->xio_nrma, xbdi->xbdi_domid,
1353 	    xbd_io->xio_gref, &xbd_io->xio_vaddr, xbd_io->xio_gh,
1354 	    (xbd_rq->rq_operation == BLKIF_OP_WRITE) ? XSHM_RO: 0);
1355 
1356 	switch(error) {
1357 	case 0:
1358 #ifdef XENDEBUG_VBD
1359 		printf("handle ");
1360 		for (i = 0; i < xbd_io->xio_nrma; i++) {
1361 			printf("%u ", (u_int)xbd_io->xio_gh[i]);
1362 		}
1363 		printf("\n");
1364 #endif
1365 		xbd_io->xio_mapped = 1;
1366 		return (void *)xbd_io->xio_vaddr;
1367 	case ENOMEM:
1368 		s = splvm();
1369 		if (!xbdback_shmcb) {
1370 			if (xen_shm_callback(xbdback_shm_callback, xbdi)
1371 			    != 0) {
1372 				splx(s);
1373 				panic("xbdback_map_shm: "
1374 				      "xen_shm_callback failed");
1375 			}
1376 			xbdback_shmcb = 1;
1377 		}
1378 		SIMPLEQ_INSERT_TAIL(&xbdback_shmq, xbdi, xbdi_on_hold);
1379 		splx(s);
1380 		return NULL;
1381 	default:
1382 		printf("xbdback_map_shm: xen_shm error %d ",
1383 		       error);
1384 		xbdback_io_error(xbdi->xbdi_io, error);
1385 		xbdi->xbdi_io = NULL;
1386 		xbdi->xbdi_cont = xbdi->xbdi_cont_aux;
1387 		return xbdi;
1388 	}
1389 }
1390 
1391 static int
1392 xbdback_shm_callback(void *arg)
1393 {
1394 	int error, s;
1395 
1396 	s = splvm();
1397 	while(!SIMPLEQ_EMPTY(&xbdback_shmq)) {
1398 		struct xbdback_instance *xbdi;
1399 		struct xbdback_io *xbd_io;
1400 		struct xbdback_request *xbd_rq;
1401 
1402 		xbdi = SIMPLEQ_FIRST(&xbdback_shmq);
1403 		xbd_io = xbdi->xbdi_io;
1404 		xbd_rq = SLIST_FIRST(&xbd_io->xio_rq)->car;
1405 		KASSERT(xbd_io->xio_mapped == 0);
1406 
1407 		error = xen_shm_map(xbd_io->xio_nrma,
1408 		    xbdi->xbdi_domid, xbd_io->xio_gref,
1409 		    &xbd_io->xio_vaddr, xbd_io->xio_gh,
1410 		    XSHM_CALLBACK |
1411 		    ((xbd_rq->rq_operation == BLKIF_OP_WRITE) ? XSHM_RO: 0));
1412 		switch(error) {
1413 		case ENOMEM:
1414 			splx(s);
1415 			return -1; /* will try again later */
1416 		case 0:
1417 			xbd_io->xio_mapped = 1;
1418 			SIMPLEQ_REMOVE_HEAD(&xbdback_shmq, xbdi_on_hold);
1419 			splx(s);
1420 			xbdback_trampoline(xbdi, xbdi);
1421 			s = splvm();
1422 			break;
1423 		default:
1424 			SIMPLEQ_REMOVE_HEAD(&xbdback_shmq, xbdi_on_hold);
1425 			splx(s);
1426 			printf("xbdback_shm_callback: xen_shm error %d\n",
1427 			       error);
1428 			xbdi->xbdi_cont = xbdi->xbdi_cont_aux;
1429 			xbdback_io_error(xbd_io, error);
1430 			xbdback_trampoline(xbdi, xbdi);
1431 			s = splvm();
1432 			break;
1433 		}
1434 	}
1435 	xbdback_shmcb = 0;
1436 	splx(s);
1437 	return 0;
1438 }
1439 
1440 /* unmap a request from our virtual address space (request is done) */
1441 static void
1442 xbdback_unmap_shm(struct xbdback_io *xbd_io)
1443 {
1444 #ifdef XENDEBUG_VBD
1445 	int i;
1446 	printf("xbdback_unmap_shm handle ");
1447 	for (i = 0; i < xbd_io->xio_nrma; i++) {
1448 		printf("%u ", (u_int)xbd_io->xio_gh[i]);
1449 	}
1450 	printf("\n");
1451 #endif
1452 
1453 	KASSERT(xbd_io->xio_mapped == 1);
1454 	xbd_io->xio_mapped = 0;
1455 	xen_shm_unmap(xbd_io->xio_vaddr, xbd_io->xio_nrma,
1456 	    xbd_io->xio_gh);
1457 	xbd_io->xio_vaddr = -1;
1458 }
1459 
1460 /* Obtain memory from a pool, in cooperation with the continuations. */
1461 static void *xbdback_pool_get(struct xbdback_pool *pp,
1462 			      struct xbdback_instance *xbdi)
1463 {
1464 	int s;
1465 	void *item;
1466 
1467 	item = pool_get(&pp->p, PR_NOWAIT);
1468 	if (item == NULL) {
1469 		if (ratecheck(&pp->last_warning, &xbdback_poolsleep_intvl))
1470 			printf("xbdback_pool_get: %s is full\n",
1471 			       pp->p.pr_wchan);
1472 		s = splvm();
1473 		SIMPLEQ_INSERT_TAIL(&pp->q, xbdi, xbdi_on_hold);
1474 		splx(s);
1475 	}
1476 	return item;
1477 }
1478 
1479 /*
1480  * Restore memory to a pool... unless an xbdback instance had been
1481  * waiting for it, in which case that gets the memory first.
1482  */
1483 static void xbdback_pool_put(struct xbdback_pool *pp, void *item)
1484 {
1485 	int s;
1486 
1487 	s = splvm();
1488 	if (SIMPLEQ_EMPTY(&pp->q)) {
1489 		splx(s);
1490 		pool_put(&pp->p, item);
1491 	} else {
1492 		struct xbdback_instance *xbdi = SIMPLEQ_FIRST(&pp->q);
1493 		SIMPLEQ_REMOVE_HEAD(&pp->q, xbdi_on_hold);
1494 		splx(s);
1495 		xbdback_trampoline(xbdi, item);
1496 	}
1497 }
1498 
1499 static void
1500 xbdback_trampoline(struct xbdback_instance *xbdi, void *obj)
1501 {
1502 	xbdback_cont_t cont;
1503 
1504 	while(obj != NULL && xbdi->xbdi_cont != NULL) {
1505 		cont = xbdi->xbdi_cont;
1506 #ifdef DIAGNOSTIC
1507 		xbdi->xbdi_cont = (xbdback_cont_t)0xDEADBEEF;
1508 #endif
1509 		obj = (*cont)(xbdi, obj);
1510 #ifdef DIAGNOSTIC
1511 		if (xbdi->xbdi_cont == (xbdback_cont_t)0xDEADBEEF) {
1512 			printf("xbdback_trampoline: 0x%lx didn't set "
1513 			       "xbdi->xbdi_cont!\n", (long)cont);
1514 			panic("xbdback_trampoline: bad continuation");
1515 		}
1516 #endif
1517 	}
1518 }
1519