1 /* $NetBSD: xbdback_xenbus.c,v 1.16 2008/03/22 14:21:56 ad Exp $ */ 2 3 /* 4 * Copyright (c) 2006 Manuel Bouyer. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 3. All advertising materials mentioning features or use of this software 15 * must display the following acknowledgement: 16 * This product includes software developed by Manuel Bouyer. 17 * 4. The name of the author may not be used to endorse or promote products 18 * derived from this software without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 21 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 22 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 23 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 24 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 25 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 29 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 * 31 */ 32 33 #include <sys/cdefs.h> 34 __KERNEL_RCSID(0, "$NetBSD: xbdback_xenbus.c,v 1.16 2008/03/22 14:21:56 ad Exp $"); 35 36 #include <sys/types.h> 37 #include <sys/param.h> 38 #include <sys/systm.h> 39 #include <sys/malloc.h> 40 #include <sys/queue.h> 41 #include <sys/kernel.h> 42 #include <sys/conf.h> 43 #include <sys/disk.h> 44 #include <sys/disklabel.h> 45 #include <sys/fcntl.h> 46 #include <sys/vnode.h> 47 #include <sys/kauth.h> 48 49 #include <xen/xen.h> 50 #include <xen/xen_shm.h> 51 #include <xen/evtchn.h> 52 #include <xen/xenbus.h> 53 #include <xen/xen3-public/io/protocols.h> 54 55 /* #define XENDEBUG_VBD */ 56 #ifdef XENDEBUG_VBD 57 #define XENPRINTF(x) printf x 58 #else 59 #define XENPRINTF(x) 60 #endif 61 62 #define BLKIF_RING_SIZE __RING_SIZE((blkif_sring_t *)0, PAGE_SIZE) 63 64 /* 65 * Backend block device driver for Xen 66 */ 67 68 /* Max number of pages per request. The request may not be page aligned */ 69 #define BLKIF_MAX_PAGES_PER_REQUEST (BLKIF_MAX_SEGMENTS_PER_REQUEST + 1) 70 71 /* Values are expressed in 512-byte sectors */ 72 #define VBD_BSIZE 512 73 #define VBD_MAXSECT ((PAGE_SIZE / VBD_BSIZE) - 1) 74 75 struct xbdback_request; 76 struct xbdback_io; 77 struct xbdback_fragment; 78 struct xbdback_instance; 79 80 /* state of a xbdback instance */ 81 typedef enum {CONNECTED, DISCONNECTING, DISCONNECTED} xbdback_state_t; 82 83 /* 84 * Since there are a variety of conditions that can block our I/O 85 * processing, which isn't allowed to suspend its thread's execution, 86 * such things will be done in a sort of continuation-passing style. 87 * 88 * Return value is NULL to indicate that execution has blocked; if 89 * it's finished, set xbdi->xbdi_cont (see below) to NULL and the return 90 * doesn't matter. 
Otherwise it's passed as the second parameter to 91 * the new value of xbdi->xbdi_cont. 92 */ 93 typedef void *(* xbdback_cont_t)(struct xbdback_instance *, void *); 94 95 enum xbdi_proto { 96 XBDIP_NATIVE, 97 XBDIP_32, 98 XBDIP_64 99 }; 100 101 102 /* we keep the xbdback instances in a linked list */ 103 struct xbdback_instance { 104 SLIST_ENTRY(xbdback_instance) next; 105 struct xenbus_device *xbdi_xbusd; /* our xenstore entry */ 106 struct xenbus_watch xbdi_watch; /* to watch our store */ 107 domid_t xbdi_domid; /* attached to this domain */ 108 uint32_t xbdi_handle; /* domain-specific handle */ 109 xbdback_state_t xbdi_status; 110 /* backing device parameters */ 111 dev_t xbdi_dev; 112 const struct bdevsw *xbdi_bdevsw; /* pointer to the device's bdevsw */ 113 struct vnode *xbdi_vp; 114 uint64_t xbdi_size; 115 int xbdi_ro; /* is device read-only ? */ 116 /* parameters for the communication */ 117 unsigned int xbdi_evtchn; 118 /* private parameters for communication */ 119 blkif_back_ring_proto_t xbdi_ring; 120 enum xbdi_proto xbdi_proto; 121 grant_handle_t xbdi_ring_handle; /* to unmap the ring */ 122 vaddr_t xbdi_ring_va; /* to unmap the ring */ 123 /* disconnection must be postponed until all I/O is done */ 124 volatile unsigned xbdi_refcnt; 125 /* 126 * State for I/O processing/coalescing follows; this has to 127 * live here instead of on the stack because of the 128 * continuation-ness (see above). 129 */ 130 RING_IDX xbdi_req_prod; /* limit on request indices */ 131 xbdback_cont_t xbdi_cont, xbdi_cont_aux; 132 SIMPLEQ_ENTRY(xbdback_instance) xbdi_on_hold; /* waiting on resources */ 133 /* _request state */ 134 struct xbdback_request *xbdi_req; /* if NULL, ignore following */ 135 blkif_request_t xbdi_xen_req; 136 int xbdi_segno; 137 /* _io state */ 138 struct xbdback_io *xbdi_io; /* if NULL, ignore next field */ 139 daddr_t xbdi_next_sector; 140 uint8_t xbdi_last_fs, xbdi_this_fs; /* first sectors */ 141 uint8_t xbdi_last_ls, xbdi_this_ls; /* last sectors */ 142 grant_ref_t xbdi_thisgrt, xbdi_lastgrt; /* grants */ 143 /* other state */ 144 int xbdi_same_page; /* are we merging two segments on the same page? */ 145 }; 146 /* Manipulation of the above reference count. */ 147 /* XXXjld@panix.com: not MP-safe, and move the i386 asm elsewhere. */ 148 #define xbdi_get(xbdip) (++(xbdip)->xbdi_refcnt) 149 #define xbdi_put(xbdip) \ 150 do { \ 151 __asm volatile("decl %0" \ 152 : "=m"((xbdip)->xbdi_refcnt) : "m"((xbdip)->xbdi_refcnt)); \ 153 if (0 == (xbdip)->xbdi_refcnt) \ 154 xbdback_finish_disconnect(xbdip); \ 155 } while (/* CONSTCOND */ 0) 156 157 SLIST_HEAD(, xbdback_instance) xbdback_instances; 158 159 /* 160 * For each request from a guest, a xbdback_request is allocated from 161 * a pool. This will describe the request until completion. The 162 * request may require multiple IO operations to perform, so the 163 * per-IO information is not stored here. 164 */ 165 struct xbdback_request { 166 struct xbdback_instance *rq_xbdi; /* our xbd instance */ 167 uint64_t rq_id; 168 int rq_iocount; /* reference count; or, number of outstanding I/O's */ 169 int rq_ioerrs; 170 uint8_t rq_operation; 171 }; 172 173 /* 174 * For each I/O operation associated with one of those requests, an 175 * xbdback_io is allocated from a pool. It may correspond to multiple 176 * Xen disk requests, or parts of them, if several arrive at once that 177 * can be coalesced. 178 */ 179 struct xbdback_io { 180 struct buf xio_buf; /* our I/O */ 181 /* The instance pointer is duplicated for convenience. 
*/ 182 struct xbdback_instance *xio_xbdi; /* our xbd instance */ 183 SLIST_HEAD(, xbdback_fragment) xio_rq; /* xbd requests involved */ 184 vaddr_t xio_vaddr; /* the virtual address to map the request at */ 185 grant_ref_t xio_gref[XENSHM_MAX_PAGES_PER_REQUEST]; /* grants to map */ 186 grant_handle_t xio_gh[XENSHM_MAX_PAGES_PER_REQUEST];/* grants release */ 187 uint16_t xio_nrma; /* number of guest pages */ 188 uint16_t xio_mapped; 189 }; 190 191 /* 192 * Rather than have the xbdback_io keep an array of the 193 * xbdback_requests involved, since the actual number will probably be 194 * small but might be as large as BLKIF_RING_SIZE, use a list. This 195 * would be threaded through xbdback_request, but one of them might be 196 * part of multiple I/O's, alas. 197 */ 198 struct xbdback_fragment { 199 struct xbdback_request *car; 200 SLIST_ENTRY(xbdback_fragment) cdr; 201 }; 202 203 /* 204 * Wrap our pools with a chain of xbdback_instances whose I/O 205 * processing has blocked for want of memory from that pool. 206 */ 207 struct xbdback_pool { 208 struct pool p; 209 SIMPLEQ_HEAD(xbdback_iqueue, xbdback_instance) q; 210 struct timeval last_warning; 211 } xbdback_request_pool, xbdback_io_pool, xbdback_fragment_pool; 212 static struct xbdback_iqueue xbdback_shmq; 213 static int xbdback_shmcb; /* have we already registered a callback? */ 214 215 struct timeval xbdback_poolsleep_intvl = { 5, 0 }; 216 #ifdef DEBUG 217 struct timeval xbdback_fragio_intvl = { 60, 0 }; 218 #endif 219 void xbdbackattach(int); 220 static int xbdback_xenbus_create(struct xenbus_device *); 221 static int xbdback_xenbus_destroy(void *); 222 static void xbdback_frontend_changed(void *, XenbusState); 223 static void xbdback_backend_changed(struct xenbus_watch *, 224 const char **, unsigned int); 225 static int xbdback_evthandler(void *); 226 static void xbdback_finish_disconnect(struct xbdback_instance *); 227 228 static struct xbdback_instance *xbdif_lookup(domid_t, uint32_t); 229 230 static void *xbdback_co_main(struct xbdback_instance *, void *); 231 static void *xbdback_co_main_loop(struct xbdback_instance *, void *); 232 static void *xbdback_co_main_incr(struct xbdback_instance *, void *); 233 static void *xbdback_co_main_done(struct xbdback_instance *, void *); 234 static void *xbdback_co_main_done2(struct xbdback_instance *, void *); 235 236 static void *xbdback_co_io(struct xbdback_instance *, void *); 237 static void *xbdback_co_io_gotreq(struct xbdback_instance *, void *); 238 static void *xbdback_co_io_loop(struct xbdback_instance *, void *); 239 static void *xbdback_co_io_gotio(struct xbdback_instance *, void *); 240 static void *xbdback_co_io_gotio2(struct xbdback_instance *, void *); 241 static void *xbdback_co_io_gotfrag(struct xbdback_instance *, void *); 242 static void *xbdback_co_io_gotfrag2(struct xbdback_instance *, void *); 243 244 static void *xbdback_co_flush(struct xbdback_instance *, void *); 245 static void *xbdback_co_flush_done(struct xbdback_instance *, void *); 246 247 static int xbdback_shm_callback(void *); 248 static void xbdback_io_error(struct xbdback_io *, int); 249 static void xbdback_do_io(struct xbdback_io *); 250 static void xbdback_iodone(struct buf *); 251 static void xbdback_send_reply(struct xbdback_instance *, uint64_t , int , int); 252 253 static void *xbdback_map_shm(struct xbdback_io *); 254 static void xbdback_unmap_shm(struct xbdback_io *); 255 256 static void *xbdback_pool_get(struct xbdback_pool *, 257 struct xbdback_instance *); 258 static void xbdback_pool_put(struct 
xbdback_pool *, void *);
static void xbdback_trampoline(struct xbdback_instance *, void *);

static struct xenbus_backend_driver xbd_backend_driver = {
	.xbakd_create = xbdback_xenbus_create,
	.xbakd_type = "vbd"
};

void
xbdbackattach(int n)
{
	XENPRINTF(("xbdbackattach\n"));

	/*
	 * initialize the backend driver, register the control message handler
	 * and send the driver-up message.
	 */
	SLIST_INIT(&xbdback_instances);
	SIMPLEQ_INIT(&xbdback_shmq);
	xbdback_shmcb = 0;
	pool_init(&xbdback_request_pool.p, sizeof(struct xbdback_request),
	    0, 0, 0, "xbbrp", NULL, IPL_BIO);
	SIMPLEQ_INIT(&xbdback_request_pool.q);
	pool_init(&xbdback_io_pool.p, sizeof(struct xbdback_io),
	    0, 0, 0, "xbbip", NULL, IPL_BIO);
	SIMPLEQ_INIT(&xbdback_io_pool.q);
	pool_init(&xbdback_fragment_pool.p, sizeof(struct xbdback_fragment),
	    0, 0, 0, "xbbfp", NULL, IPL_BIO);
	SIMPLEQ_INIT(&xbdback_fragment_pool.q);
	/* we allocate enough to handle a whole ring at once */
	if (pool_prime(&xbdback_request_pool.p, BLKIF_RING_SIZE) != 0)
		printf("xbdback: failed to prime request pool\n");
	if (pool_prime(&xbdback_io_pool.p, BLKIF_RING_SIZE) != 0)
		printf("xbdback: failed to prime io pool\n");
	if (pool_prime(&xbdback_fragment_pool.p,
	    BLKIF_MAX_SEGMENTS_PER_REQUEST * BLKIF_RING_SIZE) != 0)
		printf("xbdback: failed to prime fragment pool\n");

	xenbus_backend_register(&xbd_backend_driver);
}

static int
xbdback_xenbus_create(struct xenbus_device *xbusd)
{
	struct xbdback_instance *xbdi;
	long domid, handle;
	int error, i;
	char *ep;

	if ((error = xenbus_read_ul(NULL, xbusd->xbusd_path,
	    "frontend-id", &domid, 10)) != 0) {
		aprint_error("xbdback: can't read %s/frontend-id: %d\n",
		    xbusd->xbusd_path, error);
		return error;
	}

	/*
	 * get handle: this is the last component of the path, which is
	 * a decimal number. $path/dev contains the device name, which is
	 * not appropriate.
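	 * (A backend path typically looks like backend/vbd/<domid>/<handle>,
	 * so the handle is the trailing decimal component.)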
318 */ 319 for (i = strlen(xbusd->xbusd_path); i > 0; i--) { 320 if (xbusd->xbusd_path[i] == '/') 321 break; 322 } 323 if (i == 0) { 324 aprint_error("xbdback: can't parse %s\n", 325 xbusd->xbusd_path); 326 return EFTYPE; 327 } 328 handle = strtoul(&xbusd->xbusd_path[i+1], &ep, 10); 329 if (*ep != '\0') { 330 aprint_error("xbdback: can't parse %s\n", 331 xbusd->xbusd_path); 332 return EFTYPE; 333 } 334 335 if (xbdif_lookup(domid, handle) != NULL) { 336 return EEXIST; 337 } 338 xbdi = malloc(sizeof(struct xbdback_instance), M_DEVBUF, 339 M_NOWAIT | M_ZERO); 340 if (xbdi == NULL) { 341 return ENOMEM; 342 } 343 xbdi->xbdi_domid = domid; 344 xbdi->xbdi_handle = handle; 345 xbdi->xbdi_status = DISCONNECTED; 346 xbdi->xbdi_refcnt = 1; 347 SLIST_INSERT_HEAD(&xbdback_instances, xbdi, next); 348 xbusd->xbusd_u.b.b_cookie = xbdi; 349 xbusd->xbusd_u.b.b_detach = xbdback_xenbus_destroy; 350 xbusd->xbusd_otherend_changed = xbdback_frontend_changed; 351 xbdi->xbdi_xbusd = xbusd; 352 353 error = xenbus_watch_path2(xbusd, xbusd->xbusd_path, "physical-device", 354 &xbdi->xbdi_watch, xbdback_backend_changed); 355 if (error) { 356 printf("failed to watch on %s/physical-device: %d\n", 357 xbusd->xbusd_path, error); 358 goto fail; 359 } 360 xbdi->xbdi_watch.xbw_dev = xbusd; 361 error = xenbus_switch_state(xbusd, NULL, XenbusStateInitWait); 362 if (error) { 363 printf("failed to switch state on %s: %d\n", 364 xbusd->xbusd_path, error); 365 goto fail2; 366 } 367 return 0; 368 fail2: 369 unregister_xenbus_watch(&xbdi->xbdi_watch); 370 fail: 371 free(xbdi, M_DEVBUF); 372 return error; 373 } 374 375 static int 376 xbdback_xenbus_destroy(void *arg) 377 { 378 struct xbdback_instance *xbdi = arg; 379 struct xenbus_device *xbusd = xbdi->xbdi_xbusd; 380 struct gnttab_unmap_grant_ref ungrop; 381 int err, s; 382 383 XENPRINTF(("xbdback_xenbus_destroy state %d\n", xbdi->xbdi_status)); 384 385 if (xbdi->xbdi_status != DISCONNECTED) { 386 hypervisor_mask_event(xbdi->xbdi_evtchn); 387 event_remove_handler(xbdi->xbdi_evtchn, xbdback_evthandler, 388 xbdi); 389 xbdi->xbdi_status = DISCONNECTING; 390 s = splbio(); 391 xbdi_put(xbdi); 392 while (xbdi->xbdi_status != DISCONNECTED) { 393 tsleep(&xbdi->xbdi_status, PRIBIO, "xbddis", 0); 394 } 395 splx(s); 396 } 397 /* unregister watch */ 398 if (xbdi->xbdi_watch.node) { 399 unregister_xenbus_watch(&xbdi->xbdi_watch); 400 free(xbdi->xbdi_watch.node, M_DEVBUF); 401 xbdi->xbdi_watch.node = NULL; 402 } 403 /* unmap ring */ 404 if (xbdi->xbdi_ring_va != 0) { 405 ungrop.host_addr = xbdi->xbdi_ring_va; 406 ungrop.handle = xbdi->xbdi_ring_handle; 407 ungrop.dev_bus_addr = 0; 408 err = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, 409 &ungrop, 1); 410 if (err) 411 printf("xbdback %s: unmap_grant_ref failed: %d\n", 412 xbusd->xbusd_otherend, err); 413 uvm_km_free(kernel_map, xbdi->xbdi_ring_va, 414 PAGE_SIZE, UVM_KMF_VAONLY); 415 } 416 /* close device */ 417 if (xbdi->xbdi_size) { 418 printf("xbd backend: detach device %s%d%c for domain %d\n", 419 devsw_blk2name(major(xbdi->xbdi_dev)), 420 DISKUNIT(xbdi->xbdi_dev), DISKPART(xbdi->xbdi_dev) + 'a', 421 xbdi->xbdi_domid); 422 vn_close(xbdi->xbdi_vp, FREAD, NOCRED); 423 } 424 SLIST_REMOVE(&xbdback_instances, xbdi, xbdback_instance, next); 425 free(xbdi, M_DEVBUF); 426 return 0; 427 } 428 429 static void 430 xbdback_frontend_changed(void *arg, XenbusState new_state) 431 { 432 struct xbdback_instance *xbdi = arg; 433 struct xenbus_device *xbusd = xbdi->xbdi_xbusd; 434 u_long ring_ref, revtchn; 435 struct gnttab_map_grant_ref grop; 436 struct 
gnttab_unmap_grant_ref ungrop;
	evtchn_op_t evop;
	char evname[16];
	const char *proto;
	char *xsproto;
	int len;
	int err, s;

	XENPRINTF(("xbdback %s: new state %d\n", xbusd->xbusd_path, new_state));
	switch(new_state) {
	case XenbusStateInitialising:
		break;
	case XenbusStateInitialised:
	case XenbusStateConnected:
		if (xbdi->xbdi_status == CONNECTED)
			break;
		/* read communication information */
		err = xenbus_read_ul(NULL, xbusd->xbusd_otherend,
		    "ring-ref", &ring_ref, 10);
		if (err) {
			xenbus_dev_fatal(xbusd, err, "reading %s/ring-ref",
			    xbusd->xbusd_otherend);
			break;
		}
		err = xenbus_read_ul(NULL, xbusd->xbusd_otherend,
		    "event-channel", &revtchn, 10);
		if (err) {
			xenbus_dev_fatal(xbusd, err, "reading %s/event-channel",
			    xbusd->xbusd_otherend);
			break;
		}
		err = xenbus_read(NULL, xbusd->xbusd_otherend, "protocol",
		    &len, &xsproto);
		if (err) {
			proto = "unspecified";
			xbdi->xbdi_proto = XBDIP_NATIVE;
		} else {
			if (strcmp(xsproto, XEN_IO_PROTO_ABI_NATIVE) == 0) {
				xbdi->xbdi_proto = XBDIP_NATIVE;
				proto = XEN_IO_PROTO_ABI_NATIVE;
			} else if (strcmp(xsproto, XEN_IO_PROTO_ABI_X86_32) == 0) {
				xbdi->xbdi_proto = XBDIP_32;
				proto = XEN_IO_PROTO_ABI_X86_32;
			} else if (strcmp(xsproto, XEN_IO_PROTO_ABI_X86_64) == 0) {
				xbdi->xbdi_proto = XBDIP_64;
				proto = XEN_IO_PROTO_ABI_X86_64;
			} else {
				printf("xbd domain %d: unknown proto %s\n",
				    xbdi->xbdi_domid, xsproto);
				free(xsproto, M_DEVBUF);
				return;
			}
			free(xsproto, M_DEVBUF);
		}
		/* allocate VA space and map rings */
		xbdi->xbdi_ring_va = uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
		    UVM_KMF_VAONLY);
		if (xbdi->xbdi_ring_va == 0) {
			xenbus_dev_fatal(xbusd, ENOMEM,
			    "can't get VA for ring", xbusd->xbusd_otherend);
			break;
		}
		grop.host_addr = xbdi->xbdi_ring_va;
		grop.flags = GNTMAP_host_map;
		grop.ref = ring_ref;
		grop.dom = xbdi->xbdi_domid;
		err = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
		    &grop, 1);
		if (err || grop.status) {
			printf("xbdback %s: can't map grant ref: %d/%d\n",
			    xbusd->xbusd_path, err, grop.status);
			xenbus_dev_fatal(xbusd, EINVAL,
			    "can't map ring", xbusd->xbusd_otherend);
			goto err;
		}
		xbdi->xbdi_ring_handle = grop.handle;
		switch(xbdi->xbdi_proto) {
		case XBDIP_NATIVE:
		{
			blkif_sring_t *sring = (void *)xbdi->xbdi_ring_va;
			BACK_RING_INIT(&xbdi->xbdi_ring.ring_n,
			    sring, PAGE_SIZE);
			break;
		}
		case XBDIP_32:
		{
			blkif_x86_32_sring_t *sring =
			    (void *)xbdi->xbdi_ring_va;
			BACK_RING_INIT(&xbdi->xbdi_ring.ring_32,
			    sring, PAGE_SIZE);
			break;
		}
		case XBDIP_64:
		{
			blkif_x86_64_sring_t *sring =
			    (void *)xbdi->xbdi_ring_va;
			BACK_RING_INIT(&xbdi->xbdi_ring.ring_64,
			    sring, PAGE_SIZE);
			break;
		}
		}
		evop.cmd = EVTCHNOP_bind_interdomain;
		evop.u.bind_interdomain.remote_dom = xbdi->xbdi_domid;
		evop.u.bind_interdomain.remote_port = revtchn;
		err = HYPERVISOR_event_channel_op(&evop);
		if (err) {
			printf("blkback %s: can't get event channel: %d\n",
			    xbusd->xbusd_otherend, err);
			xenbus_dev_fatal(xbusd, err,
			    "can't bind event channel", xbusd->xbusd_otherend);
			goto err2;
		}
		xbdi->xbdi_evtchn = evop.u.bind_interdomain.local_port;
		snprintf(evname, sizeof(evname), "xbd%d.%d",
		    xbdi->xbdi_domid, xbdi->xbdi_handle);
		event_set_handler(xbdi->xbdi_evtchn, xbdback_evthandler,
		    xbdi,
IPL_BIO, evname); 553 printf("xbd backend 0x%x for domain %d " 554 "using event channel %d, protocol %s\n", xbdi->xbdi_handle, 555 xbdi->xbdi_domid, xbdi->xbdi_evtchn, proto); 556 hypervisor_enable_event(xbdi->xbdi_evtchn); 557 hypervisor_notify_via_evtchn(xbdi->xbdi_evtchn); 558 xbdi->xbdi_status = CONNECTED; 559 break; 560 case XenbusStateClosing: 561 hypervisor_mask_event(xbdi->xbdi_evtchn); 562 event_remove_handler(xbdi->xbdi_evtchn, xbdback_evthandler, 563 xbdi); 564 xbdi->xbdi_status = DISCONNECTING; 565 s = splbio(); 566 xbdi_put(xbdi); 567 while (xbdi->xbdi_status != DISCONNECTED) { 568 tsleep(&xbdi->xbdi_status, PRIBIO, "xbddis", 0); 569 } 570 splx(s); 571 xenbus_switch_state(xbusd, NULL, XenbusStateClosing); 572 break; 573 case XenbusStateClosed: 574 /* otherend_changed() should handle it for us */ 575 panic("xbdback_frontend_changed: closed\n"); 576 case XenbusStateUnknown: 577 case XenbusStateInitWait: 578 default: 579 aprint_error("xbdback %s: invalid frontend state %d\n", 580 xbusd->xbusd_path, new_state); 581 } 582 return; 583 err2: 584 /* unmap ring */ 585 ungrop.host_addr = xbdi->xbdi_ring_va; 586 ungrop.handle = xbdi->xbdi_ring_handle; 587 ungrop.dev_bus_addr = 0; 588 err = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, 589 &ungrop, 1); 590 if (err) 591 printf("xbdback %s: unmap_grant_ref failed: %d\n", 592 xbusd->xbusd_path, err); 593 err: 594 uvm_km_free(kernel_map, xbdi->xbdi_ring_va, PAGE_SIZE, UVM_KMF_VAONLY); 595 return; 596 } 597 598 static void 599 xbdback_backend_changed(struct xenbus_watch *watch, 600 const char **vec, unsigned int len) 601 { 602 struct xenbus_device *xbusd = watch->xbw_dev; 603 struct xbdback_instance *xbdi = xbusd->xbusd_u.b.b_cookie; 604 int err; 605 long dev; 606 char *mode; 607 struct xenbus_transaction *xbt; 608 const char *devname; 609 int major; 610 611 err = xenbus_read_ul(NULL, xbusd->xbusd_path, "physical-device", 612 &dev, 10); 613 /* 614 * An error can occur as the watch can fire up just after being 615 * registered. 
So we have to ignore error :( 616 */ 617 if (err) 618 return; 619 if (xbdi->xbdi_status == CONNECTED && xbdi->xbdi_dev != dev) { 620 printf("xbdback %s: changing physical device from 0x%x to " 621 "0x%lx not supported\n", xbusd->xbusd_path, xbdi->xbdi_dev, 622 dev); 623 return; 624 } 625 xbdi->xbdi_dev = dev; 626 err = xenbus_read(NULL, xbusd->xbusd_path, "mode", NULL, &mode); 627 if (err) { 628 printf("xbdback: failed to read %s/mode: %d\n", 629 xbusd->xbusd_path, err); 630 return; 631 } 632 if (mode[0] == 'w') 633 xbdi->xbdi_ro = 0; 634 else 635 xbdi->xbdi_ro = 1; 636 major = major(xbdi->xbdi_dev); 637 devname = devsw_blk2name(major); 638 if (devname == NULL) { 639 printf("xbdback %s: unknwon device 0x%x\n", xbusd->xbusd_path, 640 xbdi->xbdi_dev); 641 return; 642 } 643 xbdi->xbdi_bdevsw = bdevsw_lookup(xbdi->xbdi_dev); 644 if (xbdi->xbdi_bdevsw == NULL) { 645 printf("xbdback %s: no bdevsw for device 0x%x\n", 646 xbusd->xbusd_path, xbdi->xbdi_dev); 647 return; 648 } 649 err = bdevvp(xbdi->xbdi_dev, &xbdi->xbdi_vp); 650 if (err) { 651 printf("xbdback %s: can't open device 0x%x: %d\n", 652 xbusd->xbusd_path, xbdi->xbdi_dev, err); 653 return; 654 } 655 err = vn_lock(xbdi->xbdi_vp, LK_EXCLUSIVE | LK_RETRY); 656 if (err) { 657 printf("xbdback %s: can't vn_lock device 0x%x: %d\n", 658 xbusd->xbusd_path, xbdi->xbdi_dev, err); 659 vrele(xbdi->xbdi_vp); 660 return; 661 } 662 err = VOP_OPEN(xbdi->xbdi_vp, FREAD, NOCRED); 663 if (err) { 664 printf("xbdback %s: can't VOP_OPEN device 0x%x: %d\n", 665 xbusd->xbusd_path, xbdi->xbdi_dev, err); 666 vput(xbdi->xbdi_vp); 667 return; 668 } 669 VOP_UNLOCK(xbdi->xbdi_vp, 0); 670 if (strcmp(devname, "dk") == 0) { 671 /* dk device; get wedge data */ 672 struct dkwedge_info wi; 673 err = VOP_IOCTL(xbdi->xbdi_vp, DIOCGWEDGEINFO, &wi, 674 FREAD, NOCRED); 675 if (err) { 676 printf("xbdback %s: can't DIOCGWEDGEINFO device " 677 "0x%x: %d\n", xbusd->xbusd_path, 678 xbdi->xbdi_dev, err); 679 xbdi->xbdi_size = xbdi->xbdi_dev = 0; 680 vn_close(xbdi->xbdi_vp, FREAD, NOCRED); 681 xbdi->xbdi_vp = NULL; 682 return; 683 } 684 xbdi->xbdi_size = wi.dkw_size; 685 printf("xbd backend: attach device %s (size %" PRIu64 ") " 686 "for domain %d\n", wi.dkw_devname, xbdi->xbdi_size, 687 xbdi->xbdi_domid); 688 } else { 689 /* disk device, get partition data */ 690 struct partinfo dpart; 691 err = VOP_IOCTL(xbdi->xbdi_vp, DIOCGPART, &dpart, FREAD, 0); 692 if (err) { 693 printf("xbdback %s: can't DIOCGPART device 0x%x: %d\n", 694 xbusd->xbusd_path, xbdi->xbdi_dev, err); 695 xbdi->xbdi_size = xbdi->xbdi_dev = 0; 696 vn_close(xbdi->xbdi_vp, FREAD, NOCRED); 697 xbdi->xbdi_vp = NULL; 698 return; 699 } 700 xbdi->xbdi_size = dpart.part->p_size; 701 printf("xbd backend: attach device %s%d%c (size %" PRIu64 ") " 702 "for domain %d\n", devname, DISKUNIT(xbdi->xbdi_dev), 703 DISKPART(xbdi->xbdi_dev) + 'a', xbdi->xbdi_size, 704 xbdi->xbdi_domid); 705 } 706 again: 707 xbt = xenbus_transaction_start(); 708 if (xbt == NULL) { 709 printf("xbdback %s: can't start transaction\n", 710 xbusd->xbusd_path); 711 return; 712 } 713 err = xenbus_printf(xbt, xbusd->xbusd_path, "sectors", "%" PRIu64 , 714 xbdi->xbdi_size); 715 if (err) { 716 printf("xbdback: failed to write %s/sectors: %d\n", 717 xbusd->xbusd_path, err); 718 goto abort; 719 } 720 err = xenbus_printf(xbt, xbusd->xbusd_path, "info", "%u", 721 xbdi->xbdi_ro ? 
VDISK_READONLY : 0); 722 if (err) { 723 printf("xbdback: failed to write %s/info: %d\n", 724 xbusd->xbusd_path, err); 725 goto abort; 726 } 727 err = xenbus_printf(xbt, xbusd->xbusd_path, "sector-size", "%lu", 728 (u_long)DEV_BSIZE); 729 if (err) { 730 printf("xbdback: failed to write %s/sector-size: %d\n", 731 xbusd->xbusd_path, err); 732 goto abort; 733 } 734 err = xenbus_transaction_end(xbt, 0); 735 if (err == EAGAIN) 736 goto again; 737 if (err) { 738 printf("xbdback %s: can't end transaction: %d\n", 739 xbusd->xbusd_path, err); 740 } 741 err = xenbus_switch_state(xbusd, NULL, XenbusStateConnected); 742 if (err) { 743 printf("xbdback %s: can't switch state: %d\n", 744 xbusd->xbusd_path, err); 745 } 746 return; 747 abort: 748 xenbus_transaction_end(xbt, 1); 749 } 750 751 752 static void xbdback_finish_disconnect(struct xbdback_instance *xbdi) 753 { 754 KASSERT(xbdi->xbdi_status == DISCONNECTING); 755 756 xbdi->xbdi_status = DISCONNECTED; 757 wakeup(&xbdi->xbdi_status); 758 759 } 760 761 static struct xbdback_instance * 762 xbdif_lookup(domid_t dom , uint32_t handle) 763 { 764 struct xbdback_instance *xbdi; 765 766 SLIST_FOREACH(xbdi, &xbdback_instances, next) { 767 if (xbdi->xbdi_domid == dom && xbdi->xbdi_handle == handle) 768 return xbdi; 769 } 770 return NULL; 771 } 772 773 static int 774 xbdback_evthandler(void *arg) 775 { 776 struct xbdback_instance *xbdi = arg; 777 778 XENPRINTF(("xbdback_evthandler domain %d: cont %p\n", 779 xbdi->xbdi_domid, xbdi->xbdi_cont)); 780 781 if (xbdi->xbdi_cont == NULL) { 782 xbdi->xbdi_cont = xbdback_co_main; 783 xbdback_trampoline(xbdi, xbdi); 784 } 785 return 1; 786 } 787 788 static void * 789 xbdback_co_main(struct xbdback_instance *xbdi, void *obj) 790 { 791 (void)obj; 792 xbdi->xbdi_req_prod = xbdi->xbdi_ring.ring_n.sring->req_prod; 793 x86_lfence(); /* ensure we see all requests up to req_prod */ 794 /* 795 * note that we'll eventually get a full ring of request. 
796 * in this case, MASK_BLKIF_IDX(req_cons) == MASK_BLKIF_IDX(req_prod) 797 */ 798 xbdi->xbdi_cont = xbdback_co_main_loop; 799 return xbdi; 800 } 801 802 static void * 803 xbdback_co_main_loop(struct xbdback_instance *xbdi, void *obj) 804 { 805 blkif_request_t *req = &xbdi->xbdi_xen_req; 806 blkif_x86_32_request_t *req32; 807 blkif_x86_64_request_t *req64; 808 int i; 809 810 (void)obj; 811 if (xbdi->xbdi_ring.ring_n.req_cons != xbdi->xbdi_req_prod) { 812 switch(xbdi->xbdi_proto) { 813 case XBDIP_NATIVE: 814 memcpy(req, RING_GET_REQUEST(&xbdi->xbdi_ring.ring_n, 815 xbdi->xbdi_ring.ring_n.req_cons), 816 sizeof(blkif_request_t)); 817 break; 818 case XBDIP_32: 819 req32 = RING_GET_REQUEST(&xbdi->xbdi_ring.ring_32, 820 xbdi->xbdi_ring.ring_n.req_cons); 821 req->operation = req32->operation; 822 req->nr_segments = req32->nr_segments; 823 req->handle = req32->handle; 824 req->id = req32->id; 825 req->sector_number = req32->sector_number; 826 for (i = 0; i < req->nr_segments; i++) 827 req->seg[i] = req32->seg[i]; 828 break; 829 830 case XBDIP_64: 831 req64 = RING_GET_REQUEST(&xbdi->xbdi_ring.ring_64, 832 xbdi->xbdi_ring.ring_n.req_cons); 833 req->operation = req64->operation; 834 req->nr_segments = req64->nr_segments; 835 req->handle = req64->handle; 836 req->id = req64->id; 837 req->sector_number = req64->sector_number; 838 for (i = 0; i < req->nr_segments; i++) 839 req->seg[i] = req64->seg[i]; 840 break; 841 } 842 XENPRINTF(("xbdback op %d req_cons 0x%x req_prod 0x%x " 843 "resp_prod 0x%x id %" PRIu64 "\n", req->operation, 844 xbdi->xbdi_ring.ring_n.req_cons, 845 xbdi->xbdi_req_prod, 846 xbdi->xbdi_ring.ring_n.rsp_prod_pvt, 847 req->id)); 848 switch(req->operation) { 849 case BLKIF_OP_READ: 850 case BLKIF_OP_WRITE: 851 xbdi->xbdi_cont = xbdback_co_io; 852 break; 853 default: 854 printf("xbdback_evthandler domain %d: unknown " 855 "operation %d\n", xbdi->xbdi_domid, req->operation); 856 xbdback_send_reply(xbdi, req->id, req->operation, 857 BLKIF_RSP_ERROR); 858 xbdi->xbdi_cont = xbdback_co_main_incr; 859 break; 860 } 861 } else { 862 xbdi->xbdi_cont = xbdback_co_main_done; 863 } 864 return xbdi; 865 } 866 867 static void * 868 xbdback_co_main_incr(struct xbdback_instance *xbdi, void *obj) 869 { 870 (void)obj; 871 xbdi->xbdi_ring.ring_n.req_cons++; 872 xbdi->xbdi_cont = xbdback_co_main_loop; 873 return xbdi; 874 } 875 876 static void * 877 xbdback_co_main_done(struct xbdback_instance *xbdi, void *obj) 878 { 879 (void)obj; 880 if (xbdi->xbdi_io != NULL) { 881 xbdi->xbdi_cont = xbdback_co_flush; 882 xbdi->xbdi_cont_aux = xbdback_co_main_done2; 883 } else { 884 xbdi->xbdi_cont = xbdback_co_main_done2; 885 } 886 return xbdi; 887 } 888 889 static void * 890 xbdback_co_main_done2(struct xbdback_instance *xbdi, void *obj) 891 { 892 int work_to_do; 893 894 RING_FINAL_CHECK_FOR_REQUESTS(&xbdi->xbdi_ring.ring_n, work_to_do); 895 if (work_to_do) 896 xbdi->xbdi_cont = xbdback_co_main; 897 else 898 xbdi->xbdi_cont = NULL; 899 return xbdi; 900 } 901 902 static void * 903 xbdback_co_io(struct xbdback_instance *xbdi, void *obj) 904 { 905 int error; 906 907 (void)obj; 908 if (xbdi->xbdi_xen_req.nr_segments < 1 || 909 xbdi->xbdi_xen_req.nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST ) { 910 printf("xbdback_io domain %d: %d segments\n", 911 xbdi->xbdi_domid, xbdi->xbdi_xen_req.nr_segments); 912 error = EINVAL; 913 goto end; 914 } 915 if (xbdi->xbdi_xen_req.operation == BLKIF_OP_WRITE) { 916 if (xbdi->xbdi_ro) { 917 error = EROFS; 918 goto end; 919 } 920 } 921 922 xbdi->xbdi_segno = 0; 923 924 xbdi->xbdi_cont = 
xbdback_co_io_gotreq; 925 return xbdback_pool_get(&xbdback_request_pool, xbdi); 926 end: 927 xbdback_send_reply(xbdi, xbdi->xbdi_xen_req.id, 928 xbdi->xbdi_xen_req.operation, error); 929 xbdi->xbdi_cont = xbdback_co_main_incr; 930 return xbdi; 931 } 932 933 static void * 934 xbdback_co_io_gotreq(struct xbdback_instance *xbdi, void *obj) 935 { 936 struct xbdback_request *xrq; 937 938 xrq = xbdi->xbdi_req = obj; 939 940 xrq->rq_xbdi = xbdi; 941 xrq->rq_iocount = 0; 942 xrq->rq_ioerrs = 0; 943 xrq->rq_id = xbdi->xbdi_xen_req.id; 944 xrq->rq_operation = xbdi->xbdi_xen_req.operation; 945 946 /* 947 * Request-level reasons not to coalesce: different device, 948 * different op, or noncontiguous disk sectors (vs. previous 949 * request handed to us). 950 */ 951 xbdi->xbdi_cont = xbdback_co_io_loop; 952 if (xbdi->xbdi_io != NULL) { 953 struct xbdback_request *last_req; 954 last_req = SLIST_FIRST(&xbdi->xbdi_io->xio_rq)->car; 955 XENPRINTF(("xbdback_io domain %d: hoping for sector %" PRIu64 956 "; got %" PRIu64 "\n", xbdi->xbdi_domid, 957 xbdi->xbdi_next_sector, 958 xbdi->xbdi_xen_req.sector_number)); 959 if ((xrq->rq_operation != last_req->rq_operation) 960 || (xbdi->xbdi_xen_req.sector_number != 961 xbdi->xbdi_next_sector)) { 962 XENPRINTF(("xbdback_io domain %d: segment break\n", 963 xbdi->xbdi_domid)); 964 xbdi->xbdi_next_sector = 965 xbdi->xbdi_xen_req.sector_number; 966 xbdi->xbdi_cont_aux = xbdi->xbdi_cont; 967 xbdi->xbdi_cont = xbdback_co_flush; 968 } 969 } else { 970 xbdi->xbdi_next_sector = xbdi->xbdi_xen_req.sector_number; 971 } 972 return xbdi; 973 } 974 975 976 static void * 977 xbdback_co_io_loop(struct xbdback_instance *xbdi, void *obj) 978 { 979 struct xbdback_io *xio; 980 981 (void)obj; 982 if (xbdi->xbdi_segno < xbdi->xbdi_xen_req.nr_segments) { 983 uint8_t this_fs, this_ls, last_fs, last_ls; 984 grant_ref_t thisgrt, lastgrt; 985 /* 986 * Segment-level reason to coalesce: handling full 987 * pages, or adjacent sector ranges from the same page 988 * (and yes, this latter does happen). But not if the 989 * array of client pseudo-physical pages is full. 
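		 * (first_sect/last_sect are 512-byte sector offsets within
		 * the granted page, so a full page is first_sect == 0 and
		 * last_sect == VBD_MAXSECT.)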
990 */ 991 this_fs = xbdi->xbdi_xen_req.seg[xbdi->xbdi_segno].first_sect; 992 this_ls = xbdi->xbdi_xen_req.seg[xbdi->xbdi_segno].last_sect; 993 thisgrt = xbdi->xbdi_xen_req.seg[xbdi->xbdi_segno].gref; 994 XENPRINTF(("xbdback_io domain %d: " 995 "first,last_sect[%d]=0%o,0%o\n", 996 xbdi->xbdi_domid, xbdi->xbdi_segno, 997 this_fs, this_ls)); 998 last_fs = xbdi->xbdi_last_fs = xbdi->xbdi_this_fs; 999 last_ls = xbdi->xbdi_last_ls = xbdi->xbdi_this_ls; 1000 lastgrt = xbdi->xbdi_lastgrt = xbdi->xbdi_thisgrt; 1001 xbdi->xbdi_this_fs = this_fs; 1002 xbdi->xbdi_this_ls = this_ls; 1003 xbdi->xbdi_thisgrt = thisgrt; 1004 if (xbdi->xbdi_io != NULL) { 1005 if (last_ls == VBD_MAXSECT 1006 && this_fs == 0 1007 && xbdi->xbdi_io->xio_nrma 1008 < XENSHM_MAX_PAGES_PER_REQUEST) { 1009 xbdi->xbdi_same_page = 0; 1010 } else if (last_ls + 1 1011 == this_fs 1012 #ifdef notyet 1013 && (last_fas & ~PAGE_MASK) 1014 == (this_fas & ~PAGE_MASK) 1015 #else 1016 && 0 /* can't know frame number yet */ 1017 #endif 1018 ) { 1019 #ifdef DEBUG 1020 static struct timeval gluetimer; 1021 if (ratecheck(&gluetimer, 1022 &xbdback_fragio_intvl)) 1023 printf("xbdback: domain %d sending" 1024 " excessively fragmented I/O\n", 1025 xbdi->xbdi_domid); 1026 #endif 1027 printf("xbdback_io: would maybe glue same page sec %d (%d->%d)\n", xbdi->xbdi_segno, this_fs, this_ls); 1028 panic("notyet!"); 1029 XENPRINTF(("xbdback_io domain %d: glue same " 1030 "page", xbdi->xbdi_domid)); 1031 xbdi->xbdi_same_page = 1; 1032 } else { 1033 xbdi->xbdi_cont_aux = xbdback_co_io_loop; 1034 xbdi->xbdi_cont = xbdback_co_flush; 1035 return xbdi; 1036 } 1037 } else 1038 xbdi->xbdi_same_page = 0; 1039 1040 if (xbdi->xbdi_io == NULL) { 1041 xbdi->xbdi_cont = xbdback_co_io_gotio; 1042 xio = xbdback_pool_get(&xbdback_io_pool, xbdi); 1043 buf_init(&xio->xio_buf); 1044 return xio; 1045 } else { 1046 xbdi->xbdi_cont = xbdback_co_io_gotio2; 1047 } 1048 } else { 1049 /* done with the loop over segments; get next request */ 1050 xbdi->xbdi_cont = xbdback_co_main_incr; 1051 } 1052 return xbdi; 1053 } 1054 1055 1056 static void * 1057 xbdback_co_io_gotio(struct xbdback_instance *xbdi, void *obj) 1058 1059 { 1060 struct xbdback_io *xbd_io; 1061 vaddr_t start_offset; /* start offset in vm area */ 1062 int buf_flags; 1063 1064 xbdi_get(xbdi); 1065 1066 xbd_io = xbdi->xbdi_io = obj; 1067 xbd_io->xio_xbdi = xbdi; 1068 SLIST_INIT(&xbd_io->xio_rq); 1069 xbd_io->xio_nrma = 0; 1070 xbd_io->xio_mapped = 0; 1071 1072 start_offset = xbdi->xbdi_this_fs * VBD_BSIZE; 1073 1074 if (xbdi->xbdi_xen_req.operation == BLKIF_OP_WRITE) { 1075 buf_flags = B_WRITE; 1076 } else { 1077 buf_flags = B_READ; 1078 } 1079 1080 xbd_io->xio_buf.b_flags = buf_flags; 1081 xbd_io->xio_buf.b_cflags = 0; 1082 xbd_io->xio_buf.b_oflags = 0; 1083 xbd_io->xio_buf.b_iodone = xbdback_iodone; 1084 xbd_io->xio_buf.b_proc = NULL; 1085 xbd_io->xio_buf.b_vp = xbdi->xbdi_vp; 1086 xbd_io->xio_buf.b_objlock = &xbdi->xbdi_vp->v_interlock; 1087 xbd_io->xio_buf.b_dev = xbdi->xbdi_dev; 1088 xbd_io->xio_buf.b_blkno = xbdi->xbdi_next_sector; 1089 xbd_io->xio_buf.b_bcount = 0; 1090 xbd_io->xio_buf.b_data = (void *)start_offset; 1091 xbd_io->xio_buf.b_private = xbd_io; 1092 1093 xbdi->xbdi_cont = xbdback_co_io_gotio2; 1094 return xbdi; 1095 } 1096 1097 1098 static void * 1099 xbdback_co_io_gotio2(struct xbdback_instance *xbdi, void *obj) 1100 { 1101 (void)obj; 1102 if (xbdi->xbdi_segno == 0 || SLIST_EMPTY(&xbdi->xbdi_io->xio_rq)) { 1103 /* if this is the first segment of a new request */ 1104 /* or if it's the first segment of the 
io */ 1105 xbdi->xbdi_cont = xbdback_co_io_gotfrag; 1106 return xbdback_pool_get(&xbdback_fragment_pool, xbdi); 1107 } 1108 xbdi->xbdi_cont = xbdback_co_io_gotfrag2; 1109 return xbdi; 1110 } 1111 1112 1113 static void * 1114 xbdback_co_io_gotfrag(struct xbdback_instance *xbdi, void *obj) 1115 { 1116 struct xbdback_fragment *xbd_fr; 1117 1118 xbd_fr = obj; 1119 xbd_fr->car = xbdi->xbdi_req; 1120 SLIST_INSERT_HEAD(&xbdi->xbdi_io->xio_rq, xbd_fr, cdr); 1121 ++xbdi->xbdi_req->rq_iocount; 1122 1123 xbdi->xbdi_cont = xbdback_co_io_gotfrag2; 1124 return xbdi; 1125 } 1126 1127 static void * 1128 xbdback_co_io_gotfrag2(struct xbdback_instance *xbdi, void *obj) 1129 { 1130 struct xbdback_io *xbd_io; 1131 int seg_size; 1132 uint8_t this_fs, this_ls; 1133 1134 this_fs = xbdi->xbdi_this_fs; 1135 this_ls = xbdi->xbdi_this_ls; 1136 xbd_io = xbdi->xbdi_io; 1137 seg_size = this_ls - this_fs + 1; 1138 1139 if (seg_size < 0) { 1140 printf("xbdback_io domain %d: negative-size request (%d %d)\n", 1141 xbdi->xbdi_domid, this_ls, this_fs); 1142 xbdback_io_error(xbdi->xbdi_io, EINVAL); 1143 xbdi->xbdi_io = NULL; 1144 xbdi->xbdi_cont = xbdback_co_main_incr; 1145 return xbdi; 1146 } 1147 1148 if (!xbdi->xbdi_same_page) { 1149 XENPRINTF(("xbdback_io domain %d: appending grant %u\n", 1150 xbdi->xbdi_domid, (u_int)xbdi->xbdi_thisgrt)); 1151 xbd_io->xio_gref[xbd_io->xio_nrma++] = xbdi->xbdi_thisgrt; 1152 } 1153 1154 xbd_io->xio_buf.b_bcount += (daddr_t)(seg_size * VBD_BSIZE); 1155 XENPRINTF(("xbdback_io domain %d: start sect %d size %d\n", 1156 xbdi->xbdi_domid, (int)xbdi->xbdi_next_sector, seg_size)); 1157 1158 /* Finally, the end of the segment loop! */ 1159 xbdi->xbdi_next_sector += seg_size; 1160 ++xbdi->xbdi_segno; 1161 xbdi->xbdi_cont = xbdback_co_io_loop; 1162 return xbdi; 1163 } 1164 1165 1166 static void * 1167 xbdback_co_flush(struct xbdback_instance *xbdi, void *obj) 1168 { 1169 (void)obj; 1170 XENPRINTF(("xbdback_io domain %d: flush sect %ld size %d ptr 0x%lx\n", 1171 xbdi->xbdi_domid, (long)xbdi->xbdi_io->xio_buf.b_blkno, 1172 (int)xbdi->xbdi_io->xio_buf.b_bcount, (long)xbdi->xbdi_io)); 1173 xbdi->xbdi_cont = xbdback_co_flush_done; 1174 return xbdback_map_shm(xbdi->xbdi_io); 1175 } 1176 1177 static void * 1178 xbdback_co_flush_done(struct xbdback_instance *xbdi, void *obj) 1179 { 1180 (void)obj; 1181 xbdback_do_io(xbdi->xbdi_io); 1182 xbdi->xbdi_io = NULL; 1183 xbdi->xbdi_cont = xbdi->xbdi_cont_aux; 1184 return xbdi; 1185 } 1186 1187 static void 1188 xbdback_io_error(struct xbdback_io *xbd_io, int error) 1189 { 1190 xbd_io->xio_buf.b_error = error; 1191 xbdback_iodone(&xbd_io->xio_buf); 1192 } 1193 1194 static void 1195 xbdback_do_io(struct xbdback_io *xbd_io) 1196 { 1197 xbd_io->xio_buf.b_data = 1198 (void *)((vaddr_t)xbd_io->xio_buf.b_data + xbd_io->xio_vaddr); 1199 #ifdef DIAGNOSTIC 1200 { 1201 vaddr_t bdata = (vaddr_t)xbd_io->xio_buf.b_data; 1202 int nsegs = 1203 ((((bdata + xbd_io->xio_buf.b_bcount - 1) & ~PAGE_MASK) - 1204 (bdata & ~PAGE_MASK)) >> PAGE_SHIFT) + 1; 1205 if ((bdata & ~PAGE_MASK) != (xbd_io->xio_vaddr & ~PAGE_MASK)) { 1206 printf("xbdback_do_io vaddr 0x%lx bdata 0x%lx\n", 1207 xbd_io->xio_vaddr, bdata); 1208 panic("xbdback_do_io: bdata page change"); 1209 } 1210 if (nsegs > xbd_io->xio_nrma) { 1211 printf("xbdback_do_io vaddr 0x%lx bcount 0x%x doesn't fit in " 1212 " %d pages\n", bdata, xbd_io->xio_buf.b_bcount, 1213 xbd_io->xio_nrma); 1214 panic("xbdback_do_io: not enough pages"); 1215 } 1216 } 1217 #endif 1218 if ((xbd_io->xio_buf.b_flags & B_READ) == 0) 1219 
xbd_io->xio_buf.b_vp->v_numoutput++; 1220 bdev_strategy(&xbd_io->xio_buf); 1221 } 1222 1223 /* This gets reused by xbdback_io_error to report errors from other sources. */ 1224 static void 1225 xbdback_iodone(struct buf *bp) 1226 { 1227 struct xbdback_io *xbd_io; 1228 struct xbdback_instance *xbdi; 1229 int errp; 1230 1231 xbd_io = bp->b_private; 1232 xbdi = xbd_io->xio_xbdi; 1233 1234 XENPRINTF(("xbdback_io domain %d: iodone ptr 0x%lx\n", 1235 xbdi->xbdi_domid, (long)xbd_io)); 1236 1237 if (xbd_io->xio_mapped) 1238 xbdback_unmap_shm(xbd_io); 1239 1240 if (bp->b_error != 0) { 1241 printf("xbd IO domain %d: error %d\n", 1242 xbdi->xbdi_domid, bp->b_error); 1243 errp = 1; 1244 } else 1245 errp = 0; 1246 1247 1248 /* for each constituent xbd request */ 1249 while(!SLIST_EMPTY(&xbd_io->xio_rq)) { 1250 struct xbdback_fragment *xbd_fr; 1251 struct xbdback_request *xbd_req; 1252 struct xbdback_instance *rxbdi; 1253 int error; 1254 1255 xbd_fr = SLIST_FIRST(&xbd_io->xio_rq); 1256 xbd_req = xbd_fr->car; 1257 SLIST_REMOVE_HEAD(&xbd_io->xio_rq, cdr); 1258 xbdback_pool_put(&xbdback_fragment_pool, xbd_fr); 1259 1260 if (errp) 1261 ++xbd_req->rq_ioerrs; 1262 1263 /* finalize it only if this was its last I/O */ 1264 if (--xbd_req->rq_iocount > 0) 1265 continue; 1266 1267 rxbdi = xbd_req->rq_xbdi; 1268 KASSERT(xbdi == rxbdi); 1269 1270 error = xbd_req->rq_ioerrs > 0 1271 ? BLKIF_RSP_ERROR 1272 : BLKIF_RSP_OKAY; 1273 1274 XENPRINTF(("xbdback_io domain %d: end request %" PRIu64 " error=%d\n", 1275 xbdi->xbdi_domid, xbd_req->rq_id, error)); 1276 xbdback_send_reply(xbdi, xbd_req->rq_id, 1277 xbd_req->rq_operation, error); 1278 xbdback_pool_put(&xbdback_request_pool, xbd_req); 1279 } 1280 xbdi_put(xbdi); 1281 buf_destroy(&xbd_io->xio_buf); 1282 xbdback_pool_put(&xbdback_io_pool, xbd_io); 1283 } 1284 1285 /* 1286 * called once a request has completed. Place the reply in the ring and 1287 * notify the guest OS 1288 */ 1289 static void 1290 xbdback_send_reply(struct xbdback_instance *xbdi, uint64_t id, 1291 int op, int status) 1292 { 1293 blkif_response_t *resp_n; 1294 blkif_x86_32_response_t *resp32; 1295 blkif_x86_64_response_t *resp64; 1296 int notify; 1297 1298 switch(xbdi->xbdi_proto) { 1299 case XBDIP_NATIVE: 1300 resp_n = RING_GET_RESPONSE(&xbdi->xbdi_ring.ring_n, 1301 xbdi->xbdi_ring.ring_n.rsp_prod_pvt); 1302 resp_n->id = id; 1303 resp_n->operation = op; 1304 resp_n->status = status; 1305 break; 1306 case XBDIP_32: 1307 resp32 = RING_GET_RESPONSE(&xbdi->xbdi_ring.ring_32, 1308 xbdi->xbdi_ring.ring_n.rsp_prod_pvt); 1309 resp32->id = id; 1310 resp32->operation = op; 1311 resp32->status = status; 1312 break; 1313 case XBDIP_64: 1314 resp64 = RING_GET_RESPONSE(&xbdi->xbdi_ring.ring_64, 1315 xbdi->xbdi_ring.ring_n.rsp_prod_pvt); 1316 resp64->id = id; 1317 resp64->operation = op; 1318 resp64->status = status; 1319 break; 1320 } 1321 xbdi->xbdi_ring.ring_n.rsp_prod_pvt++; 1322 RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xbdi->xbdi_ring.ring_n, notify); 1323 if (notify) { 1324 XENPRINTF(("xbdback_send_reply notify %d\n", xbdi->xbdi_domid)); 1325 hypervisor_notify_via_evtchn(xbdi->xbdi_evtchn); 1326 } 1327 } 1328 1329 /* 1330 * Map a request into our virtual address space. The xbd_req->rq_ma 1331 * array is to be filled out by the caller. 
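 * (In this version the grant references to map are the ones accumulated
 * in xbd_io->xio_gref[]; xen_shm_map() fills xio_gh[] with the resulting
 * handles.)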
1332 */ 1333 static void * 1334 xbdback_map_shm(struct xbdback_io *xbd_io) 1335 { 1336 struct xbdback_instance *xbdi; 1337 struct xbdback_request *xbd_rq; 1338 int error, s; 1339 1340 #ifdef XENDEBUG_VBD 1341 int i; 1342 printf("xbdback_map_shm map grant "); 1343 for (i = 0; i < xbd_io->xio_nrma; i++) { 1344 printf("%u ", (u_int)xbd_io->xio_gref[i]); 1345 } 1346 #endif 1347 1348 KASSERT(xbd_io->xio_mapped == 0); 1349 1350 xbdi = xbd_io->xio_xbdi; 1351 xbd_rq = SLIST_FIRST(&xbd_io->xio_rq)->car; 1352 error = xen_shm_map(xbd_io->xio_nrma, xbdi->xbdi_domid, 1353 xbd_io->xio_gref, &xbd_io->xio_vaddr, xbd_io->xio_gh, 1354 (xbd_rq->rq_operation == BLKIF_OP_WRITE) ? XSHM_RO: 0); 1355 1356 switch(error) { 1357 case 0: 1358 #ifdef XENDEBUG_VBD 1359 printf("handle "); 1360 for (i = 0; i < xbd_io->xio_nrma; i++) { 1361 printf("%u ", (u_int)xbd_io->xio_gh[i]); 1362 } 1363 printf("\n"); 1364 #endif 1365 xbd_io->xio_mapped = 1; 1366 return (void *)xbd_io->xio_vaddr; 1367 case ENOMEM: 1368 s = splvm(); 1369 if (!xbdback_shmcb) { 1370 if (xen_shm_callback(xbdback_shm_callback, xbdi) 1371 != 0) { 1372 splx(s); 1373 panic("xbdback_map_shm: " 1374 "xen_shm_callback failed"); 1375 } 1376 xbdback_shmcb = 1; 1377 } 1378 SIMPLEQ_INSERT_TAIL(&xbdback_shmq, xbdi, xbdi_on_hold); 1379 splx(s); 1380 return NULL; 1381 default: 1382 printf("xbdback_map_shm: xen_shm error %d ", 1383 error); 1384 xbdback_io_error(xbdi->xbdi_io, error); 1385 xbdi->xbdi_io = NULL; 1386 xbdi->xbdi_cont = xbdi->xbdi_cont_aux; 1387 return xbdi; 1388 } 1389 } 1390 1391 static int 1392 xbdback_shm_callback(void *arg) 1393 { 1394 int error, s; 1395 1396 s = splvm(); 1397 while(!SIMPLEQ_EMPTY(&xbdback_shmq)) { 1398 struct xbdback_instance *xbdi; 1399 struct xbdback_io *xbd_io; 1400 struct xbdback_request *xbd_rq; 1401 1402 xbdi = SIMPLEQ_FIRST(&xbdback_shmq); 1403 xbd_io = xbdi->xbdi_io; 1404 xbd_rq = SLIST_FIRST(&xbd_io->xio_rq)->car; 1405 KASSERT(xbd_io->xio_mapped == 0); 1406 1407 error = xen_shm_map(xbd_io->xio_nrma, 1408 xbdi->xbdi_domid, xbd_io->xio_gref, 1409 &xbd_io->xio_vaddr, xbd_io->xio_gh, 1410 XSHM_CALLBACK | 1411 ((xbd_rq->rq_operation == BLKIF_OP_WRITE) ? XSHM_RO: 0)); 1412 switch(error) { 1413 case ENOMEM: 1414 splx(s); 1415 return -1; /* will try again later */ 1416 case 0: 1417 xbd_io->xio_mapped = 1; 1418 SIMPLEQ_REMOVE_HEAD(&xbdback_shmq, xbdi_on_hold); 1419 splx(s); 1420 xbdback_trampoline(xbdi, xbdi); 1421 s = splvm(); 1422 break; 1423 default: 1424 SIMPLEQ_REMOVE_HEAD(&xbdback_shmq, xbdi_on_hold); 1425 splx(s); 1426 printf("xbdback_shm_callback: xen_shm error %d\n", 1427 error); 1428 xbdi->xbdi_cont = xbdi->xbdi_cont_aux; 1429 xbdback_io_error(xbd_io, error); 1430 xbdback_trampoline(xbdi, xbdi); 1431 s = splvm(); 1432 break; 1433 } 1434 } 1435 xbdback_shmcb = 0; 1436 splx(s); 1437 return 0; 1438 } 1439 1440 /* unmap a request from our virtual address space (request is done) */ 1441 static void 1442 xbdback_unmap_shm(struct xbdback_io *xbd_io) 1443 { 1444 #ifdef XENDEBUG_VBD 1445 int i; 1446 printf("xbdback_unmap_shm handle "); 1447 for (i = 0; i < xbd_io->xio_nrma; i++) { 1448 printf("%u ", (u_int)xbd_io->xio_gh[i]); 1449 } 1450 printf("\n"); 1451 #endif 1452 1453 KASSERT(xbd_io->xio_mapped == 1); 1454 xbd_io->xio_mapped = 0; 1455 xen_shm_unmap(xbd_io->xio_vaddr, xbd_io->xio_nrma, 1456 xbd_io->xio_gh); 1457 xbd_io->xio_vaddr = -1; 1458 } 1459 1460 /* Obtain memory from a pool, in cooperation with the continuations. 
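 * If the pool is empty, queue the instance on the pool's wait queue and
 * return NULL; xbdback_pool_put() will later hand the freed item back to
 * the waiting instance via xbdback_trampoline().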
 */
static void *xbdback_pool_get(struct xbdback_pool *pp,
    struct xbdback_instance *xbdi)
{
	int s;
	void *item;

	item = pool_get(&pp->p, PR_NOWAIT);
	if (item == NULL) {
		if (ratecheck(&pp->last_warning, &xbdback_poolsleep_intvl))
			printf("xbdback_pool_get: %s is full\n",
			    pp->p.pr_wchan);
		s = splvm();
		SIMPLEQ_INSERT_TAIL(&pp->q, xbdi, xbdi_on_hold);
		splx(s);
	}
	return item;
}

/*
 * Restore memory to a pool... unless an xbdback instance had been
 * waiting for it, in which case that gets the memory first.
 */
static void xbdback_pool_put(struct xbdback_pool *pp, void *item)
{
	int s;

	s = splvm();
	if (SIMPLEQ_EMPTY(&pp->q)) {
		splx(s);
		pool_put(&pp->p, item);
	} else {
		struct xbdback_instance *xbdi = SIMPLEQ_FIRST(&pp->q);
		SIMPLEQ_REMOVE_HEAD(&pp->q, xbdi_on_hold);
		splx(s);
		xbdback_trampoline(xbdi, item);
	}
}

static void
xbdback_trampoline(struct xbdback_instance *xbdi, void *obj)
{
	xbdback_cont_t cont;

	while(obj != NULL && xbdi->xbdi_cont != NULL) {
		cont = xbdi->xbdi_cont;
#ifdef DIAGNOSTIC
		xbdi->xbdi_cont = (xbdback_cont_t)0xDEADBEEF;
#endif
		obj = (*cont)(xbdi, obj);
#ifdef DIAGNOSTIC
		if (xbdi->xbdi_cont == (xbdback_cont_t)0xDEADBEEF) {
			printf("xbdback_trampoline: 0x%lx didn't set "
			    "xbdi->xbdi_cont!\n", (long)cont);
			panic("xbdback_trampoline: bad continuation");
		}
#endif
	}
}