1 /* $NetBSD: xbd_xenbus.c,v 1.134 2023/07/25 16:15:50 bouyer Exp $ */
2
3 /*
4 * Copyright (c) 2006 Manuel Bouyer.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
19 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 *
26 */
27
28 /*
29 * This file contains the xbd frontend code required for block-level
30 * communication (similar to hard disks) between two Xen domains.
31 *
32 * The backend never sends us unsolicited requests. The protocol is
33 * therefore fairly simple and uses only one ring to communicate with the
34 * backend: the frontend posts requests to the ring, then waits for their
35 * replies asynchronously.
36 *
37 * xbd follows NetBSD's disk(9) convention. At any time, an LWP can schedule
38 * an operation request for the device (be it open(), read(), write(), ...).
39 * Calls are typically processed this way:
40 * - initiate request: xbdread/write/open/ioctl/..
41 * - depending on the operation, it is handled directly by the disk(9)
42 * subsystem or goes through physio(9) first.
43 * - the request is ultimately processed by xbd_diskstart(), which prepares
44 * the xbd requests, posts them in the ring I/O queue, then signals the backend.
45 *
46 * When a response is available in the queue, the backend signals the frontend
47 * via its event channel. This triggers xbd_handler(), which will link back
48 * the response to its request through the request ID, and mark the I/O as
49 * completed.
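 *
 * Transfers larger than XBD_MAX_CHUNK are either split in two ring requests
 * (linked through req_parent/req_child, see xbd_diskstart()) or, when the
 * backend supports indirect segments, sent as a single indirect request.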
50 */
51
52 #include <sys/cdefs.h>
53 __KERNEL_RCSID(0, "$NetBSD: xbd_xenbus.c,v 1.134 2023/07/25 16:15:50 bouyer Exp $");
54
55 #include "opt_xen.h"
56
57
58 #include <sys/param.h>
59 #include <sys/buf.h>
60 #include <sys/bufq.h>
61 #include <sys/device.h>
62 #include <sys/disk.h>
63 #include <sys/disklabel.h>
64 #include <sys/conf.h>
65 #include <sys/fcntl.h>
66 #include <sys/kernel.h>
67 #include <sys/proc.h>
68 #include <sys/systm.h>
69 #include <sys/stat.h>
70 #include <sys/vnode.h>
71 #include <sys/mutex.h>
72
73 #include <dev/dkvar.h>
74
75 #include <uvm/uvm.h>
76
77 #include <xen/intr.h>
78 #include <xen/hypervisor.h>
79 #include <xen/evtchn.h>
80 #include <xen/granttables.h>
81 #include <xen/include/public/io/blkif.h>
82 #include <xen/include/public/io/protocols.h>
83
84 #include <xen/xenbus.h>
85 #include "locators.h"
86
87 #undef XBD_DEBUG
88 #ifdef XBD_DEBUG
89 #define DPRINTF(x) printf x;
90 #else
91 #define DPRINTF(x)
92 #endif
93
94 #define GRANT_INVALID_REF -1
95
96 #define XBD_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE)
97 #define XBD_MAX_XFER (PAGE_SIZE * BLKIF_MAX_SEGMENTS_PER_REQUEST)
98 #define XBD_MAX_CHUNK 32*1024 /* max I/O size we process in 1 req */
99 #define XBD_XFER_LIMIT (2*XBD_MAX_XFER)
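/*
 * XBD_MAX_XFER is the largest transfer a single blkif request can describe
 * (BLKIF_MAX_SEGMENTS_PER_REQUEST pages). Without indirect segments, a buf
 * larger than XBD_MAX_CHUNK is split in two requests (see xbd_diskstart()),
 * which is enough since MAXPHYS <= 2*XBD_MAX_CHUNK. XBD_XFER_LIMIT bounds
 * the size of a single buf (see xbdminphys()) and sizes the number of
 * segments in each request's DMA map.
 */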
100
101 #define XEN_BSHIFT 9 /* log2(XEN_BSIZE) */
102 #define XEN_BSIZE (1 << XEN_BSHIFT)
103
104 CTASSERT((MAXPHYS <= 2*XBD_MAX_CHUNK));
105 CTASSERT(XEN_BSIZE == DEV_BSIZE);
106
107 struct xbd_indirect {
108 SLIST_ENTRY(xbd_indirect) in_next;
109 struct blkif_request_segment *in_addr;
110 grant_ref_t in_gntref;
111 };
112
113 struct xbd_req {
114 SLIST_ENTRY(xbd_req) req_next;
115 uint16_t req_id; /* ID passed to backend */
116 bus_dmamap_t req_dmamap;
117 struct xbd_req *req_parent, *req_child;
118 bool req_parent_done;
119 union {
120 struct {
121 grant_ref_t req_gntref[XBD_XFER_LIMIT >> PAGE_SHIFT];
122 struct buf *req_bp; /* buffer associated with this request */
123 void *req_data; /* pointer to the data buffer */
124 struct xbd_indirect *req_indirect; /* indirect page */
125 } req_rw;
126 struct {
127 int s_error;
128 int s_done;
129 } req_sync;
130 } u;
131 };
132 #define req_gntref u.req_rw.req_gntref
133 #define req_bp u.req_rw.req_bp
134 #define req_data u.req_rw.req_data
135 #define req_indirect u.req_rw.req_indirect
136 #define req_sync u.req_sync
137
138 struct xbd_xenbus_softc {
139 struct dk_softc sc_dksc; /* Must be first in this struct */
140 struct xenbus_device *sc_xbusd;
141 unsigned int sc_evtchn;
142
143 struct intrhand *sc_ih; /* Interrupt handler for this instance. */
144 kmutex_t sc_lock;
145 kcondvar_t sc_cache_flush_cv;
146 kcondvar_t sc_req_cv;
147 kcondvar_t sc_detach_cv;
148 kcondvar_t sc_suspend_cv;
149
150 blkif_front_ring_t sc_ring;
151 grant_ref_t sc_ring_gntref;
152
153 struct xbd_req sc_reqs[XBD_RING_SIZE];
154 SLIST_HEAD(,xbd_req) sc_xbdreq_head; /* list of free requests */
155
156 struct xbd_indirect sc_indirect[XBD_RING_SIZE];
157 SLIST_HEAD(,xbd_indirect) sc_indirect_head;
158
159 vmem_addr_t sc_unalign_buffer;
160 void *sc_unalign_used;
161
162 int sc_backend_status; /* our status with backend */
163 #define BLKIF_STATE_DISCONNECTED 0
164 #define BLKIF_STATE_CONNECTED 1
165 #define BLKIF_STATE_SUSPENDED 2
166
167 int sc_shutdown;
168 #define BLKIF_SHUTDOWN_RUN 0 /* no shutdown */
169 #define BLKIF_SHUTDOWN_REMOTE 1 /* backend-initiated shutdown in progress */
170 #define BLKIF_SHUTDOWN_LOCAL 2 /* locally-initiated shutdown in progress */
171
172 uint64_t sc_sectors; /* number of sc_secsize sectors for this device */
173 u_long sc_secsize; /* sector size */
174 uint64_t sc_xbdsize; /* size of disk in DEV_BSIZE */
175 u_long sc_info; /* VDISK_* */
176 u_long sc_handle; /* from backend */
177 int sc_features;
178 #define BLKIF_FEATURE_CACHE_FLUSH 0x1
179 #define BLKIF_FEATURE_BARRIER 0x2
180 #define BLKIF_FEATURE_PERSISTENT 0x4
181 #define BLKIF_FEATURE_INDIRECT 0x8
182 #define BLKIF_FEATURE_BITS \
183 "\20\1CACHE-FLUSH\2BARRIER\3PERSISTENT\4INDIRECT"
184 struct evcnt sc_cnt_map_unalign;
185 struct evcnt sc_cnt_unalign_busy;
186 struct evcnt sc_cnt_queue_full;
187 struct evcnt sc_cnt_indirect;
188 };
189
190 static int xbd_xenbus_match(device_t, cfdata_t, void *);
191 static void xbd_xenbus_attach(device_t, device_t, void *);
192 static int xbd_xenbus_detach(device_t, int);
193
194 static bool xbd_xenbus_suspend(device_t, const pmf_qual_t *);
195 static bool xbd_xenbus_resume(device_t, const pmf_qual_t *);
196
197 static int xbd_handler(void *);
198 static int xbd_diskstart(device_t, struct buf *);
199 static void xbd_iosize(device_t, int *);
200 static void xbd_backend_changed(void *, XenbusState);
201 static void xbd_connect(struct xbd_xenbus_softc *);
202 static void xbd_features(struct xbd_xenbus_softc *);
203
204 static void xbd_diskstart_submit(struct xbd_xenbus_softc *, int,
205 struct buf *bp, int, bus_dmamap_t, grant_ref_t *);
206 static void xbd_diskstart_submit_indirect(struct xbd_xenbus_softc *,
207 struct xbd_req *, struct buf *bp);
208 static int xbd_map_align(struct xbd_xenbus_softc *, struct xbd_req *);
209 static void xbd_unmap_align(struct xbd_xenbus_softc *, struct xbd_req *,
210 struct buf *);
211
212 static void xbdminphys(struct buf *);
213
214 CFATTACH_DECL3_NEW(xbd, sizeof(struct xbd_xenbus_softc),
215 xbd_xenbus_match, xbd_xenbus_attach, xbd_xenbus_detach, NULL, NULL, NULL,
216 DVF_DETACH_SHUTDOWN);
217
218 static dev_type_open(xbdopen);
219 static dev_type_close(xbdclose);
220 static dev_type_read(xbdread);
221 static dev_type_write(xbdwrite);
222 static dev_type_ioctl(xbdioctl);
223 static dev_type_strategy(xbdstrategy);
224 static dev_type_dump(xbddump);
225 static dev_type_size(xbdsize);
226
227 const struct bdevsw xbd_bdevsw = {
228 .d_open = xbdopen,
229 .d_close = xbdclose,
230 .d_strategy = xbdstrategy,
231 .d_ioctl = xbdioctl,
232 .d_dump = xbddump,
233 .d_psize = xbdsize,
234 .d_discard = nodiscard,
235 .d_flag = D_DISK | D_MPSAFE
236 };
237
238 const struct cdevsw xbd_cdevsw = {
239 .d_open = xbdopen,
240 .d_close = xbdclose,
241 .d_read = xbdread,
242 .d_write = xbdwrite,
243 .d_ioctl = xbdioctl,
244 .d_stop = nostop,
245 .d_tty = notty,
246 .d_poll = nopoll,
247 .d_mmap = nommap,
248 .d_kqfilter = nokqfilter,
249 .d_discard = nodiscard,
250 .d_flag = D_DISK | D_MPSAFE
251 };
252
253 extern struct cfdriver xbd_cd;
254
255 static const struct dkdriver xbddkdriver = {
256 .d_strategy = xbdstrategy,
257 .d_minphys = xbdminphys,
258 .d_open = xbdopen,
259 .d_close = xbdclose,
260 .d_diskstart = xbd_diskstart,
261 .d_iosize = xbd_iosize,
262 };
263
264 static int
265 xbd_xenbus_match(device_t parent, cfdata_t match, void *aux)
266 {
267 struct xenbusdev_attach_args *xa = aux;
268
269 if (strcmp(xa->xa_type, "vbd") != 0)
270 return 0;
271
272 if (match->cf_loc[XENBUSCF_ID] != XENBUSCF_ID_DEFAULT &&
273 match->cf_loc[XENBUSCF_ID] != xa->xa_id)
274 return 0;
275
276 return 1;
277 }
278
279 static void
280 xbd_xenbus_attach(device_t parent, device_t self, void *aux)
281 {
282 struct xbd_xenbus_softc *sc = device_private(self);
283 struct xenbusdev_attach_args *xa = aux;
284 blkif_sring_t *ring;
285 RING_IDX i;
286
287 config_pending_incr(self);
288 aprint_normal(": Xen Virtual Block Device Interface\n");
289
290 dk_init(&sc->sc_dksc, self, DKTYPE_ESDI);
291 disk_init(&sc->sc_dksc.sc_dkdev, device_xname(self), &xbddkdriver);
292
293 sc->sc_xbusd = xa->xa_xbusd;
294 sc->sc_xbusd->xbusd_otherend_changed = xbd_backend_changed;
295
296 mutex_init(&sc->sc_lock, MUTEX_DEFAULT, IPL_BIO);
297 cv_init(&sc->sc_cache_flush_cv, "xbdsync");
298 cv_init(&sc->sc_req_cv, "xbdreq");
299 cv_init(&sc->sc_detach_cv, "xbddetach");
300 cv_init(&sc->sc_suspend_cv, "xbdsuspend");
301
302 xbd_features(sc);
303
304 /* initialize free requests list */
305 SLIST_INIT(&sc->sc_xbdreq_head);
306 for (i = 0; i < XBD_RING_SIZE; i++) {
307 sc->sc_reqs[i].req_id = i;
308 SLIST_INSERT_HEAD(&sc->sc_xbdreq_head, &sc->sc_reqs[i],
309 req_next);
310 }
311
312 if (sc->sc_features & BLKIF_FEATURE_INDIRECT) {
313 /* initialize indirect page list */
314 for (i = 0; i < XBD_RING_SIZE; i++) {
315 vmem_addr_t va;
316 if (uvm_km_kmem_alloc(kmem_va_arena,
317 PAGE_SIZE, VM_SLEEP | VM_INSTANTFIT, &va) != 0) {
318 aprint_error_dev(self,
319 "can't alloc indirect pages\n");
320 return;
321 }
322 sc->sc_indirect[i].in_addr = (void *)va;
323 SLIST_INSERT_HEAD(&sc->sc_indirect_head,
324 &sc->sc_indirect[i], in_next);
325 }
326 }
327
328 sc->sc_backend_status = BLKIF_STATE_DISCONNECTED;
329 sc->sc_shutdown = BLKIF_SHUTDOWN_REMOTE;
330
331 ring = (void *)uvm_km_alloc(kernel_map, PAGE_SIZE, 0, UVM_KMF_WIRED);
332 if (ring == NULL)
333 panic("%s: can't alloc ring", device_xname(self));
334 sc->sc_ring.sring = ring;
335
336 evcnt_attach_dynamic(&sc->sc_cnt_map_unalign, EVCNT_TYPE_MISC,
337 NULL, device_xname(self), "map unaligned");
338 evcnt_attach_dynamic(&sc->sc_cnt_unalign_busy, EVCNT_TYPE_MISC,
339 NULL, device_xname(self), "map unaligned busy");
340 evcnt_attach_dynamic(&sc->sc_cnt_queue_full, EVCNT_TYPE_MISC,
341 NULL, device_xname(self), "queue full");
342 evcnt_attach_dynamic(&sc->sc_cnt_indirect, EVCNT_TYPE_MISC,
343 NULL, device_xname(self), "indirect segment");
344
345 for (i = 0; i < XBD_RING_SIZE; i++) {
346 if (bus_dmamap_create(sc->sc_xbusd->xbusd_dmat,
347 MAXPHYS, XBD_XFER_LIMIT >> PAGE_SHIFT,
348 PAGE_SIZE, PAGE_SIZE, BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW,
349 &sc->sc_reqs[i].req_dmamap) != 0) {
350 aprint_error_dev(self, "can't alloc dma maps\n");
351 return;
352 }
353 }
354
355 if (uvm_km_kmem_alloc(kmem_va_arena,
356 MAXPHYS, VM_SLEEP | VM_INSTANTFIT, &sc->sc_unalign_buffer) != 0) {
357 aprint_error_dev(self, "can't alloc align buffer\n");
358 return;
359 }
360
361 /* resume shared structures and tell backend that we are ready */
362 if (xbd_xenbus_resume(self, PMF_Q_NONE) == false) {
363 uvm_km_free(kernel_map, (vaddr_t)ring, PAGE_SIZE,
364 UVM_KMF_WIRED);
365 return;
366 }
367
368 if (!pmf_device_register(self, xbd_xenbus_suspend, xbd_xenbus_resume))
369 aprint_error_dev(self, "couldn't establish power handler\n");
370 }
371
372 static int
373 xbd_xenbus_detach(device_t dev, int flags)
374 {
375 struct xbd_xenbus_softc *sc = device_private(dev);
376 int bmaj, cmaj, i, mn, rc;
377
378 DPRINTF(("%s: xbd_detach\n", device_xname(dev)));
379
380 rc = disk_begindetach(&sc->sc_dksc.sc_dkdev, NULL, dev, flags);
381 if (rc != 0)
382 return rc;
383
384 mutex_enter(&sc->sc_lock);
385 if (sc->sc_shutdown == BLKIF_SHUTDOWN_RUN) {
386 sc->sc_shutdown = BLKIF_SHUTDOWN_LOCAL;
387
388 /* wait for requests to complete */
389 while (sc->sc_backend_status == BLKIF_STATE_CONNECTED &&
390 disk_isbusy(&sc->sc_dksc.sc_dkdev)) {
391 cv_timedwait(&sc->sc_detach_cv, &sc->sc_lock, hz/2);
392 }
393 mutex_exit(&sc->sc_lock);
394
395 /* Trigger state transition with backend */
396 xenbus_switch_state(sc->sc_xbusd, NULL, XenbusStateClosing);
397
398 mutex_enter(&sc->sc_lock);
399 }
400 if ((flags & DETACH_FORCE) == 0) {
401 /* xbd_xenbus_detach already in progress */
402 cv_broadcast(&sc->sc_detach_cv);
403 mutex_exit(&sc->sc_lock);
404 return EALREADY;
405 }
406 mutex_exit(&sc->sc_lock);
407 while (xenbus_read_driver_state(sc->sc_xbusd->xbusd_otherend)
408 != XenbusStateClosed) {
409 mutex_enter(&sc->sc_lock);
410 cv_timedwait(&sc->sc_detach_cv, &sc->sc_lock, hz/2);
411 mutex_exit(&sc->sc_lock);
412 }
413
414 /* locate the major number */
415 bmaj = bdevsw_lookup_major(&xbd_bdevsw);
416 cmaj = cdevsw_lookup_major(&xbd_cdevsw);
417
418 /* Nuke the vnodes for any open instances. */
419 for (i = 0; i < MAXPARTITIONS; i++) {
420 mn = DISKMINOR(device_unit(dev), i);
421 vdevgone(bmaj, mn, mn, VBLK);
422 vdevgone(cmaj, mn, mn, VCHR);
423 }
424
425 if (sc->sc_backend_status == BLKIF_STATE_CONNECTED) {
426 /* Delete all of our wedges. */
427 dkwedge_delall(&sc->sc_dksc.sc_dkdev);
428
429 /* Kill off any queued buffers. */
430 dk_drain(&sc->sc_dksc);
431 bufq_free(sc->sc_dksc.sc_bufq);
432
433 /* detach disk */
434 disk_detach(&sc->sc_dksc.sc_dkdev);
435 disk_destroy(&sc->sc_dksc.sc_dkdev);
436 dk_detach(&sc->sc_dksc);
437 }
438
439 hypervisor_mask_event(sc->sc_evtchn);
440 if (sc->sc_ih != NULL) {
441 xen_intr_disestablish(sc->sc_ih);
442 sc->sc_ih = NULL;
443 }
444
445 mutex_enter(&sc->sc_lock);
446 while (xengnt_status(sc->sc_ring_gntref))
447 cv_timedwait(&sc->sc_detach_cv, &sc->sc_lock, hz/2);
448 mutex_exit(&sc->sc_lock);
449
450 xengnt_revoke_access(sc->sc_ring_gntref);
451 uvm_km_free(kernel_map, (vaddr_t)sc->sc_ring.sring,
452 PAGE_SIZE, UVM_KMF_WIRED);
453
454 for (i = 0; i < XBD_RING_SIZE; i++) {
455 if (sc->sc_reqs[i].req_dmamap != NULL) {
456 bus_dmamap_destroy(sc->sc_xbusd->xbusd_dmat,
457 sc->sc_reqs[i].req_dmamap);
458 sc->sc_reqs[i].req_dmamap = NULL;
459 }
460 }
461
462 if (sc->sc_unalign_buffer != 0) {
463 uvm_km_kmem_free(kmem_va_arena, sc->sc_unalign_buffer, MAXPHYS);
464 sc->sc_unalign_buffer = 0;
465 }
466
467 mutex_destroy(&sc->sc_lock);
468
469 evcnt_detach(&sc->sc_cnt_map_unalign);
470 evcnt_detach(&sc->sc_cnt_unalign_busy);
471 evcnt_detach(&sc->sc_cnt_queue_full);
472 evcnt_detach(&sc->sc_cnt_indirect);
473
474 pmf_device_deregister(dev);
475
476 return 0;
477 }
478
479 static bool
480 xbd_xenbus_suspend(device_t dev, const pmf_qual_t *qual) {
481
482 struct xbd_xenbus_softc *sc;
483
484 sc = device_private(dev);
485
486 mutex_enter(&sc->sc_lock);
487 /* wait for requests to complete, then suspend device */
488 while (sc->sc_backend_status == BLKIF_STATE_CONNECTED &&
489 disk_isbusy(&sc->sc_dksc.sc_dkdev)) {
490 cv_timedwait(&sc->sc_suspend_cv, &sc->sc_lock, hz/2);
491 }
492
493 hypervisor_mask_event(sc->sc_evtchn);
494 sc->sc_backend_status = BLKIF_STATE_SUSPENDED;
495
496 #ifdef DIAGNOSTIC
497 /* Check that all requests are finished and device ready for resume */
498 int reqcnt = 0;
499 struct xbd_req *req;
500 SLIST_FOREACH(req, &sc->sc_xbdreq_head, req_next)
501 reqcnt++;
502 KASSERT(reqcnt == __arraycount(sc->sc_reqs));
503
504 int incnt = 0;
505 struct xbd_indirect *in;
506 SLIST_FOREACH(in, &sc->sc_indirect_head, in_next)
507 incnt++;
508 KASSERT(incnt == __arraycount(sc->sc_indirect));
509 #endif
510
511 mutex_exit(&sc->sc_lock);
512
513 xenbus_device_suspend(sc->sc_xbusd);
514 aprint_verbose_dev(dev, "removed event channel %d\n", sc->sc_evtchn);
515
516 return true;
517 }
518
519 static bool
520 xbd_xenbus_resume(device_t dev, const pmf_qual_t *qual)
521 {
522 struct xbd_xenbus_softc *sc;
523 struct xenbus_transaction *xbt;
524 int error;
525 blkif_sring_t *ring;
526 paddr_t ma;
527 const char *errmsg;
528
529 sc = device_private(dev);
530
531 /* All grants were removed during suspend */
532 sc->sc_ring_gntref = GRANT_INVALID_REF;
533
534 /* Initialize ring */
535 ring = sc->sc_ring.sring;
536 memset(ring, 0, PAGE_SIZE);
537 SHARED_RING_INIT(ring);
538 FRONT_RING_INIT(&sc->sc_ring, ring, PAGE_SIZE);
539
540 /*
541 * get MA address of the ring, and use it to set up the grant entry
542 * for the block device
543 */
544 (void)pmap_extract_ma(pmap_kernel(), (vaddr_t)ring, &ma);
545 error = xenbus_grant_ring(sc->sc_xbusd, ma, &sc->sc_ring_gntref);
546 if (error)
547 goto abort_resume;
548
549 if (sc->sc_features & BLKIF_FEATURE_INDIRECT) {
550 for (int i = 0; i < XBD_RING_SIZE; i++) {
551 vaddr_t va = (vaddr_t)sc->sc_indirect[i].in_addr;
552 KASSERT(va != 0);
553 KASSERT((va & PAGE_MASK) == 0);
554 (void)pmap_extract_ma(pmap_kernel(), va, &ma);
555 if (xengnt_grant_access(
556 sc->sc_xbusd->xbusd_otherend_id,
557 ma, true, &sc->sc_indirect[i].in_gntref)) {
558 aprint_error_dev(dev,
559 "indirect page grant failed\n");
560 goto abort_resume;
561 }
562 }
563 }
564
565 error = xenbus_alloc_evtchn(sc->sc_xbusd, &sc->sc_evtchn);
566 if (error)
567 goto abort_resume;
568
569 if (sc->sc_ih != NULL) {
570 xen_intr_disestablish(sc->sc_ih);
571 sc->sc_ih = NULL;
572 }
573 aprint_verbose_dev(dev, "using event channel %d\n",
574 sc->sc_evtchn);
575 sc->sc_ih = xen_intr_establish_xname(-1, &xen_pic, sc->sc_evtchn,
576 IST_LEVEL, IPL_BIO, &xbd_handler, sc, true, device_xname(dev));
577 KASSERT(sc->sc_ih != NULL);
578
579 again:
580 xbt = xenbus_transaction_start();
581 if (xbt == NULL)
582 return false;
583 error = xenbus_printf(xbt, sc->sc_xbusd->xbusd_path,
584 "ring-ref","%u", sc->sc_ring_gntref);
585 if (error) {
586 errmsg = "writing ring-ref";
587 goto abort_transaction;
588 }
589 error = xenbus_printf(xbt, sc->sc_xbusd->xbusd_path,
590 "event-channel", "%u", sc->sc_evtchn);
591 if (error) {
592 errmsg = "writing event channel";
593 goto abort_transaction;
594 }
595 error = xenbus_printf(xbt, sc->sc_xbusd->xbusd_path,
596 "protocol", "%s", XEN_IO_PROTO_ABI_NATIVE);
597 if (error) {
598 errmsg = "writing protocol";
599 goto abort_transaction;
600 }
601 error = xenbus_transaction_end(xbt, 0);
602 if (error == EAGAIN)
603 goto again;
604 if (error != 0) {
605 xenbus_dev_fatal(sc->sc_xbusd, error,
606 "completing transaction");
607 return false;
608 }
609
610 xenbus_switch_state(sc->sc_xbusd, NULL, XenbusStateInitialised);
611
612 if (sc->sc_backend_status == BLKIF_STATE_SUSPENDED) {
613 /*
614 * device was suspended, softc structures are
615 * already initialized - we use a shortcut
616 */
617 sc->sc_backend_status = BLKIF_STATE_CONNECTED;
618 xenbus_device_resume(sc->sc_xbusd);
619 hypervisor_unmask_event(sc->sc_evtchn);
620 xenbus_switch_state(sc->sc_xbusd, NULL, XenbusStateConnected);
621 }
622
623 return true;
624
625 abort_resume:
626 xenbus_dev_fatal(sc->sc_xbusd, error, "resuming device");
627 return false;
628
629 abort_transaction:
630 xenbus_transaction_end(xbt, 1);
631 xenbus_dev_fatal(sc->sc_xbusd, error, "%s", errmsg);
632 return false;
633 }
634
635 static void
636 xbd_backend_changed(void *arg, XenbusState new_state)
637 {
638 struct xbd_xenbus_softc *sc = device_private((device_t)arg);
639 struct disk_geom *dg;
640
641 char buf[64];
642 DPRINTF(("%s: new backend state %d\n",
643 device_xname(sc->sc_dksc.sc_dev), new_state));
644
645 switch (new_state) {
646 case XenbusStateUnknown:
647 case XenbusStateInitialising:
648 case XenbusStateInitWait:
649 case XenbusStateInitialised:
650 break;
651 case XenbusStateClosing:
652 mutex_enter(&sc->sc_lock);
653 if (sc->sc_shutdown == BLKIF_SHUTDOWN_RUN)
654 sc->sc_shutdown = BLKIF_SHUTDOWN_REMOTE;
655 /* wait for requests to complete */
656 while (sc->sc_backend_status == BLKIF_STATE_CONNECTED &&
657 disk_isbusy(&sc->sc_dksc.sc_dkdev)) {
658 cv_timedwait(&sc->sc_detach_cv, &sc->sc_lock, hz/2);
659 }
660 mutex_exit(&sc->sc_lock);
661 xenbus_switch_state(sc->sc_xbusd, NULL, XenbusStateClosed);
662 break;
663 case XenbusStateConnected:
664 /*
665 * note that xbd_backend_changed() can only be called by
666 * the xenbus thread.
667 */
668
669 if (sc->sc_backend_status == BLKIF_STATE_CONNECTED ||
670 sc->sc_backend_status == BLKIF_STATE_SUSPENDED)
671 /* already connected */
672 return;
673
674 xbd_connect(sc);
675 sc->sc_shutdown = BLKIF_SHUTDOWN_RUN;
676 sc->sc_xbdsize =
677 sc->sc_sectors * (uint64_t)sc->sc_secsize / DEV_BSIZE;
678 dg = &sc->sc_dksc.sc_dkdev.dk_geom;
679 memset(dg, 0, sizeof(*dg));
680
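/* Xen virtual disks have no real geometry; fabricate one with 1 MB cylinders. */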
681 dg->dg_secperunit = sc->sc_sectors;
682 dg->dg_secsize = sc->sc_secsize;
683 dg->dg_ntracks = 1;
684 dg->dg_nsectors = (1024 * 1024) / dg->dg_secsize;
685 dg->dg_ncylinders = dg->dg_secperunit / dg->dg_nsectors;
686
687 bufq_alloc(&sc->sc_dksc.sc_bufq, "fcfs", 0);
688 dk_attach(&sc->sc_dksc);
689 disk_attach(&sc->sc_dksc.sc_dkdev);
690
691 sc->sc_backend_status = BLKIF_STATE_CONNECTED;
692 hypervisor_unmask_event(sc->sc_evtchn);
693
694 format_bytes(buf, uimin(9, sizeof(buf)),
695 sc->sc_sectors * dg->dg_secsize);
696 aprint_normal_dev(sc->sc_dksc.sc_dev,
697 "%s, %d bytes/sect x %" PRIu64 " sectors\n",
698 buf, (int)dg->dg_secsize, sc->sc_sectors);
699 snprintb(buf, sizeof(buf), BLKIF_FEATURE_BITS,
700 sc->sc_features);
701 aprint_normal_dev(sc->sc_dksc.sc_dev,
702 "backend features %s\n", buf);
703
704 /* Discover wedges on this disk. */
705 dkwedge_discover(&sc->sc_dksc.sc_dkdev);
706
707 disk_set_info(sc->sc_dksc.sc_dev, &sc->sc_dksc.sc_dkdev, NULL);
708
709 /* the disk should be working now */
710 config_pending_decr(sc->sc_dksc.sc_dev);
711 break;
712 default:
713 panic("bad backend state %d", new_state);
714 }
715 }
716
717 static void
718 xbd_connect(struct xbd_xenbus_softc *sc)
719 {
720 int err;
721 unsigned long long sectors;
722 u_long val;
723
724 /*
725 * feature-persistent must be read here rather than in xbd_features();
726 * e.g. Linux Dom0 only writes it together with the device info.
727 */
728 err = xenbus_read_ul(NULL, sc->sc_xbusd->xbusd_otherend,
729 "feature-persistent", &val, 10);
730 if (err)
731 val = 0;
732 if (val > 0)
733 sc->sc_features |= BLKIF_FEATURE_PERSISTENT;
734
735 err = xenbus_read_ul(NULL,
736 sc->sc_xbusd->xbusd_path, "virtual-device", &sc->sc_handle, 10);
737 if (err)
738 panic("%s: can't read number from %s/virtual-device\n",
739 device_xname(sc->sc_dksc.sc_dev),
740 sc->sc_xbusd->xbusd_path);
741 err = xenbus_read_ul(NULL,
742 sc->sc_xbusd->xbusd_otherend, "info", &sc->sc_info, 10);
743 if (err)
744 panic("%s: can't read number from %s/info\n",
745 device_xname(sc->sc_dksc.sc_dev),
746 sc->sc_xbusd->xbusd_otherend);
747 err = xenbus_read_ul(NULL,
748 sc->sc_xbusd->xbusd_otherend, "sector-size", &sc->sc_secsize, 10);
749 if (err)
750 panic("%s: can't read number from %s/sector-size\n",
751 device_xname(sc->sc_dksc.sc_dev),
752 sc->sc_xbusd->xbusd_otherend);
753
754 err = xenbus_read_ull(NULL,
755 sc->sc_xbusd->xbusd_otherend, "sectors", &sectors, 10);
756 if (err)
757 panic("%s: can't read number from %s/sectors\n",
758 device_xname(sc->sc_dksc.sc_dev),
759 sc->sc_xbusd->xbusd_otherend);
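/* "sectors" is published in XEN_BSIZE units; convert to sc_secsize sectors. */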
760 sc->sc_sectors = sectors * (uint64_t)XEN_BSIZE / sc->sc_secsize;
761
762 xenbus_switch_state(sc->sc_xbusd, NULL, XenbusStateConnected);
763 }
764
765 static void
766 xbd_features(struct xbd_xenbus_softc *sc)
767 {
768 int err;
769 u_long val;
770
771 err = xenbus_read_ul(NULL, sc->sc_xbusd->xbusd_otherend,
772 "feature-flush-cache", &val, 10);
773 if (err)
774 val = 0;
775 if (val > 0)
776 sc->sc_features |= BLKIF_FEATURE_CACHE_FLUSH;
777
778 err = xenbus_read_ul(NULL, sc->sc_xbusd->xbusd_otherend,
779 "feature-barrier", &val, 10);
780 if (err)
781 val = 0;
782 if (val > 0)
783 sc->sc_features |= BLKIF_FEATURE_BARRIER;
784
785 err = xenbus_read_ul(NULL, sc->sc_xbusd->xbusd_otherend,
786 "feature-max-indirect-segments", &val, 10);
787 if (err)
788 val = 0;
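/*
 * A non page-aligned MAXPHYS transfer can span (MAXPHYS >> PAGE_SHIFT) + 1
 * pages, so only use indirect segments if the backend accepts at least
 * that many segments per request.
 */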
789 if (val >= (MAXPHYS >> PAGE_SHIFT) + 1) {
790 /* We can use indirect segments, the limit is big enough */
791 sc->sc_features |= BLKIF_FEATURE_INDIRECT;
792 }
793 }
794
795 static int
796 xbd_handler(void *arg)
797 {
798 struct xbd_xenbus_softc *sc = arg;
799 struct buf *bp;
800 RING_IDX resp_prod, i;
801 int more_to_do;
802 int seg;
803 grant_ref_t gntref;
804
805 DPRINTF(("xbd_handler(%s)\n", device_xname(sc->sc_dksc.sc_dev)));
806
807 if (__predict_false(sc->sc_backend_status != BLKIF_STATE_CONNECTED))
808 return 0;
809
810 mutex_enter(&sc->sc_lock);
811 again:
812 resp_prod = sc->sc_ring.sring->rsp_prod;
813 xen_rmb(); /* ensure we see replies up to resp_prod */
814 for (i = sc->sc_ring.rsp_cons; i != resp_prod; i++) {
815 blkif_response_t *rep = RING_GET_RESPONSE(&sc->sc_ring, i);
816 struct xbd_req *xbdreq = &sc->sc_reqs[rep->id];
817
818 if (rep->operation == BLKIF_OP_FLUSH_DISKCACHE) {
819 KASSERT(xbdreq->req_bp == NULL);
820 xbdreq->req_sync.s_error = rep->status;
821 xbdreq->req_sync.s_done = 1;
822 cv_broadcast(&sc->sc_cache_flush_cv);
823 /* caller will free the req */
824 continue;
825 }
826
827 if (rep->operation != BLKIF_OP_READ &&
828 rep->operation != BLKIF_OP_WRITE) {
829 aprint_error_dev(sc->sc_dksc.sc_dev,
830 "bad operation %d from backend\n", rep->operation);
831 continue;
832 }
833
834 bp = xbdreq->req_bp;
835 xbdreq->req_bp = NULL;
836 KASSERT(bp != NULL && bp->b_data != NULL);
837 DPRINTF(("%s(%p): b_bcount = %ld\n", __func__,
838 bp, (long)bp->b_bcount));
839
840 if (bp->b_error != 0 || rep->status != BLKIF_RSP_OKAY) {
841 DPRINTF(("%s: error %d status %d\n", __func__,
842 bp->b_error, rep->status));
843 bp->b_error = EIO;
844 bp->b_resid = bp->b_bcount;
845 }
846
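/*
 * Large transfers without indirect segments were split in two ring
 * requests (see xbd_diskstart()); only the request completing last
 * performs the cleanup, the other one just marks itself done.
 */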
847 if (xbdreq->req_parent) {
848 struct xbd_req *req_parent = xbdreq->req_parent;
849
850 /* Unhook and recycle child */
851 xbdreq->req_parent = NULL;
852 req_parent->req_child = NULL;
853 SLIST_INSERT_HEAD(&sc->sc_xbdreq_head, xbdreq,
854 req_next);
855
856 if (!req_parent->req_parent_done) {
857 /* Finished before parent, nothing else to do */
858 continue;
859 }
860
861 /* Must do the cleanup now */
862 xbdreq = req_parent;
863 }
864 if (xbdreq->req_child) {
865 /* Finished before child, child will cleanup */
866 xbdreq->req_parent_done = true;
867 continue;
868 }
869
870 if (bp->b_error == 0)
871 bp->b_resid = 0;
872
873 KASSERT(xbdreq->req_dmamap->dm_nsegs > 0);
874 for (seg = 0; seg < xbdreq->req_dmamap->dm_nsegs; seg++) {
875 /*
876 * We are not allowing persistent mappings, so
877 * expect the backend to release the grant
878 * immediately.
879 */
880 if (xbdreq->req_indirect) {
881 gntref =
882 xbdreq->req_indirect->in_addr[seg].gref;
883 } else
884 gntref = xbdreq->req_gntref[seg];
885 KASSERT(xengnt_status(gntref) == 0);
886 xengnt_revoke_access(gntref);
887 }
888
889 bus_dmamap_unload(sc->sc_xbusd->xbusd_dmat, xbdreq->req_dmamap);
890
891 if (__predict_false(bp->b_data != xbdreq->req_data))
892 xbd_unmap_align(sc, xbdreq, bp);
893 xbdreq->req_data = NULL;
894
895 dk_done(&sc->sc_dksc, bp);
896
897 if (xbdreq->req_indirect) {
898 /* No persistent mappings, so check that
899 * backend unmapped the indirect segment grant too.
900 */
901 KASSERT(xengnt_status(xbdreq->req_indirect->in_gntref)
902 == 0);
903 SLIST_INSERT_HEAD(&sc->sc_indirect_head,
904 xbdreq->req_indirect, in_next);
905 xbdreq->req_indirect = NULL;
906 }
907 SLIST_INSERT_HEAD(&sc->sc_xbdreq_head, xbdreq, req_next);
908 }
909 sc->sc_ring.rsp_cons = i;
910
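/*
 * RING_FINAL_CHECK_FOR_RESPONSES() updates rsp_event and re-checks for
 * responses that raced with us, so that a notification is not lost.
 */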
911 xen_wmb();
912 RING_FINAL_CHECK_FOR_RESPONSES(&sc->sc_ring, more_to_do);
913 if (more_to_do)
914 goto again;
915
916 cv_signal(&sc->sc_req_cv);
917 mutex_exit(&sc->sc_lock);
918
919 dk_start(&sc->sc_dksc, NULL);
920
921 return 1;
922 }
923
924 static void
925 xbdminphys(struct buf *bp)
926 {
927 if (bp->b_bcount > XBD_XFER_LIMIT) {
928 bp->b_bcount = XBD_XFER_LIMIT;
929 }
930 minphys(bp);
931 }
932
933 static void
934 xbd_iosize(device_t dev, int *maxxfer)
935 {
936 /*
937 * Always restrict dumps to XBD_MAX_XFER to avoid indirect segments,
938 * so that it uses as little memory as possible.
939 */
940 if (*maxxfer > XBD_MAX_XFER)
941 *maxxfer = XBD_MAX_XFER;
942 }
943
944 static int
945 xbdopen(dev_t dev, int flags, int fmt, struct lwp *l)
946 {
947 struct xbd_xenbus_softc *sc;
948
949 sc = device_lookup_private(&xbd_cd, DISKUNIT(dev));
950 if (sc == NULL)
951 return (ENXIO);
952 if ((flags & FWRITE) && (sc->sc_info & VDISK_READONLY))
953 return EROFS;
954
955 DPRINTF(("xbdopen(%" PRIx64 ", %d)\n", dev, flags));
956 return dk_open(&sc->sc_dksc, dev, flags, fmt, l);
957 }
958
959 static int
960 xbdclose(dev_t dev, int flags, int fmt, struct lwp *l)
961 {
962 struct xbd_xenbus_softc *sc;
963
964 sc = device_lookup_private(&xbd_cd, DISKUNIT(dev));
965
966 DPRINTF(("xbdclose(%" PRIx64 ", %d)\n", dev, flags));
967 return dk_close(&sc->sc_dksc, dev, flags, fmt, l);
968 }
969
970 static void
971 xbdstrategy(struct buf *bp)
972 {
973 struct xbd_xenbus_softc *sc;
974
975 sc = device_lookup_private(&xbd_cd, DISKUNIT(bp->b_dev));
976
977 DPRINTF(("xbdstrategy(%p): b_bcount = %ld\n", bp,
978 (long)bp->b_bcount));
979
980 if (sc == NULL || sc->sc_shutdown != BLKIF_SHUTDOWN_RUN) {
981 bp->b_error = EIO;
982 biodone(bp);
983 return;
984 }
985 if (__predict_false((sc->sc_info & VDISK_READONLY) &&
986 (bp->b_flags & B_READ) == 0)) {
987 bp->b_error = EROFS;
988 biodone(bp);
989 return;
990 }
991
992 dk_strategy(&sc->sc_dksc, bp);
993 return;
994 }
995
996 static int
997 xbdsize(dev_t dev)
998 {
999 struct xbd_xenbus_softc *sc;
1000
1001 DPRINTF(("xbdsize(%" PRIx64 ")\n", dev));
1002
1003 sc = device_lookup_private(&xbd_cd, DISKUNIT(dev));
1004 if (sc == NULL || sc->sc_shutdown != BLKIF_SHUTDOWN_RUN)
1005 return -1;
1006 return dk_size(&sc->sc_dksc, dev);
1007 }
1008
1009 static int
1010 xbdread(dev_t dev, struct uio *uio, int flags)
1011 {
1012 struct xbd_xenbus_softc *sc =
1013 device_lookup_private(&xbd_cd, DISKUNIT(dev));
1014 struct dk_softc *dksc = &sc->sc_dksc;
1015
1016 if (!DK_ATTACHED(dksc))
1017 return ENXIO;
1018 return physio(xbdstrategy, NULL, dev, B_READ, xbdminphys, uio);
1019 }
1020
1021 static int
1022 xbdwrite(dev_t dev, struct uio *uio, int flags)
1023 {
1024 struct xbd_xenbus_softc *sc =
1025 device_lookup_private(&xbd_cd, DISKUNIT(dev));
1026 struct dk_softc *dksc = &sc->sc_dksc;
1027
1028 if (!DK_ATTACHED(dksc))
1029 return ENXIO;
1030 if (__predict_false(sc->sc_info & VDISK_READONLY))
1031 return EROFS;
1032 return physio(xbdstrategy, NULL, dev, B_WRITE, xbdminphys, uio);
1033 }
1034
1035 static int
1036 xbdioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
1037 {
1038 struct xbd_xenbus_softc *sc =
1039 device_lookup_private(&xbd_cd, DISKUNIT(dev));
1040 struct dk_softc *dksc;
1041 int error;
1042 struct xbd_req *xbdreq;
1043 blkif_request_t *req;
1044 int notify;
1045
1046 DPRINTF(("xbdioctl(%" PRIx64 ", %08lx, %p, %d, %p)\n",
1047 dev, cmd, data, flag, l));
1048 dksc = &sc->sc_dksc;
1049
1050 switch (cmd) {
1051 case DIOCGCACHE:
1052 {
1053 /* Assume there is write cache if cache-flush is supported */
1054 int *bitsp = (int *)data;
1055 *bitsp = 0;
1056 if (sc->sc_features & BLKIF_FEATURE_CACHE_FLUSH)
1057 *bitsp |= DKCACHE_WRITE;
1058 error = 0;
1059 break;
1060 }
1061 case DIOCCACHESYNC:
1062 if ((sc->sc_features & BLKIF_FEATURE_CACHE_FLUSH) == 0)
1063 return EOPNOTSUPP;
1064
1065 mutex_enter(&sc->sc_lock);
1066 while ((xbdreq = SLIST_FIRST(&sc->sc_xbdreq_head)) == NULL)
1067 cv_wait(&sc->sc_req_cv, &sc->sc_lock);
1068 KASSERT(!RING_FULL(&sc->sc_ring));
1069
1070 SLIST_REMOVE_HEAD(&sc->sc_xbdreq_head, req_next);
1071 req = RING_GET_REQUEST(&sc->sc_ring,
1072 sc->sc_ring.req_prod_pvt);
1073 req->id = xbdreq->req_id;
1074 req->operation = BLKIF_OP_FLUSH_DISKCACHE;
1075 req->handle = sc->sc_handle;
1076 xbdreq->req_sync.s_done = 0;
1077 sc->sc_ring.req_prod_pvt++;
1078 RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&sc->sc_ring, notify);
1079 if (notify)
1080 hypervisor_notify_via_evtchn(sc->sc_evtchn);
1081 /* request sent, now wait for completion */
1082 while (xbdreq->req_sync.s_done == 0)
1083 cv_wait(&sc->sc_cache_flush_cv, &sc->sc_lock);
1084
1085 if (xbdreq->req_sync.s_error == BLKIF_RSP_EOPNOTSUPP)
1086 error = EOPNOTSUPP;
1087 else if (xbdreq->req_sync.s_error == BLKIF_RSP_OKAY)
1088 error = 0;
1089 else
1090 error = EIO;
1091 SLIST_INSERT_HEAD(&sc->sc_xbdreq_head, xbdreq, req_next);
1092 cv_signal(&sc->sc_req_cv);
1093 mutex_exit(&sc->sc_lock);
1094
1095 /* Restart I/O if it was waiting for req */
1096 dk_start(&sc->sc_dksc, NULL);
1097 break;
1098
1099 default:
1100 error = dk_ioctl(dksc, dev, cmd, data, flag, l);
1101 break;
1102 }
1103
1104 return error;
1105 }
1106
1107 static int
1108 xbddump(dev_t dev, daddr_t blkno, void *va, size_t size)
1109 {
1110 struct xbd_xenbus_softc *sc;
1111
1112 sc = device_lookup_private(&xbd_cd, DISKUNIT(dev));
1113 if (sc == NULL)
1114 return (ENXIO);
1115
1116 DPRINTF(("xbddump(%" PRIx64 ", %" PRId64 ", %p, %lu)\n", dev, blkno, va,
1117 (unsigned long)size));
1118 return dk_dump(&sc->sc_dksc, dev, blkno, va, size, 0);
1119 }
1120
1121 static int
1122 xbd_diskstart(device_t self, struct buf *bp)
1123 {
1124 struct xbd_xenbus_softc *sc = device_private(self);
1125 struct xbd_req *xbdreq;
1126 int error = 0;
1127 int notify;
1128
1129 KASSERT(bp->b_bcount <= MAXPHYS);
1130
1131 DPRINTF(("xbd_diskstart(%p): b_bcount = %ld\n",
1132 bp, (long)bp->b_bcount));
1133
1134 mutex_enter(&sc->sc_lock);
1135
1136 if (sc->sc_shutdown != BLKIF_SHUTDOWN_RUN) {
1137 error = EIO;
1138 goto out;
1139 }
1140
1141 if (bp->b_rawblkno < 0 || bp->b_rawblkno > sc->sc_sectors) {
1142 /* invalid block number */
1143 error = EINVAL;
1144 goto out;
1145 }
1146
1147 if (__predict_false(
1148 sc->sc_backend_status == BLKIF_STATE_SUSPENDED)) {
1149 /* device is suspended, do not consume buffer */
1150 DPRINTF(("%s: (xbd_diskstart) device suspended\n",
1151 sc->sc_dksc.sc_xname));
1152 error = EAGAIN;
1153 goto out;
1154 }
1155
1156 xbdreq = SLIST_FIRST(&sc->sc_xbdreq_head);
1157 if (__predict_false(xbdreq == NULL)) {
1158 sc->sc_cnt_queue_full.ev_count++;
1159 DPRINTF(("xbd_diskstart: no req\n"));
1160 error = EAGAIN;
1161 goto out;
1162 }
1163 KASSERT(!RING_FULL(&sc->sc_ring));
1164
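/*
 * Without indirect segments, a buf larger than XBD_MAX_CHUNK needs a
 * second ring request; make sure one is available before committing.
 */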
1165 if ((sc->sc_features & BLKIF_FEATURE_INDIRECT) == 0
1166 && bp->b_bcount > XBD_MAX_CHUNK) {
1167 if (!SLIST_NEXT(xbdreq, req_next)) {
1168 DPRINTF(("%s: need extra req\n", __func__));
1169 error = EAGAIN;
1170 goto out;
1171 }
1172 }
1173
1174 bp->b_resid = bp->b_bcount;
1175 xbdreq->req_bp = bp;
1176 xbdreq->req_data = bp->b_data;
1177 if (__predict_false((vaddr_t)bp->b_data & (sc->sc_secsize - 1))) {
1178 if (__predict_false(xbd_map_align(sc, xbdreq) != 0)) {
1179 DPRINTF(("xbd_diskstart: no align\n"));
1180 error = EAGAIN;
1181 goto out;
1182 }
1183 }
1184
1185 if (__predict_false(bus_dmamap_load(sc->sc_xbusd->xbusd_dmat,
1186 xbdreq->req_dmamap, xbdreq->req_data, bp->b_bcount, NULL,
1187 BUS_DMA_NOWAIT) != 0)) {
1188 printf("%s: %s: bus_dmamap_load failed\n",
1189 device_xname(sc->sc_dksc.sc_dev), __func__);
1190 if (__predict_false(bp->b_data != xbdreq->req_data))
1191 xbd_unmap_align(sc, xbdreq, NULL);
1192 error = EINVAL;
1193 goto out;
1194 }
1195 KASSERTMSG(xbdreq->req_dmamap->dm_nsegs > 0,
1196 "dm_nsegs == 0 with bcount %d", bp->b_bcount);
1197
1198 for (int seg = 0; seg < xbdreq->req_dmamap->dm_nsegs; seg++) {
1199 KASSERT(seg < __arraycount(xbdreq->req_gntref));
1200
1201 paddr_t ma = xbdreq->req_dmamap->dm_segs[seg].ds_addr;
1202 if (__predict_false(xengnt_grant_access(
1203 sc->sc_xbusd->xbusd_otherend_id,
1204 (ma & ~PAGE_MASK), (bp->b_flags & B_READ) == 0,
1205 &xbdreq->req_gntref[seg]))) {
1206 printf("%s: %s: xengnt_grant_access failed\n",
1207 device_xname(sc->sc_dksc.sc_dev), __func__);
1208 if (seg > 0) {
1209 for (; --seg >= 0; ) {
1210 xengnt_revoke_access(
1211 xbdreq->req_gntref[seg]);
1212 }
1213 }
1214 bus_dmamap_unload(sc->sc_xbusd->xbusd_dmat,
1215 xbdreq->req_dmamap);
1216 if (__predict_false(bp->b_data != xbdreq->req_data))
1217 xbd_unmap_align(sc, xbdreq, NULL);
1218 error = EAGAIN;
1219 goto out;
1220 }
1221 }
1222
1223 KASSERT(xbdreq->req_parent == NULL);
1224 KASSERT(xbdreq->req_child == NULL);
1225
1226 /* We are now committed to the transfer */
1227 SLIST_REMOVE_HEAD(&sc->sc_xbdreq_head, req_next);
1228
1229 if ((sc->sc_features & BLKIF_FEATURE_INDIRECT) != 0 &&
1230 bp->b_bcount > XBD_MAX_CHUNK) {
1231 xbd_diskstart_submit_indirect(sc, xbdreq, bp);
1232 goto push;
1233 }
1234
1235 xbd_diskstart_submit(sc, xbdreq->req_id,
1236 bp, 0, xbdreq->req_dmamap, xbdreq->req_gntref);
1237
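/*
 * Submit the remainder of the buf as a child request sharing the
 * parent's DMA map and grant references; xbd_handler() finishes the
 * I/O only once both requests have completed.
 */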
1238 if (bp->b_bcount > XBD_MAX_CHUNK) {
1239 KASSERT(!RING_FULL(&sc->sc_ring));
1240 struct xbd_req *xbdreq2 = SLIST_FIRST(&sc->sc_xbdreq_head);
1241 KASSERT(xbdreq2 != NULL); /* Checked earlier */
1242 SLIST_REMOVE_HEAD(&sc->sc_xbdreq_head, req_next);
1243 xbdreq->req_child = xbdreq2;
1244 xbdreq->req_parent_done = false;
1245 xbdreq2->req_parent = xbdreq;
1246 xbdreq2->req_bp = bp;
1247 xbdreq2->req_data = xbdreq->req_data;
1248 xbd_diskstart_submit(sc, xbdreq2->req_id,
1249 bp, XBD_MAX_CHUNK, xbdreq->req_dmamap,
1250 xbdreq->req_gntref);
1251 }
1252
1253 push:
1254 RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&sc->sc_ring, notify);
1255 if (notify)
1256 hypervisor_notify_via_evtchn(sc->sc_evtchn);
1257 out:
1258 mutex_exit(&sc->sc_lock);
1259 return error;
1260 }
1261
1262 static void
1263 xbd_diskstart_submit(struct xbd_xenbus_softc *sc,
1264 int req_id, struct buf *bp, int start, bus_dmamap_t dmamap,
1265 grant_ref_t *gntref)
1266 {
1267 blkif_request_t *req;
1268 paddr_t ma;
1269 int nsects, nbytes, dmaseg, first_sect, size, segidx = 0;
1270 struct blkif_request_segment *reqseg;
1271
1272 KASSERT(mutex_owned(&sc->sc_lock));
1273
1274 req = RING_GET_REQUEST(&sc->sc_ring, sc->sc_ring.req_prod_pvt);
1275 req->id = req_id;
1276 req->operation =
1277 bp->b_flags & B_READ ? BLKIF_OP_READ : BLKIF_OP_WRITE;
1278 req->sector_number = (bp->b_rawblkno * sc->sc_secsize / XEN_BSIZE) +
1279 (start >> XEN_BSHIFT);
1280 req->handle = sc->sc_handle;
1281 DPRINTF(("%s: id %" PRIu64 " op %d sn %" PRIu64 " handle %d\n",
1282 __func__, req->id, req->operation, req->sector_number,
1283 req->handle));
1284
1285 size = uimin(bp->b_bcount - start, XBD_MAX_CHUNK);
1286 for (dmaseg = 0; dmaseg < dmamap->dm_nsegs && size > 0; dmaseg++) {
1287 bus_dma_segment_t *ds = &dmamap->dm_segs[dmaseg];
1288
1289 ma = ds->ds_addr;
1290 nbytes = ds->ds_len;
1291
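/*
 * 'start' is the byte offset of this chunk within the buf; skip the
 * DMA segments (or the leading part of a segment) already covered by
 * the first request.
 */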
1292 if (start > 0) {
1293 if (start >= nbytes) {
1294 start -= nbytes;
1295 continue;
1296 }
1297 ma += start;
1298 nbytes -= start;
1299 start = 0;
1300 }
1301 size -= nbytes;
1302
1303 KASSERT(((ma & PAGE_MASK) & (sc->sc_secsize - 1)) == 0);
1304 KASSERT((nbytes & (sc->sc_secsize - 1)) == 0);
1305 KASSERT((size & (sc->sc_secsize - 1)) == 0);
1306 first_sect = (ma & PAGE_MASK) >> XEN_BSHIFT;
1307 nsects = nbytes >> XEN_BSHIFT;
1308
1309 reqseg = &req->seg[segidx++];
1310 reqseg->first_sect = first_sect;
1311 reqseg->last_sect = first_sect + nsects - 1;
1312 KASSERT(reqseg->first_sect <= reqseg->last_sect);
1313 KASSERT(reqseg->last_sect < (PAGE_SIZE / XEN_BSIZE));
1314 DPRINTF(("%s: seg %d fs %d ls %d\n", __func__, segidx,
1315 reqseg->first_sect, reqseg->last_sect));
1316
1317 reqseg->gref = gntref[dmaseg];
1318 }
1319 KASSERT(segidx > 0);
1320 req->nr_segments = segidx;
1321 sc->sc_ring.req_prod_pvt++;
1322 }
1323
1324 static void
1325 xbd_diskstart_submit_indirect(struct xbd_xenbus_softc *sc,
1326 struct xbd_req *xbdreq, struct buf *bp)
1327 {
1328 blkif_request_indirect_t *req;
1329 paddr_t ma;
1330 int nsects, nbytes, dmaseg, first_sect;
1331 struct blkif_request_segment *reqseg;
1332
1333 KASSERT(mutex_owned(&sc->sc_lock));
1334
1335 req = (blkif_request_indirect_t *)RING_GET_REQUEST(&sc->sc_ring,
1336 sc->sc_ring.req_prod_pvt);
1337 req->id = xbdreq->req_id;
1338 req->operation = BLKIF_OP_INDIRECT;
1339 req->indirect_op =
1340 bp->b_flags & B_READ ? BLKIF_OP_READ : BLKIF_OP_WRITE;
1341 req->sector_number = bp->b_rawblkno * sc->sc_secsize / XEN_BSIZE;
1342 req->handle = sc->sc_handle;
1343 DPRINTF(("%s: id %" PRIu64 " op %d sn %" PRIu64 " handle %d\n",
1344 __func__, req->id, req->indirect_op, req->sector_number,
1345 req->handle));
1346
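/*
 * One indirect page holds far more blkif_request_segment entries than
 * the (MAXPHYS >> PAGE_SHIFT) + 1 a transfer can need, so a single
 * grant reference (indirect_grefs[0]) is sufficient.
 */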
1347 xbdreq->req_indirect = SLIST_FIRST(&sc->sc_indirect_head);
1348 KASSERT(xbdreq->req_indirect != NULL); /* always as many as reqs */
1349 SLIST_REMOVE_HEAD(&sc->sc_indirect_head, in_next);
1350 req->indirect_grefs[0] = xbdreq->req_indirect->in_gntref;
1351
1352 reqseg = xbdreq->req_indirect->in_addr;
1353 for (dmaseg = 0; dmaseg < xbdreq->req_dmamap->dm_nsegs; dmaseg++) {
1354 bus_dma_segment_t *ds = &xbdreq->req_dmamap->dm_segs[dmaseg];
1355
1356 ma = ds->ds_addr;
1357 nbytes = ds->ds_len;
1358
1359 KASSERT(((ma & PAGE_MASK) & (sc->sc_secsize - 1)) == 0);
1360 KASSERT((nbytes & (sc->sc_secsize - 1)) == 0);
1361
1362 first_sect = (ma & PAGE_MASK) >> XEN_BSHIFT;
1363 nsects = nbytes >> XEN_BSHIFT;
1364
1365 reqseg->first_sect = first_sect;
1366 reqseg->last_sect = first_sect + nsects - 1;
1367 reqseg->gref = xbdreq->req_gntref[dmaseg];
1368 DPRINTF(("%s: seg %d fs %d ls %d\n", __func__, dmaseg,
1369 reqseg->first_sect, reqseg->last_sect));
1370
1371 KASSERT(reqseg->first_sect <= reqseg->last_sect);
1372 KASSERT(reqseg->last_sect < (PAGE_SIZE / XEN_BSIZE));
1373
1374 reqseg++;
1375 }
1376 req->nr_segments = dmaseg;
1377 sc->sc_ring.req_prod_pvt++;
1378
1379 sc->sc_cnt_indirect.ev_count++;
1380 }
1381
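/*
 * Bounce a request whose data is not sector-aligned through the single
 * preallocated sc_unalign_buffer. Only one such request can be in flight
 * at a time; xbd_diskstart() retries later (EAGAIN) when it is busy.
 */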
1382 static int
1383 xbd_map_align(struct xbd_xenbus_softc *sc, struct xbd_req *req)
1384 {
1385 sc->sc_cnt_map_unalign.ev_count++;
1386
1387 if (sc->sc_unalign_used) {
1388 sc->sc_cnt_unalign_busy.ev_count++;
1389 return EAGAIN;
1390 }
1391 sc->sc_unalign_used = req->req_bp;
1392
1393 KASSERT(req->req_bp->b_bcount <= MAXPHYS);
1394 req->req_data = (void *)sc->sc_unalign_buffer;
1395 if ((req->req_bp->b_flags & B_READ) == 0)
1396 memcpy(req->req_data, req->req_bp->b_data,
1397 req->req_bp->b_bcount);
1398 return 0;
1399 }
1400
1401 static void
1402 xbd_unmap_align(struct xbd_xenbus_softc *sc, struct xbd_req *req,
1403 struct buf *bp)
1404 {
1405 KASSERT(!bp || sc->sc_unalign_used == bp);
1406 if (bp && bp->b_flags & B_READ)
1407 memcpy(bp->b_data, req->req_data, bp->b_bcount);
1408 sc->sc_unalign_used = NULL;
1409 }
1410