1 /*	$OpenBSD: xenstore.c,v 1.29 2016/07/29 21:05:26 mikeb Exp $	*/
2 
3 /*
4  * Copyright (c) 2015 Mike Belopuhov
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include <sys/param.h>
20 #include <sys/systm.h>
21 #include <sys/atomic.h>
22 #include <sys/kernel.h>
23 #include <sys/malloc.h>
24 #include <sys/device.h>
25 #include <sys/mutex.h>
26 #include <sys/ioctl.h>
27 #include <sys/task.h>
28 
29 #include <machine/bus.h>
30 
31 #include <uvm/uvm_extern.h>
32 
33 #include <dev/pv/pvvar.h>
34 #include <dev/pv/xenreg.h>
35 #include <dev/pv/xenvar.h>
36 
37 /*
38  * The XenStore interface is a simple storage system that is a means of
39  * communicating state and configuration data between the Xen Domain 0
40  * and the various guest domains.  All configuration data, other than
41  * a small amount of essential information required during the early
42  * boot process of launching a Xen-aware guest, is managed using the
43  * XenStore.
44  *
45  * The XenStore is ASCII string based, and has a structure and semantics
46  * similar to a filesystem.  There are files and directories that are
47  * able to contain files or other directories.  The depth of the hierarchy
48  * is only limited by the XenStore's maximum path length.
49  *
50  * The communication channel between the XenStore service and other
51  * domains consists of two guest-specific ring buffers in a shared
52  * memory area, one used for communication in each direction.  The
53  * frame number of this shared page is obtained by the guest via an
54  * HVM parameter hypercall.
55  *
56  * The XenStore communication relies on an event channel and thus
57  * interrupts. Several Xen services depend on the XenStore, most
58  * notably the XenBus used to discover and manage Xen devices.
59  */
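
/*
 * A minimal usage sketch (illustrative only: the node name and the
 * surrounding driver context are made up, not taken from this file).
 * A driver sitting below xen(4) reads configuration data with the
 * helpers defined at the bottom of this file:
 *
 *	char val[32];
 *
 *	if (xs_getprop(sc, "device/vif/0", "backend-id", val,
 *	    sizeof(val)) == 0)
 *		printf("backend-id is %s\n", val);
 */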
60 
61 const struct {
62 	const char		*xse_errstr;
63 	int			 xse_errnum;
64 } xs_errors[] = {
65 	{ "EINVAL",	EINVAL },
66 	{ "EACCES",	EACCES },
67 	{ "EEXIST",	EEXIST },
68 	{ "EISDIR",	EISDIR },
69 	{ "ENOENT",	ENOENT },
70 	{ "ENOMEM",	ENOMEM },
71 	{ "ENOSPC",	ENOSPC },
72 	{ "EIO",	EIO },
73 	{ "ENOTEMPTY",	ENOTEMPTY },
74 	{ "ENOSYS",	ENOSYS },
75 	{ "EROFS",	EROFS },
76 	{ "EBUSY",	EBUSY },
77 	{ "EAGAIN",	EAGAIN },
78 	{ "EISCONN",	EISCONN },
79 	{ NULL,		-1 },
80 };
81 
82 struct xs_msghdr {
83 	/* Message type */
84 	uint32_t		 xmh_type;
85 	/* Request identifier, echoed in daemon's response.  */
86 	uint32_t		 xmh_rid;
87 	/* Transaction id (0 if not related to a transaction). */
88 	uint32_t		 xmh_tid;
89 	/* Length of data following this. */
90 	uint32_t		 xmh_len;
91 	/* Generally followed by nul-terminated string(s). */
92 } __packed;
93 
94 /*
95  * A minimum output buffer size needed to store an error string.
96  */
97 #define XS_ERR_PAYLOAD		16
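
/*
 * The longest string in xs_errors[] above, "ENOTEMPTY", takes 10 bytes
 * including the terminating NUL, so 16 bytes leave some headroom.
 */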
98 
99 /*
100  * Although the Xen source code implies that the limit is 4k,
101  * in practice it turns out that we can only send 2k bytes of
102  * payload before receiving an ENOSPC.  We set it to an even
103  * smaller value, however, because there's no real need to use
104  * large buffers for anything.
105  */
106 #define XS_MAX_PAYLOAD		1024
107 
108 struct xs_msg {
109 	struct xs_msghdr	 xsm_hdr;
110 	int			 xsm_read;
111 	int			 xsm_dlen;
112 	uint8_t			*xsm_data;
113 	TAILQ_ENTRY(xs_msg)	 xsm_link;
114 };
115 TAILQ_HEAD(xs_msgq, xs_msg);
116 
117 #define XS_RING_SIZE		1024
118 
119 struct xs_ring {
120 	uint8_t			xsr_req[XS_RING_SIZE];
121 	uint8_t			xsr_rsp[XS_RING_SIZE];
122 	uint32_t		xsr_req_cons;
123 	uint32_t		xsr_req_prod;
124 	uint32_t		xsr_rsp_cons;
125 	uint32_t		xsr_rsp_prod;
126 } __packed;
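
/*
 * The producer/consumer counters above are free-running 32-bit values:
 * xs_ring_put() and xs_ring_get() below mask them with XS_RING_SIZE - 1
 * when indexing the buffers and only advance them after a memory
 * barrier, once the corresponding data has been copied.
 */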
127 
128 #define XST_DELAY		1	/* in seconds */
129 
130 #define XSW_TOKLEN		(sizeof(void *) * 2 + 1)
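
/*
 * A watch token is the watch structure's address formatted as hex by
 * xs_watch(): two characters per byte of a pointer plus a terminating
 * NUL, hence XSW_TOKLEN above.
 */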
131 
132 struct xs_watch {
133 	TAILQ_ENTRY(xs_watch)	 xsw_entry;
134 	uint8_t			 xsw_token[XSW_TOKLEN];
135 	struct task		*xsw_task;
136 };
137 
138 /*
139  * Container for all XenStore related state.
140  */
141 struct xs_softc {
142 	struct xen_softc	*xs_sc;
143 
144 	evtchn_port_t		 xs_port;
145 	xen_intr_handle_t	 xs_ih;
146 
147 	struct xs_ring		*xs_ring;
148 
149 	struct xs_msg		 xs_msgs[10];
150 	struct xs_msg		*xs_rmsg;
151 
152 	struct xs_msgq		 xs_free;
153 	struct xs_msgq		 xs_reqs;
154 	struct xs_msgq		 xs_rsps;
155 
156 	volatile uint		 xs_rid;
157 
158 	const char		*xs_wchan;
159 	const char		*xs_rchan;
160 
161 	struct mutex		 xs_reqlck;	/* request queue mutex */
162 	struct mutex		 xs_rsplck;	/* response queue mutex */
163 	struct mutex		 xs_frqlck;	/* free queue mutex */
164 
165 	TAILQ_HEAD(, xs_watch)	 xs_watches;
166 	struct mutex		 xs_watchlck;
167 	struct xs_msg		 xs_emsg;
168 
169 	uint			 xs_rngsem;
170 };
171 
172 struct xs_msg *
173 	xs_get_msg(struct xs_softc *, int);
174 void	xs_put_msg(struct xs_softc *, struct xs_msg *);
175 int	xs_ring_get(struct xs_softc *, void *, size_t);
176 int	xs_ring_put(struct xs_softc *, void *, size_t);
177 void	xs_intr(void *);
178 int	xs_output(struct xs_transaction *, uint8_t *, int);
179 int	xs_start(struct xs_transaction *, struct xs_msg *, struct iovec *, int);
180 struct xs_msg *
181 	xs_reply(struct xs_transaction *, uint);
182 int	xs_parse(struct xs_transaction *, struct xs_msg *, struct iovec **,
183 	    int *);
184 int	xs_event(struct xs_softc *, struct xs_msg *);
185 
186 int
187 xs_attach(struct xen_softc *sc)
188 {
189 	struct xen_hvm_param xhv;
190 	struct xs_softc *xs;
191 	paddr_t pa;
192 	int i;
193 
194 	if ((xs = malloc(sizeof(*xs), M_DEVBUF, M_NOWAIT | M_ZERO)) == NULL) {
195 		printf(": failed to allocate xenstore softc\n");
196 		return (-1);
197 	}
198 	sc->sc_xs = xs;
199 	xs->xs_sc = sc;
200 
201 	/* Fetch event channel port */
202 	memset(&xhv, 0, sizeof(xhv));
203 	xhv.domid = DOMID_SELF;
204 	xhv.index = HVM_PARAM_STORE_EVTCHN;
205 	if (xen_hypercall(sc, XC_HVM, 2, HVMOP_get_param, &xhv)) {
206 		printf(": failed to obtain a xenstore event channel\n");
207 		goto fail_1;
208 	}
209 	xs->xs_port = xhv.value;
210 
211 	printf(", event channel %d\n", xs->xs_port);
212 
213 	/* Fetch the frame number of the shared xenstore page */
214 	memset(&xhv, 0, sizeof(xhv));
215 	xhv.domid = DOMID_SELF;
216 	xhv.index = HVM_PARAM_STORE_PFN;
217 	if (xen_hypercall(sc, XC_HVM, 2, HVMOP_get_param, &xhv))
218 		goto fail_1;
219 	pa = ptoa(xhv.value);
220 	/* Allocate a page of virtual memory */
221 	xs->xs_ring = km_alloc(PAGE_SIZE, &kv_any, &kp_none, &kd_nowait);
222 	if (xs->xs_ring == NULL)
223 		goto fail_1;
224 	/* Map the xenstore page into our KVA */
225 	pa |= PMAP_NOCACHE;
226 	pmap_kenter_pa((vaddr_t)xs->xs_ring, pa, PROT_READ | PROT_WRITE);
227 	pmap_update(pmap_kernel());
228 
229 	if (xen_intr_establish(xs->xs_port, &xs->xs_ih, 0, xs_intr, xs,
230 	    sc->sc_dev.dv_xname))
231 		goto fail_2;
232 
233 	xs->xs_wchan = "xswrite";
234 	xs->xs_rchan = "xsread";
235 
236 	TAILQ_INIT(&xs->xs_free);
237 	TAILQ_INIT(&xs->xs_reqs);
238 	TAILQ_INIT(&xs->xs_rsps);
239 	for (i = 0; i < nitems(xs->xs_msgs); i++)
240 		TAILQ_INSERT_TAIL(&xs->xs_free, &xs->xs_msgs[i], xsm_link);
241 
242 	mtx_init(&xs->xs_reqlck, IPL_NET);
243 	mtx_init(&xs->xs_rsplck, IPL_NET);
244 	mtx_init(&xs->xs_frqlck, IPL_NET);
245 
246 	mtx_init(&xs->xs_watchlck, IPL_NET);
247 	TAILQ_INIT(&xs->xs_watches);
248 
249 	xs->xs_emsg.xsm_data = malloc(XS_MAX_PAYLOAD, M_DEVBUF,
250 	    M_ZERO | M_NOWAIT);
251 	if (xs->xs_emsg.xsm_data == NULL)
252 		goto fail_2;
253 	xs->xs_emsg.xsm_dlen = XS_MAX_PAYLOAD;
254 
255 	return (0);
256 
257  fail_2:
258 	pmap_kremove((vaddr_t)xs->xs_ring, PAGE_SIZE);
259 	pmap_update(pmap_kernel());
260 	km_free(xs->xs_ring, PAGE_SIZE, &kv_any, &kp_none);
261 	xs->xs_ring = NULL;
262  fail_1:
263 	free(xs, sizeof(*xs), M_DEVBUF);
264 	sc->sc_xs = NULL;
265 	return (-1);
266 }
267 
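/*
 * xs_sem_get() and xs_sem_put() implement a simple binary semaphore on
 * top of atomic increments and decrements: acquisition only succeeds if
 * the counter goes from 0 to 1; any other value means somebody else
 * holds it, so the increment is undone right away.  Whoever drops the
 * count back to 0 wakes up anyone sleeping on the semaphore's address.
 */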
268 static inline int
269 xs_sem_get(uint *semaphore)
270 {
271 	if (atomic_inc_int_nv(semaphore) != 1) {
272 		/* we're out of luck */
273 		if (atomic_dec_int_nv(semaphore) == 0)
274 			wakeup(semaphore);
275 		return (0);
276 	}
277 	return (1);
278 }
279 
280 static inline void
281 xs_sem_put(uint *semaphore)
282 {
283 	if (atomic_dec_int_nv(semaphore) == 0)
284 		wakeup(semaphore);
285 }
286 
287 struct xs_msg *
288 xs_get_msg(struct xs_softc *xs, int waitok)
289 {
290 	static const char *chan = "xsalloc";
291 	struct xs_msg *xsm;
292 
293 	mtx_enter(&xs->xs_frqlck);
294 	for (;;) {
295 		xsm = TAILQ_FIRST(&xs->xs_free);
296 		if (xsm != NULL) {
297 			TAILQ_REMOVE(&xs->xs_free, xsm, xsm_link);
298 			break;
299 		}
300 		if (!waitok) {
301 			mtx_leave(&xs->xs_frqlck);
302 			delay(XST_DELAY * 1000 >> 2);
303 			mtx_enter(&xs->xs_frqlck);
304 		} else
305 			msleep(chan, &xs->xs_frqlck, PRIBIO, chan,
306 			    XST_DELAY * hz >> 2);
307 	}
308 	mtx_leave(&xs->xs_frqlck);
309 	return (xsm);
310 }
311 
312 void
313 xs_put_msg(struct xs_softc *xs, struct xs_msg *xsm)
314 {
315 	memset(xsm, 0, sizeof(*xsm));
316 	mtx_enter(&xs->xs_frqlck);
317 	TAILQ_INSERT_TAIL(&xs->xs_free, xsm, xsm_link);
318 	mtx_leave(&xs->xs_frqlck);
319 }
320 
321 int
322 xs_geterror(struct xs_msg *xsm)
323 {
324 	int i;
325 
326 	for (i = 0; i < nitems(xs_errors) - 1; i++)
327 		if (strcmp(xs_errors[i].xse_errstr, xsm->xsm_data) == 0)
328 			break;
329 	return (xs_errors[i].xse_errnum);
330 }
331 
332 static inline uint32_t
333 xs_ring_avail(struct xs_ring *xsr, int req)
334 {
335 	uint32_t cons = req ? xsr->xsr_req_cons : xsr->xsr_rsp_cons;
336 	uint32_t prod = req ? xsr->xsr_req_prod : xsr->xsr_rsp_prod;
337 
338 	KASSERT(prod - cons <= XS_RING_SIZE);
339 	return (req ? XS_RING_SIZE - (prod - cons) : prod - cons);
340 }
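
/*
 * Example for xs_ring_avail() above: with xsr_req_prod == 1500 and
 * xsr_req_cons == 900 there are 600 bytes in flight, so the request
 * side has XS_RING_SIZE - 600 == 424 bytes of free space, while the
 * response side would report the amount of unread data instead.
 */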
341 
342 int
343 xs_output(struct xs_transaction *xst, uint8_t *bp, int len)
344 {
345 	struct xs_softc *xs = xst->xst_sc;
346 	int chunk, s;
347 
348 	while (len > 0) {
349 		chunk = xs_ring_put(xs, bp, MIN(len, XS_RING_SIZE));
350 		if (chunk < 0)
351 			return (-1);
352 		if (chunk > 0) {
353 			len -= chunk;
354 			bp += chunk;
355 			if (xs_ring_avail(xs->xs_ring, 1) > 0)
356 				continue;
357 		}
358 		/* Squeaky wheel gets the kick */
359 		xen_intr_signal(xs->xs_ih);
360 		/*
361 		 * Either chunk == 0 and we need to wait for the
362 		 * hypervisor to consume what has already been
363 		 * written, or we have managed to fill the ring
364 		 * completely and must wait for it to be drained
365 		 * before the rest can be sent.
366 		 */
367 		while (xs->xs_ring->xsr_req_prod != xs->xs_ring->xsr_req_cons) {
368 			if (xst->xst_flags & XST_POLL) {
369 				delay(XST_DELAY * 1000 >> 2);
370 				s = splnet();
371 				xs_intr(xs);
372 				splx(s);
373 			} else
374 				tsleep(xs->xs_wchan, PRIBIO, xs->xs_wchan,
375 				    XST_DELAY * hz >> 2);
376 			virtio_membar_sync();
377 		}
378 	}
379 	return (0);
380 }
381 
382 int
383 xs_start(struct xs_transaction *xst, struct xs_msg *xsm, struct iovec *iov,
384     int iov_cnt)
385 {
386 	struct xs_softc *xs = xst->xst_sc;
387 	int i;
388 
389 	while (!xs_sem_get(&xs->xs_rngsem)) {
390 		if (xst->xst_flags & XST_POLL)
391 			delay(XST_DELAY * 1000 >> 2);
392 		else
393 			tsleep(&xs->xs_rngsem, PRIBIO, "xsaccess",
394 			    XST_DELAY * hz >> 2);
395 	}
396 
397 	/* Header */
398 	if (xs_output(xst, (uint8_t *)&xsm->xsm_hdr,
399 	    sizeof(xsm->xsm_hdr)) == -1) {
400 		printf("%s: failed to write the header\n", __func__);
401 		xs_sem_put(&xs->xs_rngsem);
402 		return (-1);
403 	}
404 
405 	/* Data loop */
406 	for (i = 0; i < iov_cnt; i++) {
407 		if (xs_output(xst, iov[i].iov_base, iov[i].iov_len) == -1) {
408 			printf("%s: failed on iovec #%d len %zu\n", __func__,
409 			    i, iov[i].iov_len);
410 			xs_sem_put(&xs->xs_rngsem);
411 			return (-1);
412 		}
413 	}
414 
415 	mtx_enter(&xs->xs_reqlck);
416 	TAILQ_INSERT_TAIL(&xs->xs_reqs, xsm, xsm_link);
417 	mtx_leave(&xs->xs_reqlck);
418 
419 	xen_intr_signal(xs->xs_ih);
420 
421 	xs_sem_put(&xs->xs_rngsem);
422 
423 	return (0);
424 }
425 
426 struct xs_msg *
427 xs_reply(struct xs_transaction *xst, uint rid)
428 {
429 	struct xs_softc *xs = xst->xst_sc;
430 	struct xs_msg *xsm;
431 	int s;
432 
433 	mtx_enter(&xs->xs_rsplck);
434 	for (;;) {
435 		TAILQ_FOREACH(xsm, &xs->xs_rsps, xsm_link) {
436 			if (xsm->xsm_hdr.xmh_tid == xst->xst_id &&
437 			    xsm->xsm_hdr.xmh_rid == rid)
438 				break;
439 		}
440 		if (xsm != NULL) {
441 			TAILQ_REMOVE(&xs->xs_rsps, xsm, xsm_link);
442 			break;
443 		}
444 		if (xst->xst_flags & XST_POLL) {
445 			mtx_leave(&xs->xs_rsplck);
446 			delay(XST_DELAY * 1000 >> 2);
447 			s = splnet();
448 			xs_intr(xs);
449 			splx(s);
450 			mtx_enter(&xs->xs_rsplck);
451 		} else
452 			msleep(xs->xs_rchan, &xs->xs_rsplck, PRIBIO,
453 			    xs->xs_rchan, XST_DELAY * hz >> 2);
454 	}
455 	mtx_leave(&xs->xs_rsplck);
456 	return (xsm);
457 }
458 
459 int
460 xs_ring_put(struct xs_softc *xs, void *src, size_t size)
461 {
462 	struct xs_ring *xsr = xs->xs_ring;
463 	uint32_t prod = xsr->xsr_req_prod & (XS_RING_SIZE - 1);
464 	uint32_t avail = xs_ring_avail(xsr, 1);
465 	size_t left;
466 
467 	if (size > XS_RING_SIZE)
468 		return (-1);
469 	if (avail == 0)
470 		return (0);
471 
472 	/* Bound the size by the number of available slots */
473 	size = MIN(size, avail);
474 	/* How many contiguous bytes can we memcpy... */
475 	left = XS_RING_SIZE - prod;
476 	/* ...bounded by how much we need to write? */
477 	left = MIN(left, size);
478 
479 	memcpy(&xsr->xsr_req[prod], src, left);
480 	memcpy(&xsr->xsr_req[0], (caddr_t)src + left, size - left);
481 	virtio_membar_sync();
482 	xsr->xsr_req_prod += size;
483 	return (size);
484 }
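
/*
 * Wrap-around example for xs_ring_put() above: with xsr_req_prod at a
 * masked index of 1020 and 16 bytes to write (and enough space free),
 * "left" is 4, so 4 bytes go to the tail of xsr_req[] and the remaining
 * 12 bytes continue at xsr_req[0].
 */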
485 
486 int
487 xs_ring_get(struct xs_softc *xs, void *dst, size_t size)
488 {
489 	struct xs_ring *xsr = xs->xs_ring;
490 	uint32_t cons = xsr->xsr_rsp_cons & (XS_RING_SIZE - 1);
491 	uint32_t avail = xs_ring_avail(xsr, 0);
492 	size_t left;
493 
494 	if (size > XS_RING_SIZE)
495 		return (-1);
496 	if (avail == 0)
497 		return (0);
498 
499 	/* Bound the size by the number of available slots */
500 	size = MIN(size, avail);
501 	/* How many contiguous bytes can we memcpy... */
502 	left = XS_RING_SIZE - cons;
503 	/* ...bounded by how much we need to read? */
504 	left = MIN(left, size);
505 
506 	memcpy(dst, &xsr->xsr_rsp[cons], left);
507 	memcpy((caddr_t)dst + left, &xsr->xsr_rsp[0], size - left);
508 	virtio_membar_sync();
509 	xsr->xsr_rsp_cons += size;
510 	return (size);
511 }
512 
513 void
514 xs_intr(void *arg)
515 {
516 	struct xs_softc *xs = arg;
517 	struct xs_ring *xsr = xs->xs_ring;
518 	struct xen_softc *sc = xs->xs_sc;
519 	struct xs_msg *xsm = xs->xs_rmsg;
520 	struct xs_msghdr xmh;
521 	uint32_t avail;
522 	int len;
523 
524 	virtio_membar_sync();
525 
526 	if (xsr->xsr_rsp_cons == xsr->xsr_rsp_prod)
527 		return;
528 
529 	avail = xs_ring_avail(xsr, 0);
530 
531 	/* Response processing */
532 
533  again:
534 	if (xs->xs_rmsg == NULL) {
535 		if (avail < sizeof(xmh)) {
536 			printf("%s: incomplete header: %u\n",
537 			    sc->sc_dev.dv_xname, avail);
538 			goto out;
539 		}
540 		avail -= sizeof(xmh);
541 
542 		if ((len = xs_ring_get(xs, &xmh, sizeof(xmh))) != sizeof(xmh)) {
543 			printf("%s: message too short: %d\n",
544 			    sc->sc_dev.dv_xname, len);
545 			goto out;
546 		}
547 
548 		if (xmh.xmh_type == XS_EVENT) {
549 			xsm = &xs->xs_emsg;
550 			xsm->xsm_read = 0;
551 		} else {
552 			mtx_enter(&xs->xs_reqlck);
553 			TAILQ_FOREACH(xsm, &xs->xs_reqs, xsm_link) {
554 				if (xsm->xsm_hdr.xmh_rid == xmh.xmh_rid) {
555 					TAILQ_REMOVE(&xs->xs_reqs, xsm,
556 					    xsm_link);
557 					break;
558 				}
559 			}
560 			mtx_leave(&xs->xs_reqlck);
561 			if (xsm == NULL) {
562 				printf("%s: unexpected message id %u\n",
563 				    sc->sc_dev.dv_xname, xmh.xmh_rid);
564 				goto out;
565 			}
566 		}
567 		memcpy(&xsm->xsm_hdr, &xmh, sizeof(xmh));
568 		xs->xs_rmsg = xsm;
569 	}
570 
571 	if (xsm->xsm_hdr.xmh_len > xsm->xsm_dlen)
572 		panic("message too large: %d vs %d for type %d, rid %u",
573 		    xsm->xsm_hdr.xmh_len, xsm->xsm_dlen, xsm->xsm_hdr.xmh_type,
574 		    xsm->xsm_hdr.xmh_rid);
575 
576 	len = MIN(xsm->xsm_hdr.xmh_len - xsm->xsm_read, avail);
577 	if (len) {
578 		/* Get data if reply is not empty */
579 		if ((len = xs_ring_get(xs,
580 		    &xsm->xsm_data[xsm->xsm_read], len)) <= 0) {
581 			printf("%s: read failure %d\n", sc->sc_dev.dv_xname,
582 			    len);
583 			goto out;
584 		}
585 		xsm->xsm_read += len;
586 	}
587 
588 	/* Notify reader that we've managed to read the whole message */
589 	if (xsm->xsm_read == xsm->xsm_hdr.xmh_len) {
590 		xs->xs_rmsg = NULL;
591 		if (xsm->xsm_hdr.xmh_type == XS_EVENT) {
592 			xs_event(xs, xsm);
593 		} else {
594 			mtx_enter(&xs->xs_rsplck);
595 			TAILQ_INSERT_TAIL(&xs->xs_rsps, xsm, xsm_link);
596 			mtx_leave(&xs->xs_rsplck);
597 			wakeup(xs->xs_rchan);
598 		}
599 	}
600 
601 	if ((avail = xs_ring_avail(xsr, 0)) > 0)
602 		goto again;
603 
604  out:
605 	/* Wakeup sleeping writes (if any) */
606 	wakeup(xs->xs_wchan);
607 	xen_intr_signal(xs->xs_ih);
608 }
609 
610 static inline int
611 xs_get_buf(struct xs_transaction *xst, struct xs_msg *xsm, int len)
612 {
613 	unsigned char *buf = NULL;
614 
615 	buf = malloc(len, M_DEVBUF, M_ZERO | (xst->xst_flags & XST_POLL ?
616 	    M_NOWAIT : M_WAITOK));
617 	if (buf == NULL)
618 		return (-1);
619 	xsm->xsm_dlen = len;
620 	xsm->xsm_data = buf;
621 	return (0);
622 }
623 
624 static inline void
625 xs_put_buf(struct xs_transaction *xst, struct xs_msg *xsm)
626 {
627 	free(xsm->xsm_data, M_DEVBUF, xsm->xsm_dlen);
628 	xsm->xsm_data = NULL;
629 }
630 
631 void
632 xs_resfree(struct xs_transaction *xst, struct iovec *iov, int iov_cnt)
633 {
634 	int i;
635 
636 	for (i = 0; i < iov_cnt; i++)
637 		free(iov[i].iov_base, M_DEVBUF, iov[i].iov_len);
638 	free(iov, M_DEVBUF, sizeof(struct iovec) * iov_cnt);
639 }
640 
641 int
642 xs_parse(struct xs_transaction *xst, struct xs_msg *xsm, struct iovec **iov,
643     int *iov_cnt)
644 {
645 	char *bp, *cp;
646 	int i, dlen, flags;
647 
648 	/* If the response size is zero, we return an empty string */
649 	dlen = MAX(xsm->xsm_hdr.xmh_len, 1);
650 	flags = M_ZERO | (xst->xst_flags & XST_POLL ? M_NOWAIT : M_WAITOK);
651 
652 	*iov_cnt = 0;
653 	/* Make sure that the data is NUL terminated */
654 	if (xsm->xsm_data[dlen - 1] != '\0') {
655 		/*
656 		 * The XS_READ operation always returns length without
657 		 * the trailing NUL so we have to adjust the length.
658 		 */
659 		dlen = MIN(dlen + 1, xsm->xsm_dlen);
660 		xsm->xsm_data[dlen - 1] = '\0';
661 	}
662 	for (i = 0; i < dlen; i++)
663 		if (xsm->xsm_data[i] == '\0')
664 			(*iov_cnt)++;
665 	*iov = mallocarray(*iov_cnt, sizeof(struct iovec), M_DEVBUF, flags);
666 	if (*iov == NULL)
667 		goto cleanup;
668 	bp = xsm->xsm_data;
669 	for (i = 0; i < *iov_cnt; i++) {
670 		cp = bp;
671 		while (cp - (caddr_t)xsm->xsm_data < dlen && *cp != '\0')
672 			cp++;
673 		(*iov)[i].iov_len = cp - bp + 1;
674 		(*iov)[i].iov_base = malloc((*iov)[i].iov_len, M_DEVBUF, flags);
675 		if (!(*iov)[i].iov_base) {
676 			xs_resfree(xst, *iov, *iov_cnt);
677 			goto cleanup;
678 		}
679 		memcpy((*iov)[i].iov_base, bp, (*iov)[i].iov_len);
680 		bp = ++cp;
681 	}
682 	return (0);
683 
684  cleanup:
685 	*iov = NULL;
686 	*iov_cnt = 0;
687 	return (ENOMEM);
688 }
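
/*
 * Example for xs_parse() above: an XS_LIST reply carrying, say,
 * "backend\0device\0control\0" is split into three separate iovecs,
 * one NUL-terminated string each, which the caller eventually releases
 * with xs_resfree().
 */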
689 
690 int
691 xs_event(struct xs_softc *xs, struct xs_msg *xsm)
692 {
693 	struct xs_watch *xsw;
694 	char *token = NULL;
695 	int i;
696 
697 	for (i = 0; i < xsm->xsm_read; i++) {
698 		if (xsm->xsm_data[i] == '\0') {
699 			token = &xsm->xsm_data[i+1];
700 			break;
701 		}
702 	}
703 	if (token == NULL) {
704 		printf("%s: event on \"%s\" without token\n",
705 		    xs->xs_sc->sc_dev.dv_xname, xsm->xsm_data);
706 		return (-1);
707 	}
708 
709 	mtx_enter(&xs->xs_watchlck);
710 	TAILQ_FOREACH(xsw, &xs->xs_watches, xsw_entry) {
711 		if (strcmp(xsw->xsw_token, token))
712 			continue;
713 		mtx_leave(&xs->xs_watchlck);
714 		task_add(systq, xsw->xsw_task);
715 		return (0);
716 	}
717 	mtx_leave(&xs->xs_watchlck);
718 
719 	printf("%s: no watchers for node \"%s\"\n",
720 	    xs->xs_sc->sc_dev.dv_xname, xsm->xsm_data);
721 	return (-1);
722 }
723 
724 int
725 xs_cmd(struct xs_transaction *xst, int cmd, const char *path,
726     struct iovec **iov, int *iov_cnt)
727 {
728 	struct xs_softc *xs = xst->xst_sc;
729 	struct xs_msg *xsm;
730 	struct iovec ov[10];	/* output vector */
731 	int datalen = XS_ERR_PAYLOAD;
732 	int ov_cnt = 0;
733 	enum { READ, WRITE } mode = READ;
734 	int i, error = 0;
735 
736 	if (cmd >= XS_MAX)
737 		return (EINVAL);
738 
739 	switch (cmd) {
740 	case XS_TOPEN:
741 		ov[0].iov_base = "";
742 		ov[0].iov_len = 1;
743 		ov_cnt++;
744 		break;
745 	case XS_TCLOSE:
746 	case XS_RM:
747 	case XS_WATCH:
748 	case XS_WRITE:
749 		mode = WRITE;
750 		/* FALLTHROUGH */
751 	default:
752 		if (mode == READ)
753 			datalen = XS_MAX_PAYLOAD;
754 		break;
755 	}
756 
757 	if (path) {
758 		ov[ov_cnt].iov_base = (void *)path;
759 		ov[ov_cnt++].iov_len = strlen(path) + 1; /* +NUL */
760 	}
761 
762 	if (mode == WRITE && iov && iov_cnt && *iov_cnt > 0) {
763 		for (i = 0; i < *iov_cnt && ov_cnt < nitems(ov);
764 		     i++, ov_cnt++) {
765 			ov[ov_cnt].iov_base = (*iov)[i].iov_base;
766 			ov[ov_cnt].iov_len = (*iov)[i].iov_len;
767 		}
768 	}
769 
770 	xsm = xs_get_msg(xs, !(xst->xst_flags & XST_POLL));
771 
772 	if (xs_get_buf(xst, xsm, datalen)) {
773 		xs_put_msg(xs, xsm);
774 		return (ENOMEM);
775 	}
776 
777 	xsm->xsm_hdr.xmh_tid = xst->xst_id;
778 	xsm->xsm_hdr.xmh_type = cmd;
779 	xsm->xsm_hdr.xmh_rid = atomic_inc_int_nv(&xs->xs_rid);
780 
781 	for (i = 0; i < ov_cnt; i++)
782 		xsm->xsm_hdr.xmh_len += ov[i].iov_len;
783 
784 	if (xsm->xsm_hdr.xmh_len > XS_MAX_PAYLOAD) {
785 		printf("%s: message type %d with payload above the limit\n",
786 		    xs->xs_sc->sc_dev.dv_xname, cmd);
787 		xs_put_buf(xst, xsm);
788 		xs_put_msg(xs, xsm);
789 		return (EIO);
790 	}
791 
792 	if (xs_start(xst, xsm, ov, ov_cnt)) {
793 		printf("%s: message type %d transmission failed\n",
794 		    xs->xs_sc->sc_dev.dv_xname, cmd);
795 		xs_put_buf(xst, xsm);
796 		xs_put_msg(xs, xsm);
797 		return (EIO);
798 	}
799 
800 	xsm = xs_reply(xst, xsm->xsm_hdr.xmh_rid);
801 
802 	if (xsm->xsm_hdr.xmh_type == XS_ERROR) {
803 		error = xs_geterror(xsm);
804 		DPRINTF("%s: xenstore request %d \"%s\" error %s\n",
805 		    xs->xs_sc->sc_dev.dv_xname, cmd, path, xsm->xsm_data);
806 	} else if (mode == READ) {
807 		KASSERT(iov && iov_cnt);
808 		error = xs_parse(xst, xsm, iov, iov_cnt);
809 	}
810 #ifdef XEN_DEBUG
811 	else
812 		if (strcmp(xsm->xsm_data, "OK"))
813 			printf("%s: xenstore request %d failed: %s\n",
814 			    xs->xs_sc->sc_dev.dv_xname, cmd, xsm->xsm_data);
815 #endif
816 
817 	xs_put_buf(xst, xsm);
818 	xs_put_msg(xs, xsm);
819 
820 	return (error);
821 }
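
/*
 * Example call (sketch; "device/foo" is a made-up node and xst is
 * assumed to be set up as in xs_getprop() below): removing a node is
 * a WRITE-mode command that carries no payload vector, so a caller
 * can pass an empty result vector:
 *
 *	struct iovec *iov = NULL;
 *	int iov_cnt = 0;
 *
 *	error = xs_cmd(&xst, XS_RM, "device/foo", &iov, &iov_cnt);
 */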
822 
823 int
824 xs_watch(struct xen_softc *sc, const char *path, const char *property,
825     struct task *task, void (*cb)(void *), void *arg)
826 {
827 	struct xs_softc *xs = sc->sc_xs;
828 	struct xs_transaction xst;
829 	struct xs_watch *xsw;
830 	struct iovec iov, *iovp = &iov;
831 	char key[256];
832 	int error, iov_cnt, ret;
833 
834 	memset(&xst, 0, sizeof(xst));
835 	xst.xst_id = 0;
836 	xst.xst_sc = sc->sc_xs;
837 	if (cold)
838 		xst.xst_flags = XST_POLL;
839 
840 	if (path)
841 		ret = snprintf(key, sizeof(key), "%s/%s", path, property);
842 	else
843 		ret = snprintf(key, sizeof(key), "%s", property);
844 	if (ret == -1 || ret >= sizeof(key))
845 		return (EINVAL);
846 
847 	xsw = malloc(sizeof(*xsw), M_DEVBUF, M_NOWAIT | M_ZERO);
848 	if (xsw == NULL)
849 		return (-1);
850 
851 	task_set(task, cb, arg);
852 	xsw->xsw_task = task;
853 
854 	snprintf(xsw->xsw_token, sizeof(xsw->xsw_token), "%0lx",
855 	    (unsigned long)xsw);
856 
857 	iov.iov_base = xsw->xsw_token;
858 	iov.iov_len = sizeof(xsw->xsw_token);
859 	iov_cnt = 1;
860 
861 	/*
862 	 * xs_watches must be prepared pre-emptively because a xenstore
863 	 * event is raised immediately after a watch is established.
864 	 */
865 	mtx_enter(&xs->xs_watchlck);
866 	TAILQ_INSERT_TAIL(&xs->xs_watches, xsw, xsw_entry);
867 	mtx_leave(&xs->xs_watchlck);
868 
869 	if ((error = xs_cmd(&xst, XS_WATCH, key, &iovp, &iov_cnt)) != 0) {
870 		mtx_enter(&xs->xs_watchlck);
871 		TAILQ_REMOVE(&xs->xs_watches, xsw, xsw_entry);
872 		mtx_leave(&xs->xs_watchlck);
873 		free(xsw, M_DEVBUF, sizeof(*xsw));
874 		return (error);
875 	}
876 
877 	return (0);
878 }
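
/*
 * Usage sketch for xs_watch() above (the node name and callback are
 * made up for illustration):
 *
 *	xs_watch(sc, "device/vbd/768", "state", &sc->sc_task,
 *	    mydrv_state_changed, sc);
 *
 * Once the watch is registered, xs_intr()/xs_event() schedule the task
 * on systq whenever the watched node changes.
 */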
879 
880 int
881 xs_getprop(struct xen_softc *sc, const char *path, const char *property,
882     char *value, int size)
883 {
884 	struct xs_transaction xst;
885 	struct iovec *iovp = NULL;
886 	char key[256];
887 	int error, ret, iov_cnt = 0;
888 
889 	if (!property)
890 		return (-1);
891 
892 	memset(&xst, 0, sizeof(xst));
893 	xst.xst_id = 0;
894 	xst.xst_sc = sc->sc_xs;
895 	if (cold)
896 		xst.xst_flags = XST_POLL;
897 
898 	if (path)
899 		ret = snprintf(key, sizeof(key), "%s/%s", path, property);
900 	else
901 		ret = snprintf(key, sizeof(key), "%s", property);
902 	if (ret == -1 || ret >= sizeof(key))
903 		return (EINVAL);
904 
905 	if ((error = xs_cmd(&xst, XS_READ, key, &iovp, &iov_cnt)) != 0)
906 		return (error);
907 
908 	if (iov_cnt > 0)
909 		strlcpy(value, (char *)iovp->iov_base, size);
910 
911 	xs_resfree(&xst, iovp, iov_cnt);
912 
913 	return (0);
914 }
915 
916 int
917 xs_setprop(struct xen_softc *sc, const char *path, const char *property,
918     char *value, int size)
919 {
920 	struct xs_transaction xst;
921 	struct iovec iov, *iovp = &iov;
922 	char key[256];
923 	int error, ret, iov_cnt = 0;
924 
925 	if (!property)
926 		return (-1);
927 
928 	memset(&xst, 0, sizeof(xst));
929 	xst.xst_id = 0;
930 	xst.xst_sc = sc->sc_xs;
931 	if (cold)
932 		xst.xst_flags = XST_POLL;
933 
934 	if (path)
935 		ret = snprintf(key, sizeof(key), "%s/%s", path, property);
936 	else
937 		ret = snprintf(key, sizeof(key), "%s", property);
938 	if (ret == -1 || ret >= sizeof(key))
939 		return (EINVAL);
940 
941 	iov.iov_base = value;
942 	iov.iov_len = size;
943 	iov_cnt = 1;
944 
945 	error = xs_cmd(&xst, XS_WRITE, key, &iovp, &iov_cnt);
946 
947 	return (error);
948 }
949 
950 int
951 xs_kvop(void *arg, int op, char *key, char *value, size_t valuelen)
952 {
953 	struct xen_softc *sc = arg;
954 	struct xs_transaction xst;
955 	struct iovec iov, *iovp = &iov;
956 	int error = 0, iov_cnt = 0, cmd, i;
957 
958 	switch (op) {
959 	case PVBUS_KVWRITE:
960 		cmd = XS_WRITE;
961 		iov.iov_base = value;
962 		iov.iov_len = strlen(value);
963 		iov_cnt = 1;
964 		break;
965 	case PVBUS_KVREAD:
966 		cmd = XS_READ;
967 		break;
968 	case PVBUS_KVLS:
969 		cmd = XS_LIST;
970 		break;
971 	default:
972 		return (EOPNOTSUPP);
973 	}
974 
975 	memset(&xst, 0, sizeof(xst));
976 	xst.xst_id = 0;
977 	xst.xst_sc = sc->sc_xs;
978 
979 	if ((error = xs_cmd(&xst, cmd, key, &iovp, &iov_cnt)) != 0)
980 		return (error);
981 
982 	memset(value, 0, valuelen);
983 
984 	switch (cmd) {
985 	case XS_READ:
986 		if (iov_cnt == 1 && iovp[0].iov_len == 1) {
987 			xs_resfree(&xst, iovp, iov_cnt);
988 
989 			/*
990 			 * We cannot tell from the returned value alone
991 			 * whether the node is a directory or a file in
992 			 * the xenstore.  The only hint is that reading
993 			 * a directory yields an empty string (a single
994 			 * NUL byte), so retry as a directory listing.
995 			 */
996 			return (xs_kvop(arg, PVBUS_KVLS, key, value, valuelen));
997 		}
998 		/* FALLTHROUGH */
999 	case XS_LIST:
1000 		for (i = 0; i < iov_cnt; i++) {
1001 			if (i && strlcat(value, "\n", valuelen) >= valuelen)
1002 				break;
1003 			if (strlcat(value, iovp[i].iov_base,
1004 			    valuelen) >= valuelen)
1005 				break;
1006 		}
1007 		xs_resfree(&xst, iovp, iov_cnt);
1008 		break;
1009 	default:
1010 		break;
1011 	}
1012 
1013 	return (0);
1014 }
1015