/*	$OpenBSD: xenstore.c,v 1.47 2022/11/10 02:47:52 asou Exp $	*/

/*
 * Copyright (c) 2015 Mike Belopuhov
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/atomic.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/device.h>
#include <sys/mutex.h>
#include <sys/rwlock.h>
#include <sys/ioctl.h>
#include <sys/task.h>

#include <machine/bus.h>

#include <uvm/uvm_extern.h>

#include <dev/pv/pvvar.h>
#include <dev/pv/xenreg.h>
#include <dev/pv/xenvar.h>

/* #define XS_DEBUG */

#ifdef XS_DEBUG
#define DPRINTF(x...)		printf(x)
#else
#define DPRINTF(x...)
#endif

/*
 * The XenStore interface is a simple storage system that is a means of
 * communicating state and configuration data between the Xen Domain 0
 * and the various guest domains.  All configuration data, other than
 * the small amount of essential information required during the early
 * boot process of launching a Xen-aware guest, is managed using the
 * XenStore.
 *
 * The XenStore is ASCII string based, and has a structure and semantics
 * similar to a filesystem.  There are files and directories that are
 * able to contain files or other directories.  The depth of the hierarchy
 * is only limited by the XenStore's maximum path length.
 *
 * The communication channel between the XenStore service and other
 * domains is via two, guest specific, ring buffers in a shared memory
 * area.  One ring buffer is used for communicating in each direction.
 * The grant table references for this shared memory are given to the
 * guest via HVM hypercalls.
 *
 * The XenStore communication relies on an event channel and thus
 * interrupts.  Several Xen services depend on the XenStore, most
 * notably the XenBus used to discover and manage Xen devices.
 */
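
/*
 * As an illustrative sketch (the path and node names are assumptions
 * based on common Xen conventions, not something this file defines),
 * a frontend driver would find its backend by reading a per-device
 * node with the accessors implemented below:
 *
 *	char backend[256];
 *
 *	if (xs_getprop(sc, "device/vif/0", "backend", backend,
 *	    sizeof(backend)) == 0)
 *		printf("backend directory: %s\n", backend);
 *
 * Only the xs_getprop() interface used above is provided by this file.
 */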

const struct {
	const char		*xse_errstr;
	int			 xse_errnum;
} xs_errors[] = {
	{ "EINVAL",	EINVAL },
	{ "EACCES",	EACCES },
	{ "EEXIST",	EEXIST },
	{ "EISDIR",	EISDIR },
	{ "ENOENT",	ENOENT },
	{ "ENOMEM",	ENOMEM },
	{ "ENOSPC",	ENOSPC },
	{ "EIO",	EIO },
	{ "ENOTEMPTY",	ENOTEMPTY },
	{ "ENOSYS",	ENOSYS },
	{ "EROFS",	EROFS },
	{ "EBUSY",	EBUSY },
	{ "EAGAIN",	EAGAIN },
	{ "EISCONN",	EISCONN },
	{ NULL,		-1 },
};

struct xs_msghdr {
	/* Message type */
	uint32_t		 xmh_type;
	/* Request identifier, echoed in daemon's response.  */
	uint32_t		 xmh_rid;
	/* Transaction id (0 if not related to a transaction). */
	uint32_t		 xmh_tid;
	/* Length of data following this. */
	uint32_t		 xmh_len;
	/* Generally followed by nul-terminated string(s). */
} __packed;
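
/*
 * For example, an XS_READ request for the node "name" would be sent as
 * the header immediately followed by the NUL-terminated path (the
 * request id is picked arbitrarily for this sketch):
 *
 *	xmh_type = XS_READ
 *	xmh_rid  = 7		echoed back in the response
 *	xmh_tid  = 0		not part of a transaction
 *	xmh_len  = 5		strlen("name") + 1
 *	payload  = 'n' 'a' 'm' 'e' '\0'
 *
 * The header is copied onto the shared ring verbatim, in host byte
 * order.
 */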

/*
 * The minimum output buffer size needed to store an error string.
 */
#define XS_ERR_PAYLOAD		16

/*
 * Although the Xen source code implies that the limit is 4k,
 * in practice it turns out that we can only send 2k bytes of
 * payload before receiving an ENOSPC.  We set it to an even
 * smaller value, however, because there's no real need to use
 * large buffers for anything.
 */
#define XS_MAX_PAYLOAD		1024

struct xs_msg {
	struct xs_msghdr	 xsm_hdr;
	uint32_t		 xsm_read;
	uint32_t		 xsm_dlen;
	int			 xsm_error;
	uint8_t			*xsm_data;
	TAILQ_ENTRY(xs_msg)	 xsm_link;
};
TAILQ_HEAD(xs_msgq, xs_msg);

#define XS_RING_SIZE		1024

struct xs_ring {
	uint8_t			xsr_req[XS_RING_SIZE];
	uint8_t			xsr_rsp[XS_RING_SIZE];
	uint32_t		xsr_req_cons;
	uint32_t		xsr_req_prod;
	uint32_t		xsr_rsp_cons;
	uint32_t		xsr_rsp_prod;
} __packed;
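
/*
 * The producer and consumer indices above are free-running counters
 * that are reduced modulo XS_RING_SIZE only when used to index the
 * data arrays.  E.g. with xsr_req_prod == 1030 and xsr_req_cons ==
 * 1024, six bytes are in flight and the next byte written by the
 * guest goes to slot 1030 & (XS_RING_SIZE - 1) == 6.
 */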

#define XST_DELAY		1	/* in seconds */

#define XSW_TOKLEN		(sizeof(void *) * 2 + 1)

struct xs_watch {
	TAILQ_ENTRY(xs_watch)	 xsw_entry;
	uint8_t			 xsw_token[XSW_TOKLEN];
	struct task		*xsw_task;
};

/*
 * Container for all XenStore related state.
 */
struct xs_softc {
	struct xen_softc	*xs_sc;

	evtchn_port_t		 xs_port;
	xen_intr_handle_t	 xs_ih;

	struct xs_ring		*xs_ring;

	struct xs_msg		 xs_msgs[10];
	struct xs_msg		*xs_rmsg;

	struct xs_msgq		 xs_free;
	struct xs_msgq		 xs_reqs;
	struct xs_msgq		 xs_rsps;

	volatile uint		 xs_rid;

	const char		*xs_wchan;
	const char		*xs_rchan;

	struct mutex		 xs_reqlck;	/* request queue mutex */
	struct mutex		 xs_rsplck;	/* response queue mutex */
	struct mutex		 xs_frqlck;	/* free queue mutex */

	TAILQ_HEAD(, xs_watch)	 xs_watches;
	struct mutex		 xs_watchlck;
	struct xs_msg		 xs_emsg;
	struct taskq		*xs_watchtq;

	struct rwlock		 xs_rnglck;
};

struct xs_msg *
	xs_get_msg(struct xs_softc *, int);
void	xs_put_msg(struct xs_softc *, struct xs_msg *);
int	xs_ring_get(struct xs_softc *, void *, size_t);
int	xs_ring_put(struct xs_softc *, void *, size_t);
void	xs_intr(void *);
void	xs_poll(struct xs_softc *, int);
int	xs_output(struct xs_transaction *, uint8_t *, int);
int	xs_start(struct xs_transaction *, struct xs_msg *, struct iovec *, int);
struct xs_msg *
	xs_reply(struct xs_transaction *, uint);
int	xs_parse(struct xs_transaction *, struct xs_msg *, struct iovec **,
	    int *);
int	xs_event(struct xs_softc *, struct xs_msg *);

int
xs_attach(struct xen_softc *sc)
{
	struct xen_hvm_param xhv;
	struct xs_softc *xs;
	paddr_t pa;
	int i;

	if ((xs = malloc(sizeof(*xs), M_DEVBUF, M_NOWAIT | M_ZERO)) == NULL) {
		printf(": failed to allocate xenstore softc\n");
		return (-1);
	}
	sc->sc_xs = xs;
	xs->xs_sc = sc;

	/* Fetch event channel port */
	memset(&xhv, 0, sizeof(xhv));
	xhv.domid = DOMID_SELF;
	xhv.index = HVM_PARAM_STORE_EVTCHN;
	if (xen_hypercall(sc, XC_HVM, 2, HVMOP_get_param, &xhv)) {
		printf(": failed to obtain a xenstore event channel\n");
		goto fail_1;
	}
	xs->xs_port = xhv.value;

	printf(", event channel %u\n", xs->xs_port);

	/* Fetch a frame number (PA) of a shared xenstore page */
	memset(&xhv, 0, sizeof(xhv));
	xhv.domid = DOMID_SELF;
	xhv.index = HVM_PARAM_STORE_PFN;
	if (xen_hypercall(sc, XC_HVM, 2, HVMOP_get_param, &xhv))
		goto fail_1;
	pa = ptoa(xhv.value);
	/* Allocate a page of virtual memory */
	xs->xs_ring = km_alloc(PAGE_SIZE, &kv_any, &kp_none, &kd_nowait);
	if (xs->xs_ring == NULL)
		goto fail_1;
	/* Map in the xenstore page into our KVA */
	pa |= PMAP_NOCACHE;
	pmap_kenter_pa((vaddr_t)xs->xs_ring, pa, PROT_READ | PROT_WRITE);
	pmap_update(pmap_kernel());

	if (xen_intr_establish(xs->xs_port, &xs->xs_ih, 0, xs_intr, xs,
	    sc->sc_dev.dv_xname))
		goto fail_2;

	xs->xs_wchan = "xswrite";
	xs->xs_rchan = "xsread";

	TAILQ_INIT(&xs->xs_free);
	TAILQ_INIT(&xs->xs_reqs);
	TAILQ_INIT(&xs->xs_rsps);
	for (i = 0; i < nitems(xs->xs_msgs); i++)
		TAILQ_INSERT_TAIL(&xs->xs_free, &xs->xs_msgs[i], xsm_link);

	mtx_init(&xs->xs_reqlck, IPL_NET);
	mtx_init(&xs->xs_rsplck, IPL_NET);
	mtx_init(&xs->xs_frqlck, IPL_NET);

	rw_init(&xs->xs_rnglck, "xsrnglck");

	xs->xs_watchtq = taskq_create("xenwatch", 1, IPL_NET, 0);

	mtx_init(&xs->xs_watchlck, IPL_NET);
	TAILQ_INIT(&xs->xs_watches);

	xs->xs_emsg.xsm_data = malloc(XS_MAX_PAYLOAD, M_DEVBUF,
	    M_ZERO | M_NOWAIT);
	if (xs->xs_emsg.xsm_data == NULL)
		goto fail_2;
	xs->xs_emsg.xsm_dlen = XS_MAX_PAYLOAD;

	return (0);

 fail_2:
	pmap_kremove((vaddr_t)xs->xs_ring, PAGE_SIZE);
	pmap_update(pmap_kernel());
	km_free(xs->xs_ring, PAGE_SIZE, &kv_any, &kp_none);
	xs->xs_ring = NULL;
 fail_1:
	free(xs, sizeof(*xs), M_DEVBUF);
	sc->sc_xs = NULL;
	return (-1);
}

struct xs_msg *
xs_get_msg(struct xs_softc *xs, int waitok)
{
	static const char *chan = "xsalloc";
	struct xs_msg *xsm;

	mtx_enter(&xs->xs_frqlck);
	for (;;) {
		xsm = TAILQ_FIRST(&xs->xs_free);
		if (xsm != NULL) {
			TAILQ_REMOVE(&xs->xs_free, xsm, xsm_link);
			break;
		}
		if (!waitok) {
			mtx_leave(&xs->xs_frqlck);
			delay(XST_DELAY * 1000 >> 2);
			mtx_enter(&xs->xs_frqlck);
		} else
			msleep_nsec(chan, &xs->xs_frqlck, PRIBIO, chan,
			    SEC_TO_NSEC(XST_DELAY) >> 2);
	}
	mtx_leave(&xs->xs_frqlck);
	return (xsm);
}

void
xs_put_msg(struct xs_softc *xs, struct xs_msg *xsm)
{
	memset(xsm, 0, sizeof(*xsm));
	mtx_enter(&xs->xs_frqlck);
	TAILQ_INSERT_TAIL(&xs->xs_free, xsm, xsm_link);
	mtx_leave(&xs->xs_frqlck);
}

int
xs_geterror(struct xs_msg *xsm)
{
	int i;

	/* Stop at the NULL sentinel to avoid passing NULL to strcmp() */
	for (i = 0; xs_errors[i].xse_errstr != NULL; i++)
		if (strcmp(xs_errors[i].xse_errstr, xsm->xsm_data) == 0)
			return (xs_errors[i].xse_errnum);
	return (EOPNOTSUPP);
}

static inline uint32_t
xs_ring_avail(struct xs_ring *xsr, int req)
{
	uint32_t cons = req ? xsr->xsr_req_cons : xsr->xsr_rsp_cons;
	uint32_t prod = req ? xsr->xsr_req_prod : xsr->xsr_rsp_prod;

	KASSERT(prod - cons <= XS_RING_SIZE);
	return (req ? XS_RING_SIZE - (prod - cons) : prod - cons);
}
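
/*
 * A worked example: with xsr_req_cons == 1000 and xsr_req_prod ==
 * 1300, 300 bytes await consumption and xs_ring_avail(xsr, 1)
 * reports XS_RING_SIZE - 300 == 724 bytes of free request space; the
 * same distance on the response side would report 300 readable
 * bytes.  Unsigned wraparound keeps "prod - cons" correct even after
 * the free-running counters overflow.
 */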

void
xs_poll(struct xs_softc *xs, int nosleep)
{
	int s;

	if (nosleep) {
		delay(XST_DELAY * 1000 >> 2);
		s = splnet();
		xs_intr(xs);
		splx(s);
	} else {
		tsleep_nsec(xs->xs_wchan, PRIBIO, xs->xs_wchan,
		    SEC_TO_NSEC(XST_DELAY) >> 2);
	}
}

int
xs_output(struct xs_transaction *xst, uint8_t *bp, int len)
{
	struct xs_softc *xs = xst->xst_cookie;
	int chunk;

	while (len > 0) {
		chunk = xs_ring_put(xs, bp, MIN(len, XS_RING_SIZE));
		if (chunk < 0)
			return (-1);
		if (chunk > 0) {
			len -= chunk;
			bp += chunk;
			if (xs_ring_avail(xs->xs_ring, 1) > 0)
				continue;
		}
		/* Squeaky wheel gets the kick */
		xen_intr_signal(xs->xs_ih);
		/*
		 * Either chunk == 0 and we must wait for the
		 * hypervisor to consume what has already been
		 * written, or we have managed to fill the ring
		 * completely and must wait for HV to collect the
		 * data.
		 */
		while (xs->xs_ring->xsr_req_prod != xs->xs_ring->xsr_req_cons)
			xs_poll(xs, 1);
	}
	return (0);
}

int
xs_start(struct xs_transaction *xst, struct xs_msg *xsm, struct iovec *iov,
    int iov_cnt)
{
	struct xs_softc *xs = xst->xst_cookie;
	int i;

	rw_enter_write(&xs->xs_rnglck);

	/* Header */
	if (xs_output(xst, (uint8_t *)&xsm->xsm_hdr,
	    sizeof(xsm->xsm_hdr)) == -1) {
		printf("%s: failed to write the header\n", __func__);
		rw_exit_write(&xs->xs_rnglck);
		return (-1);
	}

	/* Data loop */
	for (i = 0; i < iov_cnt; i++) {
		if (xs_output(xst, iov[i].iov_base, iov[i].iov_len) == -1) {
			printf("%s: failed on iovec #%d len %lu\n", __func__,
			    i, iov[i].iov_len);
			rw_exit_write(&xs->xs_rnglck);
			return (-1);
		}
	}

	mtx_enter(&xs->xs_reqlck);
	TAILQ_INSERT_TAIL(&xs->xs_reqs, xsm, xsm_link);
	mtx_leave(&xs->xs_reqlck);

	xen_intr_signal(xs->xs_ih);

	rw_exit_write(&xs->xs_rnglck);

	return (0);
}

struct xs_msg *
xs_reply(struct xs_transaction *xst, uint rid)
{
	struct xs_softc *xs = xst->xst_cookie;
	struct xs_msg *xsm;
	int s;

	mtx_enter(&xs->xs_rsplck);
	for (;;) {
		TAILQ_FOREACH(xsm, &xs->xs_rsps, xsm_link) {
			if (xsm->xsm_hdr.xmh_tid == xst->xst_id &&
			    xsm->xsm_hdr.xmh_rid == rid)
				break;
		}
		if (xsm != NULL) {
			TAILQ_REMOVE(&xs->xs_rsps, xsm, xsm_link);
			break;
		}
		if (cold) {
			mtx_leave(&xs->xs_rsplck);
			delay(XST_DELAY * 1000 >> 2);
			s = splnet();
			xs_intr(xs);
			splx(s);
			mtx_enter(&xs->xs_rsplck);
		} else
			msleep_nsec(xs->xs_rchan, &xs->xs_rsplck, PRIBIO,
			    xs->xs_rchan, SEC_TO_NSEC(XST_DELAY) >> 2);
	}
	mtx_leave(&xs->xs_rsplck);
	return (xsm);
}

int
xs_ring_put(struct xs_softc *xs, void *src, size_t size)
{
	struct xs_ring *xsr = xs->xs_ring;
	uint32_t prod = xsr->xsr_req_prod & (XS_RING_SIZE - 1);
	uint32_t avail = xs_ring_avail(xsr, 1);
	size_t left;

	if (size > XS_RING_SIZE)
		return (-1);
	if (avail == 0)
		return (0);

	/* Bound the size by the number of available slots */
	size = MIN(size, avail);
	/* How many contiguous bytes can we memcpy... */
	left = XS_RING_SIZE - prod;
	/* ...bounded by how much we need to write? */
	left = MIN(left, size);

	memcpy(&xsr->xsr_req[prod], src, left);
	memcpy(&xsr->xsr_req[0], (caddr_t)src + left, size - left);
	virtio_membar_sync();
	xsr->xsr_req_prod += size;
	return (size);
}
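
/*
 * Wraparound example for the copies above: writing 10 bytes into an
 * otherwise empty ring with xsr_req_prod at 1020 gives left == 4, so
 * bytes 0-3 land in xsr_req[1020..1023], bytes 4-9 continue at
 * xsr_req[0..5], and only then is the producer index advanced by 10.
 */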

int
xs_ring_get(struct xs_softc *xs, void *dst, size_t size)
{
	struct xs_ring *xsr = xs->xs_ring;
	uint32_t cons = xsr->xsr_rsp_cons & (XS_RING_SIZE - 1);
	uint32_t avail = xs_ring_avail(xsr, 0);
	size_t left;

	if (size > XS_RING_SIZE)
		return (-1);
	if (avail == 0)
		return (0);

	/* Bound the size by the number of available slots */
	size = MIN(size, avail);
	/* How many contiguous bytes can we memcpy... */
	left = XS_RING_SIZE - cons;
	/* ...bounded by how much we need to read? */
	left = MIN(left, size);

	memcpy(dst, &xsr->xsr_rsp[cons], left);
	memcpy((caddr_t)dst + left, &xsr->xsr_rsp[0], size - left);
	virtio_membar_sync();
	xsr->xsr_rsp_cons += size;
	return (size);
}

void
xs_intr(void *arg)
{
	struct xs_softc *xs = arg;
	struct xs_ring *xsr = xs->xs_ring;
	struct xen_softc *sc = xs->xs_sc;
	struct xs_msg *xsm = xs->xs_rmsg;
	struct xs_msghdr xmh;
	uint32_t avail;
	int len;

	virtio_membar_sync();

	if (xsr->xsr_rsp_cons == xsr->xsr_rsp_prod)
		return;

	avail = xs_ring_avail(xsr, 0);

	/* Response processing */

 again:
	if (xs->xs_rmsg == NULL) {
		if (avail < sizeof(xmh)) {
			DPRINTF("%s: incomplete header: %u\n",
			    sc->sc_dev.dv_xname, avail);
			goto out;
		}
		avail -= sizeof(xmh);

		if ((len = xs_ring_get(xs, &xmh, sizeof(xmh))) != sizeof(xmh)) {
			printf("%s: message too short: %d\n",
			    sc->sc_dev.dv_xname, len);
			goto out;
		}

		if (xmh.xmh_type == XS_EVENT) {
			xsm = &xs->xs_emsg;
			xsm->xsm_read = 0;
		} else {
			mtx_enter(&xs->xs_reqlck);
			TAILQ_FOREACH(xsm, &xs->xs_reqs, xsm_link) {
				if (xsm->xsm_hdr.xmh_rid == xmh.xmh_rid) {
					TAILQ_REMOVE(&xs->xs_reqs, xsm,
					    xsm_link);
					break;
				}
			}
			mtx_leave(&xs->xs_reqlck);
			if (xsm == NULL) {
				printf("%s: unexpected message id %u\n",
				    sc->sc_dev.dv_xname, xmh.xmh_rid);
				goto out;
			}
		}
		memcpy(&xsm->xsm_hdr, &xmh, sizeof(xmh));
		xs->xs_rmsg = xsm;
	}

	if (xsm->xsm_hdr.xmh_len > xsm->xsm_dlen)
		xsm->xsm_error = EMSGSIZE;

	len = MIN(xsm->xsm_hdr.xmh_len - xsm->xsm_read, avail);
	if (len) {
		/* Get data if reply is not empty */
		if ((len = xs_ring_get(xs,
		    &xsm->xsm_data[xsm->xsm_read], len)) <= 0) {
			printf("%s: read failure %d\n", sc->sc_dev.dv_xname,
			    len);
			goto out;
		}
		xsm->xsm_read += len;
	}

	/* Notify reader that we've managed to read the whole message */
	if (xsm->xsm_read == xsm->xsm_hdr.xmh_len) {
		xs->xs_rmsg = NULL;
		if (xsm->xsm_hdr.xmh_type == XS_EVENT) {
			xs_event(xs, xsm);
		} else {
			mtx_enter(&xs->xs_rsplck);
			TAILQ_INSERT_TAIL(&xs->xs_rsps, xsm, xsm_link);
			mtx_leave(&xs->xs_rsplck);
			wakeup(xs->xs_rchan);
		}
	}

	if ((avail = xs_ring_avail(xsr, 0)) > 0)
		goto again;

 out:
	/* Wakeup sleeping writes (if any) */
	wakeup(xs->xs_wchan);
	xen_intr_signal(xs->xs_ih);
}

static inline int
xs_get_buf(struct xs_transaction *xst, struct xs_msg *xsm, int len)
{
	unsigned char *buf;

	buf = malloc(len, M_DEVBUF, M_ZERO | (cold ? M_NOWAIT : M_WAITOK));
	if (buf == NULL)
		return (-1);
	xsm->xsm_dlen = len;
	xsm->xsm_data = buf;
	return (0);
}

static inline void
xs_put_buf(struct xs_transaction *xst, struct xs_msg *xsm)
{
	free(xsm->xsm_data, M_DEVBUF, xsm->xsm_dlen);
	xsm->xsm_data = NULL;
}

void
xs_resfree(struct xs_transaction *xst, struct iovec *iov, int iov_cnt)
{
	int i;

	for (i = 0; i < iov_cnt; i++)
		free(iov[i].iov_base, M_DEVBUF, iov[i].iov_len);
	free(iov, M_DEVBUF, sizeof(struct iovec) * iov_cnt);
}

int
xs_parse(struct xs_transaction *xst, struct xs_msg *xsm, struct iovec **iov,
    int *iov_cnt)
{
	char *bp, *cp;
	uint32_t dlen;
	int i, flags;

	/* If the response size is zero, we return an empty string */
	dlen = MAX(xsm->xsm_hdr.xmh_len, 1);
	flags = M_ZERO | (cold ? M_NOWAIT : M_WAITOK);

	*iov_cnt = 0;
	/* Make sure that the data is NUL terminated */
	if (xsm->xsm_data[dlen - 1] != '\0') {
		/*
		 * The XS_READ operation always returns length without
		 * the trailing NUL so we have to adjust the length.
		 */
		dlen = MIN(dlen + 1, xsm->xsm_dlen);
		xsm->xsm_data[dlen - 1] = '\0';
	}
	for (i = 0; i < dlen; i++)
		if (xsm->xsm_data[i] == '\0')
			(*iov_cnt)++;
	*iov = mallocarray(*iov_cnt, sizeof(struct iovec), M_DEVBUF, flags);
	if (*iov == NULL)
		goto cleanup;
	bp = xsm->xsm_data;
	for (i = 0; i < *iov_cnt; i++) {
		cp = bp;
		while (cp - (caddr_t)xsm->xsm_data < dlen && *cp != '\0')
			cp++;
		(*iov)[i].iov_len = cp - bp + 1;
		(*iov)[i].iov_base = malloc((*iov)[i].iov_len, M_DEVBUF, flags);
		if (!(*iov)[i].iov_base) {
			xs_resfree(xst, *iov, *iov_cnt);
			goto cleanup;
		}
		memcpy((*iov)[i].iov_base, bp, (*iov)[i].iov_len);
		bp = ++cp;
	}
	return (0);

 cleanup:
	*iov = NULL;
	*iov_cnt = 0;
	return (ENOMEM);
}
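
/*
 * For example, a directory listing reply carrying the payload
 * "vbd\0vif\0console\0" (the final NUL may be absent on the wire and
 * is restored above) is parsed into three separately allocated
 * iovecs holding "vbd", "vif" and "console"; the caller releases
 * them with xs_resfree().  The names are illustrative only.
 */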

int
xs_event(struct xs_softc *xs, struct xs_msg *xsm)
{
	struct xs_watch *xsw;
	char *token = NULL;
	int i;

	for (i = 0; i < xsm->xsm_read; i++) {
		if (xsm->xsm_data[i] == '\0') {
			token = &xsm->xsm_data[i+1];
			break;
		}
	}
	if (token == NULL) {
		printf("%s: event on \"%s\" without token\n",
		    xs->xs_sc->sc_dev.dv_xname, xsm->xsm_data);
		return (-1);
	}

	mtx_enter(&xs->xs_watchlck);
	TAILQ_FOREACH(xsw, &xs->xs_watches, xsw_entry) {
		if (strcmp(xsw->xsw_token, token))
			continue;
		mtx_leave(&xs->xs_watchlck);
		task_add(xs->xs_watchtq, xsw->xsw_task);
		return (0);
	}
	mtx_leave(&xs->xs_watchlck);

	printf("%s: no watchers for node \"%s\"\n",
	    xs->xs_sc->sc_dev.dv_xname, xsm->xsm_data);
	return (-1);
}

int
xs_cmd(struct xs_transaction *xst, int cmd, const char *path,
    struct iovec **iov, int *iov_cnt)
{
	struct xs_softc *xs = xst->xst_cookie;
	struct xs_msg *xsm;
	struct iovec ov[10];	/* output vector */
	int datalen = XS_ERR_PAYLOAD;
	int ov_cnt = 0;
	enum { READ, WRITE } mode = READ;
	int i, error = 0;

	if (cmd >= XS_MAX)
		return (EINVAL);

	switch (cmd) {
	case XS_TOPEN:
		ov[0].iov_base = "";
		ov[0].iov_len = 1;
		ov_cnt++;
		break;
	case XS_TCLOSE:
	case XS_RM:
	case XS_WATCH:
	case XS_WRITE:
		mode = WRITE;
		/* FALLTHROUGH */
	default:
		if (mode == READ)
			datalen = XS_MAX_PAYLOAD;
		break;
	}

	if (path) {
		ov[ov_cnt].iov_base = (void *)path;
		ov[ov_cnt++].iov_len = strlen(path) + 1; /* +NUL */
	}

	if (mode == WRITE && iov && iov_cnt && *iov_cnt > 0) {
		for (i = 0; i < *iov_cnt && ov_cnt < nitems(ov);
		     i++, ov_cnt++) {
			ov[ov_cnt].iov_base = (*iov)[i].iov_base;
			ov[ov_cnt].iov_len = (*iov)[i].iov_len;
		}
	}

	xsm = xs_get_msg(xs, !cold);

	if (xs_get_buf(xst, xsm, datalen)) {
		xs_put_msg(xs, xsm);
		return (ENOMEM);
	}

	xsm->xsm_hdr.xmh_tid = xst->xst_id;
	xsm->xsm_hdr.xmh_type = cmd;
	xsm->xsm_hdr.xmh_rid = atomic_inc_int_nv(&xs->xs_rid);

	for (i = 0; i < ov_cnt; i++)
		xsm->xsm_hdr.xmh_len += ov[i].iov_len;

	if (xsm->xsm_hdr.xmh_len > XS_MAX_PAYLOAD) {
		printf("%s: message type %d with payload above the limit\n",
		    xs->xs_sc->sc_dev.dv_xname, cmd);
		xs_put_buf(xst, xsm);
		xs_put_msg(xs, xsm);
		return (EIO);
	}

	if (xs_start(xst, xsm, ov, ov_cnt)) {
		printf("%s: message type %d transmission failed\n",
		    xs->xs_sc->sc_dev.dv_xname, cmd);
		xs_put_buf(xst, xsm);
		xs_put_msg(xs, xsm);
		return (EIO);
	}

	xsm = xs_reply(xst, xsm->xsm_hdr.xmh_rid);

	if (xsm->xsm_hdr.xmh_type == XS_ERROR) {
		error = xs_geterror(xsm);
		DPRINTF("%s: xenstore request %d \"%s\" error %s\n",
		    xs->xs_sc->sc_dev.dv_xname, cmd, path, xsm->xsm_data);
	} else if (xsm->xsm_error != 0)
		error = xsm->xsm_error;
	else if (mode == READ) {
		KASSERT(iov && iov_cnt);
		error = xs_parse(xst, xsm, iov, iov_cnt);
	}
#ifdef XS_DEBUG
	else
		if (strcmp(xsm->xsm_data, "OK"))
			printf("%s: xenstore request %d failed: %s\n",
			    xs->xs_sc->sc_dev.dv_xname, cmd, xsm->xsm_data);
#endif

	xs_put_buf(xst, xsm);
	xs_put_msg(xs, xsm);

	return (error);
}
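
/*
 * A minimal sketch of driving xs_cmd() directly, mirroring what the
 * xs_getprop() wrapper below does: issue an XS_READ outside of any
 * transaction and free the parsed result ("device" is just an
 * illustrative path):
 *
 *	struct xs_transaction xst;
 *	struct iovec *iov = NULL;
 *	int iov_cnt = 0, error;
 *
 *	memset(&xst, 0, sizeof(xst));
 *	xst.xst_cookie = sc->sc_xs;
 *	error = xs_cmd(&xst, XS_READ, "device", &iov, &iov_cnt);
 *	if (error == 0)
 *		xs_resfree(&xst, iov, iov_cnt);
 */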

int
xs_watch(void *xsc, const char *path, const char *property, struct task *task,
    void (*cb)(void *), void *arg)
{
	struct xen_softc *sc = xsc;
	struct xs_softc *xs = sc->sc_xs;
	struct xs_transaction xst;
	struct xs_watch *xsw;
	struct iovec iov, *iovp = &iov;
	char key[256];
	int error, iov_cnt, ret;

	memset(&xst, 0, sizeof(xst));
	xst.xst_id = 0;
	xst.xst_cookie = sc->sc_xs;

	xsw = malloc(sizeof(*xsw), M_DEVBUF, M_NOWAIT | M_ZERO);
	if (xsw == NULL)
		return (-1);

	task_set(task, cb, arg);
	xsw->xsw_task = task;

	snprintf(xsw->xsw_token, sizeof(xsw->xsw_token), "%0lx",
	    (unsigned long)xsw);

	if (path)
		ret = snprintf(key, sizeof(key), "%s/%s", path, property);
	else
		ret = snprintf(key, sizeof(key), "%s", property);
	if (ret == -1 || ret >= sizeof(key)) {
		free(xsw, M_DEVBUF, sizeof(*xsw));
		return (EINVAL);
	}

	iov.iov_base = xsw->xsw_token;
	iov.iov_len = sizeof(xsw->xsw_token);
	iov_cnt = 1;

	/*
	 * xs_watches must be prepared pre-emptively because a xenstore
	 * event is raised immediately after a watch is established.
	 */
	mtx_enter(&xs->xs_watchlck);
	TAILQ_INSERT_TAIL(&xs->xs_watches, xsw, xsw_entry);
	mtx_leave(&xs->xs_watchlck);

	if ((error = xs_cmd(&xst, XS_WATCH, key, &iovp, &iov_cnt)) != 0) {
		mtx_enter(&xs->xs_watchlck);
		TAILQ_REMOVE(&xs->xs_watches, xsw, xsw_entry);
		mtx_leave(&xs->xs_watchlck);
		free(xsw, M_DEVBUF, sizeof(*xsw));
		return (error);
	}

	return (0);
}
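
/*
 * A hypothetical caller provides the task storage and a callback,
 * which then runs on the "xenwatch" taskq every time the watched
 * node changes (the node and function names below are illustrative,
 * not part of this driver):
 *
 *	void
 *	mydrv_backend_changed(void *arg)
 *	{
 *		struct mydrv_softc *sc = arg;
 *
 *		...re-read the node and react to the new state...
 *	}
 *
 *	xs_watch(sc, "device/vif/0", "backend-state", &sc->sc_task,
 *	    mydrv_backend_changed, sc);
 */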

static unsigned long long
atoull(const char *cp, int *error)
{
	unsigned long long res, cutoff;
	int ch;
	int cutlim;

	res = 0;
	cutoff = ULLONG_MAX / (unsigned long long)10;
	cutlim = ULLONG_MAX % (unsigned long long)10;

	do {
		if (*cp < '0' || *cp > '9') {
			*error = EINVAL;
			return (res);
		}
		ch = *cp - '0';
		if (res > cutoff || (res == cutoff && ch > cutlim)) {
			*error = ERANGE;
			return (res);
		}
		res *= 10;
		res += ch;
	} while (*(++cp) != '\0');

	*error = 0;
	return (res);
}
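
/*
 * atoull() accepts nothing but decimal digits: "512" yields 512 with
 * *error set to 0, "512k" and "" fail with EINVAL, and values beyond
 * ULLONG_MAX fail with ERANGE; on failure the digits accumulated so
 * far are returned.
 */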

int
xs_getnum(void *xsc, const char *path, const char *property,
    unsigned long long *val)
{
	char *buf;
	int error = 0;

	buf = malloc(XS_MAX_PAYLOAD, M_DEVBUF, M_ZERO |
	    (cold ? M_NOWAIT : M_WAITOK));
	if (buf == NULL)
		return (ENOMEM);

	error = xs_getprop(xsc, path, property, buf, XS_MAX_PAYLOAD);
	if (error)
		goto out;

	*val = atoull(buf, &error);

 out:
	free(buf, M_DEVBUF, XS_MAX_PAYLOAD);
	return (error);
}

int
xs_setnum(void *xsc, const char *path, const char *property,
    unsigned long long val)
{
	char buf[32];
	int ret;

	ret = snprintf(buf, sizeof(buf), "%llu", val);
	if (ret == -1 || ret >= sizeof(buf))
		return (ERANGE);

	return (xs_setprop(xsc, path, property, buf, strlen(buf)));
}

int
xs_getprop(void *xsc, const char *path, const char *property, char *value,
    int size)
{
	struct xen_softc *sc = xsc;
	struct xs_transaction xst;
	struct iovec *iovp = NULL;
	char key[256];
	int error, ret, iov_cnt = 0;

	if (!property)
		return (EINVAL);

	memset(&xst, 0, sizeof(xst));
	xst.xst_id = 0;
	xst.xst_cookie = sc->sc_xs;

	if (path)
		ret = snprintf(key, sizeof(key), "%s/%s", path, property);
	else
		ret = snprintf(key, sizeof(key), "%s", property);
	if (ret == -1 || ret >= sizeof(key))
		return (EINVAL);

	if ((error = xs_cmd(&xst, XS_READ, key, &iovp, &iov_cnt)) != 0)
		return (error);

	if (iov_cnt > 0)
		strlcpy(value, (char *)iovp->iov_base, size);

	xs_resfree(&xst, iovp, iov_cnt);

	return (0);
}

int
xs_setprop(void *xsc, const char *path, const char *property, char *value,
    int size)
{
	struct xen_softc *sc = xsc;
	struct xs_transaction xst;
	struct iovec iov, *iovp = &iov;
	char key[256];
	int error, ret, iov_cnt = 0;

	if (!property)
		return (EINVAL);

	memset(&xst, 0, sizeof(xst));
	xst.xst_id = 0;
	xst.xst_cookie = sc->sc_xs;

	if (path)
		ret = snprintf(key, sizeof(key), "%s/%s", path, property);
	else
		ret = snprintf(key, sizeof(key), "%s", property);
	if (ret == -1 || ret >= sizeof(key))
		return (EINVAL);

	iov.iov_base = value;
	iov.iov_len = size;
	iov_cnt = 1;

	error = xs_cmd(&xst, XS_WRITE, key, &iovp, &iov_cnt);

	return (error);
}

int
xs_cmpprop(void *xsc, const char *path, const char *property, const char *value,
    int *result)
{
	struct xen_softc *sc = xsc;
	struct xs_transaction xst;
	struct iovec *iovp = NULL;
	char key[256];
	int error, ret, iov_cnt = 0;

	if (!property)
		return (EINVAL);

	memset(&xst, 0, sizeof(xst));
	xst.xst_id = 0;
	xst.xst_cookie = sc->sc_xs;

	if (path)
		ret = snprintf(key, sizeof(key), "%s/%s", path, property);
	else
		ret = snprintf(key, sizeof(key), "%s", property);
	if (ret == -1 || ret >= sizeof(key))
		return (EINVAL);

	if ((error = xs_cmd(&xst, XS_READ, key, &iovp, &iov_cnt)) != 0)
		return (error);

	*result = strcmp(value, (char *)iovp->iov_base);

	xs_resfree(&xst, iovp, iov_cnt);

	return (0);
}

int
xs_await_transition(void *xsc, const char *path, const char *property,
    const char *value, int timo)
{
	struct xen_softc *sc = xsc;
	int error, res;

	do {
		error = xs_cmpprop(xsc, path, property, value, &res);
		if (error)
			return (error);
		if (timo && --timo == 0)
			return (ETIMEDOUT);
		xs_poll(sc->sc_xs, cold);
	} while (res != 0);

	return (0);
}
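
/*
 * Typical (hypothetical) use: poll until the backend advertises the
 * XenbusStateConnected encoding of "4", giving up after 100 poll
 * intervals; the path and the state encoding are Xen conventions
 * assumed for the example:
 *
 *	if (xs_await_transition(sc, "device/vbd/768", "state", "4", 100))
 *		printf("backend failed to connect\n");
 */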

int
xs_kvop(void *xsc, int op, char *key, char *value, size_t valuelen)
{
	struct xen_softc *sc = xsc;
	struct xs_transaction xst;
	struct iovec iov, *iovp = &iov;
	int error = 0, iov_cnt = 0, cmd, i;

	switch (op) {
	case PVBUS_KVWRITE:
		cmd = XS_WRITE;
		iov.iov_base = value;
		iov.iov_len = strlen(value);
		iov_cnt = 1;
		break;
	case PVBUS_KVREAD:
		cmd = XS_READ;
		break;
	case PVBUS_KVLS:
		cmd = XS_LIST;
		break;
	default:
		return (EOPNOTSUPP);
	}

	memset(&xst, 0, sizeof(xst));
	xst.xst_id = 0;
	xst.xst_cookie = sc->sc_xs;

	if ((error = xs_cmd(&xst, cmd, key, &iovp, &iov_cnt)) != 0)
		return (error);

	memset(value, 0, valuelen);

	switch (cmd) {
	case XS_READ:
		if (iov_cnt == 1 && iovp[0].iov_len == 1) {
			xs_resfree(&xst, iovp, iov_cnt);

			/*
			 * We cannot distinguish if the returned value is
			 * a directory or a file in the xenstore.  The only
			 * indication is that the read value of a directory
			 * returns an empty string (single nul byte),
			 * so try to get the directory list in this case.
			 */
			return (xs_kvop(xsc, PVBUS_KVLS, key, value, valuelen));
		}
		/* FALLTHROUGH */
	case XS_LIST:
		for (i = 0; i < iov_cnt; i++) {
			if (i && strlcat(value, "\n", valuelen) >= valuelen)
				break;
			if (strlcat(value, iovp[i].iov_base,
			    valuelen) >= valuelen)
				break;
		}
		xs_resfree(&xst, iovp, iov_cnt);
		break;
	default:
		break;
	}

	return (0);
}