/*	$OpenBSD: xenstore.c,v 1.50 2024/05/24 10:05:55 jsg Exp $	*/

/*
 * Copyright (c) 2015 Mike Belopuhov
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/atomic.h>
#include <sys/malloc.h>
#include <sys/device.h>
#include <sys/mutex.h>
#include <sys/rwlock.h>
#include <sys/task.h>

#include <machine/bus.h>

#include <uvm/uvm_extern.h>

#include <dev/pv/pvvar.h>
#include <dev/pv/xenreg.h>
#include <dev/pv/xenvar.h>

/* #define XS_DEBUG */

#ifdef XS_DEBUG
#define DPRINTF(x...)		printf(x)
#else
#define DPRINTF(x...)
#endif

/*
 * The XenStore interface is a simple storage system that is a means of
 * communicating state and configuration data between the Xen Domain 0
 * and the various guest domains.  All configuration data, other than
 * the small amount of essential information required during the early
 * boot of a Xen-aware guest, is managed using the XenStore.
 *
 * The XenStore is ASCII string based and has a structure and semantics
 * similar to a filesystem: there are files and directories, and
 * directories may contain files or other directories.  The depth of
 * the hierarchy is only limited by the XenStore's maximum path length.
 *
 * The communication channel between the XenStore service and other
 * domains is a pair of guest-specific ring buffers in a shared memory
 * area.  One ring buffer is used for communicating in each direction.
 * The location of this shared memory page is given to an HVM guest
 * via a hypercall (see HVM_PARAM_STORE_PFN in xs_attach() below).
 *
 * The XenStore communication relies on an event channel and thus
 * interrupts.  Several Xen services depend on the XenStore, most
 * notably the XenBus used to discover and manage Xen devices.
 */

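/*
 * A minimal usage sketch (illustrative only, not compiled here): a Xen
 * device driver reads its configuration with xs_getprop() and registers
 * a watch with xs_watch() so a callback runs whenever the backend
 * changes a node.  The "device/vif/0" path and the xnf_backend_changed()
 * callback are assumptions made up for this example; "sc" stands for
 * the driver's xen_softc pointer.
 *
 *	char backend[256];
 *	struct task task;
 *
 *	if (xs_getprop(sc, "device/vif/0", "backend", backend,
 *	    sizeof(backend)) == 0)
 *		xs_watch(sc, backend, "state", &task,
 *		    xnf_backend_changed, sc);
 */
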
const struct {
	const char		*xse_errstr;
	int			 xse_errnum;
} xs_errors[] = {
	{ "EINVAL",	EINVAL },
	{ "EACCES",	EACCES },
	{ "EEXIST",	EEXIST },
	{ "EISDIR",	EISDIR },
	{ "ENOENT",	ENOENT },
	{ "ENOMEM",	ENOMEM },
	{ "ENOSPC",	ENOSPC },
	{ "EIO",	EIO },
	{ "ENOTEMPTY",	ENOTEMPTY },
	{ "ENOSYS",	ENOSYS },
	{ "EROFS",	EROFS },
	{ "EBUSY",	EBUSY },
	{ "EAGAIN",	EAGAIN },
	{ "EISCONN",	EISCONN },
	{ NULL,		-1 },
};

struct xs_msghdr {
	/* Message type */
	uint32_t		 xmh_type;
	/* Request identifier, echoed in daemon's response.  */
	uint32_t		 xmh_rid;
	/* Transaction id (0 if not related to a transaction). */
	uint32_t		 xmh_tid;
	/* Length of data following this. */
	uint32_t		 xmh_len;
	/* Generally followed by nul-terminated string(s). */
} __packed;

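/*
 * Wire format example (the node name is illustrative): an XS_READ
 * request for "device/vif/0/mac" is a header with xmh_type = XS_READ,
 * a freshly allocated xmh_rid, xmh_tid = 0 (no transaction) and
 * xmh_len = 17, followed by the 17 payload bytes "device/vif/0/mac\0".
 * The daemon echoes xmh_rid in its response, which is how xs_reply()
 * matches replies to requests.
 */
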
/*
 * The minimum output buffer size needed to store an error string.
 */
#define XS_ERR_PAYLOAD		16

/*
 * Although the Xen source code implies that the limit is 4k,
 * in practice it turns out that we can only send 2k bytes of
 * payload before receiving an ENOSPC.  We set it to an even
 * smaller value, however, because there's no real need to use
 * large buffers for anything.
 */
#define XS_MAX_PAYLOAD		1024

struct xs_msg {
	struct xs_msghdr	 xsm_hdr;
	uint32_t		 xsm_read;
	uint32_t		 xsm_dlen;
	int			 xsm_error;
	uint8_t			*xsm_data;
	TAILQ_ENTRY(xs_msg)	 xsm_link;
};
TAILQ_HEAD(xs_msgq, xs_msg);

#define XS_RING_SIZE		1024

struct xs_ring {
	uint8_t			xsr_req[XS_RING_SIZE];
	uint8_t			xsr_rsp[XS_RING_SIZE];
	uint32_t		xsr_req_cons;
	uint32_t		xsr_req_prod;
	uint32_t		xsr_rsp_cons;
	uint32_t		xsr_rsp_prod;
} __packed;

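/*
 * The producer and consumer indices above are free-running counters;
 * only the low bits (index & (XS_RING_SIZE - 1)) are used to address
 * the buffers, see xs_ring_put() and xs_ring_get().  A worked example
 * with assumed values: with xsr_req_cons = 1020 and xsr_req_prod =
 * 1030, ten bytes are pending, XS_RING_SIZE - 10 bytes are free and
 * the next byte is written at offset 1030 & 1023 = 6, i.e. the ring
 * has wrapped around.
 */
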
#define XST_DELAY		1	/* in seconds */

#define XSW_TOKLEN		(sizeof(void *) * 2 + 1)

struct xs_watch {
	TAILQ_ENTRY(xs_watch)	 xsw_entry;
	uint8_t			 xsw_token[XSW_TOKLEN];
	struct task		*xsw_task;
};

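/*
 * The watch token is the address of the xs_watch structure printed as
 * hex by xs_watch() ("%0lx"), hence XSW_TOKLEN is two characters per
 * pointer byte plus a terminating NUL.  The token is sent with the
 * XS_WATCH request and comes back verbatim in XS_EVENT messages, where
 * xs_event() uses it to locate the watcher.
 */
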
/*
 * Container for all XenStore related state.
 */
struct xs_softc {
	struct xen_softc	*xs_sc;

	evtchn_port_t		 xs_port;
	xen_intr_handle_t	 xs_ih;

	struct xs_ring		*xs_ring;

	struct xs_msg		 xs_msgs[10];
	struct xs_msg		*xs_rmsg;

	struct xs_msgq		 xs_free;
	struct xs_msgq		 xs_reqs;
	struct xs_msgq		 xs_rsps;

	volatile uint		 xs_rid;

	const char		*xs_wchan;
	const char		*xs_rchan;

	struct mutex		 xs_reqlck;	/* request queue mutex */
	struct mutex		 xs_rsplck;	/* response queue mutex */
	struct mutex		 xs_frqlck;	/* free queue mutex */

	TAILQ_HEAD(, xs_watch)	 xs_watches;
	struct mutex		 xs_watchlck;
	struct xs_msg		 xs_emsg;
	struct taskq		*xs_watchtq;

	struct rwlock		 xs_rnglck;
};

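/*
 * Message flow, as implemented below: xs_get_msg() takes an xs_msg off
 * xs_free, xs_start() writes it to the ring and appends it to xs_reqs,
 * xs_intr() matches the response by request id and moves the message
 * to xs_rsps (or dispatches it via xs_event() for watch events),
 * xs_reply() dequeues it for the caller and xs_put_msg() finally
 * returns it to xs_free.
 */
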
struct xs_msg *
	xs_get_msg(struct xs_softc *, int);
void	xs_put_msg(struct xs_softc *, struct xs_msg *);
int	xs_ring_get(struct xs_softc *, void *, size_t);
int	xs_ring_put(struct xs_softc *, void *, size_t);
void	xs_intr(void *);
void	xs_poll(struct xs_softc *, int);
int	xs_output(struct xs_transaction *, uint8_t *, int);
int	xs_start(struct xs_transaction *, struct xs_msg *, struct iovec *, int);
struct xs_msg *
	xs_reply(struct xs_transaction *, uint);
int	xs_parse(struct xs_transaction *, struct xs_msg *, struct iovec **,
	    int *);
int	xs_event(struct xs_softc *, struct xs_msg *);

int
xs_attach(struct xen_softc *sc)
{
	struct xen_hvm_param xhv;
	struct xs_softc *xs;
	paddr_t pa;
	int i;

	if ((xs = malloc(sizeof(*xs), M_DEVBUF, M_NOWAIT | M_ZERO)) == NULL) {
		printf(": failed to allocate xenstore softc\n");
		return (-1);
	}
	sc->sc_xs = xs;
	xs->xs_sc = sc;

	/* Fetch event channel port */
	memset(&xhv, 0, sizeof(xhv));
	xhv.domid = DOMID_SELF;
	xhv.index = HVM_PARAM_STORE_EVTCHN;
	if (xen_hypercall(sc, XC_HVM, 2, HVMOP_get_param, &xhv)) {
		printf(": failed to obtain a xenstore event channel\n");
		goto fail_1;
	}
	xs->xs_port = xhv.value;

	printf(", event channel %u\n", xs->xs_port);

	/* Fetch a frame number (PA) of a shared xenstore page */
	memset(&xhv, 0, sizeof(xhv));
	xhv.domid = DOMID_SELF;
	xhv.index = HVM_PARAM_STORE_PFN;
	if (xen_hypercall(sc, XC_HVM, 2, HVMOP_get_param, &xhv))
		goto fail_1;
	pa = ptoa(xhv.value);
	/* Allocate a page of virtual memory */
	xs->xs_ring = km_alloc(PAGE_SIZE, &kv_any, &kp_none, &kd_nowait);
	if (xs->xs_ring == NULL)
		goto fail_1;
	/* Map the xenstore page into our KVA */
	pa |= PMAP_NOCACHE;
	pmap_kenter_pa((vaddr_t)xs->xs_ring, pa, PROT_READ | PROT_WRITE);
	pmap_update(pmap_kernel());

	if (xen_intr_establish(xs->xs_port, &xs->xs_ih, 0, xs_intr, xs,
	    sc->sc_dev.dv_xname))
		goto fail_2;

	xs->xs_wchan = "xswrite";
	xs->xs_rchan = "xsread";

	TAILQ_INIT(&xs->xs_free);
	TAILQ_INIT(&xs->xs_reqs);
	TAILQ_INIT(&xs->xs_rsps);
	for (i = 0; i < nitems(xs->xs_msgs); i++)
		TAILQ_INSERT_TAIL(&xs->xs_free, &xs->xs_msgs[i], xsm_link);

	mtx_init(&xs->xs_reqlck, IPL_NET);
	mtx_init(&xs->xs_rsplck, IPL_NET);
	mtx_init(&xs->xs_frqlck, IPL_NET);

	rw_init(&xs->xs_rnglck, "xsrnglck");

	xs->xs_watchtq = taskq_create("xenwatch", 1, IPL_NET, 0);

	mtx_init(&xs->xs_watchlck, IPL_NET);
	TAILQ_INIT(&xs->xs_watches);

	xs->xs_emsg.xsm_data = malloc(XS_MAX_PAYLOAD, M_DEVBUF,
	    M_ZERO | M_NOWAIT);
	if (xs->xs_emsg.xsm_data == NULL)
		goto fail_2;
	xs->xs_emsg.xsm_dlen = XS_MAX_PAYLOAD;

	return (0);

 fail_2:
	pmap_kremove((vaddr_t)xs->xs_ring, PAGE_SIZE);
	pmap_update(pmap_kernel());
	km_free(xs->xs_ring, PAGE_SIZE, &kv_any, &kp_none);
	xs->xs_ring = NULL;
 fail_1:
	free(xs, sizeof(*xs), M_DEVBUF);
	sc->sc_xs = NULL;
	return (-1);
}

struct xs_msg *
xs_get_msg(struct xs_softc *xs, int waitok)
{
	static const char *chan = "xsalloc";
	struct xs_msg *xsm;

	mtx_enter(&xs->xs_frqlck);
	for (;;) {
		xsm = TAILQ_FIRST(&xs->xs_free);
		if (xsm != NULL) {
			TAILQ_REMOVE(&xs->xs_free, xsm, xsm_link);
			break;
		}
		if (!waitok) {
			mtx_leave(&xs->xs_frqlck);
			delay(XST_DELAY * 1000 >> 2);
			mtx_enter(&xs->xs_frqlck);
		} else
			msleep_nsec(chan, &xs->xs_frqlck, PRIBIO, chan,
			    SEC_TO_NSEC(XST_DELAY) >> 2);
	}
	mtx_leave(&xs->xs_frqlck);
	return (xsm);
}

void
xs_put_msg(struct xs_softc *xs, struct xs_msg *xsm)
{
	memset(xsm, 0, sizeof(*xsm));
	mtx_enter(&xs->xs_frqlck);
	TAILQ_INSERT_TAIL(&xs->xs_free, xsm, xsm_link);
	mtx_leave(&xs->xs_frqlck);
}

int
xs_geterror(struct xs_msg *xsm)
{
	int i;

	/* Stop at the NULL sentinel so unknown errors fall through */
	for (i = 0; xs_errors[i].xse_errstr != NULL; i++)
		if (strcmp(xs_errors[i].xse_errstr, xsm->xsm_data) == 0)
			return (xs_errors[i].xse_errnum);
	return (EOPNOTSUPP);
}

static inline uint32_t
xs_ring_avail(struct xs_ring *xsr, int req)
{
	uint32_t cons = req ? xsr->xsr_req_cons : xsr->xsr_rsp_cons;
	uint32_t prod = req ? xsr->xsr_req_prod : xsr->xsr_rsp_prod;

	KASSERT(prod - cons <= XS_RING_SIZE);
	return (req ? XS_RING_SIZE - (prod - cons) : prod - cons);
}

void
xs_poll(struct xs_softc *xs, int nosleep)
{
	int s;

	if (nosleep) {
		delay(XST_DELAY * 1000 >> 2);
		s = splnet();
		xs_intr(xs);
		splx(s);
	} else {
		tsleep_nsec(xs->xs_wchan, PRIBIO, xs->xs_wchan,
		    SEC_TO_NSEC(XST_DELAY) >> 2);
	}
}

int
xs_output(struct xs_transaction *xst, uint8_t *bp, int len)
{
	struct xs_softc *xs = xst->xst_cookie;
	int chunk;

	while (len > 0) {
		chunk = xs_ring_put(xs, bp, MIN(len, XS_RING_SIZE));
		if (chunk < 0)
			return (-1);
		if (chunk > 0) {
			len -= chunk;
			bp += chunk;
			if (xs_ring_avail(xs->xs_ring, 1) > 0)
				continue;
		}
		/* Squeaky wheel gets the kick */
		xen_intr_signal(xs->xs_ih);
		/*
		 * Either chunk == 0 and we need to wait for the
		 * hypervisor to consume what has already been
		 * written, or we have managed to fill the ring
		 * and must wait for it to collect the data.
		 */
		while (xs->xs_ring->xsr_req_prod != xs->xs_ring->xsr_req_cons)
			xs_poll(xs, 1);
	}
	return (0);
}

int
xs_start(struct xs_transaction *xst, struct xs_msg *xsm, struct iovec *iov,
    int iov_cnt)
{
	struct xs_softc *xs = xst->xst_cookie;
	int i;

	rw_enter_write(&xs->xs_rnglck);

	/* Header */
	if (xs_output(xst, (uint8_t *)&xsm->xsm_hdr,
	    sizeof(xsm->xsm_hdr)) == -1) {
		printf("%s: failed to write the header\n", __func__);
		rw_exit_write(&xs->xs_rnglck);
		return (-1);
	}

	/* Data loop */
	for (i = 0; i < iov_cnt; i++) {
		if (xs_output(xst, iov[i].iov_base, iov[i].iov_len) == -1) {
			printf("%s: failed on iovec #%d len %lu\n", __func__,
			    i, iov[i].iov_len);
			rw_exit_write(&xs->xs_rnglck);
			return (-1);
		}
	}

	mtx_enter(&xs->xs_reqlck);
	TAILQ_INSERT_TAIL(&xs->xs_reqs, xsm, xsm_link);
	mtx_leave(&xs->xs_reqlck);

	xen_intr_signal(xs->xs_ih);

	rw_exit_write(&xs->xs_rnglck);

	return (0);
}

struct xs_msg *
xs_reply(struct xs_transaction *xst, uint rid)
{
	struct xs_softc *xs = xst->xst_cookie;
	struct xs_msg *xsm;
	int s;

	mtx_enter(&xs->xs_rsplck);
	for (;;) {
		TAILQ_FOREACH(xsm, &xs->xs_rsps, xsm_link) {
			if (xsm->xsm_hdr.xmh_tid == xst->xst_id &&
			    xsm->xsm_hdr.xmh_rid == rid)
				break;
		}
		if (xsm != NULL) {
			TAILQ_REMOVE(&xs->xs_rsps, xsm, xsm_link);
			break;
		}
		if (cold) {
			mtx_leave(&xs->xs_rsplck);
			delay(XST_DELAY * 1000 >> 2);
			s = splnet();
			xs_intr(xs);
			splx(s);
			mtx_enter(&xs->xs_rsplck);
		} else
			msleep_nsec(xs->xs_rchan, &xs->xs_rsplck, PRIBIO,
			    xs->xs_rchan, SEC_TO_NSEC(XST_DELAY) >> 2);
	}
	mtx_leave(&xs->xs_rsplck);
	return (xsm);
}

int
xs_ring_put(struct xs_softc *xs, void *src, size_t size)
{
	struct xs_ring *xsr = xs->xs_ring;
	uint32_t prod = xsr->xsr_req_prod & (XS_RING_SIZE - 1);
	uint32_t avail = xs_ring_avail(xsr, 1);
	size_t left;

	if (size > XS_RING_SIZE)
		return (-1);
	if (avail == 0)
		return (0);

	/* Bound the size by the number of available slots */
	size = MIN(size, avail);
	/* How many contiguous bytes can we memcpy... */
	left = XS_RING_SIZE - prod;
	/* ...bounded by how much we need to write? */
	left = MIN(left, size);

	memcpy(&xsr->xsr_req[prod], src, left);
	memcpy(&xsr->xsr_req[0], (caddr_t)src + left, size - left);
	virtio_membar_sync();
	xsr->xsr_req_prod += size;
	return (size);
}

int
xs_ring_get(struct xs_softc *xs, void *dst, size_t size)
{
	struct xs_ring *xsr = xs->xs_ring;
	uint32_t cons = xsr->xsr_rsp_cons & (XS_RING_SIZE - 1);
	uint32_t avail = xs_ring_avail(xsr, 0);
	size_t left;

	if (size > XS_RING_SIZE)
		return (-1);
	if (avail == 0)
		return (0);

	/* Bound the size by the number of available slots */
	size = MIN(size, avail);
	/* How many contiguous bytes can we memcpy... */
	left = XS_RING_SIZE - cons;
	/* ...bounded by how much we need to read? */
	left = MIN(left, size);

	memcpy(dst, &xsr->xsr_rsp[cons], left);
	memcpy((caddr_t)dst + left, &xsr->xsr_rsp[0], size - left);
	virtio_membar_sync();
	xsr->xsr_rsp_cons += size;
	return (size);
}

void
xs_intr(void *arg)
{
	struct xs_softc *xs = arg;
	struct xs_ring *xsr = xs->xs_ring;
	struct xen_softc *sc = xs->xs_sc;
	struct xs_msg *xsm = xs->xs_rmsg;
	struct xs_msghdr xmh;
	uint32_t avail;
	int len;

	virtio_membar_sync();

	if (xsr->xsr_rsp_cons == xsr->xsr_rsp_prod)
		return;

	avail = xs_ring_avail(xsr, 0);

	/* Response processing */

 again:
	if (xs->xs_rmsg == NULL) {
		if (avail < sizeof(xmh)) {
			DPRINTF("%s: incomplete header: %u\n",
			    sc->sc_dev.dv_xname, avail);
			goto out;
		}
		avail -= sizeof(xmh);

		if ((len = xs_ring_get(xs, &xmh, sizeof(xmh))) != sizeof(xmh)) {
			printf("%s: message too short: %d\n",
			    sc->sc_dev.dv_xname, len);
			goto out;
		}

		if (xmh.xmh_type == XS_EVENT) {
			xsm = &xs->xs_emsg;
			xsm->xsm_read = 0;
		} else {
			mtx_enter(&xs->xs_reqlck);
			TAILQ_FOREACH(xsm, &xs->xs_reqs, xsm_link) {
				if (xsm->xsm_hdr.xmh_rid == xmh.xmh_rid) {
					TAILQ_REMOVE(&xs->xs_reqs, xsm,
					    xsm_link);
					break;
				}
			}
			mtx_leave(&xs->xs_reqlck);
			if (xsm == NULL) {
				printf("%s: unexpected message id %u\n",
				    sc->sc_dev.dv_xname, xmh.xmh_rid);
				goto out;
			}
		}
		memcpy(&xsm->xsm_hdr, &xmh, sizeof(xmh));
		xs->xs_rmsg = xsm;
	}

	if (xsm->xsm_hdr.xmh_len > xsm->xsm_dlen)
		xsm->xsm_error = EMSGSIZE;

	len = MIN(xsm->xsm_hdr.xmh_len - xsm->xsm_read, avail);
	if (len) {
		/* Get data if reply is not empty */
		if ((len = xs_ring_get(xs,
		    &xsm->xsm_data[xsm->xsm_read], len)) <= 0) {
			printf("%s: read failure %d\n", sc->sc_dev.dv_xname,
			    len);
			goto out;
		}
		xsm->xsm_read += len;
	}

	/* Notify reader that we've managed to read the whole message */
	if (xsm->xsm_read == xsm->xsm_hdr.xmh_len) {
		xs->xs_rmsg = NULL;
		if (xsm->xsm_hdr.xmh_type == XS_EVENT) {
			xs_event(xs, xsm);
		} else {
			mtx_enter(&xs->xs_rsplck);
			TAILQ_INSERT_TAIL(&xs->xs_rsps, xsm, xsm_link);
			mtx_leave(&xs->xs_rsplck);
			wakeup(xs->xs_rchan);
		}
	}

	if ((avail = xs_ring_avail(xsr, 0)) > 0)
		goto again;

 out:
	/* Wake up sleeping writers (if any) */
	wakeup(xs->xs_wchan);
	xen_intr_signal(xs->xs_ih);
}

static inline int
xs_get_buf(struct xs_transaction *xst, struct xs_msg *xsm, int len)
{
	unsigned char *buf;

	buf = malloc(len, M_DEVBUF, M_ZERO | (cold ? M_NOWAIT : M_WAITOK));
	if (buf == NULL)
		return (-1);
	xsm->xsm_dlen = len;
	xsm->xsm_data = buf;
	return (0);
}

static inline void
xs_put_buf(struct xs_transaction *xst, struct xs_msg *xsm)
{
	free(xsm->xsm_data, M_DEVBUF, xsm->xsm_dlen);
	xsm->xsm_data = NULL;
}

void
xs_resfree(struct xs_transaction *xst, struct iovec *iov, int iov_cnt)
{
	int i;

	for (i = 0; i < iov_cnt; i++)
		free(iov[i].iov_base, M_DEVBUF, iov[i].iov_len);
	free(iov, M_DEVBUF, sizeof(struct iovec) * iov_cnt);
}

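/*
 * xs_parse() splits a NUL separated response payload into individual
 * strings.  Example with made-up contents: an XS_LIST reply whose 15
 * payload bytes are "backend\0console" is first NUL terminated and
 * then split into two iovecs, "backend" and "console", each including
 * its trailing NUL.
 */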
int
xs_parse(struct xs_transaction *xst, struct xs_msg *xsm, struct iovec **iov,
    int *iov_cnt)
{
	char *bp, *cp;
	uint32_t dlen;
	int i, flags;

	/* If the response size is zero, we return an empty string */
	dlen = MAX(xsm->xsm_hdr.xmh_len, 1);
	flags = M_ZERO | (cold ? M_NOWAIT : M_WAITOK);

	*iov_cnt = 0;
	/* Make sure that the data is NUL terminated */
	if (xsm->xsm_data[dlen - 1] != '\0') {
		/*
		 * The XS_READ operation always returns length without
		 * the trailing NUL so we have to adjust the length.
		 */
		dlen = MIN(dlen + 1, xsm->xsm_dlen);
		xsm->xsm_data[dlen - 1] = '\0';
	}
	for (i = 0; i < dlen; i++)
		if (xsm->xsm_data[i] == '\0')
			(*iov_cnt)++;
	*iov = mallocarray(*iov_cnt, sizeof(struct iovec), M_DEVBUF, flags);
	if (*iov == NULL)
		goto cleanup;
	bp = xsm->xsm_data;
	for (i = 0; i < *iov_cnt; i++) {
		cp = bp;
		while (cp - (caddr_t)xsm->xsm_data < dlen && *cp != '\0')
			cp++;
		(*iov)[i].iov_len = cp - bp + 1;
		(*iov)[i].iov_base = malloc((*iov)[i].iov_len, M_DEVBUF, flags);
		if (!(*iov)[i].iov_base) {
			xs_resfree(xst, *iov, *iov_cnt);
			goto cleanup;
		}
		memcpy((*iov)[i].iov_base, bp, (*iov)[i].iov_len);
		bp = ++cp;
	}
	return (0);

 cleanup:
	*iov = NULL;
	*iov_cnt = 0;
	return (ENOMEM);
}

int
xs_event(struct xs_softc *xs, struct xs_msg *xsm)
{
	struct xs_watch *xsw;
	char *token = NULL;
	int i;

	for (i = 0; i < xsm->xsm_read; i++) {
		if (xsm->xsm_data[i] == '\0') {
			token = &xsm->xsm_data[i+1];
			break;
		}
	}
	if (token == NULL) {
		printf("%s: event on \"%s\" without token\n",
		    xs->xs_sc->sc_dev.dv_xname, xsm->xsm_data);
		return (-1);
	}

	mtx_enter(&xs->xs_watchlck);
	TAILQ_FOREACH(xsw, &xs->xs_watches, xsw_entry) {
		if (strcmp(xsw->xsw_token, token))
			continue;
		mtx_leave(&xs->xs_watchlck);
		task_add(xs->xs_watchtq, xsw->xsw_task);
		return (0);
	}
	mtx_leave(&xs->xs_watchlck);

	printf("%s: no watchers for node \"%s\"\n",
	    xs->xs_sc->sc_dev.dv_xname, xsm->xsm_data);
	return (-1);
}

int
xs_cmd(struct xs_transaction *xst, int cmd, const char *path,
    struct iovec **iov, int *iov_cnt)
{
	struct xs_softc *xs = xst->xst_cookie;
	struct xs_msg *xsm;
	struct iovec ov[10];	/* output vector */
	int datalen = XS_ERR_PAYLOAD;
	int ov_cnt = 0;
	enum { READ, WRITE } mode = READ;
	int i, error = 0;

	if (cmd >= XS_MAX)
		return (EINVAL);

	switch (cmd) {
	case XS_TOPEN:
		ov[0].iov_base = "";
		ov[0].iov_len = 1;
		ov_cnt++;
		break;
	case XS_TCLOSE:
	case XS_RM:
	case XS_WATCH:
	case XS_WRITE:
		mode = WRITE;
		/* FALLTHROUGH */
	default:
		if (mode == READ)
			datalen = XS_MAX_PAYLOAD;
		break;
	}

	if (path) {
		ov[ov_cnt].iov_base = (void *)path;
		ov[ov_cnt++].iov_len = strlen(path) + 1; /* +NUL */
	}

	if (mode == WRITE && iov && iov_cnt && *iov_cnt > 0) {
		for (i = 0; i < *iov_cnt && ov_cnt < nitems(ov);
		     i++, ov_cnt++) {
			ov[ov_cnt].iov_base = (*iov)[i].iov_base;
			ov[ov_cnt].iov_len = (*iov)[i].iov_len;
		}
	}

	xsm = xs_get_msg(xs, !cold);

	if (xs_get_buf(xst, xsm, datalen)) {
		xs_put_msg(xs, xsm);
		return (ENOMEM);
	}

	xsm->xsm_hdr.xmh_tid = xst->xst_id;
	xsm->xsm_hdr.xmh_type = cmd;
	xsm->xsm_hdr.xmh_rid = atomic_inc_int_nv(&xs->xs_rid);

	for (i = 0; i < ov_cnt; i++)
		xsm->xsm_hdr.xmh_len += ov[i].iov_len;

	if (xsm->xsm_hdr.xmh_len > XS_MAX_PAYLOAD) {
		printf("%s: message type %d with payload above the limit\n",
		    xs->xs_sc->sc_dev.dv_xname, cmd);
		xs_put_buf(xst, xsm);
		xs_put_msg(xs, xsm);
		return (EIO);
	}

	if (xs_start(xst, xsm, ov, ov_cnt)) {
		printf("%s: message type %d transmission failed\n",
		    xs->xs_sc->sc_dev.dv_xname, cmd);
		xs_put_buf(xst, xsm);
		xs_put_msg(xs, xsm);
		return (EIO);
	}

	xsm = xs_reply(xst, xsm->xsm_hdr.xmh_rid);

	if (xsm->xsm_hdr.xmh_type == XS_ERROR) {
		error = xs_geterror(xsm);
		DPRINTF("%s: xenstore request %d \"%s\" error %s\n",
		    xs->xs_sc->sc_dev.dv_xname, cmd, path, xsm->xsm_data);
	} else if (xsm->xsm_error != 0)
		error = xsm->xsm_error;
	else if (mode == READ) {
		KASSERT(iov && iov_cnt);
		error = xs_parse(xst, xsm, iov, iov_cnt);
	}
#ifdef XS_DEBUG
	else
		if (strcmp(xsm->xsm_data, "OK"))
			printf("%s: xenstore request %d failed: %s\n",
			    xs->xs_sc->sc_dev.dv_xname, cmd, xsm->xsm_data);
#endif

	xs_put_buf(xst, xsm);
	xs_put_msg(xs, xsm);

	return (error);
}

int
xs_watch(void *xsc, const char *path, const char *property, struct task *task,
    void (*cb)(void *), void *arg)
{
	struct xen_softc *sc = xsc;
	struct xs_softc *xs = sc->sc_xs;
	struct xs_transaction xst;
	struct xs_watch *xsw;
	struct iovec iov, *iovp = &iov;
	char key[256];
	int error, iov_cnt, ret;

	memset(&xst, 0, sizeof(xst));
	xst.xst_id = 0;
	xst.xst_cookie = sc->sc_xs;

	xsw = malloc(sizeof(*xsw), M_DEVBUF, M_NOWAIT | M_ZERO);
	if (xsw == NULL)
		return (-1);

	task_set(task, cb, arg);
	xsw->xsw_task = task;

	snprintf(xsw->xsw_token, sizeof(xsw->xsw_token), "%0lx",
	    (unsigned long)xsw);

	if (path)
		ret = snprintf(key, sizeof(key), "%s/%s", path, property);
	else
		ret = snprintf(key, sizeof(key), "%s", property);
	if (ret == -1 || ret >= sizeof(key)) {
		free(xsw, M_DEVBUF, sizeof(*xsw));
		return (EINVAL);
	}

	iov.iov_base = xsw->xsw_token;
	iov.iov_len = sizeof(xsw->xsw_token);
	iov_cnt = 1;

	/*
	 * xs_watches must be prepared pre-emptively because a xenstore
	 * event is raised immediately after a watch is established.
	 */
	mtx_enter(&xs->xs_watchlck);
	TAILQ_INSERT_TAIL(&xs->xs_watches, xsw, xsw_entry);
	mtx_leave(&xs->xs_watchlck);

	if ((error = xs_cmd(&xst, XS_WATCH, key, &iovp, &iov_cnt)) != 0) {
		mtx_enter(&xs->xs_watchlck);
		TAILQ_REMOVE(&xs->xs_watches, xsw, xsw_entry);
		mtx_leave(&xs->xs_watchlck);
		free(xsw, M_DEVBUF, sizeof(*xsw));
		return (error);
	}

	return (0);
}

static unsigned long long
atoull(const char *cp, int *error)
{
	unsigned long long res, cutoff;
	int ch;
	int cutlim;

	res = 0;
	cutoff = ULLONG_MAX / (unsigned long long)10;
	cutlim = ULLONG_MAX % (unsigned long long)10;

	do {
		if (*cp < '0' || *cp > '9') {
			*error = EINVAL;
			return (res);
		}
		ch = *cp - '0';
		if (res > cutoff || (res == cutoff && ch > cutlim)) {
			*error = ERANGE;
			return (res);
		}
		res *= 10;
		res += ch;
	} while (*(++cp) != '\0');

	*error = 0;
	return (res);
}

int
xs_getnum(void *xsc, const char *path, const char *property,
    unsigned long long *val)
{
	char *buf;
	int error = 0;

	buf = malloc(XS_MAX_PAYLOAD, M_DEVBUF, M_ZERO |
	    (cold ? M_NOWAIT : M_WAITOK));
	if (buf == NULL)
		return (ENOMEM);

	error = xs_getprop(xsc, path, property, buf, XS_MAX_PAYLOAD);
	if (error)
		goto out;

	*val = atoull(buf, &error);

 out:
	free(buf, M_DEVBUF, XS_MAX_PAYLOAD);
	return (error);
}

int
xs_setnum(void *xsc, const char *path, const char *property,
    unsigned long long val)
{
	char buf[32];
	int ret;

	ret = snprintf(buf, sizeof(buf), "%llu", val);
	if (ret == -1 || ret >= sizeof(buf))
		return (ERANGE);

	return (xs_setprop(xsc, path, property, buf, strlen(buf)));
}

int
xs_getprop(void *xsc, const char *path, const char *property, char *value,
    int size)
{
	struct xen_softc *sc = xsc;
	struct xs_transaction xst;
	struct iovec *iovp = NULL;
	char key[256];
	int error, ret, iov_cnt = 0;

	if (!property)
		return (EINVAL);

	memset(&xst, 0, sizeof(xst));
	xst.xst_id = 0;
	xst.xst_cookie = sc->sc_xs;

	if (path)
		ret = snprintf(key, sizeof(key), "%s/%s", path, property);
	else
		ret = snprintf(key, sizeof(key), "%s", property);
	if (ret == -1 || ret >= sizeof(key))
		return (EINVAL);

	if ((error = xs_cmd(&xst, XS_READ, key, &iovp, &iov_cnt)) != 0)
		return (error);

	if (iov_cnt > 0)
		strlcpy(value, (char *)iovp->iov_base, size);

	xs_resfree(&xst, iovp, iov_cnt);

	return (0);
}

int
xs_setprop(void *xsc, const char *path, const char *property, char *value,
    int size)
{
	struct xen_softc *sc = xsc;
	struct xs_transaction xst;
	struct iovec iov, *iovp = &iov;
	char key[256];
	int error, ret, iov_cnt = 0;

	if (!property)
		return (EINVAL);

	memset(&xst, 0, sizeof(xst));
	xst.xst_id = 0;
	xst.xst_cookie = sc->sc_xs;

	if (path)
		ret = snprintf(key, sizeof(key), "%s/%s", path, property);
	else
		ret = snprintf(key, sizeof(key), "%s", property);
	if (ret == -1 || ret >= sizeof(key))
		return (EINVAL);

	iov.iov_base = value;
	iov.iov_len = size;
	iov_cnt = 1;

	error = xs_cmd(&xst, XS_WRITE, key, &iovp, &iov_cnt);

	return (error);
}

int
xs_cmpprop(void *xsc, const char *path, const char *property, const char *value,
    int *result)
{
	struct xen_softc *sc = xsc;
	struct xs_transaction xst;
	struct iovec *iovp = NULL;
	char key[256];
	int error, ret, iov_cnt = 0;

	if (!property)
		return (EINVAL);

	memset(&xst, 0, sizeof(xst));
	xst.xst_id = 0;
	xst.xst_cookie = sc->sc_xs;

	if (path)
		ret = snprintf(key, sizeof(key), "%s/%s", path, property);
	else
		ret = snprintf(key, sizeof(key), "%s", property);
	if (ret == -1 || ret >= sizeof(key))
		return (EINVAL);

	if ((error = xs_cmd(&xst, XS_READ, key, &iovp, &iov_cnt)) != 0)
		return (error);

	*result = strcmp(value, (char *)iovp->iov_base);

	xs_resfree(&xst, iovp, iov_cnt);

	return (0);
}

int
xs_await_transition(void *xsc, const char *path, const char *property,
    const char *value, int timo)
{
	struct xen_softc *sc = xsc;
	int error, res;

	do {
		error = xs_cmpprop(xsc, path, property, value, &res);
		if (error)
			return (error);
		if (timo && --timo == 0)
			return (ETIMEDOUT);
		xs_poll(sc->sc_xs, cold);
	} while (res != 0);

	return (0);
}

int
xs_kvop(void *xsc, int op, char *key, char *value, size_t valuelen)
{
	struct xen_softc *sc = xsc;
	struct xs_transaction xst;
	struct iovec iov, *iovp = &iov;
	int error = 0, iov_cnt = 0, cmd, i;

	switch (op) {
	case PVBUS_KVWRITE:
		cmd = XS_WRITE;
		iov.iov_base = value;
		iov.iov_len = strlen(value);
		iov_cnt = 1;
		break;
	case PVBUS_KVREAD:
		cmd = XS_READ;
		break;
	case PVBUS_KVLS:
		cmd = XS_LIST;
		break;
	default:
		return (EOPNOTSUPP);
	}

	memset(&xst, 0, sizeof(xst));
	xst.xst_id = 0;
	xst.xst_cookie = sc->sc_xs;

	if ((error = xs_cmd(&xst, cmd, key, &iovp, &iov_cnt)) != 0)
		return (error);

	memset(value, 0, valuelen);

	switch (cmd) {
	case XS_READ:
		if (iov_cnt == 1 && iovp[0].iov_len == 1) {
			xs_resfree(&xst, iovp, iov_cnt);

			/*
			 * We cannot distinguish whether the returned value
			 * is a directory or a file in the xenstore.  The
			 * only indication is that reading a directory
			 * returns an empty string (a single NUL byte),
			 * so try to get the directory listing in this case.
			 */
			return (xs_kvop(xsc, PVBUS_KVLS, key, value, valuelen));
		}
		/* FALLTHROUGH */
	case XS_LIST:
		for (i = 0; i < iov_cnt; i++) {
			if (i > 0 && strlcat(value, "\n", valuelen) >=
			    valuelen) {
				error = ERANGE;
				break;
			}
			if (strlcat(value, iovp[i].iov_base,
			    valuelen) >= valuelen) {
				error = ERANGE;
				break;
			}
		}
		xs_resfree(&xst, iovp, iov_cnt);
		break;
	default:
		break;
	}

	return (error);
}