xenevt.c revision 1.38
/*      $NetBSD: xenevt.c,v 1.38 2011/08/11 17:59:00 cherry Exp $      */

/*
 * Copyright (c) 2005 Manuel Bouyer.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: xenevt.c,v 1.38 2011/08/11 17:59:00 cherry Exp $");

#include "opt_xen.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/systm.h>
#include <sys/device.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/poll.h>
#include <sys/select.h>
#include <sys/proc.h>
#include <sys/conf.h>
#include <sys/intr.h>
#include <sys/kmem.h>

#include <uvm/uvm_extern.h>

#include <xen/hypervisor.h>
#include <xen/xenpmap.h>
#include <xen/xenio.h>
#include <xen/xenio3.h>
#include <xen/xen.h>

/*
 * Interface between the event channel and userland.
 * Each process with a xenevt device instance open can register the events it
 * wants to receive. It gets pending events by read(), possibly blocking
 * until an event is available. Pending events are acknowledged (and their
 * ports unmasked again) by write()ing the port numbers back to the device.
 * Special operations, such as binding event channels, are done through
 * ioctl().
 * Processes get a device instance by opening a cloning device.
 */
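
/*
 * Illustrative sketch (not part of this driver): a minimal userland
 * client of the interface described above.  The device path, header
 * paths and the remote domain/port values are assumptions for the
 * example only; error handling is omitted.
 *
 *	#include <sys/ioctl.h>
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *	#include <xen/xenio.h>
 *
 *	int fd = open("/dev/xenevt", O_RDWR);	// cloning device, minor DEV_EVT
 *	struct ioctl_evtchn_bind_interdomain bind = {
 *		.remote_domain = 1,		// hypothetical remote domain
 *		.remote_port = 7,		// hypothetical remote port
 *	};
 *	ioctl(fd, IOCTL_EVTCHN_BIND_INTERDOMAIN, &bind);
 *	evtchn_port_t port;
 *	read(fd, &port, sizeof(port));		// blocks until an event fires
 *	uint16_t ack = port;
 *	write(fd, &ack, sizeof(ack));		// acknowledge: unmasks the port
 */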

void		xenevtattach(int);
static int	xenevt_fread(struct file *, off_t *, struct uio *,
    kauth_cred_t, int);
static int	xenevt_fwrite(struct file *, off_t *, struct uio *,
    kauth_cred_t, int);
static int	xenevt_fioctl(struct file *, u_long, void *);
static int	xenevt_fpoll(struct file *, int);
static int	xenevt_fclose(struct file *);
/* static int	xenevt_fkqfilter(struct file *, struct knote *); */

static const struct fileops xenevt_fileops = {
	.fo_read = xenevt_fread,
	.fo_write = xenevt_fwrite,
	.fo_ioctl = xenevt_fioctl,
	.fo_fcntl = fnullop_fcntl,
	.fo_poll = xenevt_fpoll,
	.fo_stat = fbadop_stat,
	.fo_close = xenevt_fclose,
	.fo_kqfilter = /* xenevt_fkqfilter */ fnullop_kqfilter,
	.fo_restart = fnullop_restart,
};

dev_type_open(xenevtopen);
dev_type_read(xenevtread);
dev_type_mmap(xenevtmmap);
const struct cdevsw xenevt_cdevsw = {
	xenevtopen, nullclose, xenevtread, nowrite, noioctl,
	nostop, notty, nopoll, xenevtmmap, nokqfilter, D_OTHER
};

/* minor numbers */
#define DEV_EVT 0
#define DEV_XSD 1

/* per-instance data */
#define XENEVT_RING_SIZE 2048
#define XENEVT_RING_MASK 2047

#define BYTES_PER_PORT (sizeof(evtchn_port_t) / sizeof(uint8_t))

struct xenevt_d {
	kmutex_t lock;
	kcondvar_t cv;
	STAILQ_ENTRY(xenevt_d) pendingq;
	bool pending;
	evtchn_port_t ring[XENEVT_RING_SIZE];
	u_int ring_read; /* read pointer */
	u_int ring_write; /* write pointer */
	u_int flags;
#define XENEVT_F_OVERFLOW 0x01 /* ring overflow */
	struct selinfo sel; /* used by poll */
};

/* event -> user device mapping */
static struct xenevt_d *devevent[NR_EVENT_CHANNELS];

/* pending events */
static void *devevent_sih;
static kmutex_t devevent_lock;
static STAILQ_HEAD(, xenevt_d) devevent_pending;

static void xenevt_donotify(struct xenevt_d *);
static void xenevt_record(struct xenevt_d *, evtchn_port_t);

/* pending events */
long xenevt_ev1;
long xenevt_ev2[NR_EVENT_CHANNELS];
static int xenevt_processevt(void *);

/* called at boot time */
void
xenevtattach(int n)
{
	struct intrhand *ih;
	int s;
	int level = IPL_HIGH;
#ifdef MULTIPROCESSOR
	bool mpsafe = (level != IPL_VM);
#endif /* MULTIPROCESSOR */

	mutex_init(&devevent_lock, MUTEX_DEFAULT, IPL_HIGH);
	STAILQ_INIT(&devevent_pending);

	devevent_sih = softint_establish(SOFTINT_SERIAL,
	    (void (*)(void *))xenevt_notify, NULL);
	memset(devevent, 0, sizeof(devevent));
	xenevt_ev1 = 0;
	memset(xenevt_ev2, 0, sizeof(xenevt_ev2));

	/* register a handler at splhigh, so that spllower() will call us */
	ih = malloc(sizeof (struct intrhand), M_DEVBUF,
	     M_WAITOK|M_ZERO);
	if (ih == NULL)
		panic("can't allocate xenevt interrupt source");
	ih->ih_level = level;
	ih->ih_fun = ih->ih_realfun = xenevt_processevt;
	ih->ih_arg = ih->ih_realarg = NULL;
	ih->ih_ipl_next = NULL;
	ih->ih_cpu = curcpu();
#ifdef MULTIPROCESSOR
	if (!mpsafe) {
		ih->ih_fun = intr_biglock_wrapper;
		ih->ih_arg = ih;
	}
#endif /* MULTIPROCESSOR */

	s = splhigh();
	event_set_iplhandler(ih->ih_cpu, ih, level);
	splx(s);
}

/* register a pending event - always called with interrupts disabled */
void
xenevt_setipending(int l1, int l2)
{
	xenevt_ev1 |= 1UL << l1;
	xenevt_ev2[l1] |= 1UL << l2;
	curcpu()/*XXX*/->ci_ipending |= 1 << IPL_HIGH;
}
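
/*
 * The pending-event state above is a two-level bitmap: bit l1 of
 * xenevt_ev1 says that some bit is set in xenevt_ev2[l1], and bit l2 of
 * xenevt_ev2[l1] encodes port = (l1 << LONG_SHIFT) + l2.  For example,
 * on a 64-bit port (LONG_SHIFT == 6), event channel 130 is recorded as
 * l1 = 2, l2 = 2: bit 2 of xenevt_ev1 and bit 2 of xenevt_ev2[2].
 * xenevt_processevt() below walks this structure in the reverse
 * direction.
 */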

/* process pending events */
static int
xenevt_processevt(void *v)
{
	long l1, l2;
	int l1i, l2i;
	int port;

	l1 = xen_atomic_xchg(&xenevt_ev1, 0);
	while ((l1i = xen_ffs(l1)) != 0) {
		l1i--;
		l1 &= ~(1UL << l1i);
		l2 = xen_atomic_xchg(&xenevt_ev2[l1i], 0);
		while ((l2i = xen_ffs(l2)) != 0) {
			l2i--;
			l2 &= ~(1UL << l2i);
			port = (l1i << LONG_SHIFT) + l2i;
			xenevt_event(port);
		}
	}

	return 0;
}


/* event callback, called at splhigh() */
void
xenevt_event(int port)
{
	struct xenevt_d *d;

	d = devevent[port];
	if (d != NULL) {
		xenevt_record(d, port);

		if (d->pending) {
			return;
		}

		mutex_enter(&devevent_lock);
		STAILQ_INSERT_TAIL(&devevent_pending, d, pendingq);
		d->pending = true;
		mutex_exit(&devevent_lock);

		softint_schedule(devevent_sih);
	}
}

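/*
 * Soft interrupt handler (established in xenevtattach()): drain the
 * devevent_pending queue and wake up readers/pollers on each instance
 * that had events recorded.
 */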
void
xenevt_notify(void)
{
	struct xenevt_d *d;

	for (;;) {
		mutex_enter(&devevent_lock);
		d = STAILQ_FIRST(&devevent_pending);
		if (d == NULL) {
			mutex_exit(&devevent_lock);
			break;
		}
		STAILQ_REMOVE_HEAD(&devevent_pending, pendingq);
		d->pending = false;
		mutex_exit(&devevent_lock);

		xenevt_donotify(d);
	}
}

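/* wake up a single instance: notify poll()/select() waiters and readers */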
static void
xenevt_donotify(struct xenevt_d *d)
{

	mutex_enter(&d->lock);
	selnotify(&d->sel, 0, 1);
	cv_broadcast(&d->cv);
	mutex_exit(&d->lock);
}

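/* append a port to the instance's ring buffer; called at splhigh() */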
static void
xenevt_record(struct xenevt_d *d, evtchn_port_t port)
{

	/*
	 * This algorithm wastes one slot: when the write pointer is one
	 * slot behind the read pointer the ring is declared full, because
	 * letting the pointers become equal would be indistinguishable
	 * from an empty ring.  Not really an issue, and the correct
	 * algorithm would be more complex.
	 */

	if (d->ring_read ==
	    ((d->ring_write + 1) & XENEVT_RING_MASK)) {
		d->flags |= XENEVT_F_OVERFLOW;
		printf("xenevt_event: ring overflow port %d\n", port);
	} else {
		d->ring[d->ring_write] = port;
		d->ring_write = (d->ring_write + 1) & XENEVT_RING_MASK;
	}
}

/* open the xenevt device; this is where we clone */
int
xenevtopen(dev_t dev, int flags, int mode, struct lwp *l)
{
	struct xenevt_d *d;
	struct file *fp;
	int fd, error;

	switch(minor(dev)) {
	case DEV_EVT:
		/* fd_allocfile() provides the descriptor for us. */
		if ((error = fd_allocfile(&fp, &fd)) != 0)
			return error;

		d = malloc(sizeof(*d), M_DEVBUF, M_WAITOK | M_ZERO);
		mutex_init(&d->lock, MUTEX_DEFAULT, IPL_SOFTSERIAL);
		cv_init(&d->cv, "xenevt");
		selinit(&d->sel);
		return fd_clone(fp, fd, flags, &xenevt_fileops, d);
	case DEV_XSD:
		/* no clone for /dev/xsd_kva */
		return (0);
	default:
		break;
	}
	return ENODEV;
}

/* read from the device: only for /dev/xsd_kva; xenevt reads go through xenevt_fread() */
int
xenevtread(dev_t dev, struct uio *uio, int flags)
{
#define LD_STRLEN 21 /* a 64bit integer needs 20 digits in base10 */
	if (minor(dev) == DEV_XSD) {
		char strbuf[LD_STRLEN], *bf;
		int off, error;
		size_t len;

		off = (int)uio->uio_offset;
		if (off < 0)
			return EINVAL;
		len  = snprintf(strbuf, sizeof(strbuf), "%ld\n",
		    xen_start_info.store_mfn);
		if (off >= len) {
			bf = strbuf;
			len = 0;
		} else {
			bf = &strbuf[off];
			len -= off;
		}
		error = uiomove(bf, len, uio);
		return error;
	}
	return ENODEV;
}

/* mmap: only for xsd_kva */
paddr_t
xenevtmmap(dev_t dev, off_t off, int prot)
{
	if (minor(dev) == DEV_XSD) {
		/* only one page, so off is always 0 */
		if (off != 0)
			return -1;
		return x86_btop(
		   xpmap_mtop((paddr_t)xen_start_info.store_mfn << PAGE_SHIFT));
	}
	return -1;
}

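/*
 * close(): mask and close every event channel still bound to this
 * instance, then tear the instance down.
 */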
static int
xenevt_fclose(struct file *fp)
{
	struct xenevt_d *d = fp->f_data;
	int i;

	for (i = 0; i < NR_EVENT_CHANNELS; i++) {
		if (devevent[i] == d) {
			evtchn_op_t op = { .cmd = 0 };
			int error;

			hypervisor_mask_event(i);
			devevent[i] = NULL;

			op.cmd = EVTCHNOP_close;
			op.u.close.port = i;
			if ((error = HYPERVISOR_event_channel_op(&op))) {
				printf("xenevt_fclose: error %d from "
				    "hypervisor\n", -error);
			}
		}
	}
	seldestroy(&d->sel);
	cv_destroy(&d->cv);
	mutex_destroy(&d->lock);
	fp->f_data = NULL;
	free(d, M_DEVBUF);

	return (0);
}

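/*
 * read(): copy recorded event-channel ports out of the per-instance
 * ring.  The ring may wrap, so up to two contiguous chunks are copied.
 * Only d->ring_read is updated here; d->ring_write is only ever
 * advanced by xenevt_record().
 */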
static int
xenevt_fread(struct file *fp, off_t *offp, struct uio *uio,
    kauth_cred_t cred, int flags)
{
	struct xenevt_d *d = fp->f_data;
	int error, ring_read, ring_write;
	size_t len, uio_len;

	error = 0;
	mutex_enter(&d->lock);
	while (error == 0) {
		ring_read = d->ring_read;
		ring_write = d->ring_write;
		if (ring_read != ring_write) {
			break;
		}
		if (d->flags & XENEVT_F_OVERFLOW) {
			break;
		}

		/* nothing to read */
		if ((fp->f_flag & FNONBLOCK) == 0) {
			error = cv_wait_sig(&d->cv, &d->lock);
		} else {
			error = EAGAIN;
		}
	}
	if (error == 0 && (d->flags & XENEVT_F_OVERFLOW)) {
		error = EFBIG;
	}
	mutex_exit(&d->lock);

	if (error) {
		return error;
	}

	uio_len = uio->uio_resid / BYTES_PER_PORT;
	if (ring_read <= ring_write)
		len = ring_write - ring_read;
	else
		len = XENEVT_RING_SIZE - ring_read;
	if (len > uio_len)
		len = uio_len;
	error = uiomove(&d->ring[ring_read], len * BYTES_PER_PORT, uio);
	if (error)
		return error;
	ring_read = (ring_read + len) & XENEVT_RING_MASK;
	uio_len = uio->uio_resid / BYTES_PER_PORT;
	if (uio_len == 0)
		goto done;
	/* ring wrapped, read the second part */
	len = ring_write - ring_read;
	if (len > uio_len)
		len = uio_len;
	error = uiomove(&d->ring[ring_read], len * BYTES_PER_PORT, uio);
	if (error)
		return error;
	ring_read = (ring_read + len) & XENEVT_RING_MASK;

done:
	mutex_enter(&d->lock);
	d->ring_read = ring_read;
	mutex_exit(&d->lock);

	return 0;
}

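/*
 * write(): the buffer is an array of uint16_t event-channel ports;
 * each port that is bound to this instance gets unmasked (acknowledged)
 * again.
 */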
static int
xenevt_fwrite(struct file *fp, off_t *offp, struct uio *uio,
    kauth_cred_t cred, int flags)
{
	struct xenevt_d *d = fp->f_data;
	uint16_t *chans;
	int i, nentries, error;

	if (uio->uio_resid == 0)
		return (0);
	nentries = uio->uio_resid / sizeof(uint16_t);
	if (nentries > NR_EVENT_CHANNELS)
		return EMSGSIZE;
	chans = kmem_alloc(nentries * sizeof(uint16_t), KM_SLEEP);
	if (chans == NULL)
		return ENOMEM;
	error = uiomove(chans, uio->uio_resid, uio);
	if (error)
		goto out;
	for (i = 0; i < nentries; i++) {
		if (chans[i] < NR_EVENT_CHANNELS &&
		    devevent[chans[i]] == d) {
			hypervisor_unmask_event(chans[i]);
		}
	}
out:
	kmem_free(chans, nentries * sizeof(uint16_t));
	return error;
}

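/*
 * ioctl(): bind this instance to a VIRQ, an interdomain event channel
 * or a freshly allocated unbound port, unbind or notify a port it owns,
 * or reset the ring.
 */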
static int
xenevt_fioctl(struct file *fp, u_long cmd, void *addr)
{
	struct xenevt_d *d = fp->f_data;
	evtchn_op_t op = { .cmd = 0 };
	int error;

	switch(cmd) {
	case EVTCHN_RESET:
	case IOCTL_EVTCHN_RESET:
		d->ring_read = d->ring_write = 0;
		d->flags = 0;
		break;
	case IOCTL_EVTCHN_BIND_VIRQ:
	{
		struct ioctl_evtchn_bind_virq *bind_virq = addr;
		op.cmd = EVTCHNOP_bind_virq;
		op.u.bind_virq.virq = bind_virq->virq;
		op.u.bind_virq.vcpu = 0;
		if ((error = HYPERVISOR_event_channel_op(&op))) {
			printf("IOCTL_EVTCHN_BIND_VIRQ failed: "
			    "virq %d error %d\n", bind_virq->virq, error);
			return -error;
		}
		bind_virq->port = op.u.bind_virq.port;
		devevent[bind_virq->port] = d;
		hypervisor_unmask_event(bind_virq->port);
		break;
	}
	case IOCTL_EVTCHN_BIND_INTERDOMAIN:
	{
		struct ioctl_evtchn_bind_interdomain *bind_intd = addr;
		op.cmd = EVTCHNOP_bind_interdomain;
		op.u.bind_interdomain.remote_dom = bind_intd->remote_domain;
		op.u.bind_interdomain.remote_port = bind_intd->remote_port;
		if ((error = HYPERVISOR_event_channel_op(&op)))
			return -error;
		bind_intd->port = op.u.bind_interdomain.local_port;
		devevent[bind_intd->port] = d;
		hypervisor_unmask_event(bind_intd->port);
		break;
	}
	case IOCTL_EVTCHN_BIND_UNBOUND_PORT:
	{
		struct ioctl_evtchn_bind_unbound_port *bind_unbound = addr;
		op.cmd = EVTCHNOP_alloc_unbound;
		op.u.alloc_unbound.dom = DOMID_SELF;
		op.u.alloc_unbound.remote_dom = bind_unbound->remote_domain;
		if ((error = HYPERVISOR_event_channel_op(&op)))
			return -error;
		bind_unbound->port = op.u.alloc_unbound.port;
		devevent[bind_unbound->port] = d;
		hypervisor_unmask_event(bind_unbound->port);
		break;
	}
	case IOCTL_EVTCHN_UNBIND:
	{
		struct ioctl_evtchn_unbind *unbind = addr;

		if (unbind->port >= NR_EVENT_CHANNELS)
			return EINVAL;
		if (devevent[unbind->port] != d)
			return ENOTCONN;
		devevent[unbind->port] = NULL;
		hypervisor_mask_event(unbind->port);
		op.cmd = EVTCHNOP_close;
		op.u.close.port = unbind->port;
		if ((error = HYPERVISOR_event_channel_op(&op)))
			return -error;
		break;
	}
	case IOCTL_EVTCHN_NOTIFY:
	{
		struct ioctl_evtchn_notify *notify = addr;

		if (notify->port >= NR_EVENT_CHANNELS)
			return EINVAL;
		if (devevent[notify->port] != d)
			return ENOTCONN;
		hypervisor_notify_via_evtchn(notify->port);
		break;
	}
	case FIONBIO:
		break;
	default:
		return EINVAL;
	}
	return 0;
}

/*
 * Support for poll() system call
 *
 * Return true if the specific operation will not block indefinitely.
 */

static int
xenevt_fpoll(struct file *fp, int events)
{
	struct xenevt_d *d = fp->f_data;
	int revents = events & (POLLOUT | POLLWRNORM); /* we can always write */

	mutex_enter(&d->lock);
	if (events & (POLLIN | POLLRDNORM)) {
		if (d->ring_read != d->ring_write) {
			revents |= events & (POLLIN | POLLRDNORM);
		} else {
			/* Record that someone is waiting */
			selrecord(curlwp, &d->sel);
		}
	}
	mutex_exit(&d->lock);
	return (revents);
}