/* xenevt.c revision 1.38 */
1/* $NetBSD: xenevt.c,v 1.38 2011/08/11 17:59:00 cherry Exp $ */ 2 3/* 4 * Copyright (c) 2005 Manuel Bouyer. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 17 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 18 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 19 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 20 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 21 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 22 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 24 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25 * 26 */ 27 28#include <sys/cdefs.h> 29__KERNEL_RCSID(0, "$NetBSD: xenevt.c,v 1.38 2011/08/11 17:59:00 cherry Exp $"); 30 31#include "opt_xen.h" 32#include <sys/param.h> 33#include <sys/kernel.h> 34#include <sys/malloc.h> 35#include <sys/mutex.h> 36#include <sys/systm.h> 37#include <sys/device.h> 38#include <sys/file.h> 39#include <sys/filedesc.h> 40#include <sys/poll.h> 41#include <sys/select.h> 42#include <sys/proc.h> 43#include <sys/conf.h> 44#include <sys/intr.h> 45#include <sys/kmem.h> 46 47#include <uvm/uvm_extern.h> 48 49#include <xen/hypervisor.h> 50#include <xen/xenpmap.h> 51#include <xen/xenio.h> 52#include <xen/xenio3.h> 53#include <xen/xen.h> 54 55/* 56 * Interface between the event channel and userland. 57 * Each process with a xenevt device instance open can regiter events it 58 * wants to receive. It will get pending events by read(), eventually blocking 59 * until some event is available. Pending events are ack'd by a bitmask 60 * write()en to the device. Some special operations (such as events binding) 61 * are done though ioctl(). 62 * Processes get a device instance by opening a cloning device. 
 */

void xenevtattach(int);
static int xenevt_fread(struct file *, off_t *, struct uio *,
    kauth_cred_t, int);
static int xenevt_fwrite(struct file *, off_t *, struct uio *,
    kauth_cred_t, int);
static int xenevt_fioctl(struct file *, u_long, void *);
static int xenevt_fpoll(struct file *, int);
static int xenevt_fclose(struct file *);
/* static int xenevt_fkqfilter(struct file *, struct knote *); */

/*
 * File operations installed on every cloned /dev/xenevt instance
 * by xenevtopen()/fd_clone().  kqueue support is stubbed out.
 */
static const struct fileops xenevt_fileops = {
	.fo_read = xenevt_fread,
	.fo_write = xenevt_fwrite,
	.fo_ioctl = xenevt_fioctl,
	.fo_fcntl = fnullop_fcntl,
	.fo_poll = xenevt_fpoll,
	.fo_stat = fbadop_stat,
	.fo_close = xenevt_fclose,
	.fo_kqfilter = /* xenevt_fkqfilter */ fnullop_kqfilter,
	.fo_restart = fnullop_restart,
};

dev_type_open(xenevtopen);
dev_type_read(xenevtread);
dev_type_mmap(xenevtmmap);
/*
 * Character device switch: only open, read and mmap are real;
 * everything else goes through the open file (xenevt_fileops above).
 */
const struct cdevsw xenevt_cdevsw = {
	xenevtopen, nullclose, xenevtread, nowrite, noioctl,
	nostop, notty, nopoll, xenevtmmap, nokqfilter, D_OTHER
};

/* minor numbers */
#define DEV_EVT 0	/* cloning event-channel device (/dev/xenevt) */
#define DEV_XSD 1	/* xenstore page device (/dev/xsd_kva), no clone */

/* per-instance datas */
#define XENEVT_RING_SIZE 2048	/* must stay a power of two */
#define XENEVT_RING_MASK 2047	/* XENEVT_RING_SIZE - 1, used to wrap indexes */

/* size in bytes of one event-channel port entry in the ring */
#define BYTES_PER_PORT (sizeof(evtchn_port_t) / sizeof(uint8_t))

/*
 * Per-open-instance state.  Events delivered by the hypervisor are
 * recorded into ring[] by xenevt_record() and drained by xenevt_fread().
 */
struct xenevt_d {
	kmutex_t lock;		/* protects cv/sel wakeups and ring_read update */
	kcondvar_t cv;		/* blocking readers sleep here */
	STAILQ_ENTRY(xenevt_d) pendingq;	/* link on devevent_pending */
	bool pending;		/* true while queued on devevent_pending */
	evtchn_port_t ring[2048];	/* XENEVT_RING_SIZE entries */
	u_int ring_read; /* pointer of the reader */
	u_int ring_write; /* pointer of the writer */
	u_int flags;
#define XENEVT_F_OVERFLOW 0x01 /* ring overflow */
	struct selinfo sel; /* used by poll */
};

/* event -> user device mapping */
static struct xenevt_d *devevent[NR_EVENT_CHANNELS];

/* instances with recorded events waiting to be notified (softint context) */
static void *devevent_sih;
static kmutex_t devevent_lock;
static STAILQ_HEAD(, xenevt_d) devevent_pending;

static void xenevt_donotify(struct xenevt_d *);
static void xenevt_record(struct xenevt_d *, evtchn_port_t);

/*
 * Two-level bitmap of pending events set at interrupt time by
 * xenevt_setipending() and drained by xenevt_processevt().
 */
long xenevt_ev1;
long xenevt_ev2[NR_EVENT_CHANNELS];
static int xenevt_processevt(void *);

/*
 * Attach routine, called once at boot.  Sets up the pending-event
 * machinery and registers xenevt_processevt() as an IPL_HIGH interrupt
 * handler so that spllower() will invoke it.
 */
void
xenevtattach(int n)
{
	struct intrhand *ih;
	int s;
	int level = IPL_HIGH;
#ifdef MULTIPROCESSOR
	/* level is IPL_HIGH here, so mpsafe is always true in practice */
	bool mpsafe = (level != IPL_VM);
#endif /* MULTIPROCESSOR */

	mutex_init(&devevent_lock, MUTEX_DEFAULT, IPL_HIGH);
	STAILQ_INIT(&devevent_pending);

	/* softint that wakes up readers/pollers of pending instances */
	devevent_sih = softint_establish(SOFTINT_SERIAL,
	    (void (*)(void *))xenevt_notify, NULL);
	memset(devevent, 0, sizeof(devevent));
	xenevt_ev1 = 0;
	memset(xenevt_ev2, 0, sizeof(xenevt_ev2));

	/* register a handler at splhigh, so that spllower() will call us */
	ih = malloc(sizeof (struct intrhand), M_DEVBUF,
	    M_WAITOK|M_ZERO);
	if (ih == NULL)
		panic("can't allocate xenevt interrupt source");
	ih->ih_level = level;
	ih->ih_fun = ih->ih_realfun = xenevt_processevt;
	ih->ih_arg = ih->ih_realarg = NULL;
	ih->ih_ipl_next = NULL;
	ih->ih_cpu = curcpu();
#ifdef MULTIPROCESSOR
	if (!mpsafe) {
		/* wrap the handler so it runs with the big lock held */
		ih->ih_fun = intr_biglock_wrapper;
		ih->ih_arg = ih;
	}
#endif /* MULTIPROCESSOR */

	s = splhigh();
	event_set_iplhandler(ih->ih_cpu, ih, level);
	splx(s);
}

/* register pending event - always called with interrupt disabled */
void
xenevt_setipending(int l1, int l2)
{
	/* port number is (l1 << LONG_SHIFT) + l2, see xenevt_processevt() */
	xenevt_ev1 |= 1UL << l1;
	xenevt_ev2[l1] |= 1UL << l2;
	/* mark IPL_HIGH work pending so spllower() runs xenevt_processevt */
	curcpu()/*XXX*/->ci_ipending |= 1 << IPL_HIGH;
}

/*
 * Drain the two-level pending bitmap and hand each set port to
 * xenevt_event().  The atomic exchanges let new events be posted
 * concurrently without being lost.
 */
static int
xenevt_processevt(void *v)
{
	long l1, l2;
	int l1i, l2i;
	int port;

	l1 = xen_atomic_xchg(&xenevt_ev1, 0);
	while ((l1i = xen_ffs(l1)) != 0) {
		l1i--;
		l1 &= ~(1UL << l1i);
		l2 = xen_atomic_xchg(&xenevt_ev2[l1i], 0);
		while ((l2i = xen_ffs(l2)) != 0) {
			l2i--;
			l2 &= ~(1UL << l2i);
			port = (l1i << LONG_SHIFT) + l2i;
			xenevt_event(port);
		}
	}

	return 0;
}
208 209 210/* event callback, called at splhigh() */ 211void 212xenevt_event(int port) 213{ 214 struct xenevt_d *d; 215 216 d = devevent[port]; 217 if (d != NULL) { 218 xenevt_record(d, port); 219 220 if (d->pending) { 221 return; 222 } 223 224 mutex_enter(&devevent_lock); 225 STAILQ_INSERT_TAIL(&devevent_pending, d, pendingq); 226 d->pending = true; 227 mutex_exit(&devevent_lock); 228 229 softint_schedule(devevent_sih); 230 } 231} 232 233void 234xenevt_notify(void) 235{ 236 struct xenevt_d *d; 237 238 for (;;) { 239 mutex_enter(&devevent_lock); 240 d = STAILQ_FIRST(&devevent_pending); 241 if (d == NULL) { 242 mutex_exit(&devevent_lock); 243 break; 244 } 245 STAILQ_REMOVE_HEAD(&devevent_pending, pendingq); 246 d->pending = false; 247 mutex_exit(&devevent_lock); 248 249 xenevt_donotify(d); 250 } 251} 252 253static void 254xenevt_donotify(struct xenevt_d *d) 255{ 256 257 mutex_enter(&d->lock); 258 selnotify(&d->sel, 0, 1); 259 cv_broadcast(&d->cv); 260 mutex_exit(&d->lock); 261} 262 263static void 264xenevt_record(struct xenevt_d *d, evtchn_port_t port) 265{ 266 267 /* 268 * This algorithm overflows for one less slot than available. 269 * Not really an issue, and the correct algorithm would be more 270 * complex 271 */ 272 273 if (d->ring_read == 274 ((d->ring_write + 1) & XENEVT_RING_MASK)) { 275 d->flags |= XENEVT_F_OVERFLOW; 276 printf("xenevt_event: ring overflow port %d\n", port); 277 } else { 278 d->ring[d->ring_write] = port; 279 d->ring_write = (d->ring_write + 1) & XENEVT_RING_MASK; 280 } 281} 282 283/* open the xenevt device; this is where we clone */ 284int 285xenevtopen(dev_t dev, int flags, int mode, struct lwp *l) 286{ 287 struct xenevt_d *d; 288 struct file *fp; 289 int fd, error; 290 291 switch(minor(dev)) { 292 case DEV_EVT: 293 /* falloc() will use the descriptor for us. 
*/ 294 if ((error = fd_allocfile(&fp, &fd)) != 0) 295 return error; 296 297 d = malloc(sizeof(*d), M_DEVBUF, M_WAITOK | M_ZERO); 298 mutex_init(&d->lock, MUTEX_DEFAULT, IPL_SOFTSERIAL); 299 cv_init(&d->cv, "xenevt"); 300 selinit(&d->sel); 301 return fd_clone(fp, fd, flags, &xenevt_fileops, d); 302 case DEV_XSD: 303 /* no clone for /dev/xsd_kva */ 304 return (0); 305 default: 306 break; 307 } 308 return ENODEV; 309} 310 311/* read from device: only for /dev/xsd_kva, xenevt is done though fread */ 312int 313xenevtread(dev_t dev, struct uio *uio, int flags) 314{ 315#define LD_STRLEN 21 /* a 64bit integer needs 20 digits in base10 */ 316 if (minor(dev) == DEV_XSD) { 317 char strbuf[LD_STRLEN], *bf; 318 int off, error; 319 size_t len; 320 321 off = (int)uio->uio_offset; 322 if (off < 0) 323 return EINVAL; 324 len = snprintf(strbuf, sizeof(strbuf), "%ld\n", 325 xen_start_info.store_mfn); 326 if (off >= len) { 327 bf = strbuf; 328 len = 0; 329 } else { 330 bf = &strbuf[off]; 331 len -= off; 332 } 333 error = uiomove(bf, len, uio); 334 return error; 335 } 336 return ENODEV; 337} 338 339/* mmap: only for xsd_kva */ 340paddr_t 341xenevtmmap(dev_t dev, off_t off, int prot) 342{ 343 if (minor(dev) == DEV_XSD) { 344 /* only one page, so off is always 0 */ 345 if (off != 0) 346 return -1; 347 return x86_btop( 348 xpmap_mtop((paddr_t)xen_start_info.store_mfn << PAGE_SHIFT)); 349 } 350 return -1; 351} 352 353static int 354xenevt_fclose(struct file *fp) 355{ 356 struct xenevt_d *d = fp->f_data; 357 int i; 358 359 for (i = 0; i < NR_EVENT_CHANNELS; i++ ) { 360 if (devevent[i] == d) { 361 evtchn_op_t op = { .cmd = 0 }; 362 int error; 363 364 hypervisor_mask_event(i); 365 devevent[i] = NULL; 366 367 op.cmd = EVTCHNOP_close; 368 op.u.close.port = i; 369 if ((error = HYPERVISOR_event_channel_op(&op))) { 370 printf("xenevt_fclose: error %d from " 371 "hypervisor\n", -error); 372 } 373 } 374 } 375 seldestroy(&d->sel); 376 cv_destroy(&d->cv); 377 mutex_destroy(&d->lock); 378 fp->f_data = 
NULL; 379 free(d, M_DEVBUF); 380 381 return (0); 382} 383 384static int 385xenevt_fread(struct file *fp, off_t *offp, struct uio *uio, 386 kauth_cred_t cred, int flags) 387{ 388 struct xenevt_d *d = fp->f_data; 389 int error, ring_read, ring_write; 390 size_t len, uio_len; 391 392 error = 0; 393 mutex_enter(&d->lock); 394 while (error == 0) { 395 ring_read = d->ring_read; 396 ring_write = d->ring_write; 397 if (ring_read != ring_write) { 398 break; 399 } 400 if (d->flags & XENEVT_F_OVERFLOW) { 401 break; 402 } 403 404 /* nothing to read */ 405 if ((fp->f_flag & FNONBLOCK) == 0) { 406 error = cv_wait_sig(&d->cv, &d->lock); 407 } else { 408 error = EAGAIN; 409 } 410 } 411 if (error == 0 && (d->flags & XENEVT_F_OVERFLOW)) { 412 error = EFBIG; 413 } 414 mutex_exit(&d->lock); 415 416 if (error) { 417 return error; 418 } 419 420 uio_len = uio->uio_resid / BYTES_PER_PORT; 421 if (ring_read <= ring_write) 422 len = ring_write - ring_read; 423 else 424 len = XENEVT_RING_SIZE - ring_read; 425 if (len > uio_len) 426 len = uio_len; 427 error = uiomove(&d->ring[ring_read], len * BYTES_PER_PORT, uio); 428 if (error) 429 return error; 430 ring_read = (ring_read + len) & XENEVT_RING_MASK; 431 uio_len = uio->uio_resid / BYTES_PER_PORT; 432 if (uio_len == 0) 433 goto done; 434 /* ring wrapped, read the second part */ 435 len = ring_write - ring_read; 436 if (len > uio_len) 437 len = uio_len; 438 error = uiomove(&d->ring[ring_read], len * BYTES_PER_PORT, uio); 439 if (error) 440 return error; 441 ring_read = (ring_read + len) & XENEVT_RING_MASK; 442 443done: 444 mutex_enter(&d->lock); 445 d->ring_read = ring_read; 446 mutex_exit(&d->lock); 447 448 return 0; 449} 450 451static int 452xenevt_fwrite(struct file *fp, off_t *offp, struct uio *uio, 453 kauth_cred_t cred, int flags) 454{ 455 struct xenevt_d *d = fp->f_data; 456 uint16_t *chans; 457 int i, nentries, error; 458 459 if (uio->uio_resid == 0) 460 return (0); 461 nentries = uio->uio_resid / sizeof(uint16_t); 462 if (nentries > 
NR_EVENT_CHANNELS) 463 return EMSGSIZE; 464 chans = kmem_alloc(nentries * sizeof(uint16_t), KM_SLEEP); 465 if (chans == NULL) 466 return ENOMEM; 467 error = uiomove(chans, uio->uio_resid, uio); 468 if (error) 469 goto out; 470 for (i = 0; i < nentries; i++) { 471 if (chans[i] < NR_EVENT_CHANNELS && 472 devevent[chans[i]] == d) { 473 hypervisor_unmask_event(chans[i]); 474 } 475 } 476out: 477 kmem_free(chans, nentries * sizeof(uint16_t)); 478 return 0; 479} 480 481static int 482xenevt_fioctl(struct file *fp, u_long cmd, void *addr) 483{ 484 struct xenevt_d *d = fp->f_data; 485 evtchn_op_t op = { .cmd = 0 }; 486 int error; 487 488 switch(cmd) { 489 case EVTCHN_RESET: 490 case IOCTL_EVTCHN_RESET: 491 d->ring_read = d->ring_write = 0; 492 d->flags = 0; 493 break; 494 case IOCTL_EVTCHN_BIND_VIRQ: 495 { 496 struct ioctl_evtchn_bind_virq *bind_virq = addr; 497 op.cmd = EVTCHNOP_bind_virq; 498 op.u.bind_virq.virq = bind_virq->virq; 499 op.u.bind_virq.vcpu = 0; 500 if ((error = HYPERVISOR_event_channel_op(&op))) { 501 printf("IOCTL_EVTCHN_BIND_VIRQ failed: virq %d error %d\n", bind_virq->virq, error); 502 return -error; 503 } 504 bind_virq->port = op.u.bind_virq.port; 505 devevent[bind_virq->port] = d; 506 hypervisor_unmask_event(bind_virq->port); 507 break; 508 } 509 case IOCTL_EVTCHN_BIND_INTERDOMAIN: 510 { 511 struct ioctl_evtchn_bind_interdomain *bind_intd = addr; 512 op.cmd = EVTCHNOP_bind_interdomain; 513 op.u.bind_interdomain.remote_dom = bind_intd->remote_domain; 514 op.u.bind_interdomain.remote_port = bind_intd->remote_port; 515 if ((error = HYPERVISOR_event_channel_op(&op))) 516 return -error; 517 bind_intd->port = op.u.bind_interdomain.local_port; 518 devevent[bind_intd->port] = d; 519 hypervisor_unmask_event(bind_intd->port); 520 break; 521 } 522 case IOCTL_EVTCHN_BIND_UNBOUND_PORT: 523 { 524 struct ioctl_evtchn_bind_unbound_port *bind_unbound = addr; 525 op.cmd = EVTCHNOP_alloc_unbound; 526 op.u.alloc_unbound.dom = DOMID_SELF; 527 op.u.alloc_unbound.remote_dom = 
bind_unbound->remote_domain; 528 if ((error = HYPERVISOR_event_channel_op(&op))) 529 return -error; 530 bind_unbound->port = op.u.alloc_unbound.port; 531 devevent[bind_unbound->port] = d; 532 hypervisor_unmask_event(bind_unbound->port); 533 break; 534 } 535 case IOCTL_EVTCHN_UNBIND: 536 { 537 struct ioctl_evtchn_unbind *unbind = addr; 538 539 if (unbind->port > NR_EVENT_CHANNELS) 540 return EINVAL; 541 if (devevent[unbind->port] != d) 542 return ENOTCONN; 543 devevent[unbind->port] = NULL; 544 hypervisor_mask_event(unbind->port); 545 op.cmd = EVTCHNOP_close; 546 op.u.close.port = unbind->port; 547 if ((error = HYPERVISOR_event_channel_op(&op))) 548 return -error; 549 break; 550 } 551 case IOCTL_EVTCHN_NOTIFY: 552 { 553 struct ioctl_evtchn_notify *notify = addr; 554 555 if (notify->port > NR_EVENT_CHANNELS) 556 return EINVAL; 557 if (devevent[notify->port] != d) 558 return ENOTCONN; 559 hypervisor_notify_via_evtchn(notify->port); 560 break; 561 } 562 case FIONBIO: 563 break; 564 default: 565 return EINVAL; 566 } 567 return 0; 568} 569 570/* 571 * Support for poll() system call 572 * 573 * Return true if the specific operation will not block indefinitely. 574 */ 575 576static int 577xenevt_fpoll(struct file *fp, int events) 578{ 579 struct xenevt_d *d = fp->f_data; 580 int revents = events & (POLLOUT | POLLWRNORM); /* we can always write */ 581 582 mutex_enter(&d->lock); 583 if (events & (POLLIN | POLLRDNORM)) { 584 if (d->ring_read != d->ring_write) { 585 revents |= events & (POLLIN | POLLRDNORM); 586 } else { 587 /* Record that someone is waiting */ 588 selrecord(curlwp, &d->sel); 589 } 590 } 591 mutex_exit(&d->lock); 592 return (revents); 593} 594