1/* 2 * kvm eventfd support - use eventfd objects to signal various KVM events 3 * 4 * Copyright 2009 Novell. All Rights Reserved. 5 * Copyright 2010 Red Hat, Inc. and/or its affiliates. 6 * 7 * Author: 8 * Gregory Haskins <ghaskins@novell.com> 9 * 10 * This file is free software; you can redistribute it and/or modify 11 * it under the terms of version 2 of the GNU General Public License 12 * as published by the Free Software Foundation. 13 * 14 * This program is distributed in the hope that it will be useful, 15 * but WITHOUT ANY WARRANTY; without even the implied warranty of 16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 * GNU General Public License for more details. 18 * 19 * You should have received a copy of the GNU General Public License 20 * along with this program; if not, write to the Free Software Foundation, 21 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. 22 */ 23 24#include <linux/kvm_host.h> 25#include <linux/kvm.h> 26#include <linux/workqueue.h> 27#include <linux/syscalls.h> 28#include <linux/wait.h> 29#include <linux/poll.h> 30#include <linux/file.h> 31#include <linux/list.h> 32#include <linux/eventfd.h> 33#include <linux/kernel.h> 34#include <linux/slab.h> 35 36#include "iodev.h" 37 38/* 39 * -------------------------------------------------------------------- 40 * irqfd: Allows an fd to be used to inject an interrupt to the guest 41 * 42 * Credit goes to Avi Kivity for the original idea. 43 * -------------------------------------------------------------------- 44 */ 45 46struct _irqfd { 47 struct kvm *kvm; 48 struct eventfd_ctx *eventfd; 49 int gsi; 50 struct list_head list; 51 poll_table pt; 52 wait_queue_t wait; 53 struct work_struct inject; 54 struct work_struct shutdown; 55}; 56 57static struct workqueue_struct *irqfd_cleanup_wq; 58 59static void 60irqfd_inject(struct work_struct *work) 61{ 62 struct _irqfd *irqfd = container_of(work, struct _irqfd, inject); 63 struct kvm *kvm = irqfd->kvm; 64 65 kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1); 66 kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0); 67} 68 69/* 70 * Race-free decouple logic (ordering is critical) 71 */ 72static void 73irqfd_shutdown(struct work_struct *work) 74{ 75 struct _irqfd *irqfd = container_of(work, struct _irqfd, shutdown); 76 u64 cnt; 77 78 /* 79 * Synchronize with the wait-queue and unhook ourselves to prevent 80 * further events. 81 */ 82 eventfd_ctx_remove_wait_queue(irqfd->eventfd, &irqfd->wait, &cnt); 83 84 /* 85 * We know no new events will be scheduled at this point, so block 86 * until all previously outstanding events have completed 87 */ 88 flush_work(&irqfd->inject); 89 90 /* 91 * It is now safe to release the object's resources 92 */ 93 eventfd_ctx_put(irqfd->eventfd); 94 kfree(irqfd); 95} 96 97 98/* assumes kvm->irqfds.lock is held */ 99static bool 100irqfd_is_active(struct _irqfd *irqfd) 101{ 102 return list_empty(&irqfd->list) ? false : true; 103} 104 105/* 106 * Mark the irqfd as inactive and schedule it for removal 107 * 108 * assumes kvm->irqfds.lock is held 109 */ 110static void 111irqfd_deactivate(struct _irqfd *irqfd) 112{ 113 BUG_ON(!irqfd_is_active(irqfd)); 114 115 list_del_init(&irqfd->list); 116 117 queue_work(irqfd_cleanup_wq, &irqfd->shutdown); 118} 119 120/* 121 * Called with wqh->lock held and interrupts disabled 122 */ 123static int 124irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key) 125{ 126 struct _irqfd *irqfd = container_of(wait, struct _irqfd, wait); 127 unsigned long flags = (unsigned long)key; 128 129 if (flags & POLLIN) 130 /* An event has been signaled, inject an interrupt */ 131 schedule_work(&irqfd->inject); 132 133 if (flags & POLLHUP) { 134 /* The eventfd is closing, detach from KVM */ 135 struct kvm *kvm = irqfd->kvm; 136 unsigned long flags; 137 138 spin_lock_irqsave(&kvm->irqfds.lock, flags); 139 140 /* 141 * We must check if someone deactivated the irqfd before 142 * we could acquire the irqfds.lock since the item is 143 * deactivated from the KVM side before it is unhooked from 144 * the wait-queue. If it is already deactivated, we can 145 * simply return knowing the other side will cleanup for us. 146 * We cannot race against the irqfd going away since the 147 * other side is required to acquire wqh->lock, which we hold 148 */ 149 if (irqfd_is_active(irqfd)) 150 irqfd_deactivate(irqfd); 151 152 spin_unlock_irqrestore(&kvm->irqfds.lock, flags); 153 } 154 155 return 0; 156} 157 158static void 159irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh, 160 poll_table *pt) 161{ 162 struct _irqfd *irqfd = container_of(pt, struct _irqfd, pt); 163 add_wait_queue(wqh, &irqfd->wait); 164} 165 166static int 167kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi) 168{ 169 struct _irqfd *irqfd, *tmp; 170 struct file *file = NULL; 171 struct eventfd_ctx *eventfd = NULL; 172 int ret; 173 unsigned int events; 174 175 irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL); 176 if (!irqfd) 177 return -ENOMEM; 178 179 irqfd->kvm = kvm; 180 irqfd->gsi = gsi; 181 INIT_LIST_HEAD(&irqfd->list); 182 INIT_WORK(&irqfd->inject, irqfd_inject); 183 INIT_WORK(&irqfd->shutdown, irqfd_shutdown); 184 185 file = eventfd_fget(fd); 186 if (IS_ERR(file)) { 187 ret = PTR_ERR(file); 188 goto fail; 189 } 190 191 eventfd = eventfd_ctx_fileget(file); 192 if (IS_ERR(eventfd)) { 193 ret = PTR_ERR(eventfd); 194 goto fail; 195 } 196 197 irqfd->eventfd = eventfd; 198 199 /* 200 * Install our own custom wake-up handling so we are notified via 201 * a callback whenever someone signals the underlying eventfd 202 */ 203 init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup); 204 init_poll_funcptr(&irqfd->pt, irqfd_ptable_queue_proc); 205 206 spin_lock_irq(&kvm->irqfds.lock); 207 208 ret = 0; 209 list_for_each_entry(tmp, &kvm->irqfds.items, list) { 210 if (irqfd->eventfd != tmp->eventfd) 211 continue; 212 /* This fd is used for another irq already. */ 213 ret = -EBUSY; 214 spin_unlock_irq(&kvm->irqfds.lock); 215 goto fail; 216 } 217 218 events = file->f_op->poll(file, &irqfd->pt); 219 220 list_add_tail(&irqfd->list, &kvm->irqfds.items); 221 222 /* 223 * Check if there was an event already pending on the eventfd 224 * before we registered, and trigger it as if we didn't miss it. 225 */ 226 if (events & POLLIN) 227 schedule_work(&irqfd->inject); 228 229 spin_unlock_irq(&kvm->irqfds.lock); 230 231 /* 232 * do not drop the file until the irqfd is fully initialized, otherwise 233 * we might race against the POLLHUP 234 */ 235 fput(file); 236 237 return 0; 238 239fail: 240 if (eventfd && !IS_ERR(eventfd)) 241 eventfd_ctx_put(eventfd); 242 243 if (!IS_ERR(file)) 244 fput(file); 245 246 kfree(irqfd); 247 return ret; 248} 249 250void 251kvm_eventfd_init(struct kvm *kvm) 252{ 253 spin_lock_init(&kvm->irqfds.lock); 254 INIT_LIST_HEAD(&kvm->irqfds.items); 255 INIT_LIST_HEAD(&kvm->ioeventfds); 256} 257 258/* 259 * shutdown any irqfd's that match fd+gsi 260 */ 261static int 262kvm_irqfd_deassign(struct kvm *kvm, int fd, int gsi) 263{ 264 struct _irqfd *irqfd, *tmp; 265 struct eventfd_ctx *eventfd; 266 267 eventfd = eventfd_ctx_fdget(fd); 268 if (IS_ERR(eventfd)) 269 return PTR_ERR(eventfd); 270 271 spin_lock_irq(&kvm->irqfds.lock); 272 273 list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) { 274 if (irqfd->eventfd == eventfd && irqfd->gsi == gsi) 275 irqfd_deactivate(irqfd); 276 } 277 278 spin_unlock_irq(&kvm->irqfds.lock); 279 eventfd_ctx_put(eventfd); 280 281 /* 282 * Block until we know all outstanding shutdown jobs have completed 283 * so that we guarantee there will not be any more interrupts on this 284 * gsi once this deassign function returns. 285 */ 286 flush_workqueue(irqfd_cleanup_wq); 287 288 return 0; 289} 290 291int 292kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags) 293{ 294 if (flags & KVM_IRQFD_FLAG_DEASSIGN) 295 return kvm_irqfd_deassign(kvm, fd, gsi); 296 297 return kvm_irqfd_assign(kvm, fd, gsi); 298} 299 300/* 301 * This function is called as the kvm VM fd is being released. Shutdown all 302 * irqfds that still remain open 303 */ 304void 305kvm_irqfd_release(struct kvm *kvm) 306{ 307 struct _irqfd *irqfd, *tmp; 308 309 spin_lock_irq(&kvm->irqfds.lock); 310 311 list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) 312 irqfd_deactivate(irqfd); 313 314 spin_unlock_irq(&kvm->irqfds.lock); 315 316 /* 317 * Block until we know all outstanding shutdown jobs have completed 318 * since we do not take a kvm* reference. 319 */ 320 flush_workqueue(irqfd_cleanup_wq); 321 322} 323 324/* 325 * create a host-wide workqueue for issuing deferred shutdown requests 326 * aggregated from all vm* instances. We need our own isolated single-thread 327 * queue to prevent deadlock against flushing the normal work-queue. 328 */ 329static int __init irqfd_module_init(void) 330{ 331 irqfd_cleanup_wq = create_singlethread_workqueue("kvm-irqfd-cleanup"); 332 if (!irqfd_cleanup_wq) 333 return -ENOMEM; 334 335 return 0; 336} 337 338static void __exit irqfd_module_exit(void) 339{ 340 destroy_workqueue(irqfd_cleanup_wq); 341} 342 343module_init(irqfd_module_init); 344module_exit(irqfd_module_exit); 345 346/* 347 * -------------------------------------------------------------------- 348 * ioeventfd: translate a PIO/MMIO memory write to an eventfd signal. 349 * 350 * userspace can register a PIO/MMIO address with an eventfd for receiving 351 * notification when the memory has been touched. 352 * -------------------------------------------------------------------- 353 */ 354 355struct _ioeventfd { 356 struct list_head list; 357 u64 addr; 358 int length; 359 struct eventfd_ctx *eventfd; 360 u64 datamatch; 361 struct kvm_io_device dev; 362 bool wildcard; 363}; 364 365static inline struct _ioeventfd * 366to_ioeventfd(struct kvm_io_device *dev) 367{ 368 return container_of(dev, struct _ioeventfd, dev); 369} 370 371static void 372ioeventfd_release(struct _ioeventfd *p) 373{ 374 eventfd_ctx_put(p->eventfd); 375 list_del(&p->list); 376 kfree(p); 377} 378 379static bool 380ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr, int len, const void *val) 381{ 382 u64 _val; 383 384 if (!(addr == p->addr && len == p->length)) 385 /* address-range must be precise for a hit */ 386 return false; 387 388 if (p->wildcard) 389 /* all else equal, wildcard is always a hit */ 390 return true; 391 392 /* otherwise, we have to actually compare the data */ 393 394 BUG_ON(!IS_ALIGNED((unsigned long)val, len)); 395 396 switch (len) { 397 case 1: 398 _val = *(u8 *)val; 399 break; 400 case 2: 401 _val = *(u16 *)val; 402 break; 403 case 4: 404 _val = *(u32 *)val; 405 break; 406 case 8: 407 _val = *(u64 *)val; 408 break; 409 default: 410 return false; 411 } 412 413 return _val == p->datamatch ? true : false; 414} 415 416/* MMIO/PIO writes trigger an event if the addr/val match */ 417static int 418ioeventfd_write(struct kvm_io_device *this, gpa_t addr, int len, 419 const void *val) 420{ 421 struct _ioeventfd *p = to_ioeventfd(this); 422 423 if (!ioeventfd_in_range(p, addr, len, val)) 424 return -EOPNOTSUPP; 425 426 eventfd_signal(p->eventfd, 1); 427 return 0; 428} 429 430/* 431 * This function is called as KVM is completely shutting down. We do not 432 * need to worry about locking just nuke anything we have as quickly as possible 433 */ 434static void 435ioeventfd_destructor(struct kvm_io_device *this) 436{ 437 struct _ioeventfd *p = to_ioeventfd(this); 438 439 ioeventfd_release(p); 440} 441 442static const struct kvm_io_device_ops ioeventfd_ops = { 443 .write = ioeventfd_write, 444 .destructor = ioeventfd_destructor, 445}; 446 447/* assumes kvm->slots_lock held */ 448static bool 449ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd *p) 450{ 451 struct _ioeventfd *_p; 452 453 list_for_each_entry(_p, &kvm->ioeventfds, list) 454 if (_p->addr == p->addr && _p->length == p->length && 455 (_p->wildcard || p->wildcard || 456 _p->datamatch == p->datamatch)) 457 return true; 458 459 return false; 460} 461 462static int 463kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) 464{ 465 int pio = args->flags & KVM_IOEVENTFD_FLAG_PIO; 466 enum kvm_bus bus_idx = pio ? KVM_PIO_BUS : KVM_MMIO_BUS; 467 struct _ioeventfd *p; 468 struct eventfd_ctx *eventfd; 469 int ret; 470 471 /* must be natural-word sized */ 472 switch (args->len) { 473 case 1: 474 case 2: 475 case 4: 476 case 8: 477 break; 478 default: 479 return -EINVAL; 480 } 481 482 /* check for range overflow */ 483 if (args->addr + args->len < args->addr) 484 return -EINVAL; 485 486 /* check for extra flags that we don't understand */ 487 if (args->flags & ~KVM_IOEVENTFD_VALID_FLAG_MASK) 488 return -EINVAL; 489 490 eventfd = eventfd_ctx_fdget(args->fd); 491 if (IS_ERR(eventfd)) 492 return PTR_ERR(eventfd); 493 494 p = kzalloc(sizeof(*p), GFP_KERNEL); 495 if (!p) { 496 ret = -ENOMEM; 497 goto fail; 498 } 499 500 INIT_LIST_HEAD(&p->list); 501 p->addr = args->addr; 502 p->length = args->len; 503 p->eventfd = eventfd; 504 505 /* The datamatch feature is optional, otherwise this is a wildcard */ 506 if (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH) 507 p->datamatch = args->datamatch; 508 else 509 p->wildcard = true; 510 511 mutex_lock(&kvm->slots_lock); 512 513 /* Verify that there isnt a match already */ 514 if (ioeventfd_check_collision(kvm, p)) { 515 ret = -EEXIST; 516 goto unlock_fail; 517 } 518 519 kvm_iodevice_init(&p->dev, &ioeventfd_ops); 520 521 ret = kvm_io_bus_register_dev(kvm, bus_idx, &p->dev); 522 if (ret < 0) 523 goto unlock_fail; 524 525 list_add_tail(&p->list, &kvm->ioeventfds); 526 527 mutex_unlock(&kvm->slots_lock); 528 529 return 0; 530 531unlock_fail: 532 mutex_unlock(&kvm->slots_lock); 533 534fail: 535 kfree(p); 536 eventfd_ctx_put(eventfd); 537 538 return ret; 539} 540 541static int 542kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) 543{ 544 int pio = args->flags & KVM_IOEVENTFD_FLAG_PIO; 545 enum kvm_bus bus_idx = pio ? KVM_PIO_BUS : KVM_MMIO_BUS; 546 struct _ioeventfd *p, *tmp; 547 struct eventfd_ctx *eventfd; 548 int ret = -ENOENT; 549 550 eventfd = eventfd_ctx_fdget(args->fd); 551 if (IS_ERR(eventfd)) 552 return PTR_ERR(eventfd); 553 554 mutex_lock(&kvm->slots_lock); 555 556 list_for_each_entry_safe(p, tmp, &kvm->ioeventfds, list) { 557 bool wildcard = !(args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH); 558 559 if (p->eventfd != eventfd || 560 p->addr != args->addr || 561 p->length != args->len || 562 p->wildcard != wildcard) 563 continue; 564 565 if (!p->wildcard && p->datamatch != args->datamatch) 566 continue; 567 568 kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev); 569 ioeventfd_release(p); 570 ret = 0; 571 break; 572 } 573 574 mutex_unlock(&kvm->slots_lock); 575 576 eventfd_ctx_put(eventfd); 577 578 return ret; 579} 580 581int 582kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) 583{ 584 if (args->flags & KVM_IOEVENTFD_FLAG_DEASSIGN) 585 return kvm_deassign_ioeventfd(kvm, args); 586 587 return kvm_assign_ioeventfd(kvm, args); 588} 589