ib_umem_odp.c revision 319974
/*
 * Copyright (c) 2014 Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <linux/types.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

#include <rdma/ib_verbs.h>
#include <rdma/ib_umem.h>
#include <rdma/ib_umem_odp.h>

static void ib_umem_notifier_start_account(struct ib_umem *item)
{
	mutex_lock(&item->odp_data->umem_mutex);

	/* Only update private counters for this umem if it has them.
	 * Otherwise skip it. All page faults will be delayed for this umem. */
	if (item->odp_data->mn_counters_active) {
		int notifiers_count = item->odp_data->notifiers_count++;

		if (notifiers_count == 0)
			/* Initialize the completion object for waiting on
			 * notifiers. Since notifier_count is zero, no one
			 * should be waiting right now. */
			reinit_completion(&item->odp_data->notifier_completion);
	}
	mutex_unlock(&item->odp_data->umem_mutex);
}

static void ib_umem_notifier_end_account(struct ib_umem *item)
{
	mutex_lock(&item->odp_data->umem_mutex);

	/* Only update private counters for this umem if it has them.
	 * Otherwise skip it. All page faults will be delayed for this umem. */
	if (item->odp_data->mn_counters_active) {
		/*
		 * This sequence increase will notify the QP page fault that
		 * the page that is going to be mapped in the spte could have
		 * been freed.
		 */
		++item->odp_data->notifiers_seq;
		if (--item->odp_data->notifiers_count == 0)
			complete_all(&item->odp_data->notifier_completion);
	}
	mutex_unlock(&item->odp_data->umem_mutex);
}

/* Account for a new mmu notifier in an ib_ucontext. */
static void ib_ucontext_notifier_start_account(struct ib_ucontext *context)
{
	atomic_inc(&context->notifier_count);
}

/* Account for a terminating mmu notifier in an ib_ucontext.
 *
 * Must be called with the ib_ucontext->umem_rwsem semaphore unlocked, since
 * the function takes the semaphore itself. */
static void ib_ucontext_notifier_end_account(struct ib_ucontext *context)
{
	int zero_notifiers = atomic_dec_and_test(&context->notifier_count);

	if (zero_notifiers &&
	    !list_empty(&context->no_private_counters)) {
		/* No currently running mmu notifiers. Now is the chance to
		 * add private accounting to all previously added umems. */
		struct ib_umem_odp *odp_data, *next;

		/* Prevent concurrent mmu notifiers from working on the
		 * no_private_counters list. */
		down_write(&context->umem_rwsem);

		/* Read the notifier_count again, with the umem_rwsem
		 * semaphore taken for write. */
		if (!atomic_read(&context->notifier_count)) {
			list_for_each_entry_safe(odp_data, next,
						 &context->no_private_counters,
						 no_private_counters) {
				mutex_lock(&odp_data->umem_mutex);
				odp_data->mn_counters_active = true;
				list_del(&odp_data->no_private_counters);
				complete_all(&odp_data->notifier_completion);
				mutex_unlock(&odp_data->umem_mutex);
			}
		}

		up_write(&context->umem_rwsem);
	}
}
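
/*
 * Illustrative sketch, not code from this file: the counters above follow
 * the usual mmu-notifier seq/count protocol. A page-fault path is expected
 * to sample notifiers_seq before bringing pages in, and to re-check it
 * under umem_mutex before publishing the mapping, roughly:
 *
 *	unsigned long seq = umem->odp_data->notifiers_seq;
 *	smp_rmb();
 *	... fault the pages in ...
 *	mutex_lock(&umem->odp_data->umem_mutex);
 *	if (ib_umem_mmu_notifier_retry(umem, seq))
 *		... an invalidation ran concurrently; retry the fault ...
 *	mutex_unlock(&umem->odp_data->umem_mutex);
 *
 * ib_umem_mmu_notifier_retry() (from rdma/ib_umem_odp.h) reports true when
 * notifiers_count is non-zero or notifiers_seq has advanced past seq.
 */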

static int ib_umem_notifier_release_trampoline(struct ib_umem *item, u64 start,
					       u64 end, void *cookie)
{
	/*
	 * Increase the number of notifiers running, to
	 * prevent any further fault handling on this MR.
	 */
	ib_umem_notifier_start_account(item);
	item->odp_data->dying = 1;
	/* Make sure that the fact the umem is dying is out before we release
	 * all pending page faults. */
	smp_wmb();
	complete_all(&item->odp_data->notifier_completion);
	item->context->invalidate_range(item, ib_umem_start(item),
					ib_umem_end(item));
	return 0;
}

static void ib_umem_notifier_release(struct mmu_notifier *mn,
				     struct mm_struct *mm)
{
	struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);

	if (!context->invalidate_range)
		return;

	ib_ucontext_notifier_start_account(context);
	down_read(&context->umem_rwsem);
	rbt_ib_umem_for_each_in_range(&context->umem_tree, 0,
				      ULLONG_MAX,
				      ib_umem_notifier_release_trampoline,
				      NULL);
	up_read(&context->umem_rwsem);
}

static int invalidate_page_trampoline(struct ib_umem *item, u64 start,
				      u64 end, void *cookie)
{
	ib_umem_notifier_start_account(item);
	item->context->invalidate_range(item, start, start + PAGE_SIZE);
	ib_umem_notifier_end_account(item);
	return 0;
}

static void ib_umem_notifier_invalidate_page(struct mmu_notifier *mn,
					     struct mm_struct *mm,
					     unsigned long address)
{
	struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);

	if (!context->invalidate_range)
		return;

	ib_ucontext_notifier_start_account(context);
	down_read(&context->umem_rwsem);
	rbt_ib_umem_for_each_in_range(&context->umem_tree, address,
				      address + PAGE_SIZE,
				      invalidate_page_trampoline, NULL);
	up_read(&context->umem_rwsem);
	ib_ucontext_notifier_end_account(context);
}

static int invalidate_range_start_trampoline(struct ib_umem *item, u64 start,
					     u64 end, void *cookie)
{
	ib_umem_notifier_start_account(item);
	item->context->invalidate_range(item, start, end);
	return 0;
}

static void ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn,
						    struct mm_struct *mm,
						    unsigned long start,
						    unsigned long end)
{
	struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);

	if (!context->invalidate_range)
		return;

	ib_ucontext_notifier_start_account(context);
	down_read(&context->umem_rwsem);
	rbt_ib_umem_for_each_in_range(&context->umem_tree, start,
				      end,
				      invalidate_range_start_trampoline, NULL);
	up_read(&context->umem_rwsem);
}

static int invalidate_range_end_trampoline(struct ib_umem *item, u64 start,
					   u64 end, void *cookie)
{
	ib_umem_notifier_end_account(item);
	return 0;
}

static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn,
						  struct mm_struct *mm,
						  unsigned long start,
						  unsigned long end)
{
	struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);

	if (!context->invalidate_range)
		return;

	down_read(&context->umem_rwsem);
	rbt_ib_umem_for_each_in_range(&context->umem_tree, start,
				      end,
				      invalidate_range_end_trampoline, NULL);
	up_read(&context->umem_rwsem);
	ib_ucontext_notifier_end_account(context);
}

static const struct mmu_notifier_ops ib_umem_notifiers = {
	.release                    = ib_umem_notifier_release,
	.invalidate_page            = ib_umem_notifier_invalidate_page,
	.invalidate_range_start     = ib_umem_notifier_invalidate_range_start,
	.invalidate_range_end       = ib_umem_notifier_invalidate_range_end,
};

int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem)
{
	int ret_val;
	pid_t our_pid;
	struct mm_struct *mm = get_task_mm(current);

	if (!mm)
		return -EINVAL;

	/* Prevent creating ODP MRs in child processes */
	rcu_read_lock();
	our_pid = get_pid(task_pid_group_leader(current));
	rcu_read_unlock();
	put_pid(our_pid);
	if (context->tgid != our_pid) {
		ret_val = -EINVAL;
		goto out_mm;
	}

	umem->odp_data = kzalloc(sizeof(*umem->odp_data), GFP_KERNEL);
	if (!umem->odp_data) {
		ret_val = -ENOMEM;
		goto out_mm;
	}
	umem->odp_data->umem = umem;

	mutex_init(&umem->odp_data->umem_mutex);

	init_completion(&umem->odp_data->notifier_completion);

	umem->odp_data->page_list = vzalloc(ib_umem_num_pages(umem) *
					    sizeof(*umem->odp_data->page_list));
	if (!umem->odp_data->page_list) {
		ret_val = -ENOMEM;
		goto out_odp_data;
	}

	umem->odp_data->dma_list = vzalloc(ib_umem_num_pages(umem) *
					   sizeof(*umem->odp_data->dma_list));
	if (!umem->odp_data->dma_list) {
		ret_val = -ENOMEM;
		goto out_page_list;
	}

	/*
	 * When using MMU notifiers, we will get a
	 * notification before the "current" task (and MM) is
	 * destroyed. We use the umem_rwsem semaphore to synchronize.
	 */
	down_write(&context->umem_rwsem);
	context->odp_mrs_count++;
	if (likely(ib_umem_start(umem) != ib_umem_end(umem)))
		rbt_ib_umem_insert(&umem->odp_data->interval_tree,
				   &context->umem_tree);
	if (likely(!atomic_read(&context->notifier_count)) ||
	    context->odp_mrs_count == 1)
		umem->odp_data->mn_counters_active = true;
	else
		list_add(&umem->odp_data->no_private_counters,
			 &context->no_private_counters);
	downgrade_write(&context->umem_rwsem);

	if (context->odp_mrs_count == 1) {
		/*
		 * Note that at this point, no MMU notifier is running
		 * for this context!
		 */
		atomic_set(&context->notifier_count, 0);
		INIT_HLIST_NODE(&context->mn.hlist);
		context->mn.ops = &ib_umem_notifiers;
		/*
		 * Lock-dep detects a false positive for mmap_sem vs.
		 * umem_rwsem, due to not grasping downgrade_write correctly.
		 */
		ret_val = mmu_notifier_register(&context->mn, mm);
		if (ret_val) {
			pr_err("Failed to register mmu_notifier %d\n", ret_val);
			ret_val = -EBUSY;
			goto out_mutex;
		}
	}

	up_read(&context->umem_rwsem);

	/*
	 * Note that doing an mmput can cause a notifier for the relevant mm.
	 * If the notifier is called while we hold the umem_rwsem, this will
	 * cause a deadlock. Therefore, we release the reference only after we
	 * released the semaphore.
	 */
	mmput(mm);
	return 0;

out_mutex:
	up_read(&context->umem_rwsem);
	vfree(umem->odp_data->dma_list);
out_page_list:
	vfree(umem->odp_data->page_list);
out_odp_data:
	kfree(umem->odp_data);
out_mm:
	mmput(mm);
	return ret_val;
}
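
/*
 * Illustrative sketch, not code from this file: consumers normally reach
 * ib_umem_odp_get() through ib_umem_get() rather than calling it directly.
 * When a registration requests on-demand paging, ib_umem_get() does,
 * roughly:
 *
 *	if (access & IB_ACCESS_ON_DEMAND) {
 *		ret = ib_umem_odp_get(context, umem);
 *		if (ret)
 *			return ERR_PTR(ret);
 *		return umem;
 *	}
 */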

void ib_umem_odp_release(struct ib_umem *umem)
{
	struct ib_ucontext *context = umem->context;

	/*
	 * Ensure that no more pages are mapped in the umem.
	 *
	 * It is the driver's responsibility to ensure, before calling us,
	 * that the hardware will not attempt to access the MR any more.
	 */
	ib_umem_odp_unmap_dma_pages(umem, ib_umem_start(umem),
				    ib_umem_end(umem));

	down_write(&context->umem_rwsem);
	if (likely(ib_umem_start(umem) != ib_umem_end(umem)))
		rbt_ib_umem_remove(&umem->odp_data->interval_tree,
				   &context->umem_tree);
	context->odp_mrs_count--;
	if (!umem->odp_data->mn_counters_active) {
		list_del(&umem->odp_data->no_private_counters);
		complete_all(&umem->odp_data->notifier_completion);
	}

	/*
	 * Downgrade the lock to a read lock. This ensures that the notifiers
	 * (who lock the mutex for reading) will be able to finish, and we
	 * will be able to eventually obtain the mmu notifiers SRCU. Note
	 * that since we are doing it atomically, no other user could register
	 * and unregister while we do the check.
	 */
	downgrade_write(&context->umem_rwsem);
	if (!context->odp_mrs_count) {
		struct task_struct *owning_process = NULL;
		struct mm_struct *owning_mm = NULL;

		owning_process = get_pid_task(context->tgid,
					      PIDTYPE_PID);
		if (owning_process == NULL)
			/*
			 * The process is already dead; the notifiers were
			 * removed already.
			 */
			goto out;

		owning_mm = get_task_mm(owning_process);
		if (owning_mm == NULL)
			/*
			 * The process' mm is already dead; the notifiers
			 * were removed already.
			 */
			goto out_put_task;
		mmu_notifier_unregister(&context->mn, owning_mm);

		mmput(owning_mm);

out_put_task:
		put_task_struct(owning_process);
	}
out:
	up_read(&context->umem_rwsem);

	vfree(umem->odp_data->dma_list);
	vfree(umem->odp_data->page_list);
	kfree(umem->odp_data);
	kfree(umem);
}

/*
 * Map for DMA and insert a single page into the on-demand paging page tables.
 *
 * @umem: the umem to insert the page to.
 * @page_index: index in the umem to add the page to.
 * @page: the page struct to map and add.
 * @access_mask: access permissions needed for this page.
 * @current_seq: sequence number for synchronization with invalidations.
 *               The sequence number is taken from
 *               umem->odp_data->notifiers_seq.
 *
 * The function returns -EFAULT if the DMA mapping operation fails. It returns
 * -EAGAIN if a concurrent invalidation prevents us from updating the page.
 *
 * The page is released via put_page even if the operation failed. For
 * on-demand pinning, the page is released whenever it isn't stored in the
 * umem.
 */
static int ib_umem_odp_map_dma_single_page(
		struct ib_umem *umem,
		int page_index,
		u64 base_virt_addr,
		struct page *page,
		u64 access_mask,
		unsigned long current_seq)
{
	struct ib_device *dev = umem->context->device;
	dma_addr_t dma_addr;
	int stored_page = 0;
	int remove_existing_mapping = 0;
	int ret = 0;

	/*
	 * Note: we avoid writing if seq is different from the initial seq, to
	 * handle the case of a racing notifier. This check also allows us to
	 * bail early if we have a notifier running in parallel with us.
	 */
	if (ib_umem_mmu_notifier_retry(umem, current_seq)) {
		ret = -EAGAIN;
		goto out;
	}
	if (!(umem->odp_data->dma_list[page_index])) {
		dma_addr = ib_dma_map_page(dev,
					   page,
					   0, PAGE_SIZE,
					   DMA_BIDIRECTIONAL);
		if (ib_dma_mapping_error(dev, dma_addr)) {
			ret = -EFAULT;
			goto out;
		}
		umem->odp_data->dma_list[page_index] = dma_addr | access_mask;
		umem->odp_data->page_list[page_index] = page;
		stored_page = 1;
	} else if (umem->odp_data->page_list[page_index] == page) {
		umem->odp_data->dma_list[page_index] |= access_mask;
	} else {
		pr_err("error: got different pages in IB device and from get_user_pages. IB device page: %p, gup page: %p\n",
		       umem->odp_data->page_list[page_index], page);
		/* Better remove the mapping now, to prevent any further
		 * damage. */
		remove_existing_mapping = 1;
	}

out:
	/* On Demand Paging - avoid pinning the page */
	if (umem->context->invalidate_range || !stored_page)
		put_page(page);

	if (remove_existing_mapping && umem->context->invalidate_range) {
		invalidate_page_trampoline(
			umem,
			base_virt_addr + (page_index * PAGE_SIZE),
			base_virt_addr + ((page_index + 1) * PAGE_SIZE),
			NULL);
		ret = -EAGAIN;
	}

	return ret;
}
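
/*
 * Illustrative sketch, not code from this file: a dma_list entry packs the
 * page-aligned bus address together with the access bits (the low bits are
 * free because DMA addresses are PAGE_SIZE aligned). An entry is decoded
 * the same way ib_umem_odp_unmap_dma_pages() does below:
 *
 *	dma_addr_t entry = umem->odp_data->dma_list[page_index];
 *	dma_addr_t dma_addr = entry & ODP_DMA_ADDR_MASK;
 *	int writable = (entry & ODP_WRITE_ALLOWED_BIT) != 0;
 *
 * ODP_DMA_ADDR_MASK and the access bits are defined in rdma/ib_umem_odp.h.
 */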

/**
 * ib_umem_odp_map_dma_pages - Pin and DMA map userspace memory in an ODP MR.
 *
 * Pins the range of pages passed in the argument, and maps them to
 * DMA addresses. The DMA addresses of the mapped pages are updated in
 * umem->odp_data->dma_list.
 *
 * Returns the number of pages mapped on success, and a negative error code
 * on failure.
 * An -EAGAIN error code is returned when a concurrent mmu notifier prevents
 * the function from completing its task.
 *
 * @umem: the umem to map and pin
 * @user_virt: the address from which we need to map.
 * @bcnt: the minimal number of bytes to pin and map. The mapping might be
 *        bigger due to alignment, and may also be smaller in case of an error
 *        pinning or mapping a page. The actual number of pages mapped is
 *        returned in the return value.
 * @access_mask: bit mask of the requested access permissions for the given
 *               range.
 * @current_seq: the MMU notifiers sequence value for synchronization with
 *               invalidations. The sequence number is read from
 *               umem->odp_data->notifiers_seq before calling this function.
 */
int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt,
			      u64 access_mask, unsigned long current_seq)
{
	struct task_struct *owning_process = NULL;
	struct mm_struct *owning_mm = NULL;
	struct page **local_page_list = NULL;
	u64 off;
	int j, k, ret = 0, start_idx, npages = 0;
	u64 base_virt_addr;
	unsigned int flags = 0;

	if (access_mask == 0)
		return -EINVAL;

	if (user_virt < ib_umem_start(umem) ||
	    user_virt + bcnt > ib_umem_end(umem))
		return -EFAULT;

	local_page_list = (struct page **)__get_free_page(GFP_KERNEL);
	if (!local_page_list)
		return -ENOMEM;

	off = user_virt & (~PAGE_MASK);
	user_virt = user_virt & PAGE_MASK;
	base_virt_addr = user_virt;
	bcnt += off; /* Charge for the first page offset as well. */

	owning_process = get_pid_task(umem->context->tgid, PIDTYPE_PID);
	if (owning_process == NULL) {
		ret = -EINVAL;
		goto out_no_task;
	}

	owning_mm = get_task_mm(owning_process);
	if (owning_mm == NULL) {
		ret = -EINVAL;
		goto out_put_task;
	}

	if (access_mask & ODP_WRITE_ALLOWED_BIT)
		flags |= FOLL_WRITE;

	start_idx = (user_virt - ib_umem_start(umem)) >> PAGE_SHIFT;
	k = start_idx;

	while (bcnt > 0) {
		const size_t gup_num_pages =
			min_t(size_t, ALIGN(bcnt, PAGE_SIZE) / PAGE_SIZE,
			      PAGE_SIZE / sizeof(struct page *));

		down_read(&owning_mm->mmap_sem);
		/*
		 * Note: this might result in redundant page getting. We can
		 * avoid this by checking dma_list to be 0 before calling
		 * get_user_pages. However, this makes the code much more
		 * complex (and doesn't gain us much performance in most use
		 * cases).
		 */
		npages = get_user_pages_remote(owning_process, owning_mm,
					       user_virt, gup_num_pages,
					       flags, local_page_list, NULL);
		up_read(&owning_mm->mmap_sem);

		if (npages < 0)
			break;

		bcnt -= min_t(size_t, npages << PAGE_SHIFT, bcnt);
		user_virt += npages << PAGE_SHIFT;
		mutex_lock(&umem->odp_data->umem_mutex);
		for (j = 0; j < npages; ++j) {
			ret = ib_umem_odp_map_dma_single_page(
				umem, k, base_virt_addr, local_page_list[j],
				access_mask, current_seq);
			if (ret < 0)
				break;
			k++;
		}
		mutex_unlock(&umem->odp_data->umem_mutex);

		if (ret < 0) {
			/* Release left over pages when handling errors. */
			for (++j; j < npages; ++j)
				put_page(local_page_list[j]);
			break;
		}
	}

	if (ret >= 0) {
		if (npages < 0 && k == start_idx)
			ret = npages;
		else
			ret = k - start_idx;
	}

	mmput(owning_mm);
out_put_task:
	put_task_struct(owning_process);
out_no_task:
	free_page((unsigned long)local_page_list);
	return ret;
}
EXPORT_SYMBOL(ib_umem_odp_map_dma_pages);
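
/*
 * Illustrative sketch, not code from this file: a hardware page-fault
 * handler built on this API typically looks like the following (names and
 * control flow are hypothetical; the seq sampling matches the retry check
 * that ib_umem_odp_map_dma_single_page() performs internally):
 *
 *	unsigned long seq = umem->odp_data->notifiers_seq;
 *	smp_rmb();
 *	npages = ib_umem_odp_map_dma_pages(umem, io_virt, bcnt,
 *					   ODP_READ_ALLOWED_BIT |
 *					   ODP_WRITE_ALLOWED_BIT,
 *					   seq);
 *	if (npages == -EAGAIN)
 *		... a concurrent invalidation won the race; fault again ...
 *	else if (npages > 0)
 *		... npages entries of dma_list are now valid and can be
 *		    programmed into the HW translation tables ...
 */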

void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 virt,
				 u64 bound)
{
	int idx;
	u64 addr;
	struct ib_device *dev = umem->context->device;

	virt = max_t(u64, virt, ib_umem_start(umem));
	bound = min_t(u64, bound, ib_umem_end(umem));
	/* Note that during the run of this function, the
	 * notifiers_count of the MR is > 0, preventing any racing
	 * faults from completing. We might be racing with other
	 * invalidations, so we must make sure we free each page only
	 * once.
	 */
	mutex_lock(&umem->odp_data->umem_mutex);
	for (addr = virt; addr < bound; addr += (u64)umem->page_size) {
		idx = (addr - ib_umem_start(umem)) / PAGE_SIZE;
		if (umem->odp_data->page_list[idx]) {
			struct page *page = umem->odp_data->page_list[idx];
			dma_addr_t dma = umem->odp_data->dma_list[idx];
			dma_addr_t dma_addr = dma & ODP_DMA_ADDR_MASK;

			WARN_ON(!dma_addr);

			ib_dma_unmap_page(dev, dma_addr, PAGE_SIZE,
					  DMA_BIDIRECTIONAL);
			if (dma & ODP_WRITE_ALLOWED_BIT) {
				struct page *head_page = compound_head(page);
				/*
				 * set_page_dirty prefers being called with
				 * the page lock. However, MMU notifiers are
				 * called sometimes with and sometimes without
				 * the lock. We rely on the umem_mutex instead
				 * to prevent other mmu notifiers from
				 * continuing and allowing the page mapping to
				 * be removed.
				 */
				set_page_dirty(head_page);
			}
			/* on demand pinning support */
			if (!umem->context->invalidate_range)
				put_page(page);
			umem->odp_data->page_list[idx] = NULL;
			umem->odp_data->dma_list[idx] = 0;
		}
	}
	mutex_unlock(&umem->odp_data->umem_mutex);
}
EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages);