/*
 * Copyright (c) 2005 Topspin Communications.  All rights reserved.
 * Copyright (c) 2005 Cisco Systems.  All rights reserved.
 * Copyright (c) 2005 Mellanox Technologies.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#define	LINUXKPI_PARAM_PREFIX ibcore_

#include <linux/mm.h>
#include <linux/dma-mapping.h>
#include <linux/sched.h>
#ifdef __linux__
#include <linux/hugetlb.h>
#endif
#include <linux/dma-attrs.h>

#include <sys/priv.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>

#include <vm/vm.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_pageout.h>

#include "uverbs.h"

static int allow_weak_ordering;
module_param(allow_weak_ordering, int, 0444);
MODULE_PARM_DESC(allow_weak_ordering, "Allow weak ordering for data registered memory");

#define IB_UMEM_MAX_PAGE_CHUNK						\
	((PAGE_SIZE - offsetof(struct ib_umem_chunk, page_list)) /	\
	 ((void *) &((struct ib_umem_chunk *) 0)->page_list[1] -	\
	  (void *) &((struct ib_umem_chunk *) 0)->page_list[0]))
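
/*
 * Worked example of the sizing above (illustrative; the exact numbers
 * are assumptions that vary by architecture): the divisor is the
 * distance between two page_list entries, i.e. sizeof(struct
 * scatterlist).  With 4 KB pages, a 64-byte chunk header and 32-byte
 * scatterlist entries, IB_UMEM_MAX_PAGE_CHUNK comes out to
 * (4096 - 64) / 32 = 126 entries, so one chunk header plus its
 * page_list fills a single page.
 */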

#ifdef __ia64__
extern int dma_map_sg_hp_wa;

static int dma_map_sg_ia64(struct ib_device *ibdev,
			   struct scatterlist *sg,
			   int nents,
			   enum dma_data_direction dir)
{
	int i, rc, j, lents = 0;
	struct device *dev;

	if (!dma_map_sg_hp_wa)
		return ib_dma_map_sg(ibdev, sg, nents, dir);

	dev = ibdev->dma_device;
	for (i = 0; i < nents; ++i) {
		rc = dma_map_sg(dev, sg + i, 1, dir);
		if (rc <= 0) {
			for (j = 0; j < i; ++j)
				dma_unmap_sg(dev, sg + j, 1, dir);

			return 0;
		}
		lents += rc;
	}

	return lents;
}

static void dma_unmap_sg_ia64(struct ib_device *ibdev,
			      struct scatterlist *sg,
			      int nents,
			      enum dma_data_direction dir)
{
	int i;
	struct device *dev;

	if (!dma_map_sg_hp_wa)
		return ib_dma_unmap_sg(ibdev, sg, nents, dir);

	dev = ibdev->dma_device;
	for (i = 0; i < nents; ++i)
		dma_unmap_sg(dev, sg + i, 1, dir);
}

#define ib_dma_map_sg(dev, sg, nents, dir) dma_map_sg_ia64(dev, sg, nents, dir)
#define ib_dma_unmap_sg(dev, sg, nents, dir) dma_unmap_sg_ia64(dev, sg, nents, dir)

#endif

static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)
{
#ifdef __linux__
	struct ib_umem_chunk *chunk, *tmp;
	int i;

	list_for_each_entry_safe(chunk, tmp, &umem->chunk_list, list) {
		ib_dma_unmap_sg_attrs(dev, chunk->page_list,
				      chunk->nents, DMA_BIDIRECTIONAL, &chunk->attrs);
		for (i = 0; i < chunk->nents; ++i) {
			struct page *page = sg_page(&chunk->page_list[i]);
			if (umem->writable && dirty)
				set_page_dirty_lock(page);
			put_page(page);
		}
		kfree(chunk);
	}
#else
	struct ib_umem_chunk *chunk, *tmp;
	vm_object_t object;
	int i;

	object = NULL;
	list_for_each_entry_safe(chunk, tmp, &umem->chunk_list, list) {
		ib_dma_unmap_sg_attrs(dev, chunk->page_list,
				      chunk->nents, DMA_BIDIRECTIONAL, &chunk->attrs);
		for (i = 0; i < chunk->nents; ++i) {
			struct page *page = sg_page(&chunk->page_list[i]);
			if (umem->writable && dirty) {
				if (object && object != page->object)
					VM_OBJECT_WUNLOCK(object);
				if (object != page->object) {
					object = page->object;
					VM_OBJECT_WLOCK(object);
				}
				vm_page_dirty(page);
			}
		}
		kfree(chunk);
	}
	if (object)
		VM_OBJECT_WUNLOCK(object);

#endif
}
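
/*
 * Illustrative sketch (kept out of the build): how a consumer
 * typically walks a pinned umem to collect bus addresses, e.g. to
 * program an HCA's translation tables.  The helper name and the
 * dma_pages output array are hypothetical; the chunk-list layout and
 * the sg_dma_address()/sg_dma_len() accessors are the real interface
 * used elsewhere in this file.
 */
#if 0
static int umem_collect_dma_pages(struct ib_umem *umem, u64 *dma_pages,
				  int max_pages)
{
	struct ib_umem_chunk *chunk;
	int shift = ilog2(umem->page_size);
	int i, j, n = 0;

	list_for_each_entry(chunk, &umem->chunk_list, list) {
		/* Only the first nmap entries carry DMA mappings. */
		for (i = 0; i < chunk->nmap; ++i) {
			u64 base = sg_dma_address(&chunk->page_list[i]);
			int pages = sg_dma_len(&chunk->page_list[i]) >> shift;

			for (j = 0; j < pages; ++j) {
				if (n == max_pages)
					return -ENOMEM;
				dma_pages[n++] = base + ((u64)j << shift);
			}
		}
	}
	return n;
}
#endif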

/**
 * ib_umem_get - Pin and DMA map userspace memory.
 * @context: userspace context to pin memory for
 * @addr: userspace virtual address to start at
 * @size: length of region to pin
 * @access: IB_ACCESS_xxx flags for memory being pinned
 * @dmasync: flush in-flight DMA when the memory region is written
 */
struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
			    size_t size, int access, int dmasync)
{
#ifdef __linux__
	struct ib_umem *umem;
	struct page **page_list;
	struct vm_area_struct **vma_list;
	struct ib_umem_chunk *chunk;
	unsigned long locked;
	unsigned long lock_limit;
	unsigned long cur_base;
	unsigned long npages;
	int ret;
	int off;
	int i;
	DEFINE_DMA_ATTRS(attrs);

	if (dmasync)
		dma_set_attr(DMA_ATTR_WRITE_BARRIER, &attrs);
	else if (allow_weak_ordering)
		dma_set_attr(DMA_ATTR_WEAK_ORDERING, &attrs);

	if (!can_do_mlock())
		return ERR_PTR(-EPERM);

	umem = kmalloc(sizeof *umem, GFP_KERNEL);
	if (!umem)
		return ERR_PTR(-ENOMEM);

	umem->context   = context;
	umem->length    = size;
	umem->offset    = addr & ~PAGE_MASK;
	umem->page_size = PAGE_SIZE;
	/*
	 * We ask for writable memory if any access flags other than
	 * "remote read" are set.  "Local write" and "remote write"
	 * obviously require write access.  "Remote atomic" can do
	 * things like fetch and add, which will modify memory, and
	 * "MW bind" can change permissions by binding a window.
	 */
	umem->writable  = !!(access & ~IB_ACCESS_REMOTE_READ);

	/* We assume the memory is from hugetlb until proved otherwise */
	umem->hugetlb   = 1;

	INIT_LIST_HEAD(&umem->chunk_list);

	page_list = (struct page **) __get_free_page(GFP_KERNEL);
	if (!page_list) {
		kfree(umem);
		return ERR_PTR(-ENOMEM);
	}

	/*
	 * if we can't alloc the vma_list, it's not so bad;
	 * just assume the memory is not hugetlb memory
	 */
	vma_list = (struct vm_area_struct **) __get_free_page(GFP_KERNEL);
	if (!vma_list)
		umem->hugetlb = 0;

	npages = PAGE_ALIGN(size + umem->offset) >> PAGE_SHIFT;

	down_write(&current->mm->mmap_sem);

	locked     = npages + current->mm->locked_vm;
	lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;

	if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
		ret = -ENOMEM;
		goto out;
	}

	cur_base = addr & PAGE_MASK;

	ret = 0;

	while (npages) {
		ret = get_user_pages(current, current->mm, cur_base,
				     min_t(unsigned long, npages,
					   PAGE_SIZE / sizeof (struct page *)),
				     1, !umem->writable, page_list, vma_list);

		if (ret < 0)
			goto out;

		cur_base += ret * PAGE_SIZE;
		npages   -= ret;

		off = 0;

		while (ret) {
			chunk = kmalloc(sizeof *chunk + sizeof (struct scatterlist) *
					min_t(int, ret, IB_UMEM_MAX_PAGE_CHUNK),
					GFP_KERNEL);
			if (!chunk) {
				ret = -ENOMEM;
				goto out;
			}

			chunk->attrs = attrs;
			chunk->nents = min_t(int, ret, IB_UMEM_MAX_PAGE_CHUNK);
			sg_init_table(chunk->page_list, chunk->nents);
			for (i = 0; i < chunk->nents; ++i) {
				if (vma_list &&
				    !is_vm_hugetlb_page(vma_list[i + off]))
					umem->hugetlb = 0;
				sg_set_page(&chunk->page_list[i], page_list[i + off], PAGE_SIZE, 0);
			}

			chunk->nmap = ib_dma_map_sg_attrs(context->device,
							  &chunk->page_list[0],
							  chunk->nents,
							  DMA_BIDIRECTIONAL,
							  &attrs);
			if (chunk->nmap <= 0) {
				for (i = 0; i < chunk->nents; ++i)
					put_page(sg_page(&chunk->page_list[i]));
				kfree(chunk);

				ret = -ENOMEM;
				goto out;
			}

			ret -= chunk->nents;
			off += chunk->nents;
			list_add_tail(&chunk->list, &umem->chunk_list);
		}

		ret = 0;
	}

out:
	if (ret < 0) {
		__ib_umem_release(context->device, umem, 0);
		kfree(umem);
	} else
		current->mm->locked_vm = locked;

	up_write(&current->mm->mmap_sem);
	if (vma_list)
		free_page((unsigned long) vma_list);
	free_page((unsigned long) page_list);

	return ret < 0 ? ERR_PTR(ret) : umem;
#else
	struct ib_umem *umem;
	struct ib_umem_chunk *chunk;
	struct proc *proc;
	pmap_t pmap;
	vm_offset_t end, last, start;
	vm_size_t npages;
	int error;
	int ents;
	int ret;
	int i;
	DEFINE_DMA_ATTRS(attrs);

	error = priv_check(curthread, PRIV_VM_MLOCK);
	if (error)
		return ERR_PTR(-error);

	last = addr + size;
	start = addr & PAGE_MASK;	  /* Uses the Linux PAGE_MASK definition. */
	end = roundup2(last, PAGE_SIZE);  /* Round up without relying on PAGE_MASK. */
	if (last < addr || end < addr)
		return ERR_PTR(-EINVAL);
	npages = atop(end - start);
	if (npages > vm_page_max_wired)
		return ERR_PTR(-ENOMEM);
	umem = kzalloc(sizeof *umem, GFP_KERNEL);
	if (!umem)
		return ERR_PTR(-ENOMEM);
	umem->context   = context;
	umem->length    = size;
	umem->offset    = addr & ~PAGE_MASK;
	umem->page_size = PAGE_SIZE;
	umem->start     = addr;
	/*
	 * We ask for writable memory if any access flags other than
	 * "remote read" are set.  "Local write" and "remote write"
	 * obviously require write access.  "Remote atomic" can do
	 * things like fetch and add, which will modify memory, and
	 * "MW bind" can change permissions by binding a window.
	 */
	umem->writable  = !!(access & ~IB_ACCESS_REMOTE_READ);
	umem->hugetlb   = 0;
	INIT_LIST_HEAD(&umem->chunk_list);

	proc = curthread->td_proc;
	PROC_LOCK(proc);
	if (ptoa(npages +
	    pmap_wired_count(vm_map_pmap(&proc->p_vmspace->vm_map))) >
	    lim_cur(proc, RLIMIT_MEMLOCK)) {
		PROC_UNLOCK(proc);
		kfree(umem);
		return ERR_PTR(-ENOMEM);
	}
	PROC_UNLOCK(proc);
	if (npages + cnt.v_wire_count > vm_page_max_wired) {
		kfree(umem);
		return ERR_PTR(-EAGAIN);
	}
	/*
	 * Wire (and fault in) the user pages.  umem->writable must be
	 * set before this point so that the wiring requests write
	 * access when it is needed.
	 */
	error = vm_map_wire(&proc->p_vmspace->vm_map, start, end,
	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES |
	    (umem->writable ? VM_MAP_WIRE_WRITE : 0));
	if (error != KERN_SUCCESS) {
		kfree(umem);
		return ERR_PTR(-ENOMEM);
	}

	pmap = vm_map_pmap(&proc->p_vmspace->vm_map);
	ret = 0;
	while (npages) {
		ents = min_t(int, npages, IB_UMEM_MAX_PAGE_CHUNK);
		chunk = kmalloc(sizeof(*chunk) +
				(sizeof(struct scatterlist) * ents),
				GFP_KERNEL);
		if (!chunk) {
			ret = -ENOMEM;
			goto out;
		}

		chunk->attrs = attrs;
		chunk->nents = ents;
		sg_init_table(&chunk->page_list[0], ents);
		for (i = 0; i < chunk->nents; ++i) {
			vm_paddr_t pa;

			pa = pmap_extract(pmap, start);
			if (pa == 0) {
				ret = -ENOMEM;
				kfree(chunk);
				goto out;
			}
			sg_set_page(&chunk->page_list[i], PHYS_TO_VM_PAGE(pa),
			    PAGE_SIZE, 0);
			npages--;
			start += PAGE_SIZE;
		}

		chunk->nmap = ib_dma_map_sg_attrs(context->device,
						  &chunk->page_list[0],
						  chunk->nents,
						  DMA_BIDIRECTIONAL,
						  &attrs);
		if (chunk->nmap != chunk->nents) {
			kfree(chunk);
			ret = -ENOMEM;
			goto out;
		}

		list_add_tail(&chunk->list, &umem->chunk_list);
	}

out:
	if (ret < 0) {
		__ib_umem_release(context->device, umem, 0);
		/* Undo the wiring done above before freeing the umem. */
		vm_map_unwire(&proc->p_vmspace->vm_map, addr & PAGE_MASK, end,
		    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
		kfree(umem);
	}

	return ret < 0 ? ERR_PTR(ret) : umem;
#endif
}
EXPORT_SYMBOL(ib_umem_get);
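
/*
 * Illustrative sketch (kept out of the build): the typical
 * reg_user_mr flow in a verbs driver built on top of ib_umem_get().
 * "struct my_mr", my_reg_user_mr() and my_dereg_mr() are hypothetical
 * driver types; only ib_umem_get(), ib_umem_page_count() and
 * ib_umem_release() are entry points of this file.
 */
#if 0
struct my_mr {
	struct ib_mr	ibmr;
	struct ib_umem *umem;
};

static struct ib_mr *my_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
				    u64 virt_addr, int access_flags,
				    struct ib_udata *udata)
{
	struct my_mr *mr;
	int err;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	/* Pin and DMA map the user buffer; no dmasync requested here. */
	mr->umem = ib_umem_get(pd->uobject->context, start, length,
			       access_flags, 0);
	if (IS_ERR(mr->umem)) {
		err = PTR_ERR(mr->umem);
		kfree(mr);
		return ERR_PTR(err);
	}

	/*
	 * ib_umem_page_count() tells the driver how many translation
	 * entries to program into the HCA for this region.
	 */
	/* ... program ib_umem_page_count(mr->umem) entries here ... */

	return &mr->ibmr;
}

/* Teardown: unpin and unmap the pages, then free the driver structure. */
static void my_dereg_mr(struct my_mr *mr)
{
	ib_umem_release(mr->umem);
	kfree(mr);
}
#endif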

#ifdef __linux__
static void ib_umem_account(struct work_struct *work)
{
	struct ib_umem *umem = container_of(work, struct ib_umem, work);

	down_write(&umem->mm->mmap_sem);
	umem->mm->locked_vm -= umem->diff;
	up_write(&umem->mm->mmap_sem);
	mmput(umem->mm);
	kfree(umem);
}
#endif

/**
 * ib_umem_release - release memory pinned with ib_umem_get
 * @umem: umem struct to release
 */
void ib_umem_release(struct ib_umem *umem)
{
#ifdef __linux__
	struct ib_ucontext *context = umem->context;
	struct mm_struct *mm;
	unsigned long diff;

	__ib_umem_release(umem->context->device, umem, 1);

	mm = get_task_mm(current);
	if (!mm) {
		kfree(umem);
		return;
	}

	diff = PAGE_ALIGN(umem->length + umem->offset) >> PAGE_SHIFT;

	/*
	 * We may be called with the mm's mmap_sem already held.  This
	 * can happen when a userspace munmap() is the call that drops
	 * the last reference to our file and calls our release
	 * method.  If there are memory regions to destroy, we'll end
	 * up here and not be able to take the mmap_sem.  In that case
	 * we defer the vm_locked accounting to the system workqueue.
	 */
	if (context->closing) {
		if (!down_write_trylock(&mm->mmap_sem)) {
			INIT_WORK(&umem->work, ib_umem_account);
			umem->mm   = mm;
			umem->diff = diff;

			schedule_work(&umem->work);
			return;
		}
	} else
		down_write(&mm->mmap_sem);

	current->mm->locked_vm -= diff;
	up_write(&mm->mmap_sem);
	mmput(mm);
#else
	vm_offset_t addr, end, last, start;
	vm_size_t size;
	int error;

	__ib_umem_release(umem->context->device, umem, 1);
	if (umem->context->closing) {
		kfree(umem);
		return;
	}
	error = priv_check(curthread, PRIV_VM_MUNLOCK);
	if (error) {
		/* Don't leak the umem if the privilege check fails. */
		kfree(umem);
		return;
	}
	addr = umem->start;
	size = umem->length;
	last = addr + size;
	start = addr & PAGE_MASK;	  /* Uses the Linux PAGE_MASK definition. */
	end = roundup2(last, PAGE_SIZE);  /* Round up without relying on PAGE_MASK. */
	vm_map_unwire(&curthread->td_proc->p_vmspace->vm_map, start, end,
	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);

#endif
	kfree(umem);
}
EXPORT_SYMBOL(ib_umem_release);

int ib_umem_page_count(struct ib_umem *umem)
{
	struct ib_umem_chunk *chunk;
	int shift;
	int i;
	int n;

	shift = ilog2(umem->page_size);

	n = 0;
	list_for_each_entry(chunk, &umem->chunk_list, list)
		for (i = 0; i < chunk->nmap; ++i)
			n += sg_dma_len(&chunk->page_list[i]) >> shift;

	return n;
}
EXPORT_SYMBOL(ib_umem_page_count);
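
/*
 * Example (illustrative): a 1 MB region pinned with 4 KB pages gives
 * shift == 12.  If the DMA mapping coalesced it into two 512 KB
 * segments, the loop above counts 2 * (524288 >> 12) = 256 pages,
 * the same total that 256 discrete 4 KB segments would yield.
 */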

/**********************************************/
/*
 * Stub functions for contiguous pages -
 * we currently do not support this feature.
 */
/**********************************************/

/**
 * ib_cmem_release_contiguous_pages - release memory allocated by
 * ib_cmem_alloc_contiguous_pages.
 * @cmem: cmem struct to release
 */
void ib_cmem_release_contiguous_pages(struct ib_cmem *cmem)
{
}
EXPORT_SYMBOL(ib_cmem_release_contiguous_pages);

/**
 * ib_cmem_alloc_contiguous_pages - allocate contiguous pages
 * @context: userspace context to allocate memory for
 * @total_size: total required size for that allocation.
 * @page_size_order: order of one contiguous page.
 */
struct ib_cmem *ib_cmem_alloc_contiguous_pages(struct ib_ucontext *context,
					       unsigned long total_size,
					       unsigned long page_size_order)
{
	return NULL;
}
EXPORT_SYMBOL(ib_cmem_alloc_contiguous_pages);

/**
 * ib_cmem_map_contiguous_pages_to_vma - map contiguous pages into VMA
 * @ib_cmem: cmem structure returned by ib_cmem_alloc_contiguous_pages
 * @vma: VMA to inject pages into.
 */
int ib_cmem_map_contiguous_pages_to_vma(struct ib_cmem *ib_cmem,
					struct vm_area_struct *vma)
{
	return 0;
}
EXPORT_SYMBOL(ib_cmem_map_contiguous_pages_to_vma);