/*
 * Copyright (c) 2005 Topspin Communications.  All rights reserved.
 * Copyright (c) 2005 Cisco Systems.  All rights reserved.
 * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <linux/mm.h>
#include <linux/dma-mapping.h>
#include <linux/sched.h>
#ifdef __linux__
#include <linux/hugetlb.h>
#endif
#include <linux/dma-attrs.h>

#include <sys/priv.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>

#include <vm/vm.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_pageout.h>

#include "uverbs.h"

static int allow_weak_ordering;
module_param(allow_weak_ordering, bool, 0444);
MODULE_PARM_DESC(allow_weak_ordering, "Allow weak ordering for data registered memory");

#define IB_UMEM_MAX_PAGE_CHUNK                                          \
        ((PAGE_SIZE - offsetof(struct ib_umem_chunk, page_list)) /      \
         ((void *) &((struct ib_umem_chunk *) 0)->page_list[1] -        \
          (void *) &((struct ib_umem_chunk *) 0)->page_list[0]))
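
/*
 * IB_UMEM_MAX_PAGE_CHUNK is the number of scatterlist entries that fit in
 * one page after the struct ib_umem_chunk header: the divisor is the size
 * of a single page_list element, taken as the distance between entries 1
 * and 0.  As a rough illustration only (the real sizes depend on the
 * architecture and kernel configuration), with 4096-byte pages, a 64-byte
 * chunk header and 32-byte scatterlist entries this gives
 * (4096 - 64) / 32 = 126 entries per chunk.
 */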

#ifdef __ia64__
extern int dma_map_sg_hp_wa;

static int dma_map_sg_ia64(struct ib_device *ibdev,
                           struct scatterlist *sg,
                           int nents,
                           enum dma_data_direction dir)
{
        int i, rc, j, lents = 0;
        struct device *dev;

        if (!dma_map_sg_hp_wa)
                return ib_dma_map_sg(ibdev, sg, nents, dir);

        dev = ibdev->dma_device;
        for (i = 0; i < nents; ++i) {
                rc = dma_map_sg(dev, sg + i, 1, dir);
                if (rc <= 0) {
                        for (j = 0; j < i; ++j)
                                dma_unmap_sg(dev, sg + j, 1, dir);

                        return 0;
                }
                lents += rc;
        }

        return lents;
}

static void dma_unmap_sg_ia64(struct ib_device *ibdev,
                              struct scatterlist *sg,
                              int nents,
                              enum dma_data_direction dir)
{
        int i;
        struct device *dev;

        if (!dma_map_sg_hp_wa)
                return ib_dma_unmap_sg(ibdev, sg, nents, dir);

        dev = ibdev->dma_device;
        for (i = 0; i < nents; ++i)
                dma_unmap_sg(dev, sg + i, 1, dir);
}

#define ib_dma_map_sg(dev, sg, nents, dir) dma_map_sg_ia64(dev, sg, nents, dir)
#define ib_dma_unmap_sg(dev, sg, nents, dir) dma_unmap_sg_ia64(dev, sg, nents, dir)

#endif
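
/*
 * __ib_umem_release - tear down the DMA mappings built for a umem.  Each
 * chunk's scatterlist is unmapped and the chunk freed.  When the umem is
 * writable and @dirty is set, the Linux path marks each page dirty and
 * drops its reference; the FreeBSD path marks the pages dirty under the
 * owning VM object's write lock, batching lock acquisition per object.
 */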
static void __ib_umem_release(struct ib_device *dev,
                              struct ib_umem *umem, int dirty)
{
#ifdef __linux__
        struct ib_umem_chunk *chunk, *tmp;
        int i;

        list_for_each_entry_safe(chunk, tmp, &umem->chunk_list, list) {
                ib_dma_unmap_sg_attrs(dev, chunk->page_list,
                                      chunk->nents, DMA_BIDIRECTIONAL, &chunk->attrs);
                for (i = 0; i < chunk->nents; ++i) {
                        struct page *page = sg_page(&chunk->page_list[i]);
                        if (umem->writable && dirty)
                                set_page_dirty_lock(page);
                        put_page(page);
                }
                kfree(chunk);
        }
#else
        struct ib_umem_chunk *chunk, *tmp;
        vm_object_t object;
        int i;

        object = NULL;
        list_for_each_entry_safe(chunk, tmp, &umem->chunk_list, list) {
                ib_dma_unmap_sg_attrs(dev, chunk->page_list,
                                      chunk->nents, DMA_BIDIRECTIONAL, &chunk->attrs);
                for (i = 0; i < chunk->nents; ++i) {
                        struct page *page = sg_page(&chunk->page_list[i]);
                        if (umem->writable && dirty) {
                                if (object && object != page->object)
                                        VM_OBJECT_WUNLOCK(object);
                                if (object != page->object) {
                                        object = page->object;
                                        VM_OBJECT_WLOCK(object);
                                }
                                vm_page_dirty(page);
                        }
                }
                kfree(chunk);
        }
        if (object)
                VM_OBJECT_WUNLOCK(object);
#endif
}

/**
 * ib_umem_get - Pin and DMA map userspace memory.
 * @context: userspace context to pin memory for
 * @addr: userspace virtual address to start at
 * @size: length of region to pin
 * @access: IB_ACCESS_xxx flags for memory being pinned
 * @dmasync: flush in-flight DMA when the memory region is written
 */
struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
                            size_t size, int access, int dmasync)
{
        /*
#ifdef bla
        struct ib_umem *umem;
        struct page **page_list;
        struct vm_area_struct **vma_list;
        struct ib_umem_chunk *chunk;
        unsigned long locked;
        unsigned long lock_limit;
        unsigned long cur_base;
        unsigned long npages;
        int ret;
        int off;
        int i;
        DEFINE_DMA_ATTRS(attrs);

        if (dmasync)
                dma_set_attr(DMA_ATTR_WRITE_BARRIER, &attrs);
        else if (allow_weak_ordering)
                dma_set_attr(DMA_ATTR_WEAK_ORDERING, &attrs);

        if (!can_do_mlock())
                return ERR_PTR(-EPERM);

        umem = kmalloc(sizeof *umem, GFP_KERNEL);
        if (!umem)
                return ERR_PTR(-ENOMEM);

        umem->context   = context;
        umem->length    = size;
        umem->offset    = addr & ~PAGE_MASK;
        umem->page_size = PAGE_SIZE;

         * We ask for writable memory if any access flags other than
         * "remote read" are set.  "Local write" and "remote write"
         * obviously require write access.  "Remote atomic" can do
         * things like fetch and add, which will modify memory, and
         * "MW bind" can change permissions by binding a window.

        umem->writable = !!(access & ~IB_ACCESS_REMOTE_READ);

        We assume the memory is from hugetlb until proved otherwise
        umem->hugetlb = 1;

        INIT_LIST_HEAD(&umem->chunk_list);

        page_list = (struct page **) __get_free_page(GFP_KERNEL);
        if (!page_list) {
                kfree(umem);
                return ERR_PTR(-ENOMEM);
        }

         * if we can't alloc the vma_list, it's not so bad;
         * just assume the memory is not hugetlb memory

        vma_list = (struct vm_area_struct **) __get_free_page(GFP_KERNEL);
        if (!vma_list)
                umem->hugetlb = 0;

        npages = PAGE_ALIGN(size + umem->offset) >> PAGE_SHIFT;

        down_write(&current->mm->mmap_sem);

        locked     = npages + current->mm->locked_vm;
        lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;

        if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
                ret = -ENOMEM;
                goto out;
        }

        cur_base = addr & PAGE_MASK;

        ret = 0;

        while (npages) {
                ret = get_user_pages(current, current->mm, cur_base,
                                     min_t(unsigned long, npages,
                                           PAGE_SIZE / sizeof (struct page *)),
                                     1, !umem->writable, page_list, vma_list);

                if (ret < 0)
                        goto out;

                cur_base += ret * PAGE_SIZE;
                npages   -= ret;

                off = 0;

                while (ret) {
                        chunk = kmalloc(sizeof *chunk + sizeof (struct scatterlist) *
                                        min_t(int, ret, IB_UMEM_MAX_PAGE_CHUNK),
                                        GFP_KERNEL);
                        if (!chunk) {
                                ret = -ENOMEM;
                                goto out;
                        }

                        chunk->attrs = attrs;
                        chunk->nents = min_t(int, ret, IB_UMEM_MAX_PAGE_CHUNK);
                        sg_init_table(chunk->page_list, chunk->nents);
                        for (i = 0; i < chunk->nents; ++i) {
                                if (vma_list &&
                                    !is_vm_hugetlb_page(vma_list[i + off]))
                                        umem->hugetlb = 0;
                                sg_set_page(&chunk->page_list[i], page_list[i + off], PAGE_SIZE, 0);
                        }

                        chunk->nmap = ib_dma_map_sg_attrs(context->device,
                                                          &chunk->page_list[0],
                                                          chunk->nents,
                                                          DMA_BIDIRECTIONAL,
                                                          &attrs);
                        if (chunk->nmap <= 0) {
                                for (i = 0; i < chunk->nents; ++i)
                                        put_page(sg_page(&chunk->page_list[i]));
                                kfree(chunk);

                                ret = -ENOMEM;
                                goto out;
                        }

                        ret -= chunk->nents;
                        off += chunk->nents;
                        list_add_tail(&chunk->list, &umem->chunk_list);
                }

                ret = 0;
        }

out:
        if (ret < 0) {
                __ib_umem_release(context->device, umem, 0);
                kfree(umem);
        } else
                current->mm->locked_vm = locked;

        up_write(&current->mm->mmap_sem);
        if (vma_list)
                free_page((unsigned long) vma_list);
        free_page((unsigned long) page_list);

        return ret < 0 ? ERR_PTR(ret) : umem;
#else
        */

        struct ib_umem *umem;
        struct ib_umem_chunk *chunk;
        struct proc *proc;
        pmap_t pmap;
        vm_offset_t end, last, start;
        vm_size_t npages;
        int error;
        int ents;
        int ret;
        int i;
        DEFINE_DMA_ATTRS(attrs);

        error = priv_check(curthread, PRIV_VM_MLOCK);
        if (error)
                return ERR_PTR(-error);
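
        /*
         * The region [addr, addr + size) is extended to whole pages:
         * start is addr rounded down to a page boundary, end is the last
         * byte rounded up, and npages is the page count in between.  As a
         * rough example (assuming 4 KiB pages), addr = 0x12345 and
         * size = 0x2000 give start = 0x12000, end = 0x15000 and npages = 3.
         * The overflow check below rejects regions that wrap the address
         * space.
         */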
        last = addr + size;
        start = addr & PAGE_MASK;               /* Use the linux PAGE_MASK definition. */
        end = roundup2(last, PAGE_SIZE);        /* Use PAGE_MASK safe operation. */
        if (last < addr || end < addr)
                return ERR_PTR(-EINVAL);
        npages = atop(end - start);
        if (npages > vm_page_max_wired)
                return ERR_PTR(-ENOMEM);
        umem = kzalloc(sizeof *umem, GFP_KERNEL);
        if (!umem)
                return ERR_PTR(-ENOMEM);

        /*
         * We ask for writable memory if any access flags other than
         * "remote read" are set.  "Local write" and "remote write"
         * obviously require write access.  "Remote atomic" can do
         * things like fetch and add, which will modify memory, and
         * "MW bind" can change permissions by binding a window.
         * Compute this before wiring so that VM_MAP_WIRE_WRITE is
         * requested whenever the region must be writable.
         */
        umem->writable = !!(access & ~IB_ACCESS_REMOTE_READ);

        proc = curthread->td_proc;
        PROC_LOCK(proc);
        if (ptoa(npages +
            pmap_wired_count(vm_map_pmap(&proc->p_vmspace->vm_map))) >
            lim_cur(proc, RLIMIT_MEMLOCK)) {
                PROC_UNLOCK(proc);
                kfree(umem);
                return ERR_PTR(-ENOMEM);
        }
        PROC_UNLOCK(proc);
        if (npages + cnt.v_wire_count > vm_page_max_wired) {
                kfree(umem);
                return ERR_PTR(-EAGAIN);
        }
        error = vm_map_wire(&proc->p_vmspace->vm_map, start, end,
            VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES |
            (umem->writable ? VM_MAP_WIRE_WRITE : 0));
        if (error != KERN_SUCCESS) {
                kfree(umem);
                return ERR_PTR(-ENOMEM);
        }

        umem->context   = context;
        umem->length    = size;
        umem->offset    = addr & ~PAGE_MASK;
        umem->page_size = PAGE_SIZE;
        umem->start     = addr;
        umem->hugetlb   = 0;

        INIT_LIST_HEAD(&umem->chunk_list);

        pmap = vm_map_pmap(&proc->p_vmspace->vm_map);
        ret = 0;
        while (npages) {
                ents = min_t(int, npages, IB_UMEM_MAX_PAGE_CHUNK);
                chunk = kmalloc(sizeof(*chunk) +
                    (sizeof(struct scatterlist) * ents), GFP_KERNEL);
                if (!chunk) {
                        ret = -ENOMEM;
                        goto out;
                }

                chunk->attrs = attrs;
                chunk->nents = ents;
                sg_init_table(&chunk->page_list[0], ents);
                for (i = 0; i < chunk->nents; ++i) {
                        vm_paddr_t pa;

                        pa = pmap_extract(pmap, start);
                        if (pa == 0) {
                                ret = -ENOMEM;
                                kfree(chunk);
                                goto out;
                        }
                        sg_set_page(&chunk->page_list[i], PHYS_TO_VM_PAGE(pa),
                            PAGE_SIZE, 0);
                        npages--;
                        start += PAGE_SIZE;
                }

                chunk->nmap = ib_dma_map_sg_attrs(context->device,
                    &chunk->page_list[0], chunk->nents, DMA_BIDIRECTIONAL,
                    &attrs);
                if (chunk->nmap != chunk->nents) {
                        kfree(chunk);
                        ret = -ENOMEM;
                        goto out;
                }

                list_add_tail(&chunk->list, &umem->chunk_list);
        }

out:
        if (ret < 0) {
                __ib_umem_release(context->device, umem, 0);
                kfree(umem);
        }

        return ret < 0 ? ERR_PTR(ret) : umem;
        /*#endif*/
}
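
/*
 * Usage sketch, for illustration only: a verbs driver registering a user
 * memory region would pin it, size its translation table and later drop
 * the pin roughly as follows.  The variable names and surrounding driver
 * code are hypothetical; the calls are the usual ib_umem.h interface:
 *
 *      umem = ib_umem_get(ucontext, start_va, length,
 *          IB_ACCESS_LOCAL_WRITE, 0);
 *      if (IS_ERR(umem))
 *              return PTR_ERR(umem);
 *      npages = ib_umem_page_count(umem);
 *      ... program the HCA with the pages in umem->chunk_list ...
 *      ib_umem_release(umem);
 */
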
/*
EXPORT_SYMBOL(ib_umem_get);

#ifdef __linux__
static void ib_umem_account(struct work_struct *work)
{
        struct ib_umem *umem = container_of(work, struct ib_umem, work);

        down_write(&umem->mm->mmap_sem);
        umem->mm->locked_vm -= umem->diff;
        up_write(&umem->mm->mmap_sem);
        mmput(umem->mm);
        kfree(umem);
}
#endif

 *
 * ib_umem_release - release memory pinned with ib_umem_get
 * @umem: umem struct to release

void ib_umem_release(struct ib_umem *umem)
{
#ifdef __linux__
        struct ib_ucontext *context = umem->context;
        struct mm_struct *mm;
        unsigned long diff;

        __ib_umem_release(umem->context->device, umem, 1);

        mm = get_task_mm(current);
        if (!mm) {
                kfree(umem);
                return;
        }

        diff = PAGE_ALIGN(umem->length + umem->offset) >> PAGE_SHIFT;

         * We may be called with the mm's mmap_sem already held.  This
         * can happen when a userspace munmap() is the call that drops
         * the last reference to our file and calls our release
         * method.  If there are memory regions to destroy, we'll end
         * up here and not be able to take the mmap_sem.  In that case
         * we defer the vm_locked accounting to the system workqueue.

        if (context->closing) {
                if (!down_write_trylock(&mm->mmap_sem)) {
                        INIT_WORK(&umem->work, ib_umem_account);
                        umem->mm   = mm;
                        umem->diff = diff;

                        schedule_work(&umem->work);
                        return;
                }
        } else
                down_write(&mm->mmap_sem);

        current->mm->locked_vm -= diff;
        up_write(&mm->mmap_sem);
        mmput(mm);
#else
        vm_offset_t addr, end, last, start;
        vm_size_t size;
        int error;

        __ib_umem_release(umem->context->device, umem, 1);
        if (umem->context->closing) {
                kfree(umem);
                return;
        }
        error = priv_check(curthread, PRIV_VM_MUNLOCK);
        if (error)
                return;
        addr = umem->start;
        size = umem->length;
        last = addr + size;
        start = addr & PAGE_MASK;               Use the linux PAGE_MASK definition.
        end = roundup2(last, PAGE_SIZE);        Use PAGE_MASK safe operation.
        vm_map_unwire(&curthread->td_proc->p_vmspace->vm_map, start, end,
            VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);

#endif
        kfree(umem);
}
EXPORT_SYMBOL(ib_umem_release);

int ib_umem_page_count(struct ib_umem *umem)
{
        struct ib_umem_chunk *chunk;
        int shift;
        int i;
        int n;

        shift = ilog2(umem->page_size);

        n = 0;
        list_for_each_entry(chunk, &umem->chunk_list, list)
                for (i = 0; i < chunk->nmap; ++i)
                        n += sg_dma_len(&chunk->page_list[i]) >> shift;

        return n;
}
EXPORT_SYMBOL(ib_umem_page_count);

 ********************************************

 * Stub functions for contiguous pages -
 * We currently do not support this feature

 ********************************************

 *
 * ib_cmem_release_contiguous_pages - release memory allocated by
 * ib_cmem_alloc_contiguous_pages.
 * @cmem: cmem struct to release

void ib_cmem_release_contiguous_pages(struct ib_cmem *cmem)
{
}
EXPORT_SYMBOL(ib_cmem_release_contiguous_pages);

 *
 * ib_cmem_alloc_contiguous_pages - allocate contiguous pages
 * @context: userspace context to allocate memory for
 * @total_size: total required size for that allocation.
 * @page_size_order: order of one contiguous page.

struct ib_cmem *ib_cmem_alloc_contiguous_pages(struct ib_ucontext *context,
                                               unsigned long total_size,
                                               unsigned long page_size_order)
{
        return NULL;
}
EXPORT_SYMBOL(ib_cmem_alloc_contiguous_pages);

 *
 * ib_cmem_map_contiguous_pages_to_vma - map contiguous pages into VMA
 * @ib_cmem: cmem structure returned by ib_cmem_alloc_contiguous_pages
 * @vma: VMA to inject pages into.

int ib_cmem_map_contiguous_pages_to_vma(struct ib_cmem *ib_cmem,
                                        struct vm_area_struct *vma)
{
        return 0;
}
EXPORT_SYMBOL(ib_cmem_map_contiguous_pages_to_vma);
*/