/*
 * Copyright (c) 2005 Topspin Communications.  All rights reserved.
 * Copyright (c) 2005 Cisco Systems.  All rights reserved.
 * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <linux/mm.h>
#include <linux/dma-mapping.h>
#include <linux/sched.h>
#ifdef __linux__
#include <linux/hugetlb.h>
#endif
#include <linux/dma-attrs.h>

#include <sys/priv.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>

#include <vm/vm.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_pageout.h>

#include "uverbs.h"

static int allow_weak_ordering;
module_param(allow_weak_ordering, bool, 0444);
MODULE_PARM_DESC(allow_weak_ordering,  "Allow weak ordering for data registered memory");

#define IB_UMEM_MAX_PAGE_CHUNK						\
	((PAGE_SIZE - offsetof(struct ib_umem_chunk, page_list)) /	\
	 ((void *) &((struct ib_umem_chunk *) 0)->page_list[1] -	\
	  (void *) &((struct ib_umem_chunk *) 0)->page_list[0]))

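/*
 * Worked example of the macro above (a sketch only; the real numbers depend
 * on the architecture and on how struct ib_umem_chunk and struct scatterlist
 * are laid out in a given build).  Assuming PAGE_SIZE = 4096, a chunk header
 * that ends 64 bytes into the page, and a 32-byte scatterlist element, the
 * macro evaluates to (4096 - 64) / 32 = 126 page_list entries per chunk, so
 * the chunk header plus its scatterlist array never exceeds one page.
 */
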
#ifdef __ia64__
extern int dma_map_sg_hp_wa;

static int dma_map_sg_ia64(struct ib_device *ibdev,
			   struct scatterlist *sg,
			   int nents,
			   enum dma_data_direction dir)
{
	int i, rc, j, lents = 0;
	struct device *dev;

	if (!dma_map_sg_hp_wa)
		return ib_dma_map_sg(ibdev, sg, nents, dir);

	dev = ibdev->dma_device;
	for (i = 0; i < nents; ++i) {
		rc = dma_map_sg(dev, sg + i, 1, dir);
		if (rc <= 0) {
			for (j = 0; j < i; ++j)
				dma_unmap_sg(dev, sg + j, 1, dir);

			return 0;
		}
		lents += rc;
	}

	return lents;
}

static void dma_unmap_sg_ia64(struct ib_device *ibdev,
			      struct scatterlist *sg,
			      int nents,
			      enum dma_data_direction dir)
{
	int i;
	struct device *dev;

	if (!dma_map_sg_hp_wa)
		return ib_dma_unmap_sg(ibdev, sg, nents, dir);

	dev = ibdev->dma_device;
	for (i = 0; i < nents; ++i)
		dma_unmap_sg(dev, sg + i, 1, dir);
}

#define ib_dma_map_sg(dev, sg, nents, dir) dma_map_sg_ia64(dev, sg, nents, dir)
#define ib_dma_unmap_sg(dev, sg, nents, dir) dma_unmap_sg_ia64(dev, sg, nents, dir)

#endif

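/*
 * Tear down a pinned region: unmap each chunk's scatterlist and free the
 * chunk.  When @dirty is non-zero and the umem was registered writable, the
 * backing pages are marked dirty first so modified data is not lost.
 */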
static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)
{
#ifdef __linux__
	struct ib_umem_chunk *chunk, *tmp;
	int i;

	list_for_each_entry_safe(chunk, tmp, &umem->chunk_list, list) {
		ib_dma_unmap_sg_attrs(dev, chunk->page_list,
				      chunk->nents, DMA_BIDIRECTIONAL, &chunk->attrs);
		for (i = 0; i < chunk->nents; ++i) {
			struct page *page = sg_page(&chunk->page_list[i]);
			if (umem->writable && dirty)
				set_page_dirty_lock(page);
			put_page(page);
		}
		kfree(chunk);
	}
#else
	struct ib_umem_chunk *chunk, *tmp;
	vm_object_t object;
	int i;

	object = NULL;
	list_for_each_entry_safe(chunk, tmp, &umem->chunk_list, list) {
		ib_dma_unmap_sg_attrs(dev, chunk->page_list,
				      chunk->nents, DMA_BIDIRECTIONAL, &chunk->attrs);
		for (i = 0; i < chunk->nents; ++i) {
			struct page *page = sg_page(&chunk->page_list[i]);
			if (umem->writable && dirty) {
				if (object && object != page->object)
					VM_OBJECT_WUNLOCK(object);
				if (object != page->object) {
					object = page->object;
					VM_OBJECT_WLOCK(object);
				}
				vm_page_dirty(page);
			}
		}
		kfree(chunk);
	}
	if (object)
		VM_OBJECT_WUNLOCK(object);

#endif
}

/**
 * ib_umem_get - Pin and DMA map userspace memory.
 * @context: userspace context to pin memory for
 * @addr: userspace virtual address to start at
 * @size: length of region to pin
 * @access: IB_ACCESS_xxx flags for memory being pinned
 * @dmasync: flush in-flight DMA when the memory region is written
 */
struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
			    size_t size, int access, int dmasync)
{

	/*
	#ifdef bla
	struct ib_umem *umem;
	struct page **page_list;
	struct vm_area_struct **vma_list;
	struct ib_umem_chunk *chunk;
	unsigned long locked;
	unsigned long lock_limit;
	unsigned long cur_base;
	unsigned long npages;
	int ret;
	int off;
	int i;
	DEFINE_DMA_ATTRS(attrs);

	if (dmasync)
		dma_set_attr(DMA_ATTR_WRITE_BARRIER, &attrs);
	else if (allow_weak_ordering)
		dma_set_attr(DMA_ATTR_WEAK_ORDERING, &attrs);

	if (!can_do_mlock())
		return ERR_PTR(-EPERM);

	umem = kmalloc(sizeof *umem, GFP_KERNEL);
	if (!umem)
		return ERR_PTR(-ENOMEM);

	umem->context   = context;
	umem->length    = size;
	umem->offset    = addr & ~PAGE_MASK;
	umem->page_size = PAGE_SIZE;

	 * We ask for writable memory if any access flags other than
	 * "remote read" are set.  "Local write" and "remote write"
	 * obviously require write access.  "Remote atomic" can do
	 * things like fetch and add, which will modify memory, and
	 * "MW bind" can change permissions by binding a window.

	umem->writable  = !!(access & ~IB_ACCESS_REMOTE_READ);

	We assume the memory is from hugetlb until proved otherwise
	umem->hugetlb   = 1;

	INIT_LIST_HEAD(&umem->chunk_list);

	page_list = (struct page **) __get_free_page(GFP_KERNEL);
	if (!page_list) {
		kfree(umem);
		return ERR_PTR(-ENOMEM);
	}

	 * if we can't alloc the vma_list, it's not so bad;
	 * just assume the memory is not hugetlb memory

	vma_list = (struct vm_area_struct **) __get_free_page(GFP_KERNEL);
	if (!vma_list)
		umem->hugetlb = 0;

	npages = PAGE_ALIGN(size + umem->offset) >> PAGE_SHIFT;

	down_write(&current->mm->mmap_sem);

	locked     = npages + current->mm->locked_vm;
	lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;

	if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
		ret = -ENOMEM;
		goto out;
	}

	cur_base = addr & PAGE_MASK;

	ret = 0;

	while (npages) {
		ret = get_user_pages(current, current->mm, cur_base,
				     min_t(unsigned long, npages,
					   PAGE_SIZE / sizeof (struct page *)),
				     1, !umem->writable, page_list, vma_list);

		if (ret < 0)
			goto out;

		cur_base += ret * PAGE_SIZE;
		npages   -= ret;

		off = 0;

		while (ret) {
			chunk = kmalloc(sizeof *chunk + sizeof (struct scatterlist) *
					min_t(int, ret, IB_UMEM_MAX_PAGE_CHUNK),
					GFP_KERNEL);
			if (!chunk) {
				ret = -ENOMEM;
				goto out;
			}

			chunk->attrs = attrs;
			chunk->nents = min_t(int, ret, IB_UMEM_MAX_PAGE_CHUNK);
			sg_init_table(chunk->page_list, chunk->nents);
			for (i = 0; i < chunk->nents; ++i) {
				if (vma_list &&
				    !is_vm_hugetlb_page(vma_list[i + off]))
					umem->hugetlb = 0;
				sg_set_page(&chunk->page_list[i], page_list[i + off], PAGE_SIZE, 0);
			}

			chunk->nmap = ib_dma_map_sg_attrs(context->device,
							  &chunk->page_list[0],
							  chunk->nents,
							  DMA_BIDIRECTIONAL,
							  &attrs);
			if (chunk->nmap <= 0) {
				for (i = 0; i < chunk->nents; ++i)
					put_page(sg_page(&chunk->page_list[i]));
				kfree(chunk);

				ret = -ENOMEM;
				goto out;
			}

			ret -= chunk->nents;
			off += chunk->nents;
			list_add_tail(&chunk->list, &umem->chunk_list);
		}

		ret = 0;
	}

	out:
	if (ret < 0) {
		__ib_umem_release(context->device, umem, 0);
		kfree(umem);
	} else
		current->mm->locked_vm = locked;

	up_write(&current->mm->mmap_sem);
	if (vma_list)
		free_page((unsigned long) vma_list);
	free_page((unsigned long) page_list);

	return ret < 0 ? ERR_PTR(ret) : umem;
	#else
	*/

	struct ib_umem *umem;
	struct ib_umem_chunk *chunk;
	struct proc *proc;
	pmap_t pmap;
	vm_offset_t end, last, start;
	vm_size_t npages;
	int error;
	int ents;
	int ret;
	int i;
	DEFINE_DMA_ATTRS(attrs);

	error = priv_check(curthread, PRIV_VM_MLOCK);
	if (error)
		return ERR_PTR(-error);

	last = addr + size;
	start = addr & PAGE_MASK; /* Use the linux PAGE_MASK definition. */
	end = roundup2(last, PAGE_SIZE); /* Use PAGE_MASK safe operation. */
	if (last < addr || end < addr)
		return ERR_PTR(-EINVAL);
	npages = atop(end - start);
	if (npages > vm_page_max_wired)
		return ERR_PTR(-ENOMEM);
	umem = kzalloc(sizeof *umem, GFP_KERNEL);
	if (!umem)
		return ERR_PTR(-ENOMEM);

	umem->context   = context;
	umem->length    = size;
	umem->offset    = addr & ~PAGE_MASK;
	umem->page_size = PAGE_SIZE;
	umem->start     = addr;
	/*
	 * We ask for writable memory if any access flags other than
	 * "remote read" are set.  "Local write" and "remote write"
	 * obviously require write access.  "Remote atomic" can do
	 * things like fetch and add, which will modify memory, and
	 * "MW bind" can change permissions by binding a window.
	 */
	umem->writable  = !!(access & ~IB_ACCESS_REMOTE_READ);
	umem->hugetlb   = 0;
	INIT_LIST_HEAD(&umem->chunk_list);

	proc = curthread->td_proc;
	PROC_LOCK(proc);
	if (ptoa(npages +
	    pmap_wired_count(vm_map_pmap(&proc->p_vmspace->vm_map))) >
	    lim_cur(proc, RLIMIT_MEMLOCK)) {
		PROC_UNLOCK(proc);
		kfree(umem);
		return ERR_PTR(-ENOMEM);
	}
	PROC_UNLOCK(proc);
	if (npages + cnt.v_wire_count > vm_page_max_wired) {
		kfree(umem);
		return ERR_PTR(-EAGAIN);
	}
	/* umem->writable must be set before the pages are wired. */
	error = vm_map_wire(&proc->p_vmspace->vm_map, start, end,
	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES |
	    (umem->writable ? VM_MAP_WIRE_WRITE : 0));
	if (error != KERN_SUCCESS) {
		kfree(umem);
		return ERR_PTR(-ENOMEM);
	}

	pmap = vm_map_pmap(&proc->p_vmspace->vm_map);
	ret = 0;
	while (npages) {
		ents = min_t(int, npages, IB_UMEM_MAX_PAGE_CHUNK);
		chunk = kmalloc(sizeof(*chunk) +
		    (sizeof(struct scatterlist) * ents), GFP_KERNEL);
		if (!chunk) {
			ret = -ENOMEM;
			goto out;
		}

		chunk->attrs = attrs;
		chunk->nents = ents;
		sg_init_table(&chunk->page_list[0], ents);
		for (i = 0; i < chunk->nents; ++i) {
			vm_paddr_t pa;

			pa = pmap_extract(pmap, start);
			if (pa == 0) {
				ret = -ENOMEM;
				kfree(chunk);
				goto out;
			}
			sg_set_page(&chunk->page_list[i], PHYS_TO_VM_PAGE(pa),
			    PAGE_SIZE, 0);
			npages--;
			start += PAGE_SIZE;
		}

		chunk->nmap = ib_dma_map_sg_attrs(context->device,
		    &chunk->page_list[0], chunk->nents, DMA_BIDIRECTIONAL,
		    &attrs);
		if (chunk->nmap != chunk->nents) {
			kfree(chunk);
			ret = -ENOMEM;
			goto out;
		}

		list_add_tail(&chunk->list, &umem->chunk_list);
	}

out:
	if (ret < 0) {
		__ib_umem_release(context->device, umem, 0);
		kfree(umem);
	}

	return ret < 0 ? ERR_PTR(ret) : umem;
	/*#endif*/
}
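/*
 * Minimal usage sketch (hypothetical caller, not code from this file): a
 * verbs driver registering a user memory region would typically do something
 * along these lines, with the variable names below chosen only for
 * illustration:
 *
 *	struct ib_umem *umem;
 *	int npages;
 *
 *	umem = ib_umem_get(ucontext, start, length, access_flags, 0);
 *	if (IS_ERR(umem))
 *		return PTR_ERR(umem);
 *	npages = ib_umem_page_count(umem);
 *	... walk umem->chunk_list to build the HCA translation entries ...
 *	ib_umem_release(umem);
 */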
/*
EXPORT_SYMBOL(ib_umem_get);

#ifdef __linux__
static void ib_umem_account(struct work_struct *work)
{
	struct ib_umem *umem = container_of(work, struct ib_umem, work);

	down_write(&umem->mm->mmap_sem);
	umem->mm->locked_vm -= umem->diff;
	up_write(&umem->mm->mmap_sem);
	mmput(umem->mm);
	kfree(umem);
}
#endif

 *
 * ib_umem_release - release memory pinned with ib_umem_get
 * @umem: umem struct to release

void ib_umem_release(struct ib_umem *umem)
{
#ifdef __linux__
	struct ib_ucontext *context = umem->context;
	struct mm_struct *mm;
	unsigned long diff;

	__ib_umem_release(umem->context->device, umem, 1);

	mm = get_task_mm(current);
	if (!mm) {
		kfree(umem);
		return;
	}

	diff = PAGE_ALIGN(umem->length + umem->offset) >> PAGE_SHIFT;

	 * We may be called with the mm's mmap_sem already held.  This
	 * can happen when a userspace munmap() is the call that drops
	 * the last reference to our file and calls our release
	 * method.  If there are memory regions to destroy, we'll end
	 * up here and not be able to take the mmap_sem.  In that case
	 * we defer the vm_locked accounting to the system workqueue.

	if (context->closing) {
		if (!down_write_trylock(&mm->mmap_sem)) {
			INIT_WORK(&umem->work, ib_umem_account);
			umem->mm   = mm;
			umem->diff = diff;

			schedule_work(&umem->work);
			return;
		}
	} else
		down_write(&mm->mmap_sem);

	current->mm->locked_vm -= diff;
	up_write(&mm->mmap_sem);
	mmput(mm);
#else
	vm_offset_t addr, end, last, start;
	vm_size_t size;
	int error;

	__ib_umem_release(umem->context->device, umem, 1);
	if (umem->context->closing) {
		kfree(umem);
		return;
	}
	error = priv_check(curthread, PRIV_VM_MUNLOCK);
	if (error)
		return;
	addr = umem->start;
	size = umem->length;
	last = addr + size;
	start = addr & PAGE_MASK;  Use the linux PAGE_MASK definition.
	end = roundup2(last, PAGE_SIZE);  Use PAGE_MASK safe operation.
	vm_map_unwire(&curthread->td_proc->p_vmspace->vm_map, start, end,
	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);

#endif
	kfree(umem);
}
EXPORT_SYMBOL(ib_umem_release);

int ib_umem_page_count(struct ib_umem *umem)
{
	struct ib_umem_chunk *chunk;
	int shift;
	int i;
	int n;

	shift = ilog2(umem->page_size);

	n = 0;
	list_for_each_entry(chunk, &umem->chunk_list, list)
		for (i = 0; i < chunk->nmap; ++i)
			n += sg_dma_len(&chunk->page_list[i]) >> shift;

	return n;
}
EXPORT_SYMBOL(ib_umem_page_count);

 ********************************************
 * Stub functions for contiguous pages -
 * We currently do not support this feature
 ********************************************

 *
 * ib_cmem_release_contiguous_pages - release memory allocated by
 *                                    ib_cmem_alloc_contiguous_pages.
 * @cmem: cmem struct to release

void ib_cmem_release_contiguous_pages(struct ib_cmem *cmem)
{
}
EXPORT_SYMBOL(ib_cmem_release_contiguous_pages);

 *
 * ib_cmem_alloc_contiguous_pages - allocate contiguous pages
 * @context: userspace context to allocate memory for
 * @total_size: total required size for that allocation.
 * @page_size_order: order of one contiguous page.

struct ib_cmem *ib_cmem_alloc_contiguous_pages(struct ib_ucontext *context,
					       unsigned long total_size,
					       unsigned long page_size_order)
{
	return NULL;
}
EXPORT_SYMBOL(ib_cmem_alloc_contiguous_pages);

 *
 * ib_cmem_map_contiguous_pages_to_vma - map contiguous pages into VMA
 * @ib_cmem: cmem structure returned by ib_cmem_alloc_contiguous_pages
 * @vma: VMA to inject pages into.

int ib_cmem_map_contiguous_pages_to_vma(struct ib_cmem *ib_cmem,
					struct vm_area_struct *vma)
{
	return 0;
}
EXPORT_SYMBOL(ib_cmem_map_contiguous_pages_to_vma);
*/
