ib_umem_odp.c revision 331769
/*
 * Copyright (c) 2014 Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <linux/types.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

#include <rdma/ib_verbs.h>
#include <rdma/ib_umem.h>
#include <rdma/ib_umem_odp.h>

static void ib_umem_notifier_start_account(struct ib_umem *item)
{
	mutex_lock(&item->odp_data->umem_mutex);

	/* Only update private counters for this umem if it has them.
	 * Otherwise skip it. All page faults will be delayed for this umem. */
	if (item->odp_data->mn_counters_active) {
		int notifiers_count = item->odp_data->notifiers_count++;

		if (notifiers_count == 0)
			/* Initialize the completion object for waiting on
			 * notifiers. Since notifier_count is zero, no one
			 * should be waiting right now. */
			reinit_completion(&item->odp_data->notifier_completion);
	}
	mutex_unlock(&item->odp_data->umem_mutex);
}

static void ib_umem_notifier_end_account(struct ib_umem *item)
{
	mutex_lock(&item->odp_data->umem_mutex);

	/* Only update private counters for this umem if it has them.
	 * Otherwise skip it. All page faults will be delayed for this umem. */
	if (item->odp_data->mn_counters_active) {
		/*
		 * Incrementing the sequence number tells the QP page-fault
		 * handler that the page it is about to map into the spte
		 * could have been freed.
		 */
		++item->odp_data->notifiers_seq;
		if (--item->odp_data->notifiers_count == 0)
			complete_all(&item->odp_data->notifier_completion);
	}
	mutex_unlock(&item->odp_data->umem_mutex);
}

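/*
 * Together, ib_umem_notifier_start_account() and _end_account() implement
 * the usual MMU notifier retry protocol for a single umem: a page-fault
 * handler samples notifiers_seq before pinning pages and backs off with
 * -EAGAIN (see ib_umem_mmu_notifier_retry() in ib_umem_odp.h) if an
 * invalidation is in flight (notifiers_count > 0) or has run since the
 * sample (notifiers_seq changed).  notifier_completion lets the fault path
 * sleep until the invalidation is over.
 */
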
/* Account for a new mmu notifier in an ib_ucontext. */
static void ib_ucontext_notifier_start_account(struct ib_ucontext *context)
{
	atomic_inc(&context->notifier_count);
}

/* Account for a terminating mmu notifier in an ib_ucontext.
 *
 * Must be called with the ib_ucontext->umem_rwsem semaphore unlocked, since
 * the function takes the semaphore itself. */
static void ib_ucontext_notifier_end_account(struct ib_ucontext *context)
{
	int zero_notifiers = atomic_dec_and_test(&context->notifier_count);

	if (zero_notifiers &&
	    !list_empty(&context->no_private_counters)) {
		/* No currently running mmu notifiers. Now is the chance to
		 * add private accounting to all previously added umems. */
		struct ib_umem_odp *odp_data, *next;

		/* Prevent concurrent mmu notifiers from working on the
		 * no_private_counters list. */
		down_write(&context->umem_rwsem);

		/* Read the notifier_count again, with the umem_rwsem
		 * semaphore taken for write. */
		if (!atomic_read(&context->notifier_count)) {
			list_for_each_entry_safe(odp_data, next,
						 &context->no_private_counters,
						 no_private_counters) {
				mutex_lock(&odp_data->umem_mutex);
				odp_data->mn_counters_active = true;
				list_del(&odp_data->no_private_counters);
				complete_all(&odp_data->notifier_completion);
				mutex_unlock(&odp_data->umem_mutex);
			}
		}

		up_write(&context->umem_rwsem);
	}
}

static int ib_umem_notifier_release_trampoline(struct ib_umem *item, u64 start,
					       u64 end, void *cookie) {
	/*
	 * Increase the number of notifiers running, to
	 * prevent any further fault handling on this MR.
	 */
	ib_umem_notifier_start_account(item);
	item->odp_data->dying = 1;
	/* Make sure that the fact the umem is dying is visible before we
	 * release all pending page faults. */
	smp_wmb();
	complete_all(&item->odp_data->notifier_completion);
	item->context->invalidate_range(item, ib_umem_start(item),
					ib_umem_end(item));
	return 0;
}

static void ib_umem_notifier_release(struct mmu_notifier *mn,
				     struct mm_struct *mm)
{
	struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);

	if (!context->invalidate_range)
		return;

	ib_ucontext_notifier_start_account(context);
	down_read(&context->umem_rwsem);
	rbt_ib_umem_for_each_in_range(&context->umem_tree, 0,
				      ULLONG_MAX,
				      ib_umem_notifier_release_trampoline,
				      NULL);
	up_read(&context->umem_rwsem);
}

static int invalidate_page_trampoline(struct ib_umem *item, u64 start,
				      u64 end, void *cookie)
{
	ib_umem_notifier_start_account(item);
	item->context->invalidate_range(item, start, start + PAGE_SIZE);
	ib_umem_notifier_end_account(item);
	return 0;
}

static void ib_umem_notifier_invalidate_page(struct mmu_notifier *mn,
					     struct mm_struct *mm,
					     unsigned long address)
{
	struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);

	if (!context->invalidate_range)
		return;

	ib_ucontext_notifier_start_account(context);
	down_read(&context->umem_rwsem);
	rbt_ib_umem_for_each_in_range(&context->umem_tree, address,
				      address + PAGE_SIZE,
				      invalidate_page_trampoline, NULL);
	up_read(&context->umem_rwsem);
	ib_ucontext_notifier_end_account(context);
}

static int invalidate_range_start_trampoline(struct ib_umem *item, u64 start,
					     u64 end, void *cookie)
{
	ib_umem_notifier_start_account(item);
	item->context->invalidate_range(item, start, end);
	return 0;
}

static void ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn,
						    struct mm_struct *mm,
						    unsigned long start,
						    unsigned long end)
{
	struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);

	if (!context->invalidate_range)
		return;

	ib_ucontext_notifier_start_account(context);
	down_read(&context->umem_rwsem);
	rbt_ib_umem_for_each_in_range(&context->umem_tree, start,
				      end,
				      invalidate_range_start_trampoline, NULL);
	up_read(&context->umem_rwsem);
}

static int invalidate_range_end_trampoline(struct ib_umem *item, u64 start,
					   u64 end, void *cookie)
{
	ib_umem_notifier_end_account(item);
	return 0;
}

static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn,
						  struct mm_struct *mm,
						  unsigned long start,
						  unsigned long end)
{
	struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);

	if (!context->invalidate_range)
		return;

	down_read(&context->umem_rwsem);
	rbt_ib_umem_for_each_in_range(&context->umem_tree, start,
				      end,
				      invalidate_range_end_trampoline, NULL);
	up_read(&context->umem_rwsem);
	ib_ucontext_notifier_end_account(context);
}

static const struct mmu_notifier_ops ib_umem_notifiers = {
	.release                    = ib_umem_notifier_release,
	.invalidate_page            = ib_umem_notifier_invalidate_page,
	.invalidate_range_start     = ib_umem_notifier_invalidate_range_start,
	.invalidate_range_end       = ib_umem_notifier_invalidate_range_end,
};

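/*
 * These callbacks are attached per ib_ucontext: ib_umem_odp_get() below
 * registers &context->mn via mmu_notifier_register() when the first ODP MR
 * of the context is created.  invalidate_range_start/_end bracket an
 * invalidation, so a device page fault racing with it sees
 * notifiers_count > 0 (or, afterwards, an advanced notifiers_seq) and backs
 * off with -EAGAIN; _end also wakes waiters through notifier_completion.
 */
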
int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem)
{
	int ret_val;
	pid_t our_pid;
	struct mm_struct *mm = get_task_mm(current);

	if (!mm)
		return -EINVAL;

	/* Prevent creating ODP MRs in child processes */
	rcu_read_lock();
	our_pid = get_pid(task_pid_group_leader(current));
	rcu_read_unlock();
	put_pid(our_pid);
	if (context->tgid != our_pid) {
		ret_val = -EINVAL;
		goto out_mm;
	}

	umem->odp_data = kzalloc(sizeof(*umem->odp_data), GFP_KERNEL);
	if (!umem->odp_data) {
		ret_val = -ENOMEM;
		goto out_mm;
	}
	umem->odp_data->umem = umem;

	mutex_init(&umem->odp_data->umem_mutex);

	init_completion(&umem->odp_data->notifier_completion);

	umem->odp_data->page_list = vzalloc(ib_umem_num_pages(umem) *
					    sizeof(*umem->odp_data->page_list));
	if (!umem->odp_data->page_list) {
		ret_val = -ENOMEM;
		goto out_odp_data;
	}

	umem->odp_data->dma_list = vzalloc(ib_umem_num_pages(umem) *
					  sizeof(*umem->odp_data->dma_list));
	if (!umem->odp_data->dma_list) {
		ret_val = -ENOMEM;
		goto out_page_list;
	}

	/*
	 * When using MMU notifiers, we will get a
	 * notification before the "current" task (and MM) is
	 * destroyed. We use the umem_rwsem semaphore to synchronize.
	 */
	down_write(&context->umem_rwsem);
	context->odp_mrs_count++;
	if (likely(ib_umem_start(umem) != ib_umem_end(umem)))
		rbt_ib_umem_insert(&umem->odp_data->interval_tree,
				   &context->umem_tree);
	if (likely(!atomic_read(&context->notifier_count)) ||
	    context->odp_mrs_count == 1)
		umem->odp_data->mn_counters_active = true;
	else
		list_add(&umem->odp_data->no_private_counters,
			 &context->no_private_counters);
	downgrade_write(&context->umem_rwsem);

	if (context->odp_mrs_count == 1) {
		/*
		 * Note that at this point, no MMU notifier is running
		 * for this context!
		 */
		atomic_set(&context->notifier_count, 0);
		INIT_HLIST_NODE(&context->mn.hlist);
		context->mn.ops = &ib_umem_notifiers;
		/*
		 * Lockdep reports a false positive for mmap_sem vs.
		 * umem_rwsem here, because it does not model
		 * downgrade_write correctly.
		 */
		ret_val = mmu_notifier_register(&context->mn, mm);
		if (ret_val) {
			pr_err("Failed to register mmu_notifier %d\n", ret_val);
			ret_val = -EBUSY;
			goto out_mutex;
		}
	}

	up_read(&context->umem_rwsem);

	/*
	 * Note that doing an mmput can cause a notifier for the relevant mm.
	 * If the notifier is called while we hold the umem_rwsem, this will
	 * cause a deadlock. Therefore, we release the reference only after we
	 * have released the semaphore.
	 */
	mmput(mm);
	return 0;

out_mutex:
	up_read(&context->umem_rwsem);
	vfree(umem->odp_data->dma_list);
out_page_list:
	vfree(umem->odp_data->page_list);
out_odp_data:
	kfree(umem->odp_data);
out_mm:
	mmput(mm);
	return ret_val;
}

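/*
 * Callers do not normally invoke ib_umem_odp_get() directly; it is expected
 * to be called from ib_umem_get() when an MR is registered with
 * IB_ACCESS_ON_DEMAND.  A rough sketch of that call site (details vary
 * between kernel versions, so treat this as illustrative only):
 *
 *	if (access & IB_ACCESS_ON_DEMAND) {
 *		ret = ib_umem_odp_get(context, umem);
 *		if (ret) {
 *			kfree(umem);
 *			return ERR_PTR(ret);
 *		}
 *		return umem;
 *	}
 */
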
void ib_umem_odp_release(struct ib_umem *umem)
{
	struct ib_ucontext *context = umem->context;

	/*
	 * Ensure that no more pages are mapped in the umem.
	 *
	 * It is the driver's responsibility to ensure, before calling us,
	 * that the hardware will not attempt to access the MR any more.
	 */
	ib_umem_odp_unmap_dma_pages(umem, ib_umem_start(umem),
				    ib_umem_end(umem));

	down_write(&context->umem_rwsem);
	if (likely(ib_umem_start(umem) != ib_umem_end(umem)))
		rbt_ib_umem_remove(&umem->odp_data->interval_tree,
				   &context->umem_tree);
	context->odp_mrs_count--;
	if (!umem->odp_data->mn_counters_active) {
		list_del(&umem->odp_data->no_private_counters);
		complete_all(&umem->odp_data->notifier_completion);
	}

	/*
	 * Downgrade the lock to a read lock. This ensures that the notifiers
	 * (which take the semaphore for reading) will be able to finish, and
	 * we will be able to eventually obtain the mmu notifiers SRCU. Note
	 * that since we are doing it atomically, no other user could register
	 * and unregister while we do the check.
	 */
	downgrade_write(&context->umem_rwsem);
	if (!context->odp_mrs_count) {
		struct task_struct *owning_process = NULL;
		struct mm_struct *owning_mm        = NULL;

		owning_process = get_pid_task(context->tgid,
					      PIDTYPE_PID);
		if (owning_process == NULL)
			/*
			 * The process is already dead; the notifier was
			 * already removed.
			 */
			goto out;

		owning_mm = get_task_mm(owning_process);
		if (owning_mm == NULL)
			/*
			 * The process' mm is already dead; the notifier was
			 * already removed.
			 */
			goto out_put_task;
		mmu_notifier_unregister(&context->mn, owning_mm);

		mmput(owning_mm);

out_put_task:
		put_task_struct(owning_process);
	}
out:
	up_read(&context->umem_rwsem);

	vfree(umem->odp_data->dma_list);
	vfree(umem->odp_data->page_list);
	kfree(umem->odp_data);
	kfree(umem);
}

/*
 * Map for DMA and insert a single page into the on-demand paging page tables.
 *
 * @umem: the umem to insert the page into.
 * @page_index: index in the umem to add the page to.
 * @page: the page struct to map and add.
 * @access_mask: access permissions needed for this page.
 * @current_seq: sequence number for synchronization with invalidations.
 *               The sequence number is taken from
 *               umem->odp_data->notifiers_seq.
 *
 * The function returns -EFAULT if the DMA mapping operation fails. It returns
 * -EAGAIN if a concurrent invalidation prevents us from updating the page.
 *
 * The page is released via put_page even if the operation failed. For
 * on-demand pinning, the page is released whenever it isn't stored in the
 * umem.
 */
static int ib_umem_odp_map_dma_single_page(
		struct ib_umem *umem,
		int page_index,
		u64 base_virt_addr,
		struct page *page,
		u64 access_mask,
		unsigned long current_seq)
{
	struct ib_device *dev = umem->context->device;
	dma_addr_t dma_addr;
	int stored_page = 0;
	int remove_existing_mapping = 0;
	int ret = 0;

	/*
	 * Note: we avoid writing if seq is different from the initial seq, to
	 * handle the case of a racing notifier. This check also allows us to
	 * bail early if we have a notifier running in parallel with us.
	 */
	if (ib_umem_mmu_notifier_retry(umem, current_seq)) {
		ret = -EAGAIN;
		goto out;
	}
	if (!(umem->odp_data->dma_list[page_index])) {
		dma_addr = ib_dma_map_page(dev,
					   page,
					   0, PAGE_SIZE,
					   DMA_BIDIRECTIONAL);
		if (ib_dma_mapping_error(dev, dma_addr)) {
			ret = -EFAULT;
			goto out;
		}
		umem->odp_data->dma_list[page_index] = dma_addr | access_mask;
		umem->odp_data->page_list[page_index] = page;
		stored_page = 1;
	} else if (umem->odp_data->page_list[page_index] == page) {
		umem->odp_data->dma_list[page_index] |= access_mask;
	} else {
		pr_err("error: got different pages in IB device and from get_user_pages. IB device page: %p, gup page: %p\n",
		       umem->odp_data->page_list[page_index], page);
		/* Better remove the mapping now, to prevent any further
		 * damage. */
		remove_existing_mapping = 1;
	}

out:
	/* On Demand Paging - avoid pinning the page */
	if (umem->context->invalidate_range || !stored_page)
		put_page(page);

	if (remove_existing_mapping && umem->context->invalidate_range) {
		invalidate_page_trampoline(
			umem,
			base_virt_addr + (page_index * PAGE_SIZE),
			base_virt_addr + ((page_index + 1) * PAGE_SIZE),
			NULL);
		ret = -EAGAIN;
	}

	return ret;
}

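/*
 * Note on the dma_list encoding used above: DMA addresses are page aligned,
 * so the low bits of each dma_list entry double as the ODP access flags
 * (ODP_READ_ALLOWED_BIT/ODP_WRITE_ALLOWED_BIT from ib_umem_odp.h).
 * Consumers must mask with ODP_DMA_ADDR_MASK to recover the bus address,
 * as ib_umem_odp_unmap_dma_pages() does below.
 */
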
/**
 * ib_umem_odp_map_dma_pages - Pin and DMA map userspace memory in an ODP MR.
 *
 * Pins the range of pages passed in the argument, and maps them to
 * DMA addresses. The DMA addresses of the mapped pages are stored in
 * umem->odp_data->dma_list.
 *
 * Returns the number of pages mapped on success, or a negative error code
 * on failure.
 * An -EAGAIN error code is returned when a concurrent mmu notifier prevents
 * the function from completing its task.
 *
 * @umem: the umem to map and pin
 * @user_virt: the address from which we need to map.
 * @bcnt: the minimal number of bytes to pin and map. The mapping might be
 *        bigger due to alignment, and may also be smaller in case of an error
 *        pinning or mapping a page. The actual number of pages mapped is
 *        returned in the return value.
 * @access_mask: bit mask of the requested access permissions for the given
 *               range.
 * @current_seq: the MMU notifiers sequence value for synchronization with
 *               invalidations. The sequence number is read from
 *               umem->odp_data->notifiers_seq before calling this function.
 */
int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt,
			      u64 access_mask, unsigned long current_seq)
{
	struct task_struct *owning_process  = NULL;
	struct mm_struct   *owning_mm       = NULL;
	struct page       **local_page_list = NULL;
	u64 off;
	int j, k, ret = 0, start_idx, npages = 0;
	u64 base_virt_addr;
	unsigned int flags = 0;

	if (access_mask == 0)
		return -EINVAL;

	if (user_virt < ib_umem_start(umem) ||
	    user_virt + bcnt > ib_umem_end(umem))
		return -EFAULT;

	local_page_list = (struct page **)__get_free_page(GFP_KERNEL);
	if (!local_page_list)
		return -ENOMEM;

	off = user_virt & (~PAGE_MASK);
	user_virt = user_virt & PAGE_MASK;
	base_virt_addr = user_virt;
	bcnt += off; /* Charge for the first page offset as well. */

	owning_process = get_pid_task(umem->context->tgid, PIDTYPE_PID);
	if (owning_process == NULL) {
		ret = -EINVAL;
		goto out_no_task;
	}

	owning_mm = get_task_mm(owning_process);
	if (owning_mm == NULL) {
		ret = -EINVAL;
		goto out_put_task;
	}

	if (access_mask & ODP_WRITE_ALLOWED_BIT)
		flags |= FOLL_WRITE;

	start_idx = (user_virt - ib_umem_start(umem)) >> PAGE_SHIFT;
	k = start_idx;

	while (bcnt > 0) {
		const size_t gup_num_pages =
			min_t(size_t, ALIGN(bcnt, PAGE_SIZE) / PAGE_SIZE,
			      PAGE_SIZE / sizeof(struct page *));

		down_read(&owning_mm->mmap_sem);
		/*
		 * Note: this might result in redundant page references. We
		 * could avoid it by checking that the dma_list entry is zero
		 * before calling get_user_pages. However, that makes the code
		 * much more complex (and doesn't gain us much performance in
		 * most use cases).
		 */
		npages = get_user_pages_remote(owning_process, owning_mm,
				user_virt, gup_num_pages,
				flags, local_page_list, NULL);
		up_read(&owning_mm->mmap_sem);

		if (npages < 0)
			break;

		bcnt -= min_t(size_t, npages << PAGE_SHIFT, bcnt);
		user_virt += npages << PAGE_SHIFT;
		mutex_lock(&umem->odp_data->umem_mutex);
		for (j = 0; j < npages; ++j) {
			ret = ib_umem_odp_map_dma_single_page(
				umem, k, base_virt_addr, local_page_list[j],
				access_mask, current_seq);
			if (ret < 0)
				break;
			k++;
		}
		mutex_unlock(&umem->odp_data->umem_mutex);

		if (ret < 0) {
			/* Release left over pages when handling errors. */
			for (++j; j < npages; ++j)
				put_page(local_page_list[j]);
			break;
		}
	}

	if (ret >= 0) {
		if (npages < 0 && k == start_idx)
			ret = npages;
		else
			ret = k - start_idx;
	}

	mmput(owning_mm);
out_put_task:
	put_task_struct(owning_process);
out_no_task:
	free_page((unsigned long)local_page_list);
	return ret;
}
EXPORT_SYMBOL(ib_umem_odp_map_dma_pages);

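/*
 * Typical use from a device page-fault handler (a minimal sketch, loosely
 * modelled on the mlx5 ODP fault path; the device page-table update helper
 * is hypothetical):
 *
 *	unsigned long current_seq;
 *	int npages;
 *
 *	current_seq = READ_ONCE(mr->umem->odp_data->notifiers_seq);
 *	// Pairs with the ++notifiers_seq in ib_umem_notifier_end_account().
 *	smp_rmb();
 *
 *	npages = ib_umem_odp_map_dma_pages(mr->umem, io_virt, bcnt,
 *					   access_mask, current_seq);
 *	if (npages < 0)
 *		return npages;	// -EAGAIN: an invalidation raced with us
 *
 *	mutex_lock(&mr->umem->odp_data->umem_mutex);
 *	if (!ib_umem_mmu_notifier_retry(mr->umem, current_seq))
 *		// Still valid: push the odp_data->dma_list entries for the
 *		// npages just mapped into the device page table.
 *		ret = dev_update_device_mtt(mr, start_idx, npages); // hypothetical
 *	mutex_unlock(&mr->umem->odp_data->umem_mutex);
 */
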
void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 virt,
				 u64 bound)
{
	int idx;
	u64 addr;
	struct ib_device *dev = umem->context->device;

	virt  = max_t(u64, virt,  ib_umem_start(umem));
	bound = min_t(u64, bound, ib_umem_end(umem));
	/* Note that during the run of this function, the
	 * notifiers_count of the MR is > 0, preventing any racing
	 * faults from completing. We might be racing with other
	 * invalidations, so we must make sure we free each page only
	 * once. */
	mutex_lock(&umem->odp_data->umem_mutex);
	for (addr = virt; addr < bound; addr += (u64)umem->page_size) {
		idx = (addr - ib_umem_start(umem)) / PAGE_SIZE;
		if (umem->odp_data->page_list[idx]) {
			struct page *page = umem->odp_data->page_list[idx];
			dma_addr_t dma = umem->odp_data->dma_list[idx];
			dma_addr_t dma_addr = dma & ODP_DMA_ADDR_MASK;

			WARN_ON(!dma_addr);

			ib_dma_unmap_page(dev, dma_addr, PAGE_SIZE,
					  DMA_BIDIRECTIONAL);
			if (dma & ODP_WRITE_ALLOWED_BIT) {
				struct page *head_page = compound_head(page);
				/*
				 * set_page_dirty prefers being called with
				 * the page lock. However, MMU notifiers are
				 * called sometimes with and sometimes without
				 * the lock. We rely on the umem_mutex instead
				 * to prevent other mmu notifiers from
				 * continuing and allowing the page mapping to
				 * be removed.
				 */
				set_page_dirty(head_page);
			}
			/* on demand pinning support */
			if (!umem->context->invalidate_range)
				put_page(page);
			umem->odp_data->page_list[idx] = NULL;
			umem->odp_data->dma_list[idx] = 0;
		}
	}
	mutex_unlock(&umem->odp_data->umem_mutex);
}
EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages);
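
/*
 * ib_umem_odp_unmap_dma_pages() is normally reached from the driver's
 * context->invalidate_range callback, which the notifier trampolines above
 * run between ib_umem_notifier_start_account() and _end_account().  A
 * minimal sketch of such a callback (the device-PTE zap helper is
 * hypothetical):
 *
 *	static void dev_invalidate_range(struct ib_umem *umem,
 *					 unsigned long start, unsigned long end)
 *	{
 *		// Stop the HCA from using the pages first, then release
 *		// the DMA mappings and drop the page references.
 *		dev_zap_device_ptes(umem, start, end);	// hypothetical
 *		ib_umem_odp_unmap_dma_pages(umem, start, end);
 *	}
 */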