/*-
 * Copyright (c) 2013-2015, Mellanox Technologies, Ltd.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <linux/kref.h>
#include <linux/random.h>
#include <linux/delay.h>
#include <linux/sched.h>
#include <rdma/ib_umem.h>
#include <rdma/ib_umem_odp.h>
#include <rdma/ib_verbs.h>
#include "mlx5_ib.h"

enum {
	MAX_PENDING_REG_MR = 8,
};

#define MLX5_UMR_ALIGN 2048
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
static __be64 mlx5_ib_update_mtt_emergency_buffer[
		MLX5_UMR_MTT_MIN_CHUNK_SIZE/sizeof(__be64)]
	__aligned(MLX5_UMR_ALIGN);
static DEFINE_MUTEX(mlx5_ib_update_mtt_emergency_buffer_mutex);
#endif

static int clean_mr(struct mlx5_ib_mr *mr);

static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
{
	int err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey);

#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
	/* Wait until all page fault handlers using the mr complete. */
	synchronize_srcu(&dev->mr_srcu);
#endif

	return err;
}

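/*
 * Map an MR order (log2 of the number of pages) to an index in the MR
 * cache.  Entry 0 holds the smallest order the cache supports; larger
 * orders map to correspondingly higher indices.
 */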
static int order2idx(struct mlx5_ib_dev *dev, int order)
{
	struct mlx5_mr_cache *cache = &dev->cache;

	if (order < cache->ent[0].order)
		return 0;
	else
		return order - cache->ent[0].order;
}

static bool use_umr_mtt_update(struct mlx5_ib_mr *mr, u64 start, u64 length)
{
	return ((u64)1 << mr->order) * MLX5_ADAPTER_PAGE_SIZE >=
		length + (start & (MLX5_ADAPTER_PAGE_SIZE - 1));
}

#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
static void update_odp_mr(struct mlx5_ib_mr *mr)
{
	if (mr->umem->odp_data) {
		/*
		 * This barrier prevents the compiler from moving the
		 * setting of umem->odp_data->private to point to our
		 * MR before reg_umr has finished, to ensure that the
		 * MR initialization has completed before we start
		 * handling invalidations.
		 */
		smp_wmb();
		mr->umem->odp_data->private = mr;
		/*
		 * Make sure the new umem->odp_data->private value is
		 * visible to the invalidation routines before page
		 * faults can occur on the MR.  Page faults can happen
		 * once we put the MR in the tree, below this line.
		 * Without the barrier, a fault could be handled and an
		 * invalidation could run before
		 * umem->odp_data->private == mr is visible to the
		 * invalidation handler.
		 */
		smp_wmb();
	}
}
#endif

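/*
 * Completion callback for the asynchronous CREATE_MKEY commands issued by
 * add_keys().  On failure the MR is freed and cache refilling is throttled
 * via the delay timer; on success the new mkey gets a variant key byte,
 * is added to its cache entry list and inserted into the mkey radix tree.
 */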
static void reg_mr_callback(int status, struct mlx5_async_work *context)
{
	struct mlx5_ib_mr *mr =
		container_of(context, struct mlx5_ib_mr, cb_work);
	struct mlx5_ib_dev *dev = mr->dev;
	struct mlx5_mr_cache *cache = &dev->cache;
	int c = order2idx(dev, mr->order);
	struct mlx5_cache_ent *ent = &cache->ent[c];
	u8 key;
	unsigned long flags;
	struct mlx5_mr_table *table = &dev->mdev->priv.mr_table;
	int err;

	spin_lock_irqsave(&ent->lock, flags);
	ent->pending--;
	spin_unlock_irqrestore(&ent->lock, flags);
	if (status) {
		mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status);
		kfree(mr);
		dev->fill_delay = 1;
		mod_timer(&dev->delay_timer, jiffies + HZ);
		return;
	}

	spin_lock_irqsave(&dev->mdev->priv.mkey_lock, flags);
	key = dev->mdev->priv.mkey_key++;
	spin_unlock_irqrestore(&dev->mdev->priv.mkey_lock, flags);
	mr->mmkey.key = mlx5_idx_to_mkey(MLX5_GET(create_mkey_out, mr->out, mkey_index)) | key;

	cache->last_add = jiffies;

	spin_lock_irqsave(&ent->lock, flags);
	list_add_tail(&mr->list, &ent->head);
	ent->cur++;
	ent->size++;
	spin_unlock_irqrestore(&ent->lock, flags);

	spin_lock_irqsave(&table->lock, flags);
	err = radix_tree_insert(&table->tree, mlx5_mkey_to_idx(mr->mmkey.key),
				&mr->mmkey);
	if (err)
		pr_err("Error inserting to mkey tree. 0x%x\n", -err);
	spin_unlock_irqrestore(&table->lock, flags);
}

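/*
 * Post up to 'num' asynchronous mkey creation commands for cache entry 'c'.
 * Each mkey is created in the "free" state with UMR enabled so it can later
 * be bound to user memory through a UMR WQE.  Returns -EAGAIN when the
 * per-entry limit of outstanding commands (MAX_PENDING_REG_MR) is reached.
 */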
static int add_keys(struct mlx5_ib_dev *dev, int c, int num)
{
	struct mlx5_mr_cache *cache = &dev->cache;
	struct mlx5_cache_ent *ent = &cache->ent[c];
	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	struct mlx5_ib_mr *mr;
	int npages = 1 << ent->order;
	void *mkc;
	u32 *in;
	int err = 0;
	int i;

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in)
		return -ENOMEM;

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	for (i = 0; i < num; i++) {
		if (ent->pending >= MAX_PENDING_REG_MR) {
			err = -EAGAIN;
			break;
		}

		mr = kzalloc(sizeof(*mr), GFP_KERNEL);
		if (!mr) {
			err = -ENOMEM;
			break;
		}
		mr->order = ent->order;
		mr->umred = 1;
		mr->dev = dev;

		MLX5_SET(mkc, mkc, free, 1);
		MLX5_SET(mkc, mkc, umr_en, 1);
		MLX5_SET(mkc, mkc, access_mode, MLX5_ACCESS_MODE_MTT);

		MLX5_SET(mkc, mkc, qpn, 0xffffff);
		MLX5_SET(mkc, mkc, translations_octword_size, (npages + 1) / 2);
		MLX5_SET(mkc, mkc, log_page_size, 12);

		spin_lock_irq(&ent->lock);
		ent->pending++;
		spin_unlock_irq(&ent->lock);
		err = mlx5_core_create_mkey_cb(dev->mdev, &mr->mmkey,
					       &dev->async_ctx, in, inlen,
					       mr->out, sizeof(mr->out),
					       reg_mr_callback, &mr->cb_work);
		if (err) {
			spin_lock_irq(&ent->lock);
			ent->pending--;
			spin_unlock_irq(&ent->lock);
			mlx5_ib_warn(dev, "create mkey failed %d\n", err);
			kfree(mr);
			break;
		}
	}

	kfree(in);
	return err;
}

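/*
 * Remove up to 'num' cached MRs from cache entry 'c' and destroy their
 * mkeys.  Stops early if the entry runs out of MRs.
 */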
static void remove_keys(struct mlx5_ib_dev *dev, int c, int num)
{
	struct mlx5_mr_cache *cache = &dev->cache;
	struct mlx5_cache_ent *ent = &cache->ent[c];
	struct mlx5_ib_mr *mr;
	int err;
	int i;

	for (i = 0; i < num; i++) {
		spin_lock_irq(&ent->lock);
		if (list_empty(&ent->head)) {
			spin_unlock_irq(&ent->lock);
			return;
		}
		mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
		list_del(&mr->list);
		ent->cur--;
		ent->size--;
		spin_unlock_irq(&ent->lock);
		err = destroy_mkey(dev, mr);
		if (err)
			mlx5_ib_warn(dev, "failed destroy mkey\n");
		else
			kfree(mr);
	}
}

static int someone_adding(struct mlx5_mr_cache *cache)
{
	int i;

	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
		if (cache->ent[i].cur < cache->ent[i].limit)
			return 1;
	}

	return 0;
}

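/*
 * Workqueue handler that keeps a cache entry close to its configured limit:
 * it adds keys while the entry is below 2 * limit and, once the entry has
 * grown past 2 * limit, trims it back as a low-priority garbage-collection
 * task.
 */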
static void __cache_work_func(struct mlx5_cache_ent *ent)
{
	struct mlx5_ib_dev *dev = ent->dev;
	struct mlx5_mr_cache *cache = &dev->cache;
	int i = order2idx(dev, ent->order);
	int err;

	if (cache->stopped)
		return;

	ent = &dev->cache.ent[i];
	if (ent->cur < 2 * ent->limit && !dev->fill_delay) {
		err = add_keys(dev, i, 1);
		if (ent->cur < 2 * ent->limit) {
			if (err == -EAGAIN) {
				mlx5_ib_dbg(dev, "returned eagain, order %d\n",
					    i + 2);
				queue_delayed_work(cache->wq, &ent->dwork,
						   msecs_to_jiffies(3));
			} else if (err) {
				mlx5_ib_warn(dev, "command failed order %d, err %d\n",
					     i + 2, err);
				queue_delayed_work(cache->wq, &ent->dwork,
						   msecs_to_jiffies(1000));
			} else {
				queue_work(cache->wq, &ent->work);
			}
		}
	} else if (ent->cur > 2 * ent->limit) {
		/*
		 * The remove_keys() logic is performed as a garbage
		 * collection task.  Such a task is intended to run
		 * when no other active processes are running.
		 *
		 * need_resched() returns true if there are user tasks
		 * to be activated in the near future.
		 *
		 * In that case we do not execute remove_keys() and
		 * postpone the garbage collection work to the next
		 * cycle, in order to free CPU resources for other
		 * tasks.
		 */
		if (!need_resched() && !someone_adding(cache) &&
		    time_after(jiffies, cache->last_add + 300 * HZ)) {
			remove_keys(dev, i, 1);
			if (ent->cur > ent->limit)
				queue_work(cache->wq, &ent->work);
		} else {
			queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ);
		}
	}
}

static void delayed_cache_work_func(struct work_struct *work)
{
	struct mlx5_cache_ent *ent;

	ent = container_of(work, struct mlx5_cache_ent, dwork.work);
	__cache_work_func(ent);
}

static void cache_work_func(struct work_struct *work)
{
	struct mlx5_cache_ent *ent;

	ent = container_of(work, struct mlx5_cache_ent, work);
	__cache_work_func(ent);
}

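/*
 * Take an MR of at least the requested order from the cache.  Starts at the
 * matching entry and falls back to larger orders, kicking the cache worker
 * of every entry it visits so that depleted entries get refilled.
 */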
static struct mlx5_ib_mr *alloc_cached_mr(struct mlx5_ib_dev *dev, int order)
{
	struct mlx5_mr_cache *cache = &dev->cache;
	struct mlx5_ib_mr *mr = NULL;
	struct mlx5_cache_ent *ent;
	int c;
	int i;

	c = order2idx(dev, order);
	if (c < 0 || c >= MAX_MR_CACHE_ENTRIES) {
		mlx5_ib_warn(dev, "order %d, cache index %d\n", order, c);
		return NULL;
	}

	for (i = c; i < MAX_MR_CACHE_ENTRIES; i++) {
		ent = &cache->ent[i];

		mlx5_ib_dbg(dev, "order %d, cache index %d\n", ent->order, i);

		spin_lock_irq(&ent->lock);
		if (!list_empty(&ent->head)) {
			mr = list_first_entry(&ent->head, struct mlx5_ib_mr,
					      list);
			list_del(&mr->list);
			ent->cur--;
			spin_unlock_irq(&ent->lock);
			if (ent->cur < ent->limit)
				queue_work(cache->wq, &ent->work);
			break;
		}
		spin_unlock_irq(&ent->lock);

		queue_work(cache->wq, &ent->work);
	}

	if (!mr)
		cache->ent[c].miss++;

	return mr;
}

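/*
 * Return a cache-originated MR to its cache entry.  If the entry has grown
 * past twice its limit, schedule the cache worker to shrink it.
 */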
static void free_cached_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
{
	struct mlx5_mr_cache *cache = &dev->cache;
	struct mlx5_cache_ent *ent;
	int shrink = 0;
	int c;

	c = order2idx(dev, mr->order);
	if (c < 0 || c >= MAX_MR_CACHE_ENTRIES) {
		mlx5_ib_warn(dev, "order %d, cache index %d\n", mr->order, c);
		return;
	}
	ent = &cache->ent[c];
	spin_lock_irq(&ent->lock);
	list_add_tail(&mr->list, &ent->head);
	ent->cur++;
	if (ent->cur > 2 * ent->limit)
		shrink = 1;
	spin_unlock_irq(&ent->lock);

	if (shrink)
		queue_work(cache->wq, &ent->work);
}

static void clean_keys(struct mlx5_ib_dev *dev, int c)
{
	struct mlx5_mr_cache *cache = &dev->cache;
	struct mlx5_cache_ent *ent = &cache->ent[c];
	struct mlx5_ib_mr *mr;
	int err;

	cancel_delayed_work(&ent->dwork);
	while (1) {
		spin_lock_irq(&ent->lock);
		if (list_empty(&ent->head)) {
			spin_unlock_irq(&ent->lock);
			return;
		}
		mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
		list_del(&mr->list);
		ent->cur--;
		ent->size--;
		spin_unlock_irq(&ent->lock);
		err = destroy_mkey(dev, mr);
		if (err)
			mlx5_ib_warn(dev, "failed destroy mkey\n");
		else
			kfree(mr);
	}
}

static void delay_time_func(unsigned long ctx)
{
	struct mlx5_ib_dev *dev = (struct mlx5_ib_dev *)ctx;

	dev->fill_delay = 0;
}

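/*
 * Create the MR cache: one ordered workqueue plus MAX_MR_CACHE_ENTRIES
 * entries, where entry i caches MRs of order i + 2.  Per-entry limits come
 * from the device profile when MLX5_PROF_MASK_MR_CACHE is set; each entry's
 * work item is queued immediately to start filling the cache.
 */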
int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
{
	struct mlx5_mr_cache *cache = &dev->cache;
	struct mlx5_cache_ent *ent;
	int limit;
	int i;

	mutex_init(&dev->slow_path_mutex);
	cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM);
	if (!cache->wq) {
		mlx5_ib_warn(dev, "failed to create work queue\n");
		return -ENOMEM;
	}

	mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx);
	setup_timer(&dev->delay_timer, delay_time_func, (unsigned long)dev);
	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
		INIT_LIST_HEAD(&cache->ent[i].head);
		spin_lock_init(&cache->ent[i].lock);

		ent = &cache->ent[i];
		INIT_LIST_HEAD(&ent->head);
		spin_lock_init(&ent->lock);
		ent->order = i + 2;
		ent->dev = dev;

		if (dev->mdev->profile->mask & MLX5_PROF_MASK_MR_CACHE)
			limit = dev->mdev->profile->mr_cache[i].limit;
		else
			limit = 0;

		INIT_WORK(&ent->work, cache_work_func);
		INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
		ent->limit = limit;
		queue_work(cache->wq, &ent->work);
	}

	return 0;
}

int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev)
{
	int i;

	dev->cache.stopped = 1;
	flush_workqueue(dev->cache.wq);
	mlx5_cmd_cleanup_async_ctx(&dev->async_ctx);

	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++)
		clean_keys(dev, i);

	destroy_workqueue(dev->cache.wq);
	del_timer_sync(&dev->delay_timer);

	return 0;
}

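/*
 * Register a DMA MR covering the whole address space (length64 set, start
 * address 0) in physical-address access mode, with the access flags
 * requested by the caller.
 */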
struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	struct mlx5_core_dev *mdev = dev->mdev;
	struct mlx5_ib_mr *mr;
	void *mkc;
	u32 *in;
	int err;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_free;
	}

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);

	MLX5_SET(mkc, mkc, access_mode, MLX5_ACCESS_MODE_PA);
	MLX5_SET(mkc, mkc, a, !!(acc & IB_ACCESS_REMOTE_ATOMIC));
	MLX5_SET(mkc, mkc, rw, !!(acc & IB_ACCESS_REMOTE_WRITE));
	MLX5_SET(mkc, mkc, rr, !!(acc & IB_ACCESS_REMOTE_READ));
	MLX5_SET(mkc, mkc, lw, !!(acc & IB_ACCESS_LOCAL_WRITE));
	MLX5_SET(mkc, mkc, lr, 1);

	MLX5_SET(mkc, mkc, length64, 1);
	MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
	MLX5_SET(mkc, mkc, qpn, 0xffffff);
	MLX5_SET64(mkc, mkc, start_addr, 0);

	err = mlx5_core_create_mkey(mdev, &mr->mmkey, in, inlen);
	if (err)
		goto err_in;

	kfree(in);
	mr->ibmr.lkey = mr->mmkey.key;
	mr->ibmr.rkey = mr->mmkey.key;
	mr->umem = NULL;

	return &mr->ibmr;

err_in:
	kfree(in);

err_free:
	kfree(mr);

	return ERR_PTR(err);
}

static int get_octo_len(u64 addr, u64 len, int page_size)
{
	u64 offset;
	int npages;

	offset = addr & (page_size - 1);
	npages = ALIGN(len + offset, page_size) >> ilog2(page_size);
	return (npages + 1) / 2;
}

static int use_umr(int order)
{
	return order <= MLX5_MAX_UMR_SHIFT;
}

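/*
 * Build the page address (PAS) array for a umem and DMA-map it for a UMR
 * operation.  The buffer is over-allocated so it can be aligned to
 * MLX5_UMR_ALIGN and padded to a multiple of MLX5_UMR_MTT_ALIGNMENT, with
 * the padding cleared so no stale data is copied by the UMR.
 */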
static int dma_map_mr_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
			  int npages, int page_shift, int *size,
			  __be64 **mr_pas, dma_addr_t *dma)
{
	__be64 *pas;
	struct device *ddev = dev->ib_dev.dma_device;

	/*
	 * UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes.
	 * To avoid copying garbage after the pas array, we allocate
	 * a little more.
	 */
	*size = ALIGN(sizeof(u64) * npages, MLX5_UMR_MTT_ALIGNMENT);
	*mr_pas = kmalloc(*size + MLX5_UMR_ALIGN - 1, GFP_KERNEL);
	if (!(*mr_pas))
		return -ENOMEM;

	pas = PTR_ALIGN(*mr_pas, MLX5_UMR_ALIGN);
	mlx5_ib_populate_pas(dev, umem, page_shift, pas, MLX5_IB_MTT_PRESENT);
	/* Clear padding after the actual pages. */
	memset(pas + npages, 0, *size - npages * sizeof(u64));

	*dma = dma_map_single(ddev, pas, *size, DMA_TO_DEVICE);
	if (dma_mapping_error(ddev, *dma)) {
		kfree(*mr_pas);
		return -ENOMEM;
	}

	return 0;
}

static void prep_umr_wqe_common(struct ib_pd *pd, struct ib_send_wr *wr,
				struct ib_sge *sg, u64 dma, int n, u32 key,
				int page_shift)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5_umr_wr *umrwr = umr_wr(wr);

	sg->addr = dma;
	sg->length = ALIGN(sizeof(u64) * n, 64);
	sg->lkey = dev->umrc.pd->local_dma_lkey;

	wr->next = NULL;
	wr->sg_list = sg;
	if (n)
		wr->num_sge = 1;
	else
		wr->num_sge = 0;

	wr->opcode = MLX5_IB_WR_UMR;

	umrwr->npages = n;
	umrwr->page_shift = page_shift;
	umrwr->mkey = key;
}

static void prep_umr_reg_wqe(struct ib_pd *pd, struct ib_send_wr *wr,
			     struct ib_sge *sg, u64 dma, int n, u32 key,
			     int page_shift, u64 virt_addr, u64 len,
			     int access_flags)
{
	struct mlx5_umr_wr *umrwr = umr_wr(wr);

	prep_umr_wqe_common(pd, wr, sg, dma, n, key, page_shift);

	wr->send_flags = 0;

	umrwr->target.virt_addr = virt_addr;
	umrwr->length = len;
	umrwr->access_flags = access_flags;
	umrwr->pd = pd;
}

static void prep_umr_unreg_wqe(struct mlx5_ib_dev *dev,
			       struct ib_send_wr *wr, u32 key)
{
	struct mlx5_umr_wr *umrwr = umr_wr(wr);

	wr->send_flags = MLX5_IB_SEND_UMR_UNREG | MLX5_IB_SEND_UMR_FAIL_IF_FREE;
	wr->opcode = MLX5_IB_WR_UMR;
	umrwr->mkey = key;
}

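/*
 * Pin the user memory for [start, start + length) and report its page
 * layout: total pages, best page shift, number of contiguous blocks and the
 * resulting order.  Zero-length regions are rejected.
 */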
static struct ib_umem *mr_umem_get(struct ib_pd *pd, u64 start, u64 length,
				   int access_flags, int *npages,
				   int *page_shift, int *ncont, int *order)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct ib_umem *umem = ib_umem_get(pd->uobject->context, start, length,
					   access_flags, 0);
	if (IS_ERR(umem)) {
		mlx5_ib_err(dev, "umem get failed (%ld)\n", PTR_ERR(umem));
		return (void *)umem;
	}

	mlx5_ib_cont_pages(umem, start, npages, page_shift, ncont, order);
	if (!*npages) {
		mlx5_ib_warn(dev, "avoid zero region\n");
		ib_umem_release(umem);
		return ERR_PTR(-EINVAL);
	}

	mlx5_ib_dbg(dev, "npages %d, ncont %d, order %d, page_shift %d\n",
		    *npages, *ncont, *order, *page_shift);

	return umem;
}

static void mlx5_ib_umr_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct mlx5_ib_umr_context *context =
		container_of(wc->wr_cqe, struct mlx5_ib_umr_context, cqe);

	context->status = wc->status;
	complete(&context->done);
}

static inline void mlx5_ib_init_umr_context(struct mlx5_ib_umr_context *context)
{
	context->cqe.done = mlx5_ib_umr_done;
	context->status = -1;
	init_completion(&context->done);
}

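/*
 * Register user memory through the UMR QP: take a pre-created mkey from the
 * MR cache (topping the cache up once if it is empty), DMA-map the page
 * list and post a UMR registration WQE, then wait for its completion.
 */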
static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem,
				  u64 virt_addr, u64 len, int npages,
				  int page_shift, int order, int access_flags)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct device *ddev = dev->ib_dev.dma_device;
	struct umr_common *umrc = &dev->umrc;
	struct mlx5_ib_umr_context umr_context;
	struct mlx5_umr_wr umrwr = {};
	struct ib_send_wr *bad;
	struct mlx5_ib_mr *mr;
	struct ib_sge sg;
	int size;
	__be64 *mr_pas;
	dma_addr_t dma;
	int err = 0;
	int i;

	for (i = 0; i < 1; i++) {
		mr = alloc_cached_mr(dev, order);
		if (mr)
			break;

		err = add_keys(dev, order2idx(dev, order), 1);
		if (err && err != -EAGAIN) {
			mlx5_ib_warn(dev, "add_keys failed, err %d\n", err);
			break;
		}
	}

	if (!mr)
		return ERR_PTR(-EAGAIN);

	err = dma_map_mr_pas(dev, umem, npages, page_shift, &size, &mr_pas,
			     &dma);
	if (err)
		goto free_mr;

	mlx5_ib_init_umr_context(&umr_context);

	umrwr.wr.wr_cqe = &umr_context.cqe;
	prep_umr_reg_wqe(pd, &umrwr.wr, &sg, dma, npages, mr->mmkey.key,
			 page_shift, virt_addr, len, access_flags);

	down(&umrc->sem);
	err = ib_post_send(umrc->qp, &umrwr.wr, &bad);
	if (err) {
		mlx5_ib_warn(dev, "post send failed, err %d\n", err);
		goto unmap_dma;
	} else {
		wait_for_completion(&umr_context.done);
		if (umr_context.status != IB_WC_SUCCESS) {
			mlx5_ib_warn(dev, "reg umr failed\n");
			err = -EFAULT;
		}
	}

	mr->mmkey.iova = virt_addr;
	mr->mmkey.size = len;
	mr->mmkey.pd = to_mpd(pd)->pdn;

	mr->live = 1;

unmap_dma:
	up(&umrc->sem);
	dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE);

	kfree(mr_pas);

free_mr:
	if (err) {
		free_cached_mr(dev, mr);
		return ERR_PTR(err);
	}

	return mr;
}

#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
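/*
 * Update (or zap) a range of MTT entries of an ODP MR via UMR, in chunks of
 * at most one page of translation entries.  This can run from invalidation
 * context, so the chunk buffer is allocated with GFP_ATOMIC and a
 * preallocated, mutex-protected emergency buffer is used if that fails.
 */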
int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index, int npages,
		       int zap)
{
	struct mlx5_ib_dev *dev = mr->dev;
	struct device *ddev = dev->ib_dev.dma_device;
	struct umr_common *umrc = &dev->umrc;
	struct mlx5_ib_umr_context umr_context;
	struct ib_umem *umem = mr->umem;
	int size;
	__be64 *pas;
	dma_addr_t dma;
	struct ib_send_wr *bad;
	struct mlx5_umr_wr wr;
	struct ib_sge sg;
	int err = 0;
	const int page_index_alignment = MLX5_UMR_MTT_ALIGNMENT / sizeof(u64);
	const int page_index_mask = page_index_alignment - 1;
	size_t pages_mapped = 0;
	size_t pages_to_map = 0;
	size_t pages_iter = 0;
	int use_emergency_buf = 0;

	/* UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes,
	 * so we need to align the offset and length accordingly */
	if (start_page_index & page_index_mask) {
		npages += start_page_index & page_index_mask;
		start_page_index &= ~page_index_mask;
	}

	pages_to_map = ALIGN(npages, page_index_alignment);

	if (start_page_index + pages_to_map > MLX5_MAX_UMR_PAGES)
		return -EINVAL;

	size = sizeof(u64) * pages_to_map;
	size = min_t(int, PAGE_SIZE, size);
	/* We allocate with GFP_ATOMIC to avoid recursion into page-reclaim
	 * code, when we are called from an invalidation. The pas buffer must
	 * be 2k-aligned for Connect-IB. */
	pas = (__be64 *)get_zeroed_page(GFP_ATOMIC);
	if (!pas) {
		mlx5_ib_warn(dev, "unable to allocate memory during MTT update, falling back to slower chunked mechanism.\n");
		pas = mlx5_ib_update_mtt_emergency_buffer;
		size = MLX5_UMR_MTT_MIN_CHUNK_SIZE;
		use_emergency_buf = 1;
		mutex_lock(&mlx5_ib_update_mtt_emergency_buffer_mutex);
		memset(pas, 0, size);
	}
	pages_iter = size / sizeof(u64);
	dma = dma_map_single(ddev, pas, size, DMA_TO_DEVICE);
	if (dma_mapping_error(ddev, dma)) {
		mlx5_ib_err(dev, "unable to map DMA during MTT update.\n");
		err = -ENOMEM;
		goto free_pas;
	}

	for (pages_mapped = 0;
	     pages_mapped < pages_to_map && !err;
	     pages_mapped += pages_iter, start_page_index += pages_iter) {
		dma_sync_single_for_cpu(ddev, dma, size, DMA_TO_DEVICE);

		npages = min_t(size_t,
			       pages_iter,
			       ib_umem_num_pages(umem) - start_page_index);

		if (!zap) {
			__mlx5_ib_populate_pas(dev, umem, PAGE_SHIFT,
					       start_page_index, npages, pas,
					       MLX5_IB_MTT_PRESENT);
			/* Clear padding after the pages brought from the
			 * umem. */
			memset(pas + npages, 0, size - npages * sizeof(u64));
		}

		dma_sync_single_for_device(ddev, dma, size, DMA_TO_DEVICE);

		mlx5_ib_init_umr_context(&umr_context);

		memset(&wr, 0, sizeof(wr));
		wr.wr.wr_cqe = &umr_context.cqe;

		sg.addr = dma;
		sg.length = ALIGN(npages * sizeof(u64),
				MLX5_UMR_MTT_ALIGNMENT);
		sg.lkey = dev->umrc.pd->local_dma_lkey;

		wr.wr.send_flags = MLX5_IB_SEND_UMR_FAIL_IF_FREE |
				MLX5_IB_SEND_UMR_UPDATE_MTT;
		wr.wr.sg_list = &sg;
		wr.wr.num_sge = 1;
		wr.wr.opcode = MLX5_IB_WR_UMR;
		wr.npages = sg.length / sizeof(u64);
		wr.page_shift = PAGE_SHIFT;
		wr.mkey = mr->mmkey.key;
		wr.target.offset = start_page_index;

		down(&umrc->sem);
		err = ib_post_send(umrc->qp, &wr.wr, &bad);
		if (err) {
			mlx5_ib_err(dev, "UMR post send failed, err %d\n", err);
		} else {
			wait_for_completion(&umr_context.done);
			if (umr_context.status != IB_WC_SUCCESS) {
				mlx5_ib_err(dev, "UMR completion failed, code %d\n",
					    umr_context.status);
				err = -EFAULT;
			}
		}
		up(&umrc->sem);
	}
	dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE);

free_pas:
	if (!use_emergency_buf)
		free_page((unsigned long)pas);
	else
		mutex_unlock(&mlx5_ib_update_mtt_emergency_buffer_mutex);

	return err;
}
#endif

/*
 * If ibmr is NULL it will be allocated by reg_create.
 * Else, the given ibmr will be used.
 */
static struct mlx5_ib_mr *reg_create(struct ib_mr *ibmr, struct ib_pd *pd,
				     u64 virt_addr, u64 length,
				     struct ib_umem *umem, int npages,
				     int page_shift, int access_flags)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5_ib_mr *mr;
	__be64 *pas;
	void *mkc;
	int inlen;
	u32 *in;
	int err;
	bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg));

	mr = ibmr ? to_mmr(ibmr) : kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	inlen = MLX5_ST_SZ_BYTES(create_mkey_in) +
		sizeof(*pas) * ((npages + 1) / 2) * 2;
	in = mlx5_vzalloc(inlen);
	if (!in) {
		err = -ENOMEM;
		goto err_1;
	}
	pas = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
	mlx5_ib_populate_pas(dev, umem, page_shift, pas,
			     pg_cap ? MLX5_IB_MTT_PRESENT : 0);

	/* The pg_access bit allows setting the access flags
	 * in the page list submitted with the command. */
	MLX5_SET(create_mkey_in, in, pg_access, !!(pg_cap));

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	MLX5_SET(mkc, mkc, access_mode, MLX5_ACCESS_MODE_MTT);
	MLX5_SET(mkc, mkc, a, !!(access_flags & IB_ACCESS_REMOTE_ATOMIC));
	MLX5_SET(mkc, mkc, rw, !!(access_flags & IB_ACCESS_REMOTE_WRITE));
	MLX5_SET(mkc, mkc, rr, !!(access_flags & IB_ACCESS_REMOTE_READ));
	MLX5_SET(mkc, mkc, lw, !!(access_flags & IB_ACCESS_LOCAL_WRITE));
	MLX5_SET(mkc, mkc, lr, 1);

	MLX5_SET64(mkc, mkc, start_addr, virt_addr);
	MLX5_SET64(mkc, mkc, len, length);
	MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
	MLX5_SET(mkc, mkc, bsf_octword_size, 0);
	MLX5_SET(mkc, mkc, translations_octword_size,
		 get_octo_len(virt_addr, length, 1 << page_shift));
	MLX5_SET(mkc, mkc, log_page_size, page_shift);
	MLX5_SET(mkc, mkc, qpn, 0xffffff);
	MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
		 get_octo_len(virt_addr, length, 1 << page_shift));

	err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey, in, inlen);
	if (err) {
		mlx5_ib_warn(dev, "create mkey failed\n");
		goto err_2;
	}
	mr->umem = umem;
	mr->dev = dev;
	mr->live = 1;
	kvfree(in);

	mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmkey.key);

	return mr;

err_2:
	kvfree(in);

err_1:
	if (!ibmr)
		kfree(mr);

	return ERR_PTR(err);
}

static void set_mr_fields(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
			  int npages, u64 length, int access_flags)
{
	mr->npages = npages;
	atomic_add(npages, &dev->mdev->priv.reg_pages);
	mr->ibmr.lkey = mr->mmkey.key;
	mr->ibmr.rkey = mr->mmkey.key;
	mr->ibmr.length = length;
	mr->access_flags = access_flags;
}

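/*
 * Register a user memory region.  Regions small enough for UMR are
 * registered with a cached mkey and a UMR WQE; larger regions (and the
 * slow path when the cache is empty) go through a blocking CREATE_MKEY in
 * reg_create().
 */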
struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
				  u64 virt_addr, int access_flags,
				  struct ib_udata *udata)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5_ib_mr *mr = NULL;
	struct ib_umem *umem;
	int page_shift;
	int npages;
	int ncont;
	int order;
	int err;

	mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n",
		    (long long)start, (long long)virt_addr, (long long)length, access_flags);
	umem = mr_umem_get(pd, start, length, access_flags, &npages,
			   &page_shift, &ncont, &order);

	if (IS_ERR(umem))
		return (void *)umem;

	if (use_umr(order)) {
		mr = reg_umr(pd, umem, virt_addr, length, ncont, page_shift,
			     order, access_flags);
		if (PTR_ERR(mr) == -EAGAIN) {
			mlx5_ib_dbg(dev, "cache empty for order %d", order);
			mr = NULL;
		}
	} else if (access_flags & IB_ACCESS_ON_DEMAND) {
		err = -EINVAL;
		pr_err("Got MR registration for ODP MR > 512MB, not supported for Connect-IB");
		goto error;
	}

	if (!mr) {
		mutex_lock(&dev->slow_path_mutex);
		mr = reg_create(NULL, pd, virt_addr, length, umem, ncont,
				page_shift, access_flags);
		mutex_unlock(&dev->slow_path_mutex);
	}

	if (IS_ERR(mr)) {
		err = PTR_ERR(mr);
		goto error;
	}

	mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);

	mr->umem = umem;
	set_mr_fields(dev, mr, npages, length, access_flags);

#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
	update_odp_mr(mr);
#endif

	return &mr->ibmr;

error:
	ib_umem_release(umem);
	return ERR_PTR(err);
}

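/*
 * Invalidate a UMR-registered mkey by posting a UMR "unreg" WQE and waiting
 * for its completion.  Skipped when the device is in internal error state.
 */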
static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
{
	struct mlx5_core_dev *mdev = dev->mdev;
	struct umr_common *umrc = &dev->umrc;
	struct mlx5_ib_umr_context umr_context;
	struct mlx5_umr_wr umrwr = {};
	struct ib_send_wr *bad;
	int err;

	if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
		return 0;

	mlx5_ib_init_umr_context(&umr_context);

	umrwr.wr.wr_cqe = &umr_context.cqe;
	prep_umr_unreg_wqe(dev, &umrwr.wr, mr->mmkey.key);

	down(&umrc->sem);
	err = ib_post_send(umrc->qp, &umrwr.wr, &bad);
	if (err) {
		up(&umrc->sem);
		mlx5_ib_dbg(dev, "err %d\n", err);
		goto error;
	} else {
		wait_for_completion(&umr_context.done);
		up(&umrc->sem);
	}
	if (umr_context.status != IB_WC_SUCCESS) {
		mlx5_ib_warn(dev, "unreg umr failed\n");
		err = -EFAULT;
		goto error;
	}
	return 0;

error:
	return err;
}

static int rereg_umr(struct ib_pd *pd, struct mlx5_ib_mr *mr, u64 virt_addr,
		     u64 length, int npages, int page_shift, int order,
		     int access_flags, int flags)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct device *ddev = dev->ib_dev.dma_device;
	struct mlx5_ib_umr_context umr_context;
	struct ib_send_wr *bad;
	struct mlx5_umr_wr umrwr = {};
	struct ib_sge sg;
	struct umr_common *umrc = &dev->umrc;
	dma_addr_t dma = 0;
	__be64 *mr_pas = NULL;
	int size;
	int err;

	mlx5_ib_init_umr_context(&umr_context);

	umrwr.wr.wr_cqe = &umr_context.cqe;
	umrwr.wr.send_flags = MLX5_IB_SEND_UMR_FAIL_IF_FREE;

	if (flags & IB_MR_REREG_TRANS) {
		err = dma_map_mr_pas(dev, mr->umem, npages, page_shift, &size,
				     &mr_pas, &dma);
		if (err)
			return err;

		umrwr.target.virt_addr = virt_addr;
		umrwr.length = length;
		umrwr.wr.send_flags |= MLX5_IB_SEND_UMR_UPDATE_TRANSLATION;
	}

	prep_umr_wqe_common(pd, &umrwr.wr, &sg, dma, npages, mr->mmkey.key,
			    page_shift);

	if (flags & IB_MR_REREG_PD) {
		umrwr.pd = pd;
		umrwr.wr.send_flags |= MLX5_IB_SEND_UMR_UPDATE_PD;
	}

	if (flags & IB_MR_REREG_ACCESS) {
		umrwr.access_flags = access_flags;
		umrwr.wr.send_flags |= MLX5_IB_SEND_UMR_UPDATE_ACCESS;
	}

	/* post send request to UMR QP */
	down(&umrc->sem);
	err = ib_post_send(umrc->qp, &umrwr.wr, &bad);

	if (err) {
		mlx5_ib_warn(dev, "post send failed, err %d\n", err);
	} else {
		wait_for_completion(&umr_context.done);
		if (umr_context.status != IB_WC_SUCCESS) {
			mlx5_ib_warn(dev, "reg umr failed (%u)\n",
				     umr_context.status);
			err = -EFAULT;
		}
	}

	up(&umrc->sem);
	if (flags & IB_MR_REREG_TRANS) {
		dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE);
		kfree(mr_pas);
	}
	return err;
}

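/*
 * Re-register a user MR.  Unless only the PD changes, the umem is replaced;
 * if the new region still fits the existing mkey a UMR WQE updates it in
 * place, otherwise the mkey is destroyed (or unregistered, for cache MRs)
 * and recreated with reg_create().
 */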
int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
			  u64 length, u64 virt_addr, int new_access_flags,
			  struct ib_pd *new_pd, struct ib_udata *udata)
{
	struct mlx5_ib_dev *dev = to_mdev(ib_mr->device);
	struct mlx5_ib_mr *mr = to_mmr(ib_mr);
	struct ib_pd *pd = (flags & IB_MR_REREG_PD) ? new_pd : ib_mr->pd;
	int access_flags = flags & IB_MR_REREG_ACCESS ?
			    new_access_flags :
			    mr->access_flags;
	u64 addr = (flags & IB_MR_REREG_TRANS) ? virt_addr : mr->umem->address;
	u64 len = (flags & IB_MR_REREG_TRANS) ? length : mr->umem->length;
	int page_shift = 0;
	int npages = 0;
	int ncont = 0;
	int order = 0;
	int err;

	mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n",
		    (long long)start, (long long)virt_addr, (long long)length, access_flags);

	if (flags != IB_MR_REREG_PD) {
		/*
		 * Replace umem. This needs to be done whether or not UMR is
		 * used.
		 */
		flags |= IB_MR_REREG_TRANS;
		ib_umem_release(mr->umem);
		mr->umem = mr_umem_get(pd, addr, len, access_flags, &npages,
				       &page_shift, &ncont, &order);
		if (IS_ERR(mr->umem)) {
			err = PTR_ERR(mr->umem);
			mr->umem = NULL;
			return err;
		}
	}

	if (flags & IB_MR_REREG_TRANS && !use_umr_mtt_update(mr, addr, len)) {
		/*
		 * UMR can't be used - MKey needs to be replaced.
		 */
		if (mr->umred) {
			err = unreg_umr(dev, mr);
			if (err)
				mlx5_ib_warn(dev, "Failed to unregister MR\n");
		} else {
			err = destroy_mkey(dev, mr);
			if (err)
				mlx5_ib_warn(dev, "Failed to destroy MKey\n");
		}
		if (err)
			return err;

		mr = reg_create(ib_mr, pd, addr, len, mr->umem, ncont,
				page_shift, access_flags);

		if (IS_ERR(mr))
			return PTR_ERR(mr);

		mr->umred = 0;
	} else {
		/*
		 * Send a UMR WQE
		 */
		err = rereg_umr(pd, mr, addr, len, npages, page_shift,
				order, access_flags, flags);
		if (err) {
			mlx5_ib_warn(dev, "Failed to rereg UMR\n");
			return err;
		}
	}

	if (flags & IB_MR_REREG_PD) {
		ib_mr->pd = pd;
		mr->mmkey.pd = to_mpd(pd)->pdn;
	}

	if (flags & IB_MR_REREG_ACCESS)
		mr->access_flags = access_flags;

	if (flags & IB_MR_REREG_TRANS) {
		atomic_sub(mr->npages, &dev->mdev->priv.reg_pages);
		set_mr_fields(dev, mr, npages, len, access_flags);
		mr->mmkey.iova = addr;
		mr->mmkey.size = len;
	}
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
	update_odp_mr(mr);
#endif

	return 0;
}

static int
mlx5_alloc_priv_descs(struct ib_device *device,
		      struct mlx5_ib_mr *mr,
		      int ndescs,
		      int desc_size)
{
	int size = ndescs * desc_size;
	int add_size;
	int ret;

	add_size = max_t(int, MLX5_UMR_ALIGN - 1, 0);

	mr->descs_alloc = kzalloc(size + add_size, GFP_KERNEL);
	if (!mr->descs_alloc)
		return -ENOMEM;

	mr->descs = PTR_ALIGN(mr->descs_alloc, MLX5_UMR_ALIGN);

	mr->desc_map = dma_map_single(device->dma_device, mr->descs,
				      size, DMA_TO_DEVICE);
	if (dma_mapping_error(device->dma_device, mr->desc_map)) {
		ret = -ENOMEM;
		goto err;
	}

	return 0;
err:
	kfree(mr->descs_alloc);

	return ret;
}

static void
mlx5_free_priv_descs(struct mlx5_ib_mr *mr)
{
	if (mr->descs) {
		struct ib_device *device = mr->ibmr.device;
		int size = mr->max_descs * mr->desc_size;

		dma_unmap_single(device->dma_device, mr->desc_map,
				 size, DMA_TO_DEVICE);
		kfree(mr->descs_alloc);
		mr->descs = NULL;
	}
}

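/*
 * Tear down the device resources of an MR: destroy its signature PSVs and
 * private descriptors, then either destroy the mkey (non-cache MRs) or
 * unregister it via UMR and return it to the MR cache.
 */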
static int clean_mr(struct mlx5_ib_mr *mr)
{
	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
	int umred = mr->umred;
	int err;

	if (mr->sig) {
		if (mlx5_core_destroy_psv(dev->mdev,
					  mr->sig->psv_memory.psv_idx))
			mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
				     mr->sig->psv_memory.psv_idx);
		if (mlx5_core_destroy_psv(dev->mdev,
					  mr->sig->psv_wire.psv_idx))
			mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
				     mr->sig->psv_wire.psv_idx);
		kfree(mr->sig);
		mr->sig = NULL;
	}

	mlx5_free_priv_descs(mr);

	if (!umred) {
		err = destroy_mkey(dev, mr);
		if (err) {
			mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n",
				     mr->mmkey.key, err);
			return err;
		}
	} else {
		err = unreg_umr(dev, mr);
		if (err) {
			mlx5_ib_warn(dev, "failed unregister\n");
			return err;
		}
		free_cached_mr(dev, mr);
	}

	if (!umred)
		kfree(mr);

	return 0;
}

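/*
 * Deregister an MR.  For ODP MRs the umem is invalidated and released
 * first, so no page-fault or invalidation handler can still reference the
 * MR while it is being cleaned up.
 */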
int mlx5_ib_dereg_mr(struct ib_mr *ibmr)
{
	struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	int npages = mr->npages;
	struct ib_umem *umem = mr->umem;

#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
	if (umem && umem->odp_data) {
		/* Prevent new page faults from succeeding */
		mr->live = 0;
		/* Wait for all running page-fault handlers to finish. */
		synchronize_srcu(&dev->mr_srcu);
		/* Destroy all page mappings */
		mlx5_ib_invalidate_range(umem, ib_umem_start(umem),
					 ib_umem_end(umem));
		/*
		 * We kill the umem before the MR for ODP,
		 * so that there will not be any invalidations in
		 * flight, looking at the *mr struct.
		 */
		ib_umem_release(umem);
		atomic_sub(npages, &dev->mdev->priv.reg_pages);

		/* Avoid double-freeing the umem. */
		umem = NULL;
	}
#endif

	clean_mr(mr);

	if (umem) {
		ib_umem_release(umem);
		atomic_sub(npages, &dev->mdev->priv.reg_pages);
	}

	return 0;
}

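/*
 * Allocate a non-user-memory MR: MTT based for IB_MR_TYPE_MEM_REG, KLM
 * based for IB_MR_TYPE_SG_GAPS, or a signature MR with memory and wire
 * PSVs for IB_MR_TYPE_SIGNATURE.  The mkey is created in the free state
 * with UMR enabled and is populated later through mlx5_ib_map_mr_sg().
 */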
struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd,
			       enum ib_mr_type mr_type,
			       u32 max_num_sg)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	int ndescs = ALIGN(max_num_sg, 4);
	struct mlx5_ib_mr *mr;
	void *mkc;
	u32 *in;
	int err;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_free;
	}

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	MLX5_SET(mkc, mkc, free, 1);
	MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
	MLX5_SET(mkc, mkc, qpn, 0xffffff);
	MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);

	if (mr_type == IB_MR_TYPE_MEM_REG) {
		mr->access_mode = MLX5_ACCESS_MODE_MTT;
		MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
		err = mlx5_alloc_priv_descs(pd->device, mr,
					    ndescs, sizeof(u64));
		if (err)
			goto err_free_in;

		mr->desc_size = sizeof(u64);
		mr->max_descs = ndescs;
	} else if (mr_type == IB_MR_TYPE_SG_GAPS) {
		mr->access_mode = MLX5_ACCESS_MODE_KLM;

		err = mlx5_alloc_priv_descs(pd->device, mr,
					    ndescs, sizeof(struct mlx5_klm));
		if (err)
			goto err_free_in;
		mr->desc_size = sizeof(struct mlx5_klm);
		mr->max_descs = ndescs;
	} else if (mr_type == IB_MR_TYPE_SIGNATURE) {
		u32 psv_index[2];

		MLX5_SET(mkc, mkc, bsf_en, 1);
		MLX5_SET(mkc, mkc, bsf_octword_size, MLX5_MKEY_BSF_OCTO_SIZE);
		mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL);
		if (!mr->sig) {
			err = -ENOMEM;
			goto err_free_in;
		}

		/* create mem & wire PSVs */
		err = mlx5_core_create_psv(dev->mdev, to_mpd(pd)->pdn,
					   2, psv_index);
		if (err)
			goto err_free_sig;

		mr->access_mode = MLX5_ACCESS_MODE_KLM;
		mr->sig->psv_memory.psv_idx = psv_index[0];
		mr->sig->psv_wire.psv_idx = psv_index[1];

		mr->sig->sig_status_checked = true;
		mr->sig->sig_err_exists = false;
		/* Next UMR, Arm SIGERR */
		++mr->sig->sigerr_count;
	} else {
		mlx5_ib_warn(dev, "Invalid mr type %d\n", mr_type);
		err = -EINVAL;
		goto err_free_in;
	}

	MLX5_SET(mkc, mkc, access_mode, mr->access_mode);
	MLX5_SET(mkc, mkc, umr_en, 1);

	err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey, in, inlen);
	if (err)
		goto err_destroy_psv;

	mr->ibmr.lkey = mr->mmkey.key;
	mr->ibmr.rkey = mr->mmkey.key;
	mr->umem = NULL;
	kfree(in);

	return &mr->ibmr;

err_destroy_psv:
	if (mr->sig) {
		if (mlx5_core_destroy_psv(dev->mdev,
					  mr->sig->psv_memory.psv_idx))
			mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
				     mr->sig->psv_memory.psv_idx);
		if (mlx5_core_destroy_psv(dev->mdev,
					  mr->sig->psv_wire.psv_idx))
			mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
				     mr->sig->psv_wire.psv_idx);
	}
	mlx5_free_priv_descs(mr);
err_free_sig:
	kfree(mr->sig);
err_free_in:
	kfree(in);
err_free:
	kfree(mr);
	return ERR_PTR(err);
}

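/*
 * Allocate a memory window backed by a KLM mkey.  The number of translation
 * entries is taken from the user's num_klms request (rounded up to 4), and
 * type-2 windows are created with remote invalidation enabled.
 */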
struct ib_mw *mlx5_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type,
			       struct ib_udata *udata)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	struct mlx5_ib_mw *mw = NULL;
	u32 *in = NULL;
	void *mkc;
	int ndescs;
	int err;
	struct mlx5_ib_alloc_mw req = {};
	struct {
		__u32	comp_mask;
		__u32	response_length;
	} resp = {};

	err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req)));
	if (err)
		return ERR_PTR(err);

	if (req.comp_mask || req.reserved1 || req.reserved2)
		return ERR_PTR(-EOPNOTSUPP);

	if (udata->inlen > sizeof(req) &&
	    !ib_is_udata_cleared(udata, sizeof(req),
				 udata->inlen - sizeof(req)))
		return ERR_PTR(-EOPNOTSUPP);

	ndescs = req.num_klms ? roundup(req.num_klms, 4) : roundup(1, 4);

	mw = kzalloc(sizeof(*mw), GFP_KERNEL);
	in = kzalloc(inlen, GFP_KERNEL);
	if (!mw || !in) {
		err = -ENOMEM;
		goto free;
	}

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);

	MLX5_SET(mkc, mkc, free, 1);
	MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
	MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
	MLX5_SET(mkc, mkc, umr_en, 1);
	MLX5_SET(mkc, mkc, lr, 1);
	MLX5_SET(mkc, mkc, access_mode, MLX5_ACCESS_MODE_KLM);
	MLX5_SET(mkc, mkc, en_rinval, !!((type == IB_MW_TYPE_2)));
	MLX5_SET(mkc, mkc, qpn, 0xffffff);

	err = mlx5_core_create_mkey(dev->mdev, &mw->mmkey, in, inlen);
	if (err)
		goto free;

	mw->ibmw.rkey = mw->mmkey.key;

	resp.response_length = min(offsetof(typeof(resp), response_length) +
				   sizeof(resp.response_length), udata->outlen);
	if (resp.response_length) {
		err = ib_copy_to_udata(udata, &resp, resp.response_length);
		if (err) {
			mlx5_core_destroy_mkey(dev->mdev, &mw->mmkey);
			goto free;
		}
	}

	kfree(in);
	return &mw->ibmw;

free:
	kfree(mw);
	kfree(in);
	return ERR_PTR(err);
}

int mlx5_ib_dealloc_mw(struct ib_mw *mw)
{
	struct mlx5_ib_mw *mmw = to_mmw(mw);
	int err;

	err = mlx5_core_destroy_mkey((to_mdev(mw->device))->mdev,
				     &mmw->mmkey);
	if (!err)
		kfree(mmw);
	return err;
}

int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
			    struct ib_mr_status *mr_status)
{
	struct mlx5_ib_mr *mmr = to_mmr(ibmr);
	int ret = 0;

	if (check_mask & ~IB_MR_CHECK_SIG_STATUS) {
		pr_err("Invalid status check mask\n");
		ret = -EINVAL;
		goto done;
	}

	mr_status->fail_status = 0;
	if (check_mask & IB_MR_CHECK_SIG_STATUS) {
		if (!mmr->sig) {
			ret = -EINVAL;
			pr_err("signature status check requested on a non-signature enabled MR\n");
			goto done;
		}

		mmr->sig->sig_status_checked = true;
		if (!mmr->sig->sig_err_exists)
			goto done;

		if (ibmr->lkey == mmr->sig->err_item.key)
			memcpy(&mr_status->sig_err, &mmr->sig->err_item,
			       sizeof(mr_status->sig_err));
		else {
			mr_status->sig_err.err_type = IB_SIG_BAD_GUARD;
			mr_status->sig_err.sig_err_offset = 0;
			mr_status->sig_err.key = mmr->sig->err_item.key;
		}

		mmr->sig->sig_err_exists = false;
		mr_status->fail_status |= IB_MR_CHECK_SIG_STATUS;
	}

done:
	return ret;
}

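/*
 * Translate a scatterlist into the MR's KLM descriptor array, honouring an
 * optional offset into the first entry.  Returns the number of entries
 * mapped, which may be less than sg_nents if the descriptor array fills up.
 */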
static int
mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr,
		   struct scatterlist *sgl,
		   unsigned short sg_nents,
		   unsigned int *sg_offset_p)
{
	struct scatterlist *sg = sgl;
	struct mlx5_klm *klms = mr->descs;
	unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0;
	u32 lkey = mr->ibmr.pd->local_dma_lkey;
	int i;

	mr->ibmr.iova = sg_dma_address(sg) + sg_offset;
	mr->ibmr.length = 0;
	mr->ndescs = sg_nents;

	for_each_sg(sgl, sg, sg_nents, i) {
		if (unlikely(i >= mr->max_descs))
			break;
		klms[i].va = cpu_to_be64(sg_dma_address(sg) + sg_offset);
		klms[i].bcount = cpu_to_be32(sg_dma_len(sg) - sg_offset);
		klms[i].key = cpu_to_be32(lkey);
		mr->ibmr.length += sg_dma_len(sg);

		sg_offset = 0;
	}

	if (sg_offset_p)
		*sg_offset_p = sg_offset;

	return i;
}

static int mlx5_set_page(struct ib_mr *ibmr, u64 addr)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	__be64 *descs;

	if (unlikely(mr->ndescs == mr->max_descs))
		return -ENOMEM;

	descs = mr->descs;
	descs[mr->ndescs++] = cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);

	return 0;
}

int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
		      unsigned int *sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	int n;

	mr->ndescs = 0;

	ib_dma_sync_single_for_cpu(ibmr->device, mr->desc_map,
				   mr->desc_size * mr->max_descs,
				   DMA_TO_DEVICE);

	if (mr->access_mode == MLX5_ACCESS_MODE_KLM)
		n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset);
	else
		n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset,
				mlx5_set_page);

	ib_dma_sync_single_for_device(ibmr->device, mr->desc_map,
				      mr->desc_size * mr->max_descs,
				      DMA_TO_DEVICE);

	return n;
}