mlx5_ib_mr.c revision 331769
1/*-
2 * Copyright (c) 2013-2015, Mellanox Technologies, Ltd.  All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 *    notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 *    notice, this list of conditions and the following disclaimer in the
11 *    documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16 * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23 * SUCH DAMAGE.
24 *
25 * $FreeBSD: stable/11/sys/dev/mlx5/mlx5_ib/mlx5_ib_mr.c 331769 2018-03-30 18:06:29Z hselasky $
26 */
27
28#include <linux/kref.h>
29#include <linux/random.h>
30#include <linux/delay.h>
31#include <linux/sched.h>
32#include <rdma/ib_umem.h>
33#include <rdma/ib_umem_odp.h>
34#include <rdma/ib_verbs.h>
35#include "mlx5_ib.h"
36
37enum {
38	MAX_PENDING_REG_MR = 8,
39};
40
41#define MLX5_UMR_ALIGN 2048
42#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
43static __be64 mlx5_ib_update_mtt_emergency_buffer[
44		MLX5_UMR_MTT_MIN_CHUNK_SIZE/sizeof(__be64)]
45	__aligned(MLX5_UMR_ALIGN);
46static DEFINE_MUTEX(mlx5_ib_update_mtt_emergency_buffer_mutex);
47#endif
48
49static int clean_mr(struct mlx5_ib_mr *mr);
50
51static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
52{
53	int err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey);
54
55#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
56	/* Wait until all page fault handlers using the mr complete. */
57	synchronize_srcu(&dev->mr_srcu);
58#endif
59
60	return err;
61}
62
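/*
 * Map an MR page order to an index into the MR cache entry array.
 * Cache entry i holds MRs of order (i + 2), so the smallest supported
 * order maps to index 0 and larger orders map linearly above it
 * (e.g. an order-5 request maps to entry index 3, whose MRs cover
 * 1 << 5 pages).
 */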
63static int order2idx(struct mlx5_ib_dev *dev, int order)
64{
65	struct mlx5_mr_cache *cache = &dev->cache;
66
67	if (order < cache->ent[0].order)
68		return 0;
69	else
70		return order - cache->ent[0].order;
71}
72
73static bool use_umr_mtt_update(struct mlx5_ib_mr *mr, u64 start, u64 length)
74{
75	return ((u64)1 << mr->order) * MLX5_ADAPTER_PAGE_SIZE >=
76		length + (start & (MLX5_ADAPTER_PAGE_SIZE - 1));
77}
78
79#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
80static void update_odp_mr(struct mlx5_ib_mr *mr)
81{
82	if (mr->umem->odp_data) {
83		/*
84		 * This barrier prevents the compiler from moving the
85		 * assignment of umem->odp_data->private (to point at our
86		 * MR) above the end of reg_umr, to ensure that the MR
87		 * initialization has finished before we start handling
88		 * invalidations.
89		 */
90		smp_wmb();
91		mr->umem->odp_data->private = mr;
92		/*
93		 * Make sure the new umem->odp_data->private value is
94		 * visible to the invalidation routines before page
95		 * faults can occur on the MR. Page faults can happen
96		 * once we put the MR in the tree, below this line.
97		 * Without the barrier, a page fault could be handled
98		 * and an invalidation could run before
99		 * umem->odp_data->private == mr is visible to the
100		 * invalidation handler.
101		 */
102		smp_wmb();
103	}
104}
105#endif
106
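/*
 * Completion callback for the asynchronous mkey creation issued by
 * add_keys().  On failure the MR is freed and cache refilling is
 * delayed via the delay timer.  On success the mkey is composed from
 * the returned index and a per-device variant byte, the MR is added
 * to its cache entry, and the mkey is inserted into the device's
 * mkey radix tree.
 */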
107static void reg_mr_callback(int status, void *context)
108{
109	struct mlx5_ib_mr *mr = context;
110	struct mlx5_ib_dev *dev = mr->dev;
111	struct mlx5_mr_cache *cache = &dev->cache;
112	int c = order2idx(dev, mr->order);
113	struct mlx5_cache_ent *ent = &cache->ent[c];
114	u8 key;
115	unsigned long flags;
116	struct mlx5_mr_table *table = &dev->mdev->priv.mr_table;
117	int err;
118
119	spin_lock_irqsave(&ent->lock, flags);
120	ent->pending--;
121	spin_unlock_irqrestore(&ent->lock, flags);
122	if (status) {
123		mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status);
124		kfree(mr);
125		dev->fill_delay = 1;
126		mod_timer(&dev->delay_timer, jiffies + HZ);
127		return;
128	}
129
130	spin_lock_irqsave(&dev->mdev->priv.mkey_lock, flags);
131	key = dev->mdev->priv.mkey_key++;
132	spin_unlock_irqrestore(&dev->mdev->priv.mkey_lock, flags);
133	mr->mmkey.key = mlx5_idx_to_mkey(MLX5_GET(create_mkey_out, mr->out, mkey_index)) | key;
134
135	cache->last_add = jiffies;
136
137	spin_lock_irqsave(&ent->lock, flags);
138	list_add_tail(&mr->list, &ent->head);
139	ent->cur++;
140	ent->size++;
141	spin_unlock_irqrestore(&ent->lock, flags);
142
143	spin_lock_irqsave(&table->lock, flags);
144	err = radix_tree_insert(&table->tree, mlx5_mkey_to_idx(mr->mmkey.key),
145				&mr->mmkey);
146	if (err)
147		pr_err("Error inserting to mkey tree. 0x%x\n", -err);
148	spin_unlock_irqrestore(&table->lock, flags);
149}
150
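/*
 * Post up to 'num' asynchronous mkey creation commands for cache
 * entry 'c'.  At most MAX_PENDING_REG_MR commands may be outstanding
 * per entry; beyond that -EAGAIN is returned so the caller can retry
 * later.  Completions are handled in reg_mr_callback().
 */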
151static int add_keys(struct mlx5_ib_dev *dev, int c, int num)
152{
153	struct mlx5_mr_cache *cache = &dev->cache;
154	struct mlx5_cache_ent *ent = &cache->ent[c];
155	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
156	struct mlx5_ib_mr *mr;
157	int npages = 1 << ent->order;
158	void *mkc;
159	u32 *in;
160	int err = 0;
161	int i;
162
163	in = kzalloc(inlen, GFP_KERNEL);
164	if (!in)
165		return -ENOMEM;
166
167	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
168	for (i = 0; i < num; i++) {
169		if (ent->pending >= MAX_PENDING_REG_MR) {
170			err = -EAGAIN;
171			break;
172		}
173
174		mr = kzalloc(sizeof(*mr), GFP_KERNEL);
175		if (!mr) {
176			err = -ENOMEM;
177			break;
178		}
179		mr->order = ent->order;
180		mr->umred = 1;
181		mr->dev = dev;
182
183		MLX5_SET(mkc, mkc, free, 1);
184		MLX5_SET(mkc, mkc, umr_en, 1);
185		MLX5_SET(mkc, mkc, access_mode, MLX5_ACCESS_MODE_MTT);
186
187		MLX5_SET(mkc, mkc, qpn, 0xffffff);
188		MLX5_SET(mkc, mkc, translations_octword_size, (npages + 1) / 2);
189		MLX5_SET(mkc, mkc, log_page_size, 12);
190
191		spin_lock_irq(&ent->lock);
192		ent->pending++;
193		spin_unlock_irq(&ent->lock);
194		err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey,
195					    (struct mlx5_create_mkey_mbox_in *)in,
196					    inlen, reg_mr_callback, mr,
197					    (struct mlx5_create_mkey_mbox_out *)mr->out);
198		if (err) {
199			spin_lock_irq(&ent->lock);
200			ent->pending--;
201			spin_unlock_irq(&ent->lock);
202			mlx5_ib_warn(dev, "create mkey failed %d\n", err);
203			kfree(mr);
204			break;
205		}
206	}
207
208	kfree(in);
209	return err;
210}
211
212static void remove_keys(struct mlx5_ib_dev *dev, int c, int num)
213{
214	struct mlx5_mr_cache *cache = &dev->cache;
215	struct mlx5_cache_ent *ent = &cache->ent[c];
216	struct mlx5_ib_mr *mr;
217	int err;
218	int i;
219
220	for (i = 0; i < num; i++) {
221		spin_lock_irq(&ent->lock);
222		if (list_empty(&ent->head)) {
223			spin_unlock_irq(&ent->lock);
224			return;
225		}
226		mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
227		list_del(&mr->list);
228		ent->cur--;
229		ent->size--;
230		spin_unlock_irq(&ent->lock);
231		err = destroy_mkey(dev, mr);
232		if (err)
233			mlx5_ib_warn(dev, "failed to destroy mkey\n");
234		else
235			kfree(mr);
236	}
237}
238
239static int someone_adding(struct mlx5_mr_cache *cache)
240{
241	int i;
242
243	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
244		if (cache->ent[i].cur < cache->ent[i].limit)
245			return 1;
246	}
247
248	return 0;
249}
250
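/*
 * Per-entry cache maintenance.  Grow the entry while it holds fewer
 * than twice its limit (backing off when mkey creation returns
 * -EAGAIN or fails), and shrink it when it holds more than twice its
 * limit, but only as a garbage-collection step: when the CPU is not
 * needed elsewhere, no entry is below its limit, and nothing has
 * been added to the cache for a while.
 */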
251static void __cache_work_func(struct mlx5_cache_ent *ent)
252{
253	struct mlx5_ib_dev *dev = ent->dev;
254	struct mlx5_mr_cache *cache = &dev->cache;
255	int i = order2idx(dev, ent->order);
256	int err;
257
258	if (cache->stopped)
259		return;
260
261	ent = &dev->cache.ent[i];
262	if (ent->cur < 2 * ent->limit && !dev->fill_delay) {
263		err = add_keys(dev, i, 1);
264		if (ent->cur < 2 * ent->limit) {
265			if (err == -EAGAIN) {
266				mlx5_ib_dbg(dev, "returned eagain, order %d\n",
267					    i + 2);
268				queue_delayed_work(cache->wq, &ent->dwork,
269						   msecs_to_jiffies(3));
270			} else if (err) {
271				mlx5_ib_warn(dev, "command failed order %d, err %d\n",
272					     i + 2, err);
273				queue_delayed_work(cache->wq, &ent->dwork,
274						   msecs_to_jiffies(1000));
275			} else {
276				queue_work(cache->wq, &ent->work);
277			}
278		}
279	} else if (ent->cur > 2 * ent->limit) {
280		/*
281		 * The remove_keys() logic is performed as a garbage collection
282		 * task. Such a task is intended to run when no other active
283		 * processes are running.
284		 *
285		 * need_resched() returns true if there are user tasks to be
286		 * activated in the near future.
287		 *
288		 * In that case, we don't execute remove_keys() and postpone
289		 * the garbage collection work to the next cycle, in order to
290		 * free CPU resources for other tasks.
291		 */
292		if (!need_resched() && !someone_adding(cache) &&
293		    time_after(jiffies, cache->last_add + 300 * HZ)) {
294			remove_keys(dev, i, 1);
295			if (ent->cur > ent->limit)
296				queue_work(cache->wq, &ent->work);
297		} else {
298			queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ);
299		}
300	}
301}
302
303static void delayed_cache_work_func(struct work_struct *work)
304{
305	struct mlx5_cache_ent *ent;
306
307	ent = container_of(work, struct mlx5_cache_ent, dwork.work);
308	__cache_work_func(ent);
309}
310
311static void cache_work_func(struct work_struct *work)
312{
313	struct mlx5_cache_ent *ent;
314
315	ent = container_of(work, struct mlx5_cache_ent, work);
316	__cache_work_func(ent);
317}
318
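/*
 * Take an MR from the cache entry matching 'order', or from the next
 * larger non-empty entry.  Entries found empty or left below their
 * limit get their refill work queued; a complete miss is accounted
 * in the requested entry.
 */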
319static struct mlx5_ib_mr *alloc_cached_mr(struct mlx5_ib_dev *dev, int order)
320{
321	struct mlx5_mr_cache *cache = &dev->cache;
322	struct mlx5_ib_mr *mr = NULL;
323	struct mlx5_cache_ent *ent;
324	int c;
325	int i;
326
327	c = order2idx(dev, order);
328	if (c < 0 || c >= MAX_MR_CACHE_ENTRIES) {
329		mlx5_ib_warn(dev, "order %d, cache index %d\n", order, c);
330		return NULL;
331	}
332
333	for (i = c; i < MAX_MR_CACHE_ENTRIES; i++) {
334		ent = &cache->ent[i];
335
336		mlx5_ib_dbg(dev, "order %d, cache index %d\n", ent->order, i);
337
338		spin_lock_irq(&ent->lock);
339		if (!list_empty(&ent->head)) {
340			mr = list_first_entry(&ent->head, struct mlx5_ib_mr,
341					      list);
342			list_del(&mr->list);
343			ent->cur--;
344			spin_unlock_irq(&ent->lock);
345			if (ent->cur < ent->limit)
346				queue_work(cache->wq, &ent->work);
347			break;
348		}
349		spin_unlock_irq(&ent->lock);
350
351		queue_work(cache->wq, &ent->work);
352	}
353
354	if (!mr)
355		cache->ent[c].miss++;
356
357	return mr;
358}
359
360static void free_cached_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
361{
362	struct mlx5_mr_cache *cache = &dev->cache;
363	struct mlx5_cache_ent *ent;
364	int shrink = 0;
365	int c;
366
367	c = order2idx(dev, mr->order);
368	if (c < 0 || c >= MAX_MR_CACHE_ENTRIES) {
369		mlx5_ib_warn(dev, "order %d, cache index %d\n", mr->order, c);
370		return;
371	}
372	ent = &cache->ent[c];
373	spin_lock_irq(&ent->lock);
374	list_add_tail(&mr->list, &ent->head);
375	ent->cur++;
376	if (ent->cur > 2 * ent->limit)
377		shrink = 1;
378	spin_unlock_irq(&ent->lock);
379
380	if (shrink)
381		queue_work(cache->wq, &ent->work);
382}
383
384static void clean_keys(struct mlx5_ib_dev *dev, int c)
385{
386	struct mlx5_mr_cache *cache = &dev->cache;
387	struct mlx5_cache_ent *ent = &cache->ent[c];
388	struct mlx5_ib_mr *mr;
389	int err;
390
391	cancel_delayed_work(&ent->dwork);
392	while (1) {
393		spin_lock_irq(&ent->lock);
394		if (list_empty(&ent->head)) {
395			spin_unlock_irq(&ent->lock);
396			return;
397		}
398		mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
399		list_del(&mr->list);
400		ent->cur--;
401		ent->size--;
402		spin_unlock_irq(&ent->lock);
403		err = destroy_mkey(dev, mr);
404		if (err)
405			mlx5_ib_warn(dev, "failed to destroy mkey\n");
406		else
407			kfree(mr);
408	}
409}
410
411static void delay_time_func(unsigned long ctx)
412{
413	struct mlx5_ib_dev *dev = (struct mlx5_ib_dev *)ctx;
414
415	dev->fill_delay = 0;
416}
417
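/*
 * Create the MR cache: an ordered workqueue, a delay timer used to
 * throttle refilling after failures, and one cache entry per
 * supported page order (entry i serves order i + 2).  Entry limits
 * come from the device profile when the MR cache mask is set (zero
 * otherwise); initial fill work is queued for every entry.
 */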
418int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
419{
420	struct mlx5_mr_cache *cache = &dev->cache;
421	struct mlx5_cache_ent *ent;
422	int limit;
423	int i;
424
425	mutex_init(&dev->slow_path_mutex);
426	cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM);
427	if (!cache->wq) {
428		mlx5_ib_warn(dev, "failed to create work queue\n");
429		return -ENOMEM;
430	}
431
432	setup_timer(&dev->delay_timer, delay_time_func, (unsigned long)dev);
433	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
434		INIT_LIST_HEAD(&cache->ent[i].head);
435		spin_lock_init(&cache->ent[i].lock);
436
437		ent = &cache->ent[i];
438		INIT_LIST_HEAD(&ent->head);
439		spin_lock_init(&ent->lock);
440		ent->order = i + 2;
441		ent->dev = dev;
442
443		if (dev->mdev->profile->mask & MLX5_PROF_MASK_MR_CACHE)
444			limit = dev->mdev->profile->mr_cache[i].limit;
445		else
446			limit = 0;
447
448		INIT_WORK(&ent->work, cache_work_func);
449		INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
450		ent->limit = limit;
451		queue_work(cache->wq, &ent->work);
452	}
453
454	return 0;
455}
456
457int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev)
458{
459	int i;
460
461	dev->cache.stopped = 1;
462	flush_workqueue(dev->cache.wq);
463
464	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++)
465		clean_keys(dev, i);
466
467	destroy_workqueue(dev->cache.wq);
468	del_timer_sync(&dev->delay_timer);
469
470	return 0;
471}
472
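/*
 * Register a DMA MR covering the whole address space (length64 with
 * a zero start address) in pass-through (PA) access mode, with the
 * access rights requested in 'acc'.
 */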
473struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc)
474{
475	struct mlx5_ib_dev *dev = to_mdev(pd->device);
476	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
477	struct mlx5_core_dev *mdev = dev->mdev;
478	struct mlx5_ib_mr *mr;
479	void *mkc;
480	u32 *in;
481	int err;
482
483	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
484	if (!mr)
485		return ERR_PTR(-ENOMEM);
486
487	in = kzalloc(inlen, GFP_KERNEL);
488	if (!in) {
489		err = -ENOMEM;
490		goto err_free;
491	}
492
493	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
494
495	MLX5_SET(mkc, mkc, access_mode, MLX5_ACCESS_MODE_PA);
496	MLX5_SET(mkc, mkc, a, !!(acc & IB_ACCESS_REMOTE_ATOMIC));
497	MLX5_SET(mkc, mkc, rw, !!(acc & IB_ACCESS_REMOTE_WRITE));
498	MLX5_SET(mkc, mkc, rr, !!(acc & IB_ACCESS_REMOTE_READ));
499	MLX5_SET(mkc, mkc, lw, !!(acc & IB_ACCESS_LOCAL_WRITE));
500	MLX5_SET(mkc, mkc, lr, 1);
501
502	MLX5_SET(mkc, mkc, length64, 1);
503	MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
504	MLX5_SET(mkc, mkc, qpn, 0xffffff);
505	MLX5_SET64(mkc, mkc, start_addr, 0);
506
507	err = mlx5_core_create_mkey(mdev, &mr->mmkey,
508				    (struct mlx5_create_mkey_mbox_in *)in,
509				    inlen, NULL, NULL, NULL);
510	if (err)
511		goto err_in;
512
513	kfree(in);
514	mr->ibmr.lkey = mr->mmkey.key;
515	mr->ibmr.rkey = mr->mmkey.key;
516	mr->umem = NULL;
517
518	return &mr->ibmr;
519
520err_in:
521	kfree(in);
522
523err_free:
524	kfree(mr);
525
526	return ERR_PTR(err);
527}
528
529static int get_octo_len(u64 addr, u64 len, int page_size)
530{
531	u64 offset;
532	int npages;
533
534	offset = addr & (page_size - 1);
535	npages = ALIGN(len + offset, page_size) >> ilog2(page_size);
536	return (npages + 1) / 2;
537}
538
539static int use_umr(int order)
540{
541	return order <= MLX5_MAX_UMR_SHIFT;
542}
543
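/*
 * Build the physical address (PAS) array for a UMR: allocate a
 * buffer aligned to MLX5_UMR_ALIGN and padded to a multiple of
 * MLX5_UMR_MTT_ALIGNMENT, fill it from the umem, zero the padding,
 * and DMA map it for the UMR WQE.
 */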
544static int dma_map_mr_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
545			  int npages, int page_shift, int *size,
546			  __be64 **mr_pas, dma_addr_t *dma)
547{
548	__be64 *pas;
549	struct device *ddev = dev->ib_dev.dma_device;
550
551	/*
552	 * UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes.
553	 * To avoid copying garbage after the pas array, we allocate
554	 * a little more.
555	 */
556	*size = ALIGN(sizeof(u64) * npages, MLX5_UMR_MTT_ALIGNMENT);
557	*mr_pas = kmalloc(*size + MLX5_UMR_ALIGN - 1, GFP_KERNEL);
558	if (!(*mr_pas))
559		return -ENOMEM;
560
561	pas = PTR_ALIGN(*mr_pas, MLX5_UMR_ALIGN);
562	mlx5_ib_populate_pas(dev, umem, page_shift, pas, MLX5_IB_MTT_PRESENT);
563	/* Clear padding after the actual pages. */
564	memset(pas + npages, 0, *size - npages * sizeof(u64));
565
566	*dma = dma_map_single(ddev, pas, *size, DMA_TO_DEVICE);
567	if (dma_mapping_error(ddev, *dma)) {
568		kfree(*mr_pas);
569		return -ENOMEM;
570	}
571
572	return 0;
573}
574
575static void prep_umr_wqe_common(struct ib_pd *pd, struct ib_send_wr *wr,
576				struct ib_sge *sg, u64 dma, int n, u32 key,
577				int page_shift)
578{
579	struct mlx5_ib_dev *dev = to_mdev(pd->device);
580	struct mlx5_umr_wr *umrwr = umr_wr(wr);
581
582	sg->addr = dma;
583	sg->length = ALIGN(sizeof(u64) * n, 64);
584	sg->lkey = dev->umrc.pd->local_dma_lkey;
585
586	wr->next = NULL;
587	wr->sg_list = sg;
588	if (n)
589		wr->num_sge = 1;
590	else
591		wr->num_sge = 0;
592
593	wr->opcode = MLX5_IB_WR_UMR;
594
595	umrwr->npages = n;
596	umrwr->page_shift = page_shift;
597	umrwr->mkey = key;
598}
599
600static void prep_umr_reg_wqe(struct ib_pd *pd, struct ib_send_wr *wr,
601			     struct ib_sge *sg, u64 dma, int n, u32 key,
602			     int page_shift, u64 virt_addr, u64 len,
603			     int access_flags)
604{
605	struct mlx5_umr_wr *umrwr = umr_wr(wr);
606
607	prep_umr_wqe_common(pd, wr, sg, dma, n, key, page_shift);
608
609	wr->send_flags = 0;
610
611	umrwr->target.virt_addr = virt_addr;
612	umrwr->length = len;
613	umrwr->access_flags = access_flags;
614	umrwr->pd = pd;
615}
616
617static void prep_umr_unreg_wqe(struct mlx5_ib_dev *dev,
618			       struct ib_send_wr *wr, u32 key)
619{
620	struct mlx5_umr_wr *umrwr = umr_wr(wr);
621
622	wr->send_flags = MLX5_IB_SEND_UMR_UNREG | MLX5_IB_SEND_UMR_FAIL_IF_FREE;
623	wr->opcode = MLX5_IB_WR_UMR;
624	umrwr->mkey = key;
625}
626
627static struct ib_umem *mr_umem_get(struct ib_pd *pd, u64 start, u64 length,
628				   int access_flags, int *npages,
629				   int *page_shift, int *ncont, int *order)
630{
631	struct mlx5_ib_dev *dev = to_mdev(pd->device);
632	struct ib_umem *umem = ib_umem_get(pd->uobject->context, start, length,
633					   access_flags, 0);
634	if (IS_ERR(umem)) {
635		mlx5_ib_err(dev, "umem get failed (%ld)\n", PTR_ERR(umem));
636		return (void *)umem;
637	}
638
639	mlx5_ib_cont_pages(umem, start, npages, page_shift, ncont, order);
640	if (!*npages) {
641		mlx5_ib_warn(dev, "avoid zero region\n");
642		ib_umem_release(umem);
643		return ERR_PTR(-EINVAL);
644	}
645
646	mlx5_ib_dbg(dev, "npages %d, ncont %d, order %d, page_shift %d\n",
647		    *npages, *ncont, *order, *page_shift);
648
649	return umem;
650}
651
652static void mlx5_ib_umr_done(struct ib_cq *cq, struct ib_wc *wc)
653{
654	struct mlx5_ib_umr_context *context =
655		container_of(wc->wr_cqe, struct mlx5_ib_umr_context, cqe);
656
657	context->status = wc->status;
658	complete(&context->done);
659}
660
661static inline void mlx5_ib_init_umr_context(struct mlx5_ib_umr_context *context)
662{
663	context->cqe.done = mlx5_ib_umr_done;
664	context->status = -1;
665	init_completion(&context->done);
666}
667
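/*
 * Fast-path registration: take a pre-created mkey from the MR cache
 * (triggering a refill on a miss), DMA map the page list, and post a
 * UMR work request on the dedicated UMR QP to bind the mkey to the
 * user buffer.  The caller sleeps on the UMR completion under the
 * umrc semaphore.
 */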
668static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem,
669				  u64 virt_addr, u64 len, int npages,
670				  int page_shift, int order, int access_flags)
671{
672	struct mlx5_ib_dev *dev = to_mdev(pd->device);
673	struct device *ddev = dev->ib_dev.dma_device;
674	struct umr_common *umrc = &dev->umrc;
675	struct mlx5_ib_umr_context umr_context;
676	struct mlx5_umr_wr umrwr = {};
677	struct ib_send_wr *bad;
678	struct mlx5_ib_mr *mr;
679	struct ib_sge sg;
680	int size;
681	__be64 *mr_pas;
682	dma_addr_t dma;
683	int err = 0;
684	int i;
685
686	for (i = 0; i < 1; i++) {
687		mr = alloc_cached_mr(dev, order);
688		if (mr)
689			break;
690
691		err = add_keys(dev, order2idx(dev, order), 1);
692		if (err && err != -EAGAIN) {
693			mlx5_ib_warn(dev, "add_keys failed, err %d\n", err);
694			break;
695		}
696	}
697
698	if (!mr)
699		return ERR_PTR(-EAGAIN);
700
701	err = dma_map_mr_pas(dev, umem, npages, page_shift, &size, &mr_pas,
702			     &dma);
703	if (err)
704		goto free_mr;
705
706	mlx5_ib_init_umr_context(&umr_context);
707
708	umrwr.wr.wr_cqe = &umr_context.cqe;
709	prep_umr_reg_wqe(pd, &umrwr.wr, &sg, dma, npages, mr->mmkey.key,
710			 page_shift, virt_addr, len, access_flags);
711
712	down(&umrc->sem);
713	err = ib_post_send(umrc->qp, &umrwr.wr, &bad);
714	if (err) {
715		mlx5_ib_warn(dev, "post send failed, err %d\n", err);
716		goto unmap_dma;
717	} else {
718		wait_for_completion(&umr_context.done);
719		if (umr_context.status != IB_WC_SUCCESS) {
720			mlx5_ib_warn(dev, "reg umr failed\n");
721			err = -EFAULT;
722		}
723	}
724
725	mr->mmkey.iova = virt_addr;
726	mr->mmkey.size = len;
727	mr->mmkey.pd = to_mpd(pd)->pdn;
728
729	mr->live = 1;
730
731unmap_dma:
732	up(&umrc->sem);
733	dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE);
734
735	kfree(mr_pas);
736
737free_mr:
738	if (err) {
739		free_cached_mr(dev, mr);
740		return ERR_PTR(err);
741	}
742
743	return mr;
744}
745
746#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
747int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index, int npages,
748		       int zap)
749{
750	struct mlx5_ib_dev *dev = mr->dev;
751	struct device *ddev = dev->ib_dev.dma_device;
752	struct umr_common *umrc = &dev->umrc;
753	struct mlx5_ib_umr_context umr_context;
754	struct ib_umem *umem = mr->umem;
755	int size;
756	__be64 *pas;
757	dma_addr_t dma;
758	struct ib_send_wr *bad;
759	struct mlx5_umr_wr wr;
760	struct ib_sge sg;
761	int err = 0;
762	const int page_index_alignment = MLX5_UMR_MTT_ALIGNMENT / sizeof(u64);
763	const int page_index_mask = page_index_alignment - 1;
764	size_t pages_mapped = 0;
765	size_t pages_to_map = 0;
766	size_t pages_iter = 0;
767	int use_emergency_buf = 0;
768
769	/* UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes,
770	 * so we need to align the offset and length accordingly */
771	if (start_page_index & page_index_mask) {
772		npages += start_page_index & page_index_mask;
773		start_page_index &= ~page_index_mask;
774	}
775
776	pages_to_map = ALIGN(npages, page_index_alignment);
777
778	if (start_page_index + pages_to_map > MLX5_MAX_UMR_PAGES)
779		return -EINVAL;
780
781	size = sizeof(u64) * pages_to_map;
782	size = min_t(int, PAGE_SIZE, size);
783	/* We allocate with GFP_ATOMIC to avoid recursion into page-reclaim
784	 * code, when we are called from an invalidation. The pas buffer must
785	 * be 2k-aligned for Connect-IB. */
786	pas = (__be64 *)get_zeroed_page(GFP_ATOMIC);
787	if (!pas) {
788		mlx5_ib_warn(dev, "unable to allocate memory during MTT update, falling back to slower chunked mechanism.\n");
789		pas = mlx5_ib_update_mtt_emergency_buffer;
790		size = MLX5_UMR_MTT_MIN_CHUNK_SIZE;
791		use_emergency_buf = 1;
792		mutex_lock(&mlx5_ib_update_mtt_emergency_buffer_mutex);
793		memset(pas, 0, size);
794	}
795	pages_iter = size / sizeof(u64);
796	dma = dma_map_single(ddev, pas, size, DMA_TO_DEVICE);
797	if (dma_mapping_error(ddev, dma)) {
798		mlx5_ib_err(dev, "unable to map DMA during MTT update.\n");
799		err = -ENOMEM;
800		goto free_pas;
801	}
802
803	for (pages_mapped = 0;
804	     pages_mapped < pages_to_map && !err;
805	     pages_mapped += pages_iter, start_page_index += pages_iter) {
806		dma_sync_single_for_cpu(ddev, dma, size, DMA_TO_DEVICE);
807
808		npages = min_t(size_t,
809			       pages_iter,
810			       ib_umem_num_pages(umem) - start_page_index);
811
812		if (!zap) {
813			__mlx5_ib_populate_pas(dev, umem, PAGE_SHIFT,
814					       start_page_index, npages, pas,
815					       MLX5_IB_MTT_PRESENT);
816			/* Clear padding after the pages brought from the
817			 * umem. */
818			memset(pas + npages, 0, size - npages * sizeof(u64));
819		}
820
821		dma_sync_single_for_device(ddev, dma, size, DMA_TO_DEVICE);
822
823		mlx5_ib_init_umr_context(&umr_context);
824
825		memset(&wr, 0, sizeof(wr));
826		wr.wr.wr_cqe = &umr_context.cqe;
827
828		sg.addr = dma;
829		sg.length = ALIGN(npages * sizeof(u64),
830				MLX5_UMR_MTT_ALIGNMENT);
831		sg.lkey = dev->umrc.pd->local_dma_lkey;
832
833		wr.wr.send_flags = MLX5_IB_SEND_UMR_FAIL_IF_FREE |
834				MLX5_IB_SEND_UMR_UPDATE_MTT;
835		wr.wr.sg_list = &sg;
836		wr.wr.num_sge = 1;
837		wr.wr.opcode = MLX5_IB_WR_UMR;
838		wr.npages = sg.length / sizeof(u64);
839		wr.page_shift = PAGE_SHIFT;
840		wr.mkey = mr->mmkey.key;
841		wr.target.offset = start_page_index;
842
843		down(&umrc->sem);
844		err = ib_post_send(umrc->qp, &wr.wr, &bad);
845		if (err) {
846			mlx5_ib_err(dev, "UMR post send failed, err %d\n", err);
847		} else {
848			wait_for_completion(&umr_context.done);
849			if (umr_context.status != IB_WC_SUCCESS) {
850				mlx5_ib_err(dev, "UMR completion failed, code %d\n",
851					    umr_context.status);
852				err = -EFAULT;
853			}
854		}
855		up(&umrc->sem);
856	}
857	dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE);
858
859free_pas:
860	if (!use_emergency_buf)
861		free_page((unsigned long)pas);
862	else
863		mutex_unlock(&mlx5_ib_update_mtt_emergency_buffer_mutex);
864
865	return err;
866}
867#endif
868
869/*
870 * If ibmr is NULL, reg_create() allocates a new MR structure.
871 * Otherwise, the given ibmr is reused.
872 */
873static struct mlx5_ib_mr *reg_create(struct ib_mr *ibmr, struct ib_pd *pd,
874				     u64 virt_addr, u64 length,
875				     struct ib_umem *umem, int npages,
876				     int page_shift, int access_flags)
877{
878	struct mlx5_ib_dev *dev = to_mdev(pd->device);
879	struct mlx5_ib_mr *mr;
880	__be64 *pas;
881	void *mkc;
882	int inlen;
883	u32 *in;
884	int err;
885	bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg));
886
887	mr = ibmr ? to_mmr(ibmr) : kzalloc(sizeof(*mr), GFP_KERNEL);
888	if (!mr)
889		return ERR_PTR(-ENOMEM);
890
891	inlen = MLX5_ST_SZ_BYTES(create_mkey_in) +
892		sizeof(*pas) * ((npages + 1) / 2) * 2;
893	in = mlx5_vzalloc(inlen);
894	if (!in) {
895		err = -ENOMEM;
896		goto err_1;
897	}
898	pas = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
899	mlx5_ib_populate_pas(dev, umem, page_shift, pas,
900			     pg_cap ? MLX5_IB_MTT_PRESENT : 0);
901
902	/* The pg_access bit allows setting the access flags
903	 * in the page list submitted with the command. */
904	MLX5_SET(create_mkey_in, in, pg_access, !!(pg_cap));
905
906	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
907	MLX5_SET(mkc, mkc, access_mode, MLX5_ACCESS_MODE_MTT);
908	MLX5_SET(mkc, mkc, a, !!(access_flags & IB_ACCESS_REMOTE_ATOMIC));
909	MLX5_SET(mkc, mkc, rw, !!(access_flags & IB_ACCESS_REMOTE_WRITE));
910	MLX5_SET(mkc, mkc, rr, !!(access_flags & IB_ACCESS_REMOTE_READ));
911	MLX5_SET(mkc, mkc, lw, !!(access_flags & IB_ACCESS_LOCAL_WRITE));
912	MLX5_SET(mkc, mkc, lr, 1);
913
914	MLX5_SET64(mkc, mkc, start_addr, virt_addr);
915	MLX5_SET64(mkc, mkc, len, length);
916	MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
917	MLX5_SET(mkc, mkc, bsf_octword_size, 0);
918	MLX5_SET(mkc, mkc, translations_octword_size,
919		 get_octo_len(virt_addr, length, 1 << page_shift));
920	MLX5_SET(mkc, mkc, log_page_size, page_shift);
921	MLX5_SET(mkc, mkc, qpn, 0xffffff);
922	MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
923		 get_octo_len(virt_addr, length, 1 << page_shift));
924
925	err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey,
926				    (struct mlx5_create_mkey_mbox_in *)in,
927				    inlen, NULL, NULL, NULL);
928	if (err) {
929		mlx5_ib_warn(dev, "create mkey failed\n");
930		goto err_2;
931	}
932	mr->umem = umem;
933	mr->dev = dev;
934	mr->live = 1;
935	kvfree(in);
936
937	mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmkey.key);
938
939	return mr;
940
941err_2:
942	kvfree(in);
943
944err_1:
945	if (!ibmr)
946		kfree(mr);
947
948	return ERR_PTR(err);
949}
950
951static void set_mr_fields(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
952			  int npages, u64 length, int access_flags)
953{
954	mr->npages = npages;
955	atomic_add(npages, &dev->mdev->priv.reg_pages);
956	mr->ibmr.lkey = mr->mmkey.key;
957	mr->ibmr.rkey = mr->mmkey.key;
958	mr->ibmr.length = length;
959	mr->access_flags = access_flags;
960}
961
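/*
 * ib_reg_user_mr entry point.  Pin the user memory, then register it
 * either through the UMR fast path (small enough orders, using a
 * cached mkey) or, if that is not possible or the cache is empty,
 * through the FW command slow path in reg_create() under
 * slow_path_mutex.
 */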
962struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
963				  u64 virt_addr, int access_flags,
964				  struct ib_udata *udata)
965{
966	struct mlx5_ib_dev *dev = to_mdev(pd->device);
967	struct mlx5_ib_mr *mr = NULL;
968	struct ib_umem *umem;
969	int page_shift;
970	int npages;
971	int ncont;
972	int order;
973	int err;
974
975	mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n",
976		    (long long)start, (long long)virt_addr, (long long)length, access_flags);
977	umem = mr_umem_get(pd, start, length, access_flags, &npages,
978			   &page_shift, &ncont, &order);
979
980	if (IS_ERR(umem))
981		return (void *)umem;
982
983	if (use_umr(order)) {
984		mr = reg_umr(pd, umem, virt_addr, length, ncont, page_shift,
985			     order, access_flags);
986		if (PTR_ERR(mr) == -EAGAIN) {
987			mlx5_ib_dbg(dev, "cache empty for order %d", order);
988			mr = NULL;
989		}
990	} else if (access_flags & IB_ACCESS_ON_DEMAND) {
991		err = -EINVAL;
992		pr_err("Got MR registration for ODP MR > 512MB, not supported for Connect-IB");
993		goto error;
994	}
995
996	if (!mr) {
997		mutex_lock(&dev->slow_path_mutex);
998		mr = reg_create(NULL, pd, virt_addr, length, umem, ncont,
999				page_shift, access_flags);
1000		mutex_unlock(&dev->slow_path_mutex);
1001	}
1002
1003	if (IS_ERR(mr)) {
1004		err = PTR_ERR(mr);
1005		goto error;
1006	}
1007
1008	mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);
1009
1010	mr->umem = umem;
1011	set_mr_fields(dev, mr, npages, length, access_flags);
1012
1013#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
1014	update_odp_mr(mr);
1015#endif
1016
1017	return &mr->ibmr;
1018
1019error:
1020	ib_umem_release(umem);
1021	return ERR_PTR(err);
1022}
1023
1024static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
1025{
1026	struct mlx5_core_dev *mdev = dev->mdev;
1027	struct umr_common *umrc = &dev->umrc;
1028	struct mlx5_ib_umr_context umr_context;
1029	struct mlx5_umr_wr umrwr = {};
1030	struct ib_send_wr *bad;
1031	int err;
1032
1033	if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
1034		return 0;
1035
1036	mlx5_ib_init_umr_context(&umr_context);
1037
1038	umrwr.wr.wr_cqe = &umr_context.cqe;
1039	prep_umr_unreg_wqe(dev, &umrwr.wr, mr->mmkey.key);
1040
1041	down(&umrc->sem);
1042	err = ib_post_send(umrc->qp, &umrwr.wr, &bad);
1043	if (err) {
1044		up(&umrc->sem);
1045		mlx5_ib_dbg(dev, "err %d\n", err);
1046		goto error;
1047	} else {
1048		wait_for_completion(&umr_context.done);
1049		up(&umrc->sem);
1050	}
1051	if (umr_context.status != IB_WC_SUCCESS) {
1052		mlx5_ib_warn(dev, "unreg umr failed\n");
1053		err = -EFAULT;
1054		goto error;
1055	}
1056	return 0;
1057
1058error:
1059	return err;
1060}
1061
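/*
 * Re-register an existing UMR-capable mkey in place: post a UMR WQE
 * that updates the translation (a newly mapped PAS array), the PD
 * and/or the access flags, depending on the IB_MR_REREG_* bits in
 * 'flags'.
 */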
1062static int rereg_umr(struct ib_pd *pd, struct mlx5_ib_mr *mr, u64 virt_addr,
1063		     u64 length, int npages, int page_shift, int order,
1064		     int access_flags, int flags)
1065{
1066	struct mlx5_ib_dev *dev = to_mdev(pd->device);
1067	struct device *ddev = dev->ib_dev.dma_device;
1068	struct mlx5_ib_umr_context umr_context;
1069	struct ib_send_wr *bad;
1070	struct mlx5_umr_wr umrwr = {};
1071	struct ib_sge sg;
1072	struct umr_common *umrc = &dev->umrc;
1073	dma_addr_t dma = 0;
1074	__be64 *mr_pas = NULL;
1075	int size;
1076	int err;
1077
1078	mlx5_ib_init_umr_context(&umr_context);
1079
1080	umrwr.wr.wr_cqe = &umr_context.cqe;
1081	umrwr.wr.send_flags = MLX5_IB_SEND_UMR_FAIL_IF_FREE;
1082
1083	if (flags & IB_MR_REREG_TRANS) {
1084		err = dma_map_mr_pas(dev, mr->umem, npages, page_shift, &size,
1085				     &mr_pas, &dma);
1086		if (err)
1087			return err;
1088
1089		umrwr.target.virt_addr = virt_addr;
1090		umrwr.length = length;
1091		umrwr.wr.send_flags |= MLX5_IB_SEND_UMR_UPDATE_TRANSLATION;
1092	}
1093
1094	prep_umr_wqe_common(pd, &umrwr.wr, &sg, dma, npages, mr->mmkey.key,
1095			    page_shift);
1096
1097	if (flags & IB_MR_REREG_PD) {
1098		umrwr.pd = pd;
1099		umrwr.wr.send_flags |= MLX5_IB_SEND_UMR_UPDATE_PD;
1100	}
1101
1102	if (flags & IB_MR_REREG_ACCESS) {
1103		umrwr.access_flags = access_flags;
1104		umrwr.wr.send_flags |= MLX5_IB_SEND_UMR_UPDATE_ACCESS;
1105	}
1106
1107	/* post send request to UMR QP */
1108	down(&umrc->sem);
1109	err = ib_post_send(umrc->qp, &umrwr.wr, &bad);
1110
1111	if (err) {
1112		mlx5_ib_warn(dev, "post send failed, err %d\n", err);
1113	} else {
1114		wait_for_completion(&umr_context.done);
1115		if (umr_context.status != IB_WC_SUCCESS) {
1116			mlx5_ib_warn(dev, "reg umr failed (%u)\n",
1117				     umr_context.status);
1118			err = -EFAULT;
1119		}
1120	}
1121
1122	up(&umrc->sem);
1123	if (flags & IB_MR_REREG_TRANS) {
1124		dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE);
1125		kfree(mr_pas);
1126	}
1127	return err;
1128}
1129
1130int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
1131			  u64 length, u64 virt_addr, int new_access_flags,
1132			  struct ib_pd *new_pd, struct ib_udata *udata)
1133{
1134	struct mlx5_ib_dev *dev = to_mdev(ib_mr->device);
1135	struct mlx5_ib_mr *mr = to_mmr(ib_mr);
1136	struct ib_pd *pd = (flags & IB_MR_REREG_PD) ? new_pd : ib_mr->pd;
1137	int access_flags = flags & IB_MR_REREG_ACCESS ?
1138			    new_access_flags :
1139			    mr->access_flags;
1140	u64 addr = (flags & IB_MR_REREG_TRANS) ? virt_addr : mr->umem->address;
1141	u64 len = (flags & IB_MR_REREG_TRANS) ? length : mr->umem->length;
1142	int page_shift = 0;
1143	int npages = 0;
1144	int ncont = 0;
1145	int order = 0;
1146	int err;
1147
1148	mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n",
1149		    (long long)start, (long long)virt_addr, (long long)length, access_flags);
1150
1151	if (flags != IB_MR_REREG_PD) {
1152		/*
1153		 * Replace umem. This needs to be done whether or not UMR is
1154		 * used.
1155		 */
1156		flags |= IB_MR_REREG_TRANS;
1157		ib_umem_release(mr->umem);
1158		mr->umem = mr_umem_get(pd, addr, len, access_flags, &npages,
1159				       &page_shift, &ncont, &order);
1160		if (IS_ERR(mr->umem)) {
1161			err = PTR_ERR(mr->umem);
1162			mr->umem = NULL;
1163			return err;
1164		}
1165	}
1166
1167	if (flags & IB_MR_REREG_TRANS && !use_umr_mtt_update(mr, addr, len)) {
1168		/*
1169		 * UMR can't be used - MKey needs to be replaced.
1170		 */
1171		if (mr->umred) {
1172			err = unreg_umr(dev, mr);
1173			if (err)
1174				mlx5_ib_warn(dev, "Failed to unregister MR\n");
1175		} else {
1176			err = destroy_mkey(dev, mr);
1177			if (err)
1178				mlx5_ib_warn(dev, "Failed to destroy MKey\n");
1179		}
1180		if (err)
1181			return err;
1182
1183		mr = reg_create(ib_mr, pd, addr, len, mr->umem, ncont,
1184				page_shift, access_flags);
1185
1186		if (IS_ERR(mr))
1187			return PTR_ERR(mr);
1188
1189		mr->umred = 0;
1190	} else {
1191		/*
1192		 * Send a UMR WQE
1193		 */
1194		err = rereg_umr(pd, mr, addr, len, npages, page_shift,
1195				order, access_flags, flags);
1196		if (err) {
1197			mlx5_ib_warn(dev, "Failed to rereg UMR\n");
1198			return err;
1199		}
1200	}
1201
1202	if (flags & IB_MR_REREG_PD) {
1203		ib_mr->pd = pd;
1204		mr->mmkey.pd = to_mpd(pd)->pdn;
1205	}
1206
1207	if (flags & IB_MR_REREG_ACCESS)
1208		mr->access_flags = access_flags;
1209
1210	if (flags & IB_MR_REREG_TRANS) {
1211		atomic_sub(mr->npages, &dev->mdev->priv.reg_pages);
1212		set_mr_fields(dev, mr, npages, len, access_flags);
1213		mr->mmkey.iova = addr;
1214		mr->mmkey.size = len;
1215	}
1216#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
1217	update_odp_mr(mr);
1218#endif
1219
1220	return 0;
1221}
1222
1223static int
1224mlx5_alloc_priv_descs(struct ib_device *device,
1225		      struct mlx5_ib_mr *mr,
1226		      int ndescs,
1227		      int desc_size)
1228{
1229	int size = ndescs * desc_size;
1230	int add_size;
1231	int ret;
1232
1233	add_size = max_t(int, MLX5_UMR_ALIGN - 1, 0);
1234
1235	mr->descs_alloc = kzalloc(size + add_size, GFP_KERNEL);
1236	if (!mr->descs_alloc)
1237		return -ENOMEM;
1238
1239	mr->descs = PTR_ALIGN(mr->descs_alloc, MLX5_UMR_ALIGN);
1240
1241	mr->desc_map = dma_map_single(device->dma_device, mr->descs,
1242				      size, DMA_TO_DEVICE);
1243	if (dma_mapping_error(device->dma_device, mr->desc_map)) {
1244		ret = -ENOMEM;
1245		goto err;
1246	}
1247
1248	return 0;
1249err:
1250	kfree(mr->descs_alloc);
1251
1252	return ret;
1253}
1254
1255static void
1256mlx5_free_priv_descs(struct mlx5_ib_mr *mr)
1257{
1258	if (mr->descs) {
1259		struct ib_device *device = mr->ibmr.device;
1260		int size = mr->max_descs * mr->desc_size;
1261
1262		dma_unmap_single(device->dma_device, mr->desc_map,
1263				 size, DMA_TO_DEVICE);
1264		kfree(mr->descs_alloc);
1265		mr->descs = NULL;
1266	}
1267}
1268
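/*
 * Release the HW resources of an MR: destroy its signature PSVs and
 * private descriptors if present, then either destroy the mkey and
 * free the MR (non-cached MRs) or invalidate it with a UMR and
 * return it to the MR cache (cached MRs).
 */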
1269static int clean_mr(struct mlx5_ib_mr *mr)
1270{
1271	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
1272	int umred = mr->umred;
1273	int err;
1274
1275	if (mr->sig) {
1276		if (mlx5_core_destroy_psv(dev->mdev,
1277					  mr->sig->psv_memory.psv_idx))
1278			mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
1279				     mr->sig->psv_memory.psv_idx);
1280		if (mlx5_core_destroy_psv(dev->mdev,
1281					  mr->sig->psv_wire.psv_idx))
1282			mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
1283				     mr->sig->psv_wire.psv_idx);
1284		kfree(mr->sig);
1285		mr->sig = NULL;
1286	}
1287
1288	mlx5_free_priv_descs(mr);
1289
1290	if (!umred) {
1291		err = destroy_mkey(dev, mr);
1292		if (err) {
1293			mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n",
1294				     mr->mmkey.key, err);
1295			return err;
1296		}
1297	} else {
1298		err = unreg_umr(dev, mr);
1299		if (err) {
1300			mlx5_ib_warn(dev, "failed unregister\n");
1301			return err;
1302		}
1303		free_cached_mr(dev, mr);
1304	}
1305
1306	if (!umred)
1307		kfree(mr);
1308
1309	return 0;
1310}
1311
1312CTASSERT(sizeof(((struct ib_phys_buf *)0)->size) == 8);
1313
1314struct ib_mr *
1315mlx5_ib_reg_phys_mr(struct ib_pd *pd,
1316		    struct ib_phys_buf *buffer_list,
1317		    int num_phys_buf,
1318		    int access_flags,
1319		    u64 *virt_addr)
1320{
1321	struct mlx5_ib_dev *dev = to_mdev(pd->device);
1322	struct mlx5_ib_mr *mr;
1323	__be64 *pas;
1324	void *mkc;
1325	u32 *in;
1326	u64 total_size;
1327	u32 octo_len;
1328	bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg));
1329	unsigned long mask;
1330	int shift;
1331	int npages;
1332	int inlen;
1333	int err;
1334	int i, j, n;
1335
1336	mask = buffer_list[0].addr ^ *virt_addr;
1337	total_size = 0;
1338	for (i = 0; i < num_phys_buf; ++i) {
1339		if (i != 0)
1340			mask |= buffer_list[i].addr;
1341		if (i != num_phys_buf - 1)
1342			mask |= buffer_list[i].addr + buffer_list[i].size;
1343
1344		total_size += buffer_list[i].size;
1345	}
1346
1347	if (mask & ~PAGE_MASK)
1348		return ERR_PTR(-EINVAL);
1349
1350	shift = __ffs(mask | 1 << 31);
1351
1352	buffer_list[0].size += buffer_list[0].addr & ((1ULL << shift) - 1);
1353	buffer_list[0].addr &= ~0ULL << shift;
1354
1355	npages = 0;
1356	for (i = 0; i < num_phys_buf; ++i)
1357		npages += (buffer_list[i].size + (1ULL << shift) - 1) >> shift;
1358
1359	if (!npages) {
1360		mlx5_ib_warn(dev, "avoid zero region\n");
1361		return ERR_PTR(-EINVAL);
1362	}
1363
1364	mr = kzalloc(sizeof *mr, GFP_KERNEL);
1365	if (!mr)
1366		return ERR_PTR(-ENOMEM);
1367
1368	octo_len = get_octo_len(*virt_addr, total_size, 1ULL << shift);
1369	octo_len = ALIGN(octo_len, 4);
1370
1371	inlen = MLX5_ST_SZ_BYTES(create_mkey_in) + (octo_len * 16);
1372	in = mlx5_vzalloc(inlen);
1373	if (!in) {
1374		kfree(mr);
1375		return ERR_PTR(-ENOMEM);
1376	}
1377	pas = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
1378
1379	n = 0;
1380	for (i = 0; i < num_phys_buf; ++i) {
1381		for (j = 0;
1382		     j < (buffer_list[i].size + (1ULL << shift) - 1) >> shift;
1383		     ++j) {
1384			u64 temp = buffer_list[i].addr + ((u64) j << shift);
1385			if (pg_cap)
1386				temp |= MLX5_IB_MTT_PRESENT;
1387			pas[n++] = cpu_to_be64(temp);
1388		}
1389	}
1390
1391	/*
1392	 * The MLX5_MKEY_INBOX_PG_ACCESS bit allows setting the access
1393	 * flags in the page list submitted with the command:
1394	 */
1395	MLX5_SET(create_mkey_in, in, pg_access, !!(pg_cap));
1396
1397	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1398	MLX5_SET(mkc, mkc, access_mode, MLX5_ACCESS_MODE_MTT);
1399	MLX5_SET(mkc, mkc, a, !!(access_flags & IB_ACCESS_REMOTE_ATOMIC));
1400	MLX5_SET(mkc, mkc, rw, !!(access_flags & IB_ACCESS_REMOTE_WRITE));
1401	MLX5_SET(mkc, mkc, rr, !!(access_flags & IB_ACCESS_REMOTE_READ));
1402	MLX5_SET(mkc, mkc, lw, !!(access_flags & IB_ACCESS_LOCAL_WRITE));
1403	MLX5_SET(mkc, mkc, lr, 1);
1404
1405	MLX5_SET64(mkc, mkc, start_addr, *virt_addr);
1406	MLX5_SET64(mkc, mkc, len, total_size);
1407	MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
1408	MLX5_SET(mkc, mkc, bsf_octword_size, 0);
1409	MLX5_SET(mkc, mkc, translations_octword_size, octo_len);
1410	MLX5_SET(mkc, mkc, log_page_size, shift);
1411	MLX5_SET(mkc, mkc, qpn, 0xffffff);
1412	MLX5_SET(create_mkey_in, in, translations_octword_actual_size, octo_len);
1413
1414	err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey,
1415				    (struct mlx5_create_mkey_mbox_in *)in, inlen,
1416				    NULL, NULL, NULL);
1417	mr->umem = NULL;
1418	mr->dev = dev;
1419	mr->live = 1;
1420	mr->npages = npages;
1421	mr->ibmr.lkey = mr->mmkey.key;
1422	mr->ibmr.rkey = mr->mmkey.key;
1423	mr->ibmr.length = total_size;
1424	mr->access_flags = access_flags;
1425
1426	kvfree(in);
1427
1428	if (err) {
1429		kfree(mr);
1430		return ERR_PTR(err);
1431	}
1432	return &mr->ibmr;
1433}
1434
1435int mlx5_ib_dereg_mr(struct ib_mr *ibmr)
1436{
1437	struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
1438	struct mlx5_ib_mr *mr = to_mmr(ibmr);
1439	int npages = mr->npages;
1440	struct ib_umem *umem = mr->umem;
1441
1442#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
1443	if (umem && umem->odp_data) {
1444		/* Prevent new page faults from succeeding */
1445		mr->live = 0;
1446		/* Wait for all running page-fault handlers to finish. */
1447		synchronize_srcu(&dev->mr_srcu);
1448		/* Destroy all page mappings */
1449		mlx5_ib_invalidate_range(umem, ib_umem_start(umem),
1450					 ib_umem_end(umem));
1451		/*
1452		 * We kill the umem before the MR for ODP,
1453		 * so that there will not be any invalidations in
1454		 * flight, looking at the *mr struct.
1455		 */
1456		ib_umem_release(umem);
1457		atomic_sub(npages, &dev->mdev->priv.reg_pages);
1458
1459		/* Avoid double-freeing the umem. */
1460		umem = NULL;
1461	}
1462#endif
1463
1464	clean_mr(mr);
1465
1466	if (umem) {
1467		ib_umem_release(umem);
1468		atomic_sub(npages, &dev->mdev->priv.reg_pages);
1469	}
1470
1471	return 0;
1472}
1473
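/*
 * Allocate an MR for fast registration.  IB_MR_TYPE_MEM_REG uses MTT
 * translation with a private array of page descriptors,
 * IB_MR_TYPE_SG_GAPS uses KLM descriptors, and IB_MR_TYPE_SIGNATURE
 * additionally creates the memory and wire PSVs and enables the BSF.
 */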
1474struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd,
1475			       enum ib_mr_type mr_type,
1476			       u32 max_num_sg)
1477{
1478	struct mlx5_ib_dev *dev = to_mdev(pd->device);
1479	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1480	int ndescs = ALIGN(max_num_sg, 4);
1481	struct mlx5_ib_mr *mr;
1482	void *mkc;
1483	u32 *in;
1484	int err;
1485
1486	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1487	if (!mr)
1488		return ERR_PTR(-ENOMEM);
1489
1490	in = kzalloc(inlen, GFP_KERNEL);
1491	if (!in) {
1492		err = -ENOMEM;
1493		goto err_free;
1494	}
1495
1496	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1497	MLX5_SET(mkc, mkc, free, 1);
1498	MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
1499	MLX5_SET(mkc, mkc, qpn, 0xffffff);
1500	MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
1501
1502	if (mr_type == IB_MR_TYPE_MEM_REG) {
1503		mr->access_mode = MLX5_ACCESS_MODE_MTT;
1504		MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
1505		err = mlx5_alloc_priv_descs(pd->device, mr,
1506					    ndescs, sizeof(u64));
1507		if (err)
1508			goto err_free_in;
1509
1510		mr->desc_size = sizeof(u64);
1511		mr->max_descs = ndescs;
1512	} else if (mr_type == IB_MR_TYPE_SG_GAPS) {
1513		mr->access_mode = MLX5_ACCESS_MODE_KLM;
1514
1515		err = mlx5_alloc_priv_descs(pd->device, mr,
1516					    ndescs, sizeof(struct mlx5_klm));
1517		if (err)
1518			goto err_free_in;
1519		mr->desc_size = sizeof(struct mlx5_klm);
1520		mr->max_descs = ndescs;
1521	} else if (mr_type == IB_MR_TYPE_SIGNATURE) {
1522		u32 psv_index[2];
1523
1524		MLX5_SET(mkc, mkc, bsf_en, 1);
1525		MLX5_SET(mkc, mkc, bsf_octword_size, MLX5_MKEY_BSF_OCTO_SIZE);
1526		mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL);
1527		if (!mr->sig) {
1528			err = -ENOMEM;
1529			goto err_free_in;
1530		}
1531
1532		/* create mem & wire PSVs */
1533		err = mlx5_core_create_psv(dev->mdev, to_mpd(pd)->pdn,
1534					   2, psv_index);
1535		if (err)
1536			goto err_free_sig;
1537
1538		mr->access_mode = MLX5_ACCESS_MODE_KLM;
1539		mr->sig->psv_memory.psv_idx = psv_index[0];
1540		mr->sig->psv_wire.psv_idx = psv_index[1];
1541
1542		mr->sig->sig_status_checked = true;
1543		mr->sig->sig_err_exists = false;
1544		/* Next UMR, Arm SIGERR */
1545		++mr->sig->sigerr_count;
1546	} else {
1547		mlx5_ib_warn(dev, "Invalid mr type %d\n", mr_type);
1548		err = -EINVAL;
1549		goto err_free_in;
1550	}
1551
1552	MLX5_SET(mkc, mkc, access_mode, mr->access_mode);
1553	MLX5_SET(mkc, mkc, umr_en, 1);
1554
1555	err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey,
1556				    (struct mlx5_create_mkey_mbox_in *)in,
1557				    inlen, NULL, NULL, NULL);
1558	if (err)
1559		goto err_destroy_psv;
1560
1561	mr->ibmr.lkey = mr->mmkey.key;
1562	mr->ibmr.rkey = mr->mmkey.key;
1563	mr->umem = NULL;
1564	kfree(in);
1565
1566	return &mr->ibmr;
1567
1568err_destroy_psv:
1569	if (mr->sig) {
1570		if (mlx5_core_destroy_psv(dev->mdev,
1571					  mr->sig->psv_memory.psv_idx))
1572			mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
1573				     mr->sig->psv_memory.psv_idx);
1574		if (mlx5_core_destroy_psv(dev->mdev,
1575					  mr->sig->psv_wire.psv_idx))
1576			mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
1577				     mr->sig->psv_wire.psv_idx);
1578	}
1579	mlx5_free_priv_descs(mr);
1580err_free_sig:
1581	kfree(mr->sig);
1582err_free_in:
1583	kfree(in);
1584err_free:
1585	kfree(mr);
1586	return ERR_PTR(err);
1587}
1588
1589struct ib_mw *mlx5_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type,
1590			       struct ib_udata *udata)
1591{
1592	struct mlx5_ib_dev *dev = to_mdev(pd->device);
1593	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1594	struct mlx5_ib_mw *mw = NULL;
1595	u32 *in = NULL;
1596	void *mkc;
1597	int ndescs;
1598	int err;
1599	struct mlx5_ib_alloc_mw req = {};
1600	struct {
1601		__u32	comp_mask;
1602		__u32	response_length;
1603	} resp = {};
1604
1605	err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req)));
1606	if (err)
1607		return ERR_PTR(err);
1608
1609	if (req.comp_mask || req.reserved1 || req.reserved2)
1610		return ERR_PTR(-EOPNOTSUPP);
1611
1612	if (udata->inlen > sizeof(req) &&
1613	    !ib_is_udata_cleared(udata, sizeof(req),
1614				 udata->inlen - sizeof(req)))
1615		return ERR_PTR(-EOPNOTSUPP);
1616
1617	ndescs = req.num_klms ? roundup(req.num_klms, 4) : roundup(1, 4);
1618
1619	mw = kzalloc(sizeof(*mw), GFP_KERNEL);
1620	in = kzalloc(inlen, GFP_KERNEL);
1621	if (!mw || !in) {
1622		err = -ENOMEM;
1623		goto free;
1624	}
1625
1626	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1627
1628	MLX5_SET(mkc, mkc, free, 1);
1629	MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
1630	MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
1631	MLX5_SET(mkc, mkc, umr_en, 1);
1632	MLX5_SET(mkc, mkc, lr, 1);
1633	MLX5_SET(mkc, mkc, access_mode, MLX5_ACCESS_MODE_KLM);
1634	MLX5_SET(mkc, mkc, en_rinval, !!((type == IB_MW_TYPE_2)));
1635	MLX5_SET(mkc, mkc, qpn, 0xffffff);
1636
1637	err = mlx5_core_create_mkey(dev->mdev, &mw->mmkey,
1638				    (struct mlx5_create_mkey_mbox_in *)in,
1639				    inlen, NULL, NULL, NULL);
1640	if (err)
1641		goto free;
1642
1643	mw->ibmw.rkey = mw->mmkey.key;
1644
1645	resp.response_length = min(offsetof(typeof(resp), response_length) +
1646				   sizeof(resp.response_length), udata->outlen);
1647	if (resp.response_length) {
1648		err = ib_copy_to_udata(udata, &resp, resp.response_length);
1649		if (err) {
1650			mlx5_core_destroy_mkey(dev->mdev, &mw->mmkey);
1651			goto free;
1652		}
1653	}
1654
1655	kfree(in);
1656	return &mw->ibmw;
1657
1658free:
1659	kfree(mw);
1660	kfree(in);
1661	return ERR_PTR(err);
1662}
1663
1664int mlx5_ib_dealloc_mw(struct ib_mw *mw)
1665{
1666	struct mlx5_ib_mw *mmw = to_mmw(mw);
1667	int err;
1668
1669	err = mlx5_core_destroy_mkey((to_mdev(mw->device))->mdev,
1670				      &mmw->mmkey);
1671	if (!err)
1672		kfree(mmw);
1673	return err;
1674}
1675
1676int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
1677			    struct ib_mr_status *mr_status)
1678{
1679	struct mlx5_ib_mr *mmr = to_mmr(ibmr);
1680	int ret = 0;
1681
1682	if (check_mask & ~IB_MR_CHECK_SIG_STATUS) {
1683		pr_err("Invalid status check mask\n");
1684		ret = -EINVAL;
1685		goto done;
1686	}
1687
1688	mr_status->fail_status = 0;
1689	if (check_mask & IB_MR_CHECK_SIG_STATUS) {
1690		if (!mmr->sig) {
1691			ret = -EINVAL;
1692			pr_err("signature status check requested on a non-signature enabled MR\n");
1693			goto done;
1694		}
1695
1696		mmr->sig->sig_status_checked = true;
1697		if (!mmr->sig->sig_err_exists)
1698			goto done;
1699
1700		if (ibmr->lkey == mmr->sig->err_item.key)
1701			memcpy(&mr_status->sig_err, &mmr->sig->err_item,
1702			       sizeof(mr_status->sig_err));
1703		else {
1704			mr_status->sig_err.err_type = IB_SIG_BAD_GUARD;
1705			mr_status->sig_err.sig_err_offset = 0;
1706			mr_status->sig_err.key = mmr->sig->err_item.key;
1707		}
1708
1709		mmr->sig->sig_err_exists = false;
1710		mr_status->fail_status |= IB_MR_CHECK_SIG_STATUS;
1711	}
1712
1713done:
1714	return ret;
1715}
1716
1717static int
1718mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr,
1719		   struct scatterlist *sgl,
1720		   unsigned short sg_nents,
1721		   unsigned int *sg_offset_p)
1722{
1723	struct scatterlist *sg = sgl;
1724	struct mlx5_klm *klms = mr->descs;
1725	unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0;
1726	u32 lkey = mr->ibmr.pd->local_dma_lkey;
1727	int i;
1728
1729	mr->ibmr.iova = sg_dma_address(sg) + sg_offset;
1730	mr->ibmr.length = 0;
1731	mr->ndescs = sg_nents;
1732
1733	for_each_sg(sgl, sg, sg_nents, i) {
1734		if (unlikely(i > mr->max_descs))
1735			break;
1736		klms[i].va = cpu_to_be64(sg_dma_address(sg) + sg_offset);
1737		klms[i].bcount = cpu_to_be32(sg_dma_len(sg) - sg_offset);
1738		klms[i].key = cpu_to_be32(lkey);
1739		mr->ibmr.length += sg_dma_len(sg);
1740
1741		sg_offset = 0;
1742	}
1743
1744	if (sg_offset_p)
1745		*sg_offset_p = sg_offset;
1746
1747	return i;
1748}
1749
1750static int mlx5_set_page(struct ib_mr *ibmr, u64 addr)
1751{
1752	struct mlx5_ib_mr *mr = to_mmr(ibmr);
1753	__be64 *descs;
1754
1755	if (unlikely(mr->ndescs == mr->max_descs))
1756		return -ENOMEM;
1757
1758	descs = mr->descs;
1759	descs[mr->ndescs++] = cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);
1760
1761	return 0;
1762}
1763
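/*
 * Map a scatterlist into the MR's descriptor array for a subsequent
 * fast-registration work request: KLM descriptors for KLM-mode MRs,
 * otherwise page (MTT) descriptors via ib_sg_to_pages().  The
 * descriptor buffer is DMA synced around the update.
 */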
1764int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
1765		      unsigned int *sg_offset)
1766{
1767	struct mlx5_ib_mr *mr = to_mmr(ibmr);
1768	int n;
1769
1770	mr->ndescs = 0;
1771
1772	ib_dma_sync_single_for_cpu(ibmr->device, mr->desc_map,
1773				   mr->desc_size * mr->max_descs,
1774				   DMA_TO_DEVICE);
1775
1776	if (mr->access_mode == MLX5_ACCESS_MODE_KLM)
1777		n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset);
1778	else
1779		n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset,
1780				mlx5_set_page);
1781
1782	ib_dma_sync_single_for_device(ibmr->device, mr->desc_map,
1783				      mr->desc_size * mr->max_descs,
1784				      DMA_TO_DEVICE);
1785
1786	return n;
1787}
1788