/*-
 * Copyright (c) 2013-2015, Mellanox Technologies, Ltd.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: stable/10/sys/dev/mlx5/mlx5_ib/mlx5_ib_mr.c 323223 2017-09-06 15:33:23Z hselasky $
 */

#include <linux/compiler.h>
#include <linux/kref.h>
#include <linux/random.h>
#include <linux/fs.h>
#include <linux/delay.h>
#include <rdma/ib_umem.h>
#include "mlx5_ib.h"

CTASSERT((uintptr_t)PAGE_MASK > (uintptr_t)PAGE_SIZE);

enum {
	MAX_PENDING_REG_MR = 8,
	MAX_MR_RELEASE_TIMEOUT = (60 * 20) /* Allow release timeout up to 20 min */
};

#define MLX5_UMR_ALIGN 2048

static int mlx5_mr_sysfs_init(struct mlx5_ib_dev *dev);
static void mlx5_mr_sysfs_cleanup(struct mlx5_ib_dev *dev);

static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
{
	int err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr);

	return err;
}

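/*
 * Map an MR allocation order to its index in the MR cache array.
 * Orders below the smallest cached order map to index 0.
 */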
static int order2idx(struct mlx5_ib_dev *dev, int order)
{
	struct mlx5_mr_cache *cache = &dev->cache;

	if (order < cache->ent[0].order)
		return 0;
	else
		return order - cache->ent[0].order;
}

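/*
 * Completion callback for the asynchronous CREATE_MKEY commands issued
 * by add_keys().  On success the new MR is assigned a variant key,
 * added to its cache bucket and inserted into the device mkey radix
 * tree; on failure the MR is freed and cache refill is delayed.
 */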
static void reg_mr_callback(int status, void *context)
{
	struct mlx5_ib_mr *mr = context;
	struct mlx5_ib_dev *dev = mr->dev;
	struct mlx5_mr_cache *cache = &dev->cache;
	int c = order2idx(dev, mr->order);
	struct mlx5_cache_ent *ent = &cache->ent[c];
	struct mlx5_core_dev *mdev = dev->mdev;
	struct mlx5_core_mr *mmr = &mr->mmr;
	struct mlx5_mr_table *table = &dev->mdev->priv.mr_table;
	unsigned long flags;
	int err;
	u8 key;

	spin_lock_irqsave(&ent->lock, flags);
	ent->pending--;
	spin_unlock_irqrestore(&ent->lock, flags);
	if (status) {
		mlx5_ib_warn(dev, "async reg mr failed. status %d, order %d\n", status, ent->order);
		kfree(mr);
		dev->fill_delay = 1;
		mod_timer(&dev->delay_timer, jiffies + HZ);
		return;
	}

	if (mr->out.hdr.status) {
		mlx5_ib_warn(dev, "failed - status %d, syndrome 0x%x\n",
			     mr->out.hdr.status,
			     be32_to_cpu(mr->out.hdr.syndrome));
		kfree(mr);
		dev->fill_delay = 1;
		mod_timer(&dev->delay_timer, jiffies + HZ);
		return;
	}

	spin_lock_irqsave(&dev->mdev->priv.mkey_lock, flags);
	key = dev->mdev->priv.mkey_key++;
	spin_unlock_irqrestore(&dev->mdev->priv.mkey_lock, flags);
	mmr->key = mlx5_idx_to_mkey(be32_to_cpu(mr->out.mkey) & 0xffffff) | key;
	mlx5_ib_dbg(dev, "callbacked mkey 0x%x created\n",
		    be32_to_cpu(mr->out.mkey));

	cache->last_add = jiffies;

	spin_lock_irqsave(&ent->lock, flags);
	list_add_tail(&mr->list, &ent->head);
	ent->cur++;
	ent->size++;
	spin_unlock_irqrestore(&ent->lock, flags);

	spin_lock_irqsave(&table->lock, flags);
	err = radix_tree_insert(&table->tree, mlx5_mkey_to_idx(mmr->key), mmr);
	spin_unlock_irqrestore(&table->lock, flags);
	if (err) {
		mlx5_ib_warn(dev, "failed radix tree insert of mkey 0x%x, %d\n",
			     mmr->key, err);
		mlx5_core_destroy_mkey(mdev, mmr);
	}
}

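/*
 * Post up to 'num' asynchronous mkey creation commands for cache entry
 * 'c'.  The number of outstanding commands per entry is capped by
 * MAX_PENDING_REG_MR; completions are handled by reg_mr_callback().
 */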
static int add_keys(struct mlx5_ib_dev *dev, int c, int num)
{
	struct mlx5_mr_cache *cache = &dev->cache;
	struct mlx5_cache_ent *ent = &cache->ent[c];
	struct mlx5_create_mkey_mbox_in *in;
	struct mlx5_ib_mr *mr;
	int npages = 1 << ent->order;
	int err = 0;
	int i;

	in = kzalloc(sizeof(*in), GFP_KERNEL);
	if (!in)
		return -ENOMEM;

	for (i = 0; i < num; i++) {
		if (ent->pending >= MAX_PENDING_REG_MR) {
			err = -EAGAIN;
			break;
		}

		mr = kzalloc(sizeof(*mr), GFP_KERNEL);
		if (!mr) {
			err = -ENOMEM;
			break;
		}
		mr->order = ent->order;
		mr->umred = 1;
		mr->dev = dev;
		in->seg.status = MLX5_MKEY_STATUS_FREE;
		in->seg.xlt_oct_size = cpu_to_be32((npages + 1) / 2);
		in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
		in->seg.flags = MLX5_ACCESS_MODE_MTT | MLX5_PERM_UMR_EN;
		in->seg.log2_page_size = 12;

		spin_lock_irq(&ent->lock);
		ent->pending++;
		spin_unlock_irq(&ent->lock);
		err = mlx5_core_create_mkey(dev->mdev, &mr->mmr, in,
					    sizeof(*in), reg_mr_callback,
					    mr, &mr->out);
		if (err) {
			spin_lock_irq(&ent->lock);
			ent->pending--;
			spin_unlock_irq(&ent->lock);
			mlx5_ib_warn(dev, "create mkey failed %d\n", err);
			kfree(mr);
			break;
		}
	}

	kfree(in);
	return err;
}

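/*
 * Remove up to 'num' MRs from the head of cache entry 'c' and destroy
 * their mkeys.  Stops early if the entry runs empty.
 */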
static void remove_keys(struct mlx5_ib_dev *dev, int c, int num)
{
	struct mlx5_mr_cache *cache = &dev->cache;
	struct mlx5_cache_ent *ent = &cache->ent[c];
	struct mlx5_ib_mr *mr;
	int err;
	int i;

	for (i = 0; i < num; i++) {
		spin_lock_irq(&ent->lock);
		if (list_empty(&ent->head)) {
			spin_unlock_irq(&ent->lock);
			return;
		}
		mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
		list_del(&mr->list);
		ent->cur--;
		ent->size--;
		spin_unlock_irq(&ent->lock);
		err = destroy_mkey(dev, mr);
		if (err)
			mlx5_ib_warn(dev, "failed destroy mkey\n");
		else
			kfree(mr);
	}
}

static int someone_adding(struct mlx5_mr_cache *cache)
{
	int i;

	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
		if (cache->ent[i].cur < cache->ent[i].limit)
			return 1;
	}

	return 0;
}

static int someone_releasing(struct mlx5_mr_cache *cache)
{
	int i;

	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
		if (cache->ent[i].cur > 2 * cache->ent[i].limit)
			return 1;
	}

	return 0;
}

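/*
 * Cache maintenance work, shared by the immediate and delayed work
 * items.  Refills an entry while it holds fewer than twice its limit,
 * and shrinks it when it holds more, honouring the rel_imm and
 * rel_timeout controls exposed through sysfs.
 */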
static void __cache_work_func(struct mlx5_cache_ent *ent)
{
	struct mlx5_ib_dev *dev = ent->dev;
	struct mlx5_mr_cache *cache = &dev->cache;
	int i = order2idx(dev, ent->order);
	int err;
	s64 dtime;

	if (cache->stopped)
		return;

	ent = &dev->cache.ent[i];
	if (ent->cur < 2 * ent->limit && !dev->fill_delay) {
		err = add_keys(dev, i, 1);
		if (ent->cur < 2 * ent->limit) {
			if (err == -EAGAIN) {
				mlx5_ib_dbg(dev, "returned eagain, order %d\n",
					    i + 2);
				cancel_delayed_work(&ent->dwork);
				if (!queue_delayed_work(cache->wq, &ent->dwork,
							msecs_to_jiffies(3)))
					mlx5_ib_warn(dev, "failed queueing delayed work\n");
			} else if (err) {
				mlx5_ib_warn(dev, "command failed order %d, err %d\n",
					     i + 2, err);
				cancel_delayed_work(&ent->dwork);
				if (!queue_delayed_work(cache->wq, &ent->dwork,
							msecs_to_jiffies(1000)))
					mlx5_ib_warn(dev, "failed queueing delayed work\n");
			} else {
				if (!queue_work(cache->wq, &ent->work))
					mlx5_ib_warn(dev, "failed queueing work\n");
			}
		}
	} else if (ent->cur > 2 * ent->limit) {
		dtime = (cache->last_add + (s64)cache->rel_timeout * HZ) - jiffies;
		if (cache->rel_imm ||
		    (cache->rel_timeout >= 0 && !someone_adding(cache) && dtime <= 0)) {
			remove_keys(dev, i, 1);
			if (ent->cur > ent->limit)
				if (!queue_work(cache->wq, &ent->work))
					mlx5_ib_warn(dev, "failed queueing work\n");
		} else if (cache->rel_timeout >= 0) {
			dtime = max_t(s64, dtime, 0);
			dtime = min_t(s64, dtime, (MAX_MR_RELEASE_TIMEOUT * HZ));
			cancel_delayed_work(&ent->dwork);
			if (!queue_delayed_work(cache->wq, &ent->dwork, dtime))
				mlx5_ib_warn(dev, "failed queueing delayed work\n");
		}
	} else if (cache->rel_imm && !someone_releasing(cache)) {
		cache->rel_imm = 0;
	}
}

static void delayed_cache_work_func(struct work_struct *work)
{
	struct mlx5_cache_ent *ent;

	ent = container_of(work, struct mlx5_cache_ent, dwork.work);
	__cache_work_func(ent);
}

static void cache_work_func(struct work_struct *work)
{
	struct mlx5_cache_ent *ent;

	ent = container_of(work, struct mlx5_cache_ent, work);
	__cache_work_func(ent);
}

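/*
 * Return a cache-allocated MR to its bucket and trigger shrink work if
 * the bucket has grown beyond twice its limit.
 */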
static void free_cached_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
{
	struct mlx5_mr_cache *cache = &dev->cache;
	struct mlx5_cache_ent *ent;
	int shrink = 0;
	int c;

	c = order2idx(dev, mr->order);
	if (c < 0 || c >= MAX_MR_CACHE_ENTRIES) {
		mlx5_ib_warn(dev, "order %d, cache index %d\n", mr->order, c);
		return;
	}
	ent = &cache->ent[c];
	spin_lock_irq(&ent->lock);
	list_add_tail(&mr->list, &ent->head);
	ent->cur++;
	if (ent->cur > 2 * ent->limit)
		shrink = 1;
	spin_unlock_irq(&ent->lock);

	if (shrink)
		if (!queue_work(cache->wq, &ent->work))
			mlx5_ib_warn(dev, "failed queueing work\n");
}

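/*
 * Drain cache entry 'c', destroying every mkey it holds.  Used during
 * cache teardown.
 */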
static void clean_keys(struct mlx5_ib_dev *dev, int c)
{
	struct mlx5_mr_cache *cache = &dev->cache;
	struct mlx5_cache_ent *ent = &cache->ent[c];
	struct mlx5_ib_mr *mr;
	int err;

	cancel_delayed_work(&ent->dwork);
	while (1) {
		spin_lock_irq(&ent->lock);
		if (list_empty(&ent->head)) {
			spin_unlock_irq(&ent->lock);
			return;
		}
		mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
		list_del(&mr->list);
		ent->cur--;
		ent->size--;
		spin_unlock_irq(&ent->lock);
		err = destroy_mkey(dev, mr);
		if (err)
			mlx5_ib_warn(dev, "failed destroy mkey 0x%x from order %d\n",
				     mr->mmr.key, ent->order);
		else
			kfree(mr);
	}
}

static void delay_time_func(unsigned long ctx)
{
	struct mlx5_ib_dev *dev = (struct mlx5_ib_dev *)ctx;

	dev->fill_delay = 0;
}

enum {
	MLX5_VF_MR_LIMIT	= 2,
};

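/*
 * Create the MR cache: one workqueue, one entry per supported order
 * (starting at order 2), each with a per-profile limit, plus the sysfs
 * nodes used to tune the cache at runtime.
 */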
int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
{
	struct mlx5_mr_cache *cache = &dev->cache;
	struct mlx5_cache_ent *ent;
	int limit;
	int err;
	int i;

	mutex_init(&dev->slow_path_mutex);
	cache->rel_timeout = 300;
	cache->wq = create_singlethread_workqueue("mkey_cache");
	if (!cache->wq) {
		mlx5_ib_warn(dev, "failed to create work queue\n");
		return -ENOMEM;
	}

	setup_timer(&dev->delay_timer, delay_time_func, (uintptr_t)dev);
	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
		INIT_LIST_HEAD(&cache->ent[i].head);
		spin_lock_init(&cache->ent[i].lock);

		ent = &cache->ent[i];
		INIT_LIST_HEAD(&ent->head);
		spin_lock_init(&ent->lock);
		ent->order = i + 2;
		ent->dev = dev;

		if (dev->mdev->profile->mask & MLX5_PROF_MASK_MR_CACHE) {
			if (mlx5_core_is_pf(dev->mdev))
				limit = dev->mdev->profile->mr_cache[i].limit;
			else
				limit = MLX5_VF_MR_LIMIT;
		} else {
			limit = 0;
		}

		INIT_WORK(&ent->work, cache_work_func);
		INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
		ent->limit = limit;
		if (!queue_work(cache->wq, &ent->work))
			mlx5_ib_warn(dev, "failed queueing work\n");
	}

	err = mlx5_mr_sysfs_init(dev);
	if (err)
		mlx5_ib_warn(dev, "failed to init mr cache sysfs\n");

	return 0;
}

static void wait_for_async_commands(struct mlx5_ib_dev *dev)
{
	struct mlx5_mr_cache *cache = &dev->cache;
	struct mlx5_cache_ent *ent;
	int total = 0;
	int i;
	int j;

	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
		ent = &cache->ent[i];
		for (j = 0; j < 1000; j++) {
			if (!ent->pending)
				break;
			msleep(50);
		}
	}
	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
		ent = &cache->ent[i];
		total += ent->pending;
	}

	if (total)
		mlx5_ib_dbg(dev, "aborted, %d pending requests\n", total);
	else
		mlx5_ib_dbg(dev, "done with all pending requests\n");
}

int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev)
{
	int i;

	dev->cache.stopped = 1;
	flush_workqueue(dev->cache.wq);
	mlx5_mr_sysfs_cleanup(dev);

	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++)
		clean_keys(dev, i);

	destroy_workqueue(dev->cache.wq);
	wait_for_async_commands(dev);
	del_timer_sync(&dev->delay_timer);
	return 0;
}

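/*
 * Register a DMA MR covering the whole address space (MLX5_MKEY_LEN64)
 * in physical-address access mode; no umem is attached.
 */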
struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5_core_dev *mdev = dev->mdev;
	struct mlx5_create_mkey_mbox_in *in;
	struct mlx5_mkey_seg *seg;
	struct mlx5_ib_mr *mr;
	int err;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	in = kzalloc(sizeof(*in), GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_free;
	}

	seg = &in->seg;
	seg->flags = convert_access(acc) | MLX5_ACCESS_MODE_PA;
	seg->flags_pd = cpu_to_be32(to_mpd(pd)->pdn | MLX5_MKEY_LEN64);
	seg->qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
	seg->start_addr = 0;

	err = mlx5_core_create_mkey(mdev, &mr->mmr, in, sizeof(*in), NULL, NULL,
				    NULL);
	if (err)
		goto err_in;

	kfree(in);
	mr->ibmr.lkey = mr->mmr.key;
	mr->ibmr.rkey = mr->mmr.key;
	mr->umem = NULL;

	return &mr->ibmr;

err_in:
	kfree(in);

err_free:
	kfree(mr);

	return ERR_PTR(err);
}

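/*
 * Number of 16-byte translation octowords needed to map 'len' bytes at
 * 'addr' with the given page size; each octoword holds two 8-byte
 * page-list entries.
 */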
static int get_octo_len(u64 addr, u64 len, u64 page_size)
{
	u64 offset;
	int npages;

	offset = addr & (page_size - 1ULL);
	npages = ALIGN(len + offset, page_size) >> ilog2(page_size);
	return (npages + 1) / 2;
}

void mlx5_umr_cq_handler(struct ib_cq *cq, void *cq_context)
{
	struct mlx5_ib_umr_context *context;
	struct ib_wc wc;
	int err;

	while (1) {
		err = ib_poll_cq(cq, 1, &wc);
		if (err < 0) {
			printf("mlx5_ib: WARN: poll cq error %d\n", err);
			return;
		}
		if (err == 0)
			break;

		context = (struct mlx5_ib_umr_context *)(uintptr_t)wc.wr_id;
		context->status = wc.status;
		complete(&context->done);
	}
	ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
}

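/*
 * Slow-path MR registration: build a CREATE_MKEY command carrying the
 * full page list of the umem and execute it synchronously.
 */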
static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr,
				     u64 length, struct ib_umem *umem,
				     int npages, int page_shift,
				     int access_flags)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5_create_mkey_mbox_in *in;
	struct mlx5_ib_mr *mr;
	int inlen;
	int err;
	bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg));

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	inlen = sizeof(*in) + sizeof(*in->pas) * ((npages + 1) / 2) * 2;
	in = mlx5_vzalloc(inlen);
	if (!in) {
		err = -ENOMEM;
		goto err_1;
	}
	mlx5_ib_populate_pas(dev, umem, page_shift, in->pas,
			     pg_cap ? MLX5_IB_MTT_PRESENT : 0);

	/* The MLX5_MKEY_INBOX_PG_ACCESS bit allows setting the access flags
	 * in the page list submitted with the command. */
	in->flags = pg_cap ? cpu_to_be32(MLX5_MKEY_INBOX_PG_ACCESS) : 0;
	in->seg.flags = convert_access(access_flags) |
		MLX5_ACCESS_MODE_MTT;
	in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn);
	in->seg.start_addr = cpu_to_be64(virt_addr);
	in->seg.len = cpu_to_be64(length);
	in->seg.bsfs_octo_size = 0;
	in->seg.xlt_oct_size = cpu_to_be32(get_octo_len(virt_addr, length, 1 << page_shift));
	in->seg.log2_page_size = page_shift;
	in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
	in->xlat_oct_act_size = cpu_to_be32(get_octo_len(virt_addr, length,
							 1 << page_shift));
	err = mlx5_core_create_mkey(dev->mdev, &mr->mmr, in, inlen, NULL,
				    NULL, NULL);
	if (err) {
		mlx5_ib_warn(dev, "create mkey failed\n");
		goto err_2;
	}
	mr->umem = umem;
	mr->dev = dev;
	kvfree(in);

	mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmr.key);

	return mr;

err_2:
	kvfree(in);

err_1:
	kfree(mr);

	return ERR_PTR(err);
}

enum {
	MLX5_MAX_REG_ORDER = MAX_MR_CACHE_ENTRIES + 1,
	MLX5_MAX_REG_SIZE = 2ul * 1024 * 1024 * 1024,
};

static int clean_mr(struct mlx5_ib_mr *mr)
{
	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
	int umred = mr->umred;
	int err;
	int i;

	if (!umred) {
		for (i = 0; i < mr->nchild; ++i) {
			free_cached_mr(dev, mr->children[i]);
		}
		kfree(mr->children);

		err = destroy_mkey(dev, mr);
		if (err) {
			mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n",
				     mr->mmr.key, err);
			return err;
		}
	}
	return 0;
}

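/*
 * Register a user memory region.  The umem is pinned, its best page
 * size is derived with mlx5_ib_cont_pages(), and the mkey is created
 * through reg_create() under the slow-path mutex.
 */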
struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
				  u64 virt_addr, int access_flags,
				  struct ib_udata *udata, int mr_id)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5_ib_mr *mr = NULL;
	struct ib_umem *umem;
	int page_shift;
	int npages;
	int ncont;
	int order;
	int err;

	mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n",
		    (unsigned long long)start, (unsigned long long)virt_addr,
		    (unsigned long long)length, access_flags);
	umem = ib_umem_get(pd->uobject->context, start, length, access_flags, 0);
	if (IS_ERR(umem)) {
		mlx5_ib_warn(dev, "umem get failed (%ld)\n", PTR_ERR(umem));
		return (void *)umem;
	}

	mlx5_ib_cont_pages(umem, start, &npages, &page_shift, &ncont, &order);
	if (!npages) {
		mlx5_ib_warn(dev, "avoid zero region\n");
		err = -EINVAL;
		goto error;
	}

	mlx5_ib_dbg(dev, "npages %d, ncont %d, order %d, page_shift %d\n",
		    npages, ncont, order, page_shift);

	mutex_lock(&dev->slow_path_mutex);
	mr = reg_create(pd, virt_addr, length, umem, ncont, page_shift, access_flags);
	mutex_unlock(&dev->slow_path_mutex);

	if (IS_ERR(mr)) {
		err = PTR_ERR(mr);
		mr = NULL;
		goto error;
	}

	mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmr.key);

	mr->umem = umem;
	mr->npages = npages;
	atomic_add(npages, &dev->mdev->priv.reg_pages);
	mr->ibmr.lkey = mr->mmr.key;
	mr->ibmr.rkey = mr->mmr.key;

	return &mr->ibmr;

error:
	/*
	 * Destroy the umem *before* destroying the MR, to ensure we
	 * will not have any in-flight notifiers when destroying the
	 * MR.
	 *
	 * As the MR is completely invalid to begin with, and this
	 * error path is only taken if we can't push the mr entry into
	 * the pagefault tree, this is safe.
	 */

	ib_umem_release(umem);
	return ERR_PTR(err);
}

CTASSERT(sizeof(((struct ib_phys_buf *)0)->size) == 8);

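/*
 * Register a physical MR from a caller-supplied buffer list.  The
 * largest page size common to all buffers and the virtual address is
 * derived from the address/size alignment, then a CREATE_MKEY command
 * with an explicit page list is executed.
 */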
struct ib_mr *
mlx5_ib_reg_phys_mr(struct ib_pd *pd,
		    struct ib_phys_buf *buffer_list,
		    int num_phys_buf,
		    int access_flags,
		    u64 *virt_addr)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5_create_mkey_mbox_in *in;
	struct mlx5_ib_mr *mr;
	u64 total_size;
	u32 octo_len;
	bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg));
	unsigned long mask;
	int shift;
	int npages;
	int inlen;
	int err;
	int i, j, n;

	mask = buffer_list[0].addr ^ *virt_addr;
	total_size = 0;
	for (i = 0; i < num_phys_buf; ++i) {
		if (i != 0)
			mask |= buffer_list[i].addr;
		if (i != num_phys_buf - 1)
			mask |= buffer_list[i].addr + buffer_list[i].size;

		total_size += buffer_list[i].size;
	}

	if (mask & ~PAGE_MASK)
		return ERR_PTR(-EINVAL);

	shift = __ffs(mask | 1 << 31);

	buffer_list[0].size += buffer_list[0].addr & ((1ULL << shift) - 1);
	buffer_list[0].addr &= ~0ULL << shift;

	npages = 0;
	for (i = 0; i < num_phys_buf; ++i)
		npages += (buffer_list[i].size + (1ULL << shift) - 1) >> shift;

	if (!npages) {
		mlx5_ib_warn(dev, "avoid zero region\n");
		return ERR_PTR(-EINVAL);
	}

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	octo_len = get_octo_len(*virt_addr, total_size, 1ULL << shift);
	octo_len = ALIGN(octo_len, 4);

	inlen = sizeof(*in) + (octo_len * 16);
	in = mlx5_vzalloc(inlen);
	if (!in) {
		kfree(mr);
		return ERR_PTR(-ENOMEM);
	}

	n = 0;
	for (i = 0; i < num_phys_buf; ++i) {
		for (j = 0;
		     j < (buffer_list[i].size + (1ULL << shift) - 1) >> shift;
		     ++j) {
			u64 temp = buffer_list[i].addr + ((u64) j << shift);
			if (pg_cap)
				temp |= MLX5_IB_MTT_PRESENT;
			in->pas[n++] = cpu_to_be64(temp);
		}
	}

	/* The MLX5_MKEY_INBOX_PG_ACCESS bit allows setting the access flags
	 * in the page list submitted with the command. */
	in->flags = pg_cap ? cpu_to_be32(MLX5_MKEY_INBOX_PG_ACCESS) : 0;
	in->seg.flags = convert_access(access_flags) |
		MLX5_ACCESS_MODE_MTT;
	in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn);
	in->seg.start_addr = cpu_to_be64(*virt_addr);
	in->seg.len = cpu_to_be64(total_size);
	in->seg.bsfs_octo_size = 0;
	in->seg.xlt_oct_size = cpu_to_be32(octo_len);
	in->seg.log2_page_size = shift;
	in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
	in->xlat_oct_act_size = cpu_to_be32(octo_len);
	err = mlx5_core_create_mkey(dev->mdev, &mr->mmr, in, inlen, NULL,
				    NULL, NULL);
	mr->umem = NULL;
	mr->dev = dev;
	mr->npages = npages;
	mr->ibmr.lkey = mr->mmr.key;
	mr->ibmr.rkey = mr->mmr.key;

	kvfree(in);

	if (err) {
		kfree(mr);
		return ERR_PTR(err);
	}
	return &mr->ibmr;
}

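/*
 * Deregister an MR: destroy the mkey (unless it came from the UMR
 * cache), release the umem and either return the MR to the cache or
 * free it.
 */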
int mlx5_ib_dereg_mr(struct ib_mr *ibmr)
{
	struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	struct ib_umem *umem = mr->umem;
	int npages = mr->npages;
	int umred = mr->umred;
	int err;

	err = clean_mr(mr);
	if (err)
		return err;

	if (umem) {
		ib_umem_release(umem);
		atomic_sub(npages, &dev->mdev->priv.reg_pages);
	}

	if (umred)
		free_cached_mr(dev, mr);
	else
		kfree(mr);

	return 0;
}

int mlx5_ib_destroy_mr(struct ib_mr *ibmr)
{
	struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	int err;

	if (mr->sig) {
		if (mlx5_core_destroy_psv(dev->mdev,
					  mr->sig->psv_memory.psv_idx))
			mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
				     mr->sig->psv_memory.psv_idx);
		if (mlx5_core_destroy_psv(dev->mdev,
					  mr->sig->psv_wire.psv_idx))
			mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
				     mr->sig->psv_wire.psv_idx);
		kfree(mr->sig);
	}

	err = destroy_mkey(dev, mr);
	if (err) {
		mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n",
			     mr->mmr.key, err);
		return err;
	}

	kfree(mr);

	return err;
}

struct ib_mr *mlx5_ib_alloc_fast_reg_mr(struct ib_pd *pd,
					int max_page_list_len)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5_create_mkey_mbox_in *in;
	struct mlx5_ib_mr *mr;
	int err;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	in = kzalloc(sizeof(*in), GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_free;
	}

	in->seg.status = MLX5_MKEY_STATUS_FREE;
	in->seg.xlt_oct_size = cpu_to_be32((max_page_list_len + 1) / 2);
	in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
	in->seg.flags = MLX5_PERM_UMR_EN | MLX5_ACCESS_MODE_MTT;
	in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn);
	/* TBD not needed - issue 197292 */
	in->seg.log2_page_size = PAGE_SHIFT;

	err = mlx5_core_create_mkey(dev->mdev, &mr->mmr, in, sizeof(*in), NULL,
				    NULL, NULL);
	kfree(in);
	if (err) {
		mlx5_ib_warn(dev, "failed create mkey\n");
		goto err_free;
	}

	mr->ibmr.lkey = mr->mmr.key;
	mr->ibmr.rkey = mr->mmr.key;
	mr->umem = NULL;

	return &mr->ibmr;

err_free:
	kfree(mr);
	return ERR_PTR(err);
}

struct ib_fast_reg_page_list *mlx5_ib_alloc_fast_reg_page_list(struct ib_device *ibdev,
							       int page_list_len)
{
	struct mlx5_ib_fast_reg_page_list *mfrpl;
	int size = page_list_len * sizeof(u64);

	mfrpl = kmalloc(sizeof(*mfrpl), GFP_KERNEL);
	if (!mfrpl)
		return ERR_PTR(-ENOMEM);

	mfrpl->ibfrpl.page_list = kmalloc(size, GFP_KERNEL);
	if (!mfrpl->ibfrpl.page_list)
		goto err_free;

	mfrpl->mapped_page_list = dma_alloc_coherent(ibdev->dma_device,
						     size, &mfrpl->map,
						     GFP_KERNEL);
	if (!mfrpl->mapped_page_list)
		goto err_free;

	WARN_ON(mfrpl->map & 0x3f);

	return &mfrpl->ibfrpl;

err_free:
	kfree(mfrpl->ibfrpl.page_list);
	kfree(mfrpl);
	return ERR_PTR(-ENOMEM);
}

void mlx5_ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list)
{
	struct mlx5_ib_fast_reg_page_list *mfrpl = to_mfrpl(page_list);
	struct mlx5_ib_dev *dev = to_mdev(page_list->device);
	int size = page_list->max_page_list_len * sizeof(u64);

	dma_free_coherent(&dev->mdev->pdev->dev, size, mfrpl->mapped_page_list,
			  mfrpl->map);
	kfree(mfrpl->ibfrpl.page_list);
	kfree(mfrpl);
}

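/*
 * sysfs interface: a per-order kobject under the IB device's "mr_cache"
 * directory exposes the cur/limit/miss/size attributes of each cache
 * entry, and the mr_cache directory itself exposes the rel_imm and
 * rel_timeout knobs consumed by __cache_work_func().
 */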
struct order_attribute {
	struct attribute attr;
	ssize_t (*show)(struct cache_order *, struct order_attribute *, char *buf);
	ssize_t (*store)(struct cache_order *, struct order_attribute *,
			 const char *buf, size_t count);
};

static ssize_t cur_show(struct cache_order *co, struct order_attribute *oa,
			char *buf)
{
	struct mlx5_ib_dev *dev = co->dev;
	struct mlx5_mr_cache *cache = &dev->cache;
	struct mlx5_cache_ent *ent = &cache->ent[co->index];
	int err;

	err = snprintf(buf, 20, "%d\n", ent->cur);
	return err;
}

static ssize_t limit_show(struct cache_order *co, struct order_attribute *oa,
			  char *buf)
{
	struct mlx5_ib_dev *dev = co->dev;
	struct mlx5_mr_cache *cache = &dev->cache;
	struct mlx5_cache_ent *ent = &cache->ent[co->index];
	int err;

	err = snprintf(buf, 20, "%d\n", ent->limit);
	return err;
}

static ssize_t limit_store(struct cache_order *co, struct order_attribute *oa,
			   const char *buf, size_t count)
{
	struct mlx5_ib_dev *dev = co->dev;
	struct mlx5_mr_cache *cache = &dev->cache;
	struct mlx5_cache_ent *ent = &cache->ent[co->index];
	u32 var;
	int err;

#define	kstrtouint(a,b,c) ({*(c) = strtol(a,0,b); 0;})
#define	kstrtoint(a,b,c) ({*(c) = strtol(a,0,b); 0;})

	if (kstrtouint(buf, 0, &var))
		return -EINVAL;

	if (var > ent->size)
		return -EINVAL;

	ent->limit = var;

	if (ent->cur < ent->limit) {
		err = add_keys(dev, co->index, 2 * ent->limit - ent->cur);
		if (err)
			return err;
	}

	return count;
}

static ssize_t miss_show(struct cache_order *co, struct order_attribute *oa,
			 char *buf)
{
	struct mlx5_ib_dev *dev = co->dev;
	struct mlx5_mr_cache *cache = &dev->cache;
	struct mlx5_cache_ent *ent = &cache->ent[co->index];
	int err;

	err = snprintf(buf, 20, "%d\n", ent->miss);
	return err;
}

static ssize_t miss_store(struct cache_order *co, struct order_attribute *oa,
			  const char *buf, size_t count)
{
	struct mlx5_ib_dev *dev = co->dev;
	struct mlx5_mr_cache *cache = &dev->cache;
	struct mlx5_cache_ent *ent = &cache->ent[co->index];
	u32 var;

	if (kstrtouint(buf, 0, &var))
		return -EINVAL;

	if (var != 0)
		return -EINVAL;

	ent->miss = var;

	return count;
}

static ssize_t size_show(struct cache_order *co, struct order_attribute *oa,
			 char *buf)
{
	struct mlx5_ib_dev *dev = co->dev;
	struct mlx5_mr_cache *cache = &dev->cache;
	struct mlx5_cache_ent *ent = &cache->ent[co->index];
	int err;

	err = snprintf(buf, 20, "%d\n", ent->size);
	return err;
}

static ssize_t size_store(struct cache_order *co, struct order_attribute *oa,
			  const char *buf, size_t count)
{
	struct mlx5_ib_dev *dev = co->dev;
	struct mlx5_mr_cache *cache = &dev->cache;
	struct mlx5_cache_ent *ent = &cache->ent[co->index];
	u32 var;
	int err;

	if (kstrtouint(buf, 0, &var))
		return -EINVAL;

	if (var < ent->limit)
		return -EINVAL;

	if (var > ent->size) {
		do {
			err = add_keys(dev, co->index, var - ent->size);
			if (err && err != -EAGAIN)
				return err;

			usleep_range(3000, 5000);
		} while (err);
	} else if (var < ent->size) {
		remove_keys(dev, co->index, ent->size - var);
	}

	return count;
}

static ssize_t order_attr_show(struct kobject *kobj,
			       struct attribute *attr, char *buf)
{
	struct order_attribute *oa =
		container_of(attr, struct order_attribute, attr);
	struct cache_order *co = container_of(kobj, struct cache_order, kobj);

	if (!oa->show)
		return -EIO;

	return oa->show(co, oa, buf);
}

static ssize_t order_attr_store(struct kobject *kobj,
				struct attribute *attr, const char *buf, size_t size)
{
	struct order_attribute *oa =
		container_of(attr, struct order_attribute, attr);
	struct cache_order *co = container_of(kobj, struct cache_order, kobj);

	if (!oa->store)
		return -EIO;

	return oa->store(co, oa, buf, size);
}

static const struct sysfs_ops order_sysfs_ops = {
	.show = order_attr_show,
	.store = order_attr_store,
};

#define ORDER_ATTR(_name) struct order_attribute order_attr_##_name = \
	__ATTR(_name, 0644, _name##_show, _name##_store)
#define ORDER_ATTR_RO(_name) struct order_attribute order_attr_##_name = \
	__ATTR(_name, 0444, _name##_show, NULL)

static ORDER_ATTR_RO(cur);
static ORDER_ATTR(limit);
static ORDER_ATTR(miss);
static ORDER_ATTR(size);

static struct attribute *order_default_attrs[] = {
	&order_attr_cur.attr,
	&order_attr_limit.attr,
	&order_attr_miss.attr,
	&order_attr_size.attr,
	NULL
};

static struct kobj_type order_type = {
	.sysfs_ops     = &order_sysfs_ops,
	.default_attrs = order_default_attrs
};

struct cache_attribute {
	struct attribute attr;
	ssize_t (*show)(struct mlx5_ib_dev *dev, char *buf);
	ssize_t (*store)(struct mlx5_ib_dev *dev, const char *buf, size_t count);
};

static ssize_t rel_imm_show(struct mlx5_ib_dev *dev, char *buf)
{
	struct mlx5_mr_cache *cache = &dev->cache;
	int err;

	err = snprintf(buf, 20, "%d\n", cache->rel_imm);
	return err;
}

static ssize_t rel_imm_store(struct mlx5_ib_dev *dev, const char *buf, size_t count)
{
	struct mlx5_mr_cache *cache = &dev->cache;
	u32 var;
	int i;
	int found = 0;

	if (kstrtouint(buf, 0, &var))
		return -EINVAL;

	if (var > 1)
		return -EINVAL;

	if (var == cache->rel_imm)
		return count;

	cache->rel_imm = var;
	if (cache->rel_imm == 1) {
		for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
			if (cache->ent[i].cur > 2 * cache->ent[i].limit) {
				queue_work(cache->wq, &cache->ent[i].work);
				found = 1;
			}
		}
		if (!found)
			cache->rel_imm = 0;
	}

	return count;
}

static ssize_t rel_timeout_show(struct mlx5_ib_dev *dev, char *buf)
{
	struct mlx5_mr_cache *cache = &dev->cache;
	int err;

	err = snprintf(buf, 20, "%d\n", cache->rel_timeout);
	return err;
}

static ssize_t rel_timeout_store(struct mlx5_ib_dev *dev, const char *buf, size_t count)
{
	struct mlx5_mr_cache *cache = &dev->cache;
	int var;
	int i;

	if (kstrtoint(buf, 0, &var))
		return -EINVAL;

	if (var < -1 || var > MAX_MR_RELEASE_TIMEOUT)
		return -EINVAL;

	if (var == cache->rel_timeout)
		return count;

	if (cache->rel_timeout == -1 || (var < cache->rel_timeout && var != -1)) {
		cache->rel_timeout = var;
		for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
			if (cache->ent[i].cur > 2 * cache->ent[i].limit)
				queue_work(cache->wq, &cache->ent[i].work);
		}
	} else {
		cache->rel_timeout = var;
	}

	return count;
}

static ssize_t cache_attr_show(struct kobject *kobj,
			       struct attribute *attr, char *buf)
{
	struct cache_attribute *ca =
		container_of(attr, struct cache_attribute, attr);
	struct mlx5_ib_dev *dev = container_of(kobj, struct mlx5_ib_dev, mr_cache);

	if (!ca->show)
		return -EIO;

	return ca->show(dev, buf);
}

static ssize_t cache_attr_store(struct kobject *kobj,
				struct attribute *attr, const char *buf, size_t size)
{
	struct cache_attribute *ca =
		container_of(attr, struct cache_attribute, attr);
	struct mlx5_ib_dev *dev = container_of(kobj, struct mlx5_ib_dev, mr_cache);

	if (!ca->store)
		return -EIO;

	return ca->store(dev, buf, size);
}

static const struct sysfs_ops cache_sysfs_ops = {
	.show = cache_attr_show,
	.store = cache_attr_store,
};

#define CACHE_ATTR(_name) struct cache_attribute cache_attr_##_name = \
	__ATTR(_name, 0644, _name##_show, _name##_store)

static CACHE_ATTR(rel_imm);
static CACHE_ATTR(rel_timeout);

static struct attribute *cache_default_attrs[] = {
	&cache_attr_rel_imm.attr,
	&cache_attr_rel_timeout.attr,
	NULL
};

static struct kobj_type cache_type = {
	.sysfs_ops     = &cache_sysfs_ops,
	.default_attrs = cache_default_attrs
};

static int mlx5_mr_sysfs_init(struct mlx5_ib_dev *dev)
{
	struct mlx5_mr_cache *cache = &dev->cache;
	struct device *device = &dev->ib_dev.dev;
	struct cache_order *co;
	int o;
	int i;
	int err;

	err = kobject_init_and_add(&dev->mr_cache, &cache_type,
				   &device->kobj, "mr_cache");
	if (err)
		return -ENOMEM;

	for (o = 2, i = 0; i < MAX_MR_CACHE_ENTRIES; o++, i++) {
		co = &cache->ent[i].co;
		co->order = o;
		co->index = i;
		co->dev = dev;
		err = kobject_init_and_add(&co->kobj, &order_type,
					   &dev->mr_cache, "%d", o);
		if (err)
			goto err_put;
	}

	return 0;

err_put:
	for (; i >= 0; i--) {
		co = &cache->ent[i].co;
		kobject_put(&co->kobj);
	}
	kobject_put(&dev->mr_cache);

	return err;
}

static void mlx5_mr_sysfs_cleanup(struct mlx5_ib_dev *dev)
{
	struct mlx5_mr_cache *cache = &dev->cache;
	struct cache_order *co;
	int i;

	for (i = MAX_MR_CACHE_ENTRIES - 1; i >= 0; i--) {
		co = &cache->ent[i].co;
		kobject_put(&co->kobj);
	}
	kobject_put(&dev->mr_cache);
}