mlx4_ib_mr.c revision 296382
/*
 * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
 * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <linux/slab.h>
#include <linux/module.h>
#include <linux/sched.h>

#include "mlx4_ib.h"

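/* Translate IB access flags into mlx4 MPT permission bits; local read
 * access is always granted.
 */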
static u32 convert_access(int acc)
{
	return (acc & IB_ACCESS_REMOTE_ATOMIC ? MLX4_PERM_ATOMIC       : 0) |
	       (acc & IB_ACCESS_REMOTE_WRITE  ? MLX4_PERM_REMOTE_WRITE : 0) |
	       (acc & IB_ACCESS_REMOTE_READ   ? MLX4_PERM_REMOTE_READ  : 0) |
	       (acc & IB_ACCESS_LOCAL_WRITE   ? MLX4_PERM_LOCAL_WRITE  : 0) |
	       (acc & IB_ACCESS_MW_BIND       ? MLX4_PERM_BIND_MW      : 0) |
	       MLX4_PERM_LOCAL_READ;
}

/* No support for the Shared MR feature */
#if 0
static ssize_t shared_mr_proc_read(struct file *file,
			  char __user *buffer,
			  size_t len,
			  loff_t *offset)
{
	return -ENOSYS;
}

static ssize_t shared_mr_proc_write(struct file *file,
			   const char __user *buffer,
			   size_t len,
			   loff_t *offset)
{
	return -ENOSYS;
}

static int shared_mr_mmap(struct file *filep, struct vm_area_struct *vma)
{
	struct proc_dir_entry *pde = PDE(filep->f_path.dentry->d_inode);
	struct mlx4_shared_mr_info *smr_info =
		(struct mlx4_shared_mr_info *)pde->data;

	/* Prevent any mapping that does not start at the beginning of the area */
	if (vma->vm_pgoff != 0)
		return -EINVAL;

	return ib_umem_map_to_vma(smr_info->umem, vma);
}

static const struct file_operations shared_mr_proc_ops = {
	.owner	= THIS_MODULE,
	.read	= shared_mr_proc_read,
	.write	= shared_mr_proc_write,
	.mmap	= shared_mr_mmap
};

static mode_t convert_shared_access(int acc)
{
	return (acc & IB_ACCESS_SHARED_MR_USER_READ   ? S_IRUSR : 0) |
	       (acc & IB_ACCESS_SHARED_MR_USER_WRITE  ? S_IWUSR : 0) |
	       (acc & IB_ACCESS_SHARED_MR_GROUP_READ  ? S_IRGRP : 0) |
	       (acc & IB_ACCESS_SHARED_MR_GROUP_WRITE ? S_IWGRP : 0) |
	       (acc & IB_ACCESS_SHARED_MR_OTHER_READ  ? S_IROTH : 0) |
	       (acc & IB_ACCESS_SHARED_MR_OTHER_WRITE ? S_IWOTH : 0);
}
#endif

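/* Allocate an MR that covers the whole physical address space (a "DMA MR")
 * with the requested access rights; it has no umem backing.
 */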
struct ib_mr *mlx4_ib_get_dma_mr(struct ib_pd *pd, int acc)
{
	struct mlx4_ib_mr *mr;
	int err;

	mr = kzalloc(sizeof *mr, GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	err = mlx4_mr_alloc(to_mdev(pd->device)->dev, to_mpd(pd)->pdn, 0,
			    ~0ull, convert_access(acc), 0, 0, &mr->mmr);
	if (err)
		goto err_free;

	err = mlx4_mr_enable(to_mdev(pd->device)->dev, &mr->mmr);
	if (err)
		goto err_mr;

	mr->ibmr.rkey = mr->ibmr.lkey = mr->mmr.key;
	mr->umem = NULL;

	return &mr->ibmr;

err_mr:
	(void) mlx4_mr_free(to_mdev(pd->device)->dev, &mr->mmr);

err_free:
	kfree(mr);

	return ERR_PTR(err);
}

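/* Write the MTT entries for one contiguous block of DMA address space:
 * round the block start and length to mtt_size, buffer one entry per
 * mtt_size chunk in pages[], and flush to the HCA via mlx4_write_mtt()
 * whenever the scratch page fills up.
 */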
static int mlx4_ib_umem_write_mtt_block(struct mlx4_ib_dev *dev,
						struct mlx4_mtt *mtt,
						u64 mtt_size,
						u64 mtt_shift,
						u64 len,
						u64 cur_start_addr,
						u64 *pages,
						int *start_index,
						int *npages)
{
	int k;
	int err = 0;
	u64 mtt_entries;
	u64 cur_end_addr = cur_start_addr + len;
	u64 cur_end_addr_aligned = 0;

	len += (cur_start_addr & (mtt_size - 1ULL));
	cur_end_addr_aligned = round_up(cur_end_addr, mtt_size);
	len += (cur_end_addr_aligned - cur_end_addr);
	if (len & (mtt_size - 1ULL)) {
		WARN(1,
		     "write_block: len %llx is not aligned to mtt_size %llx\n",
		     (unsigned long long)len, (unsigned long long)mtt_size);
		return -EINVAL;
	}

	mtt_entries = (len >> mtt_shift);

	/*
	 * Align the MTT start address to the mtt_size.
	 * Required to handle cases when the MR starts in the middle of an
	 * MTT record. This was not required in the old code, since the
	 * physical addresses provided by the DMA subsystem were page
	 * aligned, which was also the MTT size.
	 */
	cur_start_addr = round_down(cur_start_addr, mtt_size);
	/* A new block is started ... */
	for (k = 0; k < mtt_entries; ++k) {
		pages[*npages] = cur_start_addr + (mtt_size * k);
		(*npages)++;
		/*
		 * Be friendly to mlx4_write_mtt() and
		 * pass it chunks of appropriate size.
		 */
		if (*npages == PAGE_SIZE / sizeof(u64)) {
			err = mlx4_write_mtt(dev->dev,
					mtt, *start_index,
					*npages, pages);
			if (err)
				return err;

			(*start_index) += *npages;
			*npages = 0;
		}
	}

	return 0;
}

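/* Walk the umem scatterlist, merge DMA-contiguous entries into blocks and
 * program the MTT entries for each block.
 */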
int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt,
			   struct ib_umem *umem)
{
	u64 *pages;
	u64 len = 0;
	int err = 0;
	u64 mtt_size;
	u64 cur_start_addr = 0;
	u64 mtt_shift;
	int start_index = 0;
	int npages = 0;
	struct scatterlist *sg;
	int i;

	pages = (u64 *) __get_free_page(GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	mtt_shift = mtt->page_shift;
	mtt_size = 1ULL << mtt_shift;

	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) {
		if (cur_start_addr + len == sg_dma_address(sg)) {
			/* still the same block */
			len += sg_dma_len(sg);
			continue;
		}
		/* A new block is started ... */
		/* If len is misaligned, write an extra mtt entry to cover
		 * the misaligned area (round up the division).
		 */
		err = mlx4_ib_umem_write_mtt_block(dev,
					mtt, mtt_size, mtt_shift,
					len, cur_start_addr,
					pages,
					&start_index,
					&npages);
		if (err)
			goto out;

		cur_start_addr = sg_dma_address(sg);
		len = sg_dma_len(sg);
	}

	/* Handle the last block */
	if (len > 0) {
		/* If len is misaligned, write an extra mtt entry to cover
		 * the misaligned area (round up the division).
		 */
		err = mlx4_ib_umem_write_mtt_block(dev,
					mtt, mtt_size, mtt_shift,
					len, cur_start_addr,
					pages,
					&start_index,
					&npages);
		if (err)
			goto out;
	}

	if (npages)
		err = mlx4_write_mtt(dev->dev, mtt, start_index, npages, pages);

out:
	free_page((unsigned long) pages);
	return err;
}

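/* Return the alignment of ptr in bits, i.e. ilog2() of its lowest set bit.
 * For example, alignment_of(0x3000) == 12.
 */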
static inline u64 alignment_of(u64 ptr)
{
	return ilog2(ptr & (~(ptr - 1)));
}

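/* Reduce block_shift, if needed, so that both the start of the next block
 * and the end of the current block are aligned to 1 << block_shift.
 */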
static int mlx4_ib_umem_calc_block_mtt(u64 next_block_start,
						u64 current_block_end,
						u64 block_shift)
{
	/* Check whether the alignment of the new block is as good as that
	 * of the previous block. A block address must start with zeros up
	 * to the entity size.
	 */
	if ((next_block_start & ((1ULL << block_shift) - 1ULL)) != 0)
		/* It is not as well aligned as the previous block - reduce
		 * the mtt size accordingly. Here we take the last right bit
		 * which is 1.
		 */
		block_shift = alignment_of(next_block_start);

	/* Check whether the end of the previous block is as well aligned
	 * as the start of the block.
	 */
	if (((current_block_end) & ((1ULL << block_shift) - 1ULL)) != 0)
		/* It is not as well aligned as the start of the block -
		 * reduce the mtt size accordingly.
		 */
		block_shift = alignment_of(current_block_end);

	return block_shift;
}

/* Calculate the optimal mtt size based on contiguous pages.
 * The function also accounts for pages that are not aligned to the
 * calculated mtt_size when computing the total number of MTT entries
 * (returned through *num_of_mtts): the first and last chunk lengths are
 * checked, and any part not aligned to mtt_size is added to the count.
 * All chunks in the middle are already handled as part of the mtt shift
 * calculation for both their start and end addresses.
 */
int mlx4_ib_umem_calc_optimal_mtt_size(struct ib_umem *umem,
						u64 start_va,
						int *num_of_mtts)
{
	u64 block_shift = MLX4_MAX_MTT_SHIFT;
	u64 current_block_len = 0;
	u64 current_block_start = 0;
	u64 misalignment_bits;
	u64 first_block_start = 0;
	u64 last_block_end = 0;
	u64 total_len = 0;
	u64 last_block_aligned_end = 0;
	u64 min_shift = ilog2(umem->page_size);
	struct scatterlist *sg;
	int i;
	u64 next_block_start;
	u64 current_block_end;

	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) {
		/* Initialization - save the first chunk start as the
		 * current_block_start - a block means contiguous pages.
		 */
		if (current_block_len == 0 && current_block_start == 0) {
			first_block_start = current_block_start =
				sg_dma_address(sg);
			/* Find the bits that are different between the
			 * physical address and the virtual address for the
			 * start of the MR.
			 */
			/* umem_get aligned the start_va to a page boundary.
			 * Therefore, we need to align the start va to the
			 * same boundary.
			 */
			/* misalignment_bits is needed to handle the case of
			 * a single memory region. In this case, the rest of
			 * the logic will not reduce the block size. If we
			 * use a block size which is bigger than the
			 * alignment of the misalignment bits, we might use
			 * the virtual page number instead of the physical
			 * page number, resulting in access to the wrong
			 * data.
			 */
			misalignment_bits =
			(start_va & (~(((u64)(umem->page_size)) - 1ULL)))
						^ current_block_start;
			block_shift = min(alignment_of(misalignment_bits),
				block_shift);
		}

		/* Go over the scatter entries and check whether they
		 * continue the previous scatter entry.
		 */
		next_block_start = sg_dma_address(sg);
		current_block_end = current_block_start
			+ current_block_len;
		/* If we have a split (non-contiguous) between two blocks */
		if (current_block_end != next_block_start) {
			block_shift = mlx4_ib_umem_calc_block_mtt(
					next_block_start,
					current_block_end,
					block_shift);

			/* If we reached the minimum shift for a 4k page we
			 * stop the loop.
			 */
			if (block_shift <= min_shift)
				goto end;

			/* If not saved yet we are in the first block - save
			 * the length of the first block to calculate the
			 * non_aligned_pages number at the end.
			 */
			total_len += current_block_len;

			/* Start a new block */
			current_block_start = next_block_start;
			current_block_len = sg_dma_len(sg);
			continue;
		}
		/* The scatter entry is another part of the current block -
		 * increase the block size. An entry in the scatter list can
		 * be larger than 4k (one page) because the DMA mapping may
		 * merge some blocks together.
		 */
		current_block_len += sg_dma_len(sg);
	}

	/* Account for the last block in the total len */
	total_len += current_block_len;
	/* Add to the first block the misalignment that it suffers from. */
	total_len += (first_block_start & ((1ULL << block_shift) - 1ULL));
	last_block_end = current_block_start + current_block_len;
	last_block_aligned_end = round_up(last_block_end, 1ULL << block_shift);
	total_len += (last_block_aligned_end - last_block_end);

	WARN((total_len & ((1ULL << block_shift) - 1ULL)),
		"misaligned total length detected (%llu, %llu)!",
		(unsigned long long)total_len, (unsigned long long)block_shift);

	*num_of_mtts = total_len >> block_shift;
end:
	if (block_shift < min_shift) {
		/* If the shift is less than the minimum, warn and return
		 * the minimum shift.
		 */
		WARN(1,
		"mlx4_ib_umem_calc_optimal_mtt_size - unexpected shift %lld\n",
		(unsigned long long)block_shift);

		block_shift = min_shift;
	}
	return block_shift;
}

/* No support for Shared MR */
#if 0
static int prepare_shared_mr(struct mlx4_ib_mr *mr, int access_flags, int mr_id)
{
	struct proc_dir_entry *mr_proc_entry;
	mode_t mode = S_IFREG;
	char name_buff[16];

	mode |= convert_shared_access(access_flags);
	sprintf(name_buff, "%X", mr_id);
	mr->smr_info = kmalloc(sizeof(struct mlx4_shared_mr_info), GFP_KERNEL);
	if (!mr->smr_info)
		return -ENOMEM;
	mr->smr_info->mr_id = mr_id;
	mr->smr_info->umem = mr->umem;

	mr_proc_entry = proc_create_data(name_buff, mode,
				mlx4_mrs_dir_entry,
				&shared_mr_proc_ops,
				mr->smr_info);

	if (!mr_proc_entry) {
		pr_err("prepare_shared_mr failed via proc\n");
		kfree(mr->smr_info);
		return -ENODEV;
	}

	current_uid_gid(&(mr_proc_entry->uid), &(mr_proc_entry->gid));
	mr_proc_entry->size = mr->umem->length;
	return 0;
}

static int is_shared_mr(int access_flags)
{
	/* Check whether IB_ACCESS_SHARED_MR_USER_READ or any of the other
	 * shared bits were turned on.
	 */
	return !!(access_flags & (IB_ACCESS_SHARED_MR_USER_READ |
				IB_ACCESS_SHARED_MR_USER_WRITE |
				IB_ACCESS_SHARED_MR_GROUP_READ |
				IB_ACCESS_SHARED_MR_GROUP_WRITE |
				IB_ACCESS_SHARED_MR_OTHER_READ |
				IB_ACCESS_SHARED_MR_OTHER_WRITE));
}

static void free_smr_info(struct mlx4_ib_mr *mr)
{
	/* When the master/parent shared mr is deregistered there is no way
	 * to share this mr any more - its mr_id will be returned to the
	 * kernel as part of ib_uverbs_dereg_mr and may be allocated again
	 * as part of another reg_mr.
	 */
	char name_buff[16];

	sprintf(name_buff, "%X", mr->smr_info->mr_id);
	/* remove_proc_entry() checks internally that no operation was
	 * started on that procfs file; if one is in progress, the current
	 * process waits until it completes. That is why no extra
	 * synchronization is needed when the shared umem is released below.
	 */
	remove_proc_entry(name_buff, mlx4_mrs_dir_entry);
	kfree(mr->smr_info);
	mr->smr_info = NULL;
}
#endif

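/* Invalidation callback invoked by the peer memory framework: unless a
 * deregistration already owns the teardown, free the HW MR, release the
 * umem and signal invalidation_comp so mlx4_ib_dereg_mr() can finish.
 */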
static void mlx4_invalidate_umem(void *invalidation_cookie,
				struct ib_umem *umem,
				unsigned long addr, size_t size)
{
	struct mlx4_ib_mr *mr = (struct mlx4_ib_mr *)invalidation_cookie;

	/* This function is called under the client peer lock, so its
	 * resources are race protected.
	 */
	if (atomic_inc_return(&mr->invalidated) > 1) {
		umem->invalidation_ctx->inflight_invalidation = 1;
		goto end;
	}

	umem->invalidation_ctx->peer_callback = 1;
	mlx4_mr_free(to_mdev(mr->ibmr.device)->dev, &mr->mmr);
	ib_umem_release(umem);
	complete(&mr->invalidation_comp);

end:
	return;
}

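/* Register a user memory region: pin the pages with ib_umem_get_ex(),
 * pick an optimal MTT page size, allocate and enable the HW MR, and hook
 * up the invalidation callback when the umem comes from a peer memory
 * client. mr_id is only used by the (disabled) shared MR support.
 */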
struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
				  u64 virt_addr, int access_flags,
				  struct ib_udata *udata,
				  int mr_id)
{
	struct mlx4_ib_dev *dev = to_mdev(pd->device);
	struct mlx4_ib_mr *mr;
	int shift;
	int err;
	int n;
	struct ib_peer_memory_client *ib_peer_mem;

	mr = kzalloc(sizeof *mr, GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	mr->umem = ib_umem_get_ex(pd->uobject->context, start, length,
			access_flags, 0, 1);
	if (IS_ERR(mr->umem)) {
		err = PTR_ERR(mr->umem);
		goto err_free;
	}

	ib_peer_mem = mr->umem->ib_peer_mem;
	n = ib_umem_page_count(mr->umem);
	shift = mlx4_ib_umem_calc_optimal_mtt_size(mr->umem, start, &n);
	err = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, virt_addr, length,
			 convert_access(access_flags), n, shift, &mr->mmr);
	if (err)
		goto err_umem;

	err = mlx4_ib_umem_write_mtt(dev, &mr->mmr.mtt, mr->umem);
	if (err)
		goto err_mr;

	err = mlx4_mr_enable(dev->dev, &mr->mmr);
	if (err)
		goto err_mr;

	mr->ibmr.rkey = mr->ibmr.lkey = mr->mmr.key;
/* No support for Shared MR */
#if 0
	/* Check whether the MR should be shared */
	if (is_shared_mr(access_flags)) {
		/* The start address and length must be aligned to the page
		 * size in order to map a full page and to prevent leakage
		 * of data.
		 */
		if (mr->umem->offset || (length & ~PAGE_MASK)) {
			err = -EINVAL;
			goto err_mr;
		}

		err = prepare_shared_mr(mr, access_flags, mr_id);
		if (err)
			goto err_mr;
	}
#endif
	if (ib_peer_mem) {
		if (access_flags & IB_ACCESS_MW_BIND) {
			/* Prevent binding MW on peer clients.
			 * mlx4_invalidate_umem must be void,
			 * therefore, mlx4_mr_free should not fail
			 * when using peer clients.
			 */
			err = -ENOSYS;
			pr_err("MW is not supported with peer memory client\n");
			goto err_smr;
		}
		init_completion(&mr->invalidation_comp);
		ib_umem_activate_invalidation_notifier(mr->umem,
					mlx4_invalidate_umem, mr);
	}

	atomic_set(&mr->invalidated, 0);
	return &mr->ibmr;

err_smr:
/* No support for Shared MR */
#if 0
	if (mr->smr_info)
		free_smr_info(mr);
#endif
err_mr:
	(void) mlx4_mr_free(to_mdev(pd->device)->dev, &mr->mmr);

err_umem:
	ib_umem_release(mr->umem);

err_free:
	kfree(mr);

	return ERR_PTR(err);
}

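/* Deregister an MR. If a peer-memory invalidation is in flight, wait for
 * it to finish instead of freeing the HW MR again; otherwise free the HW
 * MR and release the umem.
 */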
int mlx4_ib_dereg_mr(struct ib_mr *ibmr)
{
	struct mlx4_ib_mr *mr = to_mmr(ibmr);
	struct ib_umem *umem = mr->umem;
	int ret;

/* No support for Shared MR */
#if 0
	if (mr->smr_info)
		free_smr_info(mr);
#endif

	if (atomic_inc_return(&mr->invalidated) > 1) {
		wait_for_completion(&mr->invalidation_comp);
		goto end;
	}

	ret = mlx4_mr_free(to_mdev(ibmr->device)->dev, &mr->mmr);
	if (ret) {
		/* An error is not expected here, except when memory
		 * windows are bound to the MR, which is not supported with
		 * peer memory clients.
		 */
		atomic_set(&mr->invalidated, 0);
		return ret;
	}

	if (!umem)
		goto end;

	ib_umem_release(mr->umem);
end:
	kfree(mr);

	return 0;
}

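/* Allocate a memory window of the requested type and enable it in HW. */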
struct ib_mw *mlx4_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type)
{
	struct mlx4_ib_dev *dev = to_mdev(pd->device);
	struct mlx4_ib_mw *mw;
	int err;

	mw = kmalloc(sizeof(*mw), GFP_KERNEL);
	if (!mw)
		return ERR_PTR(-ENOMEM);

	err = mlx4_mw_alloc(dev->dev, to_mpd(pd)->pdn, (enum mlx4_mw_type)type, &mw->mmw);
	if (err)
		goto err_free;

	err = mlx4_mw_enable(dev->dev, &mw->mmw);
	if (err)
		goto err_mw;

	mw->ibmw.rkey = mw->mmw.key;

	return &mw->ibmw;

err_mw:
	mlx4_mw_free(dev->dev, &mw->mmw);

err_free:
	kfree(mw);

	return ERR_PTR(err);
}

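/* Bind a memory window by posting an IB_WR_BIND_MW work request on the QP;
 * on success the MW rkey is updated to the newly produced key.
 */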
int mlx4_ib_bind_mw(struct ib_qp *qp, struct ib_mw *mw,
		    struct ib_mw_bind *mw_bind)
{
	struct ib_send_wr  wr;
	struct ib_send_wr *bad_wr;
	int ret;

	memset(&wr, 0, sizeof(wr));
	wr.opcode               = IB_WR_BIND_MW;
	wr.wr_id                = mw_bind->wr_id;
	wr.send_flags           = mw_bind->send_flags;
	wr.wr.bind_mw.mw        = mw;
	wr.wr.bind_mw.bind_info = mw_bind->bind_info;
	wr.wr.bind_mw.rkey      = ib_inc_rkey(mw->rkey);

	ret = mlx4_ib_post_send(qp, &wr, &bad_wr);
	if (!ret)
		mw->rkey = wr.wr.bind_mw.rkey;

	return ret;
}

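/* Free a memory window allocated by mlx4_ib_alloc_mw(). */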
int mlx4_ib_dealloc_mw(struct ib_mw *ibmw)
{
	struct mlx4_ib_mw *mw = to_mmw(ibmw);

	mlx4_mw_free(to_mdev(ibmw->device)->dev, &mw->mmw);
	kfree(mw);

	return 0;
}

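/* Allocate an MR to be used with fast register work requests; no memory
 * is mapped to it until a fast-reg WR is posted.
 */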
struct ib_mr *mlx4_ib_alloc_fast_reg_mr(struct ib_pd *pd,
					int max_page_list_len)
{
	struct mlx4_ib_dev *dev = to_mdev(pd->device);
	struct mlx4_ib_mr *mr;
	int err;

	mr = kzalloc(sizeof *mr, GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	err = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, 0, 0, 0,
			    max_page_list_len, 0, &mr->mmr);
	if (err)
		goto err_free;

	err = mlx4_mr_enable(dev->dev, &mr->mmr);
	if (err)
		goto err_mr;

	mr->ibmr.rkey = mr->ibmr.lkey = mr->mmr.key;
	mr->umem = NULL;

	return &mr->ibmr;

err_mr:
	(void) mlx4_mr_free(dev->dev, &mr->mmr);

err_free:
	kfree(mr);
	return ERR_PTR(err);
}

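/* Allocate a fast-reg page list along with a DMA-coherent shadow copy
 * (mapped_page_list) that the HCA reads; limited to
 * MLX4_MAX_FAST_REG_PAGES entries.
 */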
struct ib_fast_reg_page_list *mlx4_ib_alloc_fast_reg_page_list(struct ib_device *ibdev,
							       int page_list_len)
{
	struct mlx4_ib_dev *dev = to_mdev(ibdev);
	struct mlx4_ib_fast_reg_page_list *mfrpl;
	int size = page_list_len * sizeof (u64);

	if (page_list_len > MLX4_MAX_FAST_REG_PAGES)
		return ERR_PTR(-EINVAL);

	mfrpl = kmalloc(sizeof *mfrpl, GFP_KERNEL);
	if (!mfrpl)
		return ERR_PTR(-ENOMEM);

	mfrpl->ibfrpl.page_list = kmalloc(size, GFP_KERNEL);
	if (!mfrpl->ibfrpl.page_list)
		goto err_free;

	mfrpl->mapped_page_list = dma_alloc_coherent(&dev->dev->pdev->dev,
						     size, &mfrpl->map,
						     GFP_KERNEL);
	if (!mfrpl->mapped_page_list)
		goto err_free;

	WARN_ON(mfrpl->map & 0x3f);

	return &mfrpl->ibfrpl;

err_free:
	kfree(mfrpl->ibfrpl.page_list);
	kfree(mfrpl);
	return ERR_PTR(-ENOMEM);
}

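/* Free both the kernel page list and its DMA-coherent shadow buffer. */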
void mlx4_ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list)
{
	struct mlx4_ib_dev *dev = to_mdev(page_list->device);
	struct mlx4_ib_fast_reg_page_list *mfrpl = to_mfrpl(page_list);
	int size = page_list->max_page_list_len * sizeof (u64);

	dma_free_coherent(&dev->dev->pdev->dev, size, mfrpl->mapped_page_list,
			  mfrpl->map);
	kfree(mfrpl->ibfrpl.page_list);
	kfree(mfrpl);
}

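/* Allocate and enable an FMR with the requested access rights, page size
 * and mapping limits.
 */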
struct ib_fmr *mlx4_ib_fmr_alloc(struct ib_pd *pd, int acc,
				 struct ib_fmr_attr *fmr_attr)
{
	struct mlx4_ib_dev *dev = to_mdev(pd->device);
	struct mlx4_ib_fmr *fmr;
	int err = -ENOMEM;

	fmr = kmalloc(sizeof *fmr, GFP_KERNEL);
	if (!fmr)
		return ERR_PTR(-ENOMEM);

	err = mlx4_fmr_alloc(dev->dev, to_mpd(pd)->pdn, convert_access(acc),
			     fmr_attr->max_pages, fmr_attr->max_maps,
			     fmr_attr->page_shift, &fmr->mfmr);
	if (err)
		goto err_free;

	err = mlx4_fmr_enable(to_mdev(pd->device)->dev, &fmr->mfmr);
	if (err)
		goto err_mr;

	fmr->ibfmr.rkey = fmr->ibfmr.lkey = fmr->mfmr.mr.key;

	return &fmr->ibfmr;

err_mr:
	(void) mlx4_mr_free(to_mdev(pd->device)->dev, &fmr->mfmr.mr);

err_free:
	kfree(fmr);

	return ERR_PTR(err);
}

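/* Map a list of physical pages into the FMR at the given iova; new lkey
 * and rkey values are returned through the ib_fmr.
 */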
int mlx4_ib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
		      int npages, u64 iova)
{
	struct mlx4_ib_fmr *ifmr = to_mfmr(ibfmr);
	struct mlx4_ib_dev *dev = to_mdev(ifmr->ibfmr.device);

	return mlx4_map_phys_fmr(dev->dev, &ifmr->mfmr, page_list, npages, iova,
				 &ifmr->ibfmr.lkey, &ifmr->ibfmr.rkey);
}

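/* Unmap a list of FMRs, which must all belong to the same device, and
 * issue a SYNC_TPT command so the HCA picks up the MPT changes.
 */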
int mlx4_ib_unmap_fmr(struct list_head *fmr_list)
{
	struct ib_fmr *ibfmr;
	int err;
	struct mlx4_dev *mdev = NULL;

	list_for_each_entry(ibfmr, fmr_list, list) {
		if (mdev && to_mdev(ibfmr->device)->dev != mdev)
			return -EINVAL;
		mdev = to_mdev(ibfmr->device)->dev;
	}

	if (!mdev)
		return 0;

	list_for_each_entry(ibfmr, fmr_list, list) {
		struct mlx4_ib_fmr *ifmr = to_mfmr(ibfmr);

		mlx4_fmr_unmap(mdev, &ifmr->mfmr, &ifmr->ibfmr.lkey, &ifmr->ibfmr.rkey);
	}

	/*
	 * Make sure all MPT status updates are visible before issuing
	 * SYNC_TPT firmware command.
	 */
	wmb();

	err = mlx4_SYNC_TPT(mdev);
	if (err)
		pr_warn("SYNC_TPT error %d when unmapping FMRs\n", err);

	return 0;
}

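/* Free the HW FMR; the wrapper is only freed when the HW free succeeds. */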
int mlx4_ib_fmr_dealloc(struct ib_fmr *ibfmr)
{
	struct mlx4_ib_fmr *ifmr = to_mfmr(ibfmr);
	struct mlx4_ib_dev *dev = to_mdev(ibfmr->device);
	int err;

	err = mlx4_fmr_free(dev->dev, &ifmr->mfmr);

	if (!err)
		kfree(ifmr);

	return err;
}