/* $FreeBSD: releng/11.0/sys/dev/iser/iser_memory.c 300723 2016-05-26 09:49:29Z trasz $ */
/*-
 * Copyright (c) 2015, Mellanox Technologies, Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "icl_iser.h"

static struct fast_reg_descriptor *
iser_reg_desc_get(struct ib_conn *ib_conn)
{
	struct fast_reg_descriptor *desc;

	mtx_lock(&ib_conn->lock);
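	/*
	 * The pool is assumed to be sized for the connection's maximum
	 * number of outstanding commands and therefore non-empty here;
	 * list_first_entry() does not handle an empty list.
	 */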
	desc = list_first_entry(&ib_conn->fastreg.pool,
				struct fast_reg_descriptor, list);
	list_del(&desc->list);
	mtx_unlock(&ib_conn->lock);

	return (desc);
}

static void
iser_reg_desc_put(struct ib_conn *ib_conn,
		  struct fast_reg_descriptor *desc)
{
	mtx_lock(&ib_conn->lock);
	list_add(&desc->list, &ib_conn->fastreg.pool);
	mtx_unlock(&ib_conn->lock);
}

#define IS_4K_ALIGNED(addr)	((((unsigned long)addr) & ~MASK_4K) == 0)
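
/*
 * IS_4K_ALIGNED() relies on MASK_4K (from icl_iser.h) keeping the bits
 * above the 4K page offset; ~MASK_4K therefore isolates the in-page
 * offset, which must be zero for an aligned address.
 */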

/**
 * iser_sg_to_page_vec - Translates scatterlist entries to physical addresses
 * and returns the length of the resulting physical address array (which may
 * be shorter than the original due to compaction).
 *
 * We build a "page vec" under the assumption that the SG meets the RDMA
 * alignment requirements. Other than the first and last SG elements, all
 * the "internal" elements can be compacted into a list whose elements are
 * the DMA addresses of physical pages. The code also supports the odd case
 * where several fragments of the same page appear in the SG as consecutive
 * elements. A single-entry SG is handled as well.
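 *
 * For example (illustrative addresses): the SG list
 * [addr 0x1200, len 0x2e00], [addr 0x4000, len 0x1000] compacts into the
 * page vector { 0x1000, 0x2000, 0x3000, 0x4000 }, with *offset = 0x200
 * and *data_size = 0x3e00.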
 */
static int
iser_sg_to_page_vec(struct iser_data_buf *data,
		    struct ib_device *ibdev, u64 *pages,
		    int *offset, int *data_size)
{
	struct scatterlist *sg, *sgl = data->sgl;
	u64 start_addr, end_addr, page, chunk_start = 0;
	unsigned long total_sz = 0;
	unsigned int dma_len;
	int i, new_chunk, cur_page, last_ent = data->dma_nents - 1;

	/* compute the offset of first element */
	*offset = (u64) sgl[0].offset & ~MASK_4K;

	new_chunk = 1;
	cur_page  = 0;
	for_each_sg(sgl, sg, data->dma_nents, i) {
		start_addr = ib_sg_dma_address(ibdev, sg);
		if (new_chunk)
			chunk_start = start_addr;
		dma_len = ib_sg_dma_len(ibdev, sg);
		end_addr = start_addr + dma_len;
		total_sz += dma_len;

		/* collect page fragments until aligned or end of SG list */
		if (!IS_4K_ALIGNED(end_addr) && i < last_ent) {
			new_chunk = 0;
			continue;
		}
		new_chunk = 1;

		/* address of the first page in the contiguous chunk;
		   masking relevant for the very first SG entry,
		   which might be unaligned */
		page = chunk_start & MASK_4K;
		do {
			pages[cur_page++] = page;
			page += SIZE_4K;
		} while (page < end_addr);
	}

	*data_size = total_sz;

	return (cur_page);
}

/**
 * iser_data_buf_aligned_len - Tries to determine the maximal sub-list of a
 * scatter-gather list of memory buffers that is correctly aligned for RDMA,
 * and returns the number of correctly aligned entries. Supports the case
 * where consecutive SG elements are actually fragments of the same physical
 * page.
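 *
 * For example (illustrative addresses): the list [0x10000, len 0x1000],
 * [0x11000, len 0x1000], [0x12000, len 0x1000] is fully contiguous and
 * yields 3; if the second entry instead started at 0x11800, the scan would
 * stop after the first entry and yield 1.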
 */
static int
iser_data_buf_aligned_len(struct iser_data_buf *data, struct ib_device *ibdev)
{
	struct scatterlist *sg, *sgl, *next_sg = NULL;
	u64 start_addr, end_addr;
	int i, ret_len, start_check = 0;

	if (data->dma_nents == 1)
		return (1);

	sgl = data->sgl;
	start_addr  = ib_sg_dma_address(ibdev, sgl);

	for_each_sg(sgl, sg, data->dma_nents, i) {
		if (start_check && !IS_4K_ALIGNED(start_addr))
			break;

		next_sg = sg_next(sg);
		if (!next_sg)
			break;

		end_addr    = start_addr + ib_sg_dma_len(ibdev, sg);
		start_addr  = ib_sg_dma_address(ibdev, next_sg);

		if (end_addr == start_addr) {
			start_check = 0;
			continue;
		} else
			start_check = 1;

		if (!IS_4K_ALIGNED(end_addr))
			break;
	}
	ret_len = (next_sg) ? i : i + 1;

	return (ret_len);
}

void
iser_dma_unmap_task_data(struct icl_iser_pdu *iser_pdu,
			 struct iser_data_buf *data,
			 enum dma_data_direction dir)
{
	struct ib_device *dev;

	dev = iser_pdu->iser_conn->ib_conn.device->ib_device;
	ib_dma_unmap_sg(dev, data->sgl, data->size, dir);
}

static int
iser_reg_dma(struct iser_device *device, struct iser_data_buf *mem,
	     struct iser_mem_reg *reg)
{
	struct scatterlist *sg = mem->sgl;
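	/*
	 * device->mr is the device's global DMA MR, which already covers
	 * the mapped buffer, so a single-entry SG list can use its
	 * lkey/rkey directly and no fast registration is needed.
	 */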

	reg->sge.lkey = device->mr->lkey;
	reg->rkey = device->mr->rkey;
	reg->sge.length = ib_sg_dma_len(device->ib_device, &sg[0]);
	reg->sge.addr = ib_sg_dma_address(device->ib_device, &sg[0]);

	return (0);
}

/**
 * TODO: This should be a verb
 * iser_ib_inc_rkey - increments the key portion of the given rkey. Can be used
 * for calculating a new rkey for type 2 memory windows.
 * @rkey - the rkey to increment.
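 *
 * For example (illustrative values): 0x12345607 becomes 0x12345608, while
 * 0x123456ff wraps to 0x12345600; only the low "key" byte ever changes.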
 */
static inline u32
iser_ib_inc_rkey(u32 rkey)
{
	const u32 mask = 0x000000ff;

	return (((rkey + 1) & mask) | (rkey & ~mask));
}

static void
iser_inv_rkey(struct ib_send_wr *inv_wr, struct ib_mr *mr)
{
	u32 rkey;

	memset(inv_wr, 0, sizeof(*inv_wr));
	inv_wr->opcode = IB_WR_LOCAL_INV;
	inv_wr->wr_id = ISER_FASTREG_LI_WRID;
	inv_wr->ex.invalidate_rkey = mr->rkey;

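	/*
	 * Bump the key portion so that the MR's next fast registration
	 * produces an rkey distinct from the one being invalidated.
	 */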
	rkey = iser_ib_inc_rkey(mr->rkey);
	ib_update_fast_reg_key(mr, rkey);
}

static int
iser_fast_reg_mr(struct icl_iser_pdu *iser_pdu,
		 struct iser_data_buf *mem,
		 struct iser_reg_resources *rsc,
		 struct iser_mem_reg *reg)
{
	struct ib_conn *ib_conn = &iser_pdu->iser_conn->ib_conn;
	struct iser_device *device = ib_conn->device;
	struct ib_send_wr fastreg_wr, inv_wr;
	struct ib_send_wr *bad_wr, *wr = NULL;
	int ret, offset, size, plen;

	/* if there is a single DMA entry, the DMA MR suffices */
	if (mem->dma_nents == 1)
		return (iser_reg_dma(device, mem, reg));

	/* rsc is not null */
	plen = iser_sg_to_page_vec(mem, device->ib_device,
				   rsc->frpl->page_list,
				   &offset, &size);
	if (plen * SIZE_4K < size) {
		ISER_ERR("fast reg page_list too short to hold this SG");
		return (EINVAL);
	}

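	/*
	 * mr_valid is set while the MR carries no live registration; it is
	 * cleared after each successful post below, so a reused MR gets a
	 * local invalidate chained in front of its fast registration.
	 */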
	if (!rsc->mr_valid) {
		iser_inv_rkey(&inv_wr, rsc->mr);
		wr = &inv_wr;
	}

	/* Prepare FASTREG WR */
	memset(&fastreg_wr, 0, sizeof(fastreg_wr));
	fastreg_wr.wr_id = ISER_FASTREG_LI_WRID;
	fastreg_wr.opcode = IB_WR_FAST_REG_MR;
	fastreg_wr.wr.fast_reg.iova_start = rsc->frpl->page_list[0] + offset;
	fastreg_wr.wr.fast_reg.page_list = rsc->frpl;
	fastreg_wr.wr.fast_reg.page_list_len = plen;
	fastreg_wr.wr.fast_reg.page_shift = SHIFT_4K;
	fastreg_wr.wr.fast_reg.length = size;
	fastreg_wr.wr.fast_reg.rkey = rsc->mr->rkey;
	fastreg_wr.wr.fast_reg.access_flags = (IB_ACCESS_LOCAL_WRITE  |
					       IB_ACCESS_REMOTE_WRITE |
					       IB_ACCESS_REMOTE_READ);

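	/* Chain the invalidate WR (if any) ahead of the FASTREG WR so that
	   both are posted with a single ib_post_send() call. */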
	if (!wr)
		wr = &fastreg_wr;
	else
		wr->next = &fastreg_wr;

	ret = ib_post_send(ib_conn->qp, wr, &bad_wr);
	if (ret) {
		ISER_ERR("fast registration failed, ret:%d", ret);
		return (ret);
	}
	rsc->mr_valid = 0;

	reg->sge.lkey = rsc->mr->lkey;
	reg->rkey = rsc->mr->rkey;
	reg->sge.addr = rsc->frpl->page_list[0] + offset;
	reg->sge.length = size;

	return (ret);
}

/**
 * iser_reg_rdma_mem - Registers memory intended for RDMA, using a Fast
 * Registration WR (if possible), obtaining an rkey and virtual address.
 *
 * returns 0 on success, errno code on failure
 */
int
iser_reg_rdma_mem(struct icl_iser_pdu *iser_pdu,
		  enum iser_data_dir cmd_dir)
{
	struct ib_conn *ib_conn = &iser_pdu->iser_conn->ib_conn;
	struct iser_device   *device = ib_conn->device;
	struct ib_device     *ibdev = device->ib_device;
	struct iser_data_buf *mem = &iser_pdu->data[cmd_dir];
	struct iser_mem_reg *mem_reg = &iser_pdu->rdma_reg[cmd_dir];
	struct fast_reg_descriptor *desc = NULL;
	int err, aligned_len;

	aligned_len = iser_data_buf_aligned_len(mem, ibdev);
	if (aligned_len != mem->dma_nents) {
		ISER_ERR("bounce buffer is not supported");
		return (ENOTSUP);
	}

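	/*
	 * A single-entry SG is served by the global DMA MR inside
	 * iser_fast_reg_mr(), so no fast registration descriptor is
	 * needed for it.
	 */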
	if (mem->dma_nents != 1) {
		desc = iser_reg_desc_get(ib_conn);
		mem_reg->mem_h = desc;
	}

	err = iser_fast_reg_mr(iser_pdu, mem, desc ? &desc->rsc : NULL,
			       mem_reg);
	if (err)
		goto err_reg;

	return (0);

err_reg:
	if (desc)
		iser_reg_desc_put(ib_conn, desc);

	return (err);
}

void
iser_unreg_rdma_mem(struct icl_iser_pdu *iser_pdu,
		    enum iser_data_dir cmd_dir)
{
	struct iser_mem_reg *reg = &iser_pdu->rdma_reg[cmd_dir];

	if (!reg->mem_h)
		return;

	iser_reg_desc_put(&iser_pdu->iser_conn->ib_conn,
			  reg->mem_h);
	reg->mem_h = NULL;
}

int
iser_dma_map_task_data(struct icl_iser_pdu *iser_pdu,
		       struct iser_data_buf *data,
		       enum iser_data_dir iser_dir,
		       enum dma_data_direction dma_dir)
{
	struct ib_device *dev;

	iser_pdu->dir[iser_dir] = 1;
	dev = iser_pdu->iser_conn->ib_conn.device->ib_device;

	data->dma_nents = ib_dma_map_sg(dev, data->sgl, data->size, dma_dir);
	if (data->dma_nents == 0) {
		ISER_ERR("dma_map_sg failed");
		return (EINVAL);
	}

	return (0);
}