1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Shared Memory Communications over RDMA (SMC-R) and RoCE
4 *
5 * Work Requests exploiting Infiniband API
6 *
7 * Work requests (WR) of type ib_post_send or ib_post_recv respectively
8 * are submitted to either RC SQ or RC RQ respectively
9 * (reliably connected send/receive queue)
10 * and become work queue entries (WQEs).
11 * While an SQ WR/WQE is pending, we track it until transmission completion.
12 * Through a send or receive completion queue (CQ) respectively,
13 * we get completion queue entries (CQEs) [aka work completions (WCs)].
14 * Since the CQ callback is called from IRQ context, we split work by using
15 * bottom halves implemented by tasklets.
16 *
17 * SMC uses this to exchange LLC (link layer control)
18 * and CDC (connection data control) messages.
19 *
20 * Copyright IBM Corp. 2016
21 *
22 * Author(s):  Steffen Maier <maier@linux.vnet.ibm.com>
23 */
24
25#include <linux/atomic.h>
26#include <linux/hashtable.h>
27#include <linux/wait.h>
28#include <rdma/ib_verbs.h>
29#include <asm/div64.h>
30
31#include "smc.h"
32#include "smc_wr.h"
33
#define SMC_WR_MAX_POLL_CQE 10	/* max. # of compl. queue elements in 1 poll */

/* The rx handler table is keyed by the message type of the registered
 * handler and has 2^SMC_WR_RX_HASH_BITS buckets.
 */
#define SMC_WR_RX_HASH_BITS 4
static DEFINE_HASHTABLE(smc_wr_rx_hash, SMC_WR_RX_HASH_BITS);
/* taken by smc_wr_rx_register_handler() while it modifies smc_wr_rx_hash */
static DEFINE_SPINLOCK(smc_wr_rx_hash_lock);
39
struct smc_wr_tx_pend {	/* control data for a pending send request */
	u64			wr_id;		/* work request id sent */
	smc_wr_tx_handler	handler;	/* called on completion, if set */
	enum ib_wc_status	wc_status;	/* CQE status */
	struct smc_link		*link;		/* link the request was set up for */
	u32			idx;		/* tx slot index; equals the
						 * link's wr_tx_cnt for the
						 * SMC-Rv2 slot
						 */
	struct smc_wr_tx_pend_priv priv;	/* context handed to handler */
	u8			compl_requested; /* if set, the CQE handler
						  * completes wr_tx_compl[idx]
						  */
};
49
50/******************************** send queue *********************************/
51
52/*------------------------------- completion --------------------------------*/
53
54/* returns true if at least one tx work request is pending on the given link */
55static inline bool smc_wr_is_tx_pend(struct smc_link *link)
56{
57	return !bitmap_empty(link->wr_tx_mask, link->wr_tx_cnt);
58}
59
60/* wait till all pending tx work requests on the given link are completed */
61void smc_wr_tx_wait_no_pending_sends(struct smc_link *link)
62{
63	wait_event(link->wr_tx_wait, !smc_wr_is_tx_pend(link));
64}
65
66static inline int smc_wr_tx_find_pending_index(struct smc_link *link, u64 wr_id)
67{
68	u32 i;
69
70	for (i = 0; i < link->wr_tx_cnt; i++) {
71		if (link->wr_tx_pends[i].wr_id == wr_id)
72			return i;
73	}
74	return link->wr_tx_cnt;
75}
76
/* Handle one completion queue entry of the send CQ.
 *
 * Memory registration completions only record the outcome in wr_reg_state
 * and wake the waiter. For send completions the matching pending entry
 * (a regular tx slot or the SMC-Rv2 slot) is copied, the slot is cleared
 * and released, and finally the handler stored in the copy is called and
 * waiters for a free slot are woken.
 */
static inline void smc_wr_tx_process_cqe(struct ib_wc *wc)
{
	struct smc_wr_tx_pend pnd_snd;
	struct smc_link *link;
	u32 pnd_snd_idx;

	link = wc->qp->qp_context;

	if (wc->opcode == IB_WC_REG_MR) {
		/* result is picked up by smc_wr_reg_send() */
		if (wc->status)
			link->wr_reg_state = FAILED;
		else
			link->wr_reg_state = CONFIRMED;
		smc_wr_wakeup_reg_wait(link);
		return;
	}

	pnd_snd_idx = smc_wr_tx_find_pending_index(link, wc->wr_id);
	if (pnd_snd_idx == link->wr_tx_cnt) {
		/* no regular slot matches, only the v2 slot is left to check;
		 * a CQE matching neither is ignored
		 */
		if (link->lgr->smc_version != SMC_V2 ||
		    link->wr_tx_v2_pend->wr_id != wc->wr_id)
			return;
		link->wr_tx_v2_pend->wc_status = wc->status;
		/* keep a private copy, the slot itself is wiped right away */
		memcpy(&pnd_snd, link->wr_tx_v2_pend, sizeof(pnd_snd));
		/* clear the full struct smc_wr_tx_pend including .priv */
		memset(link->wr_tx_v2_pend, 0,
		       sizeof(*link->wr_tx_v2_pend));
		memset(link->lgr->wr_tx_buf_v2, 0,
		       sizeof(*link->lgr->wr_tx_buf_v2));
	} else {
		link->wr_tx_pends[pnd_snd_idx].wc_status = wc->status;
		/* wake the sender waiting in smc_wr_tx_send_wait() */
		if (link->wr_tx_pends[pnd_snd_idx].compl_requested)
			complete(&link->wr_tx_compl[pnd_snd_idx]);
		/* keep a private copy, the slot itself is wiped right away */
		memcpy(&pnd_snd, &link->wr_tx_pends[pnd_snd_idx],
		       sizeof(pnd_snd));
		/* clear the full struct smc_wr_tx_pend including .priv */
		memset(&link->wr_tx_pends[pnd_snd_idx], 0,
		       sizeof(link->wr_tx_pends[pnd_snd_idx]));
		memset(&link->wr_tx_bufs[pnd_snd_idx], 0,
		       sizeof(link->wr_tx_bufs[pnd_snd_idx]));
		/* slot bit was already clear: skip handler and wake-up */
		if (!test_and_clear_bit(pnd_snd_idx, link->wr_tx_mask))
			return;
	}

	if (wc->status) {
		/* failed send: also reset the v2 slot before taking the
		 * link down
		 */
		if (link->lgr->smc_version == SMC_V2) {
			memset(link->wr_tx_v2_pend, 0,
			       sizeof(*link->wr_tx_v2_pend));
			memset(link->lgr->wr_tx_buf_v2, 0,
			       sizeof(*link->lgr->wr_tx_buf_v2));
		}
		/* terminate link */
		smcr_link_down_cond_sched(link);
	}
	/* the handler gets the copied priv data, not the (cleared) slot */
	if (pnd_snd.handler)
		pnd_snd.handler(&pnd_snd.priv, link, wc->status);
	wake_up(&link->wr_tx_wait);
}
135
136static void smc_wr_tx_tasklet_fn(struct tasklet_struct *t)
137{
138	struct smc_ib_device *dev = from_tasklet(dev, t, send_tasklet);
139	struct ib_wc wc[SMC_WR_MAX_POLL_CQE];
140	int i = 0, rc;
141	int polled = 0;
142
143again:
144	polled++;
145	do {
146		memset(&wc, 0, sizeof(wc));
147		rc = ib_poll_cq(dev->roce_cq_send, SMC_WR_MAX_POLL_CQE, wc);
148		if (polled == 1) {
149			ib_req_notify_cq(dev->roce_cq_send,
150					 IB_CQ_NEXT_COMP |
151					 IB_CQ_REPORT_MISSED_EVENTS);
152		}
153		if (!rc)
154			break;
155		for (i = 0; i < rc; i++)
156			smc_wr_tx_process_cqe(&wc[i]);
157	} while (rc > 0);
158	if (polled == 1)
159		goto again;
160}
161
162void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context)
163{
164	struct smc_ib_device *dev = (struct smc_ib_device *)cq_context;
165
166	tasklet_schedule(&dev->send_tasklet);
167}
168
169/*---------------------------- request submission ---------------------------*/
170
171static inline int smc_wr_tx_get_free_slot_index(struct smc_link *link, u32 *idx)
172{
173	*idx = link->wr_tx_cnt;
174	if (!smc_link_sendable(link))
175		return -ENOLINK;
176	for_each_clear_bit(*idx, link->wr_tx_mask, link->wr_tx_cnt) {
177		if (!test_and_set_bit(*idx, link->wr_tx_mask))
178			return 0;
179	}
180	*idx = link->wr_tx_cnt;
181	return -EBUSY;
182}
183
/**
 * smc_wr_tx_get_free_slot() - returns buffer for message assembly,
 *			and sets info for pending transmit tracking
 * @link:		Pointer to smc_link used to later send the message.
 * @handler:		Send completion handler function pointer.
 * @wr_buf:		Out value returns pointer to message buffer.
 * @wr_rdma_buf:	Out value returns pointer to rdma work request.
 *			May be NULL if the caller does not need it.
 * @wr_pend_priv:	Out value returns pointer serving as handler context.
 *
 * In softirq context or while the link group is terminating only a single,
 * non-sleeping attempt to claim a slot is made. Otherwise the caller sleeps
 * interruptibly, for at most SMC_WR_TX_WAIT_FREE_SLOT_TIME, until a slot
 * could be claimed, the link is no longer sendable or the link group
 * terminates.
 *
 * Return: 0 on success, or -errno on error.
 */
int smc_wr_tx_get_free_slot(struct smc_link *link,
			    smc_wr_tx_handler handler,
			    struct smc_wr_buf **wr_buf,
			    struct smc_rdma_wr **wr_rdma_buf,
			    struct smc_wr_tx_pend_priv **wr_pend_priv)
{
	struct smc_link_group *lgr = smc_get_lgr(link);
	struct smc_wr_tx_pend *wr_pend;
	u32 idx = link->wr_tx_cnt;	/* wr_tx_cnt means "no slot claimed" */
	struct ib_send_wr *wr_ib;
	u64 wr_id;
	int rc;

	*wr_buf = NULL;
	*wr_pend_priv = NULL;
	if (in_softirq() || lgr->terminating) {
		/* must not sleep here: one attempt only */
		rc = smc_wr_tx_get_free_slot_index(link, &idx);
		if (rc)
			return rc;
	} else {
		/* the condition itself claims the slot as a side effect */
		rc = wait_event_interruptible_timeout(
			link->wr_tx_wait,
			!smc_link_sendable(link) ||
			lgr->terminating ||
			(smc_wr_tx_get_free_slot_index(link, &idx) != -EBUSY),
			SMC_WR_TX_WAIT_FREE_SLOT_TIME);
		if (!rc) {
			/* timeout - terminate link */
			smcr_link_down_cond_sched(link);
			return -EPIPE;
		}
		/* woken up without having claimed a slot */
		if (idx == link->wr_tx_cnt)
			return -EPIPE;
	}
	/* record what the completion handler needs to find the request */
	wr_id = smc_wr_tx_get_next_wr_id(link);
	wr_pend = &link->wr_tx_pends[idx];
	wr_pend->wr_id = wr_id;
	wr_pend->handler = handler;
	wr_pend->link = link;
	wr_pend->idx = idx;
	wr_ib = &link->wr_tx_ibs[idx];
	wr_ib->wr_id = wr_id;
	*wr_buf = &link->wr_tx_bufs[idx];
	if (wr_rdma_buf)
		*wr_rdma_buf = &link->wr_tx_rdmas[idx];
	*wr_pend_priv = &wr_pend->priv;
	return 0;
}
243
244int smc_wr_tx_get_v2_slot(struct smc_link *link,
245			  smc_wr_tx_handler handler,
246			  struct smc_wr_v2_buf **wr_buf,
247			  struct smc_wr_tx_pend_priv **wr_pend_priv)
248{
249	struct smc_wr_tx_pend *wr_pend;
250	struct ib_send_wr *wr_ib;
251	u64 wr_id;
252
253	if (link->wr_tx_v2_pend->idx == link->wr_tx_cnt)
254		return -EBUSY;
255
256	*wr_buf = NULL;
257	*wr_pend_priv = NULL;
258	wr_id = smc_wr_tx_get_next_wr_id(link);
259	wr_pend = link->wr_tx_v2_pend;
260	wr_pend->wr_id = wr_id;
261	wr_pend->handler = handler;
262	wr_pend->link = link;
263	wr_pend->idx = link->wr_tx_cnt;
264	wr_ib = link->wr_tx_v2_ib;
265	wr_ib->wr_id = wr_id;
266	*wr_buf = link->lgr->wr_tx_buf_v2;
267	*wr_pend_priv = &wr_pend->priv;
268	return 0;
269}
270
271int smc_wr_tx_put_slot(struct smc_link *link,
272		       struct smc_wr_tx_pend_priv *wr_pend_priv)
273{
274	struct smc_wr_tx_pend *pend;
275
276	pend = container_of(wr_pend_priv, struct smc_wr_tx_pend, priv);
277	if (pend->idx < link->wr_tx_cnt) {
278		u32 idx = pend->idx;
279
280		/* clear the full struct smc_wr_tx_pend including .priv */
281		memset(&link->wr_tx_pends[idx], 0,
282		       sizeof(link->wr_tx_pends[idx]));
283		memset(&link->wr_tx_bufs[idx], 0,
284		       sizeof(link->wr_tx_bufs[idx]));
285		test_and_clear_bit(idx, link->wr_tx_mask);
286		wake_up(&link->wr_tx_wait);
287		return 1;
288	} else if (link->lgr->smc_version == SMC_V2 &&
289		   pend->idx == link->wr_tx_cnt) {
290		/* Large v2 buffer */
291		memset(&link->wr_tx_v2_pend, 0,
292		       sizeof(link->wr_tx_v2_pend));
293		memset(&link->lgr->wr_tx_buf_v2, 0,
294		       sizeof(link->lgr->wr_tx_buf_v2));
295		return 1;
296	}
297
298	return 0;
299}
300
301/* Send prepared WR slot via ib_post_send.
302 * @priv: pointer to smc_wr_tx_pend_priv identifying prepared message buffer
303 */
304int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv)
305{
306	struct smc_wr_tx_pend *pend;
307	int rc;
308
309	ib_req_notify_cq(link->smcibdev->roce_cq_send,
310			 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
311	pend = container_of(priv, struct smc_wr_tx_pend, priv);
312	rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx], NULL);
313	if (rc) {
314		smc_wr_tx_put_slot(link, priv);
315		smcr_link_down_cond_sched(link);
316	}
317	return rc;
318}
319
320int smc_wr_tx_v2_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv,
321		      int len)
322{
323	int rc;
324
325	link->wr_tx_v2_ib->sg_list[0].length = len;
326	ib_req_notify_cq(link->smcibdev->roce_cq_send,
327			 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
328	rc = ib_post_send(link->roce_qp, link->wr_tx_v2_ib, NULL);
329	if (rc) {
330		smc_wr_tx_put_slot(link, priv);
331		smcr_link_down_cond_sched(link);
332	}
333	return rc;
334}
335
336/* Send prepared WR slot via ib_post_send and wait for send completion
337 * notification.
338 * @priv: pointer to smc_wr_tx_pend_priv identifying prepared message buffer
339 */
340int smc_wr_tx_send_wait(struct smc_link *link, struct smc_wr_tx_pend_priv *priv,
341			unsigned long timeout)
342{
343	struct smc_wr_tx_pend *pend;
344	u32 pnd_idx;
345	int rc;
346
347	pend = container_of(priv, struct smc_wr_tx_pend, priv);
348	pend->compl_requested = 1;
349	pnd_idx = pend->idx;
350	init_completion(&link->wr_tx_compl[pnd_idx]);
351
352	rc = smc_wr_tx_send(link, priv);
353	if (rc)
354		return rc;
355	/* wait for completion by smc_wr_tx_process_cqe() */
356	rc = wait_for_completion_interruptible_timeout(
357					&link->wr_tx_compl[pnd_idx], timeout);
358	if (rc <= 0)
359		rc = -ENODATA;
360	if (rc > 0)
361		rc = 0;
362	return rc;
363}
364
365/* Register a memory region and wait for result. */
366int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr)
367{
368	int rc;
369
370	ib_req_notify_cq(link->smcibdev->roce_cq_send,
371			 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
372	link->wr_reg_state = POSTED;
373	link->wr_reg.wr.wr_id = (u64)(uintptr_t)mr;
374	link->wr_reg.mr = mr;
375	link->wr_reg.key = mr->rkey;
376	rc = ib_post_send(link->roce_qp, &link->wr_reg.wr, NULL);
377	if (rc)
378		return rc;
379
380	percpu_ref_get(&link->wr_reg_refs);
381	rc = wait_event_interruptible_timeout(link->wr_reg_wait,
382					      (link->wr_reg_state != POSTED),
383					      SMC_WR_REG_MR_WAIT_TIME);
384	percpu_ref_put(&link->wr_reg_refs);
385	if (!rc) {
386		/* timeout - terminate link */
387		smcr_link_down_cond_sched(link);
388		return -EPIPE;
389	}
390	if (rc == -ERESTARTSYS)
391		return -EINTR;
392	switch (link->wr_reg_state) {
393	case CONFIRMED:
394		rc = 0;
395		break;
396	case FAILED:
397		rc = -EIO;
398		break;
399	case POSTED:
400		rc = -EPIPE;
401		break;
402	}
403	return rc;
404}
405
406/****************************** receive queue ********************************/
407
408int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler)
409{
410	struct smc_wr_rx_handler *h_iter;
411	int rc = 0;
412
413	spin_lock(&smc_wr_rx_hash_lock);
414	hash_for_each_possible(smc_wr_rx_hash, h_iter, list, handler->type) {
415		if (h_iter->type == handler->type) {
416			rc = -EEXIST;
417			goto out_unlock;
418		}
419	}
420	hash_add(smc_wr_rx_hash, &handler->list, handler->type);
421out_unlock:
422	spin_unlock(&smc_wr_rx_hash_lock);
423	return rc;
424}
425
426/* Demultiplex a received work request based on the message type to its handler.
427 * Relies on smc_wr_rx_hash having been completely filled before any IB WRs,
428 * and not being modified any more afterwards so we don't need to lock it.
429 */
430static inline void smc_wr_rx_demultiplex(struct ib_wc *wc)
431{
432	struct smc_link *link = (struct smc_link *)wc->qp->qp_context;
433	struct smc_wr_rx_handler *handler;
434	struct smc_wr_rx_hdr *wr_rx;
435	u64 temp_wr_id;
436	u32 index;
437
438	if (wc->byte_len < sizeof(*wr_rx))
439		return; /* short message */
440	temp_wr_id = wc->wr_id;
441	index = do_div(temp_wr_id, link->wr_rx_cnt);
442	wr_rx = (struct smc_wr_rx_hdr *)&link->wr_rx_bufs[index];
443	hash_for_each_possible(smc_wr_rx_hash, handler, list, wr_rx->type) {
444		if (handler->type == wr_rx->type)
445			handler->handler(wc, wr_rx);
446	}
447}
448
449static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
450{
451	struct smc_link *link;
452	int i;
453
454	for (i = 0; i < num; i++) {
455		link = wc[i].qp->qp_context;
456		link->wr_rx_id_compl = wc[i].wr_id;
457		if (wc[i].status == IB_WC_SUCCESS) {
458			link->wr_rx_tstamp = jiffies;
459			smc_wr_rx_demultiplex(&wc[i]);
460			smc_wr_rx_post(link); /* refill WR RX */
461		} else {
462			/* handle status errors */
463			switch (wc[i].status) {
464			case IB_WC_RETRY_EXC_ERR:
465			case IB_WC_RNR_RETRY_EXC_ERR:
466			case IB_WC_WR_FLUSH_ERR:
467				smcr_link_down_cond_sched(link);
468				if (link->wr_rx_id_compl == link->wr_rx_id)
469					wake_up(&link->wr_rx_empty_wait);
470				break;
471			default:
472				smc_wr_rx_post(link); /* refill WR RX */
473				break;
474			}
475		}
476	}
477}
478
479static void smc_wr_rx_tasklet_fn(struct tasklet_struct *t)
480{
481	struct smc_ib_device *dev = from_tasklet(dev, t, recv_tasklet);
482	struct ib_wc wc[SMC_WR_MAX_POLL_CQE];
483	int polled = 0;
484	int rc;
485
486again:
487	polled++;
488	do {
489		memset(&wc, 0, sizeof(wc));
490		rc = ib_poll_cq(dev->roce_cq_recv, SMC_WR_MAX_POLL_CQE, wc);
491		if (polled == 1) {
492			ib_req_notify_cq(dev->roce_cq_recv,
493					 IB_CQ_SOLICITED_MASK
494					 | IB_CQ_REPORT_MISSED_EVENTS);
495		}
496		if (!rc)
497			break;
498		smc_wr_rx_process_cqes(&wc[0], rc);
499	} while (rc > 0);
500	if (polled == 1)
501		goto again;
502}
503
504void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context)
505{
506	struct smc_ib_device *dev = (struct smc_ib_device *)cq_context;
507
508	tasklet_schedule(&dev->recv_tasklet);
509}
510
511int smc_wr_rx_post_init(struct smc_link *link)
512{
513	u32 i;
514	int rc = 0;
515
516	for (i = 0; i < link->wr_rx_cnt; i++)
517		rc = smc_wr_rx_post(link);
518	return rc;
519}
520
521/***************************** init, exit, misc ******************************/
522
523void smc_wr_remember_qp_attr(struct smc_link *lnk)
524{
525	struct ib_qp_attr *attr = &lnk->qp_attr;
526	struct ib_qp_init_attr init_attr;
527
528	memset(attr, 0, sizeof(*attr));
529	memset(&init_attr, 0, sizeof(init_attr));
530	ib_query_qp(lnk->roce_qp, attr,
531		    IB_QP_STATE |
532		    IB_QP_CUR_STATE |
533		    IB_QP_PKEY_INDEX |
534		    IB_QP_PORT |
535		    IB_QP_QKEY |
536		    IB_QP_AV |
537		    IB_QP_PATH_MTU |
538		    IB_QP_TIMEOUT |
539		    IB_QP_RETRY_CNT |
540		    IB_QP_RNR_RETRY |
541		    IB_QP_RQ_PSN |
542		    IB_QP_ALT_PATH |
543		    IB_QP_MIN_RNR_TIMER |
544		    IB_QP_SQ_PSN |
545		    IB_QP_PATH_MIG_STATE |
546		    IB_QP_CAP |
547		    IB_QP_DEST_QPN,
548		    &init_attr);
549
550	lnk->wr_tx_cnt = min_t(size_t, SMC_WR_BUF_CNT,
551			       lnk->qp_attr.cap.max_send_wr);
552	lnk->wr_rx_cnt = min_t(size_t, SMC_WR_BUF_CNT * 3,
553			       lnk->qp_attr.cap.max_recv_wr);
554}
555
556static void smc_wr_init_sge(struct smc_link *lnk)
557{
558	int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1;
559	bool send_inline = (lnk->qp_attr.cap.max_inline_data > SMC_WR_TX_SIZE);
560	u32 i;
561
562	for (i = 0; i < lnk->wr_tx_cnt; i++) {
563		lnk->wr_tx_sges[i].addr = send_inline ? (uintptr_t)(&lnk->wr_tx_bufs[i]) :
564			lnk->wr_tx_dma_addr + i * SMC_WR_BUF_SIZE;
565		lnk->wr_tx_sges[i].length = SMC_WR_TX_SIZE;
566		lnk->wr_tx_sges[i].lkey = lnk->roce_pd->local_dma_lkey;
567		lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge[0].lkey =
568			lnk->roce_pd->local_dma_lkey;
569		lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge[1].lkey =
570			lnk->roce_pd->local_dma_lkey;
571		lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge[0].lkey =
572			lnk->roce_pd->local_dma_lkey;
573		lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge[1].lkey =
574			lnk->roce_pd->local_dma_lkey;
575		lnk->wr_tx_ibs[i].next = NULL;
576		lnk->wr_tx_ibs[i].sg_list = &lnk->wr_tx_sges[i];
577		lnk->wr_tx_ibs[i].num_sge = 1;
578		lnk->wr_tx_ibs[i].opcode = IB_WR_SEND;
579		lnk->wr_tx_ibs[i].send_flags =
580			IB_SEND_SIGNALED | IB_SEND_SOLICITED;
581		if (send_inline)
582			lnk->wr_tx_ibs[i].send_flags |= IB_SEND_INLINE;
583		lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.opcode = IB_WR_RDMA_WRITE;
584		lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.opcode = IB_WR_RDMA_WRITE;
585		lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.sg_list =
586			lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge;
587		lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.sg_list =
588			lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge;
589	}
590
591	if (lnk->lgr->smc_version == SMC_V2) {
592		lnk->wr_tx_v2_sge->addr = lnk->wr_tx_v2_dma_addr;
593		lnk->wr_tx_v2_sge->length = SMC_WR_BUF_V2_SIZE;
594		lnk->wr_tx_v2_sge->lkey = lnk->roce_pd->local_dma_lkey;
595
596		lnk->wr_tx_v2_ib->next = NULL;
597		lnk->wr_tx_v2_ib->sg_list = lnk->wr_tx_v2_sge;
598		lnk->wr_tx_v2_ib->num_sge = 1;
599		lnk->wr_tx_v2_ib->opcode = IB_WR_SEND;
600		lnk->wr_tx_v2_ib->send_flags =
601			IB_SEND_SIGNALED | IB_SEND_SOLICITED;
602	}
603
604	/* With SMC-Rv2 there can be messages larger than SMC_WR_TX_SIZE.
605	 * Each ib_recv_wr gets 2 sges, the second one is a spillover buffer
606	 * and the same buffer for all sges. When a larger message arrived then
607	 * the content of the first small sge is copied to the beginning of
608	 * the larger spillover buffer, allowing easy data mapping.
609	 */
610	for (i = 0; i < lnk->wr_rx_cnt; i++) {
611		int x = i * sges_per_buf;
612
613		lnk->wr_rx_sges[x].addr =
614			lnk->wr_rx_dma_addr + i * SMC_WR_BUF_SIZE;
615		lnk->wr_rx_sges[x].length = SMC_WR_TX_SIZE;
616		lnk->wr_rx_sges[x].lkey = lnk->roce_pd->local_dma_lkey;
617		if (lnk->lgr->smc_version == SMC_V2) {
618			lnk->wr_rx_sges[x + 1].addr =
619					lnk->wr_rx_v2_dma_addr + SMC_WR_TX_SIZE;
620			lnk->wr_rx_sges[x + 1].length =
621					SMC_WR_BUF_V2_SIZE - SMC_WR_TX_SIZE;
622			lnk->wr_rx_sges[x + 1].lkey =
623					lnk->roce_pd->local_dma_lkey;
624		}
625		lnk->wr_rx_ibs[i].next = NULL;
626		lnk->wr_rx_ibs[i].sg_list = &lnk->wr_rx_sges[x];
627		lnk->wr_rx_ibs[i].num_sge = sges_per_buf;
628	}
629	lnk->wr_reg.wr.next = NULL;
630	lnk->wr_reg.wr.num_sge = 0;
631	lnk->wr_reg.wr.send_flags = IB_SEND_SIGNALED;
632	lnk->wr_reg.wr.opcode = IB_WR_REG_MR;
633	lnk->wr_reg.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE;
634}
635
636void smc_wr_free_link(struct smc_link *lnk)
637{
638	struct ib_device *ibdev;
639
640	if (!lnk->smcibdev)
641		return;
642	ibdev = lnk->smcibdev->ibdev;
643
644	smc_wr_drain_cq(lnk);
645	smc_wr_wakeup_reg_wait(lnk);
646	smc_wr_wakeup_tx_wait(lnk);
647
648	smc_wr_tx_wait_no_pending_sends(lnk);
649	percpu_ref_kill(&lnk->wr_reg_refs);
650	wait_for_completion(&lnk->reg_ref_comp);
651	percpu_ref_kill(&lnk->wr_tx_refs);
652	wait_for_completion(&lnk->tx_ref_comp);
653
654	if (lnk->wr_rx_dma_addr) {
655		ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr,
656				    SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
657				    DMA_FROM_DEVICE);
658		lnk->wr_rx_dma_addr = 0;
659	}
660	if (lnk->wr_rx_v2_dma_addr) {
661		ib_dma_unmap_single(ibdev, lnk->wr_rx_v2_dma_addr,
662				    SMC_WR_BUF_V2_SIZE,
663				    DMA_FROM_DEVICE);
664		lnk->wr_rx_v2_dma_addr = 0;
665	}
666	if (lnk->wr_tx_dma_addr) {
667		ib_dma_unmap_single(ibdev, lnk->wr_tx_dma_addr,
668				    SMC_WR_BUF_SIZE * lnk->wr_tx_cnt,
669				    DMA_TO_DEVICE);
670		lnk->wr_tx_dma_addr = 0;
671	}
672	if (lnk->wr_tx_v2_dma_addr) {
673		ib_dma_unmap_single(ibdev, lnk->wr_tx_v2_dma_addr,
674				    SMC_WR_BUF_V2_SIZE,
675				    DMA_TO_DEVICE);
676		lnk->wr_tx_v2_dma_addr = 0;
677	}
678}
679
680void smc_wr_free_lgr_mem(struct smc_link_group *lgr)
681{
682	if (lgr->smc_version < SMC_V2)
683		return;
684
685	kfree(lgr->wr_rx_buf_v2);
686	lgr->wr_rx_buf_v2 = NULL;
687	kfree(lgr->wr_tx_buf_v2);
688	lgr->wr_tx_buf_v2 = NULL;
689}
690
/* Free all work request related memory of a link and reset the pointers,
 * so that a repeated call does not free anything twice.
 */
void smc_wr_free_link_mem(struct smc_link *lnk)
{
	/* SMC-Rv2 send resources */
	kfree(lnk->wr_tx_v2_ib);
	lnk->wr_tx_v2_ib = NULL;
	kfree(lnk->wr_tx_v2_sge);
	lnk->wr_tx_v2_sge = NULL;
	kfree(lnk->wr_tx_v2_pend);
	lnk->wr_tx_v2_pend = NULL;
	/* tx slot tracking */
	kfree(lnk->wr_tx_compl);
	lnk->wr_tx_compl = NULL;
	kfree(lnk->wr_tx_pends);
	lnk->wr_tx_pends = NULL;
	bitmap_free(lnk->wr_tx_mask);
	lnk->wr_tx_mask = NULL;
	/* scatter/gather entries */
	kfree(lnk->wr_tx_sges);
	lnk->wr_tx_sges = NULL;
	kfree(lnk->wr_tx_rdma_sges);
	lnk->wr_tx_rdma_sges = NULL;
	kfree(lnk->wr_rx_sges);
	lnk->wr_rx_sges = NULL;
	/* work requests */
	kfree(lnk->wr_tx_rdmas);
	lnk->wr_tx_rdmas = NULL;
	kfree(lnk->wr_rx_ibs);
	lnk->wr_rx_ibs = NULL;
	kfree(lnk->wr_tx_ibs);
	lnk->wr_tx_ibs = NULL;
	/* message buffers */
	kfree(lnk->wr_tx_bufs);
	lnk->wr_tx_bufs = NULL;
	kfree(lnk->wr_rx_bufs);
	lnk->wr_rx_bufs = NULL;
}
722
723int smc_wr_alloc_lgr_mem(struct smc_link_group *lgr)
724{
725	if (lgr->smc_version < SMC_V2)
726		return 0;
727
728	lgr->wr_rx_buf_v2 = kzalloc(SMC_WR_BUF_V2_SIZE, GFP_KERNEL);
729	if (!lgr->wr_rx_buf_v2)
730		return -ENOMEM;
731	lgr->wr_tx_buf_v2 = kzalloc(SMC_WR_BUF_V2_SIZE, GFP_KERNEL);
732	if (!lgr->wr_tx_buf_v2) {
733		kfree(lgr->wr_rx_buf_v2);
734		return -ENOMEM;
735	}
736	return 0;
737}
738
739int smc_wr_alloc_link_mem(struct smc_link *link)
740{
741	int sges_per_buf = link->lgr->smc_version == SMC_V2 ? 2 : 1;
742
743	/* allocate link related memory */
744	link->wr_tx_bufs = kcalloc(SMC_WR_BUF_CNT, SMC_WR_BUF_SIZE, GFP_KERNEL);
745	if (!link->wr_tx_bufs)
746		goto no_mem;
747	link->wr_rx_bufs = kcalloc(SMC_WR_BUF_CNT * 3, SMC_WR_BUF_SIZE,
748				   GFP_KERNEL);
749	if (!link->wr_rx_bufs)
750		goto no_mem_wr_tx_bufs;
751	link->wr_tx_ibs = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_ibs[0]),
752				  GFP_KERNEL);
753	if (!link->wr_tx_ibs)
754		goto no_mem_wr_rx_bufs;
755	link->wr_rx_ibs = kcalloc(SMC_WR_BUF_CNT * 3,
756				  sizeof(link->wr_rx_ibs[0]),
757				  GFP_KERNEL);
758	if (!link->wr_rx_ibs)
759		goto no_mem_wr_tx_ibs;
760	link->wr_tx_rdmas = kcalloc(SMC_WR_BUF_CNT,
761				    sizeof(link->wr_tx_rdmas[0]),
762				    GFP_KERNEL);
763	if (!link->wr_tx_rdmas)
764		goto no_mem_wr_rx_ibs;
765	link->wr_tx_rdma_sges = kcalloc(SMC_WR_BUF_CNT,
766					sizeof(link->wr_tx_rdma_sges[0]),
767					GFP_KERNEL);
768	if (!link->wr_tx_rdma_sges)
769		goto no_mem_wr_tx_rdmas;
770	link->wr_tx_sges = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_sges[0]),
771				   GFP_KERNEL);
772	if (!link->wr_tx_sges)
773		goto no_mem_wr_tx_rdma_sges;
774	link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT * 3,
775				   sizeof(link->wr_rx_sges[0]) * sges_per_buf,
776				   GFP_KERNEL);
777	if (!link->wr_rx_sges)
778		goto no_mem_wr_tx_sges;
779	link->wr_tx_mask = bitmap_zalloc(SMC_WR_BUF_CNT, GFP_KERNEL);
780	if (!link->wr_tx_mask)
781		goto no_mem_wr_rx_sges;
782	link->wr_tx_pends = kcalloc(SMC_WR_BUF_CNT,
783				    sizeof(link->wr_tx_pends[0]),
784				    GFP_KERNEL);
785	if (!link->wr_tx_pends)
786		goto no_mem_wr_tx_mask;
787	link->wr_tx_compl = kcalloc(SMC_WR_BUF_CNT,
788				    sizeof(link->wr_tx_compl[0]),
789				    GFP_KERNEL);
790	if (!link->wr_tx_compl)
791		goto no_mem_wr_tx_pends;
792
793	if (link->lgr->smc_version == SMC_V2) {
794		link->wr_tx_v2_ib = kzalloc(sizeof(*link->wr_tx_v2_ib),
795					    GFP_KERNEL);
796		if (!link->wr_tx_v2_ib)
797			goto no_mem_tx_compl;
798		link->wr_tx_v2_sge = kzalloc(sizeof(*link->wr_tx_v2_sge),
799					     GFP_KERNEL);
800		if (!link->wr_tx_v2_sge)
801			goto no_mem_v2_ib;
802		link->wr_tx_v2_pend = kzalloc(sizeof(*link->wr_tx_v2_pend),
803					      GFP_KERNEL);
804		if (!link->wr_tx_v2_pend)
805			goto no_mem_v2_sge;
806	}
807	return 0;
808
809no_mem_v2_sge:
810	kfree(link->wr_tx_v2_sge);
811no_mem_v2_ib:
812	kfree(link->wr_tx_v2_ib);
813no_mem_tx_compl:
814	kfree(link->wr_tx_compl);
815no_mem_wr_tx_pends:
816	kfree(link->wr_tx_pends);
817no_mem_wr_tx_mask:
818	kfree(link->wr_tx_mask);
819no_mem_wr_rx_sges:
820	kfree(link->wr_rx_sges);
821no_mem_wr_tx_sges:
822	kfree(link->wr_tx_sges);
823no_mem_wr_tx_rdma_sges:
824	kfree(link->wr_tx_rdma_sges);
825no_mem_wr_tx_rdmas:
826	kfree(link->wr_tx_rdmas);
827no_mem_wr_rx_ibs:
828	kfree(link->wr_rx_ibs);
829no_mem_wr_tx_ibs:
830	kfree(link->wr_tx_ibs);
831no_mem_wr_rx_bufs:
832	kfree(link->wr_rx_bufs);
833no_mem_wr_tx_bufs:
834	kfree(link->wr_tx_bufs);
835no_mem:
836	return -ENOMEM;
837}
838
/* Stop the receive and send CQ tasklets of an SMC IB device; counterpart of
 * smc_wr_add_dev().
 */
void smc_wr_remove_dev(struct smc_ib_device *smcibdev)
{
	tasklet_kill(&smcibdev->recv_tasklet);
	tasklet_kill(&smcibdev->send_tasklet);
}
844
/* Set up the tasklets that process the receive and send completion queues
 * of an SMC IB device; they are scheduled by the CQ handlers.
 */
void smc_wr_add_dev(struct smc_ib_device *smcibdev)
{
	tasklet_setup(&smcibdev->recv_tasklet, smc_wr_rx_tasklet_fn);
	tasklet_setup(&smcibdev->send_tasklet, smc_wr_tx_tasklet_fn);
}
850
851static void smcr_wr_tx_refs_free(struct percpu_ref *ref)
852{
853	struct smc_link *lnk = container_of(ref, struct smc_link, wr_tx_refs);
854
855	complete(&lnk->tx_ref_comp);
856}
857
858static void smcr_wr_reg_refs_free(struct percpu_ref *ref)
859{
860	struct smc_link *lnk = container_of(ref, struct smc_link, wr_reg_refs);
861
862	complete(&lnk->reg_ref_comp);
863}
864
865int smc_wr_create_link(struct smc_link *lnk)
866{
867	struct ib_device *ibdev = lnk->smcibdev->ibdev;
868	int rc = 0;
869
870	smc_wr_tx_set_wr_id(&lnk->wr_tx_id, 0);
871	lnk->wr_rx_id = 0;
872	lnk->wr_rx_dma_addr = ib_dma_map_single(
873		ibdev, lnk->wr_rx_bufs,	SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
874		DMA_FROM_DEVICE);
875	if (ib_dma_mapping_error(ibdev, lnk->wr_rx_dma_addr)) {
876		lnk->wr_rx_dma_addr = 0;
877		rc = -EIO;
878		goto out;
879	}
880	if (lnk->lgr->smc_version == SMC_V2) {
881		lnk->wr_rx_v2_dma_addr = ib_dma_map_single(ibdev,
882			lnk->lgr->wr_rx_buf_v2, SMC_WR_BUF_V2_SIZE,
883			DMA_FROM_DEVICE);
884		if (ib_dma_mapping_error(ibdev, lnk->wr_rx_v2_dma_addr)) {
885			lnk->wr_rx_v2_dma_addr = 0;
886			rc = -EIO;
887			goto dma_unmap;
888		}
889		lnk->wr_tx_v2_dma_addr = ib_dma_map_single(ibdev,
890			lnk->lgr->wr_tx_buf_v2, SMC_WR_BUF_V2_SIZE,
891			DMA_TO_DEVICE);
892		if (ib_dma_mapping_error(ibdev, lnk->wr_tx_v2_dma_addr)) {
893			lnk->wr_tx_v2_dma_addr = 0;
894			rc = -EIO;
895			goto dma_unmap;
896		}
897	}
898	lnk->wr_tx_dma_addr = ib_dma_map_single(
899		ibdev, lnk->wr_tx_bufs,	SMC_WR_BUF_SIZE * lnk->wr_tx_cnt,
900		DMA_TO_DEVICE);
901	if (ib_dma_mapping_error(ibdev, lnk->wr_tx_dma_addr)) {
902		rc = -EIO;
903		goto dma_unmap;
904	}
905	smc_wr_init_sge(lnk);
906	bitmap_zero(lnk->wr_tx_mask, SMC_WR_BUF_CNT);
907	init_waitqueue_head(&lnk->wr_tx_wait);
908	rc = percpu_ref_init(&lnk->wr_tx_refs, smcr_wr_tx_refs_free, 0, GFP_KERNEL);
909	if (rc)
910		goto dma_unmap;
911	init_completion(&lnk->tx_ref_comp);
912	init_waitqueue_head(&lnk->wr_reg_wait);
913	rc = percpu_ref_init(&lnk->wr_reg_refs, smcr_wr_reg_refs_free, 0, GFP_KERNEL);
914	if (rc)
915		goto dma_unmap;
916	init_completion(&lnk->reg_ref_comp);
917	init_waitqueue_head(&lnk->wr_rx_empty_wait);
918	return rc;
919
920dma_unmap:
921	if (lnk->wr_rx_v2_dma_addr) {
922		ib_dma_unmap_single(ibdev, lnk->wr_rx_v2_dma_addr,
923				    SMC_WR_BUF_V2_SIZE,
924				    DMA_FROM_DEVICE);
925		lnk->wr_rx_v2_dma_addr = 0;
926	}
927	if (lnk->wr_tx_v2_dma_addr) {
928		ib_dma_unmap_single(ibdev, lnk->wr_tx_v2_dma_addr,
929				    SMC_WR_BUF_V2_SIZE,
930				    DMA_TO_DEVICE);
931		lnk->wr_tx_v2_dma_addr = 0;
932	}
933	ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr,
934			    SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
935			    DMA_FROM_DEVICE);
936	lnk->wr_rx_dma_addr = 0;
937out:
938	return rc;
939}
940