1// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
2
3/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
4/* Copyright (c) 2008-2019, IBM Corporation */
5
6#include <linux/errno.h>
7#include <linux/types.h>
8#include <linux/net.h>
9#include <linux/scatterlist.h>
10#include <linux/highmem.h>
11
12#include <rdma/iw_cm.h>
13#include <rdma/ib_verbs.h>
14
15#include "siw.h"
16#include "siw_verbs.h"
17#include "siw_mem.h"
18
19/*
20 * siw_rx_umem()
21 *
22 * Receive data of @len into target referenced by @dest_addr.
23 *
24 * @srx:	Receive Context
25 * @umem:	siw representation of target memory
26 * @dest_addr:	user virtual address
27 * @len:	number of bytes to place
28 */
29static int siw_rx_umem(struct siw_rx_stream *srx, struct siw_umem *umem,
30		       u64 dest_addr, int len)
31{
32	int copied = 0;
33
34	while (len) {
35		struct page *p;
36		int pg_off, bytes, rv;
37		void *dest;
38
39		p = siw_get_upage(umem, dest_addr);
40		if (unlikely(!p)) {
41			pr_warn("siw: %s: [QP %u]: bogus addr: %pK, %pK\n",
42				__func__, qp_id(rx_qp(srx)),
43				(void *)(uintptr_t)dest_addr,
44				(void *)(uintptr_t)umem->fp_addr);
45			/* siw internal error */
46			srx->skb_copied += copied;
47			srx->skb_new -= copied;
48
49			return -EFAULT;
50		}
51		pg_off = dest_addr & ~PAGE_MASK;
52		bytes = min(len, (int)PAGE_SIZE - pg_off);
53
54		siw_dbg_qp(rx_qp(srx), "page %pK, bytes=%u\n", p, bytes);
55
56		dest = kmap_atomic(p);
57		rv = skb_copy_bits(srx->skb, srx->skb_offset, dest + pg_off,
58				   bytes);
59
60		if (unlikely(rv)) {
61			kunmap_atomic(dest);
62			srx->skb_copied += copied;
63			srx->skb_new -= copied;
64
65			pr_warn("siw: [QP %u]: %s, len %d, page %p, rv %d\n",
66				qp_id(rx_qp(srx)), __func__, len, p, rv);
67
68			return -EFAULT;
69		}
70		if (srx->mpa_crc_hd) {
71			if (rdma_is_kernel_res(&rx_qp(srx)->base_qp.res)) {
72				crypto_shash_update(srx->mpa_crc_hd,
73					(u8 *)(dest + pg_off), bytes);
74				kunmap_atomic(dest);
75			} else {
76				kunmap_atomic(dest);
77				/*
78				 * Do CRC on original, not target buffer.
79				 * Some user land applications may
80				 * concurrently write the target buffer,
81				 * which would yield a broken CRC.
82				 * Walking the skb twice is very ineffcient.
83				 * Folding the CRC into skb_copy_bits()
84				 * would be much better, but is currently
85				 * not supported.
86				 */
87				siw_crc_skb(srx, bytes);
88			}
89		} else {
90			kunmap_atomic(dest);
91		}
92		srx->skb_offset += bytes;
93		copied += bytes;
94		len -= bytes;
95		dest_addr += bytes;
96		pg_off = 0;
97	}
98	srx->skb_copied += copied;
99	srx->skb_new -= copied;
100
101	return copied;
102}
103
104static int siw_rx_kva(struct siw_rx_stream *srx, void *kva, int len)
105{
106	int rv;
107
108	siw_dbg_qp(rx_qp(srx), "kva: 0x%pK, len: %u\n", kva, len);
109
110	rv = skb_copy_bits(srx->skb, srx->skb_offset, kva, len);
111	if (unlikely(rv)) {
112		pr_warn("siw: [QP %u]: %s, len %d, kva 0x%pK, rv %d\n",
113			qp_id(rx_qp(srx)), __func__, len, kva, rv);
114
115		return rv;
116	}
117	if (srx->mpa_crc_hd)
118		crypto_shash_update(srx->mpa_crc_hd, (u8 *)kva, len);
119
120	srx->skb_offset += len;
121	srx->skb_copied += len;
122	srx->skb_new -= len;
123
124	return len;
125}
126
127static int siw_rx_pbl(struct siw_rx_stream *srx, int *pbl_idx,
128		      struct siw_mem *mem, u64 addr, int len)
129{
130	struct siw_pbl *pbl = mem->pbl;
131	u64 offset = addr - mem->va;
132	int copied = 0;
133
134	while (len) {
135		int bytes;
136		dma_addr_t buf_addr =
137			siw_pbl_get_buffer(pbl, offset, &bytes, pbl_idx);
138		if (!buf_addr)
139			break;
140
141		bytes = min(bytes, len);
142		if (siw_rx_kva(srx, ib_virt_dma_to_ptr(buf_addr), bytes) ==
143		    bytes) {
144			copied += bytes;
145			offset += bytes;
146			len -= bytes;
147		} else {
148			break;
149		}
150	}
151	return copied;
152}
153
154/*
155 * siw_rresp_check_ntoh()
156 *
157 * Check incoming RRESP fragment header against expected
158 * header values and update expected values for potential next
159 * fragment.
160 *
161 * NOTE: This function must be called only if a RRESP DDP segment
162 *       starts but not for fragmented consecutive pieces of an
163 *       already started DDP segment.
164 */
165static int siw_rresp_check_ntoh(struct siw_rx_stream *srx,
166				struct siw_rx_fpdu *frx)
167{
168	struct iwarp_rdma_rresp *rresp = &srx->hdr.rresp;
169	struct siw_wqe *wqe = &frx->wqe_active;
170	enum ddp_ecode ecode;
171
172	u32 sink_stag = be32_to_cpu(rresp->sink_stag);
173	u64 sink_to = be64_to_cpu(rresp->sink_to);
174
175	if (frx->first_ddp_seg) {
176		srx->ddp_stag = wqe->sqe.sge[0].lkey;
177		srx->ddp_to = wqe->sqe.sge[0].laddr;
178		frx->pbl_idx = 0;
179	}
180	/* Below checks extend beyond the semantics of DDP, and
181	 * into RDMAP:
182	 * We check if the read response matches exactly the
183	 * read request which was send to the remote peer to
184	 * trigger this read response. RFC5040/5041 do not
185	 * always have a proper error code for the detected
186	 * error cases. We choose 'base or bounds error' for
187	 * cases where the inbound STag is valid, but offset
188	 * or length do not match our response receive state.
189	 */
190	if (unlikely(srx->ddp_stag != sink_stag)) {
191		pr_warn("siw: [QP %u]: rresp stag: %08x != %08x\n",
192			qp_id(rx_qp(srx)), sink_stag, srx->ddp_stag);
193		ecode = DDP_ECODE_T_INVALID_STAG;
194		goto error;
195	}
196	if (unlikely(srx->ddp_to != sink_to)) {
197		pr_warn("siw: [QP %u]: rresp off: %016llx != %016llx\n",
198			qp_id(rx_qp(srx)), (unsigned long long)sink_to,
199			(unsigned long long)srx->ddp_to);
200		ecode = DDP_ECODE_T_BASE_BOUNDS;
201		goto error;
202	}
203	if (unlikely(!frx->more_ddp_segs &&
204		     (wqe->processed + srx->fpdu_part_rem != wqe->bytes))) {
205		pr_warn("siw: [QP %u]: rresp len: %d != %d\n",
206			qp_id(rx_qp(srx)),
207			wqe->processed + srx->fpdu_part_rem, wqe->bytes);
208		ecode = DDP_ECODE_T_BASE_BOUNDS;
209		goto error;
210	}
211	return 0;
212error:
213	siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
214			   DDP_ETYPE_TAGGED_BUF, ecode, 0);
215	return -EINVAL;
216}
217
218/*
219 * siw_write_check_ntoh()
220 *
221 * Check incoming WRITE fragment header against expected
222 * header values and update expected values for potential next
223 * fragment
224 *
225 * NOTE: This function must be called only if a WRITE DDP segment
226 *       starts but not for fragmented consecutive pieces of an
227 *       already started DDP segment.
228 */
229static int siw_write_check_ntoh(struct siw_rx_stream *srx,
230				struct siw_rx_fpdu *frx)
231{
232	struct iwarp_rdma_write *write = &srx->hdr.rwrite;
233	enum ddp_ecode ecode;
234
235	u32 sink_stag = be32_to_cpu(write->sink_stag);
236	u64 sink_to = be64_to_cpu(write->sink_to);
237
238	if (frx->first_ddp_seg) {
239		srx->ddp_stag = sink_stag;
240		srx->ddp_to = sink_to;
241		frx->pbl_idx = 0;
242	} else {
243		if (unlikely(srx->ddp_stag != sink_stag)) {
244			pr_warn("siw: [QP %u]: write stag: %08x != %08x\n",
245				qp_id(rx_qp(srx)), sink_stag,
246				srx->ddp_stag);
247			ecode = DDP_ECODE_T_INVALID_STAG;
248			goto error;
249		}
250		if (unlikely(srx->ddp_to != sink_to)) {
251			pr_warn("siw: [QP %u]: write off: %016llx != %016llx\n",
252				qp_id(rx_qp(srx)),
253				(unsigned long long)sink_to,
254				(unsigned long long)srx->ddp_to);
255			ecode = DDP_ECODE_T_BASE_BOUNDS;
256			goto error;
257		}
258	}
259	return 0;
260error:
261	siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
262			   DDP_ETYPE_TAGGED_BUF, ecode, 0);
263	return -EINVAL;
264}
265
266/*
267 * siw_send_check_ntoh()
268 *
269 * Check incoming SEND fragment header against expected
270 * header values and update expected MSN if no next
271 * fragment expected
272 *
273 * NOTE: This function must be called only if a SEND DDP segment
274 *       starts but not for fragmented consecutive pieces of an
275 *       already started DDP segment.
276 */
277static int siw_send_check_ntoh(struct siw_rx_stream *srx,
278			       struct siw_rx_fpdu *frx)
279{
280	struct iwarp_send_inv *send = &srx->hdr.send_inv;
281	struct siw_wqe *wqe = &frx->wqe_active;
282	enum ddp_ecode ecode;
283
284	u32 ddp_msn = be32_to_cpu(send->ddp_msn);
285	u32 ddp_mo = be32_to_cpu(send->ddp_mo);
286	u32 ddp_qn = be32_to_cpu(send->ddp_qn);
287
288	if (unlikely(ddp_qn != RDMAP_UNTAGGED_QN_SEND)) {
289		pr_warn("siw: [QP %u]: invalid ddp qn %d for send\n",
290			qp_id(rx_qp(srx)), ddp_qn);
291		ecode = DDP_ECODE_UT_INVALID_QN;
292		goto error;
293	}
294	if (unlikely(ddp_msn != srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND])) {
295		pr_warn("siw: [QP %u]: send msn: %u != %u\n",
296			qp_id(rx_qp(srx)), ddp_msn,
297			srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]);
298		ecode = DDP_ECODE_UT_INVALID_MSN_RANGE;
299		goto error;
300	}
301	if (unlikely(ddp_mo != wqe->processed)) {
302		pr_warn("siw: [QP %u], send mo: %u != %u\n",
303			qp_id(rx_qp(srx)), ddp_mo, wqe->processed);
304		ecode = DDP_ECODE_UT_INVALID_MO;
305		goto error;
306	}
307	if (frx->first_ddp_seg) {
308		/* initialize user memory write position */
309		frx->sge_idx = 0;
310		frx->sge_off = 0;
311		frx->pbl_idx = 0;
312
313		/* only valid for SEND_INV and SEND_SE_INV operations */
314		srx->inval_stag = be32_to_cpu(send->inval_stag);
315	}
316	if (unlikely(wqe->bytes < wqe->processed + srx->fpdu_part_rem)) {
317		siw_dbg_qp(rx_qp(srx), "receive space short: %d - %d < %d\n",
318			   wqe->bytes, wqe->processed, srx->fpdu_part_rem);
319		wqe->wc_status = SIW_WC_LOC_LEN_ERR;
320		ecode = DDP_ECODE_UT_INVALID_MSN_NOBUF;
321		goto error;
322	}
323	return 0;
324error:
325	siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
326			   DDP_ETYPE_UNTAGGED_BUF, ecode, 0);
327	return -EINVAL;
328}
329
/*
 * siw_rqe_get()
 *
 * Fetch the next receive queue element for an inbound SEND and set up
 * the untagged rx WQE of @qp from it. The element is taken from the
 * SRQ if the QP is attached to one (under the SRQ lock), from the
 * QP's own receive queue otherwise.
 *
 * After copying, the queue element is marked free for re-use and the
 * queue's get index is advanced. For an armed SRQ, an
 * IB_EVENT_SRQ_LIMIT_REACHED event is raised (after dropping the
 * lock) if the element 'limit' positions ahead is not valid.
 *
 * Returns the initialized rx WQE, or NULL if there is no receive
 * queue, the current element is not valid, or it carries more than
 * SIW_MAX_SGE SGE's.
 */
static struct siw_wqe *siw_rqe_get(struct siw_qp *qp)
{
	struct siw_rqe *rqe;
	struct siw_srq *srq;
	struct siw_wqe *wqe = NULL;
	bool srq_event = false;
	unsigned long flags;

	srq = qp->srq;
	if (srq) {
		spin_lock_irqsave(&srq->lock, flags);
		/* zero sized SRQ: also avoids the modulo by zero below */
		if (unlikely(!srq->num_rqe))
			goto out;

		rqe = &srq->recvq[srq->rq_get % srq->num_rqe];
	} else {
		if (unlikely(!qp->recvq))
			goto out;

		rqe = &qp->recvq[qp->rq_get % qp->attrs.rq_size];
	}
	if (likely(rqe->flags == SIW_WQE_VALID)) {
		int num_sge = rqe->num_sge;

		if (likely(num_sge <= SIW_MAX_SGE)) {
			int i = 0;

			wqe = rx_wqe(&qp->rx_untagged);
			rx_type(wqe) = SIW_OP_RECEIVE;
			wqe->wr_status = SIW_WR_INPROGRESS;
			wqe->bytes = 0;
			wqe->processed = 0;

			wqe->rqe.id = rqe->id;
			wqe->rqe.num_sge = num_sge;

			/* copy SGE's and sum up the total receive space */
			while (i < num_sge) {
				wqe->rqe.sge[i].laddr = rqe->sge[i].laddr;
				wqe->rqe.sge[i].lkey = rqe->sge[i].lkey;
				wqe->rqe.sge[i].length = rqe->sge[i].length;
				wqe->bytes += wqe->rqe.sge[i].length;
				wqe->mem[i] = NULL;
				i++;
			}
			/* can be re-used by appl */
			smp_store_mb(rqe->flags, 0);
		} else {
			/*
			 * Bail out without consuming the element: the get
			 * index is not advanced on this path.
			 */
			siw_dbg_qp(qp, "too many sge's: %d\n", rqe->num_sge);
			if (srq)
				spin_unlock_irqrestore(&srq->lock, flags);
			return NULL;
		}
		if (!srq) {
			qp->rq_get++;
		} else {
			if (srq->armed) {
				/* Test SRQ limit */
				u32 off = (srq->rq_get + srq->limit) %
					  srq->num_rqe;
				struct siw_rqe *rqe2 = &srq->recvq[off];

				if (!(rqe2->flags & SIW_WQE_VALID)) {
					/* disarm; event is sent after unlock */
					srq->armed = false;
					srq_event = true;
				}
			}
			srq->rq_get++;
		}
	}
out:
	if (srq) {
		spin_unlock_irqrestore(&srq->lock, flags);
		if (srq_event)
			siw_srq_event(srq, IB_EVENT_SRQ_LIMIT_REACHED);
	}
	return wqe;
}
407
408static int siw_rx_data(struct siw_mem *mem_p, struct siw_rx_stream *srx,
409		       unsigned int *pbl_idx, u64 addr, int bytes)
410{
411	int rv;
412
413	if (mem_p->mem_obj == NULL)
414		rv = siw_rx_kva(srx, ib_virt_dma_to_ptr(addr), bytes);
415	else if (!mem_p->is_pbl)
416		rv = siw_rx_umem(srx, mem_p->umem, addr, bytes);
417	else
418		rv = siw_rx_pbl(srx, pbl_idx, mem_p, addr, bytes);
419	return rv;
420}
421
/*
 * siw_proc_send:
 *
 * Process one incoming SEND and place data into memory referenced by
 * receive wqe.
 *
 * Function supports partially received sends (suspending/resuming
 * current receive wqe processing)
 *
 * return value:
 *	0:       reached the end of a DDP segment
 *	-EAGAIN: to be called again to finish the DDP segment
 *	-ENOENT: no receive wqe available for a starting message
 *	other negative values: header check, memory check or data
 *	         placement failed; a TERMINATE message got initialized
 */
int siw_proc_send(struct siw_qp *qp)
{
	struct siw_rx_stream *srx = &qp->rx_stream;
	struct siw_rx_fpdu *frx = &qp->rx_untagged;
	struct siw_wqe *wqe;
	u32 data_bytes; /* all data bytes available */
	u32 rcvd_bytes; /* sum of data bytes rcvd */
	int rv = 0;

	if (frx->first_ddp_seg) {
		/* a new message consumes a fresh receive queue element */
		wqe = siw_rqe_get(qp);
		if (unlikely(!wqe)) {
			siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
					   DDP_ETYPE_UNTAGGED_BUF,
					   DDP_ECODE_UT_INVALID_MSN_NOBUF, 0);
			return -ENOENT;
		}
	} else {
		/* continue with the wqe of the message in progress */
		wqe = rx_wqe(frx);
	}
	if (srx->state == SIW_GET_DATA_START) {
		/* header is checked only once per DDP segment */
		rv = siw_send_check_ntoh(srx, frx);
		if (unlikely(rv)) {
			siw_qp_event(qp, IB_EVENT_QP_FATAL);
			return rv;
		}
		if (!srx->fpdu_part_rem) /* zero length SEND */
			return 0;
	}
	/* place only what is both part of this FPDU and already in the skb */
	data_bytes = min(srx->fpdu_part_rem, srx->skb_new);
	rcvd_bytes = 0;

	/* A zero length SEND will skip below loop */
	while (data_bytes) {
		struct ib_pd *pd;
		struct siw_mem **mem, *mem_p;
		struct siw_sge *sge;
		u32 sge_bytes; /* data bytes avail for SGE */

		sge = &wqe->rqe.sge[frx->sge_idx];

		if (!sge->length) {
			/* just skip empty sge's */
			frx->sge_idx++;
			frx->sge_off = 0;
			frx->pbl_idx = 0;
			continue;
		}
		sge_bytes = min(data_bytes, sge->length - frx->sge_off);
		mem = &wqe->mem[frx->sge_idx];

		/*
		 * check with QP's PD if no SRQ present, SRQ's PD otherwise
		 */
		pd = qp->srq == NULL ? qp->pd : qp->srq->base_srq.pd;

		rv = siw_check_sge(pd, sge, mem, IB_ACCESS_LOCAL_WRITE,
				   frx->sge_off, sge_bytes);
		if (unlikely(rv)) {
			siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
					   DDP_ETYPE_CATASTROPHIC,
					   DDP_ECODE_CATASTROPHIC, 0);

			siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR);
			/* rv is reported by the return statement at the end */
			break;
		}
		mem_p = *mem;
		rv = siw_rx_data(mem_p, srx, &frx->pbl_idx,
				 sge->laddr + frx->sge_off, sge_bytes);
		if (unlikely(rv != sge_bytes)) {
			/* account for what earlier iterations have placed */
			wqe->processed += rcvd_bytes;

			siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
					   DDP_ETYPE_CATASTROPHIC,
					   DDP_ECODE_CATASTROPHIC, 0);
			return -EINVAL;
		}
		frx->sge_off += rv;

		if (frx->sge_off == sge->length) {
			/* current SGE is full: move on to the next one */
			frx->sge_idx++;
			frx->sge_off = 0;
			frx->pbl_idx = 0;
		}
		data_bytes -= rv;
		rcvd_bytes += rv;

		srx->fpdu_part_rem -= rv;
		srx->fpdu_part_rcvd += rv;
	}
	wqe->processed += rcvd_bytes;

	if (!srx->fpdu_part_rem)
		return 0;

	/* data still missing: report a pending error, else ask for more */
	return (rv < 0) ? rv : -EAGAIN;
}
532
/*
 * siw_proc_write:
 *
 * Place incoming WRITE after referencing and checking target buffer
 *
 * Function supports partially received WRITEs (suspending/resuming
 * current receive processing)
 *
 * return value:
 *	0:       reached the end of a DDP segment
 *	-EAGAIN: to be called again to finish the DDP segment
 *	-EINVAL: header check, STag lookup, memory check or data
 *	         placement failed; a TERMINATE message got initialized
 */
int siw_proc_write(struct siw_qp *qp)
{
	struct siw_rx_stream *srx = &qp->rx_stream;
	struct siw_rx_fpdu *frx = &qp->rx_tagged;
	struct siw_mem *mem;
	int bytes, rv;

	if (srx->state == SIW_GET_DATA_START) {
		if (!srx->fpdu_part_rem) /* zero length WRITE */
			return 0;

		/* header is checked only once per DDP segment */
		rv = siw_write_check_ntoh(srx, frx);
		if (unlikely(rv)) {
			siw_qp_event(qp, IB_EVENT_QP_FATAL);
			return rv;
		}
	}
	/* place only what is both part of this FPDU and already in the skb */
	bytes = min(srx->fpdu_part_rem, srx->skb_new);

	if (frx->first_ddp_seg) {
		struct siw_wqe *wqe = rx_wqe(frx);

		/* resolve target memory; lookup uses the STag without its low 8 bits */
		rx_mem(frx) = siw_mem_id2obj(qp->sdev, srx->ddp_stag >> 8);
		if (unlikely(!rx_mem(frx))) {
			siw_dbg_qp(qp,
				   "sink stag not found/invalid, stag 0x%08x\n",
				   srx->ddp_stag);

			siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
					   DDP_ETYPE_TAGGED_BUF,
					   DDP_ECODE_T_INVALID_STAG, 0);
			return -EINVAL;
		}
		wqe->rqe.num_sge = 1;
		rx_type(wqe) = SIW_OP_WRITE;
		wqe->wr_status = SIW_WR_INPROGRESS;
	}
	mem = rx_mem(frx);

	/*
	 * Check if application re-registered memory with different
	 * key field of STag.
	 */
	if (unlikely(mem->stag != srx->ddp_stag)) {
		siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
				   DDP_ETYPE_TAGGED_BUF,
				   DDP_ECODE_T_INVALID_STAG, 0);
		return -EINVAL;
	}
	/* access rights and bounds are checked for each piece to be placed */
	rv = siw_check_mem(qp->pd, mem, srx->ddp_to + srx->fpdu_part_rcvd,
			   IB_ACCESS_REMOTE_WRITE, bytes);
	if (unlikely(rv)) {
		siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
				   DDP_ETYPE_TAGGED_BUF, siw_tagged_error(-rv),
				   0);

		siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR);

		return -EINVAL;
	}

	rv = siw_rx_data(mem, srx, &frx->pbl_idx,
			 srx->ddp_to + srx->fpdu_part_rcvd, bytes);
	if (unlikely(rv != bytes)) {
		siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
				   DDP_ETYPE_CATASTROPHIC,
				   DDP_ECODE_CATASTROPHIC, 0);
		return -EINVAL;
	}
	srx->fpdu_part_rem -= rv;
	srx->fpdu_part_rcvd += rv;

	if (!srx->fpdu_part_rem) {
		/* segment done: expected offset of a following segment */
		srx->ddp_to += srx->fpdu_part_rcvd;
		return 0;
	}
	return -EAGAIN;
}
623
624/*
625 * Inbound RREQ's cannot carry user data.
626 */
627int siw_proc_rreq(struct siw_qp *qp)
628{
629	struct siw_rx_stream *srx = &qp->rx_stream;
630
631	if (!srx->fpdu_part_rem)
632		return 0;
633
634	pr_warn("siw: [QP %u]: rreq with mpa len %d\n", qp_id(qp),
635		be16_to_cpu(srx->hdr.ctrl.mpa_len));
636
637	return -EPROTO;
638}
639
/*
 * siw_init_rresp:
 *
 * Process inbound RDMA READ REQ. Produce a pseudo READ RESPONSE WQE.
 * Put it at the tail of the IRQ, if there is another WQE currently in
 * transmit processing. If not, make it the current WQE to be processed
 * and schedule transmit processing.
 *
 * Can be called from softirq context and from process
 * context (RREAD socket loopback case!)
 *
 * return value:
 *	0:      success,
 *		failure code otherwise: -EPROTO for an unexpected MSN,
 *		a zero sized IRQ or no free IRQ element, or the result
 *		of siw_sq_start() if transmit processing got started
 */

static int siw_init_rresp(struct siw_qp *qp, struct siw_rx_stream *srx)
{
	struct siw_wqe *tx_work = tx_wqe(qp);
	struct siw_sqe *resp;

	/*
	 * The response reads from the requested source (laddr/lkey)
	 * and targets the requester's sink buffer (raddr/rkey).
	 */
	uint64_t raddr = be64_to_cpu(srx->hdr.rreq.sink_to),
		 laddr = be64_to_cpu(srx->hdr.rreq.source_to);
	uint32_t length = be32_to_cpu(srx->hdr.rreq.read_size),
		 lkey = be32_to_cpu(srx->hdr.rreq.source_stag),
		 rkey = be32_to_cpu(srx->hdr.rreq.sink_stag),
		 msn = be32_to_cpu(srx->hdr.rreq.ddp_msn);

	int run_sq = 1, rv = 0;
	unsigned long flags;

	if (unlikely(msn != srx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ])) {
		siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
				   DDP_ETYPE_UNTAGGED_BUF,
				   DDP_ECODE_UT_INVALID_MSN_RANGE, 0);
		return -EPROTO;
	}
	spin_lock_irqsave(&qp->sq_lock, flags);

	/* zero sized IRQ: inbound READ requests cannot be served */
	if (unlikely(!qp->attrs.irq_size)) {
		run_sq = 0;
		goto error_irq;
	}
	if (tx_work->wr_status == SIW_WR_IDLE) {
		/*
		 * immediately schedule READ response w/o
		 * consuming IRQ entry: IRQ must be empty.
		 */
		tx_work->processed = 0;
		tx_work->mem[0] = NULL;
		tx_work->wr_status = SIW_WR_QUEUED;
		resp = &tx_work->sqe;
	} else {
		/* transmit path busy: queue the response, do not start SQ */
		resp = irq_alloc_free(qp);
		run_sq = 0;
	}
	if (likely(resp)) {
		resp->opcode = SIW_OP_READ_RESPONSE;

		resp->sge[0].length = length;
		resp->sge[0].laddr = laddr;
		resp->sge[0].lkey = lkey;

		/* Keep aside message sequence number for potential
		 * error reporting during Read Response generation.
		 */
		resp->sge[1].length = msn;

		resp->raddr = raddr;
		resp->rkey = rkey;
		resp->num_sge = length ? 1 : 0;

		/* RRESP now valid as current TX wqe or placed into IRQ */
		smp_store_mb(resp->flags, SIW_WQE_VALID);
	} else {
		/* also entered by goto for a zero sized IRQ (see above) */
error_irq:
		pr_warn("siw: [QP %u]: IRQ exceeded or null, size %d\n",
			qp_id(qp), qp->attrs.irq_size);

		siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
				   RDMAP_ETYPE_REMOTE_OPERATION,
				   RDMAP_ECODE_CATASTROPHIC_STREAM, 0);
		rv = -EPROTO;
	}

	spin_unlock_irqrestore(&qp->sq_lock, flags);

	/* start transmit processing only after the SQ lock is dropped */
	if (run_sq)
		rv = siw_sq_start(qp);

	return rv;
}
732
733/*
734 * Only called at start of Read.Resonse processing.
735 * Transfer pending Read from tip of ORQ into currrent rx wqe,
736 * but keep ORQ entry valid until Read.Response processing done.
737 * No Queue locking needed.
738 */
739static int siw_orqe_start_rx(struct siw_qp *qp)
740{
741	struct siw_sqe *orqe;
742	struct siw_wqe *wqe = NULL;
743
744	if (unlikely(!qp->attrs.orq_size))
745		return -EPROTO;
746
747	/* make sure ORQ indices are current */
748	smp_mb();
749
750	orqe = orq_get_current(qp);
751	if (READ_ONCE(orqe->flags) & SIW_WQE_VALID) {
752		/* RRESP is a TAGGED RDMAP operation */
753		wqe = rx_wqe(&qp->rx_tagged);
754		wqe->sqe.id = orqe->id;
755		wqe->sqe.opcode = orqe->opcode;
756		wqe->sqe.sge[0].laddr = orqe->sge[0].laddr;
757		wqe->sqe.sge[0].lkey = orqe->sge[0].lkey;
758		wqe->sqe.sge[0].length = orqe->sge[0].length;
759		wqe->sqe.flags = orqe->flags;
760		wqe->sqe.num_sge = 1;
761		wqe->bytes = orqe->sge[0].length;
762		wqe->processed = 0;
763		wqe->mem[0] = NULL;
764		/* make sure WQE is completely written before valid */
765		smp_wmb();
766		wqe->wr_status = SIW_WR_INPROGRESS;
767
768		return 0;
769	}
770	return -EPROTO;
771}
772
773/*
774 * siw_proc_rresp:
775 *
776 * Place incoming RRESP data into memory referenced by RREQ WQE
777 * which is at the tip of the ORQ
778 *
779 * Function supports partially received RRESP's (suspending/resuming
780 * current receive processing)
781 */
782int siw_proc_rresp(struct siw_qp *qp)
783{
784	struct siw_rx_stream *srx = &qp->rx_stream;
785	struct siw_rx_fpdu *frx = &qp->rx_tagged;
786	struct siw_wqe *wqe = rx_wqe(frx);
787	struct siw_mem **mem, *mem_p;
788	struct siw_sge *sge;
789	int bytes, rv;
790
791	if (frx->first_ddp_seg) {
792		if (unlikely(wqe->wr_status != SIW_WR_IDLE)) {
793			pr_warn("siw: [QP %u]: proc RRESP: status %d, op %d\n",
794				qp_id(qp), wqe->wr_status, wqe->sqe.opcode);
795			rv = -EPROTO;
796			goto error_term;
797		}
798		/*
799		 * fetch pending RREQ from orq
800		 */
801		rv = siw_orqe_start_rx(qp);
802		if (rv) {
803			pr_warn("siw: [QP %u]: ORQ empty, size %d\n",
804				qp_id(qp), qp->attrs.orq_size);
805			goto error_term;
806		}
807		rv = siw_rresp_check_ntoh(srx, frx);
808		if (unlikely(rv)) {
809			siw_qp_event(qp, IB_EVENT_QP_FATAL);
810			return rv;
811		}
812	} else {
813		if (unlikely(wqe->wr_status != SIW_WR_INPROGRESS)) {
814			pr_warn("siw: [QP %u]: resume RRESP: status %d\n",
815				qp_id(qp), wqe->wr_status);
816			rv = -EPROTO;
817			goto error_term;
818		}
819	}
820	if (!srx->fpdu_part_rem) /* zero length RRESPONSE */
821		return 0;
822
823	sge = wqe->sqe.sge; /* there is only one */
824	mem = &wqe->mem[0];
825
826	if (!(*mem)) {
827		/*
828		 * check target memory which resolves memory on first fragment
829		 */
830		rv = siw_check_sge(qp->pd, sge, mem, IB_ACCESS_LOCAL_WRITE, 0,
831				   wqe->bytes);
832		if (unlikely(rv)) {
833			siw_dbg_qp(qp, "target mem check: %d\n", rv);
834			wqe->wc_status = SIW_WC_LOC_PROT_ERR;
835
836			siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
837					   DDP_ETYPE_TAGGED_BUF,
838					   siw_tagged_error(-rv), 0);
839
840			siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR);
841
842			return -EINVAL;
843		}
844	}
845	mem_p = *mem;
846
847	bytes = min(srx->fpdu_part_rem, srx->skb_new);
848	rv = siw_rx_data(mem_p, srx, &frx->pbl_idx,
849			 sge->laddr + wqe->processed, bytes);
850	if (rv != bytes) {
851		wqe->wc_status = SIW_WC_GENERAL_ERR;
852		rv = -EINVAL;
853		goto error_term;
854	}
855	srx->fpdu_part_rem -= rv;
856	srx->fpdu_part_rcvd += rv;
857	wqe->processed += rv;
858
859	if (!srx->fpdu_part_rem) {
860		srx->ddp_to += srx->fpdu_part_rcvd;
861		return 0;
862	}
863	return -EAGAIN;
864
865error_term:
866	siw_init_terminate(qp, TERM_ERROR_LAYER_DDP, DDP_ETYPE_CATASTROPHIC,
867			   DDP_ECODE_CATASTROPHIC, 0);
868	return rv;
869}
870
871static void siw_update_skb_rcvd(struct siw_rx_stream *srx, u16 length)
872{
873	srx->skb_offset += length;
874	srx->skb_new -= length;
875	srx->skb_copied += length;
876}
877
/*
 * siw_proc_terminate:
 *
 * Process an inbound TERMINATE message: log the reported error
 * layer, type and code and check the untagged DDP header fields
 * (queue number, MSN, message offset). If the TERM header has
 * flag_m set, the header information following in the skb is read
 * for debug output. A TERM which arrives fragmented is not
 * reassembled.
 *
 * return value:
 *	-ECONNRESET in all cases
 */
int siw_proc_terminate(struct siw_qp *qp)
{
	struct siw_rx_stream *srx = &qp->rx_stream;
	struct sk_buff *skb = srx->skb;
	struct iwarp_terminate *term = &srx->hdr.terminate;
	union iwarp_hdr term_info;
	u8 *infop = (u8 *)&term_info;
	enum rdma_opcode op;
	u16 to_copy = sizeof(struct iwarp_ctrl);

	pr_warn("siw: got TERMINATE. layer %d, type %d, code %d\n",
		__rdmap_term_layer(term), __rdmap_term_etype(term),
		__rdmap_term_ecode(term));

	if (be32_to_cpu(term->ddp_qn) != RDMAP_UNTAGGED_QN_TERMINATE ||
	    be32_to_cpu(term->ddp_msn) !=
		    qp->rx_stream.ddp_msn[RDMAP_UNTAGGED_QN_TERMINATE] ||
	    be32_to_cpu(term->ddp_mo) != 0) {
		pr_warn("siw: rx bogus TERM [QN x%08x, MSN x%08x, MO x%08x]\n",
			be32_to_cpu(term->ddp_qn), be32_to_cpu(term->ddp_msn),
			be32_to_cpu(term->ddp_mo));
		return -ECONNRESET;
	}
	/*
	 * Receive remaining pieces of TERM if indicated
	 */
	if (!term->flag_m)
		return -ECONNRESET;

	/* Do not take the effort to reassemble a network fragmented
	 * TERM message
	 */
	if (srx->skb_new < sizeof(struct iwarp_ctrl_tagged))
		return -ECONNRESET;

	memset(infop, 0, sizeof(term_info));

	/* peek at the control part first to learn the reported opcode */
	skb_copy_bits(skb, srx->skb_offset, infop, to_copy);

	op = __rdmap_get_opcode(&term_info.ctrl);
	if (op >= RDMAP_TERMINATE)
		goto out;

	infop += to_copy;
	siw_update_skb_rcvd(srx, to_copy);
	srx->fpdu_part_rcvd += to_copy;
	srx->fpdu_part_rem -= to_copy;

	/* rest of the reported header, its length depends on the opcode */
	to_copy = iwarp_pktinfo[op].hdr_len - to_copy;

	/* Again, no network fragmented TERM's */
	if (to_copy + MPA_CRC_SIZE > srx->skb_new)
		return -ECONNRESET;

	skb_copy_bits(skb, srx->skb_offset, infop, to_copy);

	if (term->flag_r) {
		siw_dbg_qp(qp, "TERM reports RDMAP hdr type %u, len %u (%s)\n",
			   op, be16_to_cpu(term_info.ctrl.mpa_len),
			   term->flag_m ? "valid" : "invalid");
	} else if (term->flag_d) {
		siw_dbg_qp(qp, "TERM reports DDP hdr type %u, len %u (%s)\n",
			   op, be16_to_cpu(term_info.ctrl.mpa_len),
			   term->flag_m ? "valid" : "invalid");
	}
out:
	/* account for the bytes of the last copy from the skb */
	siw_update_skb_rcvd(srx, to_copy);
	srx->fpdu_part_rcvd += to_copy;
	srx->fpdu_part_rem -= to_copy;

	return -ECONNRESET;
}
950
951static int siw_get_trailer(struct siw_qp *qp, struct siw_rx_stream *srx)
952{
953	struct sk_buff *skb = srx->skb;
954	int avail = min(srx->skb_new, srx->fpdu_part_rem);
955	u8 *tbuf = (u8 *)&srx->trailer.crc - srx->pad;
956	__wsum crc_in, crc_own = 0;
957
958	siw_dbg_qp(qp, "expected %d, available %d, pad %u\n",
959		   srx->fpdu_part_rem, srx->skb_new, srx->pad);
960
961	skb_copy_bits(skb, srx->skb_offset, tbuf, avail);
962
963	siw_update_skb_rcvd(srx, avail);
964	srx->fpdu_part_rem -= avail;
965
966	if (srx->fpdu_part_rem)
967		return -EAGAIN;
968
969	if (!srx->mpa_crc_hd)
970		return 0;
971
972	if (srx->pad)
973		crypto_shash_update(srx->mpa_crc_hd, tbuf, srx->pad);
974	/*
975	 * CRC32 is computed, transmitted and received directly in NBO,
976	 * so there's never a reason to convert byte order.
977	 */
978	crypto_shash_final(srx->mpa_crc_hd, (u8 *)&crc_own);
979	crc_in = (__force __wsum)srx->trailer.crc;
980
981	if (unlikely(crc_in != crc_own)) {
982		pr_warn("siw: crc error. in: %08x, own %08x, op %u\n",
983			crc_in, crc_own, qp->rx_stream.rdmap_op);
984
985		siw_init_terminate(qp, TERM_ERROR_LAYER_LLP,
986				   LLP_ETYPE_MPA,
987				   LLP_ECODE_RECEIVED_CRC, 0);
988		return -EINVAL;
989	}
990	return 0;
991}
992
993#define MIN_DDP_HDR sizeof(struct iwarp_ctrl_tagged)
994
995static int siw_get_hdr(struct siw_rx_stream *srx)
996{
997	struct sk_buff *skb = srx->skb;
998	struct siw_qp *qp = rx_qp(srx);
999	struct iwarp_ctrl *c_hdr = &srx->hdr.ctrl;
1000	struct siw_rx_fpdu *frx;
1001	u8 opcode;
1002	int bytes;
1003
1004	if (srx->fpdu_part_rcvd < MIN_DDP_HDR) {
1005		/*
1006		 * copy a mimimum sized (tagged) DDP frame control part
1007		 */
1008		bytes = min_t(int, srx->skb_new,
1009			      MIN_DDP_HDR - srx->fpdu_part_rcvd);
1010
1011		skb_copy_bits(skb, srx->skb_offset,
1012			      (char *)c_hdr + srx->fpdu_part_rcvd, bytes);
1013
1014		siw_update_skb_rcvd(srx, bytes);
1015		srx->fpdu_part_rcvd += bytes;
1016		if (srx->fpdu_part_rcvd < MIN_DDP_HDR)
1017			return -EAGAIN;
1018
1019		if (unlikely(__ddp_get_version(c_hdr) != DDP_VERSION)) {
1020			enum ddp_etype etype;
1021			enum ddp_ecode ecode;
1022
1023			pr_warn("siw: received ddp version unsupported %d\n",
1024				__ddp_get_version(c_hdr));
1025
1026			if (c_hdr->ddp_rdmap_ctrl & DDP_FLAG_TAGGED) {
1027				etype = DDP_ETYPE_TAGGED_BUF;
1028				ecode = DDP_ECODE_T_VERSION;
1029			} else {
1030				etype = DDP_ETYPE_UNTAGGED_BUF;
1031				ecode = DDP_ECODE_UT_VERSION;
1032			}
1033			siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
1034					   etype, ecode, 0);
1035			return -EINVAL;
1036		}
1037		if (unlikely(__rdmap_get_version(c_hdr) != RDMAP_VERSION)) {
1038			pr_warn("siw: received rdmap version unsupported %d\n",
1039				__rdmap_get_version(c_hdr));
1040
1041			siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_RDMAP,
1042					   RDMAP_ETYPE_REMOTE_OPERATION,
1043					   RDMAP_ECODE_VERSION, 0);
1044			return -EINVAL;
1045		}
1046		opcode = __rdmap_get_opcode(c_hdr);
1047
1048		if (opcode > RDMAP_TERMINATE) {
1049			pr_warn("siw: received unknown packet type %u\n",
1050				opcode);
1051
1052			siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_RDMAP,
1053					   RDMAP_ETYPE_REMOTE_OPERATION,
1054					   RDMAP_ECODE_OPCODE, 0);
1055			return -EINVAL;
1056		}
1057		siw_dbg_qp(rx_qp(srx), "new header, opcode %u\n", opcode);
1058	} else {
1059		opcode = __rdmap_get_opcode(c_hdr);
1060	}
1061	set_rx_fpdu_context(qp, opcode);
1062	frx = qp->rx_fpdu;
1063
1064	/*
1065	 * Figure out len of current hdr: variable length of
1066	 * iwarp hdr may force us to copy hdr information in
1067	 * two steps. Only tagged DDP messages are already
1068	 * completely received.
1069	 */
1070	if (iwarp_pktinfo[opcode].hdr_len > sizeof(struct iwarp_ctrl_tagged)) {
1071		int hdrlen = iwarp_pktinfo[opcode].hdr_len;
1072
1073		bytes = min_t(int, hdrlen - MIN_DDP_HDR, srx->skb_new);
1074
1075		skb_copy_bits(skb, srx->skb_offset,
1076			      (char *)c_hdr + srx->fpdu_part_rcvd, bytes);
1077
1078		siw_update_skb_rcvd(srx, bytes);
1079		srx->fpdu_part_rcvd += bytes;
1080		if (srx->fpdu_part_rcvd < hdrlen)
1081			return -EAGAIN;
1082	}
1083
1084	/*
1085	 * DDP/RDMAP header receive completed. Check if the current
1086	 * DDP segment starts a new RDMAP message or continues a previously
1087	 * started RDMAP message.
1088	 *
1089	 * Alternating reception of DDP segments (or FPDUs) from incomplete
1090	 * tagged and untagged RDMAP messages is supported, as long as
1091	 * the current tagged or untagged message gets eventually completed
1092	 * w/o intersection from another message of the same type
1093	 * (tagged/untagged). E.g., a WRITE can get intersected by a SEND,
1094	 * but not by a READ RESPONSE etc.
1095	 */
1096	if (srx->mpa_crc_hd) {
1097		/*
1098		 * Restart CRC computation
1099		 */
1100		crypto_shash_init(srx->mpa_crc_hd);
1101		crypto_shash_update(srx->mpa_crc_hd, (u8 *)c_hdr,
1102				    srx->fpdu_part_rcvd);
1103	}
1104	if (frx->more_ddp_segs) {
1105		frx->first_ddp_seg = 0;
1106		if (frx->prev_rdmap_op != opcode) {
1107			pr_warn("siw: packet intersection: %u : %u\n",
1108				frx->prev_rdmap_op, opcode);
1109			/*
1110			 * The last inbound RDMA operation of same type
1111			 * (tagged or untagged) is left unfinished.
1112			 * To complete it in error, make it the current
1113			 * operation again, even with the header already
1114			 * overwritten. For error handling, only the opcode
1115			 * and current rx context are relevant.
1116			 */
1117			set_rx_fpdu_context(qp, frx->prev_rdmap_op);
1118			__rdmap_set_opcode(c_hdr, frx->prev_rdmap_op);
1119			return -EPROTO;
1120		}
1121	} else {
1122		frx->prev_rdmap_op = opcode;
1123		frx->first_ddp_seg = 1;
1124	}
1125	frx->more_ddp_segs = c_hdr->ddp_rdmap_ctrl & DDP_FLAG_LAST ? 0 : 1;
1126
1127	return 0;
1128}
1129
1130static int siw_check_tx_fence(struct siw_qp *qp)
1131{
1132	struct siw_wqe *tx_waiting = tx_wqe(qp);
1133	struct siw_sqe *rreq;
1134	int resume_tx = 0, rv = 0;
1135	unsigned long flags;
1136
1137	spin_lock_irqsave(&qp->orq_lock, flags);
1138
1139	/* free current orq entry */
1140	rreq = orq_get_current(qp);
1141	WRITE_ONCE(rreq->flags, 0);
1142
1143	qp->orq_get++;
1144
1145	if (qp->tx_ctx.orq_fence) {
1146		if (unlikely(tx_waiting->wr_status != SIW_WR_QUEUED)) {
1147			pr_warn("siw: [QP %u]: fence resume: bad status %d\n",
1148				qp_id(qp), tx_waiting->wr_status);
1149			rv = -EPROTO;
1150			goto out;
1151		}
1152		/* resume SQ processing, if possible */
1153		if (tx_waiting->sqe.opcode == SIW_OP_READ ||
1154		    tx_waiting->sqe.opcode == SIW_OP_READ_LOCAL_INV) {
1155
1156			/* SQ processing was stopped because of a full ORQ */
1157			rreq = orq_get_free(qp);
1158			if (unlikely(!rreq)) {
1159				pr_warn("siw: [QP %u]: no ORQE\n", qp_id(qp));
1160				rv = -EPROTO;
1161				goto out;
1162			}
1163			siw_read_to_orq(rreq, &tx_waiting->sqe);
1164
1165			qp->orq_put++;
1166			qp->tx_ctx.orq_fence = 0;
1167			resume_tx = 1;
1168
1169		} else if (siw_orq_empty(qp)) {
1170			/*
1171			 * SQ processing was stopped by fenced work request.
1172			 * Resume since all previous Read's are now completed.
1173			 */
1174			qp->tx_ctx.orq_fence = 0;
1175			resume_tx = 1;
1176		}
1177	}
1178out:
1179	spin_unlock_irqrestore(&qp->orq_lock, flags);
1180
1181	if (resume_tx)
1182		rv = siw_sq_start(qp);
1183
1184	return rv;
1185}
1186
/*
 * siw_rdmap_complete()
 *
 * Complete processing of an RDMA message after receiving all
 * DDP segments, or abort processing after encountering an error.
 *
 *   o SENDs and READ RESPONSEs need work request completion,
 *   o READ REQUESTs need READ RESPONSE initialization,
 *   o WRITEs need memory dereferencing.
 *
 * TODO: Failed WRITEs need local error to be surfaced.
 */
1199static int siw_rdmap_complete(struct siw_qp *qp, int error)
1200{
1201	struct siw_rx_stream *srx = &qp->rx_stream;
1202	struct siw_wqe *wqe = rx_wqe(qp->rx_fpdu);
1203	enum siw_wc_status wc_status = wqe->wc_status;
1204	u8 opcode = __rdmap_get_opcode(&srx->hdr.ctrl);
1205	int rv = 0;
1206
1207	switch (opcode) {
1208	case RDMAP_SEND_SE:
1209	case RDMAP_SEND_SE_INVAL:
1210		wqe->rqe.flags |= SIW_WQE_SOLICITED;
1211		fallthrough;
1212
1213	case RDMAP_SEND:
1214	case RDMAP_SEND_INVAL:
1215		if (wqe->wr_status == SIW_WR_IDLE)
1216			break;
1217
1218		srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]++;
1219
1220		if (error != 0 && wc_status == SIW_WC_SUCCESS)
1221			wc_status = SIW_WC_GENERAL_ERR;
1222		/*
1223		 * Handle STag invalidation request
1224		 */
1225		if (wc_status == SIW_WC_SUCCESS &&
1226		    (opcode == RDMAP_SEND_INVAL ||
1227		     opcode == RDMAP_SEND_SE_INVAL)) {
1228			rv = siw_invalidate_stag(qp->pd, srx->inval_stag);
1229			if (rv) {
1230				siw_init_terminate(
1231					qp, TERM_ERROR_LAYER_RDMAP,
1232					rv == -EACCES ?
1233						RDMAP_ETYPE_REMOTE_PROTECTION :
1234						RDMAP_ETYPE_REMOTE_OPERATION,
1235					RDMAP_ECODE_CANNOT_INVALIDATE, 0);
1236
1237				wc_status = SIW_WC_REM_INV_REQ_ERR;
1238			}
1239			rv = siw_rqe_complete(qp, &wqe->rqe, wqe->processed,
1240					      rv ? 0 : srx->inval_stag,
1241					      wc_status);
1242		} else {
1243			rv = siw_rqe_complete(qp, &wqe->rqe, wqe->processed,
1244					      0, wc_status);
1245		}
1246		siw_wqe_put_mem(wqe, SIW_OP_RECEIVE);
1247		break;
1248
1249	case RDMAP_RDMA_READ_RESP:
1250		if (wqe->wr_status == SIW_WR_IDLE)
1251			break;
1252
1253		if (error != 0) {
1254			if ((srx->state == SIW_GET_HDR &&
1255			     qp->rx_fpdu->first_ddp_seg) || error == -ENODATA)
1256				/* possible RREQ in ORQ left untouched */
1257				break;
1258
1259			if (wc_status == SIW_WC_SUCCESS)
1260				wc_status = SIW_WC_GENERAL_ERR;
1261		} else if (rdma_is_kernel_res(&qp->base_qp.res) &&
1262			   rx_type(wqe) == SIW_OP_READ_LOCAL_INV) {
1263			/*
1264			 * Handle any STag invalidation request
1265			 */
1266			rv = siw_invalidate_stag(qp->pd, wqe->sqe.sge[0].lkey);
1267			if (rv) {
1268				siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
1269						   RDMAP_ETYPE_CATASTROPHIC,
1270						   RDMAP_ECODE_UNSPECIFIED, 0);
1271
1272				if (wc_status == SIW_WC_SUCCESS) {
1273					wc_status = SIW_WC_GENERAL_ERR;
1274					error = rv;
1275				}
1276			}
1277		}
1278		/*
1279		 * All errors turn the wqe into signalled.
1280		 */
1281		if ((wqe->sqe.flags & SIW_WQE_SIGNALLED) || error != 0)
1282			rv = siw_sqe_complete(qp, &wqe->sqe, wqe->processed,
1283					      wc_status);
1284		siw_wqe_put_mem(wqe, SIW_OP_READ);
1285
1286		if (!error) {
1287			rv = siw_check_tx_fence(qp);
1288		} else {
1289			/* Disable current ORQ element */
1290			if (qp->attrs.orq_size)
1291				WRITE_ONCE(orq_get_current(qp)->flags, 0);
1292		}
1293		break;
1294
1295	case RDMAP_RDMA_READ_REQ:
1296		if (!error) {
1297			rv = siw_init_rresp(qp, srx);
1298			srx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ]++;
1299		}
1300		break;
1301
1302	case RDMAP_RDMA_WRITE:
1303		if (wqe->wr_status == SIW_WR_IDLE)
1304			break;
1305
1306		/*
1307		 * Free References from memory object if
1308		 * attached to receive context (inbound WRITE).
1309		 * While a zero-length WRITE is allowed,
1310		 * no memory reference got created.
1311		 */
1312		if (rx_mem(&qp->rx_tagged)) {
1313			siw_mem_put(rx_mem(&qp->rx_tagged));
1314			rx_mem(&qp->rx_tagged) = NULL;
1315		}
1316		break;
1317
1318	default:
1319		break;
1320	}
1321	wqe->wr_status = SIW_WR_IDLE;
1322
1323	return rv;
1324}
1325
1326/*
1327 * siw_tcp_rx_data()
1328 *
1329 * Main routine to consume inbound TCP payload
1330 *
1331 * @rd_desc:	read descriptor
1332 * @skb:	socket buffer
1333 * @off:	offset in skb
1334 * @len:	skb->len - offset : payload in skb
1335 */
1336int siw_tcp_rx_data(read_descriptor_t *rd_desc, struct sk_buff *skb,
1337		    unsigned int off, size_t len)
1338{
1339	struct siw_qp *qp = rd_desc->arg.data;
1340	struct siw_rx_stream *srx = &qp->rx_stream;
1341	int rv;
1342
1343	srx->skb = skb;
1344	srx->skb_new = skb->len - off;
1345	srx->skb_offset = off;
1346	srx->skb_copied = 0;
1347
1348	siw_dbg_qp(qp, "new data, len %d\n", srx->skb_new);
1349
1350	while (srx->skb_new) {
1351		int run_completion = 1;
1352
1353		if (unlikely(srx->rx_suspend)) {
1354			/* Do not process any more data */
1355			srx->skb_copied += srx->skb_new;
1356			break;
1357		}
1358		switch (srx->state) {
1359		case SIW_GET_HDR:
1360			rv = siw_get_hdr(srx);
1361			if (!rv) {
1362				srx->fpdu_part_rem =
1363					be16_to_cpu(srx->hdr.ctrl.mpa_len) -
1364					srx->fpdu_part_rcvd + MPA_HDR_SIZE;
1365
1366				if (srx->fpdu_part_rem)
1367					srx->pad = -srx->fpdu_part_rem & 0x3;
1368				else
1369					srx->pad = 0;
1370
1371				srx->state = SIW_GET_DATA_START;
1372				srx->fpdu_part_rcvd = 0;
1373			}
1374			break;
1375
1376		case SIW_GET_DATA_MORE:
1377			/*
1378			 * Another data fragment of the same DDP segment.
1379			 * Setting first_ddp_seg = 0 avoids repeating
1380			 * initializations that shall occur only once per
1381			 * DDP segment.
1382			 */
1383			qp->rx_fpdu->first_ddp_seg = 0;
1384			fallthrough;
1385
1386		case SIW_GET_DATA_START:
1387			/*
1388			 * Headers will be checked by the opcode-specific
1389			 * data receive function below.
1390			 */
1391			rv = iwarp_pktinfo[qp->rx_stream.rdmap_op].rx_data(qp);
1392			if (!rv) {
1393				int mpa_len =
1394					be16_to_cpu(srx->hdr.ctrl.mpa_len)
1395					+ MPA_HDR_SIZE;
1396
1397				srx->fpdu_part_rem = (-mpa_len & 0x3)
1398						      + MPA_CRC_SIZE;
1399				srx->fpdu_part_rcvd = 0;
1400				srx->state = SIW_GET_TRAILER;
1401			} else {
1402				if (unlikely(rv == -ECONNRESET))
1403					run_completion = 0;
1404				else
1405					srx->state = SIW_GET_DATA_MORE;
1406			}
1407			break;
1408
1409		case SIW_GET_TRAILER:
1410			/*
1411			 * read CRC + any padding
1412			 */
1413			rv = siw_get_trailer(qp, srx);
1414			if (likely(!rv)) {
1415				/*
1416				 * FPDU completed.
1417				 * complete RDMAP message if last fragment
1418				 */
1419				srx->state = SIW_GET_HDR;
1420				srx->fpdu_part_rcvd = 0;
1421
1422				if (!(srx->hdr.ctrl.ddp_rdmap_ctrl &
1423				      DDP_FLAG_LAST))
1424					/* more frags */
1425					break;
1426
1427				rv = siw_rdmap_complete(qp, 0);
1428				run_completion = 0;
1429			}
1430			break;
1431
1432		default:
1433			pr_warn("QP[%u]: RX out of state\n", qp_id(qp));
1434			rv = -EPROTO;
1435			run_completion = 0;
1436		}
1437		if (unlikely(rv != 0 && rv != -EAGAIN)) {
1438			if ((srx->state > SIW_GET_HDR ||
1439			     qp->rx_fpdu->more_ddp_segs) && run_completion)
1440				siw_rdmap_complete(qp, rv);
1441
1442			siw_dbg_qp(qp, "rx error %d, rx state %d\n", rv,
1443				   srx->state);
1444
1445			siw_qp_cm_drop(qp, 1);
1446
1447			break;
1448		}
1449		if (rv) {
1450			siw_dbg_qp(qp, "fpdu fragment, state %d, missing %d\n",
1451				   srx->state, srx->fpdu_part_rem);
1452			break;
1453		}
1454	}
1455	return srx->skb_copied;
1456}
1457