/*
 * Copyright (c) 2006 Mellanox Technologies Ltd.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include <linux/tcp.h>
#include <asm/ioctls.h>
#include <linux/workqueue.h>
#include <linux/net.h>
#include <linux/socket.h>
#include <net/protocol.h>
#include <net/inet_common.h>
#include <rdma/rdma_cm.h>
#include <rdma/ib_verbs.h>
#include <rdma/ib_fmr_pool.h>
#include <rdma/ib_umem.h>
#include <net/tcp.h> /* for memcpy_toiovec */
#include <asm/io.h>
#include <asm/uaccess.h>
#include <linux/delay.h>
#include "sdp.h"

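/*
 * Build a SrcAvail message for a zero-copy send: advertise the FMR-mapped
 * user buffer to the peer, inline the first chunk of payload from the first
 * umem page, and queue the mbuf on the tx queue.
 */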
static int sdp_post_srcavail(struct socket *sk, struct tx_srcavail_state *tx_sa)
{
	struct sdp_sock *ssk = sdp_sk(sk);
	struct mbuf *mb;
	int payload_len;
	struct page *payload_pg;
	int off, len;
	struct ib_umem_chunk *chunk;

	WARN_ON(ssk->tx_sa);

	BUG_ON(!tx_sa);
	BUG_ON(!tx_sa->fmr || !tx_sa->fmr->fmr->lkey);
	BUG_ON(!tx_sa->umem);
	BUG_ON(!tx_sa->umem->chunk_list.next);

	chunk = list_entry(tx_sa->umem->chunk_list.next, struct ib_umem_chunk, list);
	BUG_ON(!chunk->nmap);

	off = tx_sa->umem->offset;
	len = tx_sa->umem->length;

	tx_sa->bytes_sent = tx_sa->bytes_acked = 0;

	mb = sdp_alloc_mb_srcavail(sk, len, tx_sa->fmr->fmr->lkey, off, 0);
	if (!mb) {
		return -ENOMEM;
	}
	sdp_dbg_data(sk, "sending SrcAvail\n");

	TX_SRCAVAIL_STATE(mb) = tx_sa; /* tx_sa is attached to the mb but
					 * continues to live after the mb is freed */
	ssk->tx_sa = tx_sa;

	/* must have payload inlined in SrcAvail packet in combined mode */
	payload_len = MIN(tx_sa->umem->page_size - off, len);
	payload_len = MIN(payload_len, ssk->xmit_size_goal - sizeof(struct sdp_srcah));
	payload_pg  = sg_page(&chunk->page_list[0]);
	get_page(payload_pg);

	sdp_dbg_data(sk, "payload: off: 0x%x, pg: %p, len: 0x%x\n",
		off, payload_pg, payload_len);

	mb_fill_page_desc(mb, mb_shinfo(mb)->nr_frags,
			payload_pg, off, payload_len);

	mb->len             += payload_len;
	mb->data_len         = payload_len;
	mb->truesize        += payload_len;
//	sk->sk_wmem_queued   += payload_len;
//	sk->sk_forward_alloc -= payload_len;

	mb_entail(sk, ssk, mb);

	ssk->write_seq += payload_len;
	SDP_SKB_CB(mb)->end_seq += payload_len;

	tx_sa->bytes_sent = tx_sa->umem->length;
	tx_sa->bytes_acked = payload_len;

	/* TODO: pushing the mb into the tx_queue should be enough */

	return 0;
}

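/*
 * Queue a SrcAvailCancel message and arm a delayed work item that will tear
 * down the connection if the peer never answers the cancel.
 */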
static int sdp_post_srcavail_cancel(struct socket *sk)
{
	struct sdp_sock *ssk = sdp_sk(sk);
	struct mbuf *mb;

	sdp_dbg_data(ssk->socket, "Posting srcavail cancel\n");

	mb = sdp_alloc_mb_srcavail_cancel(sk, 0);
	mb_entail(sk, ssk, mb);

	sdp_post_sends(ssk, 0);

	schedule_delayed_work(&ssk->srcavail_cancel_work,
			SDP_SRCAVAIL_CANCEL_TIMEOUT);

	return 0;
}

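/*
 * Delayed-work handler: the peer answered neither the SrcAvail nor the
 * SrcAvailCancel in time, so reset the connection.
 */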
void srcavail_cancel_timeout(struct work_struct *work)
{
	struct sdp_sock *ssk =
		container_of(work, struct sdp_sock, srcavail_cancel_work.work);
	struct socket *sk = ssk->socket;

	lock_sock(sk);

	sdp_dbg_data(sk, "both SrcAvail and SrcAvailCancel timed out."
			" closing connection\n");
	sdp_set_error(sk, -ECONNRESET);
	wake_up(&ssk->wq);

	release_sock(sk);
}

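/*
 * Sleep until the peer acknowledges the whole SrcAvail with RdmaRdCompl
 * messages (bytes_acked == bytes_sent), or until a timeout, a signal, a
 * SendSM abort or a crossing SrcAvail ends the wait early.
 */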
static int sdp_wait_rdmardcompl(struct sdp_sock *ssk, long *timeo_p,
		int ignore_signals)
{
	struct socket *sk = ssk->socket;
	int err = 0;
	long vm_wait = 0;
	long current_timeo = *timeo_p;
	struct tx_srcavail_state *tx_sa = ssk->tx_sa;
	DEFINE_WAIT(wait);

	sdp_dbg_data(sk, "sleep till RdmaRdCompl. timeo = %ld.\n", *timeo_p);
	sdp_prf1(sk, NULL, "Going to sleep");
	while (ssk->qp_active) {
		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);

		if (unlikely(!*timeo_p)) {
			err = -ETIME;
			tx_sa->abort_flags |= TX_SA_TIMEDOUT;
			sdp_prf1(sk, NULL, "timeout");
			SDPSTATS_COUNTER_INC(zcopy_tx_timeout);
			break;
		} else if (tx_sa->bytes_acked > tx_sa->bytes_sent) {
			err = -EINVAL;
			sdp_dbg_data(sk, "acked bytes > sent bytes\n");
			tx_sa->abort_flags |= TX_SA_ERROR;
			break;
		}

		if (tx_sa->abort_flags & TX_SA_SENDSM) {
			sdp_prf1(sk, NULL, "Aborting SrcAvail sending");
			SDPSTATS_COUNTER_INC(zcopy_tx_aborted);
			err = -EAGAIN;
			break;
		}

		if (!ignore_signals) {
			if (signal_pending(current)) {
				err = -EINTR;
				sdp_prf1(sk, NULL, "signalled");
				tx_sa->abort_flags |= TX_SA_INTRRUPTED;
				break;
			}

			if (ssk->rx_sa && (tx_sa->bytes_acked < tx_sa->bytes_sent)) {
				sdp_dbg_data(sk, "Crossing SrcAvail - aborting this\n");
				tx_sa->abort_flags |= TX_SA_CROSS_SEND;
				SDPSTATS_COUNTER_INC(zcopy_cross_send);
				err = -ETIME;
				break;
			}
		}

		posts_handler_put(ssk);

		sk_wait_event(sk, &current_timeo,
				tx_sa->abort_flags &&
				ssk->rx_sa &&
				(tx_sa->bytes_acked < tx_sa->bytes_sent) &&
				vm_wait);
		sdp_dbg_data(ssk->socket, "woke up sleepers\n");

		posts_handler_get(ssk);

		if (tx_sa->bytes_acked == tx_sa->bytes_sent)
			break;

		if (vm_wait) {
			vm_wait -= current_timeo;
			current_timeo = *timeo_p;
			if (current_timeo != MAX_SCHEDULE_TIMEOUT &&
			    (current_timeo -= vm_wait) < 0)
				current_timeo = 0;
			vm_wait = 0;
		}
		*timeo_p = current_timeo;
	}

	finish_wait(sk->sk_sleep, &wait);

	sdp_dbg_data(sk, "Finished waiting - RdmaRdCompl: %d/%d bytes, flags: 0x%x\n",
			tx_sa->bytes_acked, tx_sa->bytes_sent, tx_sa->abort_flags);

	if (!ssk->qp_active) {
		sdp_dbg(sk, "QP destroyed while waiting\n");
		return -EINVAL;
	}
	return err;
}

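/*
 * Sleep uninterruptibly, bounded by a 5 second timeout, until the locally
 * posted RDMA read completes or the QP is destroyed.
 */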
static void sdp_wait_rdma_wr_finished(struct sdp_sock *ssk)
{
	struct socket *sk = ssk->socket;
	long timeo = HZ * 5; /* Timeout for RDMA read */
	DEFINE_WAIT(wait);

	sdp_dbg_data(sk, "Sleep till RDMA wr finished.\n");
	while (1) {
		prepare_to_wait(sk->sk_sleep, &wait, TASK_UNINTERRUPTIBLE);

		if (!ssk->tx_ring.rdma_inflight->busy) {
			sdp_dbg_data(sk, "got rdma cqe\n");
			break;
		}

		if (!ssk->qp_active) {
			sdp_dbg_data(sk, "QP destroyed\n");
			break;
		}

		if (!timeo) {
			sdp_warn(sk, "Panic: Timed out waiting for RDMA read\n");
			WARN_ON(1);
			break;
		}

		posts_handler_put(ssk);

		sdp_prf1(sk, NULL, "Going to sleep");
		sk_wait_event(sk, &timeo,
			!ssk->tx_ring.rdma_inflight->busy);
		sdp_prf1(sk, NULL, "Woke up");
		sdp_dbg_data(ssk->socket, "woke up sleepers\n");

		posts_handler_get(ssk);
	}

	finish_wait(sk->sk_sleep, &wait);

	sdp_dbg_data(sk, "Finished waiting\n");
}

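/*
 * Report RDMA read progress to the peer: send an RdmaRdCompl covering any
 * bytes consumed from the SrcAvail that have not been reported yet.
 */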
int sdp_post_rdma_rd_compl(struct sdp_sock *ssk,
		struct rx_srcavail_state *rx_sa)
{
	struct mbuf *mb;
	int copied = rx_sa->used - rx_sa->reported;

	if (rx_sa->used <= rx_sa->reported)
		return 0;

	mb = sdp_alloc_mb_rdmardcompl(ssk->socket, copied, 0);

	rx_sa->reported += copied;

	/* TODO: What if no tx_credits available? */
	sdp_post_send(ssk, mb);

	return 0;
}

int sdp_post_sendsm(struct socket *sk)
{
	struct mbuf *mb = sdp_alloc_mb_sendsm(sk, 0);

	sdp_post_send(sdp_sk(sk), mb);

	return 0;
}

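/*
 * Advance the iovec by 'len' consumed bytes, spilling over into subsequent
 * entries when one is exhausted.
 */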
static int sdp_update_iov_used(struct socket *sk, struct iovec *iov, int len)
{
	sdp_dbg_data(sk, "updating consumed 0x%x bytes from iov\n", len);
	while (len > 0) {
		if (iov->iov_len) {
			int copy = min_t(unsigned int, iov->iov_len, len);
			len -= copy;
			iov->iov_len -= copy;
			iov->iov_base += copy;
		}
		iov++;
	}

	return 0;
}

static inline int sge_bytes(struct ib_sge *sge, int sge_cnt)
{
	int bytes = 0;

	while (sge_cnt > 0) {
		bytes += sge->length;
		sge++;
		sge_cnt--;
	}

	return bytes;
}
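
/*
 * Handle an incoming SendSM: if it refers to the SrcAvail currently in
 * flight, abort the zero-copy send and wake the sender.
 */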
void sdp_handle_sendsm(struct sdp_sock *ssk, u32 mseq_ack)
{
	struct socket *sk = ssk->socket;
	unsigned long flags;

	spin_lock_irqsave(&ssk->tx_sa_lock, flags);

	if (!ssk->tx_sa) {
		sdp_prf1(sk, NULL, "SendSM for cancelled/finished SrcAvail");
		goto out;
	}

	if (ssk->tx_sa->mseq > mseq_ack) {
		sdp_dbg_data(sk, "SendSM arrived for old SrcAvail. "
			"SendSM mseq_ack: 0x%x, SrcAvail mseq: 0x%x\n",
			mseq_ack, ssk->tx_sa->mseq);
		goto out;
	}

	sdp_dbg_data(sk, "Got SendSM - aborting SrcAvail\n");

	ssk->tx_sa->abort_flags |= TX_SA_SENDSM;
	cancel_delayed_work(&ssk->srcavail_cancel_work);

	wake_up(sk->sk_sleep);
	sdp_dbg_data(sk, "woke up sleepers\n");

out:
	spin_unlock_irqrestore(&ssk->tx_sa_lock, flags);
}

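/*
 * Handle an incoming RdmaRdCompl: credit the acknowledged bytes to the
 * SrcAvail currently in flight and wake the sender waiting in
 * sdp_wait_rdmardcompl().
 */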
void sdp_handle_rdma_read_compl(struct sdp_sock *ssk, u32 mseq_ack,
		u32 bytes_completed)
{
	struct socket *sk = ssk->socket;
	unsigned long flags;

	sdp_prf1(sk, NULL, "RdmaRdCompl ssk=%p tx_sa=%p", ssk, ssk->tx_sa);
	sdp_dbg_data(sk, "RdmaRdCompl ssk=%p tx_sa=%p\n", ssk, ssk->tx_sa);

	spin_lock_irqsave(&ssk->tx_sa_lock, flags);

	BUG_ON(!ssk);

	if (!ssk->tx_sa) {
		sdp_dbg_data(sk, "Got RdmaRdCompl for aborted SrcAvail\n");
		goto out;
	}

	if (ssk->tx_sa->mseq > mseq_ack) {
		sdp_dbg_data(sk, "RdmaRdCompl arrived for old SrcAvail. "
			"RdmaRdCompl mseq_ack: 0x%x, SrcAvail mseq: 0x%x\n",
			mseq_ack, ssk->tx_sa->mseq);
		goto out;
	}

	ssk->tx_sa->bytes_acked += bytes_completed;

	wake_up(sk->sk_sleep);
	sdp_dbg_data(sk, "woke up sleepers\n");

out:
	spin_unlock_irqrestore(&ssk->tx_sa_lock, flags);
	return;
}

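/*
 * Return how many more bytes this process may pin, based on RLIMIT_MEMLOCK
 * and what is already locked (unlimited for CAP_IPC_LOCK).
 */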
static unsigned long sdp_get_max_memlockable_bytes(unsigned long offset)
{
	unsigned long avail;
	unsigned long lock_limit;

	if (capable(CAP_IPC_LOCK))
		return ULONG_MAX;

	lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
	avail = lock_limit - (current->mm->locked_vm << PAGE_SHIFT);

	return avail - offset;
}

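/*
 * Pin the user buffer (ib_umem_get), collect its DMA page addresses and map
 * them through the FMR pool so the buffer can be the target of RDMA.  The
 * requested length is clamped to SDP_MAX_RDMA_READ_LEN and to what
 * RLIMIT_MEMLOCK still allows.
 */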
static int sdp_alloc_fmr(struct socket *sk, void *uaddr, size_t len,
	struct ib_pool_fmr **_fmr, struct ib_umem **_umem)
{
	struct ib_pool_fmr *fmr;
	struct ib_umem *umem;
	struct ib_device *dev;
	u64 *pages;
	struct ib_umem_chunk *chunk;
	int n, j, k;
	int rc = 0;
	unsigned long max_lockable_bytes;

	if (unlikely(len > SDP_MAX_RDMA_READ_LEN)) {
		sdp_dbg_data(sk, "len:0x%lx > FMR_SIZE: 0x%lx\n",
			len, SDP_MAX_RDMA_READ_LEN);
		len = SDP_MAX_RDMA_READ_LEN;
	}

	max_lockable_bytes = sdp_get_max_memlockable_bytes((unsigned long)uaddr & ~PAGE_MASK);
	if (unlikely(len > max_lockable_bytes)) {
		sdp_dbg_data(sk, "len:0x%lx > RLIMIT_MEMLOCK available: 0x%lx\n",
			len, max_lockable_bytes);
		len = max_lockable_bytes;
	}

	sdp_dbg_data(sk, "user buf: %p, len:0x%lx max_lockable_bytes: 0x%lx\n",
			uaddr, len, max_lockable_bytes);

	umem = ib_umem_get(&sdp_sk(sk)->context, (unsigned long)uaddr, len,
		IB_ACCESS_REMOTE_WRITE, 0);

	if (IS_ERR(umem)) {
		rc = PTR_ERR(umem);
		sdp_warn(sk, "Error doing umem_get 0x%lx bytes: %d\n", len, rc);
		sdp_warn(sk, "RLIMIT_MEMLOCK: 0x%lx[cur] 0x%lx[max] CAP_IPC_LOCK: %d\n",
				current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur,
				current->signal->rlim[RLIMIT_MEMLOCK].rlim_max,
				capable(CAP_IPC_LOCK));
		goto err_umem_get;
	}

	sdp_dbg_data(sk, "umem->offset = 0x%x, length = 0x%lx\n",
		umem->offset, umem->length);

	pages = (u64 *) __get_free_page(GFP_KERNEL);
	if (!pages) {
		rc = -ENOMEM;
		goto err_pages_alloc;
	}

	n = 0;

	dev = sdp_sk(sk)->ib_device;
	list_for_each_entry(chunk, &umem->chunk_list, list) {
		for (j = 0; j < chunk->nmap; ++j) {
			len = ib_sg_dma_len(dev,
					&chunk->page_list[j]) >> PAGE_SHIFT;

			for (k = 0; k < len; ++k) {
				pages[n++] = ib_sg_dma_address(dev,
						&chunk->page_list[j]) +
					umem->page_size * k;
			}
		}
	}

	fmr = ib_fmr_pool_map_phys(sdp_sk(sk)->sdp_dev->fmr_pool, pages, n, 0);
	if (IS_ERR(fmr)) {
		rc = PTR_ERR(fmr);
		sdp_warn(sk, "Error allocating fmr: %d\n", rc);
		goto err_fmr_alloc;
	}

	free_page((unsigned long) pages);

	*_umem = umem;
	*_fmr = fmr;

	return 0;

err_fmr_alloc:
	free_page((unsigned long) pages);

err_pages_alloc:
	ib_umem_release(umem);

err_umem_get:

	return rc;
}

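/*
 * Release the FMR mapping and the pinned user pages set up by
 * sdp_alloc_fmr().  Skipped when the QP is no longer active.
 */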
void sdp_free_fmr(struct socket *sk, struct ib_pool_fmr **_fmr, struct ib_umem **_umem)
{
	if (!sdp_sk(sk)->qp_active)
		return;

	ib_fmr_pool_unmap(*_fmr);
	*_fmr = NULL;

	ib_umem_release(*_umem);
	*_umem = NULL;
}

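/*
 * Post a signalled RDMA read that pulls the peer's advertised buffer
 * (rx_sa->vaddr/rkey) into the locally FMR-mapped buffer, and mark the read
 * as in flight on the tx ring.
 */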
static int sdp_post_rdma_read(struct socket *sk, struct rx_srcavail_state *rx_sa)
{
	struct sdp_sock *ssk = sdp_sk(sk);
	struct ib_send_wr *bad_wr;
	struct ib_send_wr wr = { NULL };
	struct ib_sge sge;

	wr.opcode = IB_WR_RDMA_READ;
	wr.next = NULL;
	wr.wr_id = SDP_OP_RDMA;
	wr.wr.rdma.rkey = rx_sa->rkey;
	wr.send_flags = 0;

	ssk->tx_ring.rdma_inflight = rx_sa;

	sge.addr = rx_sa->umem->offset;
	sge.length = rx_sa->umem->length;
	sge.lkey = rx_sa->fmr->fmr->lkey;

	wr.wr.rdma.remote_addr = rx_sa->vaddr + rx_sa->used;
	wr.num_sge = 1;
	wr.sg_list = &sge;
	rx_sa->busy++;

	wr.send_flags = IB_SEND_SIGNALED;

	return ib_post_send(ssk->qp, &wr, &bad_wr);
}

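/*
 * Zero-copy receive path for a SrcAvail: map the user iovec for RDMA, post
 * the RDMA read, wait for its completion and account the copied bytes in the
 * iovec and in rcv_nxt.  Returns 0 on success or a negative errno.
 */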
int sdp_rdma_to_iovec(struct socket *sk, struct iovec *iov, struct mbuf *mb,
		unsigned long *used)
{
	struct sdp_sock *ssk = sdp_sk(sk);
	struct rx_srcavail_state *rx_sa = RX_SRCAVAIL_STATE(mb);
	int got_srcavail_cancel;
	int rc = 0;
	int len = *used;
	int copied;

	sdp_dbg_data(ssk->socket, "preparing RDMA read."
		" len: 0x%x. buffer len: 0x%lx\n", len, iov->iov_len);

	sock_hold(sk, SOCK_REF_RDMA_RD);

	if (len > rx_sa->len) {
		sdp_warn(sk, "len:0x%x > rx_sa->len: 0x%x\n", len, rx_sa->len);
		WARN_ON(1);
		len = rx_sa->len;
	}

	rc = sdp_alloc_fmr(sk, iov->iov_base, len, &rx_sa->fmr, &rx_sa->umem);
	if (rc) {
		sdp_warn(sk, "Error allocating fmr: %d\n", rc);
		goto err_alloc_fmr;
	}

	rc = sdp_post_rdma_read(sk, rx_sa);
	if (unlikely(rc)) {
		sdp_warn(sk, "ib_post_send failed with status %d.\n", rc);
		sdp_set_error(ssk->socket, -ECONNRESET);
		wake_up(&ssk->wq);
		goto err_post_send;
	}

	sdp_prf(sk, mb, "Finished posting(rc=%d), now to wait", rc);

	got_srcavail_cancel = ssk->srcavail_cancel_mseq > rx_sa->mseq;

	sdp_arm_tx_cq(sk);

	sdp_wait_rdma_wr_finished(ssk);

	sdp_prf(sk, mb, "Finished waiting(rc=%d)", rc);
	if (!ssk->qp_active) {
		sdp_dbg_data(sk, "QP destroyed during RDMA read\n");
		rc = -EPIPE;
		goto err_post_send;
	}

	copied = rx_sa->umem->length;

	sdp_update_iov_used(sk, iov, copied);
	rx_sa->used += copied;
	atomic_add(copied, &ssk->rcv_nxt);
	*used = copied;

	ssk->tx_ring.rdma_inflight = NULL;

err_post_send:
	sdp_free_fmr(sk, &rx_sa->fmr, &rx_sa->umem);

err_alloc_fmr:
	if (rc && ssk->qp_active) {
		sdp_warn(sk, "Couldn't do RDMA - post sendsm\n");
		rx_sa->flags |= RX_SA_ABORTED;
	}

	sock_put(sk, SOCK_REF_RDMA_RD);

	return rc;
}

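/*
 * Flush pending posts and wait until at least one tx slot / enough credits
 * are available for the SrcAvail message.
 */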
static inline int wait_for_sndbuf(struct socket *sk, long *timeo_p)
{
	struct sdp_sock *ssk = sdp_sk(sk);
	int ret = 0;
	int credits_needed = 1;

	sdp_dbg_data(sk, "Wait for mem\n");

	set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);

	SDPSTATS_COUNTER_INC(send_wait_for_mem);

	sdp_do_posts(ssk);

	sdp_xmit_poll(ssk, 1);

	ret = sdp_tx_wait_memory(ssk, timeo_p, &credits_needed);

	return ret;
}

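/*
 * One zero-copy send iteration: map the iovec for RDMA, post a SrcAvail and
 * wait for the peer to read the buffer.  On failure, cancel the SrcAvail (or
 * reset the connection) before releasing the mapping.
 */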
static int do_sdp_sendmsg_zcopy(struct socket *sk, struct tx_srcavail_state *tx_sa,
		struct iovec *iov, long *timeo)
{
	struct sdp_sock *ssk = sdp_sk(sk);
	int rc = 0;
	unsigned long lock_flags;

	rc = sdp_alloc_fmr(sk, iov->iov_base, iov->iov_len,
			&tx_sa->fmr, &tx_sa->umem);
	if (rc) {
		sdp_warn(sk, "Error allocating fmr: %d\n", rc);
		goto err_alloc_fmr;
	}

	if (tx_slots_free(ssk) == 0) {
		rc = wait_for_sndbuf(sk, timeo);
		if (rc) {
			sdp_warn(sk, "Couldn't get send buffer\n");
			goto err_no_tx_slots;
		}
	}

	rc = sdp_post_srcavail(sk, tx_sa);
	if (rc) {
		sdp_dbg(sk, "Error posting SrcAvail\n");
		goto err_abort_send;
	}

	rc = sdp_wait_rdmardcompl(ssk, timeo, 0);
	if (unlikely(rc)) {
		enum tx_sa_flag f = tx_sa->abort_flags;

		if (f & TX_SA_SENDSM) {
			sdp_dbg_data(sk, "Got SendSM. use SEND verb.\n");
		} else if (f & TX_SA_ERROR) {
			sdp_dbg_data(sk, "SrcAvail error completion\n");
			sdp_reset(sk);
			SDPSTATS_COUNTER_INC(zcopy_tx_error);
		} else if (ssk->qp_active) {
			sdp_post_srcavail_cancel(sk);

			/* Wait for RdmaRdCompl/SendSM to
			 * finish the transaction */
			*timeo = 2 * HZ;
			sdp_dbg_data(sk, "Waiting for SendSM\n");
			sdp_wait_rdmardcompl(ssk, timeo, 1);
			sdp_dbg_data(sk, "finished waiting\n");

			cancel_delayed_work(&ssk->srcavail_cancel_work);
		} else {
			sdp_dbg_data(sk, "QP was destroyed while waiting\n");
		}
	} else {
		sdp_dbg_data(sk, "got RdmaRdCompl\n");
	}

	spin_lock_irqsave(&ssk->tx_sa_lock, lock_flags);
	ssk->tx_sa = NULL;
	spin_unlock_irqrestore(&ssk->tx_sa_lock, lock_flags);

err_abort_send:
	sdp_update_iov_used(sk, iov, tx_sa->bytes_acked);

err_no_tx_slots:
	sdp_free_fmr(sk, &tx_sa->fmr, &tx_sa->umem);

err_alloc_fmr:
	return rc;
}

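/*
 * Entry point for zero-copy sendmsg: loop over the iovec, sending one
 * SrcAvail per iteration, until everything is sent, an error occurs or the
 * remainder is small enough to be better served by bcopy.  Returns the
 * number of bytes consumed from the iovec, or a negative errno.
 */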
int sdp_sendmsg_zcopy(struct kiocb *iocb, struct socket *sk, struct iovec *iov)
{
	struct sdp_sock *ssk = sdp_sk(sk);
	int rc = 0;
	long timeo;
	struct tx_srcavail_state *tx_sa;
	int offset;
	size_t bytes_to_copy = 0;
	int copied = 0;

	sdp_dbg_data(sk, "Sending iov: %p, iov_len: 0x%lx\n",
			iov->iov_base, iov->iov_len);
	sdp_prf1(sk, NULL, "sdp_sendmsg_zcopy start");
	if (ssk->rx_sa) {
		sdp_dbg_data(sk, "Deadlock prevention: crossing SrcAvail\n");
		return 0;
	}

	sock_hold(ssk->socket, SOCK_REF_ZCOPY);

	SDPSTATS_COUNTER_INC(sendmsg_zcopy_segment);

	timeo = SDP_SRCAVAIL_ADV_TIMEOUT;

	/* Ok commence sending. */
	offset = (unsigned long)iov->iov_base & (PAGE_SIZE - 1);

	tx_sa = kmalloc(sizeof(struct tx_srcavail_state), GFP_KERNEL);
	if (!tx_sa) {
		sdp_warn(sk, "Error allocating zcopy context\n");
		rc = -EAGAIN; /* Buffer too big - fallback to bcopy */
		goto err_alloc_tx_sa;
	}

	bytes_to_copy = iov->iov_len;
	do {
		tx_sa_reset(tx_sa);

		rc = do_sdp_sendmsg_zcopy(sk, tx_sa, iov, &timeo);

		if (iov->iov_len && iov->iov_len < sdp_zcopy_thresh) {
			sdp_dbg_data(sk, "0x%lx bytes left, switching to bcopy\n",
				iov->iov_len);
			break;
		}
	} while (!rc && iov->iov_len > 0 && !tx_sa->abort_flags);

	kfree(tx_sa);
err_alloc_tx_sa:
	copied = bytes_to_copy - iov->iov_len;

	sdp_prf1(sk, NULL, "sdp_sendmsg_zcopy end rc: %d copied: %d", rc, copied);

	sock_put(ssk->socket, SOCK_REF_ZCOPY);

	if (rc < 0 && rc != -EAGAIN && rc != -ETIME)
		return rc;

	return copied;
}

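/*
 * Tear down the transmit-side SrcAvail state: cancel the pending cancel
 * work and release its FMR/umem under the tx_sa lock.
 */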
void sdp_abort_srcavail(struct socket *sk)
{
	struct sdp_sock *ssk = sdp_sk(sk);
	struct tx_srcavail_state *tx_sa = ssk->tx_sa;
	unsigned long flags;

	if (!tx_sa)
		return;

	cancel_delayed_work(&ssk->srcavail_cancel_work);
	flush_scheduled_work();

	spin_lock_irqsave(&ssk->tx_sa_lock, flags);

	sdp_free_fmr(sk, &tx_sa->fmr, &tx_sa->umem);

	ssk->tx_sa = NULL;

	spin_unlock_irqrestore(&ssk->tx_sa_lock, flags);
}

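/*
 * Tear down the receive-side SrcAvail state: release the FMR/umem of an
 * in-flight RDMA read.
 */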
void sdp_abort_rdma_read(struct socket *sk)
{
	struct sdp_sock *ssk = sdp_sk(sk);
	struct rx_srcavail_state *rx_sa = ssk->rx_sa;

	if (!rx_sa)
		return;

	sdp_free_fmr(sk, &rx_sa->fmr, &rx_sa->umem);

	ssk->rx_sa = NULL;
}