1219820Sjeff/* 2219820Sjeff * Copyright (c) 2006 Mellanox Technologies Ltd. All rights reserved. 3219820Sjeff * 4219820Sjeff * This software is available to you under a choice of one of two 5219820Sjeff * licenses. You may choose to be licensed under the terms of the GNU 6219820Sjeff * General Public License (GPL) Version 2, available from the file 7219820Sjeff * COPYING in the main directory of this source tree, or the 8219820Sjeff * OpenIB.org BSD license below: 9219820Sjeff * 10219820Sjeff * Redistribution and use in source and binary forms, with or 11219820Sjeff * without modification, are permitted provided that the following 12219820Sjeff * conditions are met: 13219820Sjeff * 14219820Sjeff * - Redistributions of source code must retain the above 15219820Sjeff * copyright notice, this list of conditions and the following 16219820Sjeff * disclaimer. 17219820Sjeff * 18219820Sjeff * - Redistributions in binary form must reproduce the above 19219820Sjeff * copyright notice, this list of conditions and the following 20219820Sjeff * disclaimer in the documentation and/or other materials 21219820Sjeff * provided with the distribution. 22219820Sjeff * 23219820Sjeff * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 24219820Sjeff * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 25219820Sjeff * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 26219820Sjeff * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 27219820Sjeff * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 28219820Sjeff * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 29219820Sjeff * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 30219820Sjeff * SOFTWARE. 31219820Sjeff */ 32219820Sjeff#include <linux/tcp.h> 33219820Sjeff#include <asm/ioctls.h> 34219820Sjeff#include <linux/workqueue.h> 35219820Sjeff#include <linux/net.h> 36219820Sjeff#include <linux/socket.h> 37219820Sjeff#include <net/protocol.h> 38219820Sjeff#include <net/inet_common.h> 39219820Sjeff#include <rdma/rdma_cm.h> 40219820Sjeff#include <rdma/ib_verbs.h> 41219820Sjeff#include <rdma/ib_fmr_pool.h> 42219820Sjeff#include <rdma/ib_umem.h> 43219820Sjeff#include <net/tcp.h> /* for memcpy_toiovec */ 44219820Sjeff#include <asm/io.h> 45219820Sjeff#include <asm/uaccess.h> 46219820Sjeff#include <linux/delay.h> 47219820Sjeff#include "sdp.h" 48219820Sjeff 49219820Sjeffstatic int sdp_post_srcavail(struct socket *sk, struct tx_srcavail_state *tx_sa) 50219820Sjeff{ 51219820Sjeff struct sdp_sock *ssk = sdp_sk(sk); 52219820Sjeff struct mbuf *mb; 53219820Sjeff int payload_len; 54219820Sjeff struct page *payload_pg; 55219820Sjeff int off, len; 56219820Sjeff struct ib_umem_chunk *chunk; 57219820Sjeff 58219820Sjeff WARN_ON(ssk->tx_sa); 59219820Sjeff 60219820Sjeff BUG_ON(!tx_sa); 61219820Sjeff BUG_ON(!tx_sa->fmr || !tx_sa->fmr->fmr->lkey); 62219820Sjeff BUG_ON(!tx_sa->umem); 63219820Sjeff BUG_ON(!tx_sa->umem->chunk_list.next); 64219820Sjeff 65219820Sjeff chunk = list_entry(tx_sa->umem->chunk_list.next, struct ib_umem_chunk, list); 66219820Sjeff BUG_ON(!chunk->nmap); 67219820Sjeff 68219820Sjeff off = tx_sa->umem->offset; 69219820Sjeff len = tx_sa->umem->length; 70219820Sjeff 71219820Sjeff tx_sa->bytes_sent = tx_sa->bytes_acked = 0; 72219820Sjeff 73219820Sjeff mb = sdp_alloc_mb_srcavail(sk, len, tx_sa->fmr->fmr->lkey, off, 0); 74219820Sjeff if (!mb) { 75219820Sjeff return -ENOMEM; 76219820Sjeff } 77219820Sjeff sdp_dbg_data(sk, "sending SrcAvail\n"); 78219820Sjeff 79219820Sjeff TX_SRCAVAIL_STATE(mb) = tx_sa; /* tx_sa is hanged on the mb 80219820Sjeff * but continue to live after mb is freed */ 81219820Sjeff ssk->tx_sa = tx_sa; 82219820Sjeff 83219820Sjeff /* must have payload inlined in SrcAvail packet in combined mode */ 84219820Sjeff payload_len = MIN(tx_sa->umem->page_size - off, len); 85219820Sjeff payload_len = MIN(payload_len, ssk->xmit_size_goal - sizeof(struct sdp_srcah)); 86219820Sjeff payload_pg = sg_page(&chunk->page_list[0]); 87219820Sjeff get_page(payload_pg); 88219820Sjeff 89219820Sjeff sdp_dbg_data(sk, "payload: off: 0x%x, pg: %p, len: 0x%x\n", 90219820Sjeff off, payload_pg, payload_len); 91219820Sjeff 92219820Sjeff mb_fill_page_desc(mb, mb_shinfo(mb)->nr_frags, 93219820Sjeff payload_pg, off, payload_len); 94219820Sjeff 95219820Sjeff mb->len += payload_len; 96219820Sjeff mb->data_len = payload_len; 97219820Sjeff mb->truesize += payload_len; 98219820Sjeff// sk->sk_wmem_queued += payload_len; 99219820Sjeff// sk->sk_forward_alloc -= payload_len; 100219820Sjeff 101219820Sjeff mb_entail(sk, ssk, mb); 102219820Sjeff 103219820Sjeff ssk->write_seq += payload_len; 104219820Sjeff SDP_SKB_CB(mb)->end_seq += payload_len; 105219820Sjeff 106219820Sjeff tx_sa->bytes_sent = tx_sa->umem->length; 107219820Sjeff tx_sa->bytes_acked = payload_len; 108219820Sjeff 109219820Sjeff /* TODO: pushing the mb into the tx_queue should be enough */ 110219820Sjeff 111219820Sjeff return 0; 112219820Sjeff} 113219820Sjeff 114219820Sjeffstatic int sdp_post_srcavail_cancel(struct socket *sk) 115219820Sjeff{ 116219820Sjeff struct sdp_sock *ssk = sdp_sk(sk); 117219820Sjeff struct mbuf *mb; 118219820Sjeff 119219820Sjeff sdp_dbg_data(ssk->socket, "Posting srcavail cancel\n"); 120219820Sjeff 121219820Sjeff mb = sdp_alloc_mb_srcavail_cancel(sk, 0); 122219820Sjeff mb_entail(sk, ssk, mb); 123219820Sjeff 124219820Sjeff sdp_post_sends(ssk, 0); 125219820Sjeff 126219820Sjeff schedule_delayed_work(&ssk->srcavail_cancel_work, 127219820Sjeff SDP_SRCAVAIL_CANCEL_TIMEOUT); 128219820Sjeff 129219820Sjeff return 0; 130219820Sjeff} 131219820Sjeff 132219820Sjeffvoid srcavail_cancel_timeout(struct work_struct *work) 133219820Sjeff{ 134219820Sjeff struct sdp_sock *ssk = 135219820Sjeff container_of(work, struct sdp_sock, srcavail_cancel_work.work); 136219820Sjeff struct socket *sk = ssk->socket; 137219820Sjeff 138219820Sjeff lock_sock(sk); 139219820Sjeff 140219820Sjeff sdp_dbg_data(sk, "both SrcAvail and SrcAvailCancel timedout." 141219820Sjeff " closing connection\n"); 142219820Sjeff sdp_set_error(sk, -ECONNRESET); 143219820Sjeff wake_up(&ssk->wq); 144219820Sjeff 145219820Sjeff release_sock(sk); 146219820Sjeff} 147219820Sjeff 148219820Sjeffstatic int sdp_wait_rdmardcompl(struct sdp_sock *ssk, long *timeo_p, 149219820Sjeff int ignore_signals) 150219820Sjeff{ 151219820Sjeff struct socket *sk = ssk->socket; 152219820Sjeff int err = 0; 153219820Sjeff long vm_wait = 0; 154219820Sjeff long current_timeo = *timeo_p; 155219820Sjeff struct tx_srcavail_state *tx_sa = ssk->tx_sa; 156219820Sjeff DEFINE_WAIT(wait); 157219820Sjeff 158219820Sjeff sdp_dbg_data(sk, "sleep till RdmaRdCompl. timeo = %ld.\n", *timeo_p); 159219820Sjeff sdp_prf1(sk, NULL, "Going to sleep"); 160219820Sjeff while (ssk->qp_active) { 161219820Sjeff prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); 162219820Sjeff 163219820Sjeff if (unlikely(!*timeo_p)) { 164219820Sjeff err = -ETIME; 165219820Sjeff tx_sa->abort_flags |= TX_SA_TIMEDOUT; 166219820Sjeff sdp_prf1(sk, NULL, "timeout"); 167219820Sjeff SDPSTATS_COUNTER_INC(zcopy_tx_timeout); 168219820Sjeff break; 169219820Sjeff } 170219820Sjeff 171219820Sjeff else if (tx_sa->bytes_acked > tx_sa->bytes_sent) { 172219820Sjeff err = -EINVAL; 173219820Sjeff sdp_dbg_data(sk, "acked bytes > sent bytes\n"); 174219820Sjeff tx_sa->abort_flags |= TX_SA_ERROR; 175219820Sjeff break; 176219820Sjeff } 177219820Sjeff 178219820Sjeff if (tx_sa->abort_flags & TX_SA_SENDSM) { 179219820Sjeff sdp_prf1(sk, NULL, "Aborting SrcAvail sending"); 180219820Sjeff SDPSTATS_COUNTER_INC(zcopy_tx_aborted); 181219820Sjeff err = -EAGAIN; 182219820Sjeff break ; 183219820Sjeff } 184219820Sjeff 185219820Sjeff if (!ignore_signals) { 186219820Sjeff if (signal_pending(current)) { 187219820Sjeff err = -EINTR; 188219820Sjeff sdp_prf1(sk, NULL, "signalled"); 189219820Sjeff tx_sa->abort_flags |= TX_SA_INTRRUPTED; 190219820Sjeff break; 191219820Sjeff } 192219820Sjeff 193219820Sjeff if (ssk->rx_sa && (tx_sa->bytes_acked < tx_sa->bytes_sent)) { 194219820Sjeff sdp_dbg_data(sk, "Crossing SrcAvail - aborting this\n"); 195219820Sjeff tx_sa->abort_flags |= TX_SA_CROSS_SEND; 196219820Sjeff SDPSTATS_COUNTER_INC(zcopy_cross_send); 197219820Sjeff err = -ETIME; 198219820Sjeff break ; 199219820Sjeff } 200219820Sjeff } 201219820Sjeff 202219820Sjeff posts_handler_put(ssk); 203219820Sjeff 204219820Sjeff sk_wait_event(sk, ¤t_timeo, 205219820Sjeff tx_sa->abort_flags && 206219820Sjeff ssk->rx_sa && 207219820Sjeff (tx_sa->bytes_acked < tx_sa->bytes_sent) && 208219820Sjeff vm_wait); 209219820Sjeff sdp_dbg_data(ssk->socket, "woke up sleepers\n"); 210219820Sjeff 211219820Sjeff posts_handler_get(ssk); 212219820Sjeff 213219820Sjeff if (tx_sa->bytes_acked == tx_sa->bytes_sent) 214219820Sjeff break; 215219820Sjeff 216219820Sjeff if (vm_wait) { 217219820Sjeff vm_wait -= current_timeo; 218219820Sjeff current_timeo = *timeo_p; 219219820Sjeff if (current_timeo != MAX_SCHEDULE_TIMEOUT && 220219820Sjeff (current_timeo -= vm_wait) < 0) 221219820Sjeff current_timeo = 0; 222219820Sjeff vm_wait = 0; 223219820Sjeff } 224219820Sjeff *timeo_p = current_timeo; 225219820Sjeff } 226219820Sjeff 227219820Sjeff finish_wait(sk->sk_sleep, &wait); 228219820Sjeff 229219820Sjeff sdp_dbg_data(sk, "Finished waiting - RdmaRdCompl: %d/%d bytes, flags: 0x%x\n", 230219820Sjeff tx_sa->bytes_acked, tx_sa->bytes_sent, tx_sa->abort_flags); 231219820Sjeff 232219820Sjeff if (!ssk->qp_active) { 233219820Sjeff sdp_dbg(sk, "QP destroyed while waiting\n"); 234219820Sjeff return -EINVAL; 235219820Sjeff } 236219820Sjeff return err; 237219820Sjeff} 238219820Sjeff 239219820Sjeffstatic void sdp_wait_rdma_wr_finished(struct sdp_sock *ssk) 240219820Sjeff{ 241219820Sjeff struct socket *sk = ssk->socket; 242219820Sjeff long timeo = HZ * 5; /* Timeout for for RDMA read */ 243219820Sjeff DEFINE_WAIT(wait); 244219820Sjeff 245219820Sjeff sdp_dbg_data(sk, "Sleep till RDMA wr finished.\n"); 246219820Sjeff while (1) { 247219820Sjeff prepare_to_wait(sk->sk_sleep, &wait, TASK_UNINTERRUPTIBLE); 248219820Sjeff 249219820Sjeff if (!ssk->tx_ring.rdma_inflight->busy) { 250219820Sjeff sdp_dbg_data(sk, "got rdma cqe\n"); 251219820Sjeff break; 252219820Sjeff } 253219820Sjeff 254219820Sjeff if (!ssk->qp_active) { 255219820Sjeff sdp_dbg_data(sk, "QP destroyed\n"); 256219820Sjeff break; 257219820Sjeff } 258219820Sjeff 259219820Sjeff if (!timeo) { 260219820Sjeff sdp_warn(sk, "Panic: Timed out waiting for RDMA read\n"); 261219820Sjeff WARN_ON(1); 262219820Sjeff break; 263219820Sjeff } 264219820Sjeff 265219820Sjeff posts_handler_put(ssk); 266219820Sjeff 267219820Sjeff sdp_prf1(sk, NULL, "Going to sleep"); 268219820Sjeff sk_wait_event(sk, &timeo, 269219820Sjeff !ssk->tx_ring.rdma_inflight->busy); 270219820Sjeff sdp_prf1(sk, NULL, "Woke up"); 271219820Sjeff sdp_dbg_data(ssk->socket, "woke up sleepers\n"); 272219820Sjeff 273219820Sjeff posts_handler_get(ssk); 274219820Sjeff } 275219820Sjeff 276219820Sjeff finish_wait(sk->sk_sleep, &wait); 277219820Sjeff 278219820Sjeff sdp_dbg_data(sk, "Finished waiting\n"); 279219820Sjeff} 280219820Sjeff 281219820Sjeffint sdp_post_rdma_rd_compl(struct sdp_sock *ssk, 282219820Sjeff struct rx_srcavail_state *rx_sa) 283219820Sjeff{ 284219820Sjeff struct mbuf *mb; 285219820Sjeff int copied = rx_sa->used - rx_sa->reported; 286219820Sjeff 287219820Sjeff if (rx_sa->used <= rx_sa->reported) 288219820Sjeff return 0; 289219820Sjeff 290219820Sjeff mb = sdp_alloc_mb_rdmardcompl(ssk->socket, copied, 0); 291219820Sjeff 292219820Sjeff rx_sa->reported += copied; 293219820Sjeff 294219820Sjeff /* TODO: What if no tx_credits available? */ 295219820Sjeff sdp_post_send(ssk, mb); 296219820Sjeff 297219820Sjeff return 0; 298219820Sjeff} 299219820Sjeff 300219820Sjeffint sdp_post_sendsm(struct socket *sk) 301219820Sjeff{ 302219820Sjeff struct mbuf *mb = sdp_alloc_mb_sendsm(sk, 0); 303219820Sjeff 304219820Sjeff sdp_post_send(sdp_sk(sk), mb); 305219820Sjeff 306219820Sjeff return 0; 307219820Sjeff} 308219820Sjeff 309219820Sjeffstatic int sdp_update_iov_used(struct socket *sk, struct iovec *iov, int len) 310219820Sjeff{ 311219820Sjeff sdp_dbg_data(sk, "updating consumed 0x%x bytes from iov\n", len); 312219820Sjeff while (len > 0) { 313219820Sjeff if (iov->iov_len) { 314219820Sjeff int copy = min_t(unsigned int, iov->iov_len, len); 315219820Sjeff len -= copy; 316219820Sjeff iov->iov_len -= copy; 317219820Sjeff iov->iov_base += copy; 318219820Sjeff } 319219820Sjeff iov++; 320219820Sjeff } 321219820Sjeff 322219820Sjeff return 0; 323219820Sjeff} 324219820Sjeff 325219820Sjeffstatic inline int sge_bytes(struct ib_sge *sge, int sge_cnt) 326219820Sjeff{ 327219820Sjeff int bytes = 0; 328219820Sjeff 329219820Sjeff while (sge_cnt > 0) { 330219820Sjeff bytes += sge->length; 331219820Sjeff sge++; 332219820Sjeff sge_cnt--; 333219820Sjeff } 334219820Sjeff 335219820Sjeff return bytes; 336219820Sjeff} 337219820Sjeffvoid sdp_handle_sendsm(struct sdp_sock *ssk, u32 mseq_ack) 338219820Sjeff{ 339219820Sjeff struct socket *sk = ssk->socket; 340219820Sjeff unsigned long flags; 341219820Sjeff 342219820Sjeff spin_lock_irqsave(&ssk->tx_sa_lock, flags); 343219820Sjeff 344219820Sjeff if (!ssk->tx_sa) { 345219820Sjeff sdp_prf1(sk, NULL, "SendSM for cancelled/finished SrcAvail"); 346219820Sjeff goto out; 347219820Sjeff } 348219820Sjeff 349219820Sjeff if (ssk->tx_sa->mseq > mseq_ack) { 350219820Sjeff sdp_dbg_data(sk, "SendSM arrived for old SrcAvail. " 351219820Sjeff "SendSM mseq_ack: 0x%x, SrcAvail mseq: 0x%x\n", 352219820Sjeff mseq_ack, ssk->tx_sa->mseq); 353219820Sjeff goto out; 354219820Sjeff } 355219820Sjeff 356219820Sjeff sdp_dbg_data(sk, "Got SendSM - aborting SrcAvail\n"); 357219820Sjeff 358219820Sjeff ssk->tx_sa->abort_flags |= TX_SA_SENDSM; 359219820Sjeff cancel_delayed_work(&ssk->srcavail_cancel_work); 360219820Sjeff 361219820Sjeff wake_up(sk->sk_sleep); 362219820Sjeff sdp_dbg_data(sk, "woke up sleepers\n"); 363219820Sjeff 364219820Sjeffout: 365219820Sjeff spin_unlock_irqrestore(&ssk->tx_sa_lock, flags); 366219820Sjeff} 367219820Sjeff 368219820Sjeffvoid sdp_handle_rdma_read_compl(struct sdp_sock *ssk, u32 mseq_ack, 369219820Sjeff u32 bytes_completed) 370219820Sjeff{ 371219820Sjeff struct socket *sk = ssk->socket; 372219820Sjeff unsigned long flags; 373219820Sjeff 374219820Sjeff sdp_prf1(sk, NULL, "RdmaRdCompl ssk=%p tx_sa=%p", ssk, ssk->tx_sa); 375219820Sjeff sdp_dbg_data(sk, "RdmaRdCompl ssk=%p tx_sa=%p\n", ssk, ssk->tx_sa); 376219820Sjeff 377219820Sjeff spin_lock_irqsave(&ssk->tx_sa_lock, flags); 378219820Sjeff 379219820Sjeff BUG_ON(!ssk); 380219820Sjeff 381219820Sjeff if (!ssk->tx_sa) { 382219820Sjeff sdp_dbg_data(sk, "Got RdmaRdCompl for aborted SrcAvail\n"); 383219820Sjeff goto out; 384219820Sjeff } 385219820Sjeff 386219820Sjeff if (ssk->tx_sa->mseq > mseq_ack) { 387219820Sjeff sdp_dbg_data(sk, "RdmaRdCompl arrived for old SrcAvail. " 388219820Sjeff "SendSM mseq_ack: 0x%x, SrcAvail mseq: 0x%x\n", 389219820Sjeff mseq_ack, ssk->tx_sa->mseq); 390219820Sjeff goto out; 391219820Sjeff } 392219820Sjeff 393219820Sjeff ssk->tx_sa->bytes_acked += bytes_completed; 394219820Sjeff 395219820Sjeff wake_up(sk->sk_sleep); 396219820Sjeff sdp_dbg_data(sk, "woke up sleepers\n"); 397219820Sjeff 398219820Sjeffout: 399219820Sjeff spin_unlock_irqrestore(&ssk->tx_sa_lock, flags); 400219820Sjeff return; 401219820Sjeff} 402219820Sjeff 403219820Sjeffstatic unsigned long sdp_get_max_memlockable_bytes(unsigned long offset) 404219820Sjeff{ 405219820Sjeff unsigned long avail; 406219820Sjeff unsigned long lock_limit; 407219820Sjeff 408219820Sjeff if (capable(CAP_IPC_LOCK)) 409219820Sjeff return ULONG_MAX; 410219820Sjeff 411219820Sjeff lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; 412219820Sjeff avail = lock_limit - (current->mm->locked_vm << PAGE_SHIFT); 413219820Sjeff 414219820Sjeff return avail - offset; 415219820Sjeff} 416219820Sjeff 417219820Sjeffstatic int sdp_alloc_fmr(struct socket *sk, void *uaddr, size_t len, 418219820Sjeff struct ib_pool_fmr **_fmr, struct ib_umem **_umem) 419219820Sjeff{ 420219820Sjeff struct ib_pool_fmr *fmr; 421219820Sjeff struct ib_umem *umem; 422219820Sjeff struct ib_device *dev; 423219820Sjeff u64 *pages; 424219820Sjeff struct ib_umem_chunk *chunk; 425219820Sjeff int n, j, k; 426219820Sjeff int rc = 0; 427219820Sjeff unsigned long max_lockable_bytes; 428219820Sjeff 429219820Sjeff if (unlikely(len > SDP_MAX_RDMA_READ_LEN)) { 430219820Sjeff sdp_dbg_data(sk, "len:0x%lx > FMR_SIZE: 0x%lx\n", 431219820Sjeff len, SDP_MAX_RDMA_READ_LEN); 432219820Sjeff len = SDP_MAX_RDMA_READ_LEN; 433219820Sjeff } 434219820Sjeff 435219820Sjeff max_lockable_bytes = sdp_get_max_memlockable_bytes((unsigned long)uaddr & ~PAGE_MASK); 436219820Sjeff if (unlikely(len > max_lockable_bytes)) { 437219820Sjeff sdp_dbg_data(sk, "len:0x%lx > RLIMIT_MEMLOCK available: 0x%lx\n", 438219820Sjeff len, max_lockable_bytes); 439219820Sjeff len = max_lockable_bytes; 440219820Sjeff } 441219820Sjeff 442219820Sjeff sdp_dbg_data(sk, "user buf: %p, len:0x%lx max_lockable_bytes: 0x%lx\n", 443219820Sjeff uaddr, len, max_lockable_bytes); 444219820Sjeff 445219820Sjeff umem = ib_umem_get(&sdp_sk(sk)->context, (unsigned long)uaddr, len, 446219820Sjeff IB_ACCESS_REMOTE_WRITE, 0); 447219820Sjeff 448219820Sjeff if (IS_ERR(umem)) { 449219820Sjeff rc = PTR_ERR(umem); 450219820Sjeff sdp_warn(sk, "Error doing umem_get 0x%lx bytes: %d\n", len, rc); 451219820Sjeff sdp_warn(sk, "RLIMIT_MEMLOCK: 0x%lx[cur] 0x%lx[max] CAP_IPC_LOCK: %d\n", 452219820Sjeff current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur, 453219820Sjeff current->signal->rlim[RLIMIT_MEMLOCK].rlim_max, 454219820Sjeff capable(CAP_IPC_LOCK)); 455219820Sjeff goto err_umem_get; 456219820Sjeff } 457219820Sjeff 458219820Sjeff sdp_dbg_data(sk, "umem->offset = 0x%x, length = 0x%lx\n", 459219820Sjeff umem->offset, umem->length); 460219820Sjeff 461219820Sjeff pages = (u64 *) __get_free_page(GFP_KERNEL); 462219820Sjeff if (!pages) 463219820Sjeff goto err_pages_alloc; 464219820Sjeff 465219820Sjeff n = 0; 466219820Sjeff 467219820Sjeff dev = sdp_sk(sk)->ib_device; 468219820Sjeff list_for_each_entry(chunk, &umem->chunk_list, list) { 469219820Sjeff for (j = 0; j < chunk->nmap; ++j) { 470219820Sjeff len = ib_sg_dma_len(dev, 471219820Sjeff &chunk->page_list[j]) >> PAGE_SHIFT; 472219820Sjeff 473219820Sjeff for (k = 0; k < len; ++k) { 474219820Sjeff pages[n++] = ib_sg_dma_address(dev, 475219820Sjeff &chunk->page_list[j]) + 476219820Sjeff umem->page_size * k; 477219820Sjeff 478219820Sjeff } 479219820Sjeff } 480219820Sjeff } 481219820Sjeff 482219820Sjeff fmr = ib_fmr_pool_map_phys(sdp_sk(sk)->sdp_dev->fmr_pool, pages, n, 0); 483219820Sjeff if (IS_ERR(fmr)) { 484219820Sjeff sdp_warn(sk, "Error allocating fmr: %ld\n", PTR_ERR(fmr)); 485219820Sjeff goto err_fmr_alloc; 486219820Sjeff } 487219820Sjeff 488219820Sjeff free_page((unsigned long) pages); 489219820Sjeff 490219820Sjeff *_umem = umem; 491219820Sjeff *_fmr = fmr; 492219820Sjeff 493219820Sjeff return 0; 494219820Sjeff 495219820Sjefferr_fmr_alloc: 496219820Sjeff free_page((unsigned long) pages); 497219820Sjeff 498219820Sjefferr_pages_alloc: 499219820Sjeff ib_umem_release(umem); 500219820Sjeff 501219820Sjefferr_umem_get: 502219820Sjeff 503219820Sjeff return rc; 504219820Sjeff} 505219820Sjeff 506219820Sjeffvoid sdp_free_fmr(struct socket *sk, struct ib_pool_fmr **_fmr, struct ib_umem **_umem) 507219820Sjeff{ 508219820Sjeff if (!sdp_sk(sk)->qp_active) 509219820Sjeff return; 510219820Sjeff 511219820Sjeff ib_fmr_pool_unmap(*_fmr); 512219820Sjeff *_fmr = NULL; 513219820Sjeff 514219820Sjeff ib_umem_release(*_umem); 515219820Sjeff *_umem = NULL; 516219820Sjeff} 517219820Sjeff 518219820Sjeffstatic int sdp_post_rdma_read(struct socket *sk, struct rx_srcavail_state *rx_sa) 519219820Sjeff{ 520219820Sjeff struct sdp_sock *ssk = sdp_sk(sk); 521219820Sjeff struct ib_send_wr *bad_wr; 522219820Sjeff struct ib_send_wr wr = { NULL }; 523219820Sjeff struct ib_sge sge; 524219820Sjeff 525219820Sjeff wr.opcode = IB_WR_RDMA_READ; 526219820Sjeff wr.next = NULL; 527219820Sjeff wr.wr_id = SDP_OP_RDMA; 528219820Sjeff wr.wr.rdma.rkey = rx_sa->rkey; 529219820Sjeff wr.send_flags = 0; 530219820Sjeff 531219820Sjeff ssk->tx_ring.rdma_inflight = rx_sa; 532219820Sjeff 533219820Sjeff sge.addr = rx_sa->umem->offset; 534219820Sjeff sge.length = rx_sa->umem->length; 535219820Sjeff sge.lkey = rx_sa->fmr->fmr->lkey; 536219820Sjeff 537219820Sjeff wr.wr.rdma.remote_addr = rx_sa->vaddr + rx_sa->used; 538219820Sjeff wr.num_sge = 1; 539219820Sjeff wr.sg_list = &sge; 540219820Sjeff rx_sa->busy++; 541219820Sjeff 542219820Sjeff wr.send_flags = IB_SEND_SIGNALED; 543219820Sjeff 544219820Sjeff return ib_post_send(ssk->qp, &wr, &bad_wr); 545219820Sjeff} 546219820Sjeff 547219820Sjeffint sdp_rdma_to_iovec(struct socket *sk, struct iovec *iov, struct mbuf *mb, 548219820Sjeff unsigned long *used) 549219820Sjeff{ 550219820Sjeff struct sdp_sock *ssk = sdp_sk(sk); 551219820Sjeff struct rx_srcavail_state *rx_sa = RX_SRCAVAIL_STATE(mb); 552219820Sjeff int got_srcavail_cancel; 553219820Sjeff int rc = 0; 554219820Sjeff int len = *used; 555219820Sjeff int copied; 556219820Sjeff 557219820Sjeff sdp_dbg_data(ssk->socket, "preparing RDMA read." 558219820Sjeff " len: 0x%x. buffer len: 0x%lx\n", len, iov->iov_len); 559219820Sjeff 560219820Sjeff sock_hold(sk, SOCK_REF_RDMA_RD); 561219820Sjeff 562219820Sjeff if (len > rx_sa->len) { 563219820Sjeff sdp_warn(sk, "len:0x%x > rx_sa->len: 0x%x\n", len, rx_sa->len); 564219820Sjeff WARN_ON(1); 565219820Sjeff len = rx_sa->len; 566219820Sjeff } 567219820Sjeff 568219820Sjeff rc = sdp_alloc_fmr(sk, iov->iov_base, len, &rx_sa->fmr, &rx_sa->umem); 569219820Sjeff if (rc) { 570219820Sjeff sdp_warn(sk, "Error allocating fmr: %d\n", rc); 571219820Sjeff goto err_alloc_fmr; 572219820Sjeff } 573219820Sjeff 574219820Sjeff rc = sdp_post_rdma_read(sk, rx_sa); 575219820Sjeff if (unlikely(rc)) { 576219820Sjeff sdp_warn(sk, "ib_post_send failed with status %d.\n", rc); 577219820Sjeff sdp_set_error(ssk->socket, -ECONNRESET); 578219820Sjeff wake_up(&ssk->wq); 579219820Sjeff goto err_post_send; 580219820Sjeff } 581219820Sjeff 582219820Sjeff sdp_prf(sk, mb, "Finished posting(rc=%d), now to wait", rc); 583219820Sjeff 584219820Sjeff got_srcavail_cancel = ssk->srcavail_cancel_mseq > rx_sa->mseq; 585219820Sjeff 586219820Sjeff sdp_arm_tx_cq(sk); 587219820Sjeff 588219820Sjeff sdp_wait_rdma_wr_finished(ssk); 589219820Sjeff 590219820Sjeff sdp_prf(sk, mb, "Finished waiting(rc=%d)", rc); 591219820Sjeff if (!ssk->qp_active) { 592219820Sjeff sdp_dbg_data(sk, "QP destroyed during RDMA read\n"); 593219820Sjeff rc = -EPIPE; 594219820Sjeff goto err_post_send; 595219820Sjeff } 596219820Sjeff 597219820Sjeff copied = rx_sa->umem->length; 598219820Sjeff 599219820Sjeff sdp_update_iov_used(sk, iov, copied); 600219820Sjeff rx_sa->used += copied; 601219820Sjeff atomic_add(copied, &ssk->rcv_nxt); 602219820Sjeff *used = copied; 603219820Sjeff 604219820Sjeff ssk->tx_ring.rdma_inflight = NULL; 605219820Sjeff 606219820Sjefferr_post_send: 607219820Sjeff sdp_free_fmr(sk, &rx_sa->fmr, &rx_sa->umem); 608219820Sjeff 609219820Sjefferr_alloc_fmr: 610219820Sjeff if (rc && ssk->qp_active) { 611219820Sjeff sdp_warn(sk, "Couldn't do RDMA - post sendsm\n"); 612219820Sjeff rx_sa->flags |= RX_SA_ABORTED; 613219820Sjeff } 614219820Sjeff 615219820Sjeff sock_put(sk, SOCK_REF_RDMA_RD); 616219820Sjeff 617219820Sjeff return rc; 618219820Sjeff} 619219820Sjeff 620219820Sjeffstatic inline int wait_for_sndbuf(struct socket *sk, long *timeo_p) 621219820Sjeff{ 622219820Sjeff struct sdp_sock *ssk = sdp_sk(sk); 623219820Sjeff int ret = 0; 624219820Sjeff int credits_needed = 1; 625219820Sjeff 626219820Sjeff sdp_dbg_data(sk, "Wait for mem\n"); 627219820Sjeff 628219820Sjeff set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 629219820Sjeff 630219820Sjeff SDPSTATS_COUNTER_INC(send_wait_for_mem); 631219820Sjeff 632219820Sjeff sdp_do_posts(ssk); 633219820Sjeff 634219820Sjeff sdp_xmit_poll(ssk, 1); 635219820Sjeff 636219820Sjeff ret = sdp_tx_wait_memory(ssk, timeo_p, &credits_needed); 637219820Sjeff 638219820Sjeff return ret; 639219820Sjeff} 640219820Sjeff 641219820Sjeffstatic int do_sdp_sendmsg_zcopy(struct socket *sk, struct tx_srcavail_state *tx_sa, 642219820Sjeff struct iovec *iov, long *timeo) 643219820Sjeff{ 644219820Sjeff struct sdp_sock *ssk = sdp_sk(sk); 645219820Sjeff int rc = 0; 646219820Sjeff unsigned long lock_flags; 647219820Sjeff 648219820Sjeff rc = sdp_alloc_fmr(sk, iov->iov_base, iov->iov_len, 649219820Sjeff &tx_sa->fmr, &tx_sa->umem); 650219820Sjeff if (rc) { 651219820Sjeff sdp_warn(sk, "Error allocating fmr: %d\n", rc); 652219820Sjeff goto err_alloc_fmr; 653219820Sjeff } 654219820Sjeff 655219820Sjeff if (tx_slots_free(ssk) == 0) { 656219820Sjeff rc = wait_for_sndbuf(sk, timeo); 657219820Sjeff if (rc) { 658219820Sjeff sdp_warn(sk, "Couldn't get send buffer\n"); 659219820Sjeff goto err_no_tx_slots; 660219820Sjeff } 661219820Sjeff } 662219820Sjeff 663219820Sjeff rc = sdp_post_srcavail(sk, tx_sa); 664219820Sjeff if (rc) { 665219820Sjeff sdp_dbg(sk, "Error posting SrcAvail\n"); 666219820Sjeff goto err_abort_send; 667219820Sjeff } 668219820Sjeff 669219820Sjeff rc = sdp_wait_rdmardcompl(ssk, timeo, 0); 670219820Sjeff if (unlikely(rc)) { 671219820Sjeff enum tx_sa_flag f = tx_sa->abort_flags; 672219820Sjeff 673219820Sjeff if (f & TX_SA_SENDSM) { 674219820Sjeff sdp_dbg_data(sk, "Got SendSM. use SEND verb.\n"); 675219820Sjeff } else if (f & TX_SA_ERROR) { 676219820Sjeff sdp_dbg_data(sk, "SrcAvail error completion\n"); 677219820Sjeff sdp_reset(sk); 678219820Sjeff SDPSTATS_COUNTER_INC(zcopy_tx_error); 679219820Sjeff } else if (ssk->qp_active) { 680219820Sjeff sdp_post_srcavail_cancel(sk); 681219820Sjeff 682219820Sjeff /* Wait for RdmaRdCompl/SendSM to 683219820Sjeff * finish the transaction */ 684219820Sjeff *timeo = 2 * HZ; 685219820Sjeff sdp_dbg_data(sk, "Waiting for SendSM\n"); 686219820Sjeff sdp_wait_rdmardcompl(ssk, timeo, 1); 687219820Sjeff sdp_dbg_data(sk, "finished waiting\n"); 688219820Sjeff 689219820Sjeff cancel_delayed_work(&ssk->srcavail_cancel_work); 690219820Sjeff } else { 691219820Sjeff sdp_dbg_data(sk, "QP was destroyed while waiting\n"); 692219820Sjeff } 693219820Sjeff } else { 694219820Sjeff sdp_dbg_data(sk, "got RdmaRdCompl\n"); 695219820Sjeff } 696219820Sjeff 697219820Sjeff spin_lock_irqsave(&ssk->tx_sa_lock, lock_flags); 698219820Sjeff ssk->tx_sa = NULL; 699219820Sjeff spin_unlock_irqrestore(&ssk->tx_sa_lock, lock_flags); 700219820Sjeff 701219820Sjefferr_abort_send: 702219820Sjeff sdp_update_iov_used(sk, iov, tx_sa->bytes_acked); 703219820Sjeff 704219820Sjefferr_no_tx_slots: 705219820Sjeff sdp_free_fmr(sk, &tx_sa->fmr, &tx_sa->umem); 706219820Sjeff 707219820Sjefferr_alloc_fmr: 708219820Sjeff return rc; 709219820Sjeff} 710219820Sjeff 711219820Sjeffint sdp_sendmsg_zcopy(struct kiocb *iocb, struct socket *sk, struct iovec *iov) 712219820Sjeff{ 713219820Sjeff struct sdp_sock *ssk = sdp_sk(sk); 714219820Sjeff int rc = 0; 715219820Sjeff long timeo; 716219820Sjeff struct tx_srcavail_state *tx_sa; 717219820Sjeff int offset; 718219820Sjeff size_t bytes_to_copy = 0; 719219820Sjeff int copied = 0; 720219820Sjeff 721219820Sjeff sdp_dbg_data(sk, "Sending iov: %p, iov_len: 0x%lx\n", 722219820Sjeff iov->iov_base, iov->iov_len); 723219820Sjeff sdp_prf1(sk, NULL, "sdp_sendmsg_zcopy start"); 724219820Sjeff if (ssk->rx_sa) { 725219820Sjeff sdp_dbg_data(sk, "Deadlock prevent: crossing SrcAvail\n"); 726219820Sjeff return 0; 727219820Sjeff } 728219820Sjeff 729219820Sjeff sock_hold(ssk->socket, SOCK_REF_ZCOPY); 730219820Sjeff 731219820Sjeff SDPSTATS_COUNTER_INC(sendmsg_zcopy_segment); 732219820Sjeff 733219820Sjeff timeo = SDP_SRCAVAIL_ADV_TIMEOUT ; 734219820Sjeff 735219820Sjeff /* Ok commence sending. */ 736219820Sjeff offset = (unsigned long)iov->iov_base & (PAGE_SIZE - 1); 737219820Sjeff 738219820Sjeff tx_sa = kmalloc(sizeof(struct tx_srcavail_state), GFP_KERNEL); 739219820Sjeff if (!tx_sa) { 740219820Sjeff sdp_warn(sk, "Error allocating zcopy context\n"); 741219820Sjeff rc = -EAGAIN; /* Buffer too big - fallback to bcopy */ 742219820Sjeff goto err_alloc_tx_sa; 743219820Sjeff } 744219820Sjeff 745219820Sjeff bytes_to_copy = iov->iov_len; 746219820Sjeff do { 747219820Sjeff tx_sa_reset(tx_sa); 748219820Sjeff 749219820Sjeff rc = do_sdp_sendmsg_zcopy(sk, tx_sa, iov, &timeo); 750219820Sjeff 751219820Sjeff if (iov->iov_len && iov->iov_len < sdp_zcopy_thresh) { 752219820Sjeff sdp_dbg_data(sk, "0x%lx bytes left, switching to bcopy\n", 753219820Sjeff iov->iov_len); 754219820Sjeff break; 755219820Sjeff } 756219820Sjeff } while (!rc && iov->iov_len > 0 && !tx_sa->abort_flags); 757219820Sjeff 758219820Sjeff kfree(tx_sa); 759219820Sjefferr_alloc_tx_sa: 760219820Sjeff copied = bytes_to_copy - iov->iov_len; 761219820Sjeff 762219820Sjeff sdp_prf1(sk, NULL, "sdp_sendmsg_zcopy end rc: %d copied: %d", rc, copied); 763219820Sjeff 764219820Sjeff sock_put(ssk->socket, SOCK_REF_ZCOPY); 765219820Sjeff 766219820Sjeff if (rc < 0 && rc != -EAGAIN && rc != -ETIME) 767219820Sjeff return rc; 768219820Sjeff 769219820Sjeff return copied; 770219820Sjeff} 771219820Sjeff 772219820Sjeffvoid sdp_abort_srcavail(struct socket *sk) 773219820Sjeff{ 774219820Sjeff struct sdp_sock *ssk = sdp_sk(sk); 775219820Sjeff struct tx_srcavail_state *tx_sa = ssk->tx_sa; 776219820Sjeff unsigned long flags; 777219820Sjeff 778219820Sjeff if (!tx_sa) 779219820Sjeff return; 780219820Sjeff 781219820Sjeff cancel_delayed_work(&ssk->srcavail_cancel_work); 782219820Sjeff flush_scheduled_work(); 783219820Sjeff 784219820Sjeff spin_lock_irqsave(&ssk->tx_sa_lock, flags); 785219820Sjeff 786219820Sjeff sdp_free_fmr(sk, &tx_sa->fmr, &tx_sa->umem); 787219820Sjeff 788219820Sjeff ssk->tx_sa = NULL; 789219820Sjeff 790219820Sjeff spin_unlock_irqrestore(&ssk->tx_sa_lock, flags); 791219820Sjeff} 792219820Sjeff 793219820Sjeffvoid sdp_abort_rdma_read(struct socket *sk) 794219820Sjeff{ 795219820Sjeff struct sdp_sock *ssk = sdp_sk(sk); 796219820Sjeff struct rx_srcavail_state *rx_sa = ssk->rx_sa; 797219820Sjeff 798219820Sjeff if (!rx_sa) 799219820Sjeff return; 800219820Sjeff 801219820Sjeff sdp_free_fmr(sk, &rx_sa->fmr, &rx_sa->umem); 802219820Sjeff 803219820Sjeff ssk->rx_sa = NULL; 804219820Sjeff} 805