/*
 * Copyright (c) 2005 Topspin Communications.  All rights reserved.
 * Copyright (c) 2005 Mellanox Technologies Ltd.  All rights reserved.
 * Copyright (c) 2007 Cisco, Inc.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 * Redistribution and use in source and binary forms, with or
 * without modification, are permitted provided that the following
 * conditions are met:
 *
 *  - Redistributions of source code must retain the above
 *    copyright notice, this list of conditions and the following
 *    disclaimer.
 *
 *  - Redistributions in binary form must reproduce the above
 *    copyright notice, this list of conditions and the following
 *    disclaimer in the documentation and/or other materials
 *    provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#if HAVE_CONFIG_H
#  include <config.h>
#endif /* HAVE_CONFIG_H */

#include <stdlib.h>
#include <netinet/in.h>
#include <pthread.h>
#include <string.h>

#include "mlx4.h"
#include "doorbell.h"
#include "wqe.h"

static const uint32_t mlx4_ib_opcode[] = {
	[IBV_WR_SEND]			= MLX4_OPCODE_SEND,
	[IBV_WR_SEND_WITH_IMM]		= MLX4_OPCODE_SEND_IMM,
	[IBV_WR_RDMA_WRITE]		= MLX4_OPCODE_RDMA_WRITE,
	[IBV_WR_RDMA_WRITE_WITH_IMM]	= MLX4_OPCODE_RDMA_WRITE_IMM,
	[IBV_WR_RDMA_READ]		= MLX4_OPCODE_RDMA_READ,
	[IBV_WR_ATOMIC_CMP_AND_SWP]	= MLX4_OPCODE_ATOMIC_CS,
	[IBV_WR_ATOMIC_FETCH_AND_ADD]	= MLX4_OPCODE_ATOMIC_FA,
};

static void *get_recv_wqe(struct mlx4_qp *qp, int n)
{
	return qp->buf.buf + qp->rq.offset + (n << qp->rq.wqe_shift);
}

static void *get_send_wqe(struct mlx4_qp *qp, int n)
{
	return qp->buf.buf + qp->sq.offset + (n << qp->sq.wqe_shift);
}

/*
 * Stamp a SQ WQE so that it is invalid if prefetched by marking the
 * first four bytes of every 64 byte chunk with 0xffffffff, except for
 * the very first chunk of the WQE.
 */
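/*
 * Note on units in the stamping loop below: fence_size holds the WQE
 * size in units of 16 bytes, so shifting it left by 2 gives the size
 * as a count of 32-bit words, and the index advances 16 words (64
 * bytes) per iteration, skipping the first chunk.
 */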
static void stamp_send_wqe(struct mlx4_qp *qp, int n)
{
	uint32_t *wqe = get_send_wqe(qp, n);
	int i;
	int ds = (((struct mlx4_wqe_ctrl_seg *)wqe)->fence_size & 0x3f) << 2;

	for (i = 16; i < ds; i += 16)
		wqe[i] = 0xffffffff;
}

void mlx4_init_qp_indices(struct mlx4_qp *qp)
{
	qp->sq.head = 0;
	qp->sq.tail = 0;
	qp->rq.head = 0;
	qp->rq.tail = 0;
}

void mlx4_qp_init_sq_ownership(struct mlx4_qp *qp)
{
	struct mlx4_wqe_ctrl_seg *ctrl;
	int i;

	for (i = 0; i < qp->sq.wqe_cnt; ++i) {
		ctrl = get_send_wqe(qp, i);
		ctrl->owner_opcode = htonl(1 << 31);
		ctrl->fence_size = 1 << (qp->sq.wqe_shift - 4);

		stamp_send_wqe(qp, i);
	}
}

static int wq_overflow(struct mlx4_wq *wq, int nreq, struct mlx4_cq *cq)
{
	unsigned cur;

	cur = wq->head - wq->tail;
	if (cur + nreq < wq->max_post)
		return 0;

	pthread_spin_lock(&cq->lock);
	cur = wq->head - wq->tail;
	pthread_spin_unlock(&cq->lock);

	return cur + nreq >= wq->max_post;
}

static inline void set_raddr_seg(struct mlx4_wqe_raddr_seg *rseg,
				 uint64_t remote_addr, uint32_t rkey)
{
	rseg->raddr    = htonll(remote_addr);
	rseg->rkey     = htonl(rkey);
	rseg->reserved = 0;
}

static void set_atomic_seg(struct mlx4_wqe_atomic_seg *aseg, struct ibv_send_wr *wr)
{
	if (wr->opcode == IBV_WR_ATOMIC_CMP_AND_SWP) {
		aseg->swap_add = htonll(wr->wr.atomic.swap);
		aseg->compare  = htonll(wr->wr.atomic.compare_add);
	} else {
		aseg->swap_add = htonll(wr->wr.atomic.compare_add);
		aseg->compare  = 0;
	}
}

static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg,
			     struct ibv_send_wr *wr)
{
	memcpy(dseg->av, &to_mah(wr->wr.ud.ah)->av, sizeof (struct mlx4_av));
	dseg->dqpn = htonl(wr->wr.ud.remote_qpn);
	dseg->qkey = htonl(wr->wr.ud.remote_qkey);
	dseg->vlan = htons(to_mah(wr->wr.ud.ah)->vlan);
	memcpy(dseg->mac, to_mah(wr->wr.ud.ah)->mac, 6);
}

static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg)
{
	dseg->byte_count = htonl(sg->length);
	dseg->lkey       = htonl(sg->lkey);
	dseg->addr       = htonll(sg->addr);
}
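/*
 * __set_data_seg() above is used on the receive path, where ordering
 * against the HCA does not matter; set_data_seg() below is the send
 * path variant, which defers writing byte_count until the rest of the
 * segment is visible (see the barrier comment inside).
 */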
static void set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg)
{
	dseg->lkey = htonl(sg->lkey);
	dseg->addr = htonll(sg->addr);

	/*
	 * Need a barrier here before writing the byte_count field to
	 * make sure that all the data is visible before the
	 * byte_count field is set.  Otherwise, if the segment begins
	 * a new cacheline, the HCA prefetcher could grab the 64-byte
	 * chunk and get a valid (!= 0xffffffff) byte count but
	 * stale data, and end up sending the wrong data.
	 */
	wmb();

	dseg->byte_count = htonl(sg->length);
}

/*
 * Avoid using memcpy() to copy to BlueFlame page, since memcpy()
 * implementations may use move-string-buffer assembler instructions,
 * which do not guarantee order of copying.
 */
static void mlx4_bf_copy(unsigned long *dst, unsigned long *src, unsigned bytecnt)
{
	while (bytecnt > 0) {
		*dst++ = *src++;
		*dst++ = *src++;
		bytecnt -= 2 * sizeof (long);
	}
}
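/*
 * Post a chain of send work requests.  For each WR: build the control
 * segment, then any remote-address/atomic/datagram segment required by
 * the QP type and opcode, then either inline data or scatter/gather
 * data segments; issue a write barrier; hand the WQE to the HCA via
 * owner_opcode; and finally ring the doorbell (or, for a single small
 * inline WQE, copy the descriptor to the BlueFlame page).
 */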
int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
		   struct ibv_send_wr **bad_wr)
{
	struct mlx4_context *ctx;
	struct mlx4_qp *qp = to_mqp(ibqp);
	void *wqe;
	struct mlx4_wqe_ctrl_seg *ctrl;
	int ind;
	int nreq;
	int inl = 0;
	int ret = 0;
	int size;
	int i;

	pthread_spin_lock(&qp->sq.lock);

	/* XXX check that state is OK to post send */

	ind = qp->sq.head;

	for (nreq = 0; wr; ++nreq, wr = wr->next) {
		if (wq_overflow(&qp->sq, nreq, to_mcq(qp->ibv_qp.send_cq))) {
			ret = -1;
			*bad_wr = wr;
			goto out;
		}

		if (wr->num_sge > qp->sq.max_gs) {
			ret = -1;
			*bad_wr = wr;
			goto out;
		}

		if (wr->opcode >= sizeof mlx4_ib_opcode / sizeof mlx4_ib_opcode[0]) {
			ret = -1;
			*bad_wr = wr;
			goto out;
		}

		ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
		qp->sq.wrid[ind & (qp->sq.wqe_cnt - 1)] = wr->wr_id;

		ctrl->xrcrb_flags =
			(wr->send_flags & IBV_SEND_SIGNALED ?
			 htonl(MLX4_WQE_CTRL_CQ_UPDATE) : 0) |
			(wr->send_flags & IBV_SEND_SOLICITED ?
			 htonl(MLX4_WQE_CTRL_SOLICIT) : 0) |
			qp->sq_signal_bits;

		if (wr->opcode == IBV_WR_SEND_WITH_IMM ||
		    wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM)
			ctrl->imm = wr->imm_data;
		else
			ctrl->imm = 0;

		wqe += sizeof *ctrl;
		size = sizeof *ctrl / 16;

		switch (ibqp->qp_type) {
		case IBV_QPT_XRC:
			ctrl->xrcrb_flags |= htonl(wr->xrc_remote_srq_num << 8);
			/* fall thru */
		case IBV_QPT_RC:
		case IBV_QPT_UC:
			switch (wr->opcode) {
			case IBV_WR_ATOMIC_CMP_AND_SWP:
			case IBV_WR_ATOMIC_FETCH_AND_ADD:
				set_raddr_seg(wqe, wr->wr.atomic.remote_addr,
					      wr->wr.atomic.rkey);
				wqe  += sizeof (struct mlx4_wqe_raddr_seg);

				set_atomic_seg(wqe, wr);
				wqe  += sizeof (struct mlx4_wqe_atomic_seg);
				size += (sizeof (struct mlx4_wqe_raddr_seg) +
					 sizeof (struct mlx4_wqe_atomic_seg)) / 16;

				break;

			case IBV_WR_RDMA_READ:
				inl = 1;
				/* fall through */
			case IBV_WR_RDMA_WRITE:
			case IBV_WR_RDMA_WRITE_WITH_IMM:
				set_raddr_seg(wqe, wr->wr.rdma.remote_addr,
					      wr->wr.rdma.rkey);
				wqe  += sizeof (struct mlx4_wqe_raddr_seg);
				size += sizeof (struct mlx4_wqe_raddr_seg) / 16;

				break;

			default:
				/* No extra segments required for sends */
				break;
			}
			break;

		case IBV_QPT_UD:
			set_datagram_seg(wqe, wr);
			wqe  += sizeof (struct mlx4_wqe_datagram_seg);
			size += sizeof (struct mlx4_wqe_datagram_seg) / 16;
			if (to_mah(wr->wr.ud.ah)->tagged) {
				ctrl->ins_vlan = 1 << 6;
				ctrl->vlan_tag = htons(to_mah(wr->wr.ud.ah)->vlan);
			}

			break;

		default:
			break;
		}
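		/*
		 * Inline data is packed as a series of inline segments,
		 * each headed by a byte_count word.  A segment may not
		 * cross a 64-byte boundary, so the copy loop below splits
		 * the caller's SGEs across chunks and writes each header
		 * only after its payload (with a barrier in between).
		 */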
		if (wr->send_flags & IBV_SEND_INLINE && wr->num_sge) {
			struct mlx4_wqe_inline_seg *seg;
			void *addr;
			int len, seg_len;
			int num_seg;
			int off, to_copy;

			inl = 0;

			seg = wqe;
			wqe += sizeof *seg;
			off = ((uintptr_t) wqe) & (MLX4_INLINE_ALIGN - 1);
			num_seg = 0;
			seg_len = 0;

			for (i = 0; i < wr->num_sge; ++i) {
				addr = (void *) (uintptr_t) wr->sg_list[i].addr;
				len  = wr->sg_list[i].length;
				inl += len;

				if (inl > qp->max_inline_data) {
					inl = 0;
					ret = -1;
					*bad_wr = wr;
					goto out;
				}

				while (len >= MLX4_INLINE_ALIGN - off) {
					to_copy = MLX4_INLINE_ALIGN - off;
					memcpy(wqe, addr, to_copy);
					len -= to_copy;
					wqe += to_copy;
					addr += to_copy;
					seg_len += to_copy;
					wmb(); /* see comment below */
					seg->byte_count = htonl(MLX4_INLINE_SEG | seg_len);
					seg_len = 0;
					seg = wqe;
					wqe += sizeof *seg;
					off = sizeof *seg;
					++num_seg;
				}

				memcpy(wqe, addr, len);
				wqe += len;
				seg_len += len;
				off += len;
			}

			if (seg_len) {
				++num_seg;
				/*
				 * Need a barrier here to make sure
				 * all the data is visible before the
				 * byte_count field is set.  Otherwise
				 * the HCA prefetcher could grab the
				 * 64-byte chunk with this inline
				 * segment and get a valid (!=
				 * 0xffffffff) byte count but stale
				 * data, and end up sending the wrong
				 * data.
				 */
				wmb();
				seg->byte_count = htonl(MLX4_INLINE_SEG | seg_len);
			}

			size += (inl + num_seg * sizeof *seg + 15) / 16;
		} else {
			struct mlx4_wqe_data_seg *seg = wqe;

			for (i = wr->num_sge - 1; i >= 0 ; --i)
				set_data_seg(seg + i, wr->sg_list + i);

			size += wr->num_sge * (sizeof *seg / 16);
		}

		ctrl->fence_size = (wr->send_flags & IBV_SEND_FENCE ?
				    MLX4_WQE_CTRL_FENCE : 0) | size;

		/*
		 * Make sure descriptor is fully written before
		 * setting ownership bit (because HW can start
		 * executing as soon as we do).
		 */
		wmb();

		ctrl->owner_opcode = htonl(mlx4_ib_opcode[wr->opcode]) |
			(ind & qp->sq.wqe_cnt ? htonl(1 << 31) : 0);

		/*
		 * We can improve latency by not stamping the last
		 * send queue WQE until after ringing the doorbell, so
		 * only stamp here if there are still more WQEs to post.
		 */
		if (wr->next)
			stamp_send_wqe(qp, (ind + qp->sq_spare_wqes) &
				       (qp->sq.wqe_cnt - 1));

		++ind;
	}

out:
	ctx = to_mctx(ibqp->context);
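	/*
	 * A single small inline WQE can be written straight to the
	 * BlueFlame page instead of ringing the doorbell; otherwise the
	 * QP number is written to the send doorbell register once the
	 * descriptors are visible in memory.
	 */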
	if (nreq == 1 && inl && size > 1 && size < ctx->bf_buf_size / 16) {
		ctrl->owner_opcode |= htonl((qp->sq.head & 0xffff) << 8);
		*(uint32_t *) (&ctrl->vlan_tag) |= qp->doorbell_qpn;
		/*
		 * Make sure that descriptor is written to memory
		 * before writing to BlueFlame page.
		 */
		wmb();

		++qp->sq.head;

		pthread_spin_lock(&ctx->bf_lock);

		mlx4_bf_copy(ctx->bf_page + ctx->bf_offset, (unsigned long *) ctrl,
			     align(size * 16, 64));
		wc_wmb();

		ctx->bf_offset ^= ctx->bf_buf_size;

		pthread_spin_unlock(&ctx->bf_lock);
	} else if (nreq) {
		qp->sq.head += nreq;

		/*
		 * Make sure that descriptors are written before
		 * doorbell record.
		 */
		wmb();

		*(uint32_t *) (ctx->uar + MLX4_SEND_DOORBELL) = qp->doorbell_qpn;
	}

	if (nreq)
		stamp_send_wqe(qp, (ind + qp->sq_spare_wqes - 1) &
			       (qp->sq.wqe_cnt - 1));

	pthread_spin_unlock(&qp->sq.lock);

	return ret;
}

int mlx4_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr,
		   struct ibv_recv_wr **bad_wr)
{
	struct mlx4_qp *qp = to_mqp(ibqp);
	struct mlx4_wqe_data_seg *scat;
	int ret = 0;
	int nreq;
	int ind;
	int i;

	pthread_spin_lock(&qp->rq.lock);

	/* XXX check that state is OK to post receive */

	ind = qp->rq.head & (qp->rq.wqe_cnt - 1);

	for (nreq = 0; wr; ++nreq, wr = wr->next) {
		if (wq_overflow(&qp->rq, nreq, to_mcq(qp->ibv_qp.recv_cq))) {
			ret = -1;
			*bad_wr = wr;
			goto out;
		}

		if (wr->num_sge > qp->rq.max_gs) {
			ret = -1;
			*bad_wr = wr;
			goto out;
		}

		scat = get_recv_wqe(qp, ind);

		for (i = 0; i < wr->num_sge; ++i)
			__set_data_seg(scat + i, wr->sg_list + i);

		if (i < qp->rq.max_gs) {
			scat[i].byte_count = 0;
			scat[i].lkey       = htonl(MLX4_INVALID_LKEY);
			scat[i].addr       = 0;
		}

		qp->rq.wrid[ind] = wr->wr_id;

		ind = (ind + 1) & (qp->rq.wqe_cnt - 1);
	}

out:
	if (nreq) {
		qp->rq.head += nreq;

		/*
		 * Make sure that descriptors are written before
		 * doorbell record.
		 */
		wmb();

		*qp->db = htonl(qp->rq.head & 0xffff);
	}

	pthread_spin_unlock(&qp->rq.lock);

	return ret;
}
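/*
 * Worked example (assuming MLX4_INLINE_ALIGN is 64 and the inline
 * segment header, struct mlx4_wqe_inline_seg, is 4 bytes): for a UD QP
 * with 128 bytes of inline data, the 16-byte control segment plus the
 * 48-byte datagram segment leave the data 64-byte aligned, so the
 * result is (128 + 64 - 4 - 1) / (64 - 4) = 3 inline segments.
 */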
int num_inline_segs(int data, enum ibv_qp_type type)
{
	/*
	 * Inline data segments are not allowed to cross 64 byte
	 * boundaries.  For UD QPs, the data segments always start
	 * aligned to 64 bytes (16 byte control segment + 48 byte
	 * datagram segment); for other QPs, there will be a 16 byte
	 * control segment and possibly a 16 byte remote address
	 * segment, so in the worst case there will be only 32 bytes
	 * available for the first data segment.
	 */
	if (type == IBV_QPT_UD)
		data += (sizeof (struct mlx4_wqe_ctrl_seg) +
			 sizeof (struct mlx4_wqe_datagram_seg)) %
			MLX4_INLINE_ALIGN;
	else
		data += (sizeof (struct mlx4_wqe_ctrl_seg) +
			 sizeof (struct mlx4_wqe_raddr_seg)) %
			MLX4_INLINE_ALIGN;

	return (data + MLX4_INLINE_ALIGN - sizeof (struct mlx4_wqe_inline_seg) - 1) /
		(MLX4_INLINE_ALIGN - sizeof (struct mlx4_wqe_inline_seg));
}
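/*
 * Choose qp->sq.wqe_shift: take the larger of the space needed for the
 * requested scatter/gather entries and for the requested inline data
 * (with its per-chunk headers), add the segments implied by the QP
 * type, make room for atomic and bind requests, and round the total up
 * to the next power of two (at least 64 bytes).
 */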
void mlx4_calc_sq_wqe_size(struct ibv_qp_cap *cap, enum ibv_qp_type type,
			   struct mlx4_qp *qp)
{
	int size;
	int max_sq_sge;

	max_sq_sge = align(cap->max_inline_data +
			   num_inline_segs(cap->max_inline_data, type) *
			   sizeof (struct mlx4_wqe_inline_seg),
			   sizeof (struct mlx4_wqe_data_seg)) /
		sizeof (struct mlx4_wqe_data_seg);
	if (max_sq_sge < cap->max_send_sge)
		max_sq_sge = cap->max_send_sge;

	size = max_sq_sge * sizeof (struct mlx4_wqe_data_seg);
	switch (type) {
	case IBV_QPT_UD:
		size += sizeof (struct mlx4_wqe_datagram_seg);
		break;

	case IBV_QPT_UC:
		size += sizeof (struct mlx4_wqe_raddr_seg);
		break;

	case IBV_QPT_XRC:
	case IBV_QPT_RC:
		size += sizeof (struct mlx4_wqe_raddr_seg);
		/*
		 * An atomic op will require an atomic segment, a
		 * remote address segment and one scatter entry.
		 */
		if (size < (sizeof (struct mlx4_wqe_atomic_seg) +
			    sizeof (struct mlx4_wqe_raddr_seg) +
			    sizeof (struct mlx4_wqe_data_seg)))
			size = (sizeof (struct mlx4_wqe_atomic_seg) +
				sizeof (struct mlx4_wqe_raddr_seg) +
				sizeof (struct mlx4_wqe_data_seg));
		break;

	default:
		break;
	}

	/* Make sure that we have enough space for a bind request */
	if (size < sizeof (struct mlx4_wqe_bind_seg))
		size = sizeof (struct mlx4_wqe_bind_seg);

	size += sizeof (struct mlx4_wqe_ctrl_seg);

	for (qp->sq.wqe_shift = 6; 1 << qp->sq.wqe_shift < size;
	     qp->sq.wqe_shift++)
		; /* nothing */
}

int mlx4_alloc_qp_buf(struct ibv_pd *pd, struct ibv_qp_cap *cap,
		      enum ibv_qp_type type, struct mlx4_qp *qp)
{
	qp->rq.max_gs = cap->max_recv_sge;

	qp->sq.wrid = malloc(qp->sq.wqe_cnt * sizeof (uint64_t));
	if (!qp->sq.wrid)
		return -1;

	if (qp->rq.wqe_cnt) {
		qp->rq.wrid = malloc(qp->rq.wqe_cnt * sizeof (uint64_t));
		if (!qp->rq.wrid) {
			free(qp->sq.wrid);
			return -1;
		}
	}

	for (qp->rq.wqe_shift = 4;
	     1 << qp->rq.wqe_shift < qp->rq.max_gs * sizeof (struct mlx4_wqe_data_seg);
	     qp->rq.wqe_shift++)
		; /* nothing */

	qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
		(qp->sq.wqe_cnt << qp->sq.wqe_shift);
	if (qp->rq.wqe_shift > qp->sq.wqe_shift) {
		qp->rq.offset = 0;
		qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift;
	} else {
		qp->rq.offset = qp->sq.wqe_cnt << qp->sq.wqe_shift;
		qp->sq.offset = 0;
	}

	if (mlx4_alloc_buf(&qp->buf,
			   align(qp->buf_size, to_mdev(pd->context->device)->page_size),
			   to_mdev(pd->context->device)->page_size)) {
		free(qp->sq.wrid);
		free(qp->rq.wrid);
		return -1;
	}

	memset(qp->buf.buf, 0, qp->buf_size);

	return 0;
}

void mlx4_set_sq_sizes(struct mlx4_qp *qp, struct ibv_qp_cap *cap,
		       enum ibv_qp_type type)
{
	int wqe_size;
	struct mlx4_context *ctx = to_mctx(qp->ibv_qp.context);

	wqe_size = min((1 << qp->sq.wqe_shift), MLX4_MAX_WQE_SIZE) -
		sizeof (struct mlx4_wqe_ctrl_seg);
	switch (type) {
	case IBV_QPT_UD:
		wqe_size -= sizeof (struct mlx4_wqe_datagram_seg);
		break;

	case IBV_QPT_UC:
	case IBV_QPT_RC:
	case IBV_QPT_XRC:
		wqe_size -= sizeof (struct mlx4_wqe_raddr_seg);
		break;

	default:
		break;
	}

	qp->sq.max_gs     = wqe_size / sizeof (struct mlx4_wqe_data_seg);
	cap->max_send_sge = min(ctx->max_sge, qp->sq.max_gs);
	qp->sq.max_post   = min(ctx->max_qp_wr,
				qp->sq.wqe_cnt - qp->sq_spare_wqes);
	cap->max_send_wr  = qp->sq.max_post;

	/*
	 * Inline data segments can't cross a 64 byte boundary.  So
	 * subtract off one segment header for each 64-byte chunk,
	 * taking into account the fact that wqe_size will be 32 mod
	 * 64 for non-UD QPs.
	 */
	qp->max_inline_data = wqe_size -
		sizeof (struct mlx4_wqe_inline_seg) *
		(align(wqe_size, MLX4_INLINE_ALIGN) / MLX4_INLINE_ALIGN);
	cap->max_inline_data = qp->max_inline_data;
}
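/*
 * QPN-to-QP lookup table: a two-level table indexed by the low bits of
 * the QP number.  Second-level tables are allocated lazily by
 * mlx4_store_qp() and freed by mlx4_clear_qp() once their reference
 * count drops to zero.
 */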
struct mlx4_qp *mlx4_find_qp(struct mlx4_context *ctx, uint32_t qpn)
{
	int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;

	if (ctx->qp_table[tind].refcnt)
		return ctx->qp_table[tind].table[qpn & ctx->qp_table_mask];
	else
		return NULL;
}

int mlx4_store_qp(struct mlx4_context *ctx, uint32_t qpn, struct mlx4_qp *qp)
{
	int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;

	if (!ctx->qp_table[tind].refcnt) {
		ctx->qp_table[tind].table = calloc(ctx->qp_table_mask + 1,
						   sizeof (struct mlx4_qp *));
		if (!ctx->qp_table[tind].table)
			return -1;
	}

	++ctx->qp_table[tind].refcnt;
	ctx->qp_table[tind].table[qpn & ctx->qp_table_mask] = qp;
	return 0;
}

void mlx4_clear_qp(struct mlx4_context *ctx, uint32_t qpn)
{
	int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;

	if (!--ctx->qp_table[tind].refcnt)
		free(ctx->qp_table[tind].table);
	else
		ctx->qp_table[tind].table[qpn & ctx->qp_table_mask] = NULL;
}