icl_iser.h revision 331769
1/* $FreeBSD: stable/11/sys/dev/iser/icl_iser.h 331769 2018-03-30 18:06:29Z hselasky $ */ 2/*- 3 * Copyright (c) 2015, Mellanox Technologies, Inc. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 */ 26 27#ifndef ICL_ISER_H 28#define ICL_ISER_H 29 30/* 31 * iSCSI Common Layer for RDMA. 32 */ 33 34#include <sys/cdefs.h> 35#include <sys/param.h> 36#include <sys/capsicum.h> 37#include <sys/condvar.h> 38#include <sys/conf.h> 39#include <sys/file.h> 40#include <sys/kernel.h> 41#include <sys/kthread.h> 42#include <sys/lock.h> 43#include <sys/mbuf.h> 44#include <sys/mutex.h> 45#include <sys/module.h> 46#include <sys/protosw.h> 47#include <sys/socket.h> 48#include <sys/socketvar.h> 49#include <sys/sysctl.h> 50#include <sys/systm.h> 51#include <sys/sx.h> 52#include <sys/uio.h> 53#include <sys/taskqueue.h> 54#include <sys/bio.h> 55#include <vm/uma.h> 56#include <netinet/in.h> 57#include <netinet/tcp.h> 58#include <dev/iscsi/icl.h> 59#include <dev/iscsi/iscsi_proto.h> 60#include <icl_conn_if.h> 61#include <cam/cam.h> 62#include <cam/cam_ccb.h> 63#include <rdma/ib_verbs.h> 64#include <rdma/ib_fmr_pool.h> 65#include <rdma/rdma_cm.h> 66 67 68#define ISER_DBG(X, ...) \ 69 do { \ 70 if (unlikely(iser_debug > 2)) \ 71 printf("DEBUG: %s: " X "\n", \ 72 __func__, ## __VA_ARGS__); \ 73 } while (0) 74 75#define ISER_INFO(X, ...) \ 76 do { \ 77 if (unlikely(iser_debug > 1)) \ 78 printf("INFO: %s: " X "\n", \ 79 __func__, ## __VA_ARGS__); \ 80 } while (0) 81 82#define ISER_WARN(X, ...) \ 83 do { \ 84 if (unlikely(iser_debug > 0)) { \ 85 printf("WARNING: %s: " X "\n", \ 86 __func__, ## __VA_ARGS__); \ 87 } \ 88 } while (0) 89 90#define ISER_ERR(X, ...) \ 91 printf("ERROR: %s: " X "\n", __func__, ## __VA_ARGS__) 92 93#define ISER_VER 0x10 94#define ISER_WSV 0x08 95#define ISER_RSV 0x04 96 97#define ISER_FASTREG_LI_WRID 0xffffffffffffffffULL 98#define ISER_BEACON_WRID 0xfffffffffffffffeULL 99 100#define SHIFT_4K 12 101#define SIZE_4K (1ULL << SHIFT_4K) 102#define MASK_4K (~(SIZE_4K-1)) 103 104/* support up to 512KB in one RDMA */ 105#define ISCSI_ISER_SG_TABLESIZE (0x80000 >> SHIFT_4K) 106#define ISER_DEF_XMIT_CMDS_MAX 256 107 108/* the max RX (recv) WR supported by the iSER QP is defined by * 109 * max_recv_wr = commands_max + recv_beacon */ 110#define ISER_QP_MAX_RECV_DTOS (ISER_DEF_XMIT_CMDS_MAX + 1) 111#define ISER_MIN_POSTED_RX (ISER_DEF_XMIT_CMDS_MAX >> 2) 112 113/* QP settings */ 114/* Maximal bounds on received asynchronous PDUs */ 115#define ISER_MAX_RX_MISC_PDUS 4 /* NOOP_IN(2) , ASYNC_EVENT(2) */ 116#define ISER_MAX_TX_MISC_PDUS 6 /* NOOP_OUT(2), TEXT(1), SCSI_TMFUNC(2), LOGOUT(1) */ 117 118/* the max TX (send) WR supported by the iSER QP is defined by * 119 * max_send_wr = T * (1 + D) + C ; D is how many inflight dataouts we expect * 120 * to have at max for SCSI command. The tx posting & completion handling code * 121 * supports -EAGAIN scheme where tx is suspended till the QP has room for more * 122 * send WR. D=8 comes from 64K/8K */ 123 124#define ISER_INFLIGHT_DATAOUTS 8 125 126/* the send_beacon increase the max_send_wr by 1 */ 127#define ISER_QP_MAX_REQ_DTOS (ISER_DEF_XMIT_CMDS_MAX * \ 128 (1 + ISER_INFLIGHT_DATAOUTS) + \ 129 ISER_MAX_TX_MISC_PDUS + \ 130 ISER_MAX_RX_MISC_PDUS + 1) 131 132#define ISER_GET_MAX_XMIT_CMDS(send_wr) ((send_wr \ 133 - ISER_MAX_TX_MISC_PDUS \ 134 - ISER_MAX_RX_MISC_PDUS - 1) / \ 135 (1 + ISER_INFLIGHT_DATAOUTS)) 136 137#define ISER_WC_BATCH_COUNT 16 138#define ISER_SIGNAL_CMD_COUNT 32 139 140/* Maximal QP's recommended per CQ. In case we use more QP's per CQ we might * 141 * encounter a CQ overrun state. */ 142#define ISCSI_ISER_MAX_CONN 8 143#define ISER_MAX_RX_LEN (ISER_QP_MAX_RECV_DTOS * ISCSI_ISER_MAX_CONN) 144#define ISER_MAX_TX_LEN (ISER_QP_MAX_REQ_DTOS * ISCSI_ISER_MAX_CONN) 145#define ISER_MAX_CQ_LEN (ISER_MAX_RX_LEN + ISER_MAX_TX_LEN + \ 146 ISCSI_ISER_MAX_CONN) 147 148#define ISER_ZBVA_NOT_SUPPORTED 0x80 149#define ISER_SEND_W_INV_NOT_SUPPORTED 0x40 150 151#define ISCSI_DEF_MAX_RECV_SEG_LEN 8192 152#define ISCSI_OPCODE_MASK 0x3f 153 154#define icl_to_iser_conn(ic) \ 155 container_of(ic, struct iser_conn, icl_conn) 156#define icl_to_iser_pdu(ip) \ 157 container_of(ip, struct icl_iser_pdu, icl_pdu) 158 159/** 160 * struct iser_hdr - iSER header 161 * 162 * @flags: flags support (zbva, remote_inv) 163 * @rsvd: reserved 164 * @write_stag: write rkey 165 * @write_va: write virtual address 166 * @reaf_stag: read rkey 167 * @read_va: read virtual address 168 */ 169struct iser_hdr { 170 u8 flags; 171 u8 rsvd[3]; 172 __be32 write_stag; 173 __be64 write_va; 174 __be32 read_stag; 175 __be64 read_va; 176} __attribute__((packed)); 177 178struct iser_cm_hdr { 179 u8 flags; 180 u8 rsvd[3]; 181} __packed; 182 183/* Constant PDU lengths calculations */ 184#define ISER_HEADERS_LEN (sizeof(struct iser_hdr) + ISCSI_BHS_SIZE) 185 186#define ISER_RECV_DATA_SEG_LEN 128 187#define ISER_RX_PAYLOAD_SIZE (ISER_HEADERS_LEN + ISER_RECV_DATA_SEG_LEN) 188 189#define ISER_RX_LOGIN_SIZE (ISER_HEADERS_LEN + ISCSI_DEF_MAX_RECV_SEG_LEN) 190 191enum iser_conn_state { 192 ISER_CONN_INIT, /* descriptor allocd, no conn */ 193 ISER_CONN_PENDING, /* in the process of being established */ 194 ISER_CONN_UP, /* up and running */ 195 ISER_CONN_TERMINATING, /* in the process of being terminated */ 196 ISER_CONN_DOWN, /* shut down */ 197 ISER_CONN_STATES_NUM 198}; 199 200enum iser_task_status { 201 ISER_TASK_STATUS_INIT = 0, 202 ISER_TASK_STATUS_STARTED, 203 ISER_TASK_STATUS_COMPLETED 204}; 205 206enum iser_data_dir { 207 ISER_DIR_IN = 0, /* to initiator */ 208 ISER_DIR_OUT, /* from initiator */ 209 ISER_DIRS_NUM 210}; 211 212/** 213 * struct iser_mem_reg - iSER memory registration info 214 * 215 * @sge: memory region sg element 216 * @rkey: memory region remote key 217 * @mem_h: pointer to registration context (FMR/Fastreg) 218 */ 219struct iser_mem_reg { 220 struct ib_sge sge; 221 u32 rkey; 222 void *mem_h; 223}; 224 225enum iser_desc_type { 226 ISCSI_TX_CONTROL , 227 ISCSI_TX_SCSI_COMMAND, 228 ISCSI_TX_DATAOUT 229}; 230 231/** 232 * struct iser_data_buf - iSER data buffer 233 * 234 * @sg: pointer to the sg list 235 * @size: num entries of this sg 236 * @data_len: total beffer byte len 237 * @dma_nents: returned by dma_map_sg 238 * @copy_buf: allocated copy buf for SGs unaligned 239 * for rdma which are copied 240 * @orig_sg: pointer to the original sg list (in case 241 * we used a copy) 242 * @sg_single: SG-ified clone of a non SG SC or 243 * unaligned SG 244 */ 245struct iser_data_buf { 246 struct scatterlist sgl[ISCSI_ISER_SG_TABLESIZE]; 247 void *sg; 248 int size; 249 unsigned long data_len; 250 unsigned int dma_nents; 251 char *copy_buf; 252 struct scatterlist *orig_sg; 253 struct scatterlist sg_single; 254 }; 255 256/* fwd declarations */ 257struct iser_conn; 258struct ib_conn; 259struct iser_device; 260 261/** 262 * struct iser_tx_desc - iSER TX descriptor (for send wr_id) 263 * 264 * @iser_header: iser header 265 * @iscsi_header: iscsi header (bhs) 266 * @type: command/control/dataout 267 * @dma_addr: header buffer dma_address 268 * @tx_sg: sg[0] points to iser/iscsi headers 269 * sg[1] optionally points to either of immediate data 270 * unsolicited data-out or control 271 * @num_sge: number sges used on this TX task 272 * @mapped: indicates if the descriptor is dma mapped 273 */ 274struct iser_tx_desc { 275 struct iser_hdr iser_header; 276 struct iscsi_bhs iscsi_header __attribute__((packed)); 277 enum iser_desc_type type; 278 u64 dma_addr; 279 struct ib_sge tx_sg[2]; 280 int num_sge; 281 bool mapped; 282}; 283 284#define ISER_RX_PAD_SIZE (256 - (ISER_RX_PAYLOAD_SIZE + \ 285 sizeof(u64) + sizeof(struct ib_sge))) 286/** 287 * struct iser_rx_desc - iSER RX descriptor (for recv wr_id) 288 * 289 * @iser_header: iser header 290 * @iscsi_header: iscsi header 291 * @data: received data segment 292 * @dma_addr: receive buffer dma address 293 * @rx_sg: ib_sge of receive buffer 294 * @pad: for sense data TODO: Modify to maximum sense length supported 295 */ 296struct iser_rx_desc { 297 struct iser_hdr iser_header; 298 struct iscsi_bhs iscsi_header; 299 char data[ISER_RECV_DATA_SEG_LEN]; 300 u64 dma_addr; 301 struct ib_sge rx_sg; 302 char pad[ISER_RX_PAD_SIZE]; 303} __attribute__((packed)); 304 305struct icl_iser_pdu { 306 struct icl_pdu icl_pdu; 307 struct iser_tx_desc desc; 308 struct iser_conn *iser_conn; 309 enum iser_task_status status; 310 struct ccb_scsiio *csio; 311 int command_sent; 312 int dir[ISER_DIRS_NUM]; 313 struct iser_mem_reg rdma_reg[ISER_DIRS_NUM]; 314 struct iser_data_buf data[ISER_DIRS_NUM]; 315}; 316 317/** 318 * struct iser_comp - iSER completion context 319 * 320 * @device: pointer to device handle 321 * @cq: completion queue 322 * @wcs: work completion array 323 * @tq: taskqueue handle 324 * @task: task to run task_fn 325 * @active_qps: Number of active QPs attached 326 * to completion context 327 */ 328struct iser_comp { 329 struct iser_device *device; 330 struct ib_cq *cq; 331 struct ib_wc wcs[ISER_WC_BATCH_COUNT]; 332 struct taskqueue *tq; 333 struct task task; 334 int active_qps; 335}; 336 337/** 338 * struct iser_device - iSER device handle 339 * 340 * @ib_device: RDMA device 341 * @pd: Protection Domain for this device 342 * @dev_attr: Device attributes container 343 * @mr: Global DMA memory region 344 * @event_handler: IB events handle routine 345 * @ig_list: entry in devices list 346 * @refcount: Reference counter, dominated by open iser connections 347 * @comps_used: Number of completion contexts used, Min between online 348 * cpus and device max completion vectors 349 * @comps: Dinamically allocated array of completion handlers 350 */ 351struct iser_device { 352 struct ib_device *ib_device; 353 struct ib_pd *pd; 354 struct ib_device_attr dev_attr; 355 struct ib_mr *mr; 356 struct ib_event_handler event_handler; 357 struct list_head ig_list; 358 int refcount; 359 int comps_used; 360 struct iser_comp *comps; 361}; 362 363/** 364 * struct iser_reg_resources - Fast registration recources 365 * 366 * @mr: memory region 367 * @mr_valid: is mr valid indicator 368 */ 369struct iser_reg_resources { 370 struct ib_mr *mr; 371 u8 mr_valid:1; 372}; 373 374/** 375 * struct fast_reg_descriptor - Fast registration descriptor 376 * 377 * @list: entry in connection fastreg pool 378 * @rsc: data buffer registration resources 379 */ 380struct fast_reg_descriptor { 381 struct list_head list; 382 struct iser_reg_resources rsc; 383}; 384 385 386/** 387 * struct iser_beacon - beacon to signal all flush errors were drained 388 * 389 * @send: send wr 390 * @recv: recv wr 391 * @flush_lock: protects flush_cv 392 * @flush_cv: condition variable for beacon flush 393 */ 394struct iser_beacon { 395 union { 396 struct ib_send_wr send; 397 struct ib_recv_wr recv; 398 }; 399 struct mtx flush_lock; 400 struct cv flush_cv; 401}; 402 403/** 404 * struct ib_conn - Infiniband related objects 405 * 406 * @cma_id: rdma_cm connection maneger handle 407 * @qp: Connection Queue-pair 408 * @device: reference to iser device 409 * @comp: iser completion context 410 */ 411struct ib_conn { 412 struct rdma_cm_id *cma_id; 413 struct ib_qp *qp; 414 int post_recv_buf_count; 415 u8 sig_count; 416 struct ib_recv_wr rx_wr[ISER_MIN_POSTED_RX]; 417 struct iser_device *device; 418 struct iser_comp *comp; 419 struct iser_beacon beacon; 420 struct mtx lock; 421 union { 422 struct { 423 struct ib_fmr_pool *pool; 424 struct iser_page_vec *page_vec; 425 } fmr; 426 struct { 427 struct list_head pool; 428 int pool_size; 429 } fastreg; 430 }; 431}; 432 433struct iser_conn { 434 struct icl_conn icl_conn; 435 struct ib_conn ib_conn; 436 struct cv up_cv; 437 struct list_head conn_list; 438 struct sx state_mutex; 439 enum iser_conn_state state; 440 int qp_max_recv_dtos; 441 int min_posted_rx; 442 u16 max_cmds; 443 char *login_buf; 444 char *login_req_buf, *login_resp_buf; 445 u64 login_req_dma, login_resp_dma; 446 unsigned int rx_desc_head; 447 struct iser_rx_desc *rx_descs; 448 u32 num_rx_descs; 449 bool handoff_done; 450}; 451 452/** 453 * struct iser_global: iSER global context 454 * 455 * @device_list_mutex: protects device_list 456 * @device_list: iser devices global list 457 * @connlist_mutex: protects connlist 458 * @connlist: iser connections global list 459 * @desc_cache: kmem cache for tx dataout 460 * @close_conns_mutex: serializes conns closure 461 */ 462struct iser_global { 463 struct sx device_list_mutex; 464 struct list_head device_list; 465 struct mtx connlist_mutex; 466 struct list_head connlist; 467 struct sx close_conns_mutex; 468}; 469 470extern struct iser_global ig; 471extern int iser_debug; 472 473void 474iser_create_send_desc(struct iser_conn *, struct iser_tx_desc *); 475 476int 477iser_post_recvl(struct iser_conn *); 478 479int 480iser_post_recvm(struct iser_conn *, int); 481 482int 483iser_alloc_login_buf(struct iser_conn *iser_conn); 484 485void 486iser_free_login_buf(struct iser_conn *iser_conn); 487 488int 489iser_post_send(struct ib_conn *, struct iser_tx_desc *, bool); 490 491void 492iser_snd_completion(struct iser_tx_desc *, struct ib_conn *); 493 494void 495iser_rcv_completion(struct iser_rx_desc *, unsigned long, 496 struct ib_conn *); 497 498void 499iser_pdu_free(struct icl_conn *, struct icl_pdu *); 500 501struct icl_pdu * 502iser_new_pdu(struct icl_conn *ic, int flags); 503 504int 505iser_alloc_rx_descriptors(struct iser_conn *, int); 506 507void 508iser_free_rx_descriptors(struct iser_conn *); 509 510int 511iser_initialize_headers(struct icl_iser_pdu *, struct iser_conn *); 512 513int 514iser_send_control(struct iser_conn *, struct icl_iser_pdu *); 515 516int 517iser_send_command(struct iser_conn *, struct icl_iser_pdu *); 518 519int 520iser_reg_rdma_mem(struct icl_iser_pdu *, enum iser_data_dir); 521 522void 523iser_unreg_rdma_mem(struct icl_iser_pdu *, enum iser_data_dir); 524 525int 526iser_create_fastreg_pool(struct ib_conn *, unsigned); 527 528void 529iser_free_fastreg_pool(struct ib_conn *); 530 531int 532iser_dma_map_task_data(struct icl_iser_pdu *, 533 struct iser_data_buf *, enum iser_data_dir, 534 enum dma_data_direction); 535 536int 537iser_conn_terminate(struct iser_conn *); 538 539void 540iser_free_ib_conn_res(struct iser_conn *, bool); 541 542void 543iser_dma_unmap_task_data(struct icl_iser_pdu *, struct iser_data_buf *, 544 enum dma_data_direction); 545 546int 547iser_cma_handler(struct rdma_cm_id *, struct rdma_cm_event *); 548 549#endif /* !ICL_ISER_H */ 550