/* icl_iser.h revision 331769 */
/* $FreeBSD: stable/11/sys/dev/iser/icl_iser.h 331769 2018-03-30 18:06:29Z hselasky $ */
/*-
 * Copyright (c) 2015, Mellanox Technologies, Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
26
27#ifndef ICL_ISER_H
28#define ICL_ISER_H
29
30/*
31 * iSCSI Common Layer for RDMA.
32 */
33
34#include <sys/cdefs.h>
35#include <sys/param.h>
36#include <sys/capsicum.h>
37#include <sys/condvar.h>
38#include <sys/conf.h>
39#include <sys/file.h>
40#include <sys/kernel.h>
41#include <sys/kthread.h>
42#include <sys/lock.h>
43#include <sys/mbuf.h>
44#include <sys/mutex.h>
45#include <sys/module.h>
46#include <sys/protosw.h>
47#include <sys/socket.h>
48#include <sys/socketvar.h>
49#include <sys/sysctl.h>
50#include <sys/systm.h>
51#include <sys/sx.h>
52#include <sys/uio.h>
53#include <sys/taskqueue.h>
54#include <sys/bio.h>
55#include <vm/uma.h>
56#include <netinet/in.h>
57#include <netinet/tcp.h>
58#include <dev/iscsi/icl.h>
59#include <dev/iscsi/iscsi_proto.h>
60#include <icl_conn_if.h>
61#include <cam/cam.h>
62#include <cam/cam_ccb.h>
63#include <rdma/ib_verbs.h>
64#include <rdma/ib_fmr_pool.h>
65#include <rdma/rdma_cm.h>
66
67
/*
 * Logging helpers.
 *
 * Each macro prints "<SEVERITY>: <calling function>: <message>\n" through
 * printf().  X must be a string literal, because it is concatenated with
 * the prefix and the trailing newline at compile time.
 *
 * Output is gated on the global iser_debug level:
 *   ISER_DBG   - printed when iser_debug > 2
 *   ISER_INFO  - printed when iser_debug > 1
 *   ISER_WARN  - printed when iser_debug > 0
 *   ISER_ERR   - always printed
 */
#define	ISER_DBG(X, ...)						\
	do {								\
		if (unlikely(iser_debug > 2))				\
			printf("DEBUG: %s: " X "\n",			\
				__func__, ## __VA_ARGS__);		\
	} while (0)

#define	ISER_INFO(X, ...)						\
	do {								\
		if (unlikely(iser_debug > 1))				\
			printf("INFO: %s: " X "\n",			\
				__func__, ## __VA_ARGS__);		\
	} while (0)

#define	ISER_WARN(X, ...)						\
	do {								\
		if (unlikely(iser_debug > 0)) {				\
			printf("WARNING: %s: " X "\n",			\
				__func__, ## __VA_ARGS__);		\
		}							\
	} while (0)

/* Unconditional; expands to a single printf() expression. */
#define	ISER_ERR(X, ...)						\
	printf("ERROR: %s: " X "\n", __func__, ## __VA_ARGS__)
92
/*
 * Flag bit values.
 * NOTE(review): presumably OR-ed into iser_hdr.flags (version, write-STag
 * valid, read-STag valid) -- confirm against the users of these macros.
 */
#define	ISER_VER			0x10
#define	ISER_WSV			0x08
#define	ISER_RSV			0x04

/*
 * Reserved 64-bit work-request id values.
 * NOTE(review): names suggest the fast-registration local-invalidate WR
 * and the flush beacon WR -- confirm against the completion handler.
 */
#define	ISER_FASTREG_LI_WRID		0xffffffffffffffffULL
#define	ISER_BEACON_WRID		0xfffffffffffffffeULL

/* 4KB page helpers: shift, size in bytes and the mask clearing the offset. */
#define	SHIFT_4K	12
#define	SIZE_4K		(1ULL << SHIFT_4K)
#define	MASK_4K		(~(SIZE_4K - 1))
103
/* support up to 512KB in one RDMA */
#define	ISCSI_ISER_SG_TABLESIZE		(0x80000 >> SHIFT_4K)
#define	ISER_DEF_XMIT_CMDS_MAX		256

/*
 * The max RX (recv) WR supported by the iSER QP is defined by
 * max_recv_wr = commands_max + recv_beacon
 */
#define	ISER_QP_MAX_RECV_DTOS		(ISER_DEF_XMIT_CMDS_MAX + 1)
#define	ISER_MIN_POSTED_RX		(ISER_DEF_XMIT_CMDS_MAX >> 2)

/* QP settings */
/* Maximal bounds on received asynchronous PDUs */
#define	ISER_MAX_RX_MISC_PDUS		4 /* NOOP_IN(2) , ASYNC_EVENT(2)   */
#define	ISER_MAX_TX_MISC_PDUS		6 /* NOOP_OUT(2), TEXT(1), SCSI_TMFUNC(2), LOGOUT(1) */

/*
 * The max TX (send) WR supported by the iSER QP is defined by
 * max_send_wr = T * (1 + D) + C ; D is how many inflight dataouts we expect
 * to have at max for SCSI command. The tx posting & completion handling code
 * supports -EAGAIN scheme where tx is suspended till the QP has room for more
 * send WR. D=8 comes from 64K/8K
 */
#define	ISER_INFLIGHT_DATAOUTS		8

/* The send_beacon increases the max_send_wr by 1. */
#define	ISER_QP_MAX_REQ_DTOS		(ISER_DEF_XMIT_CMDS_MAX *	\
					(1 + ISER_INFLIGHT_DATAOUTS) +	\
					ISER_MAX_TX_MISC_PDUS +		\
					ISER_MAX_RX_MISC_PDUS + 1)

/*
 * Inverse of ISER_QP_MAX_REQ_DTOS: number of commands that fit in a send
 * queue of send_wr entries.  The argument is parenthesised so that an
 * expression (e.g. a conditional) can be passed safely.
 */
#define	ISER_GET_MAX_XMIT_CMDS(send_wr)	(((send_wr)			\
					 - ISER_MAX_TX_MISC_PDUS	\
					 - ISER_MAX_RX_MISC_PDUS - 1) /	\
					 (1 + ISER_INFLIGHT_DATAOUTS))

#define	ISER_WC_BATCH_COUNT		16
#define	ISER_SIGNAL_CMD_COUNT		32

/*
 * Maximal QP's recommended per CQ. In case we use more QP's per CQ we might
 * encounter a CQ overrun state.
 */
#define	ISCSI_ISER_MAX_CONN	8
#define	ISER_MAX_RX_LEN		(ISER_QP_MAX_RECV_DTOS * ISCSI_ISER_MAX_CONN)
#define	ISER_MAX_TX_LEN		(ISER_QP_MAX_REQ_DTOS  * ISCSI_ISER_MAX_CONN)
#define	ISER_MAX_CQ_LEN		(ISER_MAX_RX_LEN + ISER_MAX_TX_LEN + \
				 ISCSI_ISER_MAX_CONN)
147
/*
 * Capability flag bits.
 * NOTE(review): presumably carried in iser_cm_hdr.flags during connection
 * establishment -- confirm against the connect path.
 */
#define	ISER_ZBVA_NOT_SUPPORTED		0x80
#define	ISER_SEND_W_INV_NOT_SUPPORTED	0x40

#define	ISCSI_DEF_MAX_RECV_SEG_LEN	8192
#define	ISCSI_OPCODE_MASK		0x3f

/*
 * Map a pointer to an embedded ICL object back to its iSER container:
 * struct icl_conn -> struct iser_conn, struct icl_pdu -> struct icl_iser_pdu.
 */
#define	icl_to_iser_conn(ic) \
	container_of(ic, struct iser_conn, icl_conn)
#define	icl_to_iser_pdu(ip) \
	container_of(ip, struct icl_iser_pdu, icl_pdu)
158
159/**
160 * struct iser_hdr - iSER header
161 *
162 * @flags:        flags support (zbva, remote_inv)
163 * @rsvd:         reserved
164 * @write_stag:   write rkey
165 * @write_va:     write virtual address
166 * @reaf_stag:    read rkey
167 * @read_va:      read virtual address
168 */
169struct iser_hdr {
170	u8      flags;
171	u8      rsvd[3];
172	__be32  write_stag;
173	__be64  write_va;
174	__be32  read_stag;
175	__be64  read_va;
176} __attribute__((packed));
177
178struct iser_cm_hdr {
179	u8      flags;
180	u8      rsvd[3];
181} __packed;
182
/* Constant PDU lengths calculations */

/* iSER header followed by the iSCSI basic header segment. */
#define	ISER_HEADERS_LEN	(sizeof(struct iser_hdr) + ISCSI_BHS_SIZE)

/* Headers plus the inline data segment of a regular receive buffer. */
#define	ISER_RECV_DATA_SEG_LEN	128
#define	ISER_RX_PAYLOAD_SIZE	(ISER_HEADERS_LEN + ISER_RECV_DATA_SEG_LEN)

/* Headers plus the default maximum receive data segment length. */
#define	ISER_RX_LOGIN_SIZE	(ISER_HEADERS_LEN + ISCSI_DEF_MAX_RECV_SEG_LEN)
190
/* Connection states; ISER_CONN_STATES_NUM is the number of states. */
enum iser_conn_state {
	ISER_CONN_INIT,		/* descriptor allocd, no conn          */
	ISER_CONN_PENDING,	/* in the process of being established */
	ISER_CONN_UP,		/* up and running                      */
	ISER_CONN_TERMINATING,	/* in the process of being terminated  */
	ISER_CONN_DOWN,		/* shut down                           */
	ISER_CONN_STATES_NUM
};
199
/* Status values of a task (stored in icl_iser_pdu.status). */
enum iser_task_status {
	ISER_TASK_STATUS_INIT = 0,
	ISER_TASK_STATUS_STARTED,
	ISER_TASK_STATUS_COMPLETED
};
205
/*
 * Data transfer direction; also used as the index into the per-direction
 * arrays of struct icl_iser_pdu (sized by ISER_DIRS_NUM).
 */
enum iser_data_dir {
	ISER_DIR_IN = 0,	/* to initiator */
	ISER_DIR_OUT,		/* from initiator */
	ISER_DIRS_NUM
};
211
212/**
213 * struct iser_mem_reg - iSER memory registration info
214 *
215 * @sge:          memory region sg element
216 * @rkey:         memory region remote key
217 * @mem_h:        pointer to registration context (FMR/Fastreg)
218 */
219struct iser_mem_reg {
220	struct ib_sge	 sge;
221	u32		 rkey;
222	void		*mem_h;
223};
224
/* Kind of PDU carried by a TX descriptor (see iser_tx_desc.type). */
enum iser_desc_type {
	ISCSI_TX_CONTROL,
	ISCSI_TX_SCSI_COMMAND,
	ISCSI_TX_DATAOUT
};
230
231/**
232 * struct iser_data_buf - iSER data buffer
233 *
234 * @sg:           pointer to the sg list
235 * @size:         num entries of this sg
236 * @data_len:     total beffer byte len
237 * @dma_nents:    returned by dma_map_sg
238 * @copy_buf:     allocated copy buf for SGs unaligned
239 *                for rdma which are copied
240 * @orig_sg:      pointer to the original sg list (in case
241 *                we used a copy)
242 * @sg_single:    SG-ified clone of a non SG SC or
243 *                unaligned SG
244 */
245struct iser_data_buf {
246	struct scatterlist sgl[ISCSI_ISER_SG_TABLESIZE];
247	void               *sg;
248	int                size;
249	unsigned long      data_len;
250	unsigned int       dma_nents;
251	char               *copy_buf;
252	struct scatterlist *orig_sg;
253	struct scatterlist sg_single;
254  };
255
/* Forward declarations; the full definitions appear later in this header. */
struct iser_conn;
struct ib_conn;
struct iser_device;
260
261/**
262 * struct iser_tx_desc - iSER TX descriptor (for send wr_id)
263 *
264 * @iser_header:   iser header
265 * @iscsi_header:  iscsi header (bhs)
266 * @type:          command/control/dataout
267 * @dma_addr:      header buffer dma_address
268 * @tx_sg:         sg[0] points to iser/iscsi headers
269 *                 sg[1] optionally points to either of immediate data
270 *                 unsolicited data-out or control
271 * @num_sge:       number sges used on this TX task
272 * @mapped:        indicates if the descriptor is dma mapped
273 */
274struct iser_tx_desc {
275	struct iser_hdr              iser_header;
276	struct iscsi_bhs             iscsi_header __attribute__((packed));
277	enum   iser_desc_type        type;
278	u64		             dma_addr;
279	struct ib_sge		     tx_sg[2];
280	int                          num_sge;
281	bool                         mapped;
282};
283
284#define ISER_RX_PAD_SIZE	(256 - (ISER_RX_PAYLOAD_SIZE + \
285					sizeof(u64) + sizeof(struct ib_sge)))
286/**
287 * struct iser_rx_desc - iSER RX descriptor (for recv wr_id)
288 *
289 * @iser_header:   iser header
290 * @iscsi_header:  iscsi header
291 * @data:          received data segment
292 * @dma_addr:      receive buffer dma address
293 * @rx_sg:         ib_sge of receive buffer
294 * @pad:           for sense data TODO: Modify to maximum sense length supported
295 */
296struct iser_rx_desc {
297	struct iser_hdr              iser_header;
298	struct iscsi_bhs             iscsi_header;
299	char		             data[ISER_RECV_DATA_SEG_LEN];
300	u64		             dma_addr;
301	struct ib_sge		     rx_sg;
302	char		             pad[ISER_RX_PAD_SIZE];
303} __attribute__((packed));
304
305struct icl_iser_pdu {
306	struct icl_pdu               icl_pdu;
307	struct iser_tx_desc          desc;
308	struct iser_conn             *iser_conn;
309	enum iser_task_status        status;
310	struct ccb_scsiio 			 *csio;
311	int                          command_sent;
312	int                          dir[ISER_DIRS_NUM];
313	struct iser_mem_reg          rdma_reg[ISER_DIRS_NUM];
314	struct iser_data_buf         data[ISER_DIRS_NUM];
315};
316
317/**
318 * struct iser_comp - iSER completion context
319 *
320 * @device:     pointer to device handle
321 * @cq:         completion queue
322 * @wcs:        work completion array
323 * @tq:    	taskqueue handle
324 * @task:    	task to run task_fn
325 * @active_qps: Number of active QPs attached
326 *              to completion context
327 */
328struct iser_comp {
329	struct iser_device      *device;
330	struct ib_cq		*cq;
331	struct ib_wc		 wcs[ISER_WC_BATCH_COUNT];
332	struct taskqueue        *tq;
333	struct task             task;
334	int                      active_qps;
335};
336
337/**
338 * struct iser_device - iSER device handle
339 *
340 * @ib_device:     RDMA device
341 * @pd:            Protection Domain for this device
342 * @dev_attr:      Device attributes container
343 * @mr:            Global DMA memory region
344 * @event_handler: IB events handle routine
345 * @ig_list:	   entry in devices list
346 * @refcount:      Reference counter, dominated by open iser connections
347 * @comps_used:    Number of completion contexts used, Min between online
348 *                 cpus and device max completion vectors
349 * @comps:         Dinamically allocated array of completion handlers
350 */
351struct iser_device {
352	struct ib_device             *ib_device;
353	struct ib_pd	             *pd;
354	struct ib_device_attr	     dev_attr;
355	struct ib_mr	             *mr;
356	struct ib_event_handler      event_handler;
357	struct list_head             ig_list;
358	int                          refcount;
359	int			     comps_used;
360	struct iser_comp	     *comps;
361};
362
363/**
364 * struct iser_reg_resources - Fast registration recources
365 *
366 * @mr:         memory region
367 * @mr_valid:   is mr valid indicator
368 */
369struct iser_reg_resources {
370	struct ib_mr                     *mr;
371	u8                                mr_valid:1;
372};
373
374/**
375 * struct fast_reg_descriptor - Fast registration descriptor
376 *
377 * @list:           entry in connection fastreg pool
378 * @rsc:            data buffer registration resources
379 */
380struct fast_reg_descriptor {
381	struct list_head		  list;
382	struct iser_reg_resources	  rsc;
383};
384
385
386/**
387 * struct iser_beacon - beacon to signal all flush errors were drained
388 *
389 * @send:           send wr
390 * @recv:           recv wr
391 * @flush_lock:     protects flush_cv
392 * @flush_cv:       condition variable for beacon flush
393 */
394struct iser_beacon {
395	union {
396		struct ib_send_wr	send;
397		struct ib_recv_wr	recv;
398	};
399	struct mtx		     flush_lock;
400	struct cv		     flush_cv;
401};
402
403/**
404 * struct ib_conn - Infiniband related objects
405 *
406 * @cma_id:              rdma_cm connection maneger handle
407 * @qp:                  Connection Queue-pair
408 * @device:              reference to iser device
409 * @comp:                iser completion context
410  */
411struct ib_conn {
412	struct rdma_cm_id           *cma_id;
413	struct ib_qp	            *qp;
414	int                          post_recv_buf_count;
415	u8                           sig_count;
416	struct ib_recv_wr	     rx_wr[ISER_MIN_POSTED_RX];
417	struct iser_device          *device;
418	struct iser_comp	    *comp;
419	struct iser_beacon	     beacon;
420	struct mtx               lock;
421	union {
422		struct {
423			struct ib_fmr_pool      *pool;
424			struct iser_page_vec	*page_vec;
425		} fmr;
426		struct {
427			struct list_head	 pool;
428			int			 pool_size;
429		} fastreg;
430	};
431};
432
433struct iser_conn {
434	struct icl_conn             icl_conn;
435	struct ib_conn               ib_conn;
436	struct cv                    up_cv;
437	struct list_head             conn_list;
438	struct sx		     		 state_mutex;
439	enum iser_conn_state	     state;
440	int		     				 qp_max_recv_dtos;
441	int		     				 min_posted_rx;
442	u16                          max_cmds;
443	char  			     *login_buf;
444	char			     *login_req_buf, *login_resp_buf;
445	u64			     login_req_dma, login_resp_dma;
446	unsigned int 		     rx_desc_head;
447	struct iser_rx_desc	     *rx_descs;
448	u32                          num_rx_descs;
449	bool                         handoff_done;
450};
451
452/**
453 * struct iser_global: iSER global context
454 *
455 * @device_list_mutex:    protects device_list
456 * @device_list:          iser devices global list
457 * @connlist_mutex:       protects connlist
458 * @connlist:             iser connections global list
459 * @desc_cache:           kmem cache for tx dataout
460 * @close_conns_mutex:    serializes conns closure
461 */
462struct iser_global {
463	struct sx        device_list_mutex;
464	struct list_head  device_list;
465	struct mtx        connlist_mutex;
466	struct list_head  connlist;
467	struct sx         close_conns_mutex;
468};
469
/* Global iSER context; defined in another translation unit. */
extern struct iser_global ig;
/* Verbosity level tested by the ISER_DBG/ISER_INFO/ISER_WARN macros. */
extern int iser_debug;
472
473void
474iser_create_send_desc(struct iser_conn *, struct iser_tx_desc *);
475
476int
477iser_post_recvl(struct iser_conn *);
478
479int
480iser_post_recvm(struct iser_conn *, int);
481
482int
483iser_alloc_login_buf(struct iser_conn *iser_conn);
484
485void
486iser_free_login_buf(struct iser_conn *iser_conn);
487
488int
489iser_post_send(struct ib_conn *, struct iser_tx_desc *, bool);
490
491void
492iser_snd_completion(struct iser_tx_desc *, struct ib_conn *);
493
494void
495iser_rcv_completion(struct iser_rx_desc *, unsigned long,
496		    struct ib_conn *);
497
498void
499iser_pdu_free(struct icl_conn *, struct icl_pdu *);
500
501struct icl_pdu *
502iser_new_pdu(struct icl_conn *ic, int flags);
503
504int
505iser_alloc_rx_descriptors(struct iser_conn *, int);
506
507void
508iser_free_rx_descriptors(struct iser_conn *);
509
510int
511iser_initialize_headers(struct icl_iser_pdu *, struct iser_conn *);
512
513int
514iser_send_control(struct iser_conn *, struct icl_iser_pdu *);
515
516int
517iser_send_command(struct iser_conn *, struct icl_iser_pdu *);
518
519int
520iser_reg_rdma_mem(struct icl_iser_pdu *, enum iser_data_dir);
521
522void
523iser_unreg_rdma_mem(struct icl_iser_pdu *, enum iser_data_dir);
524
525int
526iser_create_fastreg_pool(struct ib_conn *, unsigned);
527
528void
529iser_free_fastreg_pool(struct ib_conn *);
530
531int
532iser_dma_map_task_data(struct icl_iser_pdu *,
533		       struct iser_data_buf *, enum iser_data_dir,
534		       enum dma_data_direction);
535
536int
537iser_conn_terminate(struct iser_conn *);
538
539void
540iser_free_ib_conn_res(struct iser_conn *, bool);
541
542void
543iser_dma_unmap_task_data(struct icl_iser_pdu *, struct iser_data_buf *,
544			 enum dma_data_direction);
545
546int
547iser_cma_handler(struct rdma_cm_id *, struct rdma_cm_event *);
548
549#endif /* !ICL_ISER_H */
550