rpcib.c revision 7656:2621e50fdf4a
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26/*
27 * Copyright (c) 2007, The Ohio State University. All rights reserved.
28 *
29 * Portions of this source code is developed by the team members of
30 * The Ohio State University's Network-Based Computing Laboratory (NBCL),
31 * headed by Professor Dhabaleswar K. (DK) Panda.
32 *
33 * Acknowledgements to contributions from developors:
34 *   Ranjit Noronha: noronha@cse.ohio-state.edu
35 *   Lei Chai      : chail@cse.ohio-state.edu
36 *   Weikuan Yu    : yuw@cse.ohio-state.edu
37 *
38 */
39
40/*
41 * The rpcib plugin. Implements the interface for RDMATF's
42 * interaction with IBTF.
43 */
44
45#include <sys/param.h>
46#include <sys/types.h>
47#include <sys/user.h>
48#include <sys/systm.h>
49#include <sys/sysmacros.h>
50#include <sys/proc.h>
51#include <sys/socket.h>
52#include <sys/file.h>
53#include <sys/stream.h>
54#include <sys/strsubr.h>
55#include <sys/stropts.h>
56#include <sys/errno.h>
57#include <sys/kmem.h>
58#include <sys/debug.h>
59#include <sys/systm.h>
60#include <sys/pathname.h>
61#include <sys/kstat.h>
62#include <sys/t_lock.h>
63#include <sys/ddi.h>
64#include <sys/cmn_err.h>
65#include <sys/time.h>
66#include <sys/isa_defs.h>
67#include <sys/callb.h>
68#include <sys/sunddi.h>
69#include <sys/sunndi.h>
70#include <sys/sunldi.h>
71#include <sys/sdt.h>
72#include <sys/dlpi.h>
73#include <sys/ib/ibtl/ibti.h>
74#include <rpc/rpc.h>
75#include <rpc/ib.h>
76
77#include <sys/modctl.h>
78
79#include <sys/pathname.h>
80#include <sys/kstr.h>
81#include <sys/sockio.h>
82#include <sys/vnode.h>
83#include <sys/tiuser.h>
84#include <net/if.h>
85#include <sys/cred.h>
86#include <rpc/rpc_rdma.h>
87
88#include <nfs/nfs.h>
89#include <sys/kstat.h>
90#include <sys/atomic.h>
91
92#define	NFS_RDMA_PORT	2050
93
94extern char *inet_ntop(int, const void *, char *, int);
95
96
97/*
98 * Prototype declarations for driver ops
99 */
100
101static int	rpcib_attach(dev_info_t *, ddi_attach_cmd_t);
102static int	rpcib_getinfo(dev_info_t *, ddi_info_cmd_t,
103				void *, void **);
104static int	rpcib_detach(dev_info_t *, ddi_detach_cmd_t);
105static int	rpcib_is_ib_interface(char *);
106static int	rpcib_dl_info(ldi_handle_t, dl_info_ack_t *);
107static int	rpcib_do_ip_ioctl(int, int, caddr_t);
108static boolean_t	rpcib_get_ib_addresses(struct sockaddr_in *,
109			struct sockaddr_in6 *, uint_t *, uint_t *);
110static	uint_t rpcib_get_number_interfaces(void);
111static int rpcib_cache_kstat_update(kstat_t *, int);
112static void rib_force_cleanup(void *);
113
114struct {
115	kstat_named_t cache_limit;
116	kstat_named_t cache_allocation;
117	kstat_named_t cache_hits;
118	kstat_named_t cache_misses;
119	kstat_named_t cache_misses_above_the_limit;
120} rpcib_kstat = {
121	{"cache_limit",			KSTAT_DATA_UINT64 },
122	{"cache_allocation",		KSTAT_DATA_UINT64 },
123	{"cache_hits",			KSTAT_DATA_UINT64 },
124	{"cache_misses",		KSTAT_DATA_UINT64 },
125	{"cache_misses_above_the_limit", KSTAT_DATA_UINT64 },
126};
127
128/* rpcib cb_ops */
129static struct cb_ops rpcib_cbops = {
130	nulldev,		/* open */
131	nulldev,		/* close */
132	nodev,			/* strategy */
133	nodev,			/* print */
134	nodev,			/* dump */
135	nodev,			/* read */
136	nodev,			/* write */
137	nodev,			/* ioctl */
138	nodev,			/* devmap */
139	nodev,			/* mmap */
140	nodev,			/* segmap */
141	nochpoll,		/* poll */
142	ddi_prop_op,		/* prop_op */
143	NULL,			/* stream */
144	D_MP,			/* cb_flag */
145	CB_REV,			/* rev */
146	nodev,			/* int (*cb_aread)() */
147	nodev			/* int (*cb_awrite)() */
148};
149
150
151
152
153/*
154 * Device options
155 */
156static struct dev_ops rpcib_ops = {
157	DEVO_REV,		/* devo_rev, */
158	0,			/* refcnt  */
159	rpcib_getinfo,		/* info */
160	nulldev,		/* identify */
161	nulldev,		/* probe */
162	rpcib_attach,		/* attach */
163	rpcib_detach,		/* detach */
164	nodev,			/* reset */
165	&rpcib_cbops,		    /* driver ops - devctl interfaces */
166	NULL,			/* bus operations */
167	NULL,			/* power */
168	ddi_quiesce_not_needed,		/* quiesce */
169};
170
171/*
172 * Module linkage information.
173 */
174
175static struct modldrv rib_modldrv = {
176	&mod_driverops,		/* Driver module */
177	"RPCIB plugin driver",	/* Driver name and version */
178	&rpcib_ops,		/* Driver ops */
179};
180
181static struct modlinkage rib_modlinkage = {
182	MODREV_1,
183	(void *)&rib_modldrv,
184	NULL
185};
186
187typedef struct rib_lrc_entry {
188	struct rib_lrc_entry *forw;
189	struct rib_lrc_entry *back;
190	char *lrc_buf;
191
192	uint32_t lrc_len;
193	void  *avl_node;
194	bool_t registered;
195
196	struct mrc lrc_mhandle;
197	bool_t lrc_on_freed_list;
198} rib_lrc_entry_t;
199
200typedef	struct cache_struct	{
201	rib_lrc_entry_t		r;
202	uint32_t		len;
203	uint32_t		elements;
204	kmutex_t		node_lock;
205	avl_node_t		avl_link;
206} cache_avl_struct_t;
207
208
209static uint64_t 	rib_total_buffers = 0;
210uint64_t	cache_limit = 100 * 1024 * 1024;
211static volatile uint64_t	cache_allocation = 0;
212static uint64_t	cache_watermark = 80 * 1024 * 1024;
213static uint64_t	cache_hits = 0;
214static uint64_t	cache_misses = 0;
215static uint64_t	cache_cold_misses = 0;
216static uint64_t	cache_hot_misses = 0;
217static uint64_t	cache_misses_above_the_limit = 0;
218static bool_t	stats_enabled = FALSE;
219
220static uint64_t max_unsignaled_rws = 5;
221
222/*
223 * rib_stat: private data pointer used when registering
224 *	with the IBTF.  It is returned to the consumer
225 *	in all callbacks.
226 */
227static rpcib_state_t *rib_stat = NULL;
228
229#define	RNR_RETRIES	IBT_RNR_RETRY_1
230#define	MAX_PORTS	2
231
232int preposted_rbufs = RDMA_BUFS_GRANT;
233int send_threshold = 1;
234
235/*
236 * State of the plugin.
237 * ACCEPT = accepting new connections and requests.
238 * NO_ACCEPT = not accepting new connection and requests.
239 * This should eventually move to rpcib_state_t structure, since this
240 * will tell in which state the plugin is for a particular type of service
241 * like NFS, NLM or v4 Callback deamon. The plugin might be in accept
242 * state for one and in no_accept state for the other.
243 */
244int		plugin_state;
245kmutex_t	plugin_state_lock;
246
247ldi_ident_t rpcib_li;
248
249/*
250 * RPCIB RDMATF operations
251 */
252#if defined(MEASURE_POOL_DEPTH)
253static void rib_posted_rbufs(uint32_t x) { return; }
254#endif
255static rdma_stat rib_reachable(int addr_type, struct netbuf *, void **handle);
256static rdma_stat rib_disconnect(CONN *conn);
257static void rib_listen(struct rdma_svc_data *rd);
258static void rib_listen_stop(struct rdma_svc_data *rd);
259static rdma_stat rib_registermem(CONN *conn, caddr_t  adsp, caddr_t buf,
260	uint_t buflen, struct mrc *buf_handle);
261static rdma_stat rib_deregistermem(CONN *conn, caddr_t buf,
262	struct mrc buf_handle);
263static rdma_stat rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp,
264		caddr_t buf, uint_t buflen, struct mrc *buf_handle);
265static rdma_stat rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf,
266		struct mrc buf_handle);
267static rdma_stat rib_registermemsync(CONN *conn,  caddr_t adsp, caddr_t buf,
268	uint_t buflen, struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle,
269	void *lrc);
270static rdma_stat rib_deregistermemsync(CONN *conn, caddr_t buf,
271	struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle, void *);
272static rdma_stat rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle,
273	caddr_t buf, int len, int cpu);
274
275static rdma_stat rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf);
276
277static void rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf);
278static void *rib_rbuf_alloc(CONN *, rdma_buf_t *);
279
280static void rib_rbuf_free(CONN *conn, int ptype, void *buf);
281
282static rdma_stat rib_send(CONN *conn, struct clist *cl, uint32_t msgid);
283static rdma_stat rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid);
284static rdma_stat rib_post_resp(CONN *conn, struct clist *cl, uint32_t msgid);
285static rdma_stat rib_post_resp_remove(CONN *conn, uint32_t msgid);
286static rdma_stat rib_post_recv(CONN *conn, struct clist *cl);
287static rdma_stat rib_recv(CONN *conn, struct clist **clp, uint32_t msgid);
288static rdma_stat rib_read(CONN *conn, struct clist *cl, int wait);
289static rdma_stat rib_write(CONN *conn, struct clist *cl, int wait);
290static rdma_stat rib_ping_srv(int addr_type, struct netbuf *, rib_hca_t **);
291static rdma_stat rib_conn_get(struct netbuf *, int addr_type, void *, CONN **);
292static rdma_stat rib_conn_release(CONN *conn);
293static rdma_stat rib_getinfo(rdma_info_t *info);
294
295static rib_lrc_entry_t *rib_get_cache_buf(CONN *conn, uint32_t len);
296static void rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *buf);
297static void rib_destroy_cache(rib_hca_t *hca);
298static	void	rib_server_side_cache_reclaim(void *argp);
299static int avl_compare(const void *t1, const void *t2);
300
301static void rib_stop_services(rib_hca_t *);
302static void rib_close_channels(rib_conn_list_t *);
303
304/*
305 * RPCIB addressing operations
306 */
307
308/*
309 * RDMA operations the RPCIB module exports
310 */
311static rdmaops_t rib_ops = {
312	rib_reachable,
313	rib_conn_get,
314	rib_conn_release,
315	rib_listen,
316	rib_listen_stop,
317	rib_registermem,
318	rib_deregistermem,
319	rib_registermemsync,
320	rib_deregistermemsync,
321	rib_syncmem,
322	rib_reg_buf_alloc,
323	rib_reg_buf_free,
324	rib_send,
325	rib_send_resp,
326	rib_post_resp,
327	rib_post_resp_remove,
328	rib_post_recv,
329	rib_recv,
330	rib_read,
331	rib_write,
332	rib_getinfo,
333};
334
335/*
336 * RDMATF RPCIB plugin details
337 */
338static rdma_mod_t rib_mod = {
339	"ibtf",		/* api name */
340	RDMATF_VERS_1,
341	0,
342	&rib_ops,	/* rdma op vector for ibtf */
343};
344
345static rdma_stat open_hcas(rpcib_state_t *);
346static rdma_stat rib_qp_init(rib_qp_t *, int);
347static void rib_svc_scq_handler(ibt_cq_hdl_t, void *);
348static void rib_clnt_scq_handler(ibt_cq_hdl_t, void *);
349static void rib_clnt_rcq_handler(ibt_cq_hdl_t, void *);
350static void rib_svc_rcq_handler(ibt_cq_hdl_t, void *);
351static rib_bufpool_t *rib_rbufpool_create(rib_hca_t *hca, int ptype, int num);
352static rdma_stat rib_reg_mem(rib_hca_t *, caddr_t adsp, caddr_t, uint_t,
353	ibt_mr_flags_t, ibt_mr_hdl_t *, ibt_mr_desc_t *);
354static rdma_stat rib_reg_mem_user(rib_hca_t *, caddr_t, uint_t, ibt_mr_flags_t,
355	ibt_mr_hdl_t *, ibt_mr_desc_t *, caddr_t);
356static rdma_stat rib_conn_to_srv(rib_hca_t *, rib_qp_t *, ibt_path_info_t *,
357	ibt_ip_addr_t *, ibt_ip_addr_t *);
358static rdma_stat rib_clnt_create_chan(rib_hca_t *, struct netbuf *,
359	rib_qp_t **);
360static rdma_stat rib_svc_create_chan(rib_hca_t *, caddr_t, uint8_t,
361	rib_qp_t **);
362static rdma_stat rib_sendwait(rib_qp_t *, struct send_wid *);
363static struct send_wid *rib_init_sendwait(uint32_t, int, rib_qp_t *);
364static int rib_free_sendwait(struct send_wid *);
365static struct rdma_done_list *rdma_done_add(rib_qp_t *qp, uint32_t xid);
366static void rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd);
367static void rdma_done_rem_list(rib_qp_t *);
368static void rdma_done_notify(rib_qp_t *qp, uint32_t xid);
369
370static void rib_async_handler(void *,
371	ibt_hca_hdl_t, ibt_async_code_t, ibt_async_event_t *);
372static rdma_stat rib_rem_rep(rib_qp_t *, struct reply *);
373static struct svc_recv *rib_init_svc_recv(rib_qp_t *, ibt_wr_ds_t *);
374static int rib_free_svc_recv(struct svc_recv *);
375static struct recv_wid *rib_create_wid(rib_qp_t *, ibt_wr_ds_t *, uint32_t);
376static void rib_free_wid(struct recv_wid *);
377static rdma_stat rib_disconnect_channel(CONN *, rib_conn_list_t *);
378static void rib_detach_hca(rib_hca_t *);
379static rdma_stat rib_chk_srv_ibaddr(struct netbuf *, int,
380	ibt_path_info_t *, ibt_ip_addr_t *, ibt_ip_addr_t *);
381
382/*
383 * Registration with IBTF as a consumer
384 */
385static struct ibt_clnt_modinfo_s rib_modinfo = {
386	IBTI_V2,
387	IBT_GENERIC,
388	rib_async_handler,	/* async event handler */
389	NULL,			/* Memory Region Handler */
390	"nfs/ib"
391};
392
393/*
394 * Global strucuture
395 */
396
397typedef struct rpcib_s {
398	dev_info_t	*rpcib_dip;
399	kmutex_t	rpcib_mutex;
400} rpcib_t;
401
402rpcib_t rpcib;
403
404/*
405 * /etc/system controlled variable to control
406 * debugging in rpcib kernel module.
407 * Set it to values greater that 1 to control
408 * the amount of debugging messages required.
409 */
410int rib_debug = 0;
411
412
413int
414_init(void)
415{
416	int		error;
417	int ret;
418
419	error = mod_install((struct modlinkage *)&rib_modlinkage);
420	if (error != 0) {
421		/*
422		 * Could not load module
423		 */
424		return (error);
425	}
426	ret = ldi_ident_from_mod(&rib_modlinkage, &rpcib_li);
427	if (ret != 0)
428		rpcib_li = NULL;
429	mutex_init(&plugin_state_lock, NULL, MUTEX_DRIVER, NULL);
430
431	return (0);
432}
433
434int
435_fini()
436{
437	int status;
438
439	if ((status = rdma_unregister_mod(&rib_mod)) != RDMA_SUCCESS) {
440		return (EBUSY);
441	}
442
443	/*
444	 * Remove module
445	 */
446	if ((status = mod_remove(&rib_modlinkage)) != 0) {
447		(void) rdma_register_mod(&rib_mod);
448		return (status);
449	}
450	mutex_destroy(&plugin_state_lock);
451	ldi_ident_release(rpcib_li);
452	return (0);
453}
454
455int
456_info(struct modinfo *modinfop)
457{
458	return (mod_info(&rib_modlinkage, modinfop));
459}
460
461
462/*
463 * rpcib_getinfo()
464 * Given the device number, return the devinfo pointer or the
465 * instance number.
466 * Note: always succeed DDI_INFO_DEVT2INSTANCE, even before attach.
467 */
468
469/*ARGSUSED*/
470static int
471rpcib_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
472{
473	int ret = DDI_SUCCESS;
474
475	switch (cmd) {
476	case DDI_INFO_DEVT2DEVINFO:
477		if (rpcib.rpcib_dip != NULL)
478			*result = rpcib.rpcib_dip;
479		else {
480			*result = NULL;
481			ret = DDI_FAILURE;
482		}
483		break;
484
485	case DDI_INFO_DEVT2INSTANCE:
486		*result = NULL;
487		break;
488
489	default:
490		ret = DDI_FAILURE;
491	}
492	return (ret);
493}
494
495static int
496rpcib_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
497{
498	ibt_status_t	ibt_status;
499	rdma_stat	r_status;
500
501	switch (cmd) {
502	case DDI_ATTACH:
503		break;
504	case DDI_RESUME:
505		return (DDI_SUCCESS);
506	default:
507		return (DDI_FAILURE);
508	}
509
510	mutex_init(&rpcib.rpcib_mutex, NULL, MUTEX_DRIVER, NULL);
511
512	mutex_enter(&rpcib.rpcib_mutex);
513	if (rpcib.rpcib_dip != NULL) {
514		mutex_exit(&rpcib.rpcib_mutex);
515		return (DDI_FAILURE);
516	}
517	rpcib.rpcib_dip = dip;
518	mutex_exit(&rpcib.rpcib_mutex);
519	/*
520	 * Create the "rpcib" minor-node.
521	 */
522	if (ddi_create_minor_node(dip,
523	    "rpcib", S_IFCHR, 0, DDI_PSEUDO, 0) != DDI_SUCCESS) {
524		/* Error message, no cmn_err as they print on console */
525		return (DDI_FAILURE);
526	}
527
528	if (rib_stat == NULL) {
529		rib_stat = kmem_zalloc(sizeof (*rib_stat), KM_SLEEP);
530		mutex_init(&rib_stat->open_hca_lock, NULL, MUTEX_DRIVER, NULL);
531	}
532
533	rib_stat->hca_count = ibt_get_hca_list(&rib_stat->hca_guids);
534	if (rib_stat->hca_count < 1) {
535		mutex_destroy(&rib_stat->open_hca_lock);
536		kmem_free(rib_stat, sizeof (*rib_stat));
537		rib_stat = NULL;
538		return (DDI_FAILURE);
539	}
540
541	ibt_status = ibt_attach(&rib_modinfo, dip,
542	    (void *)rib_stat, &rib_stat->ibt_clnt_hdl);
543
544	if (ibt_status != IBT_SUCCESS) {
545		ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count);
546		mutex_destroy(&rib_stat->open_hca_lock);
547		kmem_free(rib_stat, sizeof (*rib_stat));
548		rib_stat = NULL;
549		return (DDI_FAILURE);
550	}
551
552	mutex_enter(&rib_stat->open_hca_lock);
553	if (open_hcas(rib_stat) != RDMA_SUCCESS) {
554		ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count);
555		(void) ibt_detach(rib_stat->ibt_clnt_hdl);
556		mutex_exit(&rib_stat->open_hca_lock);
557		mutex_destroy(&rib_stat->open_hca_lock);
558		kmem_free(rib_stat, sizeof (*rib_stat));
559		rib_stat = NULL;
560		return (DDI_FAILURE);
561	}
562	mutex_exit(&rib_stat->open_hca_lock);
563
564	/*
565	 * Register with rdmatf
566	 */
567	rib_mod.rdma_count = rib_stat->hca_count;
568	r_status = rdma_register_mod(&rib_mod);
569	if (r_status != RDMA_SUCCESS && r_status != RDMA_REG_EXIST) {
570		rib_detach_hca(rib_stat->hca);
571		ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count);
572		(void) ibt_detach(rib_stat->ibt_clnt_hdl);
573		mutex_destroy(&rib_stat->open_hca_lock);
574		kmem_free(rib_stat, sizeof (*rib_stat));
575		rib_stat = NULL;
576		return (DDI_FAILURE);
577	}
578
579
580	return (DDI_SUCCESS);
581}
582
583/*ARGSUSED*/
584static int
585rpcib_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
586{
587	switch (cmd) {
588
589	case DDI_DETACH:
590		break;
591
592	case DDI_SUSPEND:
593	default:
594		return (DDI_FAILURE);
595	}
596
597	/*
598	 * Detach the hca and free resources
599	 */
600	mutex_enter(&plugin_state_lock);
601	plugin_state = NO_ACCEPT;
602	mutex_exit(&plugin_state_lock);
603	rib_detach_hca(rib_stat->hca);
604	ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count);
605	(void) ibt_detach(rib_stat->ibt_clnt_hdl);
606
607	mutex_enter(&rpcib.rpcib_mutex);
608	rpcib.rpcib_dip = NULL;
609	mutex_exit(&rpcib.rpcib_mutex);
610
611	mutex_destroy(&rpcib.rpcib_mutex);
612	return (DDI_SUCCESS);
613}
614
615
616static void rib_rbufpool_free(rib_hca_t *, int);
617static void rib_rbufpool_deregister(rib_hca_t *, int);
618static void rib_rbufpool_destroy(rib_hca_t *hca, int ptype);
619static struct reply *rib_addreplylist(rib_qp_t *, uint32_t);
620static rdma_stat rib_rem_replylist(rib_qp_t *);
621static int rib_remreply(rib_qp_t *, struct reply *);
622static rdma_stat rib_add_connlist(CONN *, rib_conn_list_t *);
623static rdma_stat rib_rm_conn(CONN *, rib_conn_list_t *);
624
625
626/*
627 * One CQ pair per HCA
628 */
629static rdma_stat
630rib_create_cq(rib_hca_t *hca, uint32_t cq_size, ibt_cq_handler_t cq_handler,
631	rib_cq_t **cqp, rpcib_state_t *ribstat)
632{
633	rib_cq_t	*cq;
634	ibt_cq_attr_t	cq_attr;
635	uint32_t	real_size;
636	ibt_status_t	status;
637	rdma_stat	error = RDMA_SUCCESS;
638
639	cq = kmem_zalloc(sizeof (rib_cq_t), KM_SLEEP);
640	cq->rib_hca = hca;
641	cq_attr.cq_size = cq_size;
642	cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
643	status = ibt_alloc_cq(hca->hca_hdl, &cq_attr, &cq->rib_cq_hdl,
644	    &real_size);
645	if (status != IBT_SUCCESS) {
646		cmn_err(CE_WARN, "rib_create_cq: ibt_alloc_cq() failed,"
647		    " status=%d", status);
648		error = RDMA_FAILED;
649		goto fail;
650	}
651	ibt_set_cq_handler(cq->rib_cq_hdl, cq_handler, ribstat);
652
653	/*
654	 * Enable CQ callbacks. CQ Callbacks are single shot
655	 * (e.g. you have to call ibt_enable_cq_notify()
656	 * after each callback to get another one).
657	 */
658	status = ibt_enable_cq_notify(cq->rib_cq_hdl, IBT_NEXT_COMPLETION);
659	if (status != IBT_SUCCESS) {
660		cmn_err(CE_WARN, "rib_create_cq: "
661		    "enable_cq_notify failed, status %d", status);
662		error = RDMA_FAILED;
663		goto fail;
664	}
665	*cqp = cq;
666
667	return (error);
668fail:
669	if (cq->rib_cq_hdl)
670		(void) ibt_free_cq(cq->rib_cq_hdl);
671	if (cq)
672		kmem_free(cq, sizeof (rib_cq_t));
673	return (error);
674}
675
676static rdma_stat
677open_hcas(rpcib_state_t *ribstat)
678{
679	rib_hca_t		*hca;
680	ibt_status_t		ibt_status;
681	rdma_stat		status;
682	ibt_hca_portinfo_t	*pinfop;
683	ibt_pd_flags_t		pd_flags = IBT_PD_NO_FLAGS;
684	uint_t			size, cq_size;
685	int			i;
686	kstat_t *ksp;
687	cache_avl_struct_t example_avl_node;
688	char rssc_name[32];
689
690	ASSERT(MUTEX_HELD(&ribstat->open_hca_lock));
691
692	if (ribstat->hcas == NULL)
693		ribstat->hcas = kmem_zalloc(ribstat->hca_count *
694		    sizeof (rib_hca_t), KM_SLEEP);
695
696	/*
697	 * Open a hca and setup for RDMA
698	 */
699	for (i = 0; i < ribstat->hca_count; i++) {
700		ibt_status = ibt_open_hca(ribstat->ibt_clnt_hdl,
701		    ribstat->hca_guids[i],
702		    &ribstat->hcas[i].hca_hdl);
703		if (ibt_status != IBT_SUCCESS) {
704			continue;
705		}
706		ribstat->hcas[i].hca_guid = ribstat->hca_guids[i];
707		hca = &(ribstat->hcas[i]);
708		hca->ibt_clnt_hdl = ribstat->ibt_clnt_hdl;
709		hca->state = HCA_INITED;
710
711		/*
712		 * query HCA info
713		 */
714		ibt_status = ibt_query_hca(hca->hca_hdl, &hca->hca_attrs);
715		if (ibt_status != IBT_SUCCESS) {
716			goto fail1;
717		}
718
719		/*
720		 * One PD (Protection Domain) per HCA.
721		 * A qp is allowed to access a memory region
722		 * only when it's in the same PD as that of
723		 * the memory region.
724		 */
725		ibt_status = ibt_alloc_pd(hca->hca_hdl, pd_flags, &hca->pd_hdl);
726		if (ibt_status != IBT_SUCCESS) {
727			goto fail1;
728		}
729
730		/*
731		 * query HCA ports
732		 */
733		ibt_status = ibt_query_hca_ports(hca->hca_hdl,
734		    0, &pinfop, &hca->hca_nports, &size);
735		if (ibt_status != IBT_SUCCESS) {
736			goto fail2;
737		}
738		hca->hca_ports = pinfop;
739		hca->hca_pinfosz = size;
740		pinfop = NULL;
741
742		cq_size = DEF_CQ_SIZE; /* default cq size */
743		/*
744		 * Create 2 pairs of cq's (1 pair for client
745		 * and the other pair for server) on this hca.
746		 * If number of qp's gets too large, then several
747		 * cq's will be needed.
748		 */
749		status = rib_create_cq(hca, cq_size, rib_svc_rcq_handler,
750		    &hca->svc_rcq, ribstat);
751		if (status != RDMA_SUCCESS) {
752			goto fail3;
753		}
754
755		status = rib_create_cq(hca, cq_size, rib_svc_scq_handler,
756		    &hca->svc_scq, ribstat);
757		if (status != RDMA_SUCCESS) {
758			goto fail3;
759		}
760
761		status = rib_create_cq(hca, cq_size, rib_clnt_rcq_handler,
762		    &hca->clnt_rcq, ribstat);
763		if (status != RDMA_SUCCESS) {
764			goto fail3;
765		}
766
767		status = rib_create_cq(hca, cq_size, rib_clnt_scq_handler,
768		    &hca->clnt_scq, ribstat);
769		if (status != RDMA_SUCCESS) {
770			goto fail3;
771		}
772
773		/*
774		 * Create buffer pools.
775		 * Note rib_rbuf_create also allocates memory windows.
776		 */
777		hca->recv_pool = rib_rbufpool_create(hca,
778		    RECV_BUFFER, MAX_BUFS);
779		if (hca->recv_pool == NULL) {
780			goto fail3;
781		}
782
783		hca->send_pool = rib_rbufpool_create(hca,
784		    SEND_BUFFER, MAX_BUFS);
785		if (hca->send_pool == NULL) {
786			rib_rbufpool_destroy(hca, RECV_BUFFER);
787			goto fail3;
788		}
789
790		if (hca->server_side_cache == NULL) {
791			(void) sprintf(rssc_name,
792			    "rib_server_side_cache_%04d", i);
793			hca->server_side_cache = kmem_cache_create(
794			    rssc_name,
795			    sizeof (cache_avl_struct_t), 0,
796			    NULL,
797			    NULL,
798			    rib_server_side_cache_reclaim,
799			    hca, NULL, 0);
800		}
801
802		avl_create(&hca->avl_tree,
803		    avl_compare,
804		    sizeof (cache_avl_struct_t),
805		    (uint_t)(uintptr_t)&example_avl_node.avl_link-
806		    (uint_t)(uintptr_t)&example_avl_node);
807
808		rw_init(&hca->avl_rw_lock,
809		    NULL, RW_DRIVER, hca->iblock);
810		mutex_init(&hca->cache_allocation,
811		    NULL, MUTEX_DRIVER, NULL);
812		hca->avl_init = TRUE;
813
814		/* Create kstats for the cache */
815		ASSERT(INGLOBALZONE(curproc));
816
817		if (!stats_enabled) {
818			ksp = kstat_create_zone("unix", 0, "rpcib_cache", "rpc",
819			    KSTAT_TYPE_NAMED,
820			    sizeof (rpcib_kstat) / sizeof (kstat_named_t),
821			    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE,
822			    GLOBAL_ZONEID);
823			if (ksp) {
824				ksp->ks_data = (void *) &rpcib_kstat;
825				ksp->ks_update = rpcib_cache_kstat_update;
826				kstat_install(ksp);
827				stats_enabled = TRUE;
828			}
829		}
830		if (NULL == hca->reg_cache_clean_up) {
831			hca->reg_cache_clean_up = ddi_taskq_create(NULL,
832			    "REG_CACHE_CLEANUP", 1, TASKQ_DEFAULTPRI, 0);
833		}
834
835		/*
836		 * Initialize the registered service list and
837		 * the lock
838		 */
839		hca->service_list = NULL;
840		rw_init(&hca->service_list_lock, NULL, RW_DRIVER, hca->iblock);
841
842		mutex_init(&hca->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
843		cv_init(&hca->cb_cv, NULL, CV_DRIVER, NULL);
844		rw_init(&hca->cl_conn_list.conn_lock, NULL, RW_DRIVER,
845		    hca->iblock);
846		rw_init(&hca->srv_conn_list.conn_lock, NULL, RW_DRIVER,
847		    hca->iblock);
848		rw_init(&hca->state_lock, NULL, RW_DRIVER, hca->iblock);
849		mutex_init(&hca->inuse_lock, NULL, MUTEX_DRIVER, hca->iblock);
850		hca->inuse = TRUE;
851		/*
852		 * XXX One hca only. Add multi-hca functionality if needed
853		 * later.
854		 */
855		ribstat->hca = hca;
856		ribstat->nhca_inited++;
857		ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
858		break;
859
860fail3:
861		ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
862fail2:
863		(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
864fail1:
865		(void) ibt_close_hca(hca->hca_hdl);
866
867	}
868	if (ribstat->hca != NULL)
869		return (RDMA_SUCCESS);
870	else
871		return (RDMA_FAILED);
872}
873
874/*
875 * Callback routines
876 */
877
878/*
879 * SCQ handlers
880 */
881/* ARGSUSED */
882static void
883rib_clnt_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
884{
885	ibt_status_t	ibt_status;
886	ibt_wc_t	wc;
887	int		i;
888
889	/*
890	 * Re-enable cq notify here to avoid missing any
891	 * completion queue notification.
892	 */
893	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
894
895	ibt_status = IBT_SUCCESS;
896	while (ibt_status != IBT_CQ_EMPTY) {
897	bzero(&wc, sizeof (wc));
898	ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
899	if (ibt_status != IBT_SUCCESS)
900		return;
901
902	/*
903	 * Got a send completion
904	 */
905	if (wc.wc_id != NULL) {	/* XXX can it be otherwise ???? */
906		struct send_wid *wd = (struct send_wid *)(uintptr_t)wc.wc_id;
907		CONN	*conn = qptoc(wd->qp);
908
909		mutex_enter(&wd->sendwait_lock);
910		switch (wc.wc_status) {
911		case IBT_WC_SUCCESS:
912			wd->status = RDMA_SUCCESS;
913			break;
914		case IBT_WC_WR_FLUSHED_ERR:
915			wd->status = RDMA_FAILED;
916			break;
917		default:
918/*
919 *    RC Send Q Error Code		Local state     Remote State
920 *    ==================== 		===========     ============
921 *    IBT_WC_BAD_RESPONSE_ERR             ERROR           None
922 *    IBT_WC_LOCAL_LEN_ERR                ERROR           None
923 *    IBT_WC_LOCAL_CHAN_OP_ERR            ERROR           None
924 *    IBT_WC_LOCAL_PROTECT_ERR            ERROR           None
925 *    IBT_WC_MEM_WIN_BIND_ERR             ERROR           None
926 *    IBT_WC_REMOTE_INVALID_REQ_ERR       ERROR           ERROR
927 *    IBT_WC_REMOTE_ACCESS_ERR            ERROR           ERROR
928 *    IBT_WC_REMOTE_OP_ERR                ERROR           ERROR
929 *    IBT_WC_RNR_NAK_TIMEOUT_ERR          ERROR           None
930 *    IBT_WC_TRANS_TIMEOUT_ERR            ERROR           None
931 *    IBT_WC_WR_FLUSHED_ERR               None            None
932 */
933			/*
934			 * Channel in error state. Set connection to
935			 * ERROR and cleanup will happen either from
936			 * conn_release  or from rib_conn_get
937			 */
938			wd->status = RDMA_FAILED;
939			mutex_enter(&conn->c_lock);
940			if (conn->c_state != C_DISCONN_PEND)
941				conn->c_state = C_ERROR_CONN;
942			mutex_exit(&conn->c_lock);
943			break;
944		}
945
946		if (wd->cv_sig == 1) {
947			/*
948			 * Notify poster
949			 */
950			cv_signal(&wd->wait_cv);
951			mutex_exit(&wd->sendwait_lock);
952		} else {
953			/*
954			 * Poster not waiting for notification.
955			 * Free the send buffers and send_wid
956			 */
957			for (i = 0; i < wd->nsbufs; i++) {
958				rib_rbuf_free(qptoc(wd->qp), SEND_BUFFER,
959				    (void *)(uintptr_t)wd->sbufaddr[i]);
960				}
961			mutex_exit(&wd->sendwait_lock);
962			(void) rib_free_sendwait(wd);
963			}
964		}
965	}
966}
967
968/* ARGSUSED */
969static void
970rib_svc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
971{
972	ibt_status_t	ibt_status;
973	ibt_wc_t	wc;
974	int		i;
975
976	/*
977	 * Re-enable cq notify here to avoid missing any
978	 * completion queue notification.
979	 */
980	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
981
982	ibt_status = IBT_SUCCESS;
983	while (ibt_status != IBT_CQ_EMPTY) {
984		bzero(&wc, sizeof (wc));
985		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
986		if (ibt_status != IBT_SUCCESS)
987			return;
988
989		/*
990		 * Got a send completion
991		 */
992		if (wc.wc_id != NULL) { /* XXX NULL possible ???? */
993			struct send_wid *wd =
994			    (struct send_wid *)(uintptr_t)wc.wc_id;
995			mutex_enter(&wd->sendwait_lock);
996			if (wd->cv_sig == 1) {
997				/*
998				 * Update completion status and notify poster
999				 */
1000				if (wc.wc_status == IBT_WC_SUCCESS)
1001					wd->status = RDMA_SUCCESS;
1002				else
1003					wd->status = RDMA_FAILED;
1004				cv_signal(&wd->wait_cv);
1005				mutex_exit(&wd->sendwait_lock);
1006			} else {
1007				/*
1008				 * Poster not waiting for notification.
1009				 * Free the send buffers and send_wid
1010				 */
1011				for (i = 0; i < wd->nsbufs; i++) {
1012					rib_rbuf_free(qptoc(wd->qp),
1013					    SEND_BUFFER,
1014					    (void *)(uintptr_t)wd->sbufaddr[i]);
1015				}
1016				mutex_exit(&wd->sendwait_lock);
1017				(void) rib_free_sendwait(wd);
1018			}
1019		}
1020	}
1021}
1022
1023/*
1024 * RCQ handler
1025 */
1026/* ARGSUSED */
1027static void
1028rib_clnt_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1029{
1030	rib_qp_t	*qp;
1031	ibt_status_t	ibt_status;
1032	ibt_wc_t	wc;
1033	struct recv_wid	*rwid;
1034
1035	/*
1036	 * Re-enable cq notify here to avoid missing any
1037	 * completion queue notification.
1038	 */
1039	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1040
1041	ibt_status = IBT_SUCCESS;
1042	while (ibt_status != IBT_CQ_EMPTY) {
1043		bzero(&wc, sizeof (wc));
1044		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1045		if (ibt_status != IBT_SUCCESS)
1046			return;
1047
1048		rwid = (struct recv_wid *)(uintptr_t)wc.wc_id;
1049		qp = rwid->qp;
1050		if (wc.wc_status == IBT_WC_SUCCESS) {
1051			XDR	inxdrs, *xdrs;
1052			uint_t	xid, vers, op, find_xid = 0;
1053			struct reply	*r;
1054			CONN *conn = qptoc(qp);
1055			uint32_t rdma_credit = 0;
1056
1057			xdrs = &inxdrs;
1058			xdrmem_create(xdrs, (caddr_t)(uintptr_t)rwid->addr,
1059			    wc.wc_bytes_xfer, XDR_DECODE);
1060			/*
1061			 * Treat xid as opaque (xid is the first entity
1062			 * in the rpc rdma message).
1063			 */
1064			xid = *(uint32_t *)(uintptr_t)rwid->addr;
1065
1066			/* Skip xid and set the xdr position accordingly. */
1067			XDR_SETPOS(xdrs, sizeof (uint32_t));
1068			(void) xdr_u_int(xdrs, &vers);
1069			(void) xdr_u_int(xdrs, &rdma_credit);
1070			(void) xdr_u_int(xdrs, &op);
1071			XDR_DESTROY(xdrs);
1072
1073			if (vers != RPCRDMA_VERS) {
1074				/*
1075				 * Invalid RPC/RDMA version. Cannot
1076				 * interoperate.  Set connection to
1077				 * ERROR state and bail out.
1078				 */
1079				mutex_enter(&conn->c_lock);
1080				if (conn->c_state != C_DISCONN_PEND)
1081					conn->c_state = C_ERROR_CONN;
1082				mutex_exit(&conn->c_lock);
1083				rib_rbuf_free(conn, RECV_BUFFER,
1084				    (void *)(uintptr_t)rwid->addr);
1085				rib_free_wid(rwid);
1086				continue;
1087			}
1088
1089			mutex_enter(&qp->replylist_lock);
1090			for (r = qp->replylist; r != NULL; r = r->next) {
1091				if (r->xid == xid) {
1092					find_xid = 1;
1093					switch (op) {
1094					case RDMA_MSG:
1095					case RDMA_NOMSG:
1096					case RDMA_MSGP:
1097						r->status = RDMA_SUCCESS;
1098						r->vaddr_cq = rwid->addr;
1099						r->bytes_xfer =
1100						    wc.wc_bytes_xfer;
1101						cv_signal(&r->wait_cv);
1102						break;
1103					default:
1104						rib_rbuf_free(qptoc(qp),
1105						    RECV_BUFFER,
1106						    (void *)(uintptr_t)
1107						    rwid->addr);
1108						break;
1109					}
1110					break;
1111				}
1112			}
1113			mutex_exit(&qp->replylist_lock);
1114			if (find_xid == 0) {
1115				/* RPC caller not waiting for reply */
1116
1117				DTRACE_PROBE1(rpcib__i__nomatchxid1,
1118				    int, xid);
1119
1120				rib_rbuf_free(qptoc(qp), RECV_BUFFER,
1121				    (void *)(uintptr_t)rwid->addr);
1122			}
1123		} else if (wc.wc_status == IBT_WC_WR_FLUSHED_ERR) {
1124			CONN *conn = qptoc(qp);
1125
1126			/*
1127			 * Connection being flushed. Just free
1128			 * the posted buffer
1129			 */
1130			rib_rbuf_free(conn, RECV_BUFFER,
1131			    (void *)(uintptr_t)rwid->addr);
1132		} else {
1133			CONN *conn = qptoc(qp);
1134/*
1135 *  RC Recv Q Error Code		Local state     Remote State
1136 *  ====================		===========     ============
1137 *  IBT_WC_LOCAL_ACCESS_ERR             ERROR           ERROR when NAK recvd
1138 *  IBT_WC_LOCAL_LEN_ERR                ERROR           ERROR when NAK recvd
1139 *  IBT_WC_LOCAL_PROTECT_ERR            ERROR           ERROR when NAK recvd
1140 *  IBT_WC_LOCAL_CHAN_OP_ERR            ERROR           ERROR when NAK recvd
1141 *  IBT_WC_REMOTE_INVALID_REQ_ERR       ERROR           ERROR when NAK recvd
1142 *  IBT_WC_WR_FLUSHED_ERR               None            None
1143 */
1144			/*
1145			 * Channel in error state. Set connection
1146			 * in ERROR state.
1147			 */
1148			mutex_enter(&conn->c_lock);
1149			if (conn->c_state != C_DISCONN_PEND)
1150				conn->c_state = C_ERROR_CONN;
1151			mutex_exit(&conn->c_lock);
1152			rib_rbuf_free(conn, RECV_BUFFER,
1153			    (void *)(uintptr_t)rwid->addr);
1154		}
1155		rib_free_wid(rwid);
1156	}
1157}
1158
1159/* Server side */
1160/* ARGSUSED */
1161static void
1162rib_svc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1163{
1164	rdma_recv_data_t *rdp;
1165	rib_qp_t	*qp;
1166	ibt_status_t	ibt_status;
1167	ibt_wc_t	wc;
1168	struct svc_recv	*s_recvp;
1169	CONN		*conn;
1170	mblk_t		*mp;
1171
1172	/*
1173	 * Re-enable cq notify here to avoid missing any
1174	 * completion queue notification.
1175	 */
1176	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1177
1178	ibt_status = IBT_SUCCESS;
1179	while (ibt_status != IBT_CQ_EMPTY) {
1180		bzero(&wc, sizeof (wc));
1181		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1182		if (ibt_status != IBT_SUCCESS)
1183			return;
1184
1185		s_recvp = (struct svc_recv *)(uintptr_t)wc.wc_id;
1186		qp = s_recvp->qp;
1187		conn = qptoc(qp);
1188		mutex_enter(&qp->posted_rbufs_lock);
1189		qp->n_posted_rbufs--;
1190#if defined(MEASURE_POOL_DEPTH)
1191		rib_posted_rbufs(preposted_rbufs -  qp->n_posted_rbufs);
1192#endif
1193		if (qp->n_posted_rbufs == 0)
1194			cv_signal(&qp->posted_rbufs_cv);
1195		mutex_exit(&qp->posted_rbufs_lock);
1196
1197		if (wc.wc_status == IBT_WC_SUCCESS) {
1198			XDR	inxdrs, *xdrs;
1199			uint_t	xid, vers, op;
1200			uint32_t rdma_credit;
1201
1202			xdrs = &inxdrs;
1203			/* s_recvp->vaddr stores data */
1204			xdrmem_create(xdrs, (caddr_t)(uintptr_t)s_recvp->vaddr,
1205			    wc.wc_bytes_xfer, XDR_DECODE);
1206
1207			/*
1208			 * Treat xid as opaque (xid is the first entity
1209			 * in the rpc rdma message).
1210			 */
1211			xid = *(uint32_t *)(uintptr_t)s_recvp->vaddr;
1212			/* Skip xid and set the xdr position accordingly. */
1213			XDR_SETPOS(xdrs, sizeof (uint32_t));
1214			if (!xdr_u_int(xdrs, &vers) ||
1215			    !xdr_u_int(xdrs, &rdma_credit) ||
1216			    !xdr_u_int(xdrs, &op)) {
1217				rib_rbuf_free(conn, RECV_BUFFER,
1218				    (void *)(uintptr_t)s_recvp->vaddr);
1219				XDR_DESTROY(xdrs);
1220				(void) rib_free_svc_recv(s_recvp);
1221				continue;
1222			}
1223			XDR_DESTROY(xdrs);
1224
1225			if (vers != RPCRDMA_VERS) {
1226				/*
1227				 * Invalid RPC/RDMA version.
1228				 * Drop rpc rdma message.
1229				 */
1230				rib_rbuf_free(conn, RECV_BUFFER,
1231				    (void *)(uintptr_t)s_recvp->vaddr);
1232				(void) rib_free_svc_recv(s_recvp);
1233				continue;
1234			}
1235			/*
1236			 * Is this for RDMA_DONE?
1237			 */
1238			if (op == RDMA_DONE) {
1239				rib_rbuf_free(conn, RECV_BUFFER,
1240				    (void *)(uintptr_t)s_recvp->vaddr);
1241				/*
1242				 * Wake up the thread waiting on
1243				 * a RDMA_DONE for xid
1244				 */
1245				mutex_enter(&qp->rdlist_lock);
1246				rdma_done_notify(qp, xid);
1247				mutex_exit(&qp->rdlist_lock);
1248				(void) rib_free_svc_recv(s_recvp);
1249				continue;
1250			}
1251
1252			mutex_enter(&plugin_state_lock);
1253			if (plugin_state == ACCEPT) {
1254				while ((mp = allocb(sizeof (*rdp), BPRI_LO))
1255				    == NULL)
1256					(void) strwaitbuf(
1257					    sizeof (*rdp), BPRI_LO);
1258				/*
1259				 * Plugin is in accept state, hence the master
1260				 * transport queue for this is still accepting
1261				 * requests. Hence we can call svc_queuereq to
1262				 * queue this recieved msg.
1263				 */
1264				rdp = (rdma_recv_data_t *)mp->b_rptr;
1265				rdp->conn = conn;
1266				rdp->rpcmsg.addr =
1267				    (caddr_t)(uintptr_t)s_recvp->vaddr;
1268				rdp->rpcmsg.type = RECV_BUFFER;
1269				rdp->rpcmsg.len = wc.wc_bytes_xfer;
1270				rdp->status = wc.wc_status;
1271				mutex_enter(&conn->c_lock);
1272				conn->c_ref++;
1273				mutex_exit(&conn->c_lock);
1274				mp->b_wptr += sizeof (*rdp);
1275				svc_queuereq((queue_t *)rib_stat->q, mp);
1276				mutex_exit(&plugin_state_lock);
1277			} else {
1278				/*
1279				 * The master transport for this is going
1280				 * away and the queue is not accepting anymore
1281				 * requests for krpc, so don't do anything, just
1282				 * free the msg.
1283				 */
1284				mutex_exit(&plugin_state_lock);
1285				rib_rbuf_free(conn, RECV_BUFFER,
1286				    (void *)(uintptr_t)s_recvp->vaddr);
1287			}
1288		} else {
1289			rib_rbuf_free(conn, RECV_BUFFER,
1290			    (void *)(uintptr_t)s_recvp->vaddr);
1291		}
1292		(void) rib_free_svc_recv(s_recvp);
1293	}
1294}
1295
1296/*
1297 * Handles DR event of IBT_HCA_DETACH_EVENT.
1298 */
1299/* ARGSUSED */
1300static void
1301rib_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
1302	ibt_async_code_t code, ibt_async_event_t *event)
1303{
1304
1305	switch (code) {
1306	case IBT_HCA_ATTACH_EVENT:
1307		/* ignore */
1308		break;
1309	case IBT_HCA_DETACH_EVENT:
1310	{
1311		ASSERT(rib_stat->hca->hca_hdl == hca_hdl);
1312		rib_detach_hca(rib_stat->hca);
1313#ifdef DEBUG
1314		cmn_err(CE_NOTE, "rib_async_handler(): HCA being detached!\n");
1315#endif
1316		break;
1317	}
1318#ifdef DEBUG
1319	case IBT_EVENT_PATH_MIGRATED:
1320		cmn_err(CE_NOTE, "rib_async_handler(): "
1321		    "IBT_EVENT_PATH_MIGRATED\n");
1322		break;
1323	case IBT_EVENT_SQD:
1324		cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_SQD\n");
1325		break;
1326	case IBT_EVENT_COM_EST:
1327		cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_COM_EST\n");
1328		break;
1329	case IBT_ERROR_CATASTROPHIC_CHAN:
1330		cmn_err(CE_NOTE, "rib_async_handler(): "
1331		    "IBT_ERROR_CATASTROPHIC_CHAN\n");
1332		break;
1333	case IBT_ERROR_INVALID_REQUEST_CHAN:
1334		cmn_err(CE_NOTE, "rib_async_handler(): "
1335		    "IBT_ERROR_INVALID_REQUEST_CHAN\n");
1336		break;
1337	case IBT_ERROR_ACCESS_VIOLATION_CHAN:
1338		cmn_err(CE_NOTE, "rib_async_handler(): "
1339		    "IBT_ERROR_ACCESS_VIOLATION_CHAN\n");
1340		break;
1341	case IBT_ERROR_PATH_MIGRATE_REQ:
1342		cmn_err(CE_NOTE, "rib_async_handler(): "
1343		    "IBT_ERROR_PATH_MIGRATE_REQ\n");
1344		break;
1345	case IBT_ERROR_CQ:
1346		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_CQ\n");
1347		break;
1348	case IBT_ERROR_PORT_DOWN:
1349		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_PORT_DOWN\n");
1350		break;
1351	case IBT_EVENT_PORT_UP:
1352		cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_PORT_UP\n");
1353		break;
1354	case IBT_ASYNC_OPAQUE1:
1355		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE1\n");
1356		break;
1357	case IBT_ASYNC_OPAQUE2:
1358		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE2\n");
1359		break;
1360	case IBT_ASYNC_OPAQUE3:
1361		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE3\n");
1362		break;
1363	case IBT_ASYNC_OPAQUE4:
1364		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE4\n");
1365		break;
1366#endif
1367	default:
1368		break;
1369	}
1370}
1371
1372/*
1373 * Client's reachable function.
1374 */
1375static rdma_stat
1376rib_reachable(int addr_type, struct netbuf *raddr, void **handle)
1377{
1378	rib_hca_t	*hca;
1379	rdma_stat	status;
1380
1381	/*
1382	 * First check if a hca is still attached
1383	 */
1384	*handle = NULL;
1385	rw_enter(&rib_stat->hca->state_lock, RW_READER);
1386	if (rib_stat->hca->state != HCA_INITED) {
1387		rw_exit(&rib_stat->hca->state_lock);
1388		return (RDMA_FAILED);
1389	}
1390	status = rib_ping_srv(addr_type, raddr, &hca);
1391	rw_exit(&rib_stat->hca->state_lock);
1392
1393	if (status == RDMA_SUCCESS) {
1394		*handle = (void *)hca;
1395		return (RDMA_SUCCESS);
1396	} else {
1397		*handle = NULL;
1398		DTRACE_PROBE(rpcib__i__pingfailed);
1399		return (RDMA_FAILED);
1400	}
1401}
1402
1403/* Client side qp creation */
1404static rdma_stat
1405rib_clnt_create_chan(rib_hca_t *hca, struct netbuf *raddr, rib_qp_t **qp)
1406{
1407	rib_qp_t	*kqp = NULL;
1408	CONN		*conn;
1409	rdma_clnt_cred_ctrl_t *cc_info;
1410
1411	ASSERT(qp != NULL);
1412	*qp = NULL;
1413
1414	kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
1415	conn = qptoc(kqp);
1416	kqp->hca = hca;
1417	kqp->rdmaconn.c_rdmamod = &rib_mod;
1418	kqp->rdmaconn.c_private = (caddr_t)kqp;
1419
1420	kqp->mode = RIB_CLIENT;
1421	kqp->chan_flags = IBT_BLOCKING;
1422	conn->c_raddr.buf = kmem_alloc(raddr->len, KM_SLEEP);
1423	bcopy(raddr->buf, conn->c_raddr.buf, raddr->len);
1424	conn->c_raddr.len = conn->c_raddr.maxlen = raddr->len;
1425	/*
1426	 * Initialize
1427	 */
1428	cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
1429	cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
1430	mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1431	mutex_init(&kqp->replylist_lock, NULL, MUTEX_DRIVER, hca->iblock);
1432	mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1433	mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
1434	cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
1435	mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
1436	/*
1437	 * Initialize the client credit control
1438	 * portion of the rdmaconn struct.
1439	 */
1440	kqp->rdmaconn.c_cc_type = RDMA_CC_CLNT;
1441	cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc;
1442	cc_info->clnt_cc_granted_ops = 0;
1443	cc_info->clnt_cc_in_flight_ops = 0;
1444	cv_init(&cc_info->clnt_cc_cv, NULL, CV_DEFAULT, NULL);
1445
1446	*qp = kqp;
1447	return (RDMA_SUCCESS);
1448}
1449
1450/* Server side qp creation */
1451static rdma_stat
1452rib_svc_create_chan(rib_hca_t *hca, caddr_t q, uint8_t port, rib_qp_t **qp)
1453{
1454	rib_qp_t	*kqp = NULL;
1455	ibt_chan_sizes_t	chan_sizes;
1456	ibt_rc_chan_alloc_args_t	qp_attr;
1457	ibt_status_t		ibt_status;
1458	rdma_srv_cred_ctrl_t *cc_info;
1459
1460	*qp = NULL;
1461
1462	kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
1463	kqp->hca = hca;
1464	kqp->port_num = port;
1465	kqp->rdmaconn.c_rdmamod = &rib_mod;
1466	kqp->rdmaconn.c_private = (caddr_t)kqp;
1467
1468	/*
1469	 * Create the qp handle
1470	 */
1471	bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
1472	qp_attr.rc_scq = hca->svc_scq->rib_cq_hdl;
1473	qp_attr.rc_rcq = hca->svc_rcq->rib_cq_hdl;
1474	qp_attr.rc_pd = hca->pd_hdl;
1475	qp_attr.rc_hca_port_num = port;
1476	qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
1477	qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
1478	qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
1479	qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
1480	qp_attr.rc_clone_chan = NULL;
1481	qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
1482	qp_attr.rc_flags = IBT_WR_SIGNALED;
1483
1484	rw_enter(&hca->state_lock, RW_READER);
1485	if (hca->state != HCA_DETACHED) {
1486		ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
1487		    IBT_ACHAN_NO_FLAGS, &qp_attr, &kqp->qp_hdl,
1488		    &chan_sizes);
1489	} else {
1490		rw_exit(&hca->state_lock);
1491		goto fail;
1492	}
1493	rw_exit(&hca->state_lock);
1494
1495	if (ibt_status != IBT_SUCCESS) {
1496		DTRACE_PROBE1(rpcib__i_svccreatechanfail,
1497		    int, ibt_status);
1498		goto fail;
1499	}
1500
1501	kqp->mode = RIB_SERVER;
1502	kqp->chan_flags = IBT_BLOCKING;
1503	kqp->q = q;	/* server ONLY */
1504
1505	cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
1506	cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
1507	mutex_init(&kqp->replylist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1508	mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1509	mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1510	mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
1511	cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
1512	mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
1513	/*
1514	 * Set the private data area to qp to be used in callbacks
1515	 */
1516	ibt_set_chan_private(kqp->qp_hdl, (void *)kqp);
1517	kqp->rdmaconn.c_state = C_CONNECTED;
1518
1519	/*
1520	 * Initialize the server credit control
1521	 * portion of the rdmaconn struct.
1522	 */
1523	kqp->rdmaconn.c_cc_type = RDMA_CC_SRV;
1524	cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_srv_cc;
1525	cc_info->srv_cc_buffers_granted = preposted_rbufs;
1526	cc_info->srv_cc_cur_buffers_used = 0;
1527	cc_info->srv_cc_posted = preposted_rbufs;
1528
1529	*qp = kqp;
1530
1531	return (RDMA_SUCCESS);
1532fail:
1533	if (kqp)
1534		kmem_free(kqp, sizeof (rib_qp_t));
1535
1536	return (RDMA_FAILED);
1537}
1538
1539/* ARGSUSED */
1540ibt_cm_status_t
1541rib_clnt_cm_handler(void *clnt_hdl, ibt_cm_event_t *event,
1542    ibt_cm_return_args_t *ret_args, void *priv_data,
1543    ibt_priv_data_len_t len)
1544{
1545	rpcib_state_t   *ribstat;
1546	rib_hca_t	*hca;
1547
1548	ribstat = (rpcib_state_t *)clnt_hdl;
1549	hca = (rib_hca_t *)ribstat->hca;
1550
1551	switch (event->cm_type) {
1552
1553	/* got a connection close event */
1554	case IBT_CM_EVENT_CONN_CLOSED:
1555	{
1556		CONN	*conn;
1557		rib_qp_t *qp;
1558
1559		/* check reason why connection was closed */
1560		switch (event->cm_event.closed) {
1561		case IBT_CM_CLOSED_DREP_RCVD:
1562		case IBT_CM_CLOSED_DREQ_TIMEOUT:
1563		case IBT_CM_CLOSED_DUP:
1564		case IBT_CM_CLOSED_ABORT:
1565		case IBT_CM_CLOSED_ALREADY:
1566			/*
1567			 * These cases indicate the local end initiated
1568			 * the closing of the channel. Nothing to do here.
1569			 */
1570			break;
1571		default:
1572			/*
1573			 * Reason for CONN_CLOSED event must be one of
1574			 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
1575			 * or IBT_CM_CLOSED_STALE. These indicate cases were
1576			 * the remote end is closing the channel. In these
1577			 * cases free the channel and transition to error
1578			 * state
1579			 */
1580			qp = ibt_get_chan_private(event->cm_channel);
1581			conn = qptoc(qp);
1582			mutex_enter(&conn->c_lock);
1583			if (conn->c_state == C_DISCONN_PEND) {
1584				mutex_exit(&conn->c_lock);
1585				break;
1586			}
1587
1588			conn->c_state = C_ERROR_CONN;
1589
1590			/*
1591			 * Free the rc_channel. Channel has already
1592			 * transitioned to ERROR state and WRs have been
1593			 * FLUSHED_ERR already.
1594			 */
1595			(void) ibt_free_channel(qp->qp_hdl);
1596			qp->qp_hdl = NULL;
1597
1598			/*
1599			 * Free the conn if c_ref is down to 0 already
1600			 */
1601			if (conn->c_ref == 0) {
1602				/*
1603				 * Remove from list and free conn
1604				 */
1605				conn->c_state = C_DISCONN_PEND;
1606				mutex_exit(&conn->c_lock);
1607				(void) rib_disconnect_channel(conn,
1608				    &hca->cl_conn_list);
1609			} else {
1610				mutex_exit(&conn->c_lock);
1611			}
1612#ifdef DEBUG
1613			if (rib_debug)
1614				cmn_err(CE_NOTE, "rib_clnt_cm_handler: "
1615				    "(CONN_CLOSED) channel disconnected");
1616#endif
1617			break;
1618		}
1619		break;
1620	}
1621	default:
1622		break;
1623	}
1624	return (IBT_CM_ACCEPT);
1625}
1626
1627/* Check server ib address */
1628rdma_stat
1629rib_chk_srv_ibaddr(struct netbuf *raddr,
1630	int addr_type, ibt_path_info_t *path, ibt_ip_addr_t *s_ip,
1631	ibt_ip_addr_t *d_ip)
1632{
1633	struct sockaddr_in	*sin4;
1634	struct sockaddr_in6	*sin6;
1635	ibt_status_t		ibt_status;
1636	ibt_ip_path_attr_t	ipattr;
1637	uint8_t npaths = 0;
1638	ibt_path_ip_src_t	srcip;
1639
1640	ASSERT(raddr->buf != NULL);
1641
1642	(void) bzero(path, sizeof (ibt_path_info_t));
1643
1644	switch (addr_type) {
1645	case AF_INET:
1646		sin4 = (struct sockaddr_in *)raddr->buf;
1647		d_ip->family = AF_INET;
1648		d_ip->un.ip4addr = htonl(sin4->sin_addr.s_addr);
1649		break;
1650
1651	case AF_INET6:
1652		sin6 = (struct sockaddr_in6 *)raddr->buf;
1653		d_ip->family = AF_INET6;
1654		d_ip->un.ip6addr = sin6->sin6_addr;
1655		break;
1656
1657	default:
1658		return (RDMA_INVAL);
1659	}
1660
1661	bzero(&ipattr, sizeof (ibt_ip_path_attr_t));
1662	bzero(&srcip, sizeof (ibt_path_ip_src_t));
1663
1664	ipattr.ipa_dst_ip 	= d_ip;
1665	ipattr.ipa_hca_guid 	= rib_stat->hca->hca_guid;
1666	ipattr.ipa_ndst		= 1;
1667	ipattr.ipa_max_paths	= 1;
1668	npaths = 0;
1669
1670	ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl,
1671	    IBT_PATH_NO_FLAGS,
1672	    &ipattr,
1673	    path,
1674	    &npaths,
1675	    &srcip);
1676
1677	if (ibt_status != IBT_SUCCESS ||
1678	    npaths < 1 ||
1679	    path->pi_hca_guid != rib_stat->hca->hca_guid) {
1680
1681		bzero(s_ip, sizeof (ibt_path_ip_src_t));
1682		return (RDMA_FAILED);
1683	}
1684
1685	if (srcip.ip_primary.family == AF_INET) {
1686		s_ip->family = AF_INET;
1687		s_ip->un.ip4addr = htonl(srcip.ip_primary.un.ip4addr);
1688	} else {
1689		s_ip->family = AF_INET6;
1690		s_ip->un.ip6addr = srcip.ip_primary.un.ip6addr;
1691	}
1692
1693	return (RDMA_SUCCESS);
1694}
1695
1696
1697/*
1698 * Connect to the server.
1699 */
1700rdma_stat
1701rib_conn_to_srv(rib_hca_t *hca, rib_qp_t *qp, ibt_path_info_t *path,
1702		ibt_ip_addr_t *s_ip, ibt_ip_addr_t *d_ip)
1703{
1704	ibt_chan_open_args_t	chan_args;	/* channel args */
1705	ibt_chan_sizes_t	chan_sizes;
1706	ibt_rc_chan_alloc_args_t	qp_attr;
1707	ibt_status_t		ibt_status;
1708	ibt_rc_returns_t	ret_args;   	/* conn reject info */
1709	int refresh = REFRESH_ATTEMPTS;	/* refresh if IBT_CM_CONN_STALE */
1710	ibt_ip_cm_info_t	ipcm_info;
1711	uint8_t cmp_ip_pvt[IBT_IP_HDR_PRIV_DATA_SZ];
1712
1713
1714	(void) bzero(&chan_args, sizeof (chan_args));
1715	(void) bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
1716	(void) bzero(&ipcm_info, sizeof (ibt_ip_cm_info_t));
1717
1718	switch (ipcm_info.src_addr.family = s_ip->family) {
1719	case AF_INET:
1720		ipcm_info.src_addr.un.ip4addr = s_ip->un.ip4addr;
1721		break;
1722	case AF_INET6:
1723		ipcm_info.src_addr.un.ip6addr = s_ip->un.ip6addr;
1724		break;
1725	}
1726
1727	switch (ipcm_info.dst_addr.family = d_ip->family) {
1728	case AF_INET:
1729		ipcm_info.dst_addr.un.ip4addr = d_ip->un.ip4addr;
1730		break;
1731	case AF_INET6:
1732		ipcm_info.dst_addr.un.ip6addr = d_ip->un.ip6addr;
1733		break;
1734	}
1735
1736	ipcm_info.src_port = NFS_RDMA_PORT;
1737
1738	ibt_status = ibt_format_ip_private_data(&ipcm_info,
1739	    IBT_IP_HDR_PRIV_DATA_SZ, cmp_ip_pvt);
1740
1741	if (ibt_status != IBT_SUCCESS) {
1742		cmn_err(CE_WARN, "ibt_format_ip_private_data failed\n");
1743		return (-1);
1744	}
1745
1746	qp_attr.rc_hca_port_num = path->pi_prim_cep_path.cep_hca_port_num;
1747	/* Alloc a RC channel */
1748	qp_attr.rc_scq = hca->clnt_scq->rib_cq_hdl;
1749	qp_attr.rc_rcq = hca->clnt_rcq->rib_cq_hdl;
1750	qp_attr.rc_pd = hca->pd_hdl;
1751	qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
1752	qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
1753	qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
1754	qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
1755	qp_attr.rc_clone_chan = NULL;
1756	qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
1757	qp_attr.rc_flags = IBT_WR_SIGNALED;
1758
1759	path->pi_sid = ibt_get_ip_sid(IPPROTO_TCP, NFS_RDMA_PORT);
1760	chan_args.oc_path = path;
1761	chan_args.oc_cm_handler = rib_clnt_cm_handler;
1762	chan_args.oc_cm_clnt_private = (void *)rib_stat;
1763	chan_args.oc_rdma_ra_out = 4;
1764	chan_args.oc_rdma_ra_in = 4;
1765	chan_args.oc_path_retry_cnt = 2;
1766	chan_args.oc_path_rnr_retry_cnt = RNR_RETRIES;
1767	chan_args.oc_priv_data = cmp_ip_pvt;
1768	chan_args.oc_priv_data_len = IBT_IP_HDR_PRIV_DATA_SZ;
1769
1770refresh:
1771	rw_enter(&hca->state_lock, RW_READER);
1772	if (hca->state != HCA_DETACHED) {
1773		ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
1774		    IBT_ACHAN_NO_FLAGS,
1775		    &qp_attr, &qp->qp_hdl,
1776		    &chan_sizes);
1777	} else {
1778		rw_exit(&hca->state_lock);
1779		return (RDMA_FAILED);
1780	}
1781	rw_exit(&hca->state_lock);
1782
1783	if (ibt_status != IBT_SUCCESS) {
1784		DTRACE_PROBE1(rpcib__i_conntosrv,
1785		    int, ibt_status);
1786		return (RDMA_FAILED);
1787	}
1788
1789	/* Connect to the Server */
1790	(void) bzero(&ret_args, sizeof (ret_args));
1791	mutex_enter(&qp->cb_lock);
1792	ibt_status = ibt_open_rc_channel(qp->qp_hdl, IBT_OCHAN_NO_FLAGS,
1793	    IBT_BLOCKING, &chan_args, &ret_args);
1794	if (ibt_status != IBT_SUCCESS) {
1795		DTRACE_PROBE2(rpcib__i_openrctosrv,
1796		    int, ibt_status, int, ret_args.rc_status);
1797
1798		(void) ibt_free_channel(qp->qp_hdl);
1799		qp->qp_hdl = NULL;
1800		mutex_exit(&qp->cb_lock);
1801		if (refresh-- && ibt_status == IBT_CM_FAILURE &&
1802		    ret_args.rc_status == IBT_CM_CONN_STALE) {
1803			/*
1804			 * Got IBT_CM_CONN_STALE probably because of stale
1805			 * data on the passive end of a channel that existed
1806			 * prior to reboot. Retry establishing a channel
1807			 * REFRESH_ATTEMPTS times, during which time the
1808			 * stale conditions on the server might clear up.
1809			 */
1810			goto refresh;
1811		}
1812		return (RDMA_FAILED);
1813	}
1814	mutex_exit(&qp->cb_lock);
1815	/*
1816	 * Set the private data area to qp to be used in callbacks
1817	 */
1818	ibt_set_chan_private(qp->qp_hdl, (void *)qp);
1819	return (RDMA_SUCCESS);
1820}
1821
1822rdma_stat
1823rib_ping_srv(int addr_type, struct netbuf *raddr, rib_hca_t **hca)
1824{
1825	struct sockaddr_in	*sin4, *sin4arr;
1826	struct sockaddr_in6	*sin6, *sin6arr;
1827	uint_t			nif, nif4, nif6, i;
1828	ibt_path_info_t		path;
1829	ibt_status_t		ibt_status;
1830	uint8_t			num_paths_p;
1831	ibt_ip_path_attr_t	ipattr;
1832	ibt_ip_addr_t		dstip;
1833	ibt_path_ip_src_t	srcip;
1834
1835
1836	*hca = NULL;
1837
1838	ASSERT(raddr->buf != NULL);
1839
1840	bzero(&path, sizeof (ibt_path_info_t));
1841	bzero(&ipattr, sizeof (ibt_ip_path_attr_t));
1842	bzero(&srcip, sizeof (ibt_path_ip_src_t));
1843
1844	/* Obtain the source IP addresses for the system */
1845	nif = rpcib_get_number_interfaces();
1846	sin4arr = (struct sockaddr_in *)
1847	    kmem_zalloc(sizeof (struct sockaddr_in) * nif, KM_SLEEP);
1848	sin6arr = (struct sockaddr_in6 *)
1849	    kmem_zalloc(sizeof (struct sockaddr_in6) * nif, KM_SLEEP);
1850
1851	(void) rpcib_get_ib_addresses(sin4arr, sin6arr, &nif4, &nif6);
1852
1853	/* Are there really any IB interfaces available */
1854	if (nif4 == 0 && nif6 == 0) {
1855		kmem_free(sin4arr, sizeof (struct sockaddr_in) * nif);
1856		kmem_free(sin6arr, sizeof (struct sockaddr_in6) * nif);
1857		return (RDMA_FAILED);
1858	}
1859
1860	/* Prep the destination address */
1861	switch (addr_type) {
1862	case AF_INET:
1863		sin4 = (struct sockaddr_in *)raddr->buf;
1864		dstip.family = AF_INET;
1865		dstip.un.ip4addr = htonl(sin4->sin_addr.s_addr);
1866
1867		for (i = 0; i < nif4; i++) {
1868			num_paths_p = 0;
1869			ipattr.ipa_dst_ip 	= &dstip;
1870			ipattr.ipa_hca_guid	= rib_stat->hca->hca_guid;
1871			ipattr.ipa_ndst		= 1;
1872			ipattr.ipa_max_paths	= 1;
1873			ipattr.ipa_src_ip.family = dstip.family;
1874			ipattr.ipa_src_ip.un.ip4addr =
1875			    htonl(sin4arr[i].sin_addr.s_addr);
1876
1877			ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl,
1878			    IBT_PATH_NO_FLAGS,
1879			    &ipattr,
1880			    &path,
1881			    &num_paths_p,
1882			    &srcip);
1883			if (ibt_status == IBT_SUCCESS &&
1884			    num_paths_p != 0 &&
1885			    path.pi_hca_guid == rib_stat->hca->hca_guid) {
1886				*hca = rib_stat->hca;
1887
1888				kmem_free(sin4arr,
1889				    sizeof (struct sockaddr_in) * nif);
1890				kmem_free(sin6arr,
1891				    sizeof (struct sockaddr_in6) * nif);
1892
1893				return (RDMA_SUCCESS);
1894			}
1895		}
1896		break;
1897
1898	case AF_INET6:
1899		sin6 = (struct sockaddr_in6 *)raddr->buf;
1900		dstip.family = AF_INET6;
1901		dstip.un.ip6addr = sin6->sin6_addr;
1902
1903		for (i = 0; i < nif6; i++) {
1904			num_paths_p = 0;
1905			ipattr.ipa_dst_ip 	= &dstip;
1906			ipattr.ipa_hca_guid	= rib_stat->hca->hca_guid;
1907			ipattr.ipa_ndst		= 1;
1908			ipattr.ipa_max_paths	= 1;
1909			ipattr.ipa_src_ip.family = dstip.family;
1910			ipattr.ipa_src_ip.un.ip6addr = sin6arr[i].sin6_addr;
1911
1912			ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl,
1913			    IBT_PATH_NO_FLAGS,
1914			    &ipattr,
1915			    &path,
1916			    &num_paths_p,
1917			    &srcip);
1918			if (ibt_status == IBT_SUCCESS &&
1919			    num_paths_p != 0 &&
1920			    path.pi_hca_guid == rib_stat->hca->hca_guid) {
1921				*hca = rib_stat->hca;
1922
1923				kmem_free(sin4arr,
1924				    sizeof (struct sockaddr_in) * nif);
1925				kmem_free(sin6arr,
1926				    sizeof (struct sockaddr_in6) * nif);
1927
1928				return (RDMA_SUCCESS);
1929			}
1930		}
1931
1932		break;
1933
1934	default:
1935		kmem_free(sin4arr, sizeof (struct sockaddr_in) * nif);
1936		kmem_free(sin6arr, sizeof (struct sockaddr_in6) * nif);
1937		return (RDMA_INVAL);
1938	}
1939
1940	kmem_free(sin4arr, sizeof (struct sockaddr_in) * nif);
1941	kmem_free(sin6arr, sizeof (struct sockaddr_in6) * nif);
1942	return (RDMA_FAILED);
1943}
1944
1945/*
1946 * Close channel, remove from connection list and
1947 * free up resources allocated for that channel.
1948 */
1949rdma_stat
1950rib_disconnect_channel(CONN *conn, rib_conn_list_t *conn_list)
1951{
1952	rib_qp_t	*qp = ctoqp(conn);
1953	rib_hca_t	*hca;
1954
1955	/*
1956	 * c_ref == 0 and connection is in C_DISCONN_PEND
1957	 */
1958	hca = qp->hca;
1959	if (conn_list != NULL)
1960		(void) rib_rm_conn(conn, conn_list);
1961
1962	if (qp->qp_hdl != NULL) {
1963		/*
1964		 * If the channel has not been establised,
1965		 * ibt_flush_channel is called to flush outstanding WRs
1966		 * on the Qs.  Otherwise, ibt_close_rc_channel() is
1967		 * called.  The channel is then freed.
1968		 */
1969		if (conn_list != NULL)
1970			(void) ibt_close_rc_channel(qp->qp_hdl,
1971			    IBT_BLOCKING, NULL, 0, NULL, NULL, 0);
1972		else
1973			(void) ibt_flush_channel(qp->qp_hdl);
1974
1975		mutex_enter(&qp->posted_rbufs_lock);
1976		while (qp->n_posted_rbufs)
1977			cv_wait(&qp->posted_rbufs_cv, &qp->posted_rbufs_lock);
1978		mutex_exit(&qp->posted_rbufs_lock);
1979		(void) ibt_free_channel(qp->qp_hdl);
1980		qp->qp_hdl = NULL;
1981	}
1982
1983	ASSERT(qp->rdlist == NULL);
1984
1985	if (qp->replylist != NULL) {
1986		(void) rib_rem_replylist(qp);
1987	}
1988
1989	cv_destroy(&qp->cb_conn_cv);
1990	cv_destroy(&qp->posted_rbufs_cv);
1991	mutex_destroy(&qp->cb_lock);
1992
1993	mutex_destroy(&qp->replylist_lock);
1994	mutex_destroy(&qp->posted_rbufs_lock);
1995	mutex_destroy(&qp->rdlist_lock);
1996
1997	cv_destroy(&conn->c_cv);
1998	mutex_destroy(&conn->c_lock);
1999
2000	if (conn->c_raddr.buf != NULL) {
2001		kmem_free(conn->c_raddr.buf, conn->c_raddr.len);
2002	}
2003	if (conn->c_laddr.buf != NULL) {
2004		kmem_free(conn->c_laddr.buf, conn->c_laddr.len);
2005	}
2006
2007	/*
2008	 * Credit control cleanup.
2009	 */
2010	if (qp->rdmaconn.c_cc_type == RDMA_CC_CLNT) {
2011		rdma_clnt_cred_ctrl_t *cc_info;
2012		cc_info = &qp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc;
2013		cv_destroy(&cc_info->clnt_cc_cv);
2014	}
2015
2016	kmem_free(qp, sizeof (rib_qp_t));
2017
2018	/*
2019	 * If HCA has been DETACHED and the srv/clnt_conn_list is NULL,
2020	 * then the hca is no longer being used.
2021	 */
2022	if (conn_list != NULL) {
2023		rw_enter(&hca->state_lock, RW_READER);
2024		if (hca->state == HCA_DETACHED) {
2025			rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
2026			if (hca->srv_conn_list.conn_hd == NULL) {
2027				rw_enter(&hca->cl_conn_list.conn_lock,
2028				    RW_READER);
2029
2030				if (hca->cl_conn_list.conn_hd == NULL) {
2031					mutex_enter(&hca->inuse_lock);
2032					hca->inuse = FALSE;
2033					cv_signal(&hca->cb_cv);
2034					mutex_exit(&hca->inuse_lock);
2035				}
2036				rw_exit(&hca->cl_conn_list.conn_lock);
2037			}
2038			rw_exit(&hca->srv_conn_list.conn_lock);
2039		}
2040		rw_exit(&hca->state_lock);
2041	}
2042
2043	return (RDMA_SUCCESS);
2044}
2045
2046/*
2047 * Wait for send completion notification. Only on receiving a
2048 * notification be it a successful or error completion, free the
2049 * send_wid.
2050 */
2051static rdma_stat
2052rib_sendwait(rib_qp_t *qp, struct send_wid *wd)
2053{
2054	clock_t timout, cv_wait_ret;
2055	rdma_stat error = RDMA_SUCCESS;
2056	int	i;
2057
2058	/*
2059	 * Wait for send to complete
2060	 */
2061	ASSERT(wd != NULL);
2062	mutex_enter(&wd->sendwait_lock);
2063	if (wd->status == (uint_t)SEND_WAIT) {
2064		timout = drv_usectohz(SEND_WAIT_TIME * 1000000) +
2065		    ddi_get_lbolt();
2066
2067		if (qp->mode == RIB_SERVER) {
2068			while ((cv_wait_ret = cv_timedwait(&wd->wait_cv,
2069			    &wd->sendwait_lock, timout)) > 0 &&
2070			    wd->status == (uint_t)SEND_WAIT)
2071				;
2072			switch (cv_wait_ret) {
2073			case -1:	/* timeout */
2074				DTRACE_PROBE(rpcib__i__srvsendwait__timeout);
2075
2076				wd->cv_sig = 0;		/* no signal needed */
2077				error = RDMA_TIMEDOUT;
2078				break;
2079			default:	/* got send completion */
2080				break;
2081			}
2082		} else {
2083			while ((cv_wait_ret = cv_timedwait_sig(&wd->wait_cv,
2084			    &wd->sendwait_lock, timout)) > 0 &&
2085			    wd->status == (uint_t)SEND_WAIT)
2086				;
2087			switch (cv_wait_ret) {
2088			case -1:	/* timeout */
2089				DTRACE_PROBE(rpcib__i__clntsendwait__timeout);
2090
2091				wd->cv_sig = 0;		/* no signal needed */
2092				error = RDMA_TIMEDOUT;
2093				break;
2094			case 0:		/* interrupted */
2095				DTRACE_PROBE(rpcib__i__clntsendwait__intr);
2096
2097				wd->cv_sig = 0;		/* no signal needed */
2098				error = RDMA_INTR;
2099				break;
2100			default:	/* got send completion */
2101				break;
2102			}
2103		}
2104	}
2105
2106	if (wd->status != (uint_t)SEND_WAIT) {
2107		/* got send completion */
2108		if (wd->status != RDMA_SUCCESS) {
2109			error = wd->status;
2110		if (wd->status != RDMA_CONNLOST)
2111			error = RDMA_FAILED;
2112		}
2113		for (i = 0; i < wd->nsbufs; i++) {
2114			rib_rbuf_free(qptoc(qp), SEND_BUFFER,
2115			    (void *)(uintptr_t)wd->sbufaddr[i]);
2116		}
2117		mutex_exit(&wd->sendwait_lock);
2118		(void) rib_free_sendwait(wd);
2119	} else {
2120		mutex_exit(&wd->sendwait_lock);
2121	}
2122	return (error);
2123}
2124
2125static struct send_wid *
2126rib_init_sendwait(uint32_t xid, int cv_sig, rib_qp_t *qp)
2127{
2128	struct send_wid	*wd;
2129
2130	wd = kmem_zalloc(sizeof (struct send_wid), KM_SLEEP);
2131	wd->xid = xid;
2132	wd->cv_sig = cv_sig;
2133	wd->qp = qp;
2134	cv_init(&wd->wait_cv, NULL, CV_DEFAULT, NULL);
2135	mutex_init(&wd->sendwait_lock, NULL, MUTEX_DRIVER, NULL);
2136	wd->status = (uint_t)SEND_WAIT;
2137
2138	return (wd);
2139}
2140
2141static int
2142rib_free_sendwait(struct send_wid *wdesc)
2143{
2144	cv_destroy(&wdesc->wait_cv);
2145	mutex_destroy(&wdesc->sendwait_lock);
2146	kmem_free(wdesc, sizeof (*wdesc));
2147
2148	return (0);
2149}
2150
2151static rdma_stat
2152rib_rem_rep(rib_qp_t *qp, struct reply *rep)
2153{
2154	mutex_enter(&qp->replylist_lock);
2155	if (rep != NULL) {
2156		(void) rib_remreply(qp, rep);
2157		mutex_exit(&qp->replylist_lock);
2158		return (RDMA_SUCCESS);
2159	}
2160	mutex_exit(&qp->replylist_lock);
2161	return (RDMA_FAILED);
2162}
2163
2164/*
2165 * Send buffers are freed here only in case of error in posting
2166 * on QP. If the post succeeded, the send buffers are freed upon
2167 * send completion in rib_sendwait() or in the scq_handler.
2168 */
2169rdma_stat
2170rib_send_and_wait(CONN *conn, struct clist *cl, uint32_t msgid,
2171	int send_sig, int cv_sig, caddr_t *swid)
2172{
2173	struct send_wid	*wdesc;
2174	struct clist	*clp;
2175	ibt_status_t	ibt_status = IBT_SUCCESS;
2176	rdma_stat	ret = RDMA_SUCCESS;
2177	ibt_send_wr_t	tx_wr;
2178	int		i, nds;
2179	ibt_wr_ds_t	sgl[DSEG_MAX];
2180	uint_t		total_msg_size;
2181	rib_qp_t	*qp;
2182
2183	qp = ctoqp(conn);
2184
2185	ASSERT(cl != NULL);
2186
2187	bzero(&tx_wr, sizeof (ibt_send_wr_t));
2188
2189	nds = 0;
2190	total_msg_size = 0;
2191	clp = cl;
2192	while (clp != NULL) {
2193		if (nds >= DSEG_MAX) {
2194			DTRACE_PROBE(rpcib__i__sendandwait_dsegmax_exceeded);
2195			return (RDMA_FAILED);
2196		}
2197		sgl[nds].ds_va = clp->w.c_saddr;
2198		sgl[nds].ds_key = clp->c_smemhandle.mrc_lmr; /* lkey */
2199		sgl[nds].ds_len = clp->c_len;
2200		total_msg_size += clp->c_len;
2201		clp = clp->c_next;
2202		nds++;
2203	}
2204
2205	if (send_sig) {
2206		/* Set SEND_SIGNAL flag. */
2207		tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2208		wdesc = rib_init_sendwait(msgid, cv_sig, qp);
2209		*swid = (caddr_t)wdesc;
2210	} else {
2211		tx_wr.wr_flags = IBT_WR_NO_FLAGS;
2212		wdesc = rib_init_sendwait(msgid, 0, qp);
2213		*swid = (caddr_t)wdesc;
2214	}
2215	wdesc->nsbufs = nds;
2216	for (i = 0; i < nds; i++) {
2217		wdesc->sbufaddr[i] = sgl[i].ds_va;
2218	}
2219
2220	tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2221	tx_wr.wr_opcode = IBT_WRC_SEND;
2222	tx_wr.wr_trans = IBT_RC_SRV;
2223	tx_wr.wr_nds = nds;
2224	tx_wr.wr_sgl = sgl;
2225
2226	mutex_enter(&conn->c_lock);
2227	if (conn->c_state == C_CONNECTED) {
2228		ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
2229	}
2230	if (conn->c_state != C_CONNECTED ||
2231	    ibt_status != IBT_SUCCESS) {
2232		if (conn->c_state != C_DISCONN_PEND)
2233			conn->c_state = C_ERROR_CONN;
2234		mutex_exit(&conn->c_lock);
2235		for (i = 0; i < nds; i++) {
2236			rib_rbuf_free(conn, SEND_BUFFER,
2237			    (void *)(uintptr_t)wdesc->sbufaddr[i]);
2238		}
2239
2240		(void) rib_free_sendwait(wdesc);
2241
2242		return (RDMA_CONNLOST);
2243	}
2244	mutex_exit(&conn->c_lock);
2245
2246	if (send_sig) {
2247		if (cv_sig) {
2248			/*
2249			 * cv_wait for send to complete.
2250			 * We can fail due to a timeout or signal or
2251			 * unsuccessful send.
2252			 */
2253			ret = rib_sendwait(qp, wdesc);
2254
2255			return (ret);
2256		}
2257	}
2258
2259	return (RDMA_SUCCESS);
2260}
2261
2262
2263rdma_stat
2264rib_send(CONN *conn, struct clist *cl, uint32_t msgid)
2265{
2266	rdma_stat	ret;
2267	caddr_t		wd;
2268
2269	/* send-wait & cv_signal */
2270	ret = rib_send_and_wait(conn, cl, msgid, 1, 1, &wd);
2271	return (ret);
2272}
2273
2274/*
2275 * Server interface (svc_rdma_ksend).
2276 * Send RPC reply and wait for RDMA_DONE.
2277 */
2278rdma_stat
2279rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid)
2280{
2281	rdma_stat ret = RDMA_SUCCESS;
2282	struct rdma_done_list *rd;
2283	clock_t timout, cv_wait_ret;
2284	caddr_t *wid = NULL;
2285	rib_qp_t *qp = ctoqp(conn);
2286
2287	mutex_enter(&qp->rdlist_lock);
2288	rd = rdma_done_add(qp, msgid);
2289
2290	/* No cv_signal (whether send-wait or no-send-wait) */
2291	ret = rib_send_and_wait(conn, cl, msgid, 1, 0, wid);
2292
2293	if (ret != RDMA_SUCCESS) {
2294		rdma_done_rm(qp, rd);
2295	} else {
2296		/*
2297		 * Wait for RDMA_DONE from remote end
2298		 */
2299		timout =
2300		    drv_usectohz(REPLY_WAIT_TIME * 1000000) + ddi_get_lbolt();
2301		cv_wait_ret = cv_timedwait(&rd->rdma_done_cv,
2302		    &qp->rdlist_lock,
2303		    timout);
2304
2305		rdma_done_rm(qp, rd);
2306
2307		if (cv_wait_ret < 0) {
2308			ret = RDMA_TIMEDOUT;
2309		}
2310	}
2311
2312	mutex_exit(&qp->rdlist_lock);
2313	return (ret);
2314}
2315
2316static struct recv_wid *
2317rib_create_wid(rib_qp_t *qp, ibt_wr_ds_t *sgl, uint32_t msgid)
2318{
2319	struct recv_wid	*rwid;
2320
2321	rwid = kmem_zalloc(sizeof (struct recv_wid), KM_SLEEP);
2322	rwid->xid = msgid;
2323	rwid->addr = sgl->ds_va;
2324	rwid->qp = qp;
2325
2326	return (rwid);
2327}
2328
2329static void
2330rib_free_wid(struct recv_wid *rwid)
2331{
2332	kmem_free(rwid, sizeof (struct recv_wid));
2333}
2334
2335rdma_stat
2336rib_clnt_post(CONN* conn, struct clist *cl, uint32_t msgid)
2337{
2338	rib_qp_t	*qp = ctoqp(conn);
2339	struct clist	*clp = cl;
2340	struct reply	*rep;
2341	struct recv_wid	*rwid;
2342	int		nds;
2343	ibt_wr_ds_t	sgl[DSEG_MAX];
2344	ibt_recv_wr_t	recv_wr;
2345	rdma_stat	ret;
2346	ibt_status_t	ibt_status;
2347
2348	/*
2349	 * rdma_clnt_postrecv uses RECV_BUFFER.
2350	 */
2351
2352	nds = 0;
2353	while (cl != NULL) {
2354		if (nds >= DSEG_MAX) {
2355			ret = RDMA_FAILED;
2356			goto done;
2357		}
2358		sgl[nds].ds_va = cl->w.c_saddr;
2359		sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2360		sgl[nds].ds_len = cl->c_len;
2361		cl = cl->c_next;
2362		nds++;
2363	}
2364
2365	if (nds != 1) {
2366		ret = RDMA_FAILED;
2367		goto done;
2368	}
2369
2370	bzero(&recv_wr, sizeof (ibt_recv_wr_t));
2371	recv_wr.wr_nds = nds;
2372	recv_wr.wr_sgl = sgl;
2373
2374	rwid = rib_create_wid(qp, &sgl[0], msgid);
2375	if (rwid) {
2376		recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)rwid;
2377	} else {
2378		ret = RDMA_NORESOURCE;
2379		goto done;
2380	}
2381	rep = rib_addreplylist(qp, msgid);
2382	if (!rep) {
2383		rib_free_wid(rwid);
2384		ret = RDMA_NORESOURCE;
2385		goto done;
2386	}
2387
2388	mutex_enter(&conn->c_lock);
2389
2390	if (conn->c_state == C_CONNECTED) {
2391		ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL);
2392	}
2393
2394	if (conn->c_state != C_CONNECTED ||
2395	    ibt_status != IBT_SUCCESS) {
2396		if (conn->c_state != C_DISCONN_PEND)
2397			conn->c_state = C_ERROR_CONN;
2398		mutex_exit(&conn->c_lock);
2399		rib_free_wid(rwid);
2400		(void) rib_rem_rep(qp, rep);
2401		ret = RDMA_CONNLOST;
2402		goto done;
2403	}
2404	mutex_exit(&conn->c_lock);
2405	return (RDMA_SUCCESS);
2406
2407done:
2408	while (clp != NULL) {
2409		rib_rbuf_free(conn, RECV_BUFFER,
2410		    (void *)(uintptr_t)clp->w.c_saddr3);
2411		clp = clp->c_next;
2412	}
2413	return (ret);
2414}
2415
2416rdma_stat
2417rib_svc_post(CONN* conn, struct clist *cl)
2418{
2419	rib_qp_t	*qp = ctoqp(conn);
2420	struct svc_recv	*s_recvp;
2421	int		nds;
2422	ibt_wr_ds_t	sgl[DSEG_MAX];
2423	ibt_recv_wr_t	recv_wr;
2424	ibt_status_t	ibt_status;
2425
2426	nds = 0;
2427	while (cl != NULL) {
2428		if (nds >= DSEG_MAX) {
2429			return (RDMA_FAILED);
2430		}
2431		sgl[nds].ds_va = cl->w.c_saddr;
2432		sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2433		sgl[nds].ds_len = cl->c_len;
2434		cl = cl->c_next;
2435		nds++;
2436	}
2437
2438	if (nds != 1) {
2439		rib_rbuf_free(conn, RECV_BUFFER,
2440		    (caddr_t)(uintptr_t)sgl[0].ds_va);
2441
2442		return (RDMA_FAILED);
2443	}
2444
2445	bzero(&recv_wr, sizeof (ibt_recv_wr_t));
2446	recv_wr.wr_nds = nds;
2447	recv_wr.wr_sgl = sgl;
2448
2449	s_recvp = rib_init_svc_recv(qp, &sgl[0]);
2450	/* Use s_recvp's addr as wr id */
2451	recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)s_recvp;
2452	mutex_enter(&conn->c_lock);
2453	if (conn->c_state == C_CONNECTED) {
2454		ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL);
2455	}
2456	if (conn->c_state != C_CONNECTED ||
2457	    ibt_status != IBT_SUCCESS) {
2458		if (conn->c_state != C_DISCONN_PEND)
2459			conn->c_state = C_ERROR_CONN;
2460		mutex_exit(&conn->c_lock);
2461		rib_rbuf_free(conn, RECV_BUFFER,
2462		    (caddr_t)(uintptr_t)sgl[0].ds_va);
2463		(void) rib_free_svc_recv(s_recvp);
2464
2465		return (RDMA_CONNLOST);
2466	}
2467	mutex_exit(&conn->c_lock);
2468
2469	return (RDMA_SUCCESS);
2470}
2471
2472/* Client */
2473rdma_stat
2474rib_post_resp(CONN* conn, struct clist *cl, uint32_t msgid)
2475{
2476
2477	return (rib_clnt_post(conn, cl, msgid));
2478}
2479
2480/* Client */
2481rdma_stat
2482rib_post_resp_remove(CONN* conn, uint32_t msgid)
2483{
2484	rib_qp_t	*qp = ctoqp(conn);
2485	struct reply	*rep;
2486
2487	mutex_enter(&qp->replylist_lock);
2488	for (rep = qp->replylist; rep != NULL; rep = rep->next) {
2489		if (rep->xid == msgid) {
2490			if (rep->vaddr_cq) {
2491				rib_rbuf_free(conn, RECV_BUFFER,
2492				    (caddr_t)(uintptr_t)rep->vaddr_cq);
2493			}
2494			(void) rib_remreply(qp, rep);
2495			break;
2496		}
2497	}
2498	mutex_exit(&qp->replylist_lock);
2499
2500	return (RDMA_SUCCESS);
2501}
2502
2503/* Server */
2504rdma_stat
2505rib_post_recv(CONN *conn, struct clist *cl)
2506{
2507	rib_qp_t	*qp = ctoqp(conn);
2508
2509	if (rib_svc_post(conn, cl) == RDMA_SUCCESS) {
2510		mutex_enter(&qp->posted_rbufs_lock);
2511		qp->n_posted_rbufs++;
2512		mutex_exit(&qp->posted_rbufs_lock);
2513		return (RDMA_SUCCESS);
2514	}
2515	return (RDMA_FAILED);
2516}
2517
2518/*
2519 * Client side only interface to "recv" the rpc reply buf
2520 * posted earlier by rib_post_resp(conn, cl, msgid).
2521 */
2522rdma_stat
2523rib_recv(CONN *conn, struct clist **clp, uint32_t msgid)
2524{
2525	struct reply *rep = NULL;
2526	clock_t timout, cv_wait_ret;
2527	rdma_stat ret = RDMA_SUCCESS;
2528	rib_qp_t *qp = ctoqp(conn);
2529
2530	/*
2531	 * Find the reply structure for this msgid
2532	 */
2533	mutex_enter(&qp->replylist_lock);
2534
2535	for (rep = qp->replylist; rep != NULL; rep = rep->next) {
2536		if (rep->xid == msgid)
2537			break;
2538	}
2539
2540	if (rep != NULL) {
2541		/*
2542		 * If message not yet received, wait.
2543		 */
2544		if (rep->status == (uint_t)REPLY_WAIT) {
2545			timout = ddi_get_lbolt() +
2546			    drv_usectohz(REPLY_WAIT_TIME * 1000000);
2547
2548			while ((cv_wait_ret = cv_timedwait_sig(&rep->wait_cv,
2549			    &qp->replylist_lock, timout)) > 0 &&
2550			    rep->status == (uint_t)REPLY_WAIT)
2551				;
2552
2553			switch (cv_wait_ret) {
2554			case -1:	/* timeout */
2555				ret = RDMA_TIMEDOUT;
2556				break;
2557			case 0:
2558				ret = RDMA_INTR;
2559				break;
2560			default:
2561				break;
2562			}
2563		}
2564
2565		if (rep->status == RDMA_SUCCESS) {
2566			struct clist *cl = NULL;
2567
2568			/*
2569			 * Got message successfully
2570			 */
2571			clist_add(&cl, 0, rep->bytes_xfer, NULL,
2572			    (caddr_t)(uintptr_t)rep->vaddr_cq, NULL, NULL);
2573			*clp = cl;
2574		} else {
2575			if (rep->status != (uint_t)REPLY_WAIT) {
2576				/*
2577				 * Got error in reply message. Free
2578				 * recv buffer here.
2579				 */
2580				ret = rep->status;
2581				rib_rbuf_free(conn, RECV_BUFFER,
2582				    (caddr_t)(uintptr_t)rep->vaddr_cq);
2583			}
2584		}
2585		(void) rib_remreply(qp, rep);
2586	} else {
2587		/*
2588		 * No matching reply structure found for given msgid on the
2589		 * reply wait list.
2590		 */
2591		ret = RDMA_INVAL;
2592		DTRACE_PROBE(rpcib__i__nomatchxid2);
2593	}
2594
2595	/*
2596	 * Done.
2597	 */
2598	mutex_exit(&qp->replylist_lock);
2599	return (ret);
2600}
2601
2602/*
2603 * RDMA write a buffer to the remote address.
2604 */
2605rdma_stat
2606rib_write(CONN *conn, struct clist *cl, int wait)
2607{
2608	ibt_send_wr_t	tx_wr;
2609	int		cv_sig;
2610	int		i;
2611	ibt_wr_ds_t	sgl[DSEG_MAX];
2612	struct send_wid	*wdesc;
2613	ibt_status_t	ibt_status;
2614	rdma_stat	ret = RDMA_SUCCESS;
2615	rib_qp_t	*qp = ctoqp(conn);
2616	uint64_t	n_writes = 0;
2617	bool_t		force_wait = FALSE;
2618
2619	if (cl == NULL) {
2620		return (RDMA_FAILED);
2621	}
2622
2623
2624	while ((cl != NULL)) {
2625		if (cl->c_len > 0) {
2626			bzero(&tx_wr, sizeof (ibt_send_wr_t));
2627			tx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->u.c_daddr;
2628			tx_wr.wr.rc.rcwr.rdma.rdma_rkey =
2629			    cl->c_dmemhandle.mrc_rmr; /* rkey */
2630			sgl[0].ds_va = cl->w.c_saddr;
2631			sgl[0].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2632			sgl[0].ds_len = cl->c_len;
2633
2634			if (wait) {
2635				tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2636				cv_sig = 1;
2637			} else {
2638				if (n_writes > max_unsignaled_rws) {
2639					n_writes = 0;
2640					force_wait = TRUE;
2641					tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2642					cv_sig = 1;
2643				} else {
2644					tx_wr.wr_flags = IBT_WR_NO_FLAGS;
2645					cv_sig = 0;
2646				}
2647			}
2648
2649			wdesc = rib_init_sendwait(0, cv_sig, qp);
2650			tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2651			tx_wr.wr_opcode = IBT_WRC_RDMAW;
2652			tx_wr.wr_trans = IBT_RC_SRV;
2653			tx_wr.wr_nds = 1;
2654			tx_wr.wr_sgl = sgl;
2655
2656			mutex_enter(&conn->c_lock);
2657			if (conn->c_state == C_CONNECTED) {
2658				ibt_status =
2659				    ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
2660			}
2661			if (conn->c_state != C_CONNECTED ||
2662			    ibt_status != IBT_SUCCESS) {
2663				if (conn->c_state != C_DISCONN_PEND)
2664					conn->c_state = C_ERROR_CONN;
2665				mutex_exit(&conn->c_lock);
2666				(void) rib_free_sendwait(wdesc);
2667				return (RDMA_CONNLOST);
2668			}
2669			mutex_exit(&conn->c_lock);
2670
2671			/*
2672			 * Wait for send to complete
2673			 */
2674			if (wait || force_wait) {
2675				force_wait = FALSE;
2676				ret = rib_sendwait(qp, wdesc);
2677				if (ret != 0) {
2678					return (ret);
2679				}
2680			} else {
2681				mutex_enter(&wdesc->sendwait_lock);
2682				for (i = 0; i < wdesc->nsbufs; i++) {
2683					rib_rbuf_free(qptoc(qp), SEND_BUFFER,
2684					    (void *)(uintptr_t)
2685					    wdesc->sbufaddr[i]);
2686				}
2687				mutex_exit(&wdesc->sendwait_lock);
2688				(void) rib_free_sendwait(wdesc);
2689			}
2690			n_writes ++;
2691		}
2692		cl = cl->c_next;
2693	}
2694	return (RDMA_SUCCESS);
2695}
2696
2697/*
2698 * RDMA Read a buffer from the remote address.
2699 */
2700rdma_stat
2701rib_read(CONN *conn, struct clist *cl, int wait)
2702{
2703	ibt_send_wr_t	rx_wr;
2704	int		cv_sig;
2705	int		i;
2706	ibt_wr_ds_t	sgl;
2707	struct send_wid	*wdesc;
2708	ibt_status_t	ibt_status = IBT_SUCCESS;
2709	rdma_stat	ret = RDMA_SUCCESS;
2710	rib_qp_t	*qp = ctoqp(conn);
2711
2712	if (cl == NULL) {
2713		return (RDMA_FAILED);
2714	}
2715
2716	while (cl != NULL) {
2717		bzero(&rx_wr, sizeof (ibt_send_wr_t));
2718		/*
2719		 * Remote address is at the head chunk item in list.
2720		 */
2721		rx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->w.c_saddr;
2722		rx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_smemhandle.mrc_rmr;
2723
2724		sgl.ds_va = cl->u.c_daddr;
2725		sgl.ds_key = cl->c_dmemhandle.mrc_lmr; /* lkey */
2726		sgl.ds_len = cl->c_len;
2727
2728		if (wait) {
2729			rx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2730			cv_sig = 1;
2731		} else {
2732			rx_wr.wr_flags = IBT_WR_NO_FLAGS;
2733			cv_sig = 0;
2734		}
2735
2736		wdesc = rib_init_sendwait(0, cv_sig, qp);
2737		rx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2738		rx_wr.wr_opcode = IBT_WRC_RDMAR;
2739		rx_wr.wr_trans = IBT_RC_SRV;
2740		rx_wr.wr_nds = 1;
2741		rx_wr.wr_sgl = &sgl;
2742
2743		mutex_enter(&conn->c_lock);
2744		if (conn->c_state == C_CONNECTED) {
2745			ibt_status = ibt_post_send(qp->qp_hdl, &rx_wr, 1, NULL);
2746		}
2747		if (conn->c_state != C_CONNECTED ||
2748		    ibt_status != IBT_SUCCESS) {
2749			if (conn->c_state != C_DISCONN_PEND)
2750				conn->c_state = C_ERROR_CONN;
2751			mutex_exit(&conn->c_lock);
2752			(void) rib_free_sendwait(wdesc);
2753			return (RDMA_CONNLOST);
2754		}
2755		mutex_exit(&conn->c_lock);
2756
2757		/*
2758		 * Wait for send to complete if this is the
2759		 * last item in the list.
2760		 */
2761		if (wait && cl->c_next == NULL) {
2762			ret = rib_sendwait(qp, wdesc);
2763			if (ret != 0) {
2764				return (ret);
2765			}
2766		} else {
2767			mutex_enter(&wdesc->sendwait_lock);
2768			for (i = 0; i < wdesc->nsbufs; i++) {
2769				rib_rbuf_free(qptoc(qp), SEND_BUFFER,
2770				    (void *)(uintptr_t)wdesc->sbufaddr[i]);
2771			}
2772			mutex_exit(&wdesc->sendwait_lock);
2773			(void) rib_free_sendwait(wdesc);
2774		}
2775		cl = cl->c_next;
2776	}
2777	return (RDMA_SUCCESS);
2778}
2779
2780/*
2781 * rib_srv_cm_handler()
2782 *    Connection Manager callback to handle RC connection requests.
2783 */
2784/* ARGSUSED */
2785static ibt_cm_status_t
2786rib_srv_cm_handler(void *any, ibt_cm_event_t *event,
2787	ibt_cm_return_args_t *ret_args, void *priv_data,
2788	ibt_priv_data_len_t len)
2789{
2790	queue_t		*q;
2791	rib_qp_t	*qp;
2792	rpcib_state_t	*ribstat;
2793	rib_hca_t	*hca;
2794	rdma_stat	status = RDMA_SUCCESS;
2795	int		i;
2796	struct clist	cl;
2797	rdma_buf_t	rdbuf = {0};
2798	void		*buf = NULL;
2799	CONN		*conn;
2800	ibt_ip_cm_info_t	ipinfo;
2801	struct sockaddr_in *s;
2802	struct sockaddr_in6 *s6;
2803	int sin_size = sizeof (struct sockaddr_in);
2804	int in_size = sizeof (struct in_addr);
2805	int sin6_size = sizeof (struct sockaddr_in6);
2806
2807	ASSERT(any != NULL);
2808	ASSERT(event != NULL);
2809
2810	ribstat = (rpcib_state_t *)any;
2811	hca = (rib_hca_t *)ribstat->hca;
2812	ASSERT(hca != NULL);
2813
2814	/* got a connection request */
2815	switch (event->cm_type) {
2816	case IBT_CM_EVENT_REQ_RCV:
2817		/*
2818		 * If the plugin is in the NO_ACCEPT state, bail out.
2819		 */
2820		mutex_enter(&plugin_state_lock);
2821		if (plugin_state == NO_ACCEPT) {
2822			mutex_exit(&plugin_state_lock);
2823			return (IBT_CM_REJECT);
2824		}
2825		mutex_exit(&plugin_state_lock);
2826
2827		/*
2828		 * Need to send a MRA MAD to CM so that it does not
2829		 * timeout on us.
2830		 */
2831		(void) ibt_cm_delay(IBT_CM_DELAY_REQ, event->cm_session_id,
2832		    event->cm_event.req.req_timeout * 8, NULL, 0);
2833
2834		mutex_enter(&rib_stat->open_hca_lock);
2835		q = rib_stat->q;
2836		mutex_exit(&rib_stat->open_hca_lock);
2837
2838		status = rib_svc_create_chan(hca, (caddr_t)q,
2839		    event->cm_event.req.req_prim_hca_port, &qp);
2840
2841		if (status) {
2842			return (IBT_CM_REJECT);
2843		}
2844
2845		ret_args->cm_ret.rep.cm_channel = qp->qp_hdl;
2846		ret_args->cm_ret.rep.cm_rdma_ra_out = 4;
2847		ret_args->cm_ret.rep.cm_rdma_ra_in = 4;
2848		ret_args->cm_ret.rep.cm_rnr_retry_cnt = RNR_RETRIES;
2849
2850		/*
2851		 * Pre-posts RECV buffers
2852		 */
2853		conn = qptoc(qp);
2854		for (i = 0; i < preposted_rbufs; i++) {
2855			bzero(&rdbuf, sizeof (rdbuf));
2856			rdbuf.type = RECV_BUFFER;
2857			buf = rib_rbuf_alloc(conn, &rdbuf);
2858			if (buf == NULL) {
2859				(void) rib_disconnect_channel(conn, NULL);
2860				return (IBT_CM_REJECT);
2861			}
2862
2863			bzero(&cl, sizeof (cl));
2864			cl.w.c_saddr3 = (caddr_t)rdbuf.addr;
2865			cl.c_len = rdbuf.len;
2866			cl.c_smemhandle.mrc_lmr =
2867			    rdbuf.handle.mrc_lmr; /* lkey */
2868			cl.c_next = NULL;
2869			status = rib_post_recv(conn, &cl);
2870			if (status != RDMA_SUCCESS) {
2871				(void) rib_disconnect_channel(conn, NULL);
2872				return (IBT_CM_REJECT);
2873			}
2874		}
2875		(void) rib_add_connlist(conn, &hca->srv_conn_list);
2876
2877		/*
2878		 * Get the address translation
2879		 */
2880		rw_enter(&hca->state_lock, RW_READER);
2881		if (hca->state == HCA_DETACHED) {
2882			rw_exit(&hca->state_lock);
2883			return (IBT_CM_REJECT);
2884		}
2885		rw_exit(&hca->state_lock);
2886
2887		bzero(&ipinfo, sizeof (ibt_ip_cm_info_t));
2888
2889		if (ibt_get_ip_data(event->cm_priv_data_len,
2890		    event->cm_priv_data,
2891		    &ipinfo) != IBT_SUCCESS) {
2892
2893			return (IBT_CM_REJECT);
2894		}
2895
2896		switch (ipinfo.src_addr.family) {
2897		case AF_INET:
2898
2899			conn->c_raddr.maxlen =
2900			    conn->c_raddr.len = sin_size;
2901			conn->c_raddr.buf = kmem_zalloc(sin_size, KM_SLEEP);
2902
2903			s = (struct sockaddr_in *)conn->c_raddr.buf;
2904			s->sin_family = AF_INET;
2905
2906			bcopy((void *)&ipinfo.src_addr.un.ip4addr,
2907			    &s->sin_addr, in_size);
2908
2909			break;
2910
2911		case AF_INET6:
2912
2913			conn->c_raddr.maxlen =
2914			    conn->c_raddr.len = sin6_size;
2915			conn->c_raddr.buf = kmem_zalloc(sin6_size, KM_SLEEP);
2916
2917			s6 = (struct sockaddr_in6 *)conn->c_raddr.buf;
2918			s6->sin6_family = AF_INET6;
2919			bcopy((void *)&ipinfo.src_addr.un.ip6addr,
2920			    &s6->sin6_addr,
2921			    sizeof (struct in6_addr));
2922
2923			break;
2924
2925		default:
2926			return (IBT_CM_REJECT);
2927		}
2928
2929		break;
2930
2931	case IBT_CM_EVENT_CONN_CLOSED:
2932	{
2933		CONN		*conn;
2934		rib_qp_t	*qp;
2935
2936		switch (event->cm_event.closed) {
2937		case IBT_CM_CLOSED_DREP_RCVD:
2938		case IBT_CM_CLOSED_DREQ_TIMEOUT:
2939		case IBT_CM_CLOSED_DUP:
2940		case IBT_CM_CLOSED_ABORT:
2941		case IBT_CM_CLOSED_ALREADY:
2942			/*
2943			 * These cases indicate the local end initiated
2944			 * the closing of the channel. Nothing to do here.
2945			 */
2946			break;
2947		default:
2948			/*
2949			 * Reason for CONN_CLOSED event must be one of
2950			 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
2951			 * or IBT_CM_CLOSED_STALE. These indicate cases were
2952			 * the remote end is closing the channel. In these
2953			 * cases free the channel and transition to error
2954			 * state
2955			 */
2956			qp = ibt_get_chan_private(event->cm_channel);
2957			conn = qptoc(qp);
2958			mutex_enter(&conn->c_lock);
2959			if (conn->c_state == C_DISCONN_PEND) {
2960				mutex_exit(&conn->c_lock);
2961				break;
2962			}
2963			conn->c_state = C_ERROR_CONN;
2964
2965			/*
2966			 * Free the rc_channel. Channel has already
2967			 * transitioned to ERROR state and WRs have been
2968			 * FLUSHED_ERR already.
2969			 */
2970			(void) ibt_free_channel(qp->qp_hdl);
2971			qp->qp_hdl = NULL;
2972
2973			/*
2974			 * Free the conn if c_ref goes down to 0
2975			 */
2976			if (conn->c_ref == 0) {
2977				/*
2978				 * Remove from list and free conn
2979				 */
2980				conn->c_state = C_DISCONN_PEND;
2981				mutex_exit(&conn->c_lock);
2982				(void) rib_disconnect_channel(conn,
2983				    &hca->srv_conn_list);
2984			} else {
2985				mutex_exit(&conn->c_lock);
2986			}
2987			DTRACE_PROBE(rpcib__i__srvcm_chandisconnect);
2988			break;
2989		}
2990		break;
2991	}
2992	case IBT_CM_EVENT_CONN_EST:
2993		/*
2994		 * RTU received, hence connection established.
2995		 */
2996		if (rib_debug > 1)
2997			cmn_err(CE_NOTE, "rib_srv_cm_handler: "
2998			    "(CONN_EST) channel established");
2999		break;
3000
3001	default:
3002		if (rib_debug > 2) {
3003			/* Let CM handle the following events. */
3004			if (event->cm_type == IBT_CM_EVENT_REP_RCV) {
3005				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3006				    "server recv'ed IBT_CM_EVENT_REP_RCV\n");
3007			} else if (event->cm_type == IBT_CM_EVENT_LAP_RCV) {
3008				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3009				    "server recv'ed IBT_CM_EVENT_LAP_RCV\n");
3010			} else if (event->cm_type == IBT_CM_EVENT_MRA_RCV) {
3011				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3012				    "server recv'ed IBT_CM_EVENT_MRA_RCV\n");
3013			} else if (event->cm_type == IBT_CM_EVENT_APR_RCV) {
3014				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3015				    "server recv'ed IBT_CM_EVENT_APR_RCV\n");
3016			} else if (event->cm_type == IBT_CM_EVENT_FAILURE) {
3017				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3018				    "server recv'ed IBT_CM_EVENT_FAILURE\n");
3019			}
3020		}
3021		return (IBT_CM_DEFAULT);
3022	}
3023
3024	/* accept all other CM messages (i.e. let the CM handle them) */
3025	return (IBT_CM_ACCEPT);
3026}
3027
3028static rdma_stat
3029rib_register_service(rib_hca_t *hca, int service_type)
3030{
3031	ibt_srv_desc_t		sdesc;
3032	ibt_hca_portinfo_t	*port_infop;
3033	ib_svc_id_t		srv_id;
3034	ibt_srv_hdl_t		srv_hdl;
3035	uint_t			port_size;
3036	uint_t			pki, i, num_ports, nbinds;
3037	ibt_status_t		ibt_status;
3038	rib_service_t		*new_service;
3039	ib_pkey_t		pkey;
3040
3041	/*
3042	 * Query all ports for the given HCA
3043	 */
3044	rw_enter(&hca->state_lock, RW_READER);
3045	if (hca->state != HCA_DETACHED) {
3046		ibt_status = ibt_query_hca_ports(hca->hca_hdl, 0, &port_infop,
3047		    &num_ports, &port_size);
3048		rw_exit(&hca->state_lock);
3049	} else {
3050		rw_exit(&hca->state_lock);
3051		return (RDMA_FAILED);
3052	}
3053	if (ibt_status != IBT_SUCCESS) {
3054		return (RDMA_FAILED);
3055	}
3056
3057	DTRACE_PROBE1(rpcib__i__regservice_numports,
3058	    int, num_ports);
3059
3060	for (i = 0; i < num_ports; i++) {
3061		if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) {
3062			DTRACE_PROBE1(rpcib__i__regservice__portinactive,
3063			    int, i+1);
3064		} else if (port_infop[i].p_linkstate == IBT_PORT_ACTIVE) {
3065			DTRACE_PROBE1(rpcib__i__regservice__portactive,
3066			    int, i+1);
3067		}
3068	}
3069
3070	/*
3071	 * Get all the IP addresses on this system to register the
3072	 * given "service type" on all DNS recognized IP addrs.
3073	 * Each service type such as NFS will have all the systems
3074	 * IP addresses as its different names. For now the only
3075	 * type of service we support in RPCIB is NFS.
3076	 */
3077	rw_enter(&hca->service_list_lock, RW_WRITER);
3078	/*
3079	 * Start registering and binding service to active
3080	 * on active ports on this HCA.
3081	 */
3082	nbinds = 0;
3083	new_service = NULL;
3084
3085	/*
3086	 * We use IP addresses as the service names for
3087	 * service registration.  Register each of them
3088	 * with CM to obtain a svc_id and svc_hdl.  We do not
3089	 * register the service with machine's loopback address.
3090	 */
3091	(void) bzero(&srv_id, sizeof (ib_svc_id_t));
3092	(void) bzero(&srv_hdl, sizeof (ibt_srv_hdl_t));
3093	(void) bzero(&sdesc, sizeof (ibt_srv_desc_t));
3094
3095	sdesc.sd_handler = rib_srv_cm_handler;
3096	sdesc.sd_flags = 0;
3097	ibt_status = ibt_register_service(hca->ibt_clnt_hdl,
3098	    &sdesc, ibt_get_ip_sid(IPPROTO_TCP, NFS_RDMA_PORT),
3099	    1, &srv_hdl, &srv_id);
3100
3101	for (i = 0; i < num_ports; i++) {
3102		if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE)
3103			continue;
3104
3105		for (pki = 0; pki < port_infop[i].p_pkey_tbl_sz; pki++) {
3106			pkey = port_infop[i].p_pkey_tbl[pki];
3107			if ((pkey & IBSRM_HB) &&
3108			    (pkey != IB_PKEY_INVALID_FULL)) {
3109
3110				/*
3111				 * Allocate and prepare a service entry
3112				 */
3113				new_service =
3114				    kmem_zalloc(1 * sizeof (rib_service_t),
3115				    KM_SLEEP);
3116
3117				new_service->srv_type = service_type;
3118				new_service->srv_hdl = srv_hdl;
3119				new_service->srv_next = NULL;
3120
3121				ibt_status = ibt_bind_service(srv_hdl,
3122				    port_infop[i].p_sgid_tbl[0],
3123				    NULL, rib_stat, NULL);
3124
3125				DTRACE_PROBE1(rpcib__i__regservice__bindres,
3126				    int, ibt_status);
3127
3128				if (ibt_status != IBT_SUCCESS) {
3129					kmem_free(new_service,
3130					    sizeof (rib_service_t));
3131					new_service = NULL;
3132					continue;
3133				}
3134
3135				/*
3136				 * Add to the service list for this HCA
3137				 */
3138				new_service->srv_next = hca->service_list;
3139				hca->service_list = new_service;
3140				new_service = NULL;
3141				nbinds++;
3142			}
3143		}
3144	}
3145	rw_exit(&hca->service_list_lock);
3146
3147	ibt_free_portinfo(port_infop, port_size);
3148
3149	if (nbinds == 0) {
3150		return (RDMA_FAILED);
3151	} else {
3152		/*
3153		 * Put this plugin into accept state, since atleast
3154		 * one registration was successful.
3155		 */
3156		mutex_enter(&plugin_state_lock);
3157		plugin_state = ACCEPT;
3158		mutex_exit(&plugin_state_lock);
3159		return (RDMA_SUCCESS);
3160	}
3161}
3162
3163void
3164rib_listen(struct rdma_svc_data *rd)
3165{
3166	rdma_stat status = RDMA_SUCCESS;
3167
3168	rd->active = 0;
3169	rd->err_code = RDMA_FAILED;
3170
3171	/*
3172	 * First check if a hca is still attached
3173	 */
3174	rw_enter(&rib_stat->hca->state_lock, RW_READER);
3175	if (rib_stat->hca->state != HCA_INITED) {
3176		rw_exit(&rib_stat->hca->state_lock);
3177		return;
3178	}
3179	rw_exit(&rib_stat->hca->state_lock);
3180
3181	rib_stat->q = &rd->q;
3182	/*
3183	 * Right now the only service type is NFS. Hence force feed this
3184	 * value. Ideally to communicate the service type it should be
3185	 * passed down in rdma_svc_data.
3186	 */
3187	rib_stat->service_type = NFS;
3188	status = rib_register_service(rib_stat->hca, NFS);
3189	if (status != RDMA_SUCCESS) {
3190		rd->err_code = status;
3191		return;
3192	}
3193	/*
3194	 * Service active on an HCA, check rd->err_code for more
3195	 * explainable errors.
3196	 */
3197	rd->active = 1;
3198	rd->err_code = status;
3199}
3200
3201/* XXXX */
3202/* ARGSUSED */
3203static void
3204rib_listen_stop(struct rdma_svc_data *svcdata)
3205{
3206	rib_hca_t		*hca;
3207
3208	/*
3209	 * KRPC called the RDMATF to stop the listeners, this means
3210	 * stop sending incomming or recieved requests to KRPC master
3211	 * transport handle for RDMA-IB. This is also means that the
3212	 * master transport handle, responsible for us, is going away.
3213	 */
3214	mutex_enter(&plugin_state_lock);
3215	plugin_state = NO_ACCEPT;
3216	if (svcdata != NULL)
3217		svcdata->active = 0;
3218	mutex_exit(&plugin_state_lock);
3219
3220	/*
3221	 * First check if a hca is still attached
3222	 */
3223	hca = rib_stat->hca;
3224	rw_enter(&hca->state_lock, RW_READER);
3225	if (hca->state != HCA_INITED) {
3226		rw_exit(&hca->state_lock);
3227		return;
3228	}
3229	rib_close_channels(&hca->srv_conn_list);
3230	rib_stop_services(hca);
3231	rw_exit(&hca->state_lock);
3232}
3233
3234/*
3235 * Traverse the HCA's service list to unbind and deregister services.
3236 * Instead of unbinding the service for a service handle by
3237 * calling ibt_unbind_service() for each port/pkey, we unbind
3238 * all the services for the service handle by making only one
3239 * call to ibt_unbind_all_services().  Then, we deregister the
3240 * service for the service handle.
3241 *
3242 * When traversing the entries in service_list, we compare the
3243 * srv_hdl of the current entry with that of the next.  If they
3244 * are different or if the next entry is NULL, the current entry
3245 * marks the last binding of the service handle.  In this case,
3246 * call ibt_unbind_all_services() and deregister the service for
3247 * the service handle.  If they are the same, the current and the
3248 * next entries are bound to the same service handle.  In this
3249 * case, move on to the next entry.
3250 */
3251static void
3252rib_stop_services(rib_hca_t *hca)
3253{
3254	rib_service_t		*srv_list, *to_remove;
3255
3256	/*
3257	 * unbind and deregister the services for this service type.
3258	 * Right now there is only one service type. In future it will
3259	 * be passed down to this function.
3260	 */
3261	rw_enter(&hca->service_list_lock, RW_WRITER);
3262	srv_list = hca->service_list;
3263	while (srv_list != NULL) {
3264		to_remove = srv_list;
3265		srv_list = to_remove->srv_next;
3266		if (srv_list == NULL || bcmp(to_remove->srv_hdl,
3267		    srv_list->srv_hdl, sizeof (ibt_srv_hdl_t))) {
3268
3269			(void) ibt_unbind_all_services(to_remove->srv_hdl);
3270			(void) ibt_deregister_service(hca->ibt_clnt_hdl,
3271			    to_remove->srv_hdl);
3272		}
3273
3274		kmem_free(to_remove, sizeof (rib_service_t));
3275	}
3276	hca->service_list = NULL;
3277	rw_exit(&hca->service_list_lock);
3278}
3279
3280static struct svc_recv *
3281rib_init_svc_recv(rib_qp_t *qp, ibt_wr_ds_t *sgl)
3282{
3283	struct svc_recv	*recvp;
3284
3285	recvp = kmem_zalloc(sizeof (struct svc_recv), KM_SLEEP);
3286	recvp->vaddr = sgl->ds_va;
3287	recvp->qp = qp;
3288	recvp->bytes_xfer = 0;
3289	return (recvp);
3290}
3291
3292static int
3293rib_free_svc_recv(struct svc_recv *recvp)
3294{
3295	kmem_free(recvp, sizeof (*recvp));
3296
3297	return (0);
3298}
3299
3300static struct reply *
3301rib_addreplylist(rib_qp_t *qp, uint32_t msgid)
3302{
3303	struct reply	*rep;
3304
3305
3306	rep = kmem_zalloc(sizeof (struct reply), KM_NOSLEEP);
3307	if (rep == NULL) {
3308		DTRACE_PROBE(rpcib__i__addrreply__nomem);
3309		return (NULL);
3310	}
3311	rep->xid = msgid;
3312	rep->vaddr_cq = NULL;
3313	rep->bytes_xfer = 0;
3314	rep->status = (uint_t)REPLY_WAIT;
3315	rep->prev = NULL;
3316	cv_init(&rep->wait_cv, NULL, CV_DEFAULT, NULL);
3317
3318	mutex_enter(&qp->replylist_lock);
3319	if (qp->replylist) {
3320		rep->next = qp->replylist;
3321		qp->replylist->prev = rep;
3322	}
3323	qp->rep_list_size++;
3324
3325	DTRACE_PROBE1(rpcib__i__addrreply__listsize,
3326	    int, qp->rep_list_size);
3327
3328	qp->replylist = rep;
3329	mutex_exit(&qp->replylist_lock);
3330
3331	return (rep);
3332}
3333
3334static rdma_stat
3335rib_rem_replylist(rib_qp_t *qp)
3336{
3337	struct reply	*r, *n;
3338
3339	mutex_enter(&qp->replylist_lock);
3340	for (r = qp->replylist; r != NULL; r = n) {
3341		n = r->next;
3342		(void) rib_remreply(qp, r);
3343	}
3344	mutex_exit(&qp->replylist_lock);
3345
3346	return (RDMA_SUCCESS);
3347}
3348
3349static int
3350rib_remreply(rib_qp_t *qp, struct reply *rep)
3351{
3352
3353	ASSERT(MUTEX_HELD(&qp->replylist_lock));
3354	if (rep->prev) {
3355		rep->prev->next = rep->next;
3356	}
3357	if (rep->next) {
3358		rep->next->prev = rep->prev;
3359	}
3360	if (qp->replylist == rep)
3361		qp->replylist = rep->next;
3362
3363	cv_destroy(&rep->wait_cv);
3364	qp->rep_list_size--;
3365
3366	DTRACE_PROBE1(rpcib__i__remreply__listsize,
3367	    int, qp->rep_list_size);
3368
3369	kmem_free(rep, sizeof (*rep));
3370
3371	return (0);
3372}
3373
3374rdma_stat
3375rib_registermem(CONN *conn,  caddr_t adsp, caddr_t buf, uint_t buflen,
3376	struct mrc *buf_handle)
3377{
3378	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
3379	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
3380	rdma_stat	status;
3381	rib_hca_t	*hca = (ctoqp(conn))->hca;
3382
3383	/*
3384	 * Note: ALL buffer pools use the same memory type RDMARW.
3385	 */
3386	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
3387	if (status == RDMA_SUCCESS) {
3388		buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
3389		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
3390		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
3391	} else {
3392		buf_handle->mrc_linfo = NULL;
3393		buf_handle->mrc_lmr = 0;
3394		buf_handle->mrc_rmr = 0;
3395	}
3396	return (status);
3397}
3398
3399static rdma_stat
3400rib_reg_mem(rib_hca_t *hca, caddr_t adsp, caddr_t buf, uint_t size,
3401	ibt_mr_flags_t spec,
3402	ibt_mr_hdl_t *mr_hdlp, ibt_mr_desc_t *mr_descp)
3403{
3404	ibt_mr_attr_t	mem_attr;
3405	ibt_status_t	ibt_status;
3406	mem_attr.mr_vaddr = (uintptr_t)buf;
3407	mem_attr.mr_len = (ib_msglen_t)size;
3408	mem_attr.mr_as = (struct as *)(caddr_t)adsp;
3409	mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE |
3410	    IBT_MR_ENABLE_REMOTE_READ | IBT_MR_ENABLE_REMOTE_WRITE |
3411	    IBT_MR_ENABLE_WINDOW_BIND | spec;
3412
3413	rw_enter(&hca->state_lock, RW_READER);
3414	if (hca->state == HCA_INITED) {
3415		ibt_status = ibt_register_mr(hca->hca_hdl, hca->pd_hdl,
3416		    &mem_attr, mr_hdlp, mr_descp);
3417		rw_exit(&hca->state_lock);
3418	} else {
3419		rw_exit(&hca->state_lock);
3420		return (RDMA_FAILED);
3421	}
3422
3423	if (ibt_status != IBT_SUCCESS) {
3424		return (RDMA_FAILED);
3425	}
3426	return (RDMA_SUCCESS);
3427}
3428
3429rdma_stat
3430rib_registermemsync(CONN *conn,  caddr_t adsp, caddr_t buf, uint_t buflen,
3431	struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle, void *lrc)
3432{
3433	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
3434	rib_lrc_entry_t *l;
3435	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
3436	rdma_stat	status;
3437	rib_hca_t	*hca = (ctoqp(conn))->hca;
3438
3439	/*
3440	 * Non-coherent memory registration.
3441	 */
3442	l = (rib_lrc_entry_t *)lrc;
3443	if (l) {
3444		if (l->registered) {
3445			buf_handle->mrc_linfo =
3446			    (uintptr_t)l->lrc_mhandle.mrc_linfo;
3447			buf_handle->mrc_lmr =
3448			    (uint32_t)l->lrc_mhandle.mrc_lmr;
3449			buf_handle->mrc_rmr =
3450			    (uint32_t)l->lrc_mhandle.mrc_rmr;
3451			*sync_handle = (RIB_SYNCMEM_HANDLE)
3452			    (uintptr_t)l->lrc_mhandle.mrc_linfo;
3453			return (RDMA_SUCCESS);
3454		} else {
3455			/* Always register the whole buffer */
3456			buf = (caddr_t)l->lrc_buf;
3457			buflen = l->lrc_len;
3458		}
3459	}
3460	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
3461
3462	if (status == RDMA_SUCCESS) {
3463		if (l) {
3464			l->lrc_mhandle.mrc_linfo = (uintptr_t)mr_hdl;
3465			l->lrc_mhandle.mrc_lmr   = (uint32_t)mr_desc.md_lkey;
3466			l->lrc_mhandle.mrc_rmr   = (uint32_t)mr_desc.md_rkey;
3467			l->registered		 = TRUE;
3468		}
3469		buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
3470		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
3471		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
3472		*sync_handle = (RIB_SYNCMEM_HANDLE)mr_hdl;
3473	} else {
3474		buf_handle->mrc_linfo = NULL;
3475		buf_handle->mrc_lmr = 0;
3476		buf_handle->mrc_rmr = 0;
3477	}
3478	return (status);
3479}
3480
3481/* ARGSUSED */
3482rdma_stat
3483rib_deregistermem(CONN *conn, caddr_t buf, struct mrc buf_handle)
3484{
3485	rib_hca_t *hca = (ctoqp(conn))->hca;
3486	/*
3487	 * Allow memory deregistration even if HCA is
3488	 * getting detached. Need all outstanding
3489	 * memory registrations to be deregistered
3490	 * before HCA_DETACH_EVENT can be accepted.
3491	 */
3492	(void) ibt_deregister_mr(hca->hca_hdl,
3493	    (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
3494	return (RDMA_SUCCESS);
3495}
3496
3497/* ARGSUSED */
3498rdma_stat
3499rib_deregistermemsync(CONN *conn, caddr_t buf, struct mrc buf_handle,
3500		RIB_SYNCMEM_HANDLE sync_handle, void *lrc)
3501{
3502	rib_lrc_entry_t *l;
3503	l = (rib_lrc_entry_t *)lrc;
3504	if (l)
3505		if (l->registered)
3506			return (RDMA_SUCCESS);
3507
3508	(void) rib_deregistermem(conn, buf, buf_handle);
3509
3510	return (RDMA_SUCCESS);
3511}
3512
3513/* ARGSUSED */
3514rdma_stat
3515rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, caddr_t buf,
3516		int len, int cpu)
3517{
3518	ibt_status_t	status;
3519	rib_hca_t *hca = (ctoqp(conn))->hca;
3520	ibt_mr_sync_t	mr_segment;
3521
3522	mr_segment.ms_handle = (ibt_mr_hdl_t)shandle;
3523	mr_segment.ms_vaddr = (ib_vaddr_t)(uintptr_t)buf;
3524	mr_segment.ms_len = (ib_memlen_t)len;
3525	if (cpu) {
3526		/* make incoming data visible to memory */
3527		mr_segment.ms_flags = IBT_SYNC_WRITE;
3528	} else {
3529		/* make memory changes visible to IO */
3530		mr_segment.ms_flags = IBT_SYNC_READ;
3531	}
3532	rw_enter(&hca->state_lock, RW_READER);
3533	if (hca->state == HCA_INITED) {
3534		status = ibt_sync_mr(hca->hca_hdl, &mr_segment, 1);
3535		rw_exit(&hca->state_lock);
3536	} else {
3537		rw_exit(&hca->state_lock);
3538		return (RDMA_FAILED);
3539	}
3540
3541	if (status == IBT_SUCCESS)
3542		return (RDMA_SUCCESS);
3543	else {
3544		return (RDMA_FAILED);
3545	}
3546}
3547
3548/*
3549 * XXXX	????
3550 */
3551static rdma_stat
3552rib_getinfo(rdma_info_t *info)
3553{
3554	/*
3555	 * XXXX	Hack!
3556	 */
3557	info->addrlen = 16;
3558	info->mts = 1000000;
3559	info->mtu = 1000000;
3560
3561	return (RDMA_SUCCESS);
3562}
3563
3564rib_bufpool_t *
3565rib_rbufpool_create(rib_hca_t *hca, int ptype, int num)
3566{
3567	rib_bufpool_t	*rbp = NULL;
3568	bufpool_t	*bp = NULL;
3569	caddr_t		buf;
3570	ibt_mr_attr_t	mem_attr;
3571	ibt_status_t	ibt_status;
3572	int		i, j;
3573
3574	rbp = (rib_bufpool_t *)kmem_zalloc(sizeof (rib_bufpool_t), KM_SLEEP);
3575
3576	bp = (bufpool_t *)kmem_zalloc(sizeof (bufpool_t) +
3577	    num * sizeof (void *), KM_SLEEP);
3578
3579	mutex_init(&bp->buflock, NULL, MUTEX_DRIVER, hca->iblock);
3580	bp->numelems = num;
3581
3582
3583	switch (ptype) {
3584	case SEND_BUFFER:
3585		mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
3586		bp->rsize = RPC_MSG_SZ;
3587		break;
3588	case RECV_BUFFER:
3589		mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
3590		bp->rsize = RPC_BUF_SIZE;
3591		break;
3592	default:
3593		goto fail;
3594	}
3595
3596	/*
3597	 * Register the pool.
3598	 */
3599	bp->bufsize = num * bp->rsize;
3600	bp->buf = kmem_zalloc(bp->bufsize, KM_SLEEP);
3601	rbp->mr_hdl = (ibt_mr_hdl_t *)kmem_zalloc(num *
3602	    sizeof (ibt_mr_hdl_t), KM_SLEEP);
3603	rbp->mr_desc = (ibt_mr_desc_t *)kmem_zalloc(num *
3604	    sizeof (ibt_mr_desc_t), KM_SLEEP);
3605	rw_enter(&hca->state_lock, RW_READER);
3606
3607	if (hca->state != HCA_INITED) {
3608		rw_exit(&hca->state_lock);
3609		goto fail;
3610	}
3611
3612	for (i = 0, buf = bp->buf; i < num; i++, buf += bp->rsize) {
3613		bzero(&rbp->mr_desc[i], sizeof (ibt_mr_desc_t));
3614		mem_attr.mr_vaddr = (uintptr_t)buf;
3615		mem_attr.mr_len = (ib_msglen_t)bp->rsize;
3616		mem_attr.mr_as = NULL;
3617		ibt_status = ibt_register_mr(hca->hca_hdl,
3618		    hca->pd_hdl, &mem_attr,
3619		    &rbp->mr_hdl[i],
3620		    &rbp->mr_desc[i]);
3621		if (ibt_status != IBT_SUCCESS) {
3622			for (j = 0; j < i; j++) {
3623				(void) ibt_deregister_mr(hca->hca_hdl,
3624				    rbp->mr_hdl[j]);
3625			}
3626			rw_exit(&hca->state_lock);
3627			goto fail;
3628		}
3629	}
3630	rw_exit(&hca->state_lock);
3631	buf = (caddr_t)bp->buf;
3632	for (i = 0; i < num; i++, buf += bp->rsize) {
3633		bp->buflist[i] = (void *)buf;
3634	}
3635	bp->buffree = num - 1;	/* no. of free buffers */
3636	rbp->bpool = bp;
3637
3638	return (rbp);
3639fail:
3640	if (bp) {
3641		if (bp->buf)
3642			kmem_free(bp->buf, bp->bufsize);
3643		kmem_free(bp, sizeof (bufpool_t) + num*sizeof (void *));
3644	}
3645	if (rbp) {
3646		if (rbp->mr_hdl)
3647			kmem_free(rbp->mr_hdl, num*sizeof (ibt_mr_hdl_t));
3648		if (rbp->mr_desc)
3649			kmem_free(rbp->mr_desc, num*sizeof (ibt_mr_desc_t));
3650		kmem_free(rbp, sizeof (rib_bufpool_t));
3651	}
3652	return (NULL);
3653}
3654
3655static void
3656rib_rbufpool_deregister(rib_hca_t *hca, int ptype)
3657{
3658	int i;
3659	rib_bufpool_t *rbp = NULL;
3660	bufpool_t *bp;
3661
3662	/*
3663	 * Obtain pool address based on type of pool
3664	 */
3665	switch (ptype) {
3666		case SEND_BUFFER:
3667			rbp = hca->send_pool;
3668			break;
3669		case RECV_BUFFER:
3670			rbp = hca->recv_pool;
3671			break;
3672		default:
3673			return;
3674	}
3675	if (rbp == NULL)
3676		return;
3677
3678	bp = rbp->bpool;
3679
3680	/*
3681	 * Deregister the pool memory and free it.
3682	 */
3683	for (i = 0; i < bp->numelems; i++) {
3684		(void) ibt_deregister_mr(hca->hca_hdl, rbp->mr_hdl[i]);
3685	}
3686}
3687
3688static void
3689rib_rbufpool_free(rib_hca_t *hca, int ptype)
3690{
3691
3692	rib_bufpool_t *rbp = NULL;
3693	bufpool_t *bp;
3694
3695	/*
3696	 * Obtain pool address based on type of pool
3697	 */
3698	switch (ptype) {
3699		case SEND_BUFFER:
3700			rbp = hca->send_pool;
3701			break;
3702		case RECV_BUFFER:
3703			rbp = hca->recv_pool;
3704			break;
3705		default:
3706			return;
3707	}
3708	if (rbp == NULL)
3709		return;
3710
3711	bp = rbp->bpool;
3712
3713	/*
3714	 * Free the pool memory.
3715	 */
3716	if (rbp->mr_hdl)
3717		kmem_free(rbp->mr_hdl, bp->numelems*sizeof (ibt_mr_hdl_t));
3718
3719	if (rbp->mr_desc)
3720		kmem_free(rbp->mr_desc, bp->numelems*sizeof (ibt_mr_desc_t));
3721	if (bp->buf)
3722		kmem_free(bp->buf, bp->bufsize);
3723	mutex_destroy(&bp->buflock);
3724	kmem_free(bp, sizeof (bufpool_t) + bp->numelems*sizeof (void *));
3725	kmem_free(rbp, sizeof (rib_bufpool_t));
3726}
3727
3728void
3729rib_rbufpool_destroy(rib_hca_t *hca, int ptype)
3730{
3731	/*
3732	 * Deregister the pool memory and free it.
3733	 */
3734	rib_rbufpool_deregister(hca, ptype);
3735	rib_rbufpool_free(hca, ptype);
3736}
3737
3738/*
3739 * Fetch a buffer from the pool of type specified in rdbuf->type.
3740 */
3741static rdma_stat
3742rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf)
3743{
3744	rib_lrc_entry_t *rlep;
3745
3746	if (rdbuf->type ==  RDMA_LONG_BUFFER) {
3747		rlep = rib_get_cache_buf(conn, rdbuf->len);
3748		rdbuf->rb_private =  (caddr_t)rlep;
3749		rdbuf->addr = rlep->lrc_buf;
3750		rdbuf->handle = rlep->lrc_mhandle;
3751		return (RDMA_SUCCESS);
3752	}
3753
3754	rdbuf->addr = rib_rbuf_alloc(conn, rdbuf);
3755	if (rdbuf->addr) {
3756		switch (rdbuf->type) {
3757		case SEND_BUFFER:
3758			rdbuf->len = RPC_MSG_SZ;	/* 1K */
3759			break;
3760		case RECV_BUFFER:
3761			rdbuf->len = RPC_BUF_SIZE; /* 2K */
3762			break;
3763		default:
3764			rdbuf->len = 0;
3765		}
3766		return (RDMA_SUCCESS);
3767	} else
3768		return (RDMA_FAILED);
3769}
3770
3771#if defined(MEASURE_POOL_DEPTH)
3772static void rib_recv_bufs(uint32_t x) {
3773
3774}
3775
3776static void rib_send_bufs(uint32_t x) {
3777
3778}
3779#endif
3780
3781/*
3782 * Fetch a buffer of specified type.
3783 * Note that rdbuf->handle is mw's rkey.
3784 */
3785static void *
3786rib_rbuf_alloc(CONN *conn, rdma_buf_t *rdbuf)
3787{
3788	rib_qp_t	*qp = ctoqp(conn);
3789	rib_hca_t	*hca = qp->hca;
3790	rdma_btype	ptype = rdbuf->type;
3791	void		*buf;
3792	rib_bufpool_t	*rbp = NULL;
3793	bufpool_t	*bp;
3794	int		i;
3795
3796	/*
3797	 * Obtain pool address based on type of pool
3798	 */
3799	switch (ptype) {
3800	case SEND_BUFFER:
3801		rbp = hca->send_pool;
3802		break;
3803	case RECV_BUFFER:
3804		rbp = hca->recv_pool;
3805		break;
3806	default:
3807		return (NULL);
3808	}
3809	if (rbp == NULL)
3810		return (NULL);
3811
3812	bp = rbp->bpool;
3813
3814	mutex_enter(&bp->buflock);
3815	if (bp->buffree < 0) {
3816		mutex_exit(&bp->buflock);
3817		return (NULL);
3818	}
3819
3820	/* XXXX put buf, rdbuf->handle.mrc_rmr, ... in one place. */
3821	buf = bp->buflist[bp->buffree];
3822	rdbuf->addr = buf;
3823	rdbuf->len = bp->rsize;
3824	for (i = bp->numelems - 1; i >= 0; i--) {
3825		if ((ib_vaddr_t)(uintptr_t)buf == rbp->mr_desc[i].md_vaddr) {
3826			rdbuf->handle.mrc_rmr =
3827			    (uint32_t)rbp->mr_desc[i].md_rkey;
3828			rdbuf->handle.mrc_linfo =
3829			    (uintptr_t)rbp->mr_hdl[i];
3830			rdbuf->handle.mrc_lmr =
3831			    (uint32_t)rbp->mr_desc[i].md_lkey;
3832#if defined(MEASURE_POOL_DEPTH)
3833			if (ptype == SEND_BUFFER)
3834				rib_send_bufs(MAX_BUFS - (bp->buffree+1));
3835			if (ptype == RECV_BUFFER)
3836				rib_recv_bufs(MAX_BUFS - (bp->buffree+1));
3837#endif
3838			bp->buffree--;
3839
3840			mutex_exit(&bp->buflock);
3841
3842			return (buf);
3843		}
3844	}
3845
3846	mutex_exit(&bp->buflock);
3847
3848	return (NULL);
3849}
3850
3851static void
3852rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf)
3853{
3854
3855	if (rdbuf->type == RDMA_LONG_BUFFER) {
3856		rib_free_cache_buf(conn, (rib_lrc_entry_t *)rdbuf->rb_private);
3857		rdbuf->rb_private = NULL;
3858		return;
3859	}
3860	rib_rbuf_free(conn, rdbuf->type, rdbuf->addr);
3861}
3862
3863static void
3864rib_rbuf_free(CONN *conn, int ptype, void *buf)
3865{
3866	rib_qp_t *qp = ctoqp(conn);
3867	rib_hca_t *hca = qp->hca;
3868	rib_bufpool_t *rbp = NULL;
3869	bufpool_t *bp;
3870
3871	/*
3872	 * Obtain pool address based on type of pool
3873	 */
3874	switch (ptype) {
3875	case SEND_BUFFER:
3876		rbp = hca->send_pool;
3877		break;
3878	case RECV_BUFFER:
3879		rbp = hca->recv_pool;
3880		break;
3881	default:
3882		return;
3883	}
3884	if (rbp == NULL)
3885		return;
3886
3887	bp = rbp->bpool;
3888
3889	mutex_enter(&bp->buflock);
3890	if (++bp->buffree >= bp->numelems) {
3891		/*
3892		 * Should never happen
3893		 */
3894		bp->buffree--;
3895	} else {
3896		bp->buflist[bp->buffree] = buf;
3897	}
3898	mutex_exit(&bp->buflock);
3899}
3900
3901static rdma_stat
3902rib_add_connlist(CONN *cn, rib_conn_list_t *connlist)
3903{
3904	rw_enter(&connlist->conn_lock, RW_WRITER);
3905	if (connlist->conn_hd) {
3906		cn->c_next = connlist->conn_hd;
3907		connlist->conn_hd->c_prev = cn;
3908	}
3909	connlist->conn_hd = cn;
3910	rw_exit(&connlist->conn_lock);
3911
3912	return (RDMA_SUCCESS);
3913}
3914
3915static rdma_stat
3916rib_rm_conn(CONN *cn, rib_conn_list_t *connlist)
3917{
3918	rw_enter(&connlist->conn_lock, RW_WRITER);
3919	if (cn->c_prev) {
3920		cn->c_prev->c_next = cn->c_next;
3921	}
3922	if (cn->c_next) {
3923		cn->c_next->c_prev = cn->c_prev;
3924	}
3925	if (connlist->conn_hd == cn)
3926		connlist->conn_hd = cn->c_next;
3927	rw_exit(&connlist->conn_lock);
3928
3929	return (RDMA_SUCCESS);
3930}
3931
3932/*
3933 * Connection management.
3934 * IBTF does not support recycling of channels. So connections are only
3935 * in four states - C_CONN_PEND, or C_CONNECTED, or C_ERROR_CONN or
3936 * C_DISCONN_PEND state. No C_IDLE state.
3937 * C_CONN_PEND state: Connection establishment in progress to the server.
3938 * C_CONNECTED state: A connection when created is in C_CONNECTED state.
3939 * It has an RC channel associated with it. ibt_post_send/recv are allowed
3940 * only in this state.
3941 * C_ERROR_CONN state: A connection transitions to this state when WRs on the
3942 * channel are completed in error or an IBT_CM_EVENT_CONN_CLOSED event
3943 * happens on the channel or a IBT_HCA_DETACH_EVENT occurs on the HCA.
3944 * C_DISCONN_PEND state: When a connection is in C_ERROR_CONN state and when
3945 * c_ref drops to 0 (this indicates that RPC has no more references to this
3946 * connection), the connection should be destroyed. A connection transitions
3947 * into this state when it is being destroyed.
3948 */
3949static rdma_stat
3950rib_conn_get(struct netbuf *svcaddr, int addr_type, void *handle, CONN **conn)
3951{
3952	CONN *cn;
3953	int status = RDMA_SUCCESS;
3954	rib_hca_t *hca = (rib_hca_t *)handle;
3955	rib_qp_t *qp;
3956	clock_t cv_stat, timout;
3957	ibt_path_info_t path;
3958	ibt_ip_addr_t s_ip, d_ip;
3959
3960again:
3961	rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
3962	cn = hca->cl_conn_list.conn_hd;
3963	while (cn != NULL) {
3964		/*
3965		 * First, clear up any connection in the ERROR state
3966		 */
3967		mutex_enter(&cn->c_lock);
3968		if (cn->c_state == C_ERROR_CONN) {
3969			if (cn->c_ref == 0) {
3970				/*
3971				 * Remove connection from list and destroy it.
3972				 */
3973				cn->c_state = C_DISCONN_PEND;
3974				mutex_exit(&cn->c_lock);
3975				rw_exit(&hca->cl_conn_list.conn_lock);
3976				(void) rib_disconnect_channel(cn,
3977				    &hca->cl_conn_list);
3978				goto again;
3979			}
3980			mutex_exit(&cn->c_lock);
3981			cn = cn->c_next;
3982			continue;
3983		}
3984		if (cn->c_state == C_DISCONN_PEND) {
3985			mutex_exit(&cn->c_lock);
3986			cn = cn->c_next;
3987			continue;
3988		}
3989		if ((cn->c_raddr.len == svcaddr->len) &&
3990		    bcmp(svcaddr->buf, cn->c_raddr.buf, svcaddr->len) == 0) {
3991			/*
3992			 * Our connection. Give up conn list lock
3993			 * as we are done traversing the list.
3994			 */
3995			rw_exit(&hca->cl_conn_list.conn_lock);
3996			if (cn->c_state == C_CONNECTED) {
3997				cn->c_ref++;	/* sharing a conn */
3998				mutex_exit(&cn->c_lock);
3999				*conn = cn;
4000				return (status);
4001			}
4002			if (cn->c_state == C_CONN_PEND) {
4003				/*
4004				 * Hold a reference to this conn before
4005				 * we give up the lock.
4006				 */
4007				cn->c_ref++;
4008				timout =  ddi_get_lbolt() +
4009				    drv_usectohz(CONN_WAIT_TIME * 1000000);
4010				while ((cv_stat = cv_timedwait_sig(&cn->c_cv,
4011				    &cn->c_lock, timout)) > 0 &&
4012				    cn->c_state == C_CONN_PEND)
4013					;
4014				if (cv_stat == 0) {
4015					cn->c_ref--;
4016					mutex_exit(&cn->c_lock);
4017					return (RDMA_INTR);
4018				}
4019				if (cv_stat < 0) {
4020					cn->c_ref--;
4021					mutex_exit(&cn->c_lock);
4022					return (RDMA_TIMEDOUT);
4023				}
4024				if (cn->c_state == C_CONNECTED) {
4025					*conn = cn;
4026					mutex_exit(&cn->c_lock);
4027					return (status);
4028				} else {
4029					cn->c_ref--;
4030					mutex_exit(&cn->c_lock);
4031					return (RDMA_TIMEDOUT);
4032				}
4033			}
4034		}
4035		mutex_exit(&cn->c_lock);
4036		cn = cn->c_next;
4037	}
4038	rw_exit(&hca->cl_conn_list.conn_lock);
4039
4040	bzero(&path, sizeof (ibt_path_info_t));
4041	bzero(&s_ip, sizeof (ibt_ip_addr_t));
4042	bzero(&d_ip, sizeof (ibt_ip_addr_t));
4043
4044	status = rib_chk_srv_ibaddr(svcaddr, addr_type, &path, &s_ip, &d_ip);
4045	if (status != RDMA_SUCCESS) {
4046		return (RDMA_FAILED);
4047	}
4048
4049	/*
4050	 * Channel to server doesn't exist yet, create one.
4051	 */
4052	if (rib_clnt_create_chan(hca, svcaddr, &qp) != RDMA_SUCCESS) {
4053		return (RDMA_FAILED);
4054	}
4055	cn = qptoc(qp);
4056	cn->c_state = C_CONN_PEND;
4057	cn->c_ref = 1;
4058
4059	/*
4060	 * Add to conn list.
4061	 * We had given up the READER lock. In the time since then,
4062	 * another thread might have created the connection we are
4063	 * trying here. But for now, that is quiet alright - there
4064	 * might be two connections between a pair of hosts instead
4065	 * of one. If we really want to close that window,
4066	 * then need to check the list after acquiring the
4067	 * WRITER lock.
4068	 */
4069	(void) rib_add_connlist(cn, &hca->cl_conn_list);
4070	status = rib_conn_to_srv(hca, qp, &path, &s_ip, &d_ip);
4071	mutex_enter(&cn->c_lock);
4072	if (status == RDMA_SUCCESS) {
4073		cn->c_state = C_CONNECTED;
4074		*conn = cn;
4075	} else {
4076		cn->c_state = C_ERROR_CONN;
4077		cn->c_ref--;
4078	}
4079	cv_broadcast(&cn->c_cv);
4080	mutex_exit(&cn->c_lock);
4081	return (status);
4082}
4083
4084static rdma_stat
4085rib_conn_release(CONN *conn)
4086{
4087	rib_qp_t	*qp = ctoqp(conn);
4088
4089	mutex_enter(&conn->c_lock);
4090	conn->c_ref--;
4091
4092	/*
4093	 * If a conn is C_ERROR_CONN, close the channel.
4094	 * If it's CONNECTED, keep it that way.
4095	 */
4096	if (conn->c_ref == 0 && conn->c_state == C_ERROR_CONN) {
4097		conn->c_state = C_DISCONN_PEND;
4098		mutex_exit(&conn->c_lock);
4099		if (qp->mode == RIB_SERVER)
4100			(void) rib_disconnect_channel(conn,
4101			    &qp->hca->srv_conn_list);
4102		else
4103			(void) rib_disconnect_channel(conn,
4104			    &qp->hca->cl_conn_list);
4105		return (RDMA_SUCCESS);
4106	}
4107	mutex_exit(&conn->c_lock);
4108	return (RDMA_SUCCESS);
4109}
4110
4111/*
4112 * Add at front of list
4113 */
4114static struct rdma_done_list *
4115rdma_done_add(rib_qp_t *qp, uint32_t xid)
4116{
4117	struct rdma_done_list *rd;
4118
4119	ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4120
4121	rd = kmem_alloc(sizeof (*rd), KM_SLEEP);
4122	rd->xid = xid;
4123	cv_init(&rd->rdma_done_cv, NULL, CV_DEFAULT, NULL);
4124
4125	rd->prev = NULL;
4126	rd->next = qp->rdlist;
4127	if (qp->rdlist != NULL)
4128		qp->rdlist->prev = rd;
4129	qp->rdlist = rd;
4130
4131	return (rd);
4132}
4133
4134static void
4135rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd)
4136{
4137	struct rdma_done_list *r;
4138
4139	ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4140
4141	r = rd->next;
4142	if (r != NULL) {
4143		r->prev = rd->prev;
4144	}
4145
4146	r = rd->prev;
4147	if (r != NULL) {
4148		r->next = rd->next;
4149	} else {
4150		qp->rdlist = rd->next;
4151	}
4152
4153	cv_destroy(&rd->rdma_done_cv);
4154	kmem_free(rd, sizeof (*rd));
4155}
4156
4157static void
4158rdma_done_rem_list(rib_qp_t *qp)
4159{
4160	struct rdma_done_list	*r, *n;
4161
4162	mutex_enter(&qp->rdlist_lock);
4163	for (r = qp->rdlist; r != NULL; r = n) {
4164		n = r->next;
4165		rdma_done_rm(qp, r);
4166	}
4167	mutex_exit(&qp->rdlist_lock);
4168}
4169
4170static void
4171rdma_done_notify(rib_qp_t *qp, uint32_t xid)
4172{
4173	struct rdma_done_list *r = qp->rdlist;
4174
4175	ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4176
4177	while (r) {
4178		if (r->xid == xid) {
4179			cv_signal(&r->rdma_done_cv);
4180			return;
4181		} else {
4182			r = r->next;
4183		}
4184	}
4185	DTRACE_PROBE1(rpcib__i__donenotify__nomatchxid,
4186	    int, xid);
4187}
4188
4189
4190/*
4191 * Goes through all connections and closes the channel
4192 * This will cause all the WRs on those channels to be
4193 * flushed.
4194 */
4195static void
4196rib_close_channels(rib_conn_list_t *connlist)
4197{
4198	CONN 		*conn;
4199	rib_qp_t	*qp;
4200
4201	rw_enter(&connlist->conn_lock, RW_READER);
4202	conn = connlist->conn_hd;
4203	while (conn != NULL) {
4204		mutex_enter(&conn->c_lock);
4205		qp = ctoqp(conn);
4206		if (conn->c_state == C_CONNECTED) {
4207			/*
4208			 * Live connection in CONNECTED state.
4209			 * Call ibt_close_rc_channel in nonblocking mode
4210			 * with no callbacks.
4211			 */
4212			conn->c_state = C_ERROR_CONN;
4213			(void) ibt_close_rc_channel(qp->qp_hdl,
4214			    IBT_NOCALLBACKS, NULL, 0, NULL, NULL, 0);
4215			(void) ibt_free_channel(qp->qp_hdl);
4216			qp->qp_hdl = NULL;
4217		} else {
4218			if (conn->c_state == C_ERROR_CONN &&
4219			    qp->qp_hdl != NULL) {
4220				/*
4221				 * Connection in ERROR state but
4222				 * channel is not yet freed.
4223				 */
4224				(void) ibt_close_rc_channel(qp->qp_hdl,
4225				    IBT_NOCALLBACKS, NULL, 0, NULL,
4226				    NULL, 0);
4227				(void) ibt_free_channel(qp->qp_hdl);
4228				qp->qp_hdl = NULL;
4229			}
4230		}
4231		mutex_exit(&conn->c_lock);
4232		conn = conn->c_next;
4233	}
4234	rw_exit(&connlist->conn_lock);
4235}
4236
4237/*
4238 * Frees up all connections that are no longer being referenced
4239 */
4240static void
4241rib_purge_connlist(rib_conn_list_t *connlist)
4242{
4243	CONN 		*conn;
4244
4245top:
4246	rw_enter(&connlist->conn_lock, RW_READER);
4247	conn = connlist->conn_hd;
4248	while (conn != NULL) {
4249		mutex_enter(&conn->c_lock);
4250
4251		/*
4252		 * At this point connection is either in ERROR
4253		 * or DISCONN_PEND state. If in DISCONN_PEND state
4254		 * then some other thread is culling that connection.
4255		 * If not and if c_ref is 0, then destroy the connection.
4256		 */
4257		if (conn->c_ref == 0 &&
4258		    conn->c_state != C_DISCONN_PEND) {
4259			/*
4260			 * Cull the connection
4261			 */
4262			conn->c_state = C_DISCONN_PEND;
4263			mutex_exit(&conn->c_lock);
4264			rw_exit(&connlist->conn_lock);
4265			(void) rib_disconnect_channel(conn, connlist);
4266			goto top;
4267		} else {
4268			/*
4269			 * conn disconnect already scheduled or will
4270			 * happen from conn_release when c_ref drops to 0.
4271			 */
4272			mutex_exit(&conn->c_lock);
4273		}
4274		conn = conn->c_next;
4275	}
4276	rw_exit(&connlist->conn_lock);
4277
4278	/*
4279	 * At this point, only connections with c_ref != 0 are on the list
4280	 */
4281}
4282
4283/*
4284 * Cleans and closes up all uses of the HCA
4285 */
4286static void
4287rib_detach_hca(rib_hca_t *hca)
4288{
4289
4290	/*
4291	 * Stop all services on the HCA
4292	 * Go through cl_conn_list and close all rc_channels
4293	 * Go through svr_conn_list and close all rc_channels
4294	 * Free connections whose c_ref has dropped to 0
4295	 * Destroy all CQs
4296	 * Deregister and released all buffer pool memory after all
4297	 * connections are destroyed
4298	 * Free the protection domain
4299	 * ibt_close_hca()
4300	 */
4301	rw_enter(&hca->state_lock, RW_WRITER);
4302	if (hca->state == HCA_DETACHED) {
4303		rw_exit(&hca->state_lock);
4304		return;
4305	}
4306
4307	hca->state = HCA_DETACHED;
4308	rib_stat->nhca_inited--;
4309
4310	rib_stop_services(hca);
4311	rib_close_channels(&hca->cl_conn_list);
4312	rib_close_channels(&hca->srv_conn_list);
4313	rw_exit(&hca->state_lock);
4314
4315	rib_purge_connlist(&hca->cl_conn_list);
4316	rib_purge_connlist(&hca->srv_conn_list);
4317
4318	(void) ibt_free_cq(hca->clnt_rcq->rib_cq_hdl);
4319	(void) ibt_free_cq(hca->clnt_scq->rib_cq_hdl);
4320	(void) ibt_free_cq(hca->svc_rcq->rib_cq_hdl);
4321	(void) ibt_free_cq(hca->svc_scq->rib_cq_hdl);
4322	kmem_free(hca->clnt_rcq, sizeof (rib_cq_t));
4323	kmem_free(hca->clnt_scq, sizeof (rib_cq_t));
4324	kmem_free(hca->svc_rcq, sizeof (rib_cq_t));
4325	kmem_free(hca->svc_scq, sizeof (rib_cq_t));
4326
4327	rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
4328	rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
4329	if (hca->srv_conn_list.conn_hd == NULL &&
4330	    hca->cl_conn_list.conn_hd == NULL) {
4331		/*
4332		 * conn_lists are NULL, so destroy
4333		 * buffers, close hca and be done.
4334		 */
4335		rib_rbufpool_destroy(hca, RECV_BUFFER);
4336		rib_rbufpool_destroy(hca, SEND_BUFFER);
4337		rib_destroy_cache(hca);
4338		(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
4339		(void) ibt_close_hca(hca->hca_hdl);
4340		hca->hca_hdl = NULL;
4341	}
4342	rw_exit(&hca->cl_conn_list.conn_lock);
4343	rw_exit(&hca->srv_conn_list.conn_lock);
4344
4345	if (hca->hca_hdl != NULL) {
4346		mutex_enter(&hca->inuse_lock);
4347		while (hca->inuse)
4348			cv_wait(&hca->cb_cv, &hca->inuse_lock);
4349		mutex_exit(&hca->inuse_lock);
4350		/*
4351		 * conn_lists are now NULL, so destroy
4352		 * buffers, close hca and be done.
4353		 */
4354		rib_rbufpool_destroy(hca, RECV_BUFFER);
4355		rib_rbufpool_destroy(hca, SEND_BUFFER);
4356		(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
4357		(void) ibt_close_hca(hca->hca_hdl);
4358		hca->hca_hdl = NULL;
4359	}
4360}
4361
4362static void
4363rib_server_side_cache_reclaim(void *argp)
4364{
4365	cache_avl_struct_t    *rcas;
4366	rib_lrc_entry_t		*rb;
4367	rib_hca_t *hca = (rib_hca_t *)argp;
4368
4369	rw_enter(&hca->avl_rw_lock, RW_WRITER);
4370	rcas = avl_first(&hca->avl_tree);
4371	if (rcas != NULL)
4372		avl_remove(&hca->avl_tree, rcas);
4373
4374	while (rcas != NULL) {
4375		while (rcas->r.forw != &rcas->r) {
4376			rcas->elements--;
4377			rib_total_buffers --;
4378			rb = rcas->r.forw;
4379			remque(rb);
4380			if (rb->registered)
4381				(void) rib_deregistermem_via_hca(hca,
4382				    rb->lrc_buf, rb->lrc_mhandle);
4383			cache_allocation -= rb->lrc_len;
4384			kmem_free(rb->lrc_buf, rb->lrc_len);
4385			kmem_free(rb, sizeof (rib_lrc_entry_t));
4386		}
4387		mutex_destroy(&rcas->node_lock);
4388		kmem_cache_free(hca->server_side_cache, rcas);
4389		rcas = avl_first(&hca->avl_tree);
4390		if (rcas != NULL)
4391			avl_remove(&hca->avl_tree, rcas);
4392	}
4393	rw_exit(&hca->avl_rw_lock);
4394}
4395
4396static void
4397rib_server_side_cache_cleanup(void *argp)
4398{
4399	cache_avl_struct_t    *rcas;
4400	rib_lrc_entry_t		*rb;
4401	rib_hca_t *hca = (rib_hca_t *)argp;
4402
4403	rw_enter(&hca->avl_rw_lock, RW_READER);
4404	if (cache_allocation < cache_limit) {
4405		rw_exit(&hca->avl_rw_lock);
4406		return;
4407	}
4408	rw_exit(&hca->avl_rw_lock);
4409
4410	rw_enter(&hca->avl_rw_lock, RW_WRITER);
4411	rcas = avl_last(&hca->avl_tree);
4412	if (rcas != NULL)
4413		avl_remove(&hca->avl_tree, rcas);
4414
4415	while (rcas != NULL) {
4416		while (rcas->r.forw != &rcas->r) {
4417			rcas->elements--;
4418			rib_total_buffers --;
4419			rb = rcas->r.forw;
4420			remque(rb);
4421			if (rb->registered)
4422				(void) rib_deregistermem_via_hca(hca,
4423				    rb->lrc_buf, rb->lrc_mhandle);
4424			cache_allocation -= rb->lrc_len;
4425			kmem_free(rb->lrc_buf, rb->lrc_len);
4426			kmem_free(rb, sizeof (rib_lrc_entry_t));
4427		}
4428		mutex_destroy(&rcas->node_lock);
4429		kmem_cache_free(hca->server_side_cache, rcas);
4430		if ((cache_allocation) < cache_limit) {
4431			rw_exit(&hca->avl_rw_lock);
4432			return;
4433		}
4434
4435		rcas = avl_last(&hca->avl_tree);
4436		if (rcas != NULL)
4437			avl_remove(&hca->avl_tree, rcas);
4438	}
4439	rw_exit(&hca->avl_rw_lock);
4440}
4441
4442static int
4443avl_compare(const void *t1, const void *t2)
4444{
4445	if (((cache_avl_struct_t *)t1)->len == ((cache_avl_struct_t *)t2)->len)
4446		return (0);
4447
4448	if (((cache_avl_struct_t *)t1)->len < ((cache_avl_struct_t *)t2)->len)
4449		return (-1);
4450
4451	return (1);
4452}
4453
4454static void
4455rib_destroy_cache(rib_hca_t *hca)
4456{
4457	if (hca->reg_cache_clean_up != NULL) {
4458		ddi_taskq_destroy(hca->reg_cache_clean_up);
4459		hca->reg_cache_clean_up = NULL;
4460	}
4461	if (!hca->avl_init) {
4462		kmem_cache_destroy(hca->server_side_cache);
4463		avl_destroy(&hca->avl_tree);
4464		mutex_destroy(&hca->cache_allocation);
4465		rw_destroy(&hca->avl_rw_lock);
4466	}
4467	hca->avl_init = FALSE;
4468}
4469
4470static void
4471rib_force_cleanup(void *hca)
4472{
4473	if (((rib_hca_t *)hca)->reg_cache_clean_up != NULL)
4474		(void) ddi_taskq_dispatch(
4475		    ((rib_hca_t *)hca)->reg_cache_clean_up,
4476		    rib_server_side_cache_cleanup,
4477		    (void *)hca, DDI_NOSLEEP);
4478}
4479
4480static rib_lrc_entry_t *
4481rib_get_cache_buf(CONN *conn, uint32_t len)
4482{
4483	cache_avl_struct_t	cas, *rcas;
4484	rib_hca_t	*hca = (ctoqp(conn))->hca;
4485	rib_lrc_entry_t *reply_buf;
4486	avl_index_t where = NULL;
4487	uint64_t c_alloc = 0;
4488
4489	if (!hca->avl_init)
4490		goto  error_alloc;
4491
4492	cas.len = len;
4493
4494	rw_enter(&hca->avl_rw_lock, RW_READER);
4495
4496	mutex_enter(&hca->cache_allocation);
4497	c_alloc = cache_allocation;
4498	mutex_exit(&hca->cache_allocation);
4499
4500	if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree, &cas,
4501	    &where)) == NULL) {
4502		/* Am I above the cache limit */
4503		if ((c_alloc + len) >= cache_limit) {
4504			rib_force_cleanup((void *)hca);
4505			rw_exit(&hca->avl_rw_lock);
4506			cache_misses_above_the_limit ++;
4507
4508			/* Allocate and register the buffer directly */
4509			goto error_alloc;
4510		}
4511
4512		rw_exit(&hca->avl_rw_lock);
4513		rw_enter(&hca->avl_rw_lock, RW_WRITER);
4514
4515		/* Recheck to make sure no other thread added the entry in */
4516		if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree,
4517		    &cas, &where)) == NULL) {
4518			/* Allocate an avl tree entry */
4519			rcas = (cache_avl_struct_t *)
4520			    kmem_cache_alloc(hca->server_side_cache, KM_SLEEP);
4521
4522			bzero(rcas, sizeof (cache_avl_struct_t));
4523			rcas->elements = 0;
4524			rcas->r.forw = &rcas->r;
4525			rcas->r.back = &rcas->r;
4526			rcas->len = len;
4527			mutex_init(&rcas->node_lock, NULL, MUTEX_DEFAULT, NULL);
4528			avl_insert(&hca->avl_tree, rcas, where);
4529		}
4530	}
4531
4532	mutex_enter(&rcas->node_lock);
4533
4534	if (rcas->r.forw != &rcas->r && rcas->elements > 0) {
4535		rib_total_buffers--;
4536		cache_hits++;
4537		reply_buf = rcas->r.forw;
4538		remque(reply_buf);
4539		rcas->elements--;
4540		mutex_exit(&rcas->node_lock);
4541		rw_exit(&hca->avl_rw_lock);
4542		mutex_enter(&hca->cache_allocation);
4543		cache_allocation -= len;
4544		mutex_exit(&hca->cache_allocation);
4545	} else {
4546		/* Am I above the cache limit */
4547		mutex_exit(&rcas->node_lock);
4548		if ((c_alloc + len) >= cache_limit) {
4549			rib_force_cleanup((void *)hca);
4550			rw_exit(&hca->avl_rw_lock);
4551			cache_misses_above_the_limit ++;
4552			/* Allocate and register the buffer directly */
4553			goto error_alloc;
4554		}
4555		rw_exit(&hca->avl_rw_lock);
4556		cache_misses ++;
4557		/* Allocate a reply_buf entry */
4558		reply_buf = (rib_lrc_entry_t *)
4559		    kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
4560		bzero(reply_buf, sizeof (rib_lrc_entry_t));
4561		reply_buf->lrc_buf  = kmem_alloc(len, KM_SLEEP);
4562		reply_buf->lrc_len  = len;
4563		reply_buf->registered = FALSE;
4564		reply_buf->avl_node = (void *)rcas;
4565	}
4566
4567	return (reply_buf);
4568
4569error_alloc:
4570	reply_buf = (rib_lrc_entry_t *)
4571	    kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
4572	bzero(reply_buf, sizeof (rib_lrc_entry_t));
4573	reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP);
4574	reply_buf->lrc_len = len;
4575	reply_buf->registered = FALSE;
4576	reply_buf->avl_node = NULL;
4577
4578	return (reply_buf);
4579}
4580
4581/*
4582 * Return a pre-registered back to the cache (without
4583 * unregistering the buffer)..
4584 */
4585
4586static void
4587rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *reg_buf)
4588{
4589	cache_avl_struct_t    cas, *rcas;
4590	avl_index_t where = NULL;
4591	rib_hca_t	*hca = (ctoqp(conn))->hca;
4592
4593	if (!hca->avl_init)
4594		goto  error_free;
4595
4596	cas.len = reg_buf->lrc_len;
4597	rw_enter(&hca->avl_rw_lock, RW_READER);
4598	if ((rcas = (cache_avl_struct_t *)
4599	    avl_find(&hca->avl_tree, &cas, &where)) == NULL) {
4600		rw_exit(&hca->avl_rw_lock);
4601		goto error_free;
4602	} else {
4603		rib_total_buffers ++;
4604		cas.len = reg_buf->lrc_len;
4605		mutex_enter(&rcas->node_lock);
4606		insque(reg_buf, &rcas->r);
4607		rcas->elements ++;
4608		mutex_exit(&rcas->node_lock);
4609		rw_exit(&hca->avl_rw_lock);
4610		mutex_enter(&hca->cache_allocation);
4611		cache_allocation += cas.len;
4612		mutex_exit(&hca->cache_allocation);
4613	}
4614
4615	return;
4616
4617error_free:
4618
4619	if (reg_buf->registered)
4620		(void) rib_deregistermem_via_hca(hca,
4621		    reg_buf->lrc_buf, reg_buf->lrc_mhandle);
4622	kmem_free(reg_buf->lrc_buf, reg_buf->lrc_len);
4623	kmem_free(reg_buf, sizeof (rib_lrc_entry_t));
4624}
4625
4626static rdma_stat
4627rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp, caddr_t buf,
4628	uint_t buflen, struct mrc *buf_handle)
4629{
4630	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
4631	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
4632	rdma_stat	status;
4633
4634
4635	/*
4636	 * Note: ALL buffer pools use the same memory type RDMARW.
4637	 */
4638	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
4639	if (status == RDMA_SUCCESS) {
4640		buf_handle->mrc_linfo = (uint64_t)(uintptr_t)mr_hdl;
4641		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
4642		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
4643	} else {
4644		buf_handle->mrc_linfo = NULL;
4645		buf_handle->mrc_lmr = 0;
4646		buf_handle->mrc_rmr = 0;
4647	}
4648	return (status);
4649}
4650
4651/* ARGSUSED */
4652static rdma_stat
4653rib_deregistermemsync_via_hca(rib_hca_t *hca, caddr_t buf,
4654    struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle)
4655{
4656
4657	(void) rib_deregistermem_via_hca(hca, buf, buf_handle);
4658	return (RDMA_SUCCESS);
4659}
4660
4661/* ARGSUSED */
4662static rdma_stat
4663rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf, struct mrc buf_handle)
4664{
4665
4666	(void) ibt_deregister_mr(hca->hca_hdl,
4667	    (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
4668	return (RDMA_SUCCESS);
4669}
4670
4671
4672/*
4673 * Return 0 if the interface is IB.
4674 * Return error (>0) if any error is encountered during processing.
4675 * Return -1 if the interface is not IB and no error.
4676 */
4677#define	isalpha(ch)	(((ch) >= 'a' && (ch) <= 'z') || \
4678			((ch) >= 'A' && (ch) <= 'Z'))
4679static int
4680rpcib_is_ib_interface(char *name)
4681{
4682
4683	char	dev_path[MAXPATHLEN];
4684	char	devname[MAXNAMELEN];
4685	ldi_handle_t	lh;
4686	dl_info_ack_t	info;
4687	int	ret = 0;
4688	int	i;
4689
4690	/*
4691	 * ibd devices are only style 2 devices
4692	 * so we will open only style 2 devices
4693	 * by ignoring the ppa
4694	 */
4695
4696	i = strlen(name) - 1;
4697	while ((i >= 0) && (!isalpha(name[i]))) i--;
4698
4699	if (i < 0) {
4700		/* Invalid interface name, no alphabet */
4701		return (-1);
4702	}
4703
4704	(void) strncpy(devname, name, i + 1);
4705	devname[i + 1] = '\0';
4706
4707	if (strcmp("lo", devname) == 0) {
4708		/*
4709		 * loopback interface  not rpc/rdma capable
4710		 */
4711		return (-1);
4712	}
4713
4714	(void) strncpy(dev_path, "/dev/", MAXPATHLEN);
4715	if (strlcat(dev_path, devname, MAXPATHLEN) >= MAXPATHLEN) {
4716		/* string overflow */
4717		return (-1);
4718	}
4719
4720	ret = ldi_open_by_name(dev_path, FREAD|FWRITE, kcred, &lh, rpcib_li);
4721	if (ret != 0) {
4722		return (ret);
4723	}
4724	ret = rpcib_dl_info(lh, &info);
4725	(void) ldi_close(lh, FREAD|FWRITE, kcred);
4726	if (ret != 0) {
4727		return (ret);
4728	}
4729
4730	if (info.dl_mac_type != DL_IB) {
4731		return (-1);
4732	}
4733
4734	return (0);
4735}
4736
4737static int
4738rpcib_dl_info(ldi_handle_t lh, dl_info_ack_t *info)
4739{
4740	dl_info_req_t *info_req;
4741	union DL_primitives *dl_prim;
4742	mblk_t *mp;
4743	k_sigset_t smask;
4744	int error;
4745
4746	if ((mp = allocb(sizeof (dl_info_req_t), BPRI_MED)) == NULL) {
4747		return (ENOMEM);
4748	}
4749
4750	mp->b_datap->db_type = M_PROTO;
4751
4752	info_req = (dl_info_req_t *)(uintptr_t)mp->b_wptr;
4753	mp->b_wptr += sizeof (dl_info_req_t);
4754	info_req->dl_primitive = DL_INFO_REQ;
4755
4756	sigintr(&smask, 0);
4757	if ((error = ldi_putmsg(lh, mp)) != 0) {
4758		sigunintr(&smask);
4759		return (error);
4760	}
4761	if ((error = ldi_getmsg(lh, &mp, (timestruc_t *)NULL)) != 0) {
4762		sigunintr(&smask);
4763		return (error);
4764	}
4765	sigunintr(&smask);
4766
4767	dl_prim = (union DL_primitives *)(uintptr_t)mp->b_rptr;
4768	switch (dl_prim->dl_primitive) {
4769		case DL_INFO_ACK:
4770			if (((uintptr_t)mp->b_wptr - (uintptr_t)mp->b_rptr) <
4771			    sizeof (dl_info_ack_t)) {
4772			error = -1;
4773			} else {
4774				*info = *(dl_info_ack_t *)(uintptr_t)mp->b_rptr;
4775				error = 0;
4776			}
4777			break;
4778		default:
4779			error = -1;
4780			break;
4781	}
4782
4783	freemsg(mp);
4784	return (error);
4785}
4786static int
4787rpcib_do_ip_ioctl(int cmd, int len, caddr_t arg)
4788{
4789	vnode_t *kvp, *vp;
4790	TIUSER  *tiptr;
4791	struct  strioctl iocb;
4792	k_sigset_t smask;
4793	int	err = 0;
4794
4795	if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP,
4796	    &kvp) == 0) {
4797		if (t_kopen((file_t *)NULL, kvp->v_rdev, FREAD|FWRITE,
4798		    &tiptr, CRED()) == 0) {
4799		vp = tiptr->fp->f_vnode;
4800	} else {
4801		VN_RELE(kvp);
4802		return (EPROTO);
4803		}
4804	} else {
4805			return (EPROTO);
4806	}
4807
4808	iocb.ic_cmd = cmd;
4809	iocb.ic_timout = 0;
4810	iocb.ic_len = len;
4811	iocb.ic_dp = arg;
4812	sigintr(&smask, 0);
4813	err = kstr_ioctl(vp, I_STR, (intptr_t)&iocb);
4814	sigunintr(&smask);
4815	(void) t_kclose(tiptr, 0);
4816	VN_RELE(kvp);
4817	return (err);
4818}
4819
4820static uint_t rpcib_get_number_interfaces(void) {
4821uint_t	numifs;
4822	if (rpcib_do_ip_ioctl(SIOCGIFNUM, sizeof (uint_t), (caddr_t)&numifs)) {
4823		return (0);
4824	}
4825	return (numifs);
4826}
4827
4828static boolean_t
4829rpcib_get_ib_addresses(
4830	struct sockaddr_in *saddr4,
4831	struct sockaddr_in6 *saddr6,
4832	uint_t *number4,
4833	uint_t *number6)
4834{
4835	int	numifs;
4836	struct	ifconf	kifc;
4837	struct  ifreq *ifr;
4838	boolean_t ret = B_FALSE;
4839
4840	*number4 = 0;
4841	*number6 = 0;
4842
4843	if (rpcib_do_ip_ioctl(SIOCGIFNUM, sizeof (int), (caddr_t)&numifs)) {
4844		return (ret);
4845	}
4846
4847	kifc.ifc_len = numifs * sizeof (struct ifreq);
4848	kifc.ifc_buf = kmem_zalloc(kifc.ifc_len, KM_SLEEP);
4849
4850	if (rpcib_do_ip_ioctl(SIOCGIFCONF, sizeof (struct ifconf),
4851	    (caddr_t)&kifc)) {
4852		goto done;
4853	}
4854
4855	ifr = kifc.ifc_req;
4856	for (numifs = kifc.ifc_len / sizeof (struct ifreq);
4857	    numifs > 0; numifs--, ifr++) {
4858		struct sockaddr_in *sin4;
4859		struct sockaddr_in6 *sin6;
4860
4861		if ((rpcib_is_ib_interface(ifr->ifr_name) == 0)) {
4862			sin4 = (struct sockaddr_in *)(uintptr_t)&ifr->ifr_addr;
4863			sin6 = (struct sockaddr_in6 *)(uintptr_t)&ifr->ifr_addr;
4864			if (sin4->sin_family == AF_INET) {
4865				saddr4[*number4] = *(struct sockaddr_in *)
4866				    (uintptr_t)&ifr->ifr_addr;
4867				*number4 = *number4 + 1;
4868			} else if (sin6->sin6_family == AF_INET6) {
4869				saddr6[*number6] = *(struct sockaddr_in6 *)
4870				    (uintptr_t)&ifr->ifr_addr;
4871				*number6 = *number6 + 1;
4872			}
4873		}
4874	}
4875	ret = B_TRUE;
4876done:
4877	kmem_free(kifc.ifc_buf, kifc.ifc_len);
4878	return (ret);
4879}
4880
4881/* ARGSUSED */
4882static int rpcib_cache_kstat_update(kstat_t *ksp, int rw) {
4883
4884	if (KSTAT_WRITE == rw) {
4885		return (EACCES);
4886	}
4887	rpcib_kstat.cache_limit.value.ui64 =
4888	    (uint64_t)cache_limit;
4889	rpcib_kstat.cache_allocation.value.ui64 =
4890	    (uint64_t)cache_allocation;
4891	rpcib_kstat.cache_hits.value.ui64 =
4892	    (uint64_t)cache_hits;
4893	rpcib_kstat.cache_misses.value.ui64 =
4894	    (uint64_t)cache_misses;
4895	rpcib_kstat.cache_misses_above_the_limit.value.ui64 =
4896	    (uint64_t)cache_misses_above_the_limit;
4897	return (0);
4898}
4899