ib.c revision 12198:4db936bda957
19286SGarrett.Damore@Sun.COM/*
29286SGarrett.Damore@Sun.COM * CDDL HEADER START
39286SGarrett.Damore@Sun.COM *
49286SGarrett.Damore@Sun.COM * The contents of this file are subject to the terms of the
59286SGarrett.Damore@Sun.COM * Common Development and Distribution License (the "License").
69286SGarrett.Damore@Sun.COM * You may not use this file except in compliance with the License.
79286SGarrett.Damore@Sun.COM *
89286SGarrett.Damore@Sun.COM * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
99286SGarrett.Damore@Sun.COM * or http://www.opensolaris.org/os/licensing.
109286SGarrett.Damore@Sun.COM * See the License for the specific language governing permissions
119286SGarrett.Damore@Sun.COM * and limitations under the License.
129286SGarrett.Damore@Sun.COM *
139286SGarrett.Damore@Sun.COM * When distributing Covered Code, include this CDDL HEADER in each
149286SGarrett.Damore@Sun.COM * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
159286SGarrett.Damore@Sun.COM * If applicable, add the following below this CDDL HEADER, with the
169286SGarrett.Damore@Sun.COM * fields enclosed by brackets "[]" replaced with your own identifying
179286SGarrett.Damore@Sun.COM * information: Portions Copyright [yyyy] [name of copyright owner]
189286SGarrett.Damore@Sun.COM *
199286SGarrett.Damore@Sun.COM * CDDL HEADER END
209286SGarrett.Damore@Sun.COM */
219286SGarrett.Damore@Sun.COM/*
2211453Sgdamore@opensolaris.org * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
239286SGarrett.Damore@Sun.COM */
249286SGarrett.Damore@Sun.COM
259286SGarrett.Damore@Sun.COM/*
269286SGarrett.Damore@Sun.COM * Copyright (c) 2006 Oracle.  All rights reserved.
279286SGarrett.Damore@Sun.COM *
289286SGarrett.Damore@Sun.COM * This software is available to you under a choice of one of two
299286SGarrett.Damore@Sun.COM * licenses.  You may choose to be licensed under the terms of the GNU
309286SGarrett.Damore@Sun.COM * General Public License (GPL) Version 2, available from the file
319286SGarrett.Damore@Sun.COM * COPYING in the main directory of this source tree, or the
329286SGarrett.Damore@Sun.COM * OpenIB.org BSD license below:
339286SGarrett.Damore@Sun.COM *
349286SGarrett.Damore@Sun.COM *     Redistribution and use in source and binary forms, with or
359286SGarrett.Damore@Sun.COM *     without modification, are permitted provided that the following
369286SGarrett.Damore@Sun.COM *     conditions are met:
379286SGarrett.Damore@Sun.COM *
389286SGarrett.Damore@Sun.COM *      - Redistributions of source code must retain the above
399286SGarrett.Damore@Sun.COM *        copyright notice, this list of conditions and the following
409286SGarrett.Damore@Sun.COM *        disclaimer.
419286SGarrett.Damore@Sun.COM *
429286SGarrett.Damore@Sun.COM *      - Redistributions in binary form must reproduce the above
439286SGarrett.Damore@Sun.COM *        copyright notice, this list of conditions and the following
449286SGarrett.Damore@Sun.COM *        disclaimer in the documentation and/or other materials
459286SGarrett.Damore@Sun.COM *        provided with the distribution.
469286SGarrett.Damore@Sun.COM *
479286SGarrett.Damore@Sun.COM * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
489286SGarrett.Damore@Sun.COM * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
499286SGarrett.Damore@Sun.COM * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
509286SGarrett.Damore@Sun.COM * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
519286SGarrett.Damore@Sun.COM * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
529286SGarrett.Damore@Sun.COM * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
5311453Sgdamore@opensolaris.org * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
549286SGarrett.Damore@Sun.COM * SOFTWARE.
559286SGarrett.Damore@Sun.COM *
569286SGarrett.Damore@Sun.COM */
579286SGarrett.Damore@Sun.COM#include <sys/sysmacros.h>
589286SGarrett.Damore@Sun.COM#include <sys/rds.h>
599286SGarrett.Damore@Sun.COM
609286SGarrett.Damore@Sun.COM#include <sys/ib/ibtl/ibti.h>
619286SGarrett.Damore@Sun.COM#include <sys/ib/clients/rdsv3/rdsv3.h>
6211453Sgdamore@opensolaris.org#include <sys/ib/clients/rdsv3/ib.h>
639286SGarrett.Damore@Sun.COM#include <sys/ib/clients/rdsv3/rdsv3_debug.h>
649286SGarrett.Damore@Sun.COM
/*
 * Tunable initialised to the compile-time default.  Its consumers are not
 * in this file -- presumably the IB retry count used at connect time;
 * TODO confirm against the connection-manager code.
 */
unsigned int rdsv3_ib_retry_count = RDSV3_IB_DEFAULT_RETRY_COUNT;

/*
 * List of struct rdsv3_ib_device, linked through their `list' member.
 * Created in rdsv3_ib_init(); rdsv3_ib_add_one() appends an entry for
 * every device it accepts and rdsv3_ib_remove_one() unlinks it again.
 */
struct list	rdsv3_ib_devices;

/* NOTE: if also grabbing ibdev lock, grab this first */
kmutex_t ib_nodev_conns_lock;
/*
 * List of struct rdsv3_ib_connection linked through `ib_node'; created in
 * rdsv3_ib_init().  Presumably protected by ib_nodev_conns_lock above --
 * the users of the list are outside this file.
 */
list_t ib_nodev_conns;
729286SGarrett.Damore@Sun.COM
739286SGarrett.Damore@Sun.COMvoid
749286SGarrett.Damore@Sun.COMrdsv3_ib_add_one(ib_device_t *device)
759286SGarrett.Damore@Sun.COM{
769286SGarrett.Damore@Sun.COM	struct rdsv3_ib_device *rds_ibdev;
779286SGarrett.Damore@Sun.COM	ibt_hca_attr_t *dev_attr;
789286SGarrett.Damore@Sun.COM
799286SGarrett.Damore@Sun.COM	RDSV3_DPRINTF4("rdsv3_ib_add_one", "device: %p", device);
809286SGarrett.Damore@Sun.COM
819286SGarrett.Damore@Sun.COM	/* Only handle IB (no iWARP) devices */
829286SGarrett.Damore@Sun.COM	if (device->node_type != RDMA_NODE_IB_CA)
839286SGarrett.Damore@Sun.COM		return;
849286SGarrett.Damore@Sun.COM
859286SGarrett.Damore@Sun.COM	dev_attr = (ibt_hca_attr_t *)kmem_alloc(sizeof (*dev_attr),
869286SGarrett.Damore@Sun.COM	    KM_NOSLEEP);
879286SGarrett.Damore@Sun.COM	if (!dev_attr)
889286SGarrett.Damore@Sun.COM		return;
89
90	if (ibt_query_hca(ib_get_ibt_hca_hdl(device), dev_attr)) {
91		RDSV3_DPRINTF5("rdsv3_ib_add_one",
92		    "Query device failed for %s", device->name);
93		goto free_attr;
94	}
95
96	/* We depend on Reserved Lkey */
97	if (!(dev_attr->hca_flags2 & IBT_HCA2_RES_LKEY)) {
98		RDSV3_DPRINTF5("rdsv3_ib_add_one",
99		    "Reserved Lkey support is required: %s",
100		    device->name);
101		goto free_attr;
102	}
103
104	rds_ibdev = kmem_zalloc(sizeof (*rds_ibdev), KM_NOSLEEP);
105	if (!rds_ibdev)
106		goto free_attr;
107
108	mutex_init(&rds_ibdev->spinlock, NULL, MUTEX_DRIVER, NULL);
109
110	rds_ibdev->max_wrs = dev_attr->hca_max_chan_sz;
111	rds_ibdev->max_sge = min(dev_attr->hca_max_sgl, RDSV3_IB_MAX_SGE);
112
113	rds_ibdev->dev = device;
114	rds_ibdev->pd = ib_alloc_pd(device);
115	if (IS_ERR(rds_ibdev->pd))
116		goto free_dev;
117
118	if (rdsv3_ib_create_mr_pool(rds_ibdev) != 0) {
119		goto free_dev;
120	}
121
122	list_create(&rds_ibdev->ipaddr_list, sizeof (struct rdsv3_ib_ipaddr),
123	    offsetof(struct rdsv3_ib_ipaddr, list));
124	list_create(&rds_ibdev->conn_list, sizeof (struct rdsv3_ib_connection),
125	    offsetof(struct rdsv3_ib_connection, ib_node));
126
127	list_insert_tail(&rdsv3_ib_devices, rds_ibdev);
128
129	ib_set_client_data(device, &rdsv3_ib_client, rds_ibdev);
130
131	RDSV3_DPRINTF4("rdsv3_ib_add_one", "Return: device: %p", device);
132
133	goto free_attr;
134
135err_pd:
136	(void) ib_dealloc_pd(rds_ibdev->pd);
137free_dev:
138	kmem_free(rds_ibdev, sizeof (*rds_ibdev));
139free_attr:
140	kmem_free(dev_attr, sizeof (*dev_attr));
141}
142
143void
144rdsv3_ib_remove_one(struct ib_device *device)
145{
146	struct rdsv3_ib_device *rds_ibdev;
147	struct rdsv3_ib_ipaddr *i_ipaddr, *i_next;
148
149	RDSV3_DPRINTF4("rdsv3_ib_remove_one", "device: %p", device);
150
151	rds_ibdev = ib_get_client_data(device, &rdsv3_ib_client);
152	if (!rds_ibdev)
153		return;
154
155	RDSV3_FOR_EACH_LIST_NODE_SAFE(i_ipaddr, i_next, &rds_ibdev->ipaddr_list,
156	    list) {
157		list_remove_node(&i_ipaddr->list);
158		kmem_free(i_ipaddr, sizeof (*i_ipaddr));
159	}
160
161	rdsv3_ib_destroy_conns(rds_ibdev);
162
163	rdsv3_ib_destroy_mr_pool(rds_ibdev);
164
165#if 0
166	while (ib_dealloc_pd(rds_ibdev->pd)) {
167#ifndef __lock_lint
168		RDSV3_DPRINTF5("rdsv3_ib_remove_one",
169		    "%s-%d Failed to dealloc pd %p",
170		    __func__, __LINE__, rds_ibdev->pd);
171#endif
172		delay(drv_usectohz(1000));
173	}
174#else
175	if (ib_dealloc_pd(rds_ibdev->pd)) {
176#ifndef __lock_lint
177		RDSV3_DPRINTF2("rdsv3_ib_remove_one",
178		    "%s-%d Failed to dealloc pd %p",
179		    __func__, __LINE__, rds_ibdev->pd);
180#endif
181	}
182#endif
183
184	list_destroy(&rds_ibdev->ipaddr_list);
185	list_destroy(&rds_ibdev->conn_list);
186	list_remove_node(&rds_ibdev->list);
187	kmem_free(rds_ibdev, sizeof (*rds_ibdev));
188
189	RDSV3_DPRINTF4("rdsv3_ib_remove_one", "Return: device: %p", device);
190}
191
/*
 * IB client descriptor for RDSv3.  rdsv3_ib_init() fills in the `dip'
 * member and passes this to ib_register_client(); the add/remove callbacks
 * then manage the per-device state.
 */
#ifndef __lock_lint
struct ib_client rdsv3_ib_client = {
	.name		= "rdsv3_ib",
	.add		= rdsv3_ib_add_one,
	.remove		= rdsv3_ib_remove_one,
	.clnt_hdl	= NULL,
	.state		= IB_CLNT_UNINITIALIZED
};
#else
/*
 * Positional form of the same initialiser, used when __lock_lint is
 * defined.  The extra NULL presumably fills the `dip' slot that the
 * designated form leaves implicitly zero -- verify against the
 * struct ib_client declaration.
 */
struct ib_client rdsv3_ib_client = {
	"rdsv3_ib",
	rdsv3_ib_add_one,
	rdsv3_ib_remove_one,
	NULL,
	NULL,
	IB_CLNT_UNINITIALIZED
};
#endif
210
211static int
212rds_ib_conn_info_visitor(struct rdsv3_connection *conn,
213    void *buffer)
214{
215	struct rdsv3_info_rdma_connection *iinfo = buffer;
216	struct rdsv3_ib_connection *ic;
217
218	RDSV3_DPRINTF4("rds_ib_conn_info_visitor", "conn: %p buffer: %p",
219	    conn, buffer);
220
221	/* We will only ever look at IB transports */
222	if (conn->c_trans != &rdsv3_ib_transport)
223		return (0);
224
225	iinfo->src_addr = conn->c_laddr;
226	iinfo->dst_addr = conn->c_faddr;
227
228	(void) memset(&iinfo->src_gid, 0, sizeof (iinfo->src_gid));
229	(void) memset(&iinfo->dst_gid, 0, sizeof (iinfo->dst_gid));
230	if (rdsv3_conn_state(conn) == RDSV3_CONN_UP) {
231		struct rdsv3_ib_device *rds_ibdev;
232		struct rdma_dev_addr *dev_addr;
233
234		ic = conn->c_transport_data;
235		dev_addr = &ic->i_cm_id->route.addr.dev_addr;
236
237		ib_addr_get_sgid(dev_addr, (union ib_gid *)&iinfo->src_gid);
238		ib_addr_get_dgid(dev_addr, (union ib_gid *)&iinfo->dst_gid);
239
240		rds_ibdev = ib_get_client_data(ic->i_cm_id->device,
241		    &rdsv3_ib_client);
242		iinfo->max_send_wr = ic->i_send_ring.w_nr;
243		iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
244		iinfo->max_send_sge = rds_ibdev->max_sge;
245	}
246
247	RDSV3_DPRINTF4("rds_ib_conn_info_visitor", "conn: %p buffer: %p",
248	    conn, buffer);
249	return (1);
250}
251
252static void
253rds_ib_ic_info(struct rsock *sock, unsigned int len,
254    struct rdsv3_info_iterator *iter,
255    struct rdsv3_info_lengths *lens)
256{
257	RDSV3_DPRINTF4("rds_ib_ic_info", "sk: %p iter: %p, lens: %p, len: %d",
258	    sock, iter, lens, len);
259
260	rdsv3_for_each_conn_info(sock, len, iter, lens,
261	    rds_ib_conn_info_visitor,
262	    sizeof (struct rdsv3_info_rdma_connection));
263}
264
265/*
266 * Early RDS/IB was built to only bind to an address if there is an IPoIB
267 * device with that address set.
268 *
269 * If it were me, I'd advocate for something more flexible.  Sending and
270 * receiving should be device-agnostic.  Transports would try and maintain
271 * connections between peers who have messages queued.  Userspace would be
272 * allowed to influence which paths have priority.  We could call userspace
273 * asserting this policy "routing".
274 */
275static int
276rds_ib_laddr_check(uint32_be_t addr)
277{
278	int ret;
279	struct rdma_cm_id *cm_id;
280	struct sockaddr_in sin;
281
282	RDSV3_DPRINTF4("rds_ib_laddr_check", "addr: %x", ntohl(addr));
283
284	/*
285	 * Create a CMA ID and try to bind it. This catches both
286	 * IB and iWARP capable NICs.
287	 */
288	cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP);
289	if (IS_ERR(cm_id))
290		return (PTR_ERR(cm_id));
291
292	(void) memset(&sin, 0, sizeof (sin));
293	sin.sin_family = AF_INET;
294	sin.sin_addr.s_addr = rdsv3_scaddr_to_ibaddr(addr);
295
296	/* rdma_bind_addr will only succeed for IB & iWARP devices */
297	ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
298	/*
299	 * due to this, we will claim to support iWARP devices unless we
300	 * check node_type.
301	 */
302	if (ret || cm_id->device->node_type != RDMA_NODE_IB_CA)
303		ret = -EADDRNOTAVAIL;
304
305	RDSV3_DPRINTF5("rds_ib_laddr_check",
306	    "addr %u.%u.%u.%u ret %d node type %d",
307	    NIPQUAD(addr), ret,
308	    cm_id->device ? cm_id->device->node_type : -1);
309
310	rdma_destroy_id(cm_id);
311
312	return (ret);
313}
314
/*
 * rdsv3_ib_exit - tear down the IB transport.
 *
 * Undoes what rdsv3_ib_init() set up: removes the connection-info handler,
 * destroys the connections that have no device, unregisters the IB client,
 * shuts down the sysctl and receive subsystems, unregisters the transport
 * and finally destroys the global lock and lists.  Also installed as the
 * `exit' hook of rdsv3_ib_transport.
 */
void
rdsv3_ib_exit(void)
{
	RDSV3_DPRINTF4("rds_ib_exit", "Enter");

	rdsv3_info_deregister_func(RDSV3_INFO_IB_CONNECTIONS, rds_ib_ic_info);
	rdsv3_ib_destroy_nodev_conns();
	ib_unregister_client(&rdsv3_ib_client);
	rdsv3_ib_sysctl_exit();
	rdsv3_ib_recv_exit();
	rdsv3_trans_unregister(&rdsv3_ib_transport);
	/* Globals created at the top of rdsv3_ib_init(). */
	mutex_destroy(&ib_nodev_conns_lock);
	list_destroy(&ib_nodev_conns);
	list_destroy(&rdsv3_ib_devices);

	RDSV3_DPRINTF4("rds_ib_exit", "Return");
}
332
/*
 * Operations vector of the InfiniBand transport ("infiniband").
 * rdsv3_ib_init() hands it to rdsv3_trans_register(), and
 * rds_ib_conn_info_visitor() compares conn->c_trans against its address
 * to recognise IB connections.  No congestion-map transmit hook is
 * provided (xmit_cong_map is NULL).
 */
#ifndef __lock_lint
struct rdsv3_transport rdsv3_ib_transport = {
	.laddr_check		= rds_ib_laddr_check,
	.xmit_complete		= rdsv3_ib_xmit_complete,
	.xmit			= rdsv3_ib_xmit,
	.xmit_cong_map		= NULL,
	.xmit_rdma		= rdsv3_ib_xmit_rdma,
	.recv			= rdsv3_ib_recv,
	.conn_alloc		= rdsv3_ib_conn_alloc,
	.conn_free		= rdsv3_ib_conn_free,
	.conn_connect		= rdsv3_ib_conn_connect,
	.conn_shutdown		= rdsv3_ib_conn_shutdown,
	.inc_copy_to_user	= rdsv3_ib_inc_copy_to_user,
	.inc_purge		= rdsv3_ib_inc_purge,
	.inc_free		= rdsv3_ib_inc_free,
	.cm_initiate_connect	= rdsv3_ib_cm_initiate_connect,
	.cm_handle_connect	= rdsv3_ib_cm_handle_connect,
	.cm_connect_complete	= rdsv3_ib_cm_connect_complete,
	.stats_info_copy	= rdsv3_ib_stats_info_copy,
	.exit			= rdsv3_ib_exit,
	.get_mr			= rdsv3_ib_get_mr,
	.sync_mr		= rdsv3_ib_sync_mr,
	.free_mr		= rdsv3_ib_free_mr,
	.flush_mrs		= rdsv3_ib_flush_mrs,
	.t_name			= "infiniband",
};
#else
/* With __lock_lint defined only an uninitialised definition is emitted. */
struct rdsv3_transport rdsv3_ib_transport;
#endif
362
363int
364rdsv3_ib_init(void)
365{
366	int ret;
367
368	RDSV3_DPRINTF4("rds_ib_init", "Enter");
369
370	list_create(&rdsv3_ib_devices, sizeof (struct rdsv3_ib_device),
371	    offsetof(struct rdsv3_ib_device, list));
372	list_create(&ib_nodev_conns, sizeof (struct rdsv3_ib_connection),
373	    offsetof(struct rdsv3_ib_connection, ib_node));
374	mutex_init(&ib_nodev_conns_lock, NULL, MUTEX_DRIVER, NULL);
375
376	rdsv3_ib_client.dip = rdsv3_dev_info;
377	ret = ib_register_client(&rdsv3_ib_client);
378	if (ret)
379		goto out;
380
381	ret = rdsv3_ib_sysctl_init();
382	if (ret)
383		goto out_ibreg;
384
385	ret = rdsv3_ib_recv_init();
386	if (ret)
387		goto out_sysctl;
388
389	ret = rdsv3_trans_register(&rdsv3_ib_transport);
390	if (ret)
391		goto out_recv;
392
393	rdsv3_info_register_func(RDSV3_INFO_IB_CONNECTIONS, rds_ib_ic_info);
394
395	RDSV3_DPRINTF4("rds_ib_init", "Return");
396
397	return (0);
398
399out_recv:
400	rdsv3_ib_recv_exit();
401out_sysctl:
402	rdsv3_ib_sysctl_exit();
403out_ibreg:
404	ib_unregister_client(&rdsv3_ib_client);
405out:
406	mutex_destroy(&ib_nodev_conns_lock);
407	list_destroy(&ib_nodev_conns);
408	list_destroy(&rdsv3_ib_devices);
409	return (ret);
410}
411