ib.c revision 12198:4db936bda957
19286SGarrett.Damore@Sun.COM/* 29286SGarrett.Damore@Sun.COM * CDDL HEADER START 39286SGarrett.Damore@Sun.COM * 49286SGarrett.Damore@Sun.COM * The contents of this file are subject to the terms of the 59286SGarrett.Damore@Sun.COM * Common Development and Distribution License (the "License"). 69286SGarrett.Damore@Sun.COM * You may not use this file except in compliance with the License. 79286SGarrett.Damore@Sun.COM * 89286SGarrett.Damore@Sun.COM * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 99286SGarrett.Damore@Sun.COM * or http://www.opensolaris.org/os/licensing. 109286SGarrett.Damore@Sun.COM * See the License for the specific language governing permissions 119286SGarrett.Damore@Sun.COM * and limitations under the License. 129286SGarrett.Damore@Sun.COM * 139286SGarrett.Damore@Sun.COM * When distributing Covered Code, include this CDDL HEADER in each 149286SGarrett.Damore@Sun.COM * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 159286SGarrett.Damore@Sun.COM * If applicable, add the following below this CDDL HEADER, with the 169286SGarrett.Damore@Sun.COM * fields enclosed by brackets "[]" replaced with your own identifying 179286SGarrett.Damore@Sun.COM * information: Portions Copyright [yyyy] [name of copyright owner] 189286SGarrett.Damore@Sun.COM * 199286SGarrett.Damore@Sun.COM * CDDL HEADER END 209286SGarrett.Damore@Sun.COM */ 219286SGarrett.Damore@Sun.COM/* 2211453Sgdamore@opensolaris.org * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 239286SGarrett.Damore@Sun.COM */ 249286SGarrett.Damore@Sun.COM 259286SGarrett.Damore@Sun.COM/* 269286SGarrett.Damore@Sun.COM * Copyright (c) 2006 Oracle. All rights reserved. 279286SGarrett.Damore@Sun.COM * 289286SGarrett.Damore@Sun.COM * This software is available to you under a choice of one of two 299286SGarrett.Damore@Sun.COM * licenses. You may choose to be licensed under the terms of the GNU 309286SGarrett.Damore@Sun.COM * General Public License (GPL) Version 2, available from the file 319286SGarrett.Damore@Sun.COM * COPYING in the main directory of this source tree, or the 329286SGarrett.Damore@Sun.COM * OpenIB.org BSD license below: 339286SGarrett.Damore@Sun.COM * 349286SGarrett.Damore@Sun.COM * Redistribution and use in source and binary forms, with or 359286SGarrett.Damore@Sun.COM * without modification, are permitted provided that the following 369286SGarrett.Damore@Sun.COM * conditions are met: 379286SGarrett.Damore@Sun.COM * 389286SGarrett.Damore@Sun.COM * - Redistributions of source code must retain the above 399286SGarrett.Damore@Sun.COM * copyright notice, this list of conditions and the following 409286SGarrett.Damore@Sun.COM * disclaimer. 419286SGarrett.Damore@Sun.COM * 429286SGarrett.Damore@Sun.COM * - Redistributions in binary form must reproduce the above 439286SGarrett.Damore@Sun.COM * copyright notice, this list of conditions and the following 449286SGarrett.Damore@Sun.COM * disclaimer in the documentation and/or other materials 459286SGarrett.Damore@Sun.COM * provided with the distribution. 469286SGarrett.Damore@Sun.COM * 479286SGarrett.Damore@Sun.COM * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 489286SGarrett.Damore@Sun.COM * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 499286SGarrett.Damore@Sun.COM * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 509286SGarrett.Damore@Sun.COM * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 519286SGarrett.Damore@Sun.COM * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 529286SGarrett.Damore@Sun.COM * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 5311453Sgdamore@opensolaris.org * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 549286SGarrett.Damore@Sun.COM * SOFTWARE. 559286SGarrett.Damore@Sun.COM * 569286SGarrett.Damore@Sun.COM */ 579286SGarrett.Damore@Sun.COM#include <sys/sysmacros.h> 589286SGarrett.Damore@Sun.COM#include <sys/rds.h> 599286SGarrett.Damore@Sun.COM 609286SGarrett.Damore@Sun.COM#include <sys/ib/ibtl/ibti.h> 619286SGarrett.Damore@Sun.COM#include <sys/ib/clients/rdsv3/rdsv3.h> 6211453Sgdamore@opensolaris.org#include <sys/ib/clients/rdsv3/ib.h> 639286SGarrett.Damore@Sun.COM#include <sys/ib/clients/rdsv3/rdsv3_debug.h> 649286SGarrett.Damore@Sun.COM 659286SGarrett.Damore@Sun.COMunsigned int rdsv3_ib_retry_count = RDSV3_IB_DEFAULT_RETRY_COUNT; 669286SGarrett.Damore@Sun.COM 679286SGarrett.Damore@Sun.COMstruct list rdsv3_ib_devices; 689286SGarrett.Damore@Sun.COM 699286SGarrett.Damore@Sun.COM/* NOTE: if also grabbing ibdev lock, grab this first */ 709286SGarrett.Damore@Sun.COMkmutex_t ib_nodev_conns_lock; 719286SGarrett.Damore@Sun.COMlist_t ib_nodev_conns; 729286SGarrett.Damore@Sun.COM 739286SGarrett.Damore@Sun.COMvoid 749286SGarrett.Damore@Sun.COMrdsv3_ib_add_one(ib_device_t *device) 759286SGarrett.Damore@Sun.COM{ 769286SGarrett.Damore@Sun.COM struct rdsv3_ib_device *rds_ibdev; 779286SGarrett.Damore@Sun.COM ibt_hca_attr_t *dev_attr; 789286SGarrett.Damore@Sun.COM 799286SGarrett.Damore@Sun.COM RDSV3_DPRINTF4("rdsv3_ib_add_one", "device: %p", device); 809286SGarrett.Damore@Sun.COM 819286SGarrett.Damore@Sun.COM /* Only handle IB (no iWARP) devices */ 829286SGarrett.Damore@Sun.COM if (device->node_type != RDMA_NODE_IB_CA) 839286SGarrett.Damore@Sun.COM return; 849286SGarrett.Damore@Sun.COM 859286SGarrett.Damore@Sun.COM dev_attr = (ibt_hca_attr_t *)kmem_alloc(sizeof (*dev_attr), 869286SGarrett.Damore@Sun.COM KM_NOSLEEP); 879286SGarrett.Damore@Sun.COM if (!dev_attr) 889286SGarrett.Damore@Sun.COM return; 89 90 if (ibt_query_hca(ib_get_ibt_hca_hdl(device), dev_attr)) { 91 RDSV3_DPRINTF5("rdsv3_ib_add_one", 92 "Query device failed for %s", device->name); 93 goto free_attr; 94 } 95 96 /* We depend on Reserved Lkey */ 97 if (!(dev_attr->hca_flags2 & IBT_HCA2_RES_LKEY)) { 98 RDSV3_DPRINTF5("rdsv3_ib_add_one", 99 "Reserved Lkey support is required: %s", 100 device->name); 101 goto free_attr; 102 } 103 104 rds_ibdev = kmem_zalloc(sizeof (*rds_ibdev), KM_NOSLEEP); 105 if (!rds_ibdev) 106 goto free_attr; 107 108 mutex_init(&rds_ibdev->spinlock, NULL, MUTEX_DRIVER, NULL); 109 110 rds_ibdev->max_wrs = dev_attr->hca_max_chan_sz; 111 rds_ibdev->max_sge = min(dev_attr->hca_max_sgl, RDSV3_IB_MAX_SGE); 112 113 rds_ibdev->dev = device; 114 rds_ibdev->pd = ib_alloc_pd(device); 115 if (IS_ERR(rds_ibdev->pd)) 116 goto free_dev; 117 118 if (rdsv3_ib_create_mr_pool(rds_ibdev) != 0) { 119 goto free_dev; 120 } 121 122 list_create(&rds_ibdev->ipaddr_list, sizeof (struct rdsv3_ib_ipaddr), 123 offsetof(struct rdsv3_ib_ipaddr, list)); 124 list_create(&rds_ibdev->conn_list, sizeof (struct rdsv3_ib_connection), 125 offsetof(struct rdsv3_ib_connection, ib_node)); 126 127 list_insert_tail(&rdsv3_ib_devices, rds_ibdev); 128 129 ib_set_client_data(device, &rdsv3_ib_client, rds_ibdev); 130 131 RDSV3_DPRINTF4("rdsv3_ib_add_one", "Return: device: %p", device); 132 133 goto free_attr; 134 135err_pd: 136 (void) ib_dealloc_pd(rds_ibdev->pd); 137free_dev: 138 kmem_free(rds_ibdev, sizeof (*rds_ibdev)); 139free_attr: 140 kmem_free(dev_attr, sizeof (*dev_attr)); 141} 142 143void 144rdsv3_ib_remove_one(struct ib_device *device) 145{ 146 struct rdsv3_ib_device *rds_ibdev; 147 struct rdsv3_ib_ipaddr *i_ipaddr, *i_next; 148 149 RDSV3_DPRINTF4("rdsv3_ib_remove_one", "device: %p", device); 150 151 rds_ibdev = ib_get_client_data(device, &rdsv3_ib_client); 152 if (!rds_ibdev) 153 return; 154 155 RDSV3_FOR_EACH_LIST_NODE_SAFE(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, 156 list) { 157 list_remove_node(&i_ipaddr->list); 158 kmem_free(i_ipaddr, sizeof (*i_ipaddr)); 159 } 160 161 rdsv3_ib_destroy_conns(rds_ibdev); 162 163 rdsv3_ib_destroy_mr_pool(rds_ibdev); 164 165#if 0 166 while (ib_dealloc_pd(rds_ibdev->pd)) { 167#ifndef __lock_lint 168 RDSV3_DPRINTF5("rdsv3_ib_remove_one", 169 "%s-%d Failed to dealloc pd %p", 170 __func__, __LINE__, rds_ibdev->pd); 171#endif 172 delay(drv_usectohz(1000)); 173 } 174#else 175 if (ib_dealloc_pd(rds_ibdev->pd)) { 176#ifndef __lock_lint 177 RDSV3_DPRINTF2("rdsv3_ib_remove_one", 178 "%s-%d Failed to dealloc pd %p", 179 __func__, __LINE__, rds_ibdev->pd); 180#endif 181 } 182#endif 183 184 list_destroy(&rds_ibdev->ipaddr_list); 185 list_destroy(&rds_ibdev->conn_list); 186 list_remove_node(&rds_ibdev->list); 187 kmem_free(rds_ibdev, sizeof (*rds_ibdev)); 188 189 RDSV3_DPRINTF4("rdsv3_ib_remove_one", "Return: device: %p", device); 190} 191 192#ifndef __lock_lint 193struct ib_client rdsv3_ib_client = { 194 .name = "rdsv3_ib", 195 .add = rdsv3_ib_add_one, 196 .remove = rdsv3_ib_remove_one, 197 .clnt_hdl = NULL, 198 .state = IB_CLNT_UNINITIALIZED 199}; 200#else 201struct ib_client rdsv3_ib_client = { 202 "rdsv3_ib", 203 rdsv3_ib_add_one, 204 rdsv3_ib_remove_one, 205 NULL, 206 NULL, 207 IB_CLNT_UNINITIALIZED 208}; 209#endif 210 211static int 212rds_ib_conn_info_visitor(struct rdsv3_connection *conn, 213 void *buffer) 214{ 215 struct rdsv3_info_rdma_connection *iinfo = buffer; 216 struct rdsv3_ib_connection *ic; 217 218 RDSV3_DPRINTF4("rds_ib_conn_info_visitor", "conn: %p buffer: %p", 219 conn, buffer); 220 221 /* We will only ever look at IB transports */ 222 if (conn->c_trans != &rdsv3_ib_transport) 223 return (0); 224 225 iinfo->src_addr = conn->c_laddr; 226 iinfo->dst_addr = conn->c_faddr; 227 228 (void) memset(&iinfo->src_gid, 0, sizeof (iinfo->src_gid)); 229 (void) memset(&iinfo->dst_gid, 0, sizeof (iinfo->dst_gid)); 230 if (rdsv3_conn_state(conn) == RDSV3_CONN_UP) { 231 struct rdsv3_ib_device *rds_ibdev; 232 struct rdma_dev_addr *dev_addr; 233 234 ic = conn->c_transport_data; 235 dev_addr = &ic->i_cm_id->route.addr.dev_addr; 236 237 ib_addr_get_sgid(dev_addr, (union ib_gid *)&iinfo->src_gid); 238 ib_addr_get_dgid(dev_addr, (union ib_gid *)&iinfo->dst_gid); 239 240 rds_ibdev = ib_get_client_data(ic->i_cm_id->device, 241 &rdsv3_ib_client); 242 iinfo->max_send_wr = ic->i_send_ring.w_nr; 243 iinfo->max_recv_wr = ic->i_recv_ring.w_nr; 244 iinfo->max_send_sge = rds_ibdev->max_sge; 245 } 246 247 RDSV3_DPRINTF4("rds_ib_conn_info_visitor", "conn: %p buffer: %p", 248 conn, buffer); 249 return (1); 250} 251 252static void 253rds_ib_ic_info(struct rsock *sock, unsigned int len, 254 struct rdsv3_info_iterator *iter, 255 struct rdsv3_info_lengths *lens) 256{ 257 RDSV3_DPRINTF4("rds_ib_ic_info", "sk: %p iter: %p, lens: %p, len: %d", 258 sock, iter, lens, len); 259 260 rdsv3_for_each_conn_info(sock, len, iter, lens, 261 rds_ib_conn_info_visitor, 262 sizeof (struct rdsv3_info_rdma_connection)); 263} 264 265/* 266 * Early RDS/IB was built to only bind to an address if there is an IPoIB 267 * device with that address set. 268 * 269 * If it were me, I'd advocate for something more flexible. Sending and 270 * receiving should be device-agnostic. Transports would try and maintain 271 * connections between peers who have messages queued. Userspace would be 272 * allowed to influence which paths have priority. We could call userspace 273 * asserting this policy "routing". 274 */ 275static int 276rds_ib_laddr_check(uint32_be_t addr) 277{ 278 int ret; 279 struct rdma_cm_id *cm_id; 280 struct sockaddr_in sin; 281 282 RDSV3_DPRINTF4("rds_ib_laddr_check", "addr: %x", ntohl(addr)); 283 284 /* 285 * Create a CMA ID and try to bind it. This catches both 286 * IB and iWARP capable NICs. 287 */ 288 cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP); 289 if (IS_ERR(cm_id)) 290 return (PTR_ERR(cm_id)); 291 292 (void) memset(&sin, 0, sizeof (sin)); 293 sin.sin_family = AF_INET; 294 sin.sin_addr.s_addr = rdsv3_scaddr_to_ibaddr(addr); 295 296 /* rdma_bind_addr will only succeed for IB & iWARP devices */ 297 ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin); 298 /* 299 * due to this, we will claim to support iWARP devices unless we 300 * check node_type. 301 */ 302 if (ret || cm_id->device->node_type != RDMA_NODE_IB_CA) 303 ret = -EADDRNOTAVAIL; 304 305 RDSV3_DPRINTF5("rds_ib_laddr_check", 306 "addr %u.%u.%u.%u ret %d node type %d", 307 NIPQUAD(addr), ret, 308 cm_id->device ? cm_id->device->node_type : -1); 309 310 rdma_destroy_id(cm_id); 311 312 return (ret); 313} 314 315void 316rdsv3_ib_exit(void) 317{ 318 RDSV3_DPRINTF4("rds_ib_exit", "Enter"); 319 320 rdsv3_info_deregister_func(RDSV3_INFO_IB_CONNECTIONS, rds_ib_ic_info); 321 rdsv3_ib_destroy_nodev_conns(); 322 ib_unregister_client(&rdsv3_ib_client); 323 rdsv3_ib_sysctl_exit(); 324 rdsv3_ib_recv_exit(); 325 rdsv3_trans_unregister(&rdsv3_ib_transport); 326 mutex_destroy(&ib_nodev_conns_lock); 327 list_destroy(&ib_nodev_conns); 328 list_destroy(&rdsv3_ib_devices); 329 330 RDSV3_DPRINTF4("rds_ib_exit", "Return"); 331} 332 333#ifndef __lock_lint 334struct rdsv3_transport rdsv3_ib_transport = { 335 .laddr_check = rds_ib_laddr_check, 336 .xmit_complete = rdsv3_ib_xmit_complete, 337 .xmit = rdsv3_ib_xmit, 338 .xmit_cong_map = NULL, 339 .xmit_rdma = rdsv3_ib_xmit_rdma, 340 .recv = rdsv3_ib_recv, 341 .conn_alloc = rdsv3_ib_conn_alloc, 342 .conn_free = rdsv3_ib_conn_free, 343 .conn_connect = rdsv3_ib_conn_connect, 344 .conn_shutdown = rdsv3_ib_conn_shutdown, 345 .inc_copy_to_user = rdsv3_ib_inc_copy_to_user, 346 .inc_purge = rdsv3_ib_inc_purge, 347 .inc_free = rdsv3_ib_inc_free, 348 .cm_initiate_connect = rdsv3_ib_cm_initiate_connect, 349 .cm_handle_connect = rdsv3_ib_cm_handle_connect, 350 .cm_connect_complete = rdsv3_ib_cm_connect_complete, 351 .stats_info_copy = rdsv3_ib_stats_info_copy, 352 .exit = rdsv3_ib_exit, 353 .get_mr = rdsv3_ib_get_mr, 354 .sync_mr = rdsv3_ib_sync_mr, 355 .free_mr = rdsv3_ib_free_mr, 356 .flush_mrs = rdsv3_ib_flush_mrs, 357 .t_name = "infiniband", 358}; 359#else 360struct rdsv3_transport rdsv3_ib_transport; 361#endif 362 363int 364rdsv3_ib_init(void) 365{ 366 int ret; 367 368 RDSV3_DPRINTF4("rds_ib_init", "Enter"); 369 370 list_create(&rdsv3_ib_devices, sizeof (struct rdsv3_ib_device), 371 offsetof(struct rdsv3_ib_device, list)); 372 list_create(&ib_nodev_conns, sizeof (struct rdsv3_ib_connection), 373 offsetof(struct rdsv3_ib_connection, ib_node)); 374 mutex_init(&ib_nodev_conns_lock, NULL, MUTEX_DRIVER, NULL); 375 376 rdsv3_ib_client.dip = rdsv3_dev_info; 377 ret = ib_register_client(&rdsv3_ib_client); 378 if (ret) 379 goto out; 380 381 ret = rdsv3_ib_sysctl_init(); 382 if (ret) 383 goto out_ibreg; 384 385 ret = rdsv3_ib_recv_init(); 386 if (ret) 387 goto out_sysctl; 388 389 ret = rdsv3_trans_register(&rdsv3_ib_transport); 390 if (ret) 391 goto out_recv; 392 393 rdsv3_info_register_func(RDSV3_INFO_IB_CONNECTIONS, rds_ib_ic_info); 394 395 RDSV3_DPRINTF4("rds_ib_init", "Return"); 396 397 return (0); 398 399out_recv: 400 rdsv3_ib_recv_exit(); 401out_sysctl: 402 rdsv3_ib_sysctl_exit(); 403out_ibreg: 404 ib_unregister_client(&rdsv3_ib_client); 405out: 406 mutex_destroy(&ib_nodev_conns_lock); 407 list_destroy(&ib_nodev_conns); 408 list_destroy(&rdsv3_ib_devices); 409 return (ret); 410} 411