1321936Shselasky/* 2321936Shselasky * Copyright (c) 2010-2012 Intel Corporation. All rights reserved. 3321936Shselasky * 4321936Shselasky * This software is available to you under a choice of one of two 5321936Shselasky * licenses. You may choose to be licensed under the terms of the GNU 6321936Shselasky * General Public License (GPL) Version 2, available from the file 7321936Shselasky * COPYING in the main directory of this source tree, or the 8321936Shselasky * OpenIB.org BSD license below: 9321936Shselasky * 10321936Shselasky * Redistribution and use in source and binary forms, with or 11321936Shselasky * without modification, are permitted provided that the following 12321936Shselasky * conditions are met: 13321936Shselasky * 14321936Shselasky * - Redistributions of source code must retain the above 15321936Shselasky * copyright notice, this list of conditions and the following 16321936Shselasky * disclaimer. 17321936Shselasky * 18321936Shselasky * - Redistributions in binary form must reproduce the above 19321936Shselasky * copyright notice, this list of conditions and the following 20321936Shselasky * disclaimer in the documentation and/or other materials 21321936Shselasky * provided with the distribution. 22321936Shselasky * 23321936Shselasky * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 24321936Shselasky * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 25321936Shselasky * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 26321936Shselasky * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 27321936Shselasky * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 28321936Shselasky * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 29321936Shselasky * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 30321936Shselasky * SOFTWARE. 31321936Shselasky */ 32321936Shselasky 33321936Shselasky#include <config.h> 34321936Shselasky 35321936Shselasky#include <stdio.h> 36321936Shselasky#include <inttypes.h> 37321936Shselasky#include <sys/types.h> 38321936Shselasky#include <sys/socket.h> 39321936Shselasky#include <netdb.h> 40321936Shselasky#include <unistd.h> 41321936Shselasky 42321936Shselasky#include "cma.h" 43321936Shselasky#include <rdma/rdma_cma.h> 44321936Shselasky#include <infiniband/ib.h> 45321936Shselasky#include <infiniband/sa.h> 46321936Shselasky 47321936Shselasky#define ACM_VERSION 1 48321936Shselasky 49321936Shselasky#define ACM_OP_RESOLVE 0x01 50321936Shselasky#define ACM_OP_ACK 0x80 51321936Shselasky 52321936Shselasky#define ACM_STATUS_SUCCESS 0 53321936Shselasky#define ACM_STATUS_ENOMEM 1 54321936Shselasky#define ACM_STATUS_EINVAL 2 55321936Shselasky#define ACM_STATUS_ENODATA 3 56321936Shselasky#define ACM_STATUS_ENOTCONN 5 57321936Shselasky#define ACM_STATUS_ETIMEDOUT 6 58321936Shselasky#define ACM_STATUS_ESRCADDR 7 59321936Shselasky#define ACM_STATUS_ESRCTYPE 8 60321936Shselasky#define ACM_STATUS_EDESTADDR 9 61321936Shselasky#define ACM_STATUS_EDESTTYPE 10 62321936Shselasky 63321936Shselasky#define ACM_FLAGS_NODELAY (1<<30) 64321936Shselasky 65321936Shselasky#define ACM_MSG_HDR_LENGTH 16 66321936Shselasky#define ACM_MAX_ADDRESS 64 67321936Shselasky#define ACM_MSG_EP_LENGTH 72 68321936Shselasky#define ACM_MSG_DATA_LENGTH (ACM_MSG_EP_LENGTH * 8) 69321936Shselasky 70321936Shselaskystruct acm_hdr { 71321936Shselasky uint8_t version; 72321936Shselasky uint8_t opcode; 73321936Shselasky uint8_t status; 74321936Shselasky uint8_t data[3]; 75321936Shselasky uint16_t length; 76321936Shselasky uint64_t tid; 77321936Shselasky}; 78321936Shselasky 79321936Shselasky#define ACM_EP_INFO_NAME 0x0001 80321936Shselasky#define ACM_EP_INFO_ADDRESS_IP 0x0002 81321936Shselasky#define ACM_EP_INFO_ADDRESS_IP6 0x0003 82321936Shselasky#define ACM_EP_INFO_PATH 0x0010 83321936Shselasky 84321936Shselaskyunion acm_ep_info { 85321936Shselasky uint8_t addr[ACM_MAX_ADDRESS]; 86321936Shselasky uint8_t name[ACM_MAX_ADDRESS]; 87321936Shselasky struct ibv_path_record path; 88321936Shselasky}; 89321936Shselasky 90321936Shselasky#define ACM_EP_FLAG_SOURCE (1<<0) 91321936Shselasky#define ACM_EP_FLAG_DEST (1<<1) 92321936Shselasky 93321936Shselaskystruct acm_ep_addr_data { 94321936Shselasky uint32_t flags; 95321936Shselasky uint16_t type; 96321936Shselasky uint16_t reserved; 97321936Shselasky union acm_ep_info info; 98321936Shselasky}; 99321936Shselasky 100321936Shselaskystruct acm_resolve_msg { 101321936Shselasky struct acm_hdr hdr; 102321936Shselasky struct acm_ep_addr_data data[0]; 103321936Shselasky}; 104321936Shselasky 105321936Shselaskystruct acm_msg { 106321936Shselasky struct acm_hdr hdr; 107321936Shselasky union{ 108321936Shselasky uint8_t data[ACM_MSG_DATA_LENGTH]; 109321936Shselasky struct acm_ep_addr_data resolve_data[0]; 110321936Shselasky }; 111321936Shselasky}; 112321936Shselasky 113321936Shselaskystatic pthread_mutex_t acm_lock = PTHREAD_MUTEX_INITIALIZER; 114321936Shselaskystatic int sock = -1; 115321936Shselaskystatic uint16_t server_port; 116321936Shselasky 117321936Shselaskystatic int ucma_set_server_port(void) 118321936Shselasky{ 119321936Shselasky FILE *f; 120321936Shselasky 121321936Shselasky if ((f = fopen(IBACM_PORT_FILE, "r" STREAM_CLOEXEC))) { 122321936Shselasky if (fscanf(f, "%" SCNu16, &server_port) != 1) 123321936Shselasky server_port = 0; 124321936Shselasky fclose(f); 125321936Shselasky } 126321936Shselasky return server_port; 127321936Shselasky} 128321936Shselasky 129321936Shselaskyvoid ucma_ib_init(void) 130321936Shselasky{ 131321936Shselasky struct sockaddr_in addr; 132321936Shselasky static int init; 133321936Shselasky int ret; 134321936Shselasky 135321936Shselasky if (init) 136321936Shselasky return; 137321936Shselasky 138321936Shselasky pthread_mutex_lock(&acm_lock); 139321936Shselasky if (init) 140321936Shselasky goto unlock; 141321936Shselasky 142321936Shselasky if (!ucma_set_server_port()) 143321936Shselasky goto out; 144321936Shselasky 145321936Shselasky sock = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP); 146321936Shselasky if (sock < 0) 147321936Shselasky goto out; 148321936Shselasky 149321936Shselasky memset(&addr, 0, sizeof addr); 150321936Shselasky addr.sin_family = AF_INET; 151321936Shselasky addr.sin_addr.s_addr = htobe32(INADDR_LOOPBACK); 152321936Shselasky addr.sin_port = htobe16(server_port); 153321936Shselasky ret = connect(sock, (struct sockaddr *) &addr, sizeof(addr)); 154321936Shselasky if (ret) { 155321936Shselasky close(sock); 156321936Shselasky sock = -1; 157321936Shselasky } 158321936Shselaskyout: 159321936Shselasky init = 1; 160321936Shselaskyunlock: 161321936Shselasky pthread_mutex_unlock(&acm_lock); 162321936Shselasky} 163321936Shselasky 164321936Shselaskyvoid ucma_ib_cleanup(void) 165321936Shselasky{ 166321936Shselasky if (sock >= 0) { 167321936Shselasky shutdown(sock, SHUT_RDWR); 168321936Shselasky close(sock); 169321936Shselasky } 170321936Shselasky} 171321936Shselasky 172321936Shselaskystatic int ucma_ib_set_addr(struct rdma_addrinfo *ib_rai, 173321936Shselasky struct rdma_addrinfo *rai) 174321936Shselasky{ 175321936Shselasky struct sockaddr_ib *src, *dst; 176321936Shselasky struct ibv_path_record *path; 177321936Shselasky 178321936Shselasky src = calloc(1, sizeof(*src)); 179321936Shselasky if (!src) 180321936Shselasky return ERR(ENOMEM); 181321936Shselasky 182321936Shselasky dst = calloc(1, sizeof(*dst)); 183321936Shselasky if (!dst) { 184321936Shselasky free(src); 185321936Shselasky return ERR(ENOMEM); 186321936Shselasky } 187321936Shselasky 188321936Shselasky path = &((struct ibv_path_data *) ib_rai->ai_route)->path; 189321936Shselasky 190321936Shselasky src->sib_family = AF_IB; 191321936Shselasky src->sib_pkey = path->pkey; 192321936Shselasky src->sib_flowinfo = htobe32(be32toh(path->flowlabel_hoplimit) >> 8); 193321936Shselasky memcpy(&src->sib_addr, &path->sgid, 16); 194321936Shselasky ucma_set_sid(ib_rai->ai_port_space, rai->ai_src_addr, src); 195321936Shselasky 196321936Shselasky dst->sib_family = AF_IB; 197321936Shselasky dst->sib_pkey = path->pkey; 198321936Shselasky dst->sib_flowinfo = htobe32(be32toh(path->flowlabel_hoplimit) >> 8); 199321936Shselasky memcpy(&dst->sib_addr, &path->dgid, 16); 200321936Shselasky ucma_set_sid(ib_rai->ai_port_space, rai->ai_dst_addr, dst); 201321936Shselasky 202321936Shselasky ib_rai->ai_src_addr = (struct sockaddr *) src; 203321936Shselasky ib_rai->ai_src_len = sizeof(*src); 204321936Shselasky 205321936Shselasky ib_rai->ai_dst_addr = (struct sockaddr *) dst; 206321936Shselasky ib_rai->ai_dst_len = sizeof(*dst); 207321936Shselasky 208321936Shselasky return 0; 209321936Shselasky} 210321936Shselasky 211321936Shselaskystatic int ucma_ib_set_connect(struct rdma_addrinfo *ib_rai, 212321936Shselasky struct rdma_addrinfo *rai) 213321936Shselasky{ 214321936Shselasky struct ib_connect_hdr *hdr; 215321936Shselasky 216321936Shselasky if (rai->ai_family == AF_IB) 217321936Shselasky return 0; 218321936Shselasky 219321936Shselasky hdr = calloc(1, sizeof(*hdr)); 220321936Shselasky if (!hdr) 221321936Shselasky return ERR(ENOMEM); 222321936Shselasky 223321936Shselasky if (rai->ai_family == AF_INET) { 224321936Shselasky hdr->ip_version = 4 << 4; 225321936Shselasky memcpy(&hdr->cma_src_ip4, 226321936Shselasky &((struct sockaddr_in *) rai->ai_src_addr)->sin_addr, 4); 227321936Shselasky memcpy(&hdr->cma_dst_ip4, 228321936Shselasky &((struct sockaddr_in *) rai->ai_dst_addr)->sin_addr, 4); 229321936Shselasky } else { 230321936Shselasky hdr->ip_version = 6 << 4; 231321936Shselasky memcpy(&hdr->cma_src_ip6, 232321936Shselasky &((struct sockaddr_in6 *) rai->ai_src_addr)->sin6_addr, 16); 233321936Shselasky memcpy(&hdr->cma_dst_ip6, 234321936Shselasky &((struct sockaddr_in6 *) rai->ai_dst_addr)->sin6_addr, 16); 235321936Shselasky } 236321936Shselasky 237321936Shselasky ib_rai->ai_connect = hdr; 238321936Shselasky ib_rai->ai_connect_len = sizeof(*hdr); 239321936Shselasky return 0; 240321936Shselasky} 241321936Shselasky 242321936Shselaskystatic void ucma_resolve_af_ib(struct rdma_addrinfo **rai) 243321936Shselasky{ 244321936Shselasky struct rdma_addrinfo *ib_rai; 245321936Shselasky 246321936Shselasky ib_rai = calloc(1, sizeof(*ib_rai)); 247321936Shselasky if (!ib_rai) 248321936Shselasky return; 249321936Shselasky 250321936Shselasky ib_rai->ai_flags = (*rai)->ai_flags; 251321936Shselasky ib_rai->ai_family = AF_IB; 252321936Shselasky ib_rai->ai_qp_type = (*rai)->ai_qp_type; 253321936Shselasky ib_rai->ai_port_space = (*rai)->ai_port_space; 254321936Shselasky 255321936Shselasky ib_rai->ai_route = calloc(1, (*rai)->ai_route_len); 256321936Shselasky if (!ib_rai->ai_route) 257321936Shselasky goto err; 258321936Shselasky 259321936Shselasky memcpy(ib_rai->ai_route, (*rai)->ai_route, (*rai)->ai_route_len); 260321936Shselasky ib_rai->ai_route_len = (*rai)->ai_route_len; 261321936Shselasky 262321936Shselasky if ((*rai)->ai_src_canonname) { 263321936Shselasky ib_rai->ai_src_canonname = strdup((*rai)->ai_src_canonname); 264321936Shselasky if (!ib_rai->ai_src_canonname) 265321936Shselasky goto err; 266321936Shselasky } 267321936Shselasky 268321936Shselasky if ((*rai)->ai_dst_canonname) { 269321936Shselasky ib_rai->ai_dst_canonname = strdup((*rai)->ai_dst_canonname); 270321936Shselasky if (!ib_rai->ai_dst_canonname) 271321936Shselasky goto err; 272321936Shselasky } 273321936Shselasky 274321936Shselasky if (ucma_ib_set_connect(ib_rai, *rai)) 275321936Shselasky goto err; 276321936Shselasky 277321936Shselasky if (ucma_ib_set_addr(ib_rai, *rai)) 278321936Shselasky goto err; 279321936Shselasky 280321936Shselasky ib_rai->ai_next = *rai; 281321936Shselasky *rai = ib_rai; 282321936Shselasky return; 283321936Shselasky 284321936Shselaskyerr: 285321936Shselasky rdma_freeaddrinfo(ib_rai); 286321936Shselasky} 287321936Shselasky 288321936Shselaskystatic void ucma_ib_save_resp(struct rdma_addrinfo *rai, struct acm_msg *msg) 289321936Shselasky{ 290321936Shselasky struct acm_ep_addr_data *ep_data; 291321936Shselasky struct ibv_path_data *path_data = NULL; 292321936Shselasky struct sockaddr_in *sin; 293321936Shselasky struct sockaddr_in6 *sin6; 294321936Shselasky int i, cnt, path_cnt = 0; 295321936Shselasky 296321936Shselasky cnt = (msg->hdr.length - ACM_MSG_HDR_LENGTH) / ACM_MSG_EP_LENGTH; 297321936Shselasky for (i = 0; i < cnt; i++) { 298321936Shselasky ep_data = &msg->resolve_data[i]; 299321936Shselasky switch (ep_data->type) { 300321936Shselasky case ACM_EP_INFO_PATH: 301321936Shselasky ep_data->type = 0; 302321936Shselasky if (!path_data) 303321936Shselasky path_data = (struct ibv_path_data *) ep_data; 304321936Shselasky path_cnt++; 305321936Shselasky break; 306321936Shselasky case ACM_EP_INFO_ADDRESS_IP: 307321936Shselasky if (!(ep_data->flags & ACM_EP_FLAG_SOURCE) || rai->ai_src_len) 308321936Shselasky break; 309321936Shselasky 310321936Shselasky sin = calloc(1, sizeof(*sin)); 311321936Shselasky if (!sin) 312321936Shselasky break; 313321936Shselasky 314321936Shselasky sin->sin_family = AF_INET; 315321936Shselasky memcpy(&sin->sin_addr, &ep_data->info.addr, 4); 316321936Shselasky rai->ai_src_len = sizeof(*sin); 317321936Shselasky rai->ai_src_addr = (struct sockaddr *) sin; 318321936Shselasky break; 319321936Shselasky case ACM_EP_INFO_ADDRESS_IP6: 320321936Shselasky if (!(ep_data->flags & ACM_EP_FLAG_SOURCE) || rai->ai_src_len) 321321936Shselasky break; 322321936Shselasky 323321936Shselasky sin6 = calloc(1, sizeof(*sin6)); 324321936Shselasky if (!sin6) 325321936Shselasky break; 326321936Shselasky 327321936Shselasky sin6->sin6_family = AF_INET6; 328321936Shselasky memcpy(&sin6->sin6_addr, &ep_data->info.addr, 16); 329321936Shselasky rai->ai_src_len = sizeof(*sin6); 330321936Shselasky rai->ai_src_addr = (struct sockaddr *) sin6; 331321936Shselasky break; 332321936Shselasky default: 333321936Shselasky break; 334321936Shselasky } 335321936Shselasky } 336321936Shselasky 337321936Shselasky rai->ai_route = calloc(path_cnt, sizeof(*path_data)); 338321936Shselasky if (rai->ai_route) { 339321936Shselasky memcpy(rai->ai_route, path_data, path_cnt * sizeof(*path_data)); 340321936Shselasky rai->ai_route_len = path_cnt * sizeof(*path_data); 341321936Shselasky } 342321936Shselasky} 343321936Shselasky 344321936Shselaskystatic void ucma_set_ep_addr(struct acm_ep_addr_data *data, struct sockaddr *addr) 345321936Shselasky{ 346321936Shselasky if (addr->sa_family == AF_INET) { 347321936Shselasky data->type = ACM_EP_INFO_ADDRESS_IP; 348321936Shselasky memcpy(data->info.addr, &((struct sockaddr_in *) addr)->sin_addr, 4); 349321936Shselasky } else { 350321936Shselasky data->type = ACM_EP_INFO_ADDRESS_IP6; 351321936Shselasky memcpy(data->info.addr, &((struct sockaddr_in6 *) addr)->sin6_addr, 16); 352321936Shselasky } 353321936Shselasky} 354321936Shselasky 355321936Shselaskystatic int ucma_inet_addr(struct sockaddr *addr, socklen_t len) 356321936Shselasky{ 357321936Shselasky return len && addr && (addr->sa_family == AF_INET || 358321936Shselasky addr->sa_family == AF_INET6); 359321936Shselasky} 360321936Shselasky 361321936Shselaskystatic int ucma_ib_addr(struct sockaddr *addr, socklen_t len) 362321936Shselasky{ 363321936Shselasky return len && addr && (addr->sa_family == AF_IB); 364321936Shselasky} 365321936Shselasky 366321936Shselaskyvoid ucma_ib_resolve(struct rdma_addrinfo **rai, 367321936Shselasky const struct rdma_addrinfo *hints) 368321936Shselasky{ 369321936Shselasky struct acm_msg msg; 370321936Shselasky struct acm_ep_addr_data *data; 371321936Shselasky int ret; 372321936Shselasky 373321936Shselasky ucma_ib_init(); 374321936Shselasky if (sock < 0) 375321936Shselasky return; 376321936Shselasky 377321936Shselasky memset(&msg, 0, sizeof msg); 378321936Shselasky msg.hdr.version = ACM_VERSION; 379321936Shselasky msg.hdr.opcode = ACM_OP_RESOLVE; 380321936Shselasky msg.hdr.length = ACM_MSG_HDR_LENGTH; 381321936Shselasky 382321936Shselasky data = &msg.resolve_data[0]; 383321936Shselasky if (ucma_inet_addr((*rai)->ai_src_addr, (*rai)->ai_src_len)) { 384321936Shselasky data->flags = ACM_EP_FLAG_SOURCE; 385321936Shselasky ucma_set_ep_addr(data, (*rai)->ai_src_addr); 386321936Shselasky data++; 387321936Shselasky msg.hdr.length += ACM_MSG_EP_LENGTH; 388321936Shselasky } 389321936Shselasky 390321936Shselasky if (ucma_inet_addr((*rai)->ai_dst_addr, (*rai)->ai_dst_len)) { 391321936Shselasky data->flags = ACM_EP_FLAG_DEST; 392321936Shselasky if (hints->ai_flags & (RAI_NUMERICHOST | RAI_NOROUTE)) 393321936Shselasky data->flags |= ACM_FLAGS_NODELAY; 394321936Shselasky ucma_set_ep_addr(data, (*rai)->ai_dst_addr); 395321936Shselasky data++; 396321936Shselasky msg.hdr.length += ACM_MSG_EP_LENGTH; 397321936Shselasky } 398321936Shselasky 399321936Shselasky if (hints->ai_route_len || 400321936Shselasky ucma_ib_addr((*rai)->ai_src_addr, (*rai)->ai_src_len) || 401321936Shselasky ucma_ib_addr((*rai)->ai_dst_addr, (*rai)->ai_dst_len)) { 402321936Shselasky struct ibv_path_record *path; 403321936Shselasky 404321936Shselasky if (hints->ai_route_len == sizeof(struct ibv_path_record)) 405321936Shselasky path = (struct ibv_path_record *) hints->ai_route; 406321936Shselasky else if (hints->ai_route_len == sizeof(struct ibv_path_data)) 407321936Shselasky path = &((struct ibv_path_data *) hints->ai_route)->path; 408321936Shselasky else 409321936Shselasky path = NULL; 410321936Shselasky 411321936Shselasky if (path) 412321936Shselasky memcpy(&data->info.path, path, sizeof(*path)); 413321936Shselasky 414321936Shselasky if (ucma_ib_addr((*rai)->ai_src_addr, (*rai)->ai_src_len)) { 415321936Shselasky memcpy(&data->info.path.sgid, 416321936Shselasky &((struct sockaddr_ib *) (*rai)->ai_src_addr)->sib_addr, 16); 417321936Shselasky } 418321936Shselasky if (ucma_ib_addr((*rai)->ai_dst_addr, (*rai)->ai_dst_len)) { 419321936Shselasky memcpy(&data->info.path.dgid, 420321936Shselasky &((struct sockaddr_ib *) (*rai)->ai_dst_addr)->sib_addr, 16); 421321936Shselasky } 422321936Shselasky data->type = ACM_EP_INFO_PATH; 423321936Shselasky data++; 424321936Shselasky msg.hdr.length += ACM_MSG_EP_LENGTH; 425321936Shselasky } 426321936Shselasky 427321936Shselasky pthread_mutex_lock(&acm_lock); 428321936Shselasky ret = send(sock, (char *) &msg, msg.hdr.length, 0); 429321936Shselasky if (ret != msg.hdr.length) { 430321936Shselasky pthread_mutex_unlock(&acm_lock); 431321936Shselasky return; 432321936Shselasky } 433321936Shselasky 434321936Shselasky ret = recv(sock, (char *) &msg, sizeof msg, 0); 435321936Shselasky pthread_mutex_unlock(&acm_lock); 436321936Shselasky if (ret < ACM_MSG_HDR_LENGTH || ret != msg.hdr.length || msg.hdr.status) 437321936Shselasky return; 438321936Shselasky 439321936Shselasky ucma_ib_save_resp(*rai, &msg); 440321936Shselasky 441321936Shselasky if (af_ib_support && !(hints->ai_flags & RAI_ROUTEONLY) && (*rai)->ai_route_len) 442321936Shselasky ucma_resolve_af_ib(rai); 443321936Shselasky} 444