/*
 * Copyright (c) 2005-2014 Intel Corporation. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 * Redistribution and use in source and binary forms, with or
 * without modification, are permitted provided that the following
 * conditions are met:
 *
 *  - Redistributions of source code must retain the above
 *    copyright notice, this list of conditions and the following
 *    disclaimer.
 *
 *  - Redistributions in binary form must reproduce the above
 *    copyright notice, this list of conditions and the following
 *    disclaimer in the documentation and/or other materials
 *    provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <stdlib.h>
#include <string.h>
#include <glob.h>
#include <stdio.h>
#include <fcntl.h>
#include <errno.h>
#include <stdint.h>
#include <poll.h>
#include <unistd.h>
#include <pthread.h>
#include <endian.h>
#include <byteswap.h>
#include <stddef.h>
#include <netdb.h>
#include <syslog.h>
#include <limits.h>

#include "cma.h"
#include "indexer.h"
#include <infiniband/driver.h>
#include <infiniband/marshall.h>
#include <rdma/rdma_cma.h>
#include <rdma/rdma_cma_abi.h>
#include <rdma/rdma_verbs.h>
#include <infiniband/ib.h>

#define CMA_INIT_CMD(req, req_size, op)		\
do {						\
	memset(req, 0, req_size);		\
	(req)->cmd = UCMA_CMD_##op;		\
	(req)->in = req_size - sizeof(struct ucma_abi_cmd_hdr); \
} while (0)

#define CMA_INIT_CMD_RESP(req, req_size, op, resp, resp_size)	\
do {								\
	CMA_INIT_CMD(req, req_size, op);			\
	(req)->out = resp_size;					\
	(req)->response = (uintptr_t) (resp);			\
} while (0)

struct cma_port {
	uint8_t link_layer;
};

struct cma_device {
	struct ibv_context *verbs;
	struct ibv_pd *pd;
	struct ibv_xrcd *xrcd;
	struct cma_port *port;
	__be64 guid;
	int port_cnt;
	int refcnt;
	int max_qpsize;
	uint8_t max_initiator_depth;
	uint8_t max_responder_resources;
};

struct cma_id_private {
	struct rdma_cm_id id;
	struct cma_device *cma_dev;
	void *connect;
	size_t connect_len;
	int events_completed;
	int connect_error;
	int sync;
	pthread_cond_t cond;
	pthread_mutex_t mut;
	uint32_t handle;
	struct cma_multicast *mc_list;
	struct ibv_qp_init_attr *qp_init_attr;
	uint8_t initiator_depth;
	uint8_t responder_resources;
};

struct cma_multicast {
	struct cma_multicast *next;
	struct cma_id_private *id_priv;
	void *context;
	int events_completed;
	pthread_cond_t cond;
	uint32_t handle;
	union ibv_gid mgid;
	uint16_t mlid;
	struct sockaddr_storage addr;
};

struct cma_event {
	struct rdma_cm_event event;
	uint8_t private_data[RDMA_MAX_PRIVATE_DATA];
	struct cma_id_private *id_priv;
	struct cma_multicast *mc;
};

static struct cma_device *cma_dev_array;
static int cma_dev_cnt;
static int cma_init_cnt;
static pthread_mutex_t mut = PTHREAD_MUTEX_INITIALIZER;
static int abi_ver = RDMA_USER_CM_MAX_ABI_VERSION;
int af_ib_support;

/* Maps kernel-assigned id handles back to their cma_id_private structures. */
static struct index_map ucma_idm;
static fastlock_t idm_lock;

static int check_abi_version(void)
{
	char value[8];

	if ((ibv_read_sysfs_file(ibv_get_sysfs_path(),
				 "class/misc/rdma_cm/abi_version",
				 value, sizeof value) < 0) &&
	    (ibv_read_sysfs_file(ibv_get_sysfs_path(),
				 "class/infiniband_ucma/abi_version",
				 value, sizeof value) < 0)) {
		/*
		 * Older versions of Linux do not have class/misc.
To support * backports, assume the most recent version of the ABI. If * we're wrong, we'll simply fail later when calling the ABI. */ return 0; } abi_ver = strtol(value, NULL, 10); if (abi_ver < RDMA_USER_CM_MIN_ABI_VERSION || abi_ver > RDMA_USER_CM_MAX_ABI_VERSION) { return -1; } return 0; } /* * This function is called holding the mutex lock * cma_dev_cnt must be set before calling this function to * ensure that the lock is not acquired recursively. */ static void ucma_set_af_ib_support(void) { struct rdma_cm_id *id; struct sockaddr_ib sib; int ret; ret = rdma_create_id(NULL, &id, NULL, RDMA_PS_IB); if (ret) return; memset(&sib, 0, sizeof sib); sib.sib_family = AF_IB; sib.sib_sid = htobe64(RDMA_IB_IP_PS_TCP); sib.sib_sid_mask = htobe64(RDMA_IB_IP_PS_MASK); af_ib_support = 1; ret = rdma_bind_addr(id, (struct sockaddr *) &sib); af_ib_support = !ret; rdma_destroy_id(id); } int ucma_init(void) { struct ibv_device **dev_list = NULL; int i, ret, dev_cnt; /* Quick check without lock to see if we're already initialized */ if (cma_dev_cnt) return 0; pthread_mutex_lock(&mut); if (cma_dev_cnt) { pthread_mutex_unlock(&mut); return 0; } fastlock_init(&idm_lock); ret = check_abi_version(); if (ret) goto err1; dev_list = ibv_get_device_list(&dev_cnt); if (!dev_list) { ret = ERR(ENODEV); goto err1; } if (!dev_cnt) { ret = ERR(ENODEV); goto err2; } cma_dev_array = calloc(dev_cnt, sizeof(*cma_dev_array)); if (!cma_dev_array) { ret = ERR(ENOMEM); goto err2; } for (i = 0; dev_list[i]; i++) cma_dev_array[i].guid = ibv_get_device_guid(dev_list[i]); cma_dev_cnt = dev_cnt; ucma_set_af_ib_support(); pthread_mutex_unlock(&mut); ibv_free_device_list(dev_list); return 0; err2: ibv_free_device_list(dev_list); err1: fastlock_destroy(&idm_lock); pthread_mutex_unlock(&mut); return ret; } static struct ibv_context *ucma_open_device(__be64 guid) { struct ibv_device **dev_list; struct ibv_context *verbs = NULL; int i; dev_list = ibv_get_device_list(NULL); if (!dev_list) { return NULL; } for (i = 0; dev_list[i]; i++) { if (ibv_get_device_guid(dev_list[i]) == guid) { verbs = ibv_open_device(dev_list[i]); break; } } ibv_free_device_list(dev_list); return verbs; } static int ucma_init_device(struct cma_device *cma_dev) { struct ibv_port_attr port_attr; struct ibv_device_attr attr; int i, ret; if (cma_dev->verbs) return 0; cma_dev->verbs = ucma_open_device(cma_dev->guid); if (!cma_dev->verbs) return ERR(ENODEV); ret = ibv_query_device(cma_dev->verbs, &attr); if (ret) { ret = ERR(ret); goto err; } cma_dev->port = malloc(sizeof(*cma_dev->port) * attr.phys_port_cnt); if (!cma_dev->port) { ret = ERR(ENOMEM); goto err; } for (i = 1; i <= attr.phys_port_cnt; i++) { if (ibv_query_port(cma_dev->verbs, i, &port_attr)) cma_dev->port[i - 1].link_layer = IBV_LINK_LAYER_UNSPECIFIED; else cma_dev->port[i - 1].link_layer = port_attr.link_layer; } cma_dev->port_cnt = attr.phys_port_cnt; cma_dev->max_qpsize = attr.max_qp_wr; cma_dev->max_initiator_depth = (uint8_t) attr.max_qp_init_rd_atom; cma_dev->max_responder_resources = (uint8_t) attr.max_qp_rd_atom; cma_init_cnt++; return 0; err: ibv_close_device(cma_dev->verbs); cma_dev->verbs = NULL; return ret; } static int ucma_init_all(void) { int i, ret = 0; if (!cma_dev_cnt) { ret = ucma_init(); if (ret) return ret; } if (cma_init_cnt == cma_dev_cnt) return 0; pthread_mutex_lock(&mut); for (i = 0; i < cma_dev_cnt; i++) { ret = ucma_init_device(&cma_dev_array[i]); if (ret) break; } pthread_mutex_unlock(&mut); return ret; } struct ibv_context **rdma_get_devices(int *num_devices) { struct ibv_context 
**devs = NULL;
	int i;

	if (ucma_init_all())
		goto out;

	devs = malloc(sizeof(*devs) * (cma_dev_cnt + 1));
	if (!devs)
		goto out;

	for (i = 0; i < cma_dev_cnt; i++)
		devs[i] = cma_dev_array[i].verbs;
	devs[i] = NULL;
out:
	if (num_devices)
		*num_devices = devs ? cma_dev_cnt : 0;
	return devs;
}

void rdma_free_devices(struct ibv_context **list)
{
	free(list);
}

struct rdma_event_channel *rdma_create_event_channel(void)
{
	struct rdma_event_channel *channel;

	if (ucma_init())
		return NULL;

	channel = malloc(sizeof(*channel));
	if (!channel)
		return NULL;

	/* The rdma_ucm module exposes its character device under /dev/infiniband. */
	channel->fd = open("/dev/infiniband/rdma_cm", O_RDWR | O_CLOEXEC);
	if (channel->fd < 0) {
		goto err;
	}
	return channel;
err:
	free(channel);
	return NULL;
}

void rdma_destroy_event_channel(struct rdma_event_channel *channel)
{
	close(channel->fd);
	free(channel);
}

static int ucma_get_device(struct cma_id_private *id_priv, __be64 guid)
{
	struct cma_device *cma_dev;
	int i, ret;

	for (i = 0; i < cma_dev_cnt; i++) {
		cma_dev = &cma_dev_array[i];
		if (cma_dev->guid == guid)
			goto match;
	}

	return ERR(ENODEV);
match:
	pthread_mutex_lock(&mut);
	if ((ret = ucma_init_device(cma_dev)))
		goto out;

	if (!cma_dev->refcnt++) {
		cma_dev->pd = ibv_alloc_pd(cma_dev->verbs);
		if (!cma_dev->pd) {
			cma_dev->refcnt--;
			ret = ERR(ENOMEM);
			goto out;
		}
	}
	id_priv->cma_dev = cma_dev;
	id_priv->id.verbs = cma_dev->verbs;
	id_priv->id.pd = cma_dev->pd;
out:
	pthread_mutex_unlock(&mut);
	return ret;
}

static void ucma_put_device(struct cma_device *cma_dev)
{
	pthread_mutex_lock(&mut);
	if (!--cma_dev->refcnt) {
		ibv_dealloc_pd(cma_dev->pd);
		if (cma_dev->xrcd)
			ibv_close_xrcd(cma_dev->xrcd);
	}
	pthread_mutex_unlock(&mut);
}

static struct ibv_xrcd *ucma_get_xrcd(struct cma_device *cma_dev)
{
	struct ibv_xrcd_init_attr attr;

	pthread_mutex_lock(&mut);
	if (!cma_dev->xrcd) {
		memset(&attr, 0, sizeof attr);
		attr.comp_mask = IBV_XRCD_INIT_ATTR_FD | IBV_XRCD_INIT_ATTR_OFLAGS;
		attr.fd = -1;
		attr.oflags = O_CREAT;
		cma_dev->xrcd = ibv_open_xrcd(cma_dev->verbs, &attr);
	}
	pthread_mutex_unlock(&mut);
	return cma_dev->xrcd;
}

static void ucma_insert_id(struct cma_id_private *id_priv)
{
	fastlock_acquire(&idm_lock);
	idm_set(&ucma_idm, id_priv->handle, id_priv);
	fastlock_release(&idm_lock);
}

static void ucma_remove_id(struct cma_id_private *id_priv)
{
	if (id_priv->handle <= IDX_MAX_INDEX)
		idm_clear(&ucma_idm, id_priv->handle);
}

static struct cma_id_private *ucma_lookup_id(int handle)
{
	return idm_lookup(&ucma_idm, handle);
}

static void ucma_free_id(struct cma_id_private *id_priv)
{
	ucma_remove_id(id_priv);
	if (id_priv->cma_dev)
		ucma_put_device(id_priv->cma_dev);
	pthread_cond_destroy(&id_priv->cond);
	pthread_mutex_destroy(&id_priv->mut);
	if (id_priv->id.route.path_rec)
		free(id_priv->id.route.path_rec);

	if (id_priv->sync)
		rdma_destroy_event_channel(id_priv->id.channel);
	if (id_priv->connect_len)
		free(id_priv->connect);
	free(id_priv);
}

static struct cma_id_private *ucma_alloc_id(struct rdma_event_channel *channel,
					    void *context,
					    enum rdma_port_space ps,
					    enum ibv_qp_type qp_type)
{
	struct cma_id_private *id_priv;

	id_priv = calloc(1, sizeof(*id_priv));
	if (!id_priv)
		return NULL;

	id_priv->id.context = context;
	id_priv->id.ps = ps;
	id_priv->id.qp_type = qp_type;
	id_priv->handle = 0xFFFFFFFF;

	if (!channel) {
		id_priv->id.channel = rdma_create_event_channel();
		if (!id_priv->id.channel)
			goto err;
		id_priv->sync = 1;
	} else {
		id_priv->id.channel = channel;
	}

	pthread_mutex_init(&id_priv->mut, NULL);
	if (pthread_cond_init(&id_priv->cond, NULL))
		goto err;

	return id_priv;

err:
	ucma_free_id(id_priv);
	return NULL;
}

static int rdma_create_id2(struct
rdma_event_channel *channel, struct rdma_cm_id **id, void *context, enum rdma_port_space ps, enum ibv_qp_type qp_type) { struct ucma_abi_create_id_resp resp; struct ucma_abi_create_id cmd; struct cma_id_private *id_priv; int ret; ret = ucma_init(); if (ret) return ret; id_priv = ucma_alloc_id(channel, context, ps, qp_type); if (!id_priv) return ERR(ENOMEM); CMA_INIT_CMD_RESP(&cmd, sizeof cmd, CREATE_ID, &resp, sizeof resp); cmd.uid = (uintptr_t) id_priv; cmd.ps = ps; cmd.qp_type = qp_type; ret = write(id_priv->id.channel->fd, &cmd, sizeof cmd); if (ret != sizeof cmd) goto err; VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); id_priv->handle = resp.id; ucma_insert_id(id_priv); *id = &id_priv->id; return 0; err: ucma_free_id(id_priv); return ret; } int rdma_create_id(struct rdma_event_channel *channel, struct rdma_cm_id **id, void *context, enum rdma_port_space ps) { enum ibv_qp_type qp_type; qp_type = (ps == RDMA_PS_IPOIB || ps == RDMA_PS_UDP) ? IBV_QPT_UD : IBV_QPT_RC; return rdma_create_id2(channel, id, context, ps, qp_type); } static int ucma_destroy_kern_id(int fd, uint32_t handle) { struct ucma_abi_destroy_id_resp resp; struct ucma_abi_destroy_id cmd; int ret; CMA_INIT_CMD_RESP(&cmd, sizeof cmd, DESTROY_ID, &resp, sizeof resp); cmd.id = handle; ret = write(fd, &cmd, sizeof cmd); if (ret != sizeof cmd) return (ret >= 0) ? ERR(ENODATA) : -1; VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); return resp.events_reported; } int rdma_destroy_id(struct rdma_cm_id *id) { struct cma_id_private *id_priv; int ret; id_priv = container_of(id, struct cma_id_private, id); ret = ucma_destroy_kern_id(id->channel->fd, id_priv->handle); if (ret < 0) return ret; if (id_priv->id.event) rdma_ack_cm_event(id_priv->id.event); pthread_mutex_lock(&id_priv->mut); while (id_priv->events_completed < ret) pthread_cond_wait(&id_priv->cond, &id_priv->mut); pthread_mutex_unlock(&id_priv->mut); ucma_free_id(id_priv); return 0; } int ucma_addrlen(struct sockaddr *addr) { if (!addr) return 0; switch (addr->sa_family) { case PF_INET: return sizeof(struct sockaddr_in); case PF_INET6: return sizeof(struct sockaddr_in6); case PF_IB: return af_ib_support ? sizeof(struct sockaddr_ib) : 0; default: return 0; } } static int ucma_query_addr(struct rdma_cm_id *id) { struct ucma_abi_query_addr_resp resp; struct ucma_abi_query cmd; struct cma_id_private *id_priv; int ret; CMA_INIT_CMD_RESP(&cmd, sizeof cmd, QUERY, &resp, sizeof resp); id_priv = container_of(id, struct cma_id_private, id); cmd.id = id_priv->handle; cmd.option = UCMA_QUERY_ADDR; ret = write(id->channel->fd, &cmd, sizeof cmd); if (ret != sizeof cmd) return (ret >= 0) ? ERR(ENODATA) : -1; VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); memcpy(&id->route.addr.src_addr, &resp.src_addr, resp.src_size); memcpy(&id->route.addr.dst_addr, &resp.dst_addr, resp.dst_size); if (!id_priv->cma_dev && resp.node_guid) { ret = ucma_get_device(id_priv, resp.node_guid); if (ret) return ret; id->port_num = resp.port_num; id->route.addr.addr.ibaddr.pkey = resp.pkey; } return 0; } static int ucma_query_gid(struct rdma_cm_id *id) { struct ucma_abi_query_addr_resp resp; struct ucma_abi_query cmd; struct cma_id_private *id_priv; struct sockaddr_ib *sib; int ret; CMA_INIT_CMD_RESP(&cmd, sizeof cmd, QUERY, &resp, sizeof resp); id_priv = container_of(id, struct cma_id_private, id); cmd.id = id_priv->handle; cmd.option = UCMA_QUERY_GID; ret = write(id->channel->fd, &cmd, sizeof cmd); if (ret != sizeof cmd) return (ret >= 0) ? 
ERR(ENODATA) : -1; VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); sib = (struct sockaddr_ib *) &resp.src_addr; memcpy(id->route.addr.addr.ibaddr.sgid.raw, sib->sib_addr.sib_raw, sizeof id->route.addr.addr.ibaddr.sgid); sib = (struct sockaddr_ib *) &resp.dst_addr; memcpy(id->route.addr.addr.ibaddr.dgid.raw, sib->sib_addr.sib_raw, sizeof id->route.addr.addr.ibaddr.dgid); return 0; } static void ucma_convert_path(struct ibv_path_data *path_data, struct ibv_sa_path_rec *sa_path) { uint32_t fl_hop; sa_path->dgid = path_data->path.dgid; sa_path->sgid = path_data->path.sgid; sa_path->dlid = path_data->path.dlid; sa_path->slid = path_data->path.slid; sa_path->raw_traffic = 0; fl_hop = be32toh(path_data->path.flowlabel_hoplimit); sa_path->flow_label = htobe32(fl_hop >> 8); sa_path->hop_limit = (uint8_t) fl_hop; sa_path->traffic_class = path_data->path.tclass; sa_path->reversible = path_data->path.reversible_numpath >> 7; sa_path->numb_path = 1; sa_path->pkey = path_data->path.pkey; sa_path->sl = be16toh(path_data->path.qosclass_sl) & 0xF; sa_path->mtu_selector = 2; /* exactly */ sa_path->mtu = path_data->path.mtu & 0x1F; sa_path->rate_selector = 2; sa_path->rate = path_data->path.rate & 0x1F; sa_path->packet_life_time_selector = 2; sa_path->packet_life_time = path_data->path.packetlifetime & 0x1F; sa_path->preference = (uint8_t) path_data->flags; } static int ucma_query_path(struct rdma_cm_id *id) { struct ucma_abi_query_path_resp *resp; struct ucma_abi_query cmd; struct cma_id_private *id_priv; int ret, i, size; size = sizeof(*resp) + sizeof(struct ibv_path_data) * 6; resp = alloca(size); CMA_INIT_CMD_RESP(&cmd, sizeof cmd, QUERY, resp, size); id_priv = container_of(id, struct cma_id_private, id); cmd.id = id_priv->handle; cmd.option = UCMA_QUERY_PATH; ret = write(id->channel->fd, &cmd, sizeof cmd); if (ret != sizeof cmd) return (ret >= 0) ? ERR(ENODATA) : -1; VALGRIND_MAKE_MEM_DEFINED(resp, size); if (resp->num_paths) { id->route.path_rec = malloc(sizeof(*id->route.path_rec) * resp->num_paths); if (!id->route.path_rec) return ERR(ENOMEM); id->route.num_paths = resp->num_paths; for (i = 0; i < resp->num_paths; i++) ucma_convert_path(&resp->path_data[i], &id->route.path_rec[i]); } return 0; } static int ucma_query_route(struct rdma_cm_id *id) { struct ucma_abi_query_route_resp resp; struct ucma_abi_query cmd; struct cma_id_private *id_priv; int ret, i; CMA_INIT_CMD_RESP(&cmd, sizeof cmd, QUERY_ROUTE, &resp, sizeof resp); id_priv = container_of(id, struct cma_id_private, id); cmd.id = id_priv->handle; ret = write(id->channel->fd, &cmd, sizeof cmd); if (ret != sizeof cmd) return (ret >= 0) ? 
ERR(ENODATA) : -1; VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); if (resp.num_paths) { id->route.path_rec = malloc(sizeof(*id->route.path_rec) * resp.num_paths); if (!id->route.path_rec) return ERR(ENOMEM); id->route.num_paths = resp.num_paths; for (i = 0; i < resp.num_paths; i++) ibv_copy_path_rec_from_kern(&id->route.path_rec[i], &resp.ib_route[i]); } memcpy(id->route.addr.addr.ibaddr.sgid.raw, resp.ib_route[0].sgid, sizeof id->route.addr.addr.ibaddr.sgid); memcpy(id->route.addr.addr.ibaddr.dgid.raw, resp.ib_route[0].dgid, sizeof id->route.addr.addr.ibaddr.dgid); id->route.addr.addr.ibaddr.pkey = resp.ib_route[0].pkey; memcpy(&id->route.addr.src_addr, &resp.src_addr, sizeof resp.src_addr); memcpy(&id->route.addr.dst_addr, &resp.dst_addr, sizeof resp.dst_addr); if (!id_priv->cma_dev && resp.node_guid) { ret = ucma_get_device(id_priv, resp.node_guid); if (ret) return ret; id_priv->id.port_num = resp.port_num; } return 0; } static int rdma_bind_addr2(struct rdma_cm_id *id, struct sockaddr *addr, socklen_t addrlen) { struct ucma_abi_bind cmd; struct cma_id_private *id_priv; int ret; CMA_INIT_CMD(&cmd, sizeof cmd, BIND); id_priv = container_of(id, struct cma_id_private, id); cmd.id = id_priv->handle; cmd.addr_size = addrlen; memcpy(&cmd.addr, addr, addrlen); ret = write(id->channel->fd, &cmd, sizeof cmd); if (ret != sizeof cmd) return (ret >= 0) ? ERR(ENODATA) : -1; ret = ucma_query_addr(id); if (!ret) ret = ucma_query_gid(id); return ret; } int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) { struct ucma_abi_bind_ip cmd; struct cma_id_private *id_priv; int ret, addrlen; addrlen = ucma_addrlen(addr); if (!addrlen) return ERR(EINVAL); if (af_ib_support) return rdma_bind_addr2(id, addr, addrlen); CMA_INIT_CMD(&cmd, sizeof cmd, BIND_IP); id_priv = container_of(id, struct cma_id_private, id); cmd.id = id_priv->handle; memcpy(&cmd.addr, addr, addrlen); ret = write(id->channel->fd, &cmd, sizeof cmd); if (ret != sizeof cmd) return (ret >= 0) ? ERR(ENODATA) : -1; return ucma_query_route(id); } int ucma_complete(struct rdma_cm_id *id) { struct cma_id_private *id_priv; int ret; id_priv = container_of(id, struct cma_id_private, id); if (!id_priv->sync) return 0; if (id_priv->id.event) { rdma_ack_cm_event(id_priv->id.event); id_priv->id.event = NULL; } ret = rdma_get_cm_event(id_priv->id.channel, &id_priv->id.event); if (ret) return ret; if (id_priv->id.event->status) { if (id_priv->id.event->event == RDMA_CM_EVENT_REJECTED) ret = ERR(ECONNREFUSED); else if (id_priv->id.event->status < 0) ret = ERR(-id_priv->id.event->status); else ret = ERR(-id_priv->id.event->status); } return ret; } static int rdma_resolve_addr2(struct rdma_cm_id *id, struct sockaddr *src_addr, socklen_t src_len, struct sockaddr *dst_addr, socklen_t dst_len, int timeout_ms) { struct ucma_abi_resolve_addr cmd; struct cma_id_private *id_priv; int ret; CMA_INIT_CMD(&cmd, sizeof cmd, RESOLVE_ADDR); id_priv = container_of(id, struct cma_id_private, id); cmd.id = id_priv->handle; if ((cmd.src_size = src_len)) memcpy(&cmd.src_addr, src_addr, src_len); memcpy(&cmd.dst_addr, dst_addr, dst_len); cmd.dst_size = dst_len; cmd.timeout_ms = timeout_ms; ret = write(id->channel->fd, &cmd, sizeof cmd); if (ret != sizeof cmd) return (ret >= 0) ? 
ERR(ENODATA) : -1; memcpy(&id->route.addr.dst_addr, dst_addr, dst_len); return ucma_complete(id); } int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, struct sockaddr *dst_addr, int timeout_ms) { struct ucma_abi_resolve_ip cmd; struct cma_id_private *id_priv; int ret, dst_len, src_len; dst_len = ucma_addrlen(dst_addr); if (!dst_len) return ERR(EINVAL); src_len = ucma_addrlen(src_addr); if (src_addr && !src_len) return ERR(EINVAL); if (af_ib_support) return rdma_resolve_addr2(id, src_addr, src_len, dst_addr, dst_len, timeout_ms); CMA_INIT_CMD(&cmd, sizeof cmd, RESOLVE_IP); id_priv = container_of(id, struct cma_id_private, id); cmd.id = id_priv->handle; if (src_addr) memcpy(&cmd.src_addr, src_addr, src_len); memcpy(&cmd.dst_addr, dst_addr, dst_len); cmd.timeout_ms = timeout_ms; ret = write(id->channel->fd, &cmd, sizeof cmd); if (ret != sizeof cmd) return (ret >= 0) ? ERR(ENODATA) : -1; memcpy(&id->route.addr.dst_addr, dst_addr, dst_len); return ucma_complete(id); } static int ucma_set_ib_route(struct rdma_cm_id *id) { struct rdma_addrinfo hint, *rai; int ret; memset(&hint, 0, sizeof hint); hint.ai_flags = RAI_ROUTEONLY; hint.ai_family = id->route.addr.src_addr.sa_family; hint.ai_src_len = ucma_addrlen((struct sockaddr *) &id->route.addr.src_addr); hint.ai_src_addr = &id->route.addr.src_addr; hint.ai_dst_len = ucma_addrlen((struct sockaddr *) &id->route.addr.dst_addr); hint.ai_dst_addr = &id->route.addr.dst_addr; ret = rdma_getaddrinfo(NULL, NULL, &hint, &rai); if (ret) return ret; if (rai->ai_route_len) ret = rdma_set_option(id, RDMA_OPTION_IB, RDMA_OPTION_IB_PATH, rai->ai_route, rai->ai_route_len); else ret = -1; rdma_freeaddrinfo(rai); return ret; } int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms) { struct ucma_abi_resolve_route cmd; struct cma_id_private *id_priv; int ret; id_priv = container_of(id, struct cma_id_private, id); if (id->verbs->device->transport_type == IBV_TRANSPORT_IB) { ret = ucma_set_ib_route(id); if (!ret) goto out; } CMA_INIT_CMD(&cmd, sizeof cmd, RESOLVE_ROUTE); cmd.id = id_priv->handle; cmd.timeout_ms = timeout_ms; ret = write(id->channel->fd, &cmd, sizeof cmd); if (ret != sizeof cmd) return (ret >= 0) ? ERR(ENODATA) : -1; out: return ucma_complete(id); } static int ucma_is_ud_qp(enum ibv_qp_type qp_type) { return (qp_type == IBV_QPT_UD); } static int rdma_init_qp_attr(struct rdma_cm_id *id, struct ibv_qp_attr *qp_attr, int *qp_attr_mask) { struct ucma_abi_init_qp_attr cmd; struct ibv_kern_qp_attr resp; struct cma_id_private *id_priv; int ret; CMA_INIT_CMD_RESP(&cmd, sizeof cmd, INIT_QP_ATTR, &resp, sizeof resp); id_priv = container_of(id, struct cma_id_private, id); cmd.id = id_priv->handle; cmd.qp_state = qp_attr->qp_state; ret = write(id->channel->fd, &cmd, sizeof cmd); if (ret != sizeof cmd) return (ret >= 0) ? ERR(ENODATA) : -1; VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); ibv_copy_qp_attr_from_kern(qp_attr, &resp); *qp_attr_mask = resp.qp_attr_mask; return 0; } static int ucma_modify_qp_rtr(struct rdma_cm_id *id, uint8_t resp_res) { struct cma_id_private *id_priv; struct ibv_qp_attr qp_attr; int qp_attr_mask, ret; uint8_t link_layer; if (!id->qp) return ERR(EINVAL); /* Need to update QP attributes from default values. 
*/ qp_attr.qp_state = IBV_QPS_INIT; ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask); if (ret) return ret; ret = ibv_modify_qp(id->qp, &qp_attr, qp_attr_mask); if (ret) return ERR(ret); qp_attr.qp_state = IBV_QPS_RTR; ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask); if (ret) return ret; /* * Workaround for rdma_ucm kernel bug: * mask off qp_attr_mask bits 21-24 which are used for RoCE */ id_priv = container_of(id, struct cma_id_private, id); link_layer = id_priv->cma_dev->port[id->port_num - 1].link_layer; if (link_layer == IBV_LINK_LAYER_INFINIBAND) qp_attr_mask &= UINT_MAX ^ 0xe00000; if (resp_res != RDMA_MAX_RESP_RES) qp_attr.max_dest_rd_atomic = resp_res; return rdma_seterrno(ibv_modify_qp(id->qp, &qp_attr, qp_attr_mask)); } static int ucma_modify_qp_rts(struct rdma_cm_id *id, uint8_t init_depth) { struct ibv_qp_attr qp_attr; int qp_attr_mask, ret; qp_attr.qp_state = IBV_QPS_RTS; ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask); if (ret) return ret; if (init_depth != RDMA_MAX_INIT_DEPTH) qp_attr.max_rd_atomic = init_depth; return rdma_seterrno(ibv_modify_qp(id->qp, &qp_attr, qp_attr_mask)); } static int ucma_modify_qp_sqd(struct rdma_cm_id *id) { struct ibv_qp_attr qp_attr; if (!id->qp) return 0; qp_attr.qp_state = IBV_QPS_SQD; return rdma_seterrno(ibv_modify_qp(id->qp, &qp_attr, IBV_QP_STATE)); } static int ucma_modify_qp_err(struct rdma_cm_id *id) { struct ibv_qp_attr qp_attr; if (!id->qp) return 0; qp_attr.qp_state = IBV_QPS_ERR; return rdma_seterrno(ibv_modify_qp(id->qp, &qp_attr, IBV_QP_STATE)); } static int ucma_find_pkey(struct cma_device *cma_dev, uint8_t port_num, __be16 pkey, uint16_t *pkey_index) { int ret, i; __be16 chk_pkey; for (i = 0, ret = 0; !ret; i++) { ret = ibv_query_pkey(cma_dev->verbs, port_num, i, &chk_pkey); if (!ret && pkey == chk_pkey) { *pkey_index = (uint16_t) i; return 0; } } return ERR(EINVAL); } static int ucma_init_conn_qp3(struct cma_id_private *id_priv, struct ibv_qp *qp) { struct ibv_qp_attr qp_attr; int ret; ret = ucma_find_pkey(id_priv->cma_dev, id_priv->id.port_num, id_priv->id.route.addr.addr.ibaddr.pkey, &qp_attr.pkey_index); if (ret) return ret; qp_attr.port_num = id_priv->id.port_num; qp_attr.qp_state = IBV_QPS_INIT; qp_attr.qp_access_flags = 0; ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE | IBV_QP_ACCESS_FLAGS | IBV_QP_PKEY_INDEX | IBV_QP_PORT); return rdma_seterrno(ret); } static int ucma_init_conn_qp(struct cma_id_private *id_priv, struct ibv_qp *qp) { struct ibv_qp_attr qp_attr; int qp_attr_mask, ret; if (abi_ver == 3) return ucma_init_conn_qp3(id_priv, qp); qp_attr.qp_state = IBV_QPS_INIT; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) return ret; return rdma_seterrno(ibv_modify_qp(qp, &qp_attr, qp_attr_mask)); } static int ucma_init_ud_qp3(struct cma_id_private *id_priv, struct ibv_qp *qp) { struct ibv_qp_attr qp_attr; int ret; ret = ucma_find_pkey(id_priv->cma_dev, id_priv->id.port_num, id_priv->id.route.addr.addr.ibaddr.pkey, &qp_attr.pkey_index); if (ret) return ret; qp_attr.port_num = id_priv->id.port_num; qp_attr.qp_state = IBV_QPS_INIT; qp_attr.qkey = RDMA_UDP_QKEY; ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE | IBV_QP_QKEY | IBV_QP_PKEY_INDEX | IBV_QP_PORT); if (ret) return ERR(ret); qp_attr.qp_state = IBV_QPS_RTR; ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE); if (ret) return ERR(ret); qp_attr.qp_state = IBV_QPS_RTS; qp_attr.sq_psn = 0; ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE | IBV_QP_SQ_PSN); return rdma_seterrno(ret); } static int ucma_init_ud_qp(struct cma_id_private *id_priv, 
struct ibv_qp *qp) { struct ibv_qp_attr qp_attr; int qp_attr_mask, ret; if (abi_ver == 3) return ucma_init_ud_qp3(id_priv, qp); qp_attr.qp_state = IBV_QPS_INIT; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) return ret; ret = ibv_modify_qp(qp, &qp_attr, qp_attr_mask); if (ret) return ERR(ret); qp_attr.qp_state = IBV_QPS_RTR; ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE); if (ret) return ERR(ret); qp_attr.qp_state = IBV_QPS_RTS; qp_attr.sq_psn = 0; ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE | IBV_QP_SQ_PSN); return rdma_seterrno(ret); } static void ucma_destroy_cqs(struct rdma_cm_id *id) { if (id->qp_type == IBV_QPT_XRC_RECV && id->srq) return; if (id->recv_cq) { ibv_destroy_cq(id->recv_cq); if (id->send_cq && (id->send_cq != id->recv_cq)) { ibv_destroy_cq(id->send_cq); id->send_cq = NULL; } id->recv_cq = NULL; } if (id->recv_cq_channel) { ibv_destroy_comp_channel(id->recv_cq_channel); if (id->send_cq_channel && (id->send_cq_channel != id->recv_cq_channel)) { ibv_destroy_comp_channel(id->send_cq_channel); id->send_cq_channel = NULL; } id->recv_cq_channel = NULL; } } static int ucma_create_cqs(struct rdma_cm_id *id, uint32_t send_size, uint32_t recv_size) { if (recv_size) { id->recv_cq_channel = ibv_create_comp_channel(id->verbs); if (!id->recv_cq_channel) goto err; id->recv_cq = ibv_create_cq(id->verbs, recv_size, id, id->recv_cq_channel, 0); if (!id->recv_cq) goto err; } if (send_size) { id->send_cq_channel = ibv_create_comp_channel(id->verbs); if (!id->send_cq_channel) goto err; id->send_cq = ibv_create_cq(id->verbs, send_size, id, id->send_cq_channel, 0); if (!id->send_cq) goto err; } return 0; err: ucma_destroy_cqs(id); return ERR(ENOMEM); } int rdma_create_srq_ex(struct rdma_cm_id *id, struct ibv_srq_init_attr_ex *attr) { struct cma_id_private *id_priv; struct ibv_srq *srq; int ret; id_priv = container_of(id, struct cma_id_private, id); if (!(attr->comp_mask & IBV_SRQ_INIT_ATTR_TYPE)) return ERR(EINVAL); if (!(attr->comp_mask & IBV_SRQ_INIT_ATTR_PD) || !attr->pd) { attr->pd = id->pd; attr->comp_mask |= IBV_SRQ_INIT_ATTR_PD; } if (attr->srq_type == IBV_SRQT_XRC) { if (!(attr->comp_mask & IBV_SRQ_INIT_ATTR_XRCD) || !attr->xrcd) { attr->xrcd = ucma_get_xrcd(id_priv->cma_dev); if (!attr->xrcd) return -1; } if (!(attr->comp_mask & IBV_SRQ_INIT_ATTR_CQ) || !attr->cq) { ret = ucma_create_cqs(id, 0, attr->attr.max_wr); if (ret) return ret; attr->cq = id->recv_cq; } attr->comp_mask |= IBV_SRQ_INIT_ATTR_XRCD | IBV_SRQ_INIT_ATTR_CQ; } srq = ibv_create_srq_ex(id->verbs, attr); if (!srq) { ret = -1; goto err; } if (!id->pd) id->pd = attr->pd; id->srq = srq; return 0; err: ucma_destroy_cqs(id); return ret; } int rdma_create_srq(struct rdma_cm_id *id, struct ibv_pd *pd, struct ibv_srq_init_attr *attr) { struct ibv_srq_init_attr_ex attr_ex; int ret; memcpy(&attr_ex, attr, sizeof(*attr)); attr_ex.comp_mask = IBV_SRQ_INIT_ATTR_TYPE | IBV_SRQ_INIT_ATTR_PD; if (id->qp_type == IBV_QPT_XRC_RECV) { attr_ex.srq_type = IBV_SRQT_XRC; } else { attr_ex.srq_type = IBV_SRQT_BASIC; } attr_ex.pd = pd; ret = rdma_create_srq_ex(id, &attr_ex); memcpy(attr, &attr_ex, sizeof(*attr)); return ret; } void rdma_destroy_srq(struct rdma_cm_id *id) { ibv_destroy_srq(id->srq); id->srq = NULL; ucma_destroy_cqs(id); } int rdma_create_qp_ex(struct rdma_cm_id *id, struct ibv_qp_init_attr_ex *attr) { struct cma_id_private *id_priv; struct ibv_qp *qp; int ret; if (id->qp) return ERR(EINVAL); id_priv = container_of(id, struct cma_id_private, id); if (!(attr->comp_mask & IBV_QP_INIT_ATTR_PD) || !attr->pd) 
{ attr->comp_mask |= IBV_QP_INIT_ATTR_PD; attr->pd = id->pd; } else if (id->verbs != attr->pd->context) return ERR(EINVAL); if ((id->recv_cq && attr->recv_cq && id->recv_cq != attr->recv_cq) || (id->send_cq && attr->send_cq && id->send_cq != attr->send_cq)) return ERR(EINVAL); if (id->qp_type == IBV_QPT_XRC_RECV) { if (!(attr->comp_mask & IBV_QP_INIT_ATTR_XRCD) || !attr->xrcd) { attr->xrcd = ucma_get_xrcd(id_priv->cma_dev); if (!attr->xrcd) return -1; attr->comp_mask |= IBV_QP_INIT_ATTR_XRCD; } } ret = ucma_create_cqs(id, attr->send_cq || id->send_cq ? 0 : attr->cap.max_send_wr, attr->recv_cq || id->recv_cq ? 0 : attr->cap.max_recv_wr); if (ret) return ret; if (!attr->send_cq) attr->send_cq = id->send_cq; if (!attr->recv_cq) attr->recv_cq = id->recv_cq; if (id->srq && !attr->srq) attr->srq = id->srq; qp = ibv_create_qp_ex(id->verbs, attr); if (!qp) { ret = ERR(ENOMEM); goto err1; } if (ucma_is_ud_qp(id->qp_type)) ret = ucma_init_ud_qp(id_priv, qp); else ret = ucma_init_conn_qp(id_priv, qp); if (ret) goto err2; id->pd = qp->pd; id->qp = qp; return 0; err2: ibv_destroy_qp(qp); err1: ucma_destroy_cqs(id); return ret; } int rdma_create_qp(struct rdma_cm_id *id, struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr) { struct ibv_qp_init_attr_ex attr_ex; int ret; memcpy(&attr_ex, qp_init_attr, sizeof(*qp_init_attr)); attr_ex.comp_mask = IBV_QP_INIT_ATTR_PD; attr_ex.pd = pd ? pd : id->pd; ret = rdma_create_qp_ex(id, &attr_ex); memcpy(qp_init_attr, &attr_ex, sizeof(*qp_init_attr)); return ret; } void rdma_destroy_qp(struct rdma_cm_id *id) { ibv_destroy_qp(id->qp); id->qp = NULL; ucma_destroy_cqs(id); } static int ucma_valid_param(struct cma_id_private *id_priv, struct rdma_conn_param *param) { if (id_priv->id.ps != RDMA_PS_TCP) return 0; if (!id_priv->id.qp && !param) goto err; if (!param) return 0; if ((param->responder_resources != RDMA_MAX_RESP_RES) && (param->responder_resources > id_priv->cma_dev->max_responder_resources)) goto err; if ((param->initiator_depth != RDMA_MAX_INIT_DEPTH) && (param->initiator_depth > id_priv->cma_dev->max_initiator_depth)) goto err; return 0; err: return ERR(EINVAL); } static void ucma_copy_conn_param_to_kern(struct cma_id_private *id_priv, struct ucma_abi_conn_param *dst, struct rdma_conn_param *src, uint32_t qp_num, uint8_t srq) { dst->qp_num = qp_num; dst->srq = srq; dst->responder_resources = id_priv->responder_resources; dst->initiator_depth = id_priv->initiator_depth; dst->valid = 1; if (id_priv->connect_len) { memcpy(dst->private_data, id_priv->connect, id_priv->connect_len); dst->private_data_len = id_priv->connect_len; } if (src) { dst->flow_control = src->flow_control; dst->retry_count = src->retry_count; dst->rnr_retry_count = src->rnr_retry_count; if (src->private_data && src->private_data_len) { memcpy(dst->private_data + dst->private_data_len, src->private_data, src->private_data_len); dst->private_data_len += src->private_data_len; } } else { dst->retry_count = 7; dst->rnr_retry_count = 7; } } int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) { struct ucma_abi_connect cmd; struct cma_id_private *id_priv; int ret; id_priv = container_of(id, struct cma_id_private, id); ret = ucma_valid_param(id_priv, conn_param); if (ret) return ret; if (conn_param && conn_param->initiator_depth != RDMA_MAX_INIT_DEPTH) id_priv->initiator_depth = conn_param->initiator_depth; else id_priv->initiator_depth = id_priv->cma_dev->max_initiator_depth; if (conn_param && conn_param->responder_resources != RDMA_MAX_RESP_RES) 
id_priv->responder_resources = conn_param->responder_resources; else id_priv->responder_resources = id_priv->cma_dev->max_responder_resources; CMA_INIT_CMD(&cmd, sizeof cmd, CONNECT); cmd.id = id_priv->handle; if (id->qp) { ucma_copy_conn_param_to_kern(id_priv, &cmd.conn_param, conn_param, id->qp->qp_num, (id->qp->srq != NULL)); } else if (conn_param) { ucma_copy_conn_param_to_kern(id_priv, &cmd.conn_param, conn_param, conn_param->qp_num, conn_param->srq); } else { ucma_copy_conn_param_to_kern(id_priv, &cmd.conn_param, conn_param, 0, 0); } ret = write(id->channel->fd, &cmd, sizeof cmd); if (ret != sizeof cmd) return (ret >= 0) ? ERR(ENODATA) : -1; if (id_priv->connect_len) { free(id_priv->connect); id_priv->connect_len = 0; } return ucma_complete(id); } int rdma_listen(struct rdma_cm_id *id, int backlog) { struct ucma_abi_listen cmd; struct cma_id_private *id_priv; int ret; CMA_INIT_CMD(&cmd, sizeof cmd, LISTEN); id_priv = container_of(id, struct cma_id_private, id); cmd.id = id_priv->handle; cmd.backlog = backlog; ret = write(id->channel->fd, &cmd, sizeof cmd); if (ret != sizeof cmd) return (ret >= 0) ? ERR(ENODATA) : -1; if (af_ib_support) return ucma_query_addr(id); else return ucma_query_route(id); } int rdma_get_request(struct rdma_cm_id *listen, struct rdma_cm_id **id) { struct cma_id_private *id_priv; struct rdma_cm_event *event; int ret; id_priv = container_of(listen, struct cma_id_private, id); if (!id_priv->sync) return ERR(EINVAL); if (listen->event) { rdma_ack_cm_event(listen->event); listen->event = NULL; } ret = rdma_get_cm_event(listen->channel, &event); if (ret) return ret; if (event->status) { ret = ERR(event->status); goto err; } if (event->event != RDMA_CM_EVENT_CONNECT_REQUEST) { ret = ERR(EINVAL); goto err; } if (id_priv->qp_init_attr) { struct ibv_qp_init_attr attr; attr = *id_priv->qp_init_attr; ret = rdma_create_qp(event->id, listen->pd, &attr); if (ret) goto err; } *id = event->id; (*id)->event = event; return 0; err: listen->event = event; return ret; } int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) { struct ucma_abi_accept cmd; struct cma_id_private *id_priv; int ret; id_priv = container_of(id, struct cma_id_private, id); ret = ucma_valid_param(id_priv, conn_param); if (ret) return ret; if (!conn_param || conn_param->initiator_depth == RDMA_MAX_INIT_DEPTH) { id_priv->initiator_depth = min(id_priv->initiator_depth, id_priv->cma_dev->max_initiator_depth); } else { id_priv->initiator_depth = conn_param->initiator_depth; } if (!conn_param || conn_param->responder_resources == RDMA_MAX_RESP_RES) { id_priv->responder_resources = min(id_priv->responder_resources, id_priv->cma_dev->max_responder_resources); } else { id_priv->responder_resources = conn_param->responder_resources; } if (!ucma_is_ud_qp(id->qp_type)) { ret = ucma_modify_qp_rtr(id, id_priv->responder_resources); if (ret) return ret; ret = ucma_modify_qp_rts(id, id_priv->initiator_depth); if (ret) return ret; } CMA_INIT_CMD(&cmd, sizeof cmd, ACCEPT); cmd.id = id_priv->handle; cmd.uid = (uintptr_t) id_priv; if (id->qp) ucma_copy_conn_param_to_kern(id_priv, &cmd.conn_param, conn_param, id->qp->qp_num, (id->qp->srq != NULL)); else ucma_copy_conn_param_to_kern(id_priv, &cmd.conn_param, conn_param, conn_param->qp_num, conn_param->srq); ret = write(id->channel->fd, &cmd, sizeof cmd); if (ret != sizeof cmd) { ucma_modify_qp_err(id); return (ret >= 0) ? 
ERR(ENODATA) : -1; } if (ucma_is_ud_qp(id->qp_type)) return 0; return ucma_complete(id); } int rdma_reject(struct rdma_cm_id *id, const void *private_data, uint8_t private_data_len) { struct ucma_abi_reject cmd; struct cma_id_private *id_priv; int ret; CMA_INIT_CMD(&cmd, sizeof cmd, REJECT); id_priv = container_of(id, struct cma_id_private, id); cmd.id = id_priv->handle; if (private_data && private_data_len) { memcpy(cmd.private_data, private_data, private_data_len); cmd.private_data_len = private_data_len; } ret = write(id->channel->fd, &cmd, sizeof cmd); if (ret != sizeof cmd) return (ret >= 0) ? ERR(ENODATA) : -1; return 0; } int rdma_notify(struct rdma_cm_id *id, enum ibv_event_type event) { struct ucma_abi_notify cmd; struct cma_id_private *id_priv; int ret; CMA_INIT_CMD(&cmd, sizeof cmd, NOTIFY); id_priv = container_of(id, struct cma_id_private, id); cmd.id = id_priv->handle; cmd.event = event; ret = write(id->channel->fd, &cmd, sizeof cmd); if (ret != sizeof cmd) return (ret >= 0) ? ERR(ENODATA) : -1; return 0; } int ucma_shutdown(struct rdma_cm_id *id) { switch (id->verbs->device->transport_type) { case IBV_TRANSPORT_IB: return ucma_modify_qp_err(id); case IBV_TRANSPORT_IWARP: return ucma_modify_qp_sqd(id); default: return ERR(EINVAL); } } int rdma_disconnect(struct rdma_cm_id *id) { struct ucma_abi_disconnect cmd; struct cma_id_private *id_priv; int ret; ret = ucma_shutdown(id); if (ret) return ret; CMA_INIT_CMD(&cmd, sizeof cmd, DISCONNECT); id_priv = container_of(id, struct cma_id_private, id); cmd.id = id_priv->handle; ret = write(id->channel->fd, &cmd, sizeof cmd); if (ret != sizeof cmd) return (ret >= 0) ? ERR(ENODATA) : -1; return ucma_complete(id); } static int rdma_join_multicast2(struct rdma_cm_id *id, struct sockaddr *addr, socklen_t addrlen, void *context) { struct ucma_abi_create_id_resp resp; struct cma_id_private *id_priv; struct cma_multicast *mc, **pos; int ret; id_priv = container_of(id, struct cma_id_private, id); mc = calloc(1, sizeof(*mc)); if (!mc) return ERR(ENOMEM); mc->context = context; mc->id_priv = id_priv; memcpy(&mc->addr, addr, addrlen); if (pthread_cond_init(&mc->cond, NULL)) { ret = -1; goto err1; } pthread_mutex_lock(&id_priv->mut); mc->next = id_priv->mc_list; id_priv->mc_list = mc; pthread_mutex_unlock(&id_priv->mut); if (af_ib_support) { struct ucma_abi_join_mcast cmd; CMA_INIT_CMD_RESP(&cmd, sizeof cmd, JOIN_MCAST, &resp, sizeof resp); cmd.id = id_priv->handle; memcpy(&cmd.addr, addr, addrlen); cmd.addr_size = addrlen; cmd.uid = (uintptr_t) mc; cmd.reserved = 0; ret = write(id->channel->fd, &cmd, sizeof cmd); if (ret != sizeof cmd) { ret = (ret >= 0) ? ERR(ENODATA) : -1; goto err2; } } else { struct ucma_abi_join_ip_mcast cmd; CMA_INIT_CMD_RESP(&cmd, sizeof cmd, JOIN_IP_MCAST, &resp, sizeof resp); cmd.id = id_priv->handle; memcpy(&cmd.addr, addr, addrlen); cmd.uid = (uintptr_t) mc; ret = write(id->channel->fd, &cmd, sizeof cmd); if (ret != sizeof cmd) { ret = (ret >= 0) ? 
ERR(ENODATA) : -1; goto err2; } } VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); mc->handle = resp.id; return ucma_complete(id); err2: pthread_mutex_lock(&id_priv->mut); for (pos = &id_priv->mc_list; *pos != mc; pos = &(*pos)->next) ; *pos = mc->next; pthread_mutex_unlock(&id_priv->mut); err1: free(mc); return ret; } int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, void *context) { int addrlen; addrlen = ucma_addrlen(addr); if (!addrlen) return ERR(EINVAL); return rdma_join_multicast2(id, addr, addrlen, context); } int rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr) { struct ucma_abi_destroy_id cmd; struct ucma_abi_destroy_id_resp resp; struct cma_id_private *id_priv; struct cma_multicast *mc, **pos; int ret, addrlen; addrlen = ucma_addrlen(addr); if (!addrlen) return ERR(EINVAL); id_priv = container_of(id, struct cma_id_private, id); pthread_mutex_lock(&id_priv->mut); for (pos = &id_priv->mc_list; *pos; pos = &(*pos)->next) if (!memcmp(&(*pos)->addr, addr, addrlen)) break; mc = *pos; if (*pos) *pos = mc->next; pthread_mutex_unlock(&id_priv->mut); if (!mc) return ERR(EADDRNOTAVAIL); if (id->qp) ibv_detach_mcast(id->qp, &mc->mgid, mc->mlid); CMA_INIT_CMD_RESP(&cmd, sizeof cmd, LEAVE_MCAST, &resp, sizeof resp); cmd.id = mc->handle; ret = write(id->channel->fd, &cmd, sizeof cmd); if (ret != sizeof cmd) { ret = (ret >= 0) ? ERR(ENODATA) : -1; goto free; } VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); pthread_mutex_lock(&id_priv->mut); while (mc->events_completed < resp.events_reported) pthread_cond_wait(&mc->cond, &id_priv->mut); pthread_mutex_unlock(&id_priv->mut); ret = 0; free: free(mc); return ret; } static void ucma_complete_event(struct cma_id_private *id_priv) { pthread_mutex_lock(&id_priv->mut); id_priv->events_completed++; pthread_cond_signal(&id_priv->cond); pthread_mutex_unlock(&id_priv->mut); } static void ucma_complete_mc_event(struct cma_multicast *mc) { pthread_mutex_lock(&mc->id_priv->mut); mc->events_completed++; pthread_cond_signal(&mc->cond); mc->id_priv->events_completed++; pthread_cond_signal(&mc->id_priv->cond); pthread_mutex_unlock(&mc->id_priv->mut); } int rdma_ack_cm_event(struct rdma_cm_event *event) { struct cma_event *evt; if (!event) return ERR(EINVAL); evt = container_of(event, struct cma_event, event); if (evt->mc) ucma_complete_mc_event(evt->mc); else ucma_complete_event(evt->id_priv); free(evt); return 0; } static void ucma_process_addr_resolved(struct cma_event *evt) { if (af_ib_support) { evt->event.status = ucma_query_addr(&evt->id_priv->id); if (!evt->event.status && evt->id_priv->id.verbs->device->transport_type == IBV_TRANSPORT_IB) evt->event.status = ucma_query_gid(&evt->id_priv->id); } else { evt->event.status = ucma_query_route(&evt->id_priv->id); } if (evt->event.status) evt->event.event = RDMA_CM_EVENT_ADDR_ERROR; } static void ucma_process_route_resolved(struct cma_event *evt) { if (evt->id_priv->id.verbs->device->transport_type != IBV_TRANSPORT_IB) return; if (af_ib_support) evt->event.status = ucma_query_path(&evt->id_priv->id); else evt->event.status = ucma_query_route(&evt->id_priv->id); if (evt->event.status) evt->event.event = RDMA_CM_EVENT_ROUTE_ERROR; } static int ucma_query_req_info(struct rdma_cm_id *id) { int ret; if (!af_ib_support) return ucma_query_route(id); ret = ucma_query_addr(id); if (ret) return ret; ret = ucma_query_gid(id); if (ret) return ret; ret = ucma_query_path(id); if (ret) return ret; return 0; } static int ucma_process_conn_req(struct cma_event *evt, uint32_t handle) { struct 
cma_id_private *id_priv; int ret; id_priv = ucma_alloc_id(evt->id_priv->id.channel, evt->id_priv->id.context, evt->id_priv->id.ps, evt->id_priv->id.qp_type); if (!id_priv) { ucma_destroy_kern_id(evt->id_priv->id.channel->fd, handle); ret = ERR(ENOMEM); goto err1; } evt->event.listen_id = &evt->id_priv->id; evt->event.id = &id_priv->id; id_priv->handle = handle; ucma_insert_id(id_priv); id_priv->initiator_depth = evt->event.param.conn.initiator_depth; id_priv->responder_resources = evt->event.param.conn.responder_resources; if (evt->id_priv->sync) { ret = rdma_migrate_id(&id_priv->id, NULL); if (ret) goto err2; } ret = ucma_query_req_info(&id_priv->id); if (ret) goto err2; return 0; err2: rdma_destroy_id(&id_priv->id); err1: ucma_complete_event(evt->id_priv); return ret; } static int ucma_process_conn_resp(struct cma_id_private *id_priv) { struct ucma_abi_accept cmd; int ret; ret = ucma_modify_qp_rtr(&id_priv->id, RDMA_MAX_RESP_RES); if (ret) goto err; ret = ucma_modify_qp_rts(&id_priv->id, RDMA_MAX_INIT_DEPTH); if (ret) goto err; CMA_INIT_CMD(&cmd, sizeof cmd, ACCEPT); cmd.id = id_priv->handle; ret = write(id_priv->id.channel->fd, &cmd, sizeof cmd); if (ret != sizeof cmd) { ret = (ret >= 0) ? ERR(ENODATA) : -1; goto err; } return 0; err: ucma_modify_qp_err(&id_priv->id); return ret; } static int ucma_process_join(struct cma_event *evt) { evt->mc->mgid = evt->event.param.ud.ah_attr.grh.dgid; evt->mc->mlid = evt->event.param.ud.ah_attr.dlid; if (!evt->id_priv->id.qp) return 0; return rdma_seterrno(ibv_attach_mcast(evt->id_priv->id.qp, &evt->mc->mgid, evt->mc->mlid)); } static void ucma_copy_conn_event(struct cma_event *event, struct ucma_abi_conn_param *src) { struct rdma_conn_param *dst = &event->event.param.conn; dst->private_data_len = src->private_data_len; if (src->private_data_len) { dst->private_data = &event->private_data; memcpy(&event->private_data, src->private_data, src->private_data_len); } dst->responder_resources = src->responder_resources; dst->initiator_depth = src->initiator_depth; dst->flow_control = src->flow_control; dst->retry_count = src->retry_count; dst->rnr_retry_count = src->rnr_retry_count; dst->srq = src->srq; dst->qp_num = src->qp_num; } static void ucma_copy_ud_event(struct cma_event *event, struct ucma_abi_ud_param *src) { struct rdma_ud_param *dst = &event->event.param.ud; dst->private_data_len = src->private_data_len; if (src->private_data_len) { dst->private_data = &event->private_data; memcpy(&event->private_data, src->private_data, src->private_data_len); } ibv_copy_ah_attr_from_kern(&dst->ah_attr, &src->ah_attr); dst->qp_num = src->qp_num; dst->qkey = src->qkey; } int rdma_get_cm_event(struct rdma_event_channel *channel, struct rdma_cm_event **event) { struct ucma_abi_event_resp resp; struct ucma_abi_get_event cmd; struct cma_event *evt; int ret; ret = ucma_init(); if (ret) return ret; if (!event) return ERR(EINVAL); evt = malloc(sizeof(*evt)); if (!evt) return ERR(ENOMEM); retry: memset(evt, 0, sizeof(*evt)); CMA_INIT_CMD_RESP(&cmd, sizeof cmd, GET_EVENT, &resp, sizeof resp); ret = write(channel->fd, &cmd, sizeof cmd); if (ret != sizeof cmd) { free(evt); return (ret >= 0) ? ERR(ENODATA) : -1; } VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); evt->event.event = resp.event; /* * We should have a non-zero uid, except for connection requests. * But a bug in older kernels can report a uid 0. Work-around this * issue by looking up the cma_id based on the kernel's id when the * uid is 0 and we're processing a connection established event. 
* In all other cases, if the uid is 0, we discard the event, like * the kernel should have done. */ if (resp.uid) { evt->id_priv = (void *) (uintptr_t) resp.uid; } else { evt->id_priv = ucma_lookup_id(resp.id); if (!evt->id_priv) { syslog(LOG_WARNING, PFX "Warning: discarding unmatched " "event - rdma_destroy_id may hang.\n"); goto retry; } if (resp.event != RDMA_CM_EVENT_ESTABLISHED) { ucma_complete_event(evt->id_priv); goto retry; } } evt->event.id = &evt->id_priv->id; evt->event.status = resp.status; switch (resp.event) { case RDMA_CM_EVENT_ADDR_RESOLVED: ucma_process_addr_resolved(evt); break; case RDMA_CM_EVENT_ROUTE_RESOLVED: ucma_process_route_resolved(evt); break; case RDMA_CM_EVENT_CONNECT_REQUEST: evt->id_priv = (void *) (uintptr_t) resp.uid; if (ucma_is_ud_qp(evt->id_priv->id.qp_type)) ucma_copy_ud_event(evt, &resp.param.ud); else ucma_copy_conn_event(evt, &resp.param.conn); ret = ucma_process_conn_req(evt, resp.id); if (ret) goto retry; break; case RDMA_CM_EVENT_CONNECT_RESPONSE: ucma_copy_conn_event(evt, &resp.param.conn); evt->event.status = ucma_process_conn_resp(evt->id_priv); if (!evt->event.status) evt->event.event = RDMA_CM_EVENT_ESTABLISHED; else { evt->event.event = RDMA_CM_EVENT_CONNECT_ERROR; evt->id_priv->connect_error = 1; } break; case RDMA_CM_EVENT_ESTABLISHED: if (ucma_is_ud_qp(evt->id_priv->id.qp_type)) { ucma_copy_ud_event(evt, &resp.param.ud); break; } ucma_copy_conn_event(evt, &resp.param.conn); break; case RDMA_CM_EVENT_REJECTED: if (evt->id_priv->connect_error) { ucma_complete_event(evt->id_priv); goto retry; } ucma_copy_conn_event(evt, &resp.param.conn); ucma_modify_qp_err(evt->event.id); break; case RDMA_CM_EVENT_DISCONNECTED: if (evt->id_priv->connect_error) { ucma_complete_event(evt->id_priv); goto retry; } ucma_copy_conn_event(evt, &resp.param.conn); break; case RDMA_CM_EVENT_MULTICAST_JOIN: evt->mc = (void *) (uintptr_t) resp.uid; evt->id_priv = evt->mc->id_priv; evt->event.id = &evt->id_priv->id; ucma_copy_ud_event(evt, &resp.param.ud); evt->event.param.ud.private_data = evt->mc->context; evt->event.status = ucma_process_join(evt); if (evt->event.status) evt->event.event = RDMA_CM_EVENT_MULTICAST_ERROR; break; case RDMA_CM_EVENT_MULTICAST_ERROR: evt->mc = (void *) (uintptr_t) resp.uid; evt->id_priv = evt->mc->id_priv; evt->event.id = &evt->id_priv->id; evt->event.param.ud.private_data = evt->mc->context; break; default: evt->id_priv = (void *) (uintptr_t) resp.uid; evt->event.id = &evt->id_priv->id; evt->event.status = resp.status; if (ucma_is_ud_qp(evt->id_priv->id.qp_type)) ucma_copy_ud_event(evt, &resp.param.ud); else ucma_copy_conn_event(evt, &resp.param.conn); break; } *event = &evt->event; return 0; } const char *rdma_event_str(enum rdma_cm_event_type event) { switch (event) { case RDMA_CM_EVENT_ADDR_RESOLVED: return "RDMA_CM_EVENT_ADDR_RESOLVED"; case RDMA_CM_EVENT_ADDR_ERROR: return "RDMA_CM_EVENT_ADDR_ERROR"; case RDMA_CM_EVENT_ROUTE_RESOLVED: return "RDMA_CM_EVENT_ROUTE_RESOLVED"; case RDMA_CM_EVENT_ROUTE_ERROR: return "RDMA_CM_EVENT_ROUTE_ERROR"; case RDMA_CM_EVENT_CONNECT_REQUEST: return "RDMA_CM_EVENT_CONNECT_REQUEST"; case RDMA_CM_EVENT_CONNECT_RESPONSE: return "RDMA_CM_EVENT_CONNECT_RESPONSE"; case RDMA_CM_EVENT_CONNECT_ERROR: return "RDMA_CM_EVENT_CONNECT_ERROR"; case RDMA_CM_EVENT_UNREACHABLE: return "RDMA_CM_EVENT_UNREACHABLE"; case RDMA_CM_EVENT_REJECTED: return "RDMA_CM_EVENT_REJECTED"; case RDMA_CM_EVENT_ESTABLISHED: return "RDMA_CM_EVENT_ESTABLISHED"; case RDMA_CM_EVENT_DISCONNECTED: return "RDMA_CM_EVENT_DISCONNECTED"; case 
RDMA_CM_EVENT_DEVICE_REMOVAL: return "RDMA_CM_EVENT_DEVICE_REMOVAL"; case RDMA_CM_EVENT_MULTICAST_JOIN: return "RDMA_CM_EVENT_MULTICAST_JOIN"; case RDMA_CM_EVENT_MULTICAST_ERROR: return "RDMA_CM_EVENT_MULTICAST_ERROR"; case RDMA_CM_EVENT_ADDR_CHANGE: return "RDMA_CM_EVENT_ADDR_CHANGE"; case RDMA_CM_EVENT_TIMEWAIT_EXIT: return "RDMA_CM_EVENT_TIMEWAIT_EXIT"; default: return "UNKNOWN EVENT"; } } int rdma_set_option(struct rdma_cm_id *id, int level, int optname, void *optval, size_t optlen) { struct ucma_abi_set_option cmd; struct cma_id_private *id_priv; int ret; CMA_INIT_CMD(&cmd, sizeof cmd, SET_OPTION); id_priv = container_of(id, struct cma_id_private, id); cmd.id = id_priv->handle; cmd.optval = (uintptr_t) optval; cmd.level = level; cmd.optname = optname; cmd.optlen = optlen; ret = write(id->channel->fd, &cmd, sizeof cmd); if (ret != sizeof cmd) return (ret >= 0) ? ERR(ENODATA) : -1; return 0; } int rdma_migrate_id(struct rdma_cm_id *id, struct rdma_event_channel *channel) { struct ucma_abi_migrate_resp resp; struct ucma_abi_migrate_id cmd; struct cma_id_private *id_priv; int ret, sync; id_priv = container_of(id, struct cma_id_private, id); if (id_priv->sync && !channel) return ERR(EINVAL); if ((sync = (channel == NULL))) { channel = rdma_create_event_channel(); if (!channel) return -1; } CMA_INIT_CMD_RESP(&cmd, sizeof cmd, MIGRATE_ID, &resp, sizeof resp); cmd.id = id_priv->handle; cmd.fd = id->channel->fd; ret = write(channel->fd, &cmd, sizeof cmd); if (ret != sizeof cmd) { if (sync) rdma_destroy_event_channel(channel); return (ret >= 0) ? ERR(ENODATA) : -1; } VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); if (id_priv->sync) { if (id->event) { rdma_ack_cm_event(id->event); id->event = NULL; } rdma_destroy_event_channel(id->channel); } /* * Eventually if we want to support migrating channels while events are * being processed on the current channel, we need to block here while * there are any outstanding events on the current channel for this id * to prevent the user from processing events for this id on the old * channel after this call returns. 
*/ pthread_mutex_lock(&id_priv->mut); id_priv->sync = sync; id->channel = channel; while (id_priv->events_completed < resp.events_reported) pthread_cond_wait(&id_priv->cond, &id_priv->mut); pthread_mutex_unlock(&id_priv->mut); return 0; } static int ucma_passive_ep(struct rdma_cm_id *id, struct rdma_addrinfo *res, struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr) { struct cma_id_private *id_priv; int ret; if (af_ib_support) ret = rdma_bind_addr2(id, res->ai_src_addr, res->ai_src_len); else ret = rdma_bind_addr(id, res->ai_src_addr); if (ret) return ret; id_priv = container_of(id, struct cma_id_private, id); if (pd) id->pd = pd; if (qp_init_attr) { id_priv->qp_init_attr = malloc(sizeof(*qp_init_attr)); if (!id_priv->qp_init_attr) return ERR(ENOMEM); *id_priv->qp_init_attr = *qp_init_attr; id_priv->qp_init_attr->qp_type = res->ai_qp_type; } return 0; } int rdma_create_ep(struct rdma_cm_id **id, struct rdma_addrinfo *res, struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr) { struct rdma_cm_id *cm_id; struct cma_id_private *id_priv; int ret; ret = rdma_create_id2(NULL, &cm_id, NULL, res->ai_port_space, res->ai_qp_type); if (ret) return ret; if (res->ai_flags & RAI_PASSIVE) { ret = ucma_passive_ep(cm_id, res, pd, qp_init_attr); if (ret) goto err; goto out; } if (af_ib_support) ret = rdma_resolve_addr2(cm_id, res->ai_src_addr, res->ai_src_len, res->ai_dst_addr, res->ai_dst_len, 2000); else ret = rdma_resolve_addr(cm_id, res->ai_src_addr, res->ai_dst_addr, 2000); if (ret) goto err; if (res->ai_route_len) { ret = rdma_set_option(cm_id, RDMA_OPTION_IB, RDMA_OPTION_IB_PATH, res->ai_route, res->ai_route_len); if (!ret) ret = ucma_complete(cm_id); } else { ret = rdma_resolve_route(cm_id, 2000); } if (ret) goto err; if (qp_init_attr) { qp_init_attr->qp_type = res->ai_qp_type; ret = rdma_create_qp(cm_id, pd, qp_init_attr); if (ret) goto err; } if (res->ai_connect_len) { id_priv = container_of(cm_id, struct cma_id_private, id); id_priv->connect = malloc(res->ai_connect_len); if (!id_priv->connect) { ret = ERR(ENOMEM); goto err; } memcpy(id_priv->connect, res->ai_connect, res->ai_connect_len); id_priv->connect_len = res->ai_connect_len; } out: *id = cm_id; return 0; err: rdma_destroy_ep(cm_id); return ret; } void rdma_destroy_ep(struct rdma_cm_id *id) { struct cma_id_private *id_priv; if (id->qp) rdma_destroy_qp(id); if (id->srq) rdma_destroy_srq(id); id_priv = container_of(id, struct cma_id_private, id); if (id_priv->qp_init_attr) free(id_priv->qp_init_attr); rdma_destroy_id(id); } int ucma_max_qpsize(struct rdma_cm_id *id) { struct cma_id_private *id_priv; int i, max_size = 0; id_priv = container_of(id, struct cma_id_private, id); if (id && id_priv->cma_dev) { max_size = id_priv->cma_dev->max_qpsize; } else { ucma_init_all(); for (i = 0; i < cma_dev_cnt; i++) { if (!max_size || max_size > cma_dev_array[i].max_qpsize) max_size = cma_dev_array[i].max_qpsize; } } return max_size; } __be16 ucma_get_port(struct sockaddr *addr) { switch (addr->sa_family) { case AF_INET: return ((struct sockaddr_in *) addr)->sin_port; case AF_INET6: return ((struct sockaddr_in6 *) addr)->sin6_port; case AF_IB: return htobe16((uint16_t) be64toh(((struct sockaddr_ib *) addr)->sib_sid)); default: return 0; } } __be16 rdma_get_src_port(struct rdma_cm_id *id) { return ucma_get_port(&id->route.addr.src_addr); } __be16 rdma_get_dst_port(struct rdma_cm_id *id) { return ucma_get_port(&id->route.addr.dst_addr); }
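
/*
 * Usage sketch (illustrative only, not part of the library build): a minimal
 * synchronous client driven through the calls implemented above.  The guard
 * macro CMA_USAGE_EXAMPLE, the function name cma_example_client, and the
 * server/port arguments are placeholders introduced for this sketch; error
 * handling is trimmed to early exits.
 */
#ifdef CMA_USAGE_EXAMPLE
static int cma_example_client(char *server, char *port)
{
	struct rdma_addrinfo hints, *res;
	struct ibv_qp_init_attr qp_attr;
	struct rdma_cm_id *id;
	int ret;

	memset(&hints, 0, sizeof hints);
	hints.ai_port_space = RDMA_PS_TCP;
	hints.ai_qp_type = IBV_QPT_RC;

	ret = rdma_getaddrinfo(server, port, &hints, &res);
	if (ret)
		return ret;

	/* rdma_create_ep() resolves the address and route and, because a QP
	 * init attribute is supplied, also creates the QP. */
	memset(&qp_attr, 0, sizeof qp_attr);
	qp_attr.cap.max_send_wr = 1;
	qp_attr.cap.max_recv_wr = 1;
	qp_attr.cap.max_send_sge = 1;
	qp_attr.cap.max_recv_sge = 1;
	qp_attr.sq_sig_all = 1;
	ret = rdma_create_ep(&id, res, NULL, &qp_attr);
	rdma_freeaddrinfo(res);
	if (ret)
		return ret;

	/* No event channel was supplied, so the id is synchronous and
	 * rdma_connect() blocks in ucma_complete() until the connection is
	 * established or fails. */
	ret = rdma_connect(id, NULL);
	if (ret)
		goto out;

	/* ... register buffers and transfer data, e.g. with rdma_post_send()
	 * and rdma_post_recv() from <rdma/rdma_verbs.h> ... */

	rdma_disconnect(id);
out:
	rdma_destroy_ep(id);
	return ret;
}
#endif /* CMA_USAGE_EXAMPLE */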