1335640Shselasky/* 2335640Shselasky * Copyright (c) 2017 Pure Storage, Inc. 3335640Shselasky * All rights reserved. 4335640Shselasky * 5335640Shselasky * Redistribution and use in source and binary forms, with or without 6335640Shselasky * modification, are permitted provided that the following conditions 7335640Shselasky * are met: 8335640Shselasky * 9335640Shselasky * 1. Redistributions of source code must retain the above copyright 10335640Shselasky * notice, this list of conditions and the following disclaimer. 11335640Shselasky * 2. Redistributions in binary form must reproduce the above copyright 12335640Shselasky * notice, this list of conditions and the following disclaimer in the 13335640Shselasky * documentation and/or other materials provided with the distribution. 14335640Shselasky * 3. The name of the author may not be used to endorse or promote 15335640Shselasky * products derived from this software without specific prior written 16335640Shselasky * permission. 17335640Shselasky * 18335640Shselasky * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19335640Shselasky * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20335640Shselasky * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21335640Shselasky * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22335640Shselasky * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23335640Shselasky * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24335640Shselasky * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25335640Shselasky * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26335640Shselasky * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27335640Shselasky * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28335640Shselasky * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29335640Shselasky */ 30335640Shselasky 31335640Shselasky#ifdef HAVE_CONFIG_H 32335640Shselasky#include "config.h" 33335640Shselasky#endif 34335640Shselasky 35335640Shselasky#include "pcap-int.h" 36335640Shselasky#include "pcap-rdmasniff.h" 37335640Shselasky 38335640Shselasky#include <infiniband/verbs.h> 39335640Shselasky#include <stdlib.h> 40335640Shselasky#include <string.h> 41335640Shselasky#include <sys/time.h> 42335640Shselasky 43335640Shselasky#if !defined(IBV_FLOW_ATTR_SNIFFER) 44335640Shselasky#define IBV_FLOW_ATTR_SNIFFER 3 45335640Shselasky#endif 46335640Shselasky 47335640Shselaskystatic const int RDMASNIFF_NUM_RECEIVES = 128; 48335640Shselaskystatic const int RDMASNIFF_RECEIVE_SIZE = 10000; 49335640Shselasky 50335640Shselaskystruct pcap_rdmasniff { 51335640Shselasky struct ibv_device * rdma_device; 52335640Shselasky struct ibv_context * context; 53335640Shselasky struct ibv_comp_channel * channel; 54335640Shselasky struct ibv_pd * pd; 55335640Shselasky struct ibv_cq * cq; 56335640Shselasky struct ibv_qp * qp; 57335640Shselasky struct ibv_flow * flow; 58335640Shselasky struct ibv_mr * mr; 59335640Shselasky u_char * oneshot_buffer; 60335640Shselasky unsigned port_num; 61335640Shselasky int cq_event; 62335640Shselasky u_int packets_recv; 63335640Shselasky}; 64335640Shselasky 65335640Shselaskystatic int 66335640Shselaskyrdmasniff_stats(pcap_t *handle, struct pcap_stat *stat) 67335640Shselasky{ 68335640Shselasky struct pcap_rdmasniff *priv = handle->priv; 69335640Shselasky 70335640Shselasky stat->ps_recv = priv->packets_recv; 71335640Shselasky stat->ps_drop = 0; 72335640Shselasky stat->ps_ifdrop = 0; 73335640Shselasky 74335640Shselasky return 0; 75335640Shselasky} 76335640Shselasky 77335640Shselaskystatic void 78335640Shselaskyrdmasniff_cleanup(pcap_t *handle) 79335640Shselasky{ 80335640Shselasky struct pcap_rdmasniff *priv = handle->priv; 81335640Shselasky 82335640Shselasky ibv_dereg_mr(priv->mr); 83335640Shselasky ibv_destroy_flow(priv->flow); 84335640Shselasky ibv_destroy_qp(priv->qp); 85335640Shselasky ibv_destroy_cq(priv->cq); 86335640Shselasky ibv_dealloc_pd(priv->pd); 87335640Shselasky ibv_destroy_comp_channel(priv->channel); 88335640Shselasky ibv_close_device(priv->context); 89335640Shselasky free(priv->oneshot_buffer); 90335640Shselasky 91335640Shselasky pcap_cleanup_live_common(handle); 92335640Shselasky} 93335640Shselasky 94335640Shselaskystatic void 95335640Shselaskyrdmasniff_post_recv(pcap_t *handle, uint64_t wr_id) 96335640Shselasky{ 97335640Shselasky struct pcap_rdmasniff *priv = handle->priv; 98335640Shselasky struct ibv_sge sg_entry; 99335640Shselasky struct ibv_recv_wr wr, *bad_wr; 100335640Shselasky 101335640Shselasky sg_entry.length = RDMASNIFF_RECEIVE_SIZE; 102335640Shselasky sg_entry.addr = (uintptr_t) handle->buffer + RDMASNIFF_RECEIVE_SIZE * wr_id; 103335640Shselasky sg_entry.lkey = priv->mr->lkey; 104335640Shselasky 105335640Shselasky wr.wr_id = wr_id; 106335640Shselasky wr.num_sge = 1; 107335640Shselasky wr.sg_list = &sg_entry; 108335640Shselasky wr.next = NULL; 109335640Shselasky 110335640Shselasky ibv_post_recv(priv->qp, &wr, &bad_wr); 111335640Shselasky} 112335640Shselasky 113335640Shselaskystatic int 114335640Shselaskyrdmasniff_read(pcap_t *handle, int max_packets, pcap_handler callback, u_char *user) 115335640Shselasky{ 116335640Shselasky struct pcap_rdmasniff *priv = handle->priv; 117335640Shselasky struct ibv_cq *ev_cq; 118335640Shselasky void *ev_ctx; 119335640Shselasky struct ibv_wc wc; 120335640Shselasky struct pcap_pkthdr pkth; 121335640Shselasky u_char *pktd; 122335640Shselasky int count = 0; 123335640Shselasky 124335640Shselasky if (!priv->cq_event) { 125335640Shselasky while (ibv_get_cq_event(priv->channel, &ev_cq, &ev_ctx) < 0) { 126335640Shselasky if (errno != EINTR) { 127335640Shselasky return PCAP_ERROR; 128335640Shselasky } 129335640Shselasky if (handle->break_loop) { 130335640Shselasky handle->break_loop = 0; 131335640Shselasky return PCAP_ERROR_BREAK; 132335640Shselasky } 133335640Shselasky } 134335640Shselasky ibv_ack_cq_events(priv->cq, 1); 135335640Shselasky ibv_req_notify_cq(priv->cq, 0); 136335640Shselasky priv->cq_event = 1; 137335640Shselasky } 138335640Shselasky 139335640Shselasky while (count < max_packets || PACKET_COUNT_IS_UNLIMITED(max_packets)) { 140335640Shselasky if (ibv_poll_cq(priv->cq, 1, &wc) != 1) { 141335640Shselasky priv->cq_event = 0; 142335640Shselasky break; 143335640Shselasky } 144335640Shselasky 145335640Shselasky if (wc.status != IBV_WC_SUCCESS) { 146335640Shselasky fprintf(stderr, "failed WC wr_id %lld status %d/%s\n", 147335640Shselasky (unsigned long long) wc.wr_id, 148335640Shselasky wc.status, ibv_wc_status_str(wc.status)); 149335640Shselasky continue; 150335640Shselasky } 151335640Shselasky 152335640Shselasky pkth.len = wc.byte_len; 153335640Shselasky pkth.caplen = min(pkth.len, (u_int)handle->snapshot); 154335640Shselasky gettimeofday(&pkth.ts, NULL); 155335640Shselasky 156335640Shselasky pktd = (u_char *) handle->buffer + wc.wr_id * RDMASNIFF_RECEIVE_SIZE; 157335640Shselasky 158335640Shselasky if (handle->fcode.bf_insns == NULL || 159335640Shselasky bpf_filter(handle->fcode.bf_insns, pktd, pkth.len, pkth.caplen)) { 160335640Shselasky callback(user, &pkth, pktd); 161335640Shselasky ++priv->packets_recv; 162335640Shselasky ++count; 163335640Shselasky } 164335640Shselasky 165335640Shselasky rdmasniff_post_recv(handle, wc.wr_id); 166335640Shselasky 167335640Shselasky if (handle->break_loop) { 168335640Shselasky handle->break_loop = 0; 169335640Shselasky return PCAP_ERROR_BREAK; 170335640Shselasky } 171335640Shselasky } 172335640Shselasky 173335640Shselasky return count; 174335640Shselasky} 175335640Shselasky 176335640Shselaskystatic void 177335640Shselaskyrdmasniff_oneshot(u_char *user, const struct pcap_pkthdr *h, const u_char *bytes) 178335640Shselasky{ 179335640Shselasky struct oneshot_userdata *sp = (struct oneshot_userdata *) user; 180335640Shselasky pcap_t *handle = sp->pd; 181335640Shselasky struct pcap_rdmasniff *priv = handle->priv; 182335640Shselasky 183335640Shselasky *sp->hdr = *h; 184335640Shselasky memcpy(priv->oneshot_buffer, bytes, h->caplen); 185335640Shselasky *sp->pkt = priv->oneshot_buffer; 186335640Shselasky} 187335640Shselasky 188335640Shselaskystatic int 189335640Shselaskyrdmasniff_activate(pcap_t *handle) 190335640Shselasky{ 191335640Shselasky struct pcap_rdmasniff *priv = handle->priv; 192335640Shselasky struct ibv_qp_init_attr qp_init_attr; 193335640Shselasky struct ibv_qp_attr qp_attr; 194335640Shselasky struct ibv_flow_attr flow_attr; 195335640Shselasky struct ibv_port_attr port_attr; 196335640Shselasky int i; 197335640Shselasky 198335640Shselasky priv->context = ibv_open_device(priv->rdma_device); 199335640Shselasky if (!priv->context) { 200335640Shselasky pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE, 201335640Shselasky "Failed to open device %s", handle->opt.device); 202335640Shselasky goto error; 203335640Shselasky } 204335640Shselasky 205335640Shselasky priv->pd = ibv_alloc_pd(priv->context); 206335640Shselasky if (!priv->pd) { 207335640Shselasky pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE, 208335640Shselasky "Failed to alloc PD for device %s", handle->opt.device); 209335640Shselasky goto error; 210335640Shselasky } 211335640Shselasky 212335640Shselasky priv->channel = ibv_create_comp_channel(priv->context); 213335640Shselasky if (!priv->channel) { 214335640Shselasky pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE, 215335640Shselasky "Failed to create comp channel for device %s", handle->opt.device); 216335640Shselasky goto error; 217335640Shselasky } 218335640Shselasky 219335640Shselasky priv->cq = ibv_create_cq(priv->context, RDMASNIFF_NUM_RECEIVES, 220335640Shselasky NULL, priv->channel, 0); 221335640Shselasky if (!priv->cq) { 222335640Shselasky pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE, 223335640Shselasky "Failed to create CQ for device %s", handle->opt.device); 224335640Shselasky goto error; 225335640Shselasky } 226335640Shselasky 227335640Shselasky ibv_req_notify_cq(priv->cq, 0); 228335640Shselasky 229335640Shselasky memset(&qp_init_attr, 0, sizeof qp_init_attr); 230335640Shselasky qp_init_attr.send_cq = qp_init_attr.recv_cq = priv->cq; 231335640Shselasky qp_init_attr.cap.max_recv_wr = RDMASNIFF_NUM_RECEIVES; 232335640Shselasky qp_init_attr.cap.max_recv_sge = 1; 233335640Shselasky qp_init_attr.qp_type = IBV_QPT_RAW_PACKET; 234335640Shselasky priv->qp = ibv_create_qp(priv->pd, &qp_init_attr); 235335640Shselasky if (!priv->qp) { 236335640Shselasky pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE, 237335640Shselasky "Failed to create QP for device %s", handle->opt.device); 238335640Shselasky goto error; 239335640Shselasky } 240335640Shselasky 241335640Shselasky memset(&qp_attr, 0, sizeof qp_attr); 242335640Shselasky qp_attr.qp_state = IBV_QPS_INIT; 243335640Shselasky qp_attr.port_num = priv->port_num; 244335640Shselasky if (ibv_modify_qp(priv->qp, &qp_attr, IBV_QP_STATE | IBV_QP_PORT)) { 245335640Shselasky pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE, 246335640Shselasky "Failed to modify QP to INIT for device %s", handle->opt.device); 247335640Shselasky goto error; 248335640Shselasky } 249335640Shselasky 250335640Shselasky memset(&qp_attr, 0, sizeof qp_attr); 251335640Shselasky qp_attr.qp_state = IBV_QPS_RTR; 252335640Shselasky if (ibv_modify_qp(priv->qp, &qp_attr, IBV_QP_STATE)) { 253335640Shselasky pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE, 254335640Shselasky "Failed to modify QP to RTR for device %s", handle->opt.device); 255335640Shselasky goto error; 256335640Shselasky } 257335640Shselasky 258335640Shselasky memset(&flow_attr, 0, sizeof flow_attr); 259335640Shselasky flow_attr.type = IBV_FLOW_ATTR_SNIFFER; 260335640Shselasky flow_attr.size = sizeof flow_attr; 261335640Shselasky flow_attr.port = priv->port_num; 262335640Shselasky priv->flow = ibv_create_flow(priv->qp, &flow_attr); 263335640Shselasky if (!priv->flow) { 264335640Shselasky pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE, 265335640Shselasky "Failed to create flow for device %s", handle->opt.device); 266335640Shselasky goto error; 267335640Shselasky } 268335640Shselasky 269335640Shselasky handle->bufsize = RDMASNIFF_NUM_RECEIVES * RDMASNIFF_RECEIVE_SIZE; 270335640Shselasky handle->buffer = malloc(handle->bufsize); 271335640Shselasky if (!handle->buffer) { 272335640Shselasky pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE, 273335640Shselasky "Failed to allocate receive buffer for device %s", handle->opt.device); 274335640Shselasky goto error; 275335640Shselasky } 276335640Shselasky 277335640Shselasky priv->oneshot_buffer = malloc(RDMASNIFF_RECEIVE_SIZE); 278335640Shselasky if (!priv->oneshot_buffer) { 279335640Shselasky pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE, 280335640Shselasky "Failed to allocate oneshot buffer for device %s", handle->opt.device); 281335640Shselasky goto error; 282335640Shselasky } 283335640Shselasky 284335640Shselasky priv->mr = ibv_reg_mr(priv->pd, handle->buffer, handle->bufsize, IBV_ACCESS_LOCAL_WRITE); 285335640Shselasky if (!priv->mr) { 286335640Shselasky pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE, 287335640Shselasky "Failed to register MR for device %s", handle->opt.device); 288335640Shselasky goto error; 289335640Shselasky } 290335640Shselasky 291335640Shselasky 292335640Shselasky for (i = 0; i < RDMASNIFF_NUM_RECEIVES; ++i) { 293335640Shselasky rdmasniff_post_recv(handle, i); 294335640Shselasky } 295335640Shselasky 296335640Shselasky if (!ibv_query_port(priv->context, priv->port_num, &port_attr) && 297335640Shselasky port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND) { 298335640Shselasky handle->linktype = DLT_INFINIBAND; 299335640Shselasky } else { 300335640Shselasky handle->linktype = DLT_EN10MB; 301335640Shselasky } 302335640Shselasky 303335640Shselasky if (handle->snapshot <= 0 || handle->snapshot > RDMASNIFF_RECEIVE_SIZE) 304335640Shselasky handle->snapshot = RDMASNIFF_RECEIVE_SIZE; 305335640Shselasky 306335640Shselasky handle->offset = 0; 307335640Shselasky handle->read_op = rdmasniff_read; 308335640Shselasky handle->stats_op = rdmasniff_stats; 309335640Shselasky handle->cleanup_op = rdmasniff_cleanup; 310335640Shselasky handle->setfilter_op = install_bpf_program; 311335640Shselasky handle->setdirection_op = NULL; 312335640Shselasky handle->set_datalink_op = NULL; 313335640Shselasky handle->getnonblock_op = pcap_getnonblock_fd; 314335640Shselasky handle->setnonblock_op = pcap_setnonblock_fd; 315335640Shselasky handle->oneshot_callback = rdmasniff_oneshot; 316335640Shselasky handle->selectable_fd = priv->channel->fd; 317335640Shselasky 318335640Shselasky return 0; 319335640Shselasky 320335640Shselaskyerror: 321335640Shselasky if (priv->mr) { 322335640Shselasky ibv_dereg_mr(priv->mr); 323335640Shselasky } 324335640Shselasky 325335640Shselasky if (priv->flow) { 326335640Shselasky ibv_destroy_flow(priv->flow); 327335640Shselasky } 328335640Shselasky 329335640Shselasky if (priv->qp) { 330335640Shselasky ibv_destroy_qp(priv->qp); 331335640Shselasky } 332335640Shselasky 333335640Shselasky if (priv->cq) { 334335640Shselasky ibv_destroy_cq(priv->cq); 335335640Shselasky } 336335640Shselasky 337335640Shselasky if (priv->channel) { 338335640Shselasky ibv_destroy_comp_channel(priv->channel); 339335640Shselasky } 340335640Shselasky 341335640Shselasky if (priv->pd) { 342335640Shselasky ibv_dealloc_pd(priv->pd); 343335640Shselasky } 344335640Shselasky 345335640Shselasky if (priv->context) { 346335640Shselasky ibv_close_device(priv->context); 347335640Shselasky } 348335640Shselasky 349335640Shselasky if (priv->oneshot_buffer) { 350335640Shselasky free(priv->oneshot_buffer); 351335640Shselasky } 352335640Shselasky 353335640Shselasky return PCAP_ERROR; 354335640Shselasky} 355335640Shselasky 356335640Shselaskypcap_t * 357335640Shselaskyrdmasniff_create(const char *device, char *ebuf, int *is_ours) 358335640Shselasky{ 359335640Shselasky struct pcap_rdmasniff *priv; 360335640Shselasky struct ibv_device **dev_list; 361335640Shselasky int numdev; 362335640Shselasky size_t namelen; 363335640Shselasky const char *port; 364335640Shselasky unsigned port_num; 365335640Shselasky int i; 366335640Shselasky pcap_t *p = NULL; 367335640Shselasky 368335640Shselasky *is_ours = 0; 369335640Shselasky 370335640Shselasky dev_list = ibv_get_device_list(&numdev); 371335640Shselasky if (!dev_list || !numdev) { 372335640Shselasky return NULL; 373335640Shselasky } 374335640Shselasky 375335640Shselasky namelen = strlen(device); 376335640Shselasky 377335640Shselasky port = strchr(device, ':'); 378335640Shselasky if (port) { 379335640Shselasky port_num = strtoul(port + 1, NULL, 10); 380335640Shselasky if (port_num > 0) { 381335640Shselasky namelen = port - device; 382335640Shselasky } else { 383335640Shselasky port_num = 1; 384335640Shselasky } 385335640Shselasky } else { 386335640Shselasky port_num = 1; 387335640Shselasky } 388335640Shselasky 389335640Shselasky for (i = 0; i < numdev; ++i) { 390335640Shselasky if (strlen(dev_list[i]->name) == namelen && 391335640Shselasky !strncmp(device, dev_list[i]->name, namelen)) { 392335640Shselasky *is_ours = 1; 393335640Shselasky 394335640Shselasky p = pcap_create_common(ebuf, sizeof (struct pcap_rdmasniff)); 395335640Shselasky if (p) { 396335640Shselasky p->activate_op = rdmasniff_activate; 397335640Shselasky priv = p->priv; 398335640Shselasky priv->rdma_device = dev_list[i]; 399335640Shselasky priv->port_num = port_num; 400335640Shselasky } 401335640Shselasky break; 402335640Shselasky } 403335640Shselasky } 404335640Shselasky 405335640Shselasky ibv_free_device_list(dev_list); 406335640Shselasky return p; 407335640Shselasky} 408335640Shselasky 409335640Shselaskyint 410335640Shselaskyrdmasniff_findalldevs(pcap_if_list_t *devlistp, char *err_str) 411335640Shselasky{ 412335640Shselasky struct ibv_device **dev_list; 413335640Shselasky int numdev; 414335640Shselasky int i; 415335640Shselasky int ret = 0; 416335640Shselasky 417335640Shselasky dev_list = ibv_get_device_list(&numdev); 418335640Shselasky if (!dev_list || !numdev) { 419335640Shselasky return 0; 420335640Shselasky } 421335640Shselasky 422335640Shselasky for (i = 0; i < numdev; ++i) { 423335640Shselasky /* 424335640Shselasky * XXX - do the notions of "up", "running", or 425335640Shselasky * "connected" apply here? 426335640Shselasky */ 427335640Shselasky if (!add_dev(devlistp, dev_list[i]->name, 0, "RDMA sniffer", err_str)) { 428335640Shselasky ret = -1; 429335640Shselasky goto out; 430335640Shselasky } 431335640Shselasky } 432335640Shselasky 433335640Shselaskyout: 434335640Shselasky ibv_free_device_list(dev_list); 435335640Shselasky return ret; 436335640Shselasky} 437