/*
 * Copyright (c) 2017 Pure Storage, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote
 * products derived from this software without specific prior written
 * permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
30335640Shselasky
31335640Shselasky#ifdef HAVE_CONFIG_H
32335640Shselasky#include "config.h"
33335640Shselasky#endif
34335640Shselasky
35335640Shselasky#include "pcap-int.h"
36335640Shselasky#include "pcap-rdmasniff.h"
37335640Shselasky
38335640Shselasky#include <infiniband/verbs.h>
39335640Shselasky#include <stdlib.h>
40335640Shselasky#include <string.h>
41335640Shselasky#include <sys/time.h>
42335640Shselasky
/*
 * Fallback value used when no IBV_FLOW_ATTR_SNIFFER macro is visible
 * after the includes above.
 */
#ifndef IBV_FLOW_ATTR_SNIFFER
#define IBV_FLOW_ATTR_SNIFFER	3
#endif

/*
 * Geometry of the receive ring: the capture buffer is carved into
 * RDMASNIFF_NUM_RECEIVES slots of RDMASNIFF_RECEIVE_SIZE bytes each,
 * and a slot's index doubles as its work-request id.
 */
enum {
	RDMASNIFF_NUM_RECEIVES = 128,
	RDMASNIFF_RECEIVE_SIZE = 10000
};
49335640Shselasky
50335640Shselaskystruct pcap_rdmasniff {
51335640Shselasky	struct ibv_device *		rdma_device;
52335640Shselasky	struct ibv_context *		context;
53335640Shselasky	struct ibv_comp_channel *	channel;
54335640Shselasky	struct ibv_pd *			pd;
55335640Shselasky	struct ibv_cq *			cq;
56335640Shselasky	struct ibv_qp *			qp;
57335640Shselasky	struct ibv_flow *               flow;
58335640Shselasky	struct ibv_mr *			mr;
59335640Shselasky	u_char *			oneshot_buffer;
60335640Shselasky	unsigned			port_num;
61335640Shselasky	int                             cq_event;
62335640Shselasky	u_int                           packets_recv;
63335640Shselasky};
64335640Shselasky
65335640Shselaskystatic int
66335640Shselaskyrdmasniff_stats(pcap_t *handle, struct pcap_stat *stat)
67335640Shselasky{
68335640Shselasky	struct pcap_rdmasniff *priv = handle->priv;
69335640Shselasky
70335640Shselasky	stat->ps_recv = priv->packets_recv;
71335640Shselasky	stat->ps_drop = 0;
72335640Shselasky	stat->ps_ifdrop = 0;
73335640Shselasky
74335640Shselasky	return 0;
75335640Shselasky}
76335640Shselasky
77335640Shselaskystatic void
78335640Shselaskyrdmasniff_cleanup(pcap_t *handle)
79335640Shselasky{
80335640Shselasky	struct pcap_rdmasniff *priv = handle->priv;
81335640Shselasky
82335640Shselasky	ibv_dereg_mr(priv->mr);
83335640Shselasky	ibv_destroy_flow(priv->flow);
84335640Shselasky	ibv_destroy_qp(priv->qp);
85335640Shselasky	ibv_destroy_cq(priv->cq);
86335640Shselasky	ibv_dealloc_pd(priv->pd);
87335640Shselasky	ibv_destroy_comp_channel(priv->channel);
88335640Shselasky	ibv_close_device(priv->context);
89335640Shselasky	free(priv->oneshot_buffer);
90335640Shselasky
91335640Shselasky	pcap_cleanup_live_common(handle);
92335640Shselasky}
93335640Shselasky
94335640Shselaskystatic void
95335640Shselaskyrdmasniff_post_recv(pcap_t *handle, uint64_t wr_id)
96335640Shselasky{
97335640Shselasky	struct pcap_rdmasniff *priv = handle->priv;
98335640Shselasky	struct ibv_sge sg_entry;
99335640Shselasky	struct ibv_recv_wr wr, *bad_wr;
100335640Shselasky
101335640Shselasky	sg_entry.length = RDMASNIFF_RECEIVE_SIZE;
102335640Shselasky	sg_entry.addr = (uintptr_t) handle->buffer + RDMASNIFF_RECEIVE_SIZE * wr_id;
103335640Shselasky	sg_entry.lkey = priv->mr->lkey;
104335640Shselasky
105335640Shselasky	wr.wr_id = wr_id;
106335640Shselasky	wr.num_sge = 1;
107335640Shselasky	wr.sg_list = &sg_entry;
108335640Shselasky	wr.next = NULL;
109335640Shselasky
110335640Shselasky	ibv_post_recv(priv->qp, &wr, &bad_wr);
111335640Shselasky}
112335640Shselasky
113335640Shselaskystatic int
114335640Shselaskyrdmasniff_read(pcap_t *handle, int max_packets, pcap_handler callback, u_char *user)
115335640Shselasky{
116335640Shselasky	struct pcap_rdmasniff *priv = handle->priv;
117335640Shselasky	struct ibv_cq *ev_cq;
118335640Shselasky	void *ev_ctx;
119335640Shselasky	struct ibv_wc wc;
120335640Shselasky	struct pcap_pkthdr pkth;
121335640Shselasky	u_char *pktd;
122335640Shselasky	int count = 0;
123335640Shselasky
124335640Shselasky	if (!priv->cq_event) {
125335640Shselasky		while (ibv_get_cq_event(priv->channel, &ev_cq, &ev_ctx) < 0) {
126335640Shselasky			if (errno != EINTR) {
127335640Shselasky				return PCAP_ERROR;
128335640Shselasky			}
129335640Shselasky			if (handle->break_loop) {
130335640Shselasky				handle->break_loop = 0;
131335640Shselasky				return PCAP_ERROR_BREAK;
132335640Shselasky			}
133335640Shselasky		}
134335640Shselasky		ibv_ack_cq_events(priv->cq, 1);
135335640Shselasky		ibv_req_notify_cq(priv->cq, 0);
136335640Shselasky		priv->cq_event = 1;
137335640Shselasky	}
138335640Shselasky
139335640Shselasky	while (count < max_packets || PACKET_COUNT_IS_UNLIMITED(max_packets)) {
140335640Shselasky		if (ibv_poll_cq(priv->cq, 1, &wc) != 1) {
141335640Shselasky			priv->cq_event = 0;
142335640Shselasky			break;
143335640Shselasky		}
144335640Shselasky
145335640Shselasky		if (wc.status != IBV_WC_SUCCESS) {
146335640Shselasky			fprintf(stderr, "failed WC wr_id %lld status %d/%s\n",
147335640Shselasky				(unsigned long long) wc.wr_id,
148335640Shselasky				wc.status, ibv_wc_status_str(wc.status));
149335640Shselasky			continue;
150335640Shselasky		}
151335640Shselasky
152335640Shselasky		pkth.len = wc.byte_len;
153335640Shselasky		pkth.caplen = min(pkth.len, (u_int)handle->snapshot);
154335640Shselasky		gettimeofday(&pkth.ts, NULL);
155335640Shselasky
156335640Shselasky		pktd = (u_char *) handle->buffer + wc.wr_id * RDMASNIFF_RECEIVE_SIZE;
157335640Shselasky
158335640Shselasky		if (handle->fcode.bf_insns == NULL ||
159335640Shselasky		    bpf_filter(handle->fcode.bf_insns, pktd, pkth.len, pkth.caplen)) {
160335640Shselasky			callback(user, &pkth, pktd);
161335640Shselasky			++priv->packets_recv;
162335640Shselasky			++count;
163335640Shselasky		}
164335640Shselasky
165335640Shselasky		rdmasniff_post_recv(handle, wc.wr_id);
166335640Shselasky
167335640Shselasky		if (handle->break_loop) {
168335640Shselasky			handle->break_loop = 0;
169335640Shselasky			return PCAP_ERROR_BREAK;
170335640Shselasky		}
171335640Shselasky	}
172335640Shselasky
173335640Shselasky	return count;
174335640Shselasky}
175335640Shselasky
176335640Shselaskystatic void
177335640Shselaskyrdmasniff_oneshot(u_char *user, const struct pcap_pkthdr *h, const u_char *bytes)
178335640Shselasky{
179335640Shselasky	struct oneshot_userdata *sp = (struct oneshot_userdata *) user;
180335640Shselasky	pcap_t *handle = sp->pd;
181335640Shselasky	struct pcap_rdmasniff *priv = handle->priv;
182335640Shselasky
183335640Shselasky	*sp->hdr = *h;
184335640Shselasky	memcpy(priv->oneshot_buffer, bytes, h->caplen);
185335640Shselasky	*sp->pkt = priv->oneshot_buffer;
186335640Shselasky}
187335640Shselasky
188335640Shselaskystatic int
189335640Shselaskyrdmasniff_activate(pcap_t *handle)
190335640Shselasky{
191335640Shselasky	struct pcap_rdmasniff *priv = handle->priv;
192335640Shselasky	struct ibv_qp_init_attr qp_init_attr;
193335640Shselasky	struct ibv_qp_attr qp_attr;
194335640Shselasky	struct ibv_flow_attr flow_attr;
195335640Shselasky	struct ibv_port_attr port_attr;
196335640Shselasky	int i;
197335640Shselasky
198335640Shselasky	priv->context = ibv_open_device(priv->rdma_device);
199335640Shselasky	if (!priv->context) {
200335640Shselasky		pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
201335640Shselasky			      "Failed to open device %s", handle->opt.device);
202335640Shselasky		goto error;
203335640Shselasky	}
204335640Shselasky
205335640Shselasky	priv->pd = ibv_alloc_pd(priv->context);
206335640Shselasky	if (!priv->pd) {
207335640Shselasky		pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
208335640Shselasky			      "Failed to alloc PD for device %s", handle->opt.device);
209335640Shselasky		goto error;
210335640Shselasky	}
211335640Shselasky
212335640Shselasky	priv->channel = ibv_create_comp_channel(priv->context);
213335640Shselasky	if (!priv->channel) {
214335640Shselasky		pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
215335640Shselasky			      "Failed to create comp channel for device %s", handle->opt.device);
216335640Shselasky		goto error;
217335640Shselasky	}
218335640Shselasky
219335640Shselasky	priv->cq = ibv_create_cq(priv->context, RDMASNIFF_NUM_RECEIVES,
220335640Shselasky				 NULL, priv->channel, 0);
221335640Shselasky	if (!priv->cq) {
222335640Shselasky		pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
223335640Shselasky			      "Failed to create CQ for device %s", handle->opt.device);
224335640Shselasky		goto error;
225335640Shselasky	}
226335640Shselasky
227335640Shselasky	ibv_req_notify_cq(priv->cq, 0);
228335640Shselasky
229335640Shselasky	memset(&qp_init_attr, 0, sizeof qp_init_attr);
230335640Shselasky	qp_init_attr.send_cq = qp_init_attr.recv_cq = priv->cq;
231335640Shselasky	qp_init_attr.cap.max_recv_wr = RDMASNIFF_NUM_RECEIVES;
232335640Shselasky	qp_init_attr.cap.max_recv_sge = 1;
233335640Shselasky	qp_init_attr.qp_type = IBV_QPT_RAW_PACKET;
234335640Shselasky	priv->qp = ibv_create_qp(priv->pd, &qp_init_attr);
235335640Shselasky	if (!priv->qp) {
236335640Shselasky		pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
237335640Shselasky			      "Failed to create QP for device %s", handle->opt.device);
238335640Shselasky		goto error;
239335640Shselasky	}
240335640Shselasky
241335640Shselasky	memset(&qp_attr, 0, sizeof qp_attr);
242335640Shselasky	qp_attr.qp_state = IBV_QPS_INIT;
243335640Shselasky	qp_attr.port_num = priv->port_num;
244335640Shselasky	if (ibv_modify_qp(priv->qp, &qp_attr, IBV_QP_STATE | IBV_QP_PORT)) {
245335640Shselasky		pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
246335640Shselasky			      "Failed to modify QP to INIT for device %s", handle->opt.device);
247335640Shselasky		goto error;
248335640Shselasky	}
249335640Shselasky
250335640Shselasky	memset(&qp_attr, 0, sizeof qp_attr);
251335640Shselasky	qp_attr.qp_state = IBV_QPS_RTR;
252335640Shselasky	if (ibv_modify_qp(priv->qp, &qp_attr, IBV_QP_STATE)) {
253335640Shselasky		pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
254335640Shselasky			      "Failed to modify QP to RTR for device %s", handle->opt.device);
255335640Shselasky		goto error;
256335640Shselasky	}
257335640Shselasky
258335640Shselasky	memset(&flow_attr, 0, sizeof flow_attr);
259335640Shselasky	flow_attr.type = IBV_FLOW_ATTR_SNIFFER;
260335640Shselasky	flow_attr.size = sizeof flow_attr;
261335640Shselasky	flow_attr.port = priv->port_num;
262335640Shselasky	priv->flow = ibv_create_flow(priv->qp, &flow_attr);
263335640Shselasky	if (!priv->flow) {
264335640Shselasky		pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
265335640Shselasky			      "Failed to create flow for device %s", handle->opt.device);
266335640Shselasky		goto error;
267335640Shselasky	}
268335640Shselasky
269335640Shselasky	handle->bufsize = RDMASNIFF_NUM_RECEIVES * RDMASNIFF_RECEIVE_SIZE;
270335640Shselasky	handle->buffer = malloc(handle->bufsize);
271335640Shselasky	if (!handle->buffer) {
272335640Shselasky		pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
273335640Shselasky			      "Failed to allocate receive buffer for device %s", handle->opt.device);
274335640Shselasky		goto error;
275335640Shselasky	}
276335640Shselasky
277335640Shselasky	priv->oneshot_buffer = malloc(RDMASNIFF_RECEIVE_SIZE);
278335640Shselasky	if (!priv->oneshot_buffer) {
279335640Shselasky		pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
280335640Shselasky			      "Failed to allocate oneshot buffer for device %s", handle->opt.device);
281335640Shselasky		goto error;
282335640Shselasky	}
283335640Shselasky
284335640Shselasky	priv->mr = ibv_reg_mr(priv->pd, handle->buffer, handle->bufsize, IBV_ACCESS_LOCAL_WRITE);
285335640Shselasky	if (!priv->mr) {
286335640Shselasky		pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
287335640Shselasky			      "Failed to register MR for device %s", handle->opt.device);
288335640Shselasky		goto error;
289335640Shselasky	}
290335640Shselasky
291335640Shselasky
292335640Shselasky	for (i = 0; i < RDMASNIFF_NUM_RECEIVES; ++i) {
293335640Shselasky		rdmasniff_post_recv(handle, i);
294335640Shselasky	}
295335640Shselasky
296335640Shselasky	if (!ibv_query_port(priv->context, priv->port_num, &port_attr) &&
297335640Shselasky	    port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND) {
298335640Shselasky		handle->linktype = DLT_INFINIBAND;
299335640Shselasky	} else {
300335640Shselasky		handle->linktype = DLT_EN10MB;
301335640Shselasky	}
302335640Shselasky
303335640Shselasky	if (handle->snapshot <= 0 || handle->snapshot > RDMASNIFF_RECEIVE_SIZE)
304335640Shselasky		handle->snapshot = RDMASNIFF_RECEIVE_SIZE;
305335640Shselasky
306335640Shselasky	handle->offset = 0;
307335640Shselasky	handle->read_op = rdmasniff_read;
308335640Shselasky	handle->stats_op = rdmasniff_stats;
309335640Shselasky	handle->cleanup_op = rdmasniff_cleanup;
310335640Shselasky	handle->setfilter_op = install_bpf_program;
311335640Shselasky	handle->setdirection_op = NULL;
312335640Shselasky	handle->set_datalink_op = NULL;
313335640Shselasky	handle->getnonblock_op = pcap_getnonblock_fd;
314335640Shselasky	handle->setnonblock_op = pcap_setnonblock_fd;
315335640Shselasky	handle->oneshot_callback = rdmasniff_oneshot;
316335640Shselasky	handle->selectable_fd = priv->channel->fd;
317335640Shselasky
318335640Shselasky	return 0;
319335640Shselasky
320335640Shselaskyerror:
321335640Shselasky	if (priv->mr) {
322335640Shselasky		ibv_dereg_mr(priv->mr);
323335640Shselasky	}
324335640Shselasky
325335640Shselasky	if (priv->flow) {
326335640Shselasky		ibv_destroy_flow(priv->flow);
327335640Shselasky	}
328335640Shselasky
329335640Shselasky	if (priv->qp) {
330335640Shselasky		ibv_destroy_qp(priv->qp);
331335640Shselasky	}
332335640Shselasky
333335640Shselasky	if (priv->cq) {
334335640Shselasky		ibv_destroy_cq(priv->cq);
335335640Shselasky	}
336335640Shselasky
337335640Shselasky	if (priv->channel) {
338335640Shselasky		ibv_destroy_comp_channel(priv->channel);
339335640Shselasky	}
340335640Shselasky
341335640Shselasky	if (priv->pd) {
342335640Shselasky		ibv_dealloc_pd(priv->pd);
343335640Shselasky	}
344335640Shselasky
345335640Shselasky	if (priv->context) {
346335640Shselasky		ibv_close_device(priv->context);
347335640Shselasky	}
348335640Shselasky
349335640Shselasky	if (priv->oneshot_buffer) {
350335640Shselasky		free(priv->oneshot_buffer);
351335640Shselasky	}
352335640Shselasky
353335640Shselasky	return PCAP_ERROR;
354335640Shselasky}
355335640Shselasky
356335640Shselaskypcap_t *
357335640Shselaskyrdmasniff_create(const char *device, char *ebuf, int *is_ours)
358335640Shselasky{
359335640Shselasky	struct pcap_rdmasniff *priv;
360335640Shselasky	struct ibv_device **dev_list;
361335640Shselasky	int numdev;
362335640Shselasky	size_t namelen;
363335640Shselasky	const char *port;
364335640Shselasky	unsigned port_num;
365335640Shselasky	int i;
366335640Shselasky	pcap_t *p = NULL;
367335640Shselasky
368335640Shselasky	*is_ours = 0;
369335640Shselasky
370335640Shselasky	dev_list = ibv_get_device_list(&numdev);
371335640Shselasky	if (!dev_list || !numdev) {
372335640Shselasky		return NULL;
373335640Shselasky	}
374335640Shselasky
375335640Shselasky	namelen = strlen(device);
376335640Shselasky
377335640Shselasky	port = strchr(device, ':');
378335640Shselasky	if (port) {
379335640Shselasky		port_num = strtoul(port + 1, NULL, 10);
380335640Shselasky		if (port_num > 0) {
381335640Shselasky			namelen = port - device;
382335640Shselasky		} else {
383335640Shselasky			port_num = 1;
384335640Shselasky		}
385335640Shselasky	} else {
386335640Shselasky		port_num = 1;
387335640Shselasky	}
388335640Shselasky
389335640Shselasky	for (i = 0; i < numdev; ++i) {
390335640Shselasky		if (strlen(dev_list[i]->name) == namelen &&
391335640Shselasky		    !strncmp(device, dev_list[i]->name, namelen)) {
392335640Shselasky			*is_ours = 1;
393335640Shselasky
394335640Shselasky			p = pcap_create_common(ebuf, sizeof (struct pcap_rdmasniff));
395335640Shselasky			if (p) {
396335640Shselasky				p->activate_op = rdmasniff_activate;
397335640Shselasky				priv = p->priv;
398335640Shselasky				priv->rdma_device = dev_list[i];
399335640Shselasky				priv->port_num = port_num;
400335640Shselasky			}
401335640Shselasky			break;
402335640Shselasky		}
403335640Shselasky	}
404335640Shselasky
405335640Shselasky	ibv_free_device_list(dev_list);
406335640Shselasky	return p;
407335640Shselasky}
408335640Shselasky
409335640Shselaskyint
410335640Shselaskyrdmasniff_findalldevs(pcap_if_list_t *devlistp, char *err_str)
411335640Shselasky{
412335640Shselasky	struct ibv_device **dev_list;
413335640Shselasky	int numdev;
414335640Shselasky	int i;
415335640Shselasky	int ret = 0;
416335640Shselasky
417335640Shselasky	dev_list = ibv_get_device_list(&numdev);
418335640Shselasky	if (!dev_list || !numdev) {
419335640Shselasky		return 0;
420335640Shselasky	}
421335640Shselasky
422335640Shselasky	for (i = 0; i < numdev; ++i) {
423335640Shselasky		/*
424335640Shselasky		 * XXX - do the notions of "up", "running", or
425335640Shselasky		 * "connected" apply here?
426335640Shselasky		 */
427335640Shselasky		if (!add_dev(devlistp, dev_list[i]->name, 0, "RDMA sniffer", err_str)) {
428335640Shselasky			ret = -1;
429335640Shselasky			goto out;
430335640Shselasky		}
431335640Shselasky	}
432335640Shselasky
433335640Shselaskyout:
434335640Shselasky	ibv_free_device_list(dev_list);
435335640Shselasky	return ret;
436335640Shselasky}
437