1/*
2 * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
3 * Copyright (c) 2006-2009 Open Grid Computing, Inc. All rights reserved.
4 *
5 * This software is available to you under a choice of one of two
6 * licenses.  You may choose to be licensed under the terms of the GNU
7 * General Public License (GPL) Version 2, available from the file
8 * COPYING in the main directory of this source tree, or the
9 * OpenIB.org BSD license below:
10 *
11 *     Redistribution and use in source and binary forms, with or
12 *     without modification, are permitted provided that the following
13 *     conditions are met:
14 *
15 *      - Redistributions of source code must retain the above
16 *        copyright notice, this list of conditions and the following
17 *        disclaimer.
18 *
19 *      - Redistributions in binary form must reproduce the above
20 *        copyright notice, this list of conditions and the following
21 *        disclaimer in the documentation and/or other materials
22 *        provided with the distribution.
23 *
24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31 * SOFTWARE.
32 */
33
34#include <sys/cdefs.h>
35__FBSDID("$FreeBSD: stable/10/sys/contrib/rdma/krping/krping.c 325602 2017-11-09 17:51:55Z hselasky $");
36
37#include <linux/module.h>
38#include <linux/moduleparam.h>
39#include <linux/slab.h>
40#include <linux/err.h>
41#include <linux/string.h>
42#include <linux/list.h>
43#include <linux/in.h>
44#include <linux/device.h>
45#include <linux/pci.h>
46#include <linux/sched.h>
47
48#include <asm/atomic.h>
49
50#include <rdma/ib_verbs.h>
51#include <rdma/rdma_cm.h>
52
53#include "krping.h"
54#include "getopt.h"
55
56extern int krping_debug;
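/*
 * DEBUG_LOG prints only when the krping_debug module parameter is set;
 * PRINTF always prints.  Both route through krping_printf() so output is
 * directed to the context identified by cb->cookie.
 */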
57#define DEBUG_LOG(cb, x...) do { if (krping_debug) krping_printf((cb)->cookie, x); } while (0)
58#define PRINTF(cb, x...) krping_printf((cb)->cookie, x)
59
60MODULE_AUTHOR("Steve Wise");
61MODULE_DESCRIPTION("RDMA ping client/server");
62MODULE_LICENSE("Dual BSD/GPL");
63
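/*
 * Read the raw x86 time-stamp counter.  This is only used for the coarse
 * per-iteration cycle accounting in the wlat/bw tests, so no serialization
 * is attempted.
 */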
64static __inline uint64_t
65get_cycles(void)
66{
67	uint32_t low, high;
68	__asm __volatile("rdtsc" : "=a" (low), "=d" (high));
69	return (low | ((u_int64_t)high << 32));
70}
71
72typedef uint64_t cycles_t;
73
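/*
 * Memory registration modes (selected with the "mem_mode" option):
 * DMA uses one global DMA MR, FASTREG re-registers the buffers with
 * fast-register work requests, MW binds a memory window over an MR,
 * and MR registers each buffer with ib_reg_phys_mr().
 */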
74enum mem_type {
75	DMA = 1,
76	FASTREG = 2,
77	MW = 3,
78	MR = 4
79};
80
81static const struct krping_option krping_opts[] = {
82	{"count", OPT_INT, 'C'},
83	{"size", OPT_INT, 'S'},
84	{"addr", OPT_STRING, 'a'},
85	{"addr6", OPT_STRING, 'A'},
86	{"port", OPT_INT, 'p'},
87	{"verbose", OPT_NOPARAM, 'v'},
88	{"validate", OPT_NOPARAM, 'V'},
89	{"server", OPT_NOPARAM, 's'},
90	{"client", OPT_NOPARAM, 'c'},
91	{"mem_mode", OPT_STRING, 'm'},
92	{"server_inv", OPT_NOPARAM, 'I'},
93 	{"wlat", OPT_NOPARAM, 'l'},
94 	{"rlat", OPT_NOPARAM, 'L'},
95 	{"bw", OPT_NOPARAM, 'B'},
96 	{"duplex", OPT_NOPARAM, 'd'},
97 	{"txdepth", OPT_INT, 'T'},
98 	{"poll", OPT_NOPARAM, 'P'},
99 	{"local_dma_lkey", OPT_NOPARAM, 'Z'},
100 	{"read_inv", OPT_NOPARAM, 'R'},
101 	{"fr", OPT_NOPARAM, 'f'},
102	{NULL, 0, 0}
103};
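/*
 * Example (illustrative only): krping is driven by an option string whose
 * comma-separated tokens correspond to the entries above, e.g.
 * "client,addr=192.168.1.100,port=9999,count=100,size=65536,verbose".
 * How that string reaches this module (device node, sysctl, ...) is
 * outside this file.
 */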
104
105#define htonll(x) cpu_to_be64((x))
106#define ntohll(x) be64_to_cpu((x))
107
108static struct mutex krping_mutex;
109
110/*
111 * List of running krping threads.
112 */
113static LIST_HEAD(krping_cbs);
114
115/*
116 * krping "ping/pong" loop:
117 * 	client sends source rkey/addr/len
118 *	server receives source rkey/addr/len
119 *	server rdma reads "ping" data from source
120 * 	server sends "go ahead" on rdma read completion
121 *	client sends sink rkey/addr/len
122 * 	server receives sink rkey/addr/len
123 * 	server rdma writes "pong" data to sink
124 * 	server sends "go ahead" on rdma write completion
125 * 	<repeat loop>
126 */
127
128/*
129 * These states are used to signal events between the completion handler
130 * and the main client or server thread.
131 *
132 * Once CONNECTED, the client cycles through RDMA_READ_ADV, RDMA_WRITE_ADV, and
133 * RDMA_WRITE_COMPLETE for each ping; the server also visits RDMA_READ_COMPLETE.
134 */
135enum test_state {
136	IDLE = 1,
137	CONNECT_REQUEST,
138	ADDR_RESOLVED,
139	ROUTE_RESOLVED,
140	CONNECTED,
141	RDMA_READ_ADV,
142	RDMA_READ_COMPLETE,
143	RDMA_WRITE_ADV,
144	RDMA_WRITE_COMPLETE,
145	ERROR
146};
147
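/*
 * Control message exchanged over SEND/RECV to advertise an RDMA buffer:
 * its address, rkey and length.  All fields travel in big-endian (network)
 * byte order, hence the htonll/ntohll and htonl/ntohl conversions.
 */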
148struct krping_rdma_info {
149	uint64_t buf;
150	uint32_t rkey;
151	uint32_t size;
152};
153
154/*
155 * Default max buffer size for IO...
156 */
157#define RPING_BUFSIZE (128*1024)
158#define RPING_SQ_DEPTH 64
159
160/*
161 * Control block struct.
162 */
163struct krping_cb {
164	void *cookie;
165	int server;			/* 0 iff client */
166	struct ib_cq *cq;
167	struct ib_pd *pd;
168	struct ib_qp *qp;
169
170	enum mem_type mem;
171	struct ib_mr *dma_mr;
172
173	struct ib_fast_reg_page_list *page_list;
174	int page_list_len;
175	struct ib_send_wr fastreg_wr;
176	struct ib_send_wr invalidate_wr;
177	struct ib_mr *fastreg_mr;
178	int server_invalidate;
179	int read_inv;
180	u8 key;
181
182	struct ib_mw *mw;
183	struct ib_mw_bind bind_attr;
184
185	struct ib_recv_wr rq_wr;	/* recv work request record */
186	struct ib_sge recv_sgl;		/* recv single SGE */
187	struct krping_rdma_info recv_buf;/* malloc'd buffer */
188	u64 recv_dma_addr;
189	DECLARE_PCI_UNMAP_ADDR(recv_mapping)
190	struct ib_mr *recv_mr;
191
192	struct ib_send_wr sq_wr;	/* send work request record */
193	struct ib_sge send_sgl;
194	struct krping_rdma_info send_buf;/* single send buf */
195	u64 send_dma_addr;
196	DECLARE_PCI_UNMAP_ADDR(send_mapping)
197	struct ib_mr *send_mr;
198
199	struct ib_send_wr rdma_sq_wr;	/* rdma work request record */
200	struct ib_sge rdma_sgl;		/* rdma single SGE */
201	char *rdma_buf;			/* used as rdma sink */
202	u64  rdma_dma_addr;
203	DECLARE_PCI_UNMAP_ADDR(rdma_mapping)
204	struct ib_mr *rdma_mr;
205
206	uint32_t remote_rkey;		/* remote peer's RKEY */
207	uint64_t remote_addr;		/* remote peer's TO */
208	uint32_t remote_len;		/* remote peer's LEN */
209
210	char *start_buf;		/* rdma read src */
211	u64  start_dma_addr;
212	DECLARE_PCI_UNMAP_ADDR(start_mapping)
213	struct ib_mr *start_mr;
214
215	enum test_state state;		/* used for cond/signalling */
216	wait_queue_head_t sem;
217	struct krping_stats stats;
218
219	uint16_t port;			/* dst port in NBO */
220	union {
221		struct in_addr v4;
222		struct in6_addr v6;
223	} addr;				/* dst addr in NBO */
224	int addr_type;			/* AF_INET or AF_INET6 */
225	char *addr_str;			/* dst addr string */
226	int verbose;			/* verbose logging */
227	int count;			/* ping count */
228	int size;			/* ping data size */
229	int validate;			/* validate ping data */
230	int wlat;			/* run wlat test */
231	int rlat;			/* run rlat test */
232	int bw;				/* run bw test */
233	int duplex;			/* run bw full duplex test */
234	int poll;			/* poll or block for rlat test */
235	int txdepth;			/* SQ depth */
236	int local_dma_lkey;		/* use 0 for lkey */
237	int frtest;			/* fastreg test */
238
239	/* CM stuff */
240	struct rdma_cm_id *cm_id;	/* connection on client side,*/
241					/* listener on server side. */
242	struct rdma_cm_id *child_cm_id;	/* connection on server side */
243	struct list_head list;
244};
245
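/*
 * RDMA CM event callback.  It runs in the CM's callback context, so it only
 * records the new state in the control block and wakes whichever thread is
 * sleeping on cb->sem; the real work happens in the krping threads.
 */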
246static int krping_cma_event_handler(struct rdma_cm_id *cma_id,
247				   struct rdma_cm_event *event)
248{
249	int ret;
250	struct krping_cb *cb = cma_id->context;
251
252	DEBUG_LOG(cb, "cma_event type %d cma_id %p (%s)\n", event->event,
253	    cma_id, (cma_id == cb->cm_id) ? "parent" : "child");
254
255	switch (event->event) {
256	case RDMA_CM_EVENT_ADDR_RESOLVED:
257		cb->state = ADDR_RESOLVED;
258		ret = rdma_resolve_route(cma_id, 2000);
259		if (ret) {
260			PRINTF(cb, "rdma_resolve_route error %d\n", ret);
261			wake_up_interruptible(&cb->sem);
262		}
263		break;
264
265	case RDMA_CM_EVENT_ROUTE_RESOLVED:
266		cb->state = ROUTE_RESOLVED;
267		cb->child_cm_id = cma_id;
268		wake_up_interruptible(&cb->sem);
269		break;
270
271	case RDMA_CM_EVENT_CONNECT_REQUEST:
272		cb->state = CONNECT_REQUEST;
273		cb->child_cm_id = cma_id;
274		DEBUG_LOG(cb, "child cma %p\n", cb->child_cm_id);
275		wake_up_interruptible(&cb->sem);
276		break;
277
278	case RDMA_CM_EVENT_ESTABLISHED:
279		DEBUG_LOG(cb, "ESTABLISHED\n");
280		if (!cb->server) {
281			cb->state = CONNECTED;
282		}
283		wake_up_interruptible(&cb->sem);
284		break;
285
286	case RDMA_CM_EVENT_ADDR_ERROR:
287	case RDMA_CM_EVENT_ROUTE_ERROR:
288	case RDMA_CM_EVENT_CONNECT_ERROR:
289	case RDMA_CM_EVENT_UNREACHABLE:
290	case RDMA_CM_EVENT_REJECTED:
291		PRINTF(cb, "cma event %d, error %d\n", event->event,
292		       event->status);
293		cb->state = ERROR;
294		wake_up_interruptible(&cb->sem);
295		break;
296
297	case RDMA_CM_EVENT_DISCONNECTED:
298		PRINTF(cb, "DISCONNECT EVENT...\n");
299		cb->state = ERROR;
300		wake_up_interruptible(&cb->sem);
301		break;
302
303	case RDMA_CM_EVENT_DEVICE_REMOVAL:
304		PRINTF(cb, "cma detected device removal!!!!\n");
305		break;
306
307	default:
308		PRINTF(cb, "unexpected cma event %d!\n", event->event);
309		wake_up_interruptible(&cb->sem);
310		break;
311	}
312	return 0;
313}
314
315static int server_recv(struct krping_cb *cb, struct ib_wc *wc)
316{
317	if (wc->byte_len != sizeof(cb->recv_buf)) {
318		PRINTF(cb, "Received bogus data, size %d\n",
319		       wc->byte_len);
320		return -1;
321	}
322
323	cb->remote_rkey = ntohl(cb->recv_buf.rkey);
324	cb->remote_addr = ntohll(cb->recv_buf.buf);
325	cb->remote_len  = ntohl(cb->recv_buf.size);
326	DEBUG_LOG(cb, "Received rkey %x addr %llx len %d from peer\n",
327		  cb->remote_rkey, (unsigned long long)cb->remote_addr,
328		  cb->remote_len);
329
330	if (cb->state <= CONNECTED || cb->state == RDMA_WRITE_COMPLETE)
331		cb->state = RDMA_READ_ADV;
332	else
333		cb->state = RDMA_WRITE_ADV;
334
335	return 0;
336}
337
338static int client_recv(struct krping_cb *cb, struct ib_wc *wc)
339{
340	if (wc->byte_len != sizeof(cb->recv_buf)) {
341		PRINTF(cb, "Received bogus data, size %d\n",
342		       wc->byte_len);
343		return -1;
344	}
345
346	if (cb->state == RDMA_READ_ADV)
347		cb->state = RDMA_WRITE_ADV;
348	else
349		cb->state = RDMA_WRITE_COMPLETE;
350
351	return 0;
352}
353
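/*
 * Completion handler.  For the default ping/pong test it runs as the CQ
 * event callback and re-arms the CQ; the wlat/rlat/bw tests instead call it
 * directly (or poll the CQ themselves), which is why notification is skipped
 * in those modes.
 */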
354static void krping_cq_event_handler(struct ib_cq *cq, void *ctx)
355{
356	struct krping_cb *cb = ctx;
357	struct ib_wc wc;
358	struct ib_recv_wr *bad_wr;
359	int ret;
360
361	BUG_ON(cb->cq != cq);
362	if (cb->state == ERROR) {
363		PRINTF(cb, "cq completion in ERROR state\n");
364		return;
365	}
366	if (cb->frtest) {
367		PRINTF(cb, "cq completion event in frtest!\n");
368		return;
369	}
370	if (!cb->wlat && !cb->rlat && !cb->bw)
371		ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
372	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 1) {
373		if (wc.status) {
374			if (wc.status == IB_WC_WR_FLUSH_ERR) {
375				DEBUG_LOG(cb, "cq flushed\n");
376				continue;
377			} else {
378				PRINTF(cb, "cq completion failed with "
379				       "wr_id %Lx status %d opcode %d vendor_err %x\n",
380					wc.wr_id, wc.status, wc.opcode, wc.vendor_err);
381				goto error;
382			}
383		}
384
385		switch (wc.opcode) {
386		case IB_WC_SEND:
387			DEBUG_LOG(cb, "send completion\n");
388			cb->stats.send_bytes += cb->send_sgl.length;
389			cb->stats.send_msgs++;
390			break;
391
392		case IB_WC_RDMA_WRITE:
393			DEBUG_LOG(cb, "rdma write completion\n");
394			cb->stats.write_bytes += cb->rdma_sq_wr.sg_list->length;
395			cb->stats.write_msgs++;
396			cb->state = RDMA_WRITE_COMPLETE;
397			wake_up_interruptible(&cb->sem);
398			break;
399
400		case IB_WC_RDMA_READ:
401			DEBUG_LOG(cb, "rdma read completion\n");
402			cb->stats.read_bytes += cb->rdma_sq_wr.sg_list->length;
403			cb->stats.read_msgs++;
404			cb->state = RDMA_READ_COMPLETE;
405			wake_up_interruptible(&cb->sem);
406			break;
407
408		case IB_WC_RECV:
409			DEBUG_LOG(cb, "recv completion\n");
410			cb->stats.recv_bytes += sizeof(cb->recv_buf);
411			cb->stats.recv_msgs++;
412			if (cb->wlat || cb->rlat || cb->bw)
413				ret = server_recv(cb, &wc);
414			else
415				ret = cb->server ? server_recv(cb, &wc) :
416						   client_recv(cb, &wc);
417			if (ret) {
418				PRINTF(cb, "recv wc error: %d\n", ret);
419				goto error;
420			}
421
422			ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
423			if (ret) {
424				PRINTF(cb, "post recv error: %d\n",
425				       ret);
426				goto error;
427			}
428			wake_up_interruptible(&cb->sem);
429			break;
430
431		default:
432			PRINTF(cb,
433			       "%s:%d Unexpected opcode %d, Shutting down\n",
434			       __func__, __LINE__, wc.opcode);
435			goto error;
436		}
437	}
438	if (ret) {
439		PRINTF(cb, "poll error %d\n", ret);
440		goto error;
441	}
442	return;
443error:
444	cb->state = ERROR;
445	wake_up_interruptible(&cb->sem);
446}
447
448static int krping_accept(struct krping_cb *cb)
449{
450	struct rdma_conn_param conn_param;
451	int ret;
452
453	DEBUG_LOG(cb, "accepting client connection request\n");
454
455	memset(&conn_param, 0, sizeof conn_param);
456	conn_param.responder_resources = 1;
457	conn_param.initiator_depth = 1;
458
459	ret = rdma_accept(cb->child_cm_id, &conn_param);
460	if (ret) {
461		PRINTF(cb, "rdma_accept error: %d\n", ret);
462		return ret;
463	}
464
465	if (!cb->wlat && !cb->rlat && !cb->bw) {
466		wait_event_interruptible(cb->sem, cb->state >= CONNECTED);
467		if (cb->state == ERROR) {
468			PRINTF(cb, "wait for CONNECTED state %d\n",
469				cb->state);
470			return -1;
471		}
472	}
473	return 0;
474}
475
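/*
 * Pre-build the static work requests: the single recv, the control-message
 * send, the RDMA read/write WR, and (for FASTREG/MW modes) the fast-register
 * and bind descriptors that are refreshed each iteration.
 */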
476static void krping_setup_wr(struct krping_cb *cb)
477{
478	cb->recv_sgl.addr = cb->recv_dma_addr;
479	cb->recv_sgl.length = sizeof cb->recv_buf;
480	if (cb->local_dma_lkey)
481		cb->recv_sgl.lkey = cb->qp->device->local_dma_lkey;
482	else if (cb->mem == DMA)
483		cb->recv_sgl.lkey = cb->dma_mr->lkey;
484	else
485		cb->recv_sgl.lkey = cb->recv_mr->lkey;
486	cb->rq_wr.sg_list = &cb->recv_sgl;
487	cb->rq_wr.num_sge = 1;
488
489	cb->send_sgl.addr = cb->send_dma_addr;
490	cb->send_sgl.length = sizeof cb->send_buf;
491	if (cb->local_dma_lkey)
492		cb->send_sgl.lkey = cb->qp->device->local_dma_lkey;
493	else if (cb->mem == DMA)
494		cb->send_sgl.lkey = cb->dma_mr->lkey;
495	else
496		cb->send_sgl.lkey = cb->send_mr->lkey;
497
498	cb->sq_wr.opcode = IB_WR_SEND;
499	cb->sq_wr.send_flags = IB_SEND_SIGNALED;
500	cb->sq_wr.sg_list = &cb->send_sgl;
501	cb->sq_wr.num_sge = 1;
502
503	if (cb->server || cb->wlat || cb->rlat || cb->bw) {
504		cb->rdma_sgl.addr = cb->rdma_dma_addr;
505		if (cb->mem == MR)
506			cb->rdma_sgl.lkey = cb->rdma_mr->lkey;
507		cb->rdma_sq_wr.send_flags = IB_SEND_SIGNALED;
508		cb->rdma_sq_wr.sg_list = &cb->rdma_sgl;
509		cb->rdma_sq_wr.num_sge = 1;
510	}
511
512	switch(cb->mem) {
513	case FASTREG:
514
515		/*
516		 * A chain of 2 WRs: LOCAL_INV + FAST_REG_MR,
517		 * both unsignaled.  The client uses them to reregister
518		 * the rdma buffers with a new key each iteration.
519		 */
520		cb->fastreg_wr.opcode = IB_WR_FAST_REG_MR;
521		cb->fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
522		cb->fastreg_wr.wr.fast_reg.length = cb->size;
523		cb->fastreg_wr.wr.fast_reg.page_list = cb->page_list;
524		cb->fastreg_wr.wr.fast_reg.page_list_len = cb->page_list_len;
525
526		cb->invalidate_wr.next = &cb->fastreg_wr;
527		cb->invalidate_wr.opcode = IB_WR_LOCAL_INV;
528		break;
529	case MW:
530		cb->bind_attr.wr_id = 0xabbaabba;
531		cb->bind_attr.send_flags = 0; /* unsignaled */
532		cb->bind_attr.length = cb->size;
533		break;
534	default:
535		break;
536	}
537}
538
539static int krping_setup_buffers(struct krping_cb *cb)
540{
541	int ret;
542	struct ib_phys_buf buf;
543	u64 iovbase;
544
545	DEBUG_LOG(cb, "krping_setup_buffers called on cb %p\n", cb);
546
547	cb->recv_dma_addr = dma_map_single(cb->pd->device->dma_device,
548				   &cb->recv_buf,
549				   sizeof(cb->recv_buf), DMA_BIDIRECTIONAL);
550	pci_unmap_addr_set(cb, recv_mapping, cb->recv_dma_addr);
551	cb->send_dma_addr = dma_map_single(cb->pd->device->dma_device,
552					   &cb->send_buf, sizeof(cb->send_buf),
553					   DMA_BIDIRECTIONAL);
554	pci_unmap_addr_set(cb, send_mapping, cb->send_dma_addr);
555
556	if (cb->mem == DMA) {
557		cb->dma_mr = ib_get_dma_mr(cb->pd, IB_ACCESS_LOCAL_WRITE|
558					   IB_ACCESS_REMOTE_READ|
559				           IB_ACCESS_REMOTE_WRITE);
560		if (IS_ERR(cb->dma_mr)) {
561			DEBUG_LOG(cb, "ib_get_dma_mr failed\n");
562			ret = PTR_ERR(cb->dma_mr);
563			goto bail;
564		}
565	} else {
566		if (!cb->local_dma_lkey) {
567			buf.addr = cb->recv_dma_addr;
568			buf.size = sizeof cb->recv_buf;
569			DEBUG_LOG(cb, "recv buf dma_addr %llx size %d\n", buf.addr,
570				(int)buf.size);
571			iovbase = cb->recv_dma_addr;
572			cb->recv_mr = ib_reg_phys_mr(cb->pd, &buf, 1,
573						     IB_ACCESS_LOCAL_WRITE,
574						     &iovbase);
575
576			if (IS_ERR(cb->recv_mr)) {
577				DEBUG_LOG(cb, "recv_buf reg_mr failed\n");
578				ret = PTR_ERR(cb->recv_mr);
579				goto bail;
580			}
581
582			buf.addr = cb->send_dma_addr;
583			buf.size = sizeof cb->send_buf;
584			DEBUG_LOG(cb, "send buf dma_addr %llx size %d\n", buf.addr,
585				(int)buf.size);
586			iovbase = cb->send_dma_addr;
587			cb->send_mr = ib_reg_phys_mr(cb->pd, &buf, 1,
588						     0, &iovbase);
589
590			if (IS_ERR(cb->send_mr)) {
591				DEBUG_LOG(cb, "send_buf reg_mr failed\n");
592				ret = PTR_ERR(cb->send_mr);
593				goto bail;
594			}
595		}
596	}
597
598	cb->rdma_buf = kmalloc(cb->size, GFP_KERNEL);
599	if (!cb->rdma_buf) {
600		DEBUG_LOG(cb, "rdma_buf malloc failed\n");
601		ret = -ENOMEM;
602		goto bail;
603	}
604
605	cb->rdma_dma_addr = dma_map_single(cb->pd->device->dma_device,
606			       cb->rdma_buf, cb->size,
607			       DMA_BIDIRECTIONAL);
608	pci_unmap_addr_set(cb, rdma_mapping, cb->rdma_dma_addr);
609	if (cb->mem != DMA) {
610		switch (cb->mem) {
611		case FASTREG:
612			cb->page_list_len = (((cb->size - 1) & PAGE_MASK) +
613				PAGE_SIZE) >> PAGE_SHIFT;
614			cb->page_list = ib_alloc_fast_reg_page_list(
615						cb->pd->device,
616						cb->page_list_len);
617			if (IS_ERR(cb->page_list)) {
618				DEBUG_LOG(cb, "alloc_fast_reg_page_list failed\n");
619				ret = PTR_ERR(cb->page_list);
620				goto bail;
621			}
622			cb->fastreg_mr = ib_alloc_fast_reg_mr(cb->pd,
623					cb->page_list->max_page_list_len);
624			if (IS_ERR(cb->fastreg_mr)) {
625				DEBUG_LOG(cb, "alloc_fast_reg_mr failed\n");
626				ret = PTR_ERR(cb->fastreg_mr);
627				goto bail;
628			}
629			DEBUG_LOG(cb, "fastreg rkey 0x%x page_list %p"
630				" page_list_len %u\n", cb->fastreg_mr->rkey,
631				cb->page_list, cb->page_list_len);
632			break;
633		case MW:
634			cb->mw = ib_alloc_mw(cb->pd);
635			if (IS_ERR(cb->mw)) {
636				DEBUG_LOG(cb, "ib_alloc_mw failed\n");
637				ret = PTR_ERR(cb->mw);
638				goto bail;
639			}
640			DEBUG_LOG(cb, "mw rkey 0x%x\n", cb->mw->rkey);
641			/*FALLTHROUGH*/
642		case MR:
643			buf.addr = cb->rdma_dma_addr;
644			buf.size = cb->size;
645			iovbase = cb->rdma_dma_addr;
646			cb->rdma_mr = ib_reg_phys_mr(cb->pd, &buf, 1,
647					     IB_ACCESS_REMOTE_READ|
648					     IB_ACCESS_REMOTE_WRITE,
649					     &iovbase);
650			if (IS_ERR(cb->rdma_mr)) {
651				DEBUG_LOG(cb, "rdma_buf reg_mr failed\n");
652				ret = PTR_ERR(cb->rdma_mr);
653				goto bail;
654			}
655			DEBUG_LOG(cb, "rdma buf dma_addr %llx size %d mr rkey 0x%x\n",
656				buf.addr, (int)buf.size, cb->rdma_mr->rkey);
657			break;
658		default:
659			ret = -EINVAL;
660			goto bail;
661			break;
662		}
663	}
664
665	if (!cb->server || cb->wlat || cb->rlat || cb->bw) {
666
667		cb->start_buf = kmalloc(cb->size, GFP_KERNEL);
668		if (!cb->start_buf) {
669			DEBUG_LOG(cb, "start_buf malloc failed\n");
670			ret = -ENOMEM;
671			goto bail;
672		}
673
674		cb->start_dma_addr = dma_map_single(cb->pd->device->dma_device,
675						   cb->start_buf, cb->size,
676						   DMA_BIDIRECTIONAL);
677		pci_unmap_addr_set(cb, start_mapping, cb->start_dma_addr);
678
679		if (cb->mem == MR || cb->mem == MW) {
680			unsigned flags = IB_ACCESS_REMOTE_READ;
681
682			if (cb->wlat || cb->rlat || cb->bw)
683				flags |= IB_ACCESS_REMOTE_WRITE;
684
685			buf.addr = cb->start_dma_addr;
686			buf.size = cb->size;
687			DEBUG_LOG(cb, "start buf dma_addr %llx size %d\n",
688				buf.addr, (int)buf.size);
689			iovbase = cb->start_dma_addr;
690			cb->start_mr = ib_reg_phys_mr(cb->pd, &buf, 1,
691					     flags,
692					     &iovbase);
693
694			if (IS_ERR(cb->start_mr)) {
695				DEBUG_LOG(cb, "start_buf reg_mr failed\n");
696				ret = PTR_ERR(cb->start_mr);
697				goto bail;
698			}
699		}
700	}
701
702	krping_setup_wr(cb);
703	DEBUG_LOG(cb, "allocated & registered buffers...\n");
704	return 0;
705bail:
706	if (cb->fastreg_mr && !IS_ERR(cb->fastreg_mr))
707		ib_dereg_mr(cb->fastreg_mr);
708	if (cb->mw && !IS_ERR(cb->mw))
709		ib_dealloc_mw(cb->mw);
710	if (cb->rdma_mr && !IS_ERR(cb->rdma_mr))
711		ib_dereg_mr(cb->rdma_mr);
712	if (cb->page_list && !IS_ERR(cb->page_list))
713		ib_free_fast_reg_page_list(cb->page_list);
714	if (cb->dma_mr && !IS_ERR(cb->dma_mr))
715		ib_dereg_mr(cb->dma_mr);
716	if (cb->recv_mr && !IS_ERR(cb->recv_mr))
717		ib_dereg_mr(cb->recv_mr);
718	if (cb->send_mr && !IS_ERR(cb->send_mr))
719		ib_dereg_mr(cb->send_mr);
720	if (cb->rdma_buf)
721		kfree(cb->rdma_buf);
722	if (cb->start_buf)
723		kfree(cb->start_buf);
724	return ret;
725}
726
727static void krping_free_buffers(struct krping_cb *cb)
728{
729	DEBUG_LOG(cb, "krping_free_buffers called on cb %p\n", cb);
730
731	if (cb->dma_mr)
732		ib_dereg_mr(cb->dma_mr);
733	if (cb->send_mr)
734		ib_dereg_mr(cb->send_mr);
735	if (cb->recv_mr)
736		ib_dereg_mr(cb->recv_mr);
737	if (cb->rdma_mr)
738		ib_dereg_mr(cb->rdma_mr);
739	if (cb->start_mr)
740		ib_dereg_mr(cb->start_mr);
741	if (cb->fastreg_mr)
742		ib_dereg_mr(cb->fastreg_mr);
743	if (cb->mw)
744		ib_dealloc_mw(cb->mw);
745
746	dma_unmap_single(cb->pd->device->dma_device,
747			 pci_unmap_addr(cb, recv_mapping),
748			 sizeof(cb->recv_buf), DMA_BIDIRECTIONAL);
749	dma_unmap_single(cb->pd->device->dma_device,
750			 pci_unmap_addr(cb, send_mapping),
751			 sizeof(cb->send_buf), DMA_BIDIRECTIONAL);
752	dma_unmap_single(cb->pd->device->dma_device,
753			 pci_unmap_addr(cb, rdma_mapping),
754			 cb->size, DMA_BIDIRECTIONAL);
755	kfree(cb->rdma_buf);
756	if (cb->start_buf) {
757		dma_unmap_single(cb->pd->device->dma_device,
758			 pci_unmap_addr(cb, start_mapping),
759			 cb->size, DMA_BIDIRECTIONAL);
760		kfree(cb->start_buf);
761	}
762}
763
764static int krping_create_qp(struct krping_cb *cb)
765{
766	struct ib_qp_init_attr init_attr;
767	int ret;
768
769	memset(&init_attr, 0, sizeof(init_attr));
770	init_attr.cap.max_send_wr = cb->txdepth;
771	init_attr.cap.max_recv_wr = 2;
772	init_attr.cap.max_recv_sge = 1;
773	init_attr.cap.max_send_sge = 1;
774	init_attr.qp_type = IB_QPT_RC;
775	init_attr.send_cq = cb->cq;
776	init_attr.recv_cq = cb->cq;
777	init_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
778
779	if (cb->server) {
780		ret = rdma_create_qp(cb->child_cm_id, cb->pd, &init_attr);
781		if (!ret)
782			cb->qp = cb->child_cm_id->qp;
783	} else {
784		ret = rdma_create_qp(cb->cm_id, cb->pd, &init_attr);
785		if (!ret)
786			cb->qp = cb->cm_id->qp;
787	}
788
789	return ret;
790}
791
792static void krping_free_qp(struct krping_cb *cb)
793{
794	ib_destroy_qp(cb->qp);
795	ib_destroy_cq(cb->cq);
796	ib_dealloc_pd(cb->pd);
797}
798
799static int krping_setup_qp(struct krping_cb *cb, struct rdma_cm_id *cm_id)
800{
801	int ret;
802	cb->pd = ib_alloc_pd(cm_id->device);
803	if (IS_ERR(cb->pd)) {
804		PRINTF(cb, "ib_alloc_pd failed\n");
805		return PTR_ERR(cb->pd);
806	}
807	DEBUG_LOG(cb, "created pd %p\n", cb->pd);
808
809	strlcpy(cb->stats.name, cb->pd->device->name, sizeof(cb->stats.name));
810
811	cb->cq = ib_create_cq(cm_id->device, krping_cq_event_handler, NULL,
812			      cb, cb->txdepth * 2, 0);
813	if (IS_ERR(cb->cq)) {
814		PRINTF(cb, "ib_create_cq failed\n");
815		ret = PTR_ERR(cb->cq);
816		goto err1;
817	}
818	DEBUG_LOG(cb, "created cq %p\n", cb->cq);
819
820	if (!cb->wlat && !cb->rlat && !cb->bw && !cb->frtest) {
821		ret = ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
822		if (ret) {
823			PRINTF(cb, "ib_req_notify_cq failed\n");
824			goto err2;
825		}
826	}
827
828	ret = krping_create_qp(cb);
829	if (ret) {
830		PRINTF(cb, "krping_create_qp failed: %d\n", ret);
831		goto err2;
832	}
833	DEBUG_LOG(cb, "created qp %p\n", cb->qp);
834	return 0;
835err2:
836	ib_destroy_cq(cb->cq);
837err1:
838	ib_dealloc_pd(cb->pd);
839	return ret;
840}
841
842/*
843 * return the (possibly rebound) rkey for the rdma buffer.
844 * FASTREG mode: invalidate and rebind via fastreg wr.
845 * MW mode: rebind the MW.
846 * other modes: just return the mr rkey.
847 */
848static u32 krping_rdma_rkey(struct krping_cb *cb, u64 buf, int post_inv)
849{
850	u32 rkey = 0xffffffff;
851	u64 p;
852	struct ib_send_wr *bad_wr;
853	int i;
854	int ret;
855
856	switch (cb->mem) {
857	case FASTREG:
858		cb->invalidate_wr.ex.invalidate_rkey = cb->fastreg_mr->rkey;
859
860		/*
861		 * Update the fastreg key.
862		 */
863		ib_update_fast_reg_key(cb->fastreg_mr, ++cb->key);
864		cb->fastreg_wr.wr.fast_reg.rkey = cb->fastreg_mr->rkey;
865
866		/*
867		 * Update the fastreg WR with new buf info.
868		 */
869		if (buf == (u64)cb->start_dma_addr)
870			cb->fastreg_wr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_READ;
871		else
872			cb->fastreg_wr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE;
873		cb->fastreg_wr.wr.fast_reg.iova_start = buf;
874		p = (u64)(buf & PAGE_MASK);
875		for (i=0; i < cb->fastreg_wr.wr.fast_reg.page_list_len;
876		     i++, p += PAGE_SIZE) {
877			cb->page_list->page_list[i] = p;
878			DEBUG_LOG(cb, "page_list[%d] 0x%llx\n", i, p);
879		}
880
881		DEBUG_LOG(cb, "post_inv = %d, fastreg new rkey 0x%x shift %u len %u"
882			" iova_start %llx page_list_len %u\n",
883			post_inv,
884			cb->fastreg_wr.wr.fast_reg.rkey,
885			cb->fastreg_wr.wr.fast_reg.page_shift,
886			cb->fastreg_wr.wr.fast_reg.length,
887			cb->fastreg_wr.wr.fast_reg.iova_start,
888			cb->fastreg_wr.wr.fast_reg.page_list_len);
889
890		if (post_inv)
891			ret = ib_post_send(cb->qp, &cb->invalidate_wr, &bad_wr);
892		else
893			ret = ib_post_send(cb->qp, &cb->fastreg_wr, &bad_wr);
894		if (ret) {
895			PRINTF(cb, "post send error %d\n", ret);
896			cb->state = ERROR;
897		}
898		rkey = cb->fastreg_mr->rkey;
899		break;
900	case MW:
901		/*
902		 * Update the MW with new buf info.
903		 */
904		if (buf == (u64)cb->start_dma_addr) {
905			cb->bind_attr.mw_access_flags = IB_ACCESS_REMOTE_READ;
906			cb->bind_attr.mr = cb->start_mr;
907		} else {
908			cb->bind_attr.mw_access_flags = IB_ACCESS_REMOTE_WRITE;
909			cb->bind_attr.mr = cb->rdma_mr;
910		}
911		cb->bind_attr.addr = buf;
912		DEBUG_LOG(cb, "binding mw rkey 0x%x to buf %llx mr rkey 0x%x\n",
913			cb->mw->rkey, buf, cb->bind_attr.mr->rkey);
914		ret = ib_bind_mw(cb->qp, cb->mw, &cb->bind_attr);
915		if (ret) {
916			PRINTF(cb, "bind mw error %d\n", ret);
917			cb->state = ERROR;
918		} else
919			rkey = cb->mw->rkey;
920		break;
921	case MR:
922		if (buf == (u64)cb->start_dma_addr)
923			rkey = cb->start_mr->rkey;
924		else
925			rkey = cb->rdma_mr->rkey;
926		break;
927	case DMA:
928		rkey = cb->dma_mr->rkey;
929		break;
930	default:
931		PRINTF(cb, "%s:%d case ERROR\n", __func__, __LINE__);
932		cb->state = ERROR;
933		break;
934	}
935	return rkey;
936}
937
938static void krping_format_send(struct krping_cb *cb, u64 buf)
939{
940	struct krping_rdma_info *info = &cb->send_buf;
941	u32 rkey;
942
943	/*
944	 * Client side will do fastreg or mw bind before
945	 * advertising the rdma buffer.  Server side
946	 * sends have no data.
947	 */
948	if (!cb->server || cb->wlat || cb->rlat || cb->bw) {
949		rkey = krping_rdma_rkey(cb, buf, !cb->server_invalidate);
950		info->buf = htonll(buf);
951		info->rkey = htonl(rkey);
952		info->size = htonl(cb->size);
953		DEBUG_LOG(cb, "RDMA addr %llx rkey %x len %d\n",
954			  (unsigned long long)buf, rkey, cb->size);
955	}
956}
957
958static void krping_test_server(struct krping_cb *cb)
959{
960	struct ib_send_wr *bad_wr, inv;
961	int ret;
962
963	while (1) {
964		/* Wait for client's Start STAG/TO/Len */
965		wait_event_interruptible(cb->sem, cb->state >= RDMA_READ_ADV);
966		if (cb->state != RDMA_READ_ADV) {
967			PRINTF(cb, "wait for RDMA_READ_ADV state %d\n",
968				cb->state);
969			break;
970		}
971
972		DEBUG_LOG(cb, "server received source adv\n");
973
974		cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
975		cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
976		cb->rdma_sq_wr.sg_list->length = cb->remote_len;
977		cb->rdma_sgl.lkey = krping_rdma_rkey(cb, cb->rdma_dma_addr, 1);
978		cb->rdma_sq_wr.next = NULL;
979
980		/* Issue RDMA Read. */
981		if (cb->read_inv)
982			cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ_WITH_INV;
983		else {
984
985			cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ;
986			if (cb->mem == FASTREG) {
987				/*
988				 * Immediately follow the read with a
989				 * fenced LOCAL_INV.
990				 */
991				cb->rdma_sq_wr.next = &inv;
992				memset(&inv, 0, sizeof inv);
993				inv.opcode = IB_WR_LOCAL_INV;
994				inv.ex.invalidate_rkey = cb->fastreg_mr->rkey;
995				inv.send_flags = IB_SEND_FENCE;
996			}
997		}
998
999		ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr);
1000		if (ret) {
1001			PRINTF(cb, "post send error %d\n", ret);
1002			break;
1003		}
1004		cb->rdma_sq_wr.next = NULL;
1005
1006		DEBUG_LOG(cb, "server posted rdma read req\n");
1007
1008		/* Wait for read completion */
1009		wait_event_interruptible(cb->sem,
1010					 cb->state >= RDMA_READ_COMPLETE);
1011		if (cb->state != RDMA_READ_COMPLETE) {
1012			PRINTF(cb,
1013			       "wait for RDMA_READ_COMPLETE state %d\n",
1014			       cb->state);
1015			break;
1016		}
1017		DEBUG_LOG(cb, "server received read complete\n");
1018
1019		/* Display data in recv buf */
1020		if (cb->verbose)
1021			PRINTF(cb, "server ping data: %s\n",
1022				cb->rdma_buf);
1023
1024		/* Tell client to continue */
1025		if (cb->server && cb->server_invalidate) {
1026			cb->sq_wr.ex.invalidate_rkey = cb->remote_rkey;
1027			cb->sq_wr.opcode = IB_WR_SEND_WITH_INV;
1028			DEBUG_LOG(cb, "send-w-inv rkey 0x%x\n", cb->remote_rkey);
1029		}
1030		ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1031		if (ret) {
1032			PRINTF(cb, "post send error %d\n", ret);
1033			break;
1034		}
1035		DEBUG_LOG(cb, "server posted go ahead\n");
1036
1037		/* Wait for client's RDMA STAG/TO/Len */
1038		wait_event_interruptible(cb->sem, cb->state >= RDMA_WRITE_ADV);
1039		if (cb->state != RDMA_WRITE_ADV) {
1040			PRINTF(cb,
1041			       "wait for RDMA_WRITE_ADV state %d\n",
1042			       cb->state);
1043			break;
1044		}
1045		DEBUG_LOG(cb, "server received sink adv\n");
1046
1047		/* RDMA Write echo data */
1048		cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
1049		cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
1050		cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
1051		cb->rdma_sq_wr.sg_list->length = strlen(cb->rdma_buf) + 1;
1052		if (cb->local_dma_lkey)
1053			cb->rdma_sgl.lkey = cb->qp->device->local_dma_lkey;
1054		else
1055			cb->rdma_sgl.lkey = krping_rdma_rkey(cb, cb->rdma_dma_addr, 0);
1056
1057		DEBUG_LOG(cb, "rdma write from lkey %x laddr %llx len %d\n",
1058			  cb->rdma_sq_wr.sg_list->lkey,
1059			  (unsigned long long)cb->rdma_sq_wr.sg_list->addr,
1060			  cb->rdma_sq_wr.sg_list->length);
1061
1062		ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr);
1063		if (ret) {
1064			PRINTF(cb, "post send error %d\n", ret);
1065			break;
1066		}
1067
1068		/* Wait for completion */
1069		ret = wait_event_interruptible(cb->sem, cb->state >=
1070							 RDMA_WRITE_COMPLETE);
1071		if (cb->state != RDMA_WRITE_COMPLETE) {
1072			PRINTF(cb,
1073			       "wait for RDMA_WRITE_COMPLETE state %d\n",
1074			       cb->state);
1075			break;
1076		}
1077		DEBUG_LOG(cb, "server rdma write complete\n");
1078
1079		cb->state = CONNECTED;
1080
1081		/* Tell client to begin again */
1082		if (cb->server && cb->server_invalidate) {
1083			cb->sq_wr.ex.invalidate_rkey = cb->remote_rkey;
1084			cb->sq_wr.opcode = IB_WR_SEND_WITH_INV;
1085			DEBUG_LOG(cb, "send-w-inv rkey 0x%x\n", cb->remote_rkey);
1086		}
1087		ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1088		if (ret) {
1089			PRINTF(cb, "post send error %d\n", ret);
1090			break;
1091		}
1092		DEBUG_LOG(cb, "server posted go ahead\n");
1093	}
1094}
1095
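/*
 * Read-latency test: post one RDMA READ of the advertised buffer at a time
 * and wait for its completion, either by polling the CQ (the "poll" option)
 * or by sleeping on cb->sem, for cb->count iterations.
 */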
1096static void rlat_test(struct krping_cb *cb)
1097{
1098	int scnt;
1099	int iters = cb->count;
1100	struct timeval start_tv, stop_tv;
1101	int ret;
1102	struct ib_wc wc;
1103	struct ib_send_wr *bad_wr;
1104	int ne;
1105
1106	scnt = 0;
1107	cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ;
1108	cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
1109	cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
1110	cb->rdma_sq_wr.sg_list->length = cb->size;
1111
1112	microtime(&start_tv);
1113	if (!cb->poll) {
1114		cb->state = RDMA_READ_ADV;
1115		ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
1116	}
1117	while (scnt < iters) {
1118
1119		cb->state = RDMA_READ_ADV;
1120		ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr);
1121		if (ret) {
1122			PRINTF(cb,
1123				"Couldn't post send: ret=%d scnt %d\n",
1124				ret, scnt);
1125			return;
1126		}
1127
1128		do {
1129			if (!cb->poll) {
1130				wait_event_interruptible(cb->sem,
1131					cb->state != RDMA_READ_ADV);
1132				if (cb->state == RDMA_READ_COMPLETE) {
1133					ne = 1;
1134					ib_req_notify_cq(cb->cq,
1135						IB_CQ_NEXT_COMP);
1136				} else {
1137					ne = -1;
1138				}
1139			} else
1140				ne = ib_poll_cq(cb->cq, 1, &wc);
1141			if (cb->state == ERROR) {
1142				PRINTF(cb,
1143					"state == ERROR...bailing scnt %d\n",
1144					scnt);
1145				return;
1146			}
1147		} while (ne == 0);
1148
1149		if (ne < 0) {
1150			PRINTF(cb, "poll CQ failed %d\n", ne);
1151			return;
1152		}
1153		if (cb->poll && wc.status != IB_WC_SUCCESS) {
1154			PRINTF(cb, "Completion with error at %s:\n",
1155				cb->server ? "server" : "client");
1156			PRINTF(cb, "Failed status %d: wr_id %d\n",
1157				wc.status, (int) wc.wr_id);
1158			return;
1159		}
1160		++scnt;
1161	}
1162	microtime(&stop_tv);
1163
1164	if (stop_tv.tv_usec < start_tv.tv_usec) {
1165		stop_tv.tv_usec += 1000000;
1166		stop_tv.tv_sec  -= 1;
1167	}
1168
1169	PRINTF(cb, "delta sec %lu delta usec %lu iter %d size %d\n",
1170		stop_tv.tv_sec - start_tv.tv_sec,
1171		stop_tv.tv_usec - start_tv.tv_usec,
1172		scnt, cb->size);
1173}
1174
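/*
 * Write-latency ping/pong: each side RDMA-writes an incrementing sequence
 * byte into the peer's buffer and spins until its own poll buffer shows the
 * next value.  The first cycle_iters iterations also sample TSC cycles
 * around the post and the poll for the summary line.
 */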
1175static void wlat_test(struct krping_cb *cb)
1176{
1177	int ccnt, scnt, rcnt;
1178	int iters=cb->count;
1179	volatile char *poll_buf = (char *) cb->start_buf;
1180	char *buf = (char *)cb->rdma_buf;
1181	struct timeval start_tv, stop_tv;
1182	cycles_t *post_cycles_start, *post_cycles_stop;
1183	cycles_t *poll_cycles_start, *poll_cycles_stop;
1184	cycles_t *last_poll_cycles_start;
1185	cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0;
1186	int i;
1187	int cycle_iters = 1000;
1188
1189	ccnt = 0;
1190	scnt = 0;
1191	rcnt = 0;
1192
1193	post_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1194	if (!post_cycles_start) {
1195		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1196		return;
1197	}
1198	post_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1199	if (!post_cycles_stop) {
1200		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1201		return;
1202	}
1203	poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1204	if (!poll_cycles_start) {
1205		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1206		return;
1207	}
1208	poll_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1209	if (!poll_cycles_stop) {
1210		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1211		return;
1212	}
1213	last_poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t),
1214		GFP_KERNEL);
1215	if (!last_poll_cycles_start) {
1216		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1217		return;
1218	}
1219	cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
1220	cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
1221	cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
1222	cb->rdma_sq_wr.sg_list->length = cb->size;
1223
1224	if (cycle_iters > iters)
1225		cycle_iters = iters;
1226	microtime(&start_tv);
1227	while (scnt < iters || ccnt < iters || rcnt < iters) {
1228
1229		/* Wait till buffer changes. */
1230		if (rcnt < iters && !(scnt < 1 && !cb->server)) {
1231			++rcnt;
1232			while (*poll_buf != (char)rcnt) {
1233				if (cb->state == ERROR) {
1234					PRINTF(cb,
1235						"state = ERROR, bailing\n");
1236					return;
1237				}
1238			}
1239		}
1240
1241		if (scnt < iters) {
1242			struct ib_send_wr *bad_wr;
1243
1244			*buf = (char)scnt+1;
1245			if (scnt < cycle_iters)
1246				post_cycles_start[scnt] = get_cycles();
1247			if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) {
1248				PRINTF(cb,
1249					"Couldn't post send: scnt=%d\n",
1250					scnt);
1251				return;
1252			}
1253			if (scnt < cycle_iters)
1254				post_cycles_stop[scnt] = get_cycles();
1255			scnt++;
1256		}
1257
1258		if (ccnt < iters) {
1259			struct ib_wc wc;
1260			int ne;
1261
1262			if (ccnt < cycle_iters)
1263				poll_cycles_start[ccnt] = get_cycles();
1264			do {
1265				if (ccnt < cycle_iters)
1266					last_poll_cycles_start[ccnt] =
1267						get_cycles();
1268				ne = ib_poll_cq(cb->cq, 1, &wc);
1269			} while (ne == 0);
1270			if (ccnt < cycle_iters)
1271				poll_cycles_stop[ccnt] = get_cycles();
1272			++ccnt;
1273
1274			if (ne < 0) {
1275				PRINTF(cb, "poll CQ failed %d\n", ne);
1276				return;
1277			}
1278			if (wc.status != IB_WC_SUCCESS) {
1279				PRINTF(cb,
1280					"Completion with error at %s:\n",
1281					cb->server ? "server" : "client");
1282				PRINTF(cb,
1283					"Failed status %d: wr_id %d\n",
1284					wc.status, (int) wc.wr_id);
1285				PRINTF(cb,
1286					"scnt=%d, rcnt=%d, ccnt=%d\n",
1287					scnt, rcnt, ccnt);
1288				return;
1289			}
1290		}
1291	}
1292	microtime(&stop_tv);
1293
1294	if (stop_tv.tv_usec < start_tv.tv_usec) {
1295		stop_tv.tv_usec += 1000000;
1296		stop_tv.tv_sec  -= 1;
1297	}
1298
1299	for (i=0; i < cycle_iters; i++) {
1300		sum_post += post_cycles_stop[i] - post_cycles_start[i];
1301		sum_poll += poll_cycles_stop[i] - poll_cycles_start[i];
1302		sum_last_poll += poll_cycles_stop[i]-last_poll_cycles_start[i];
1303	}
1304	PRINTF(cb,
1305		"delta sec %lu delta usec %lu iter %d size %d cycle_iters %d"
1306		" sum_post %llu sum_poll %llu sum_last_poll %llu\n",
1307		stop_tv.tv_sec - start_tv.tv_sec,
1308		stop_tv.tv_usec - start_tv.tv_usec,
1309		scnt, cb->size, cycle_iters,
1310		(unsigned long long)sum_post, (unsigned long long)sum_poll,
1311		(unsigned long long)sum_last_poll);
1312	kfree(post_cycles_start);
1313	kfree(post_cycles_stop);
1314	kfree(poll_cycles_start);
1315	kfree(poll_cycles_stop);
1316	kfree(last_poll_cycles_start);
1317}
1318
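/*
 * Bandwidth test: stream RDMA WRITEs of cb->size bytes, keeping up to
 * cb->txdepth posts outstanding and reaping one completion at a time;
 * the cycle sampling mirrors wlat_test.
 */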
1319static void bw_test(struct krping_cb *cb)
1320{
1321	int ccnt, scnt, rcnt;
1322	int iters=cb->count;
1323	struct timeval start_tv, stop_tv;
1324	cycles_t *post_cycles_start, *post_cycles_stop;
1325	cycles_t *poll_cycles_start, *poll_cycles_stop;
1326	cycles_t *last_poll_cycles_start;
1327	cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0;
1328	int i;
1329	int cycle_iters = 1000;
1330
1331	ccnt = 0;
1332	scnt = 0;
1333	rcnt = 0;
1334
1335	post_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1336	if (!post_cycles_start) {
1337		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1338		return;
1339	}
1340	post_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1341	if (!post_cycles_stop) {
1342		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1343		return;
1344	}
1345	poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1346	if (!poll_cycles_start) {
1347		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1348		return;
1349	}
1350	poll_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
1351	if (!poll_cycles_stop) {
1352		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1353		return;
1354	}
1355	last_poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t),
1356		GFP_KERNEL);
1357	if (!last_poll_cycles_start) {
1358		PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
1359		return;
1360	}
1361	cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
1362	cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
1363	cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
1364	cb->rdma_sq_wr.sg_list->length = cb->size;
1365
1366	if (cycle_iters > iters)
1367		cycle_iters = iters;
1368	microtime(&start_tv);
1369	while (scnt < iters || ccnt < iters) {
1370
1371		while (scnt < iters && scnt - ccnt < cb->txdepth) {
1372			struct ib_send_wr *bad_wr;
1373
1374			if (scnt < cycle_iters)
1375				post_cycles_start[scnt] = get_cycles();
1376			if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) {
1377				PRINTF(cb,
1378					"Couldn't post send: scnt=%d\n",
1379					scnt);
1380				return;
1381			}
1382			if (scnt < cycle_iters)
1383				post_cycles_stop[scnt] = get_cycles();
1384			++scnt;
1385		}
1386
1387		if (ccnt < iters) {
1388			int ne;
1389			struct ib_wc wc;
1390
1391			if (ccnt < cycle_iters)
1392				poll_cycles_start[ccnt] = get_cycles();
1393			do {
1394				if (ccnt < cycle_iters)
1395					last_poll_cycles_start[ccnt] =
1396						get_cycles();
1397				ne = ib_poll_cq(cb->cq, 1, &wc);
1398			} while (ne == 0);
1399			if (ccnt < cycle_iters)
1400				poll_cycles_stop[ccnt] = get_cycles();
1401			ccnt += 1;
1402
1403			if (ne < 0) {
1404				PRINTF(cb, "poll CQ failed %d\n", ne);
1405				return;
1406			}
1407			if (wc.status != IB_WC_SUCCESS) {
1408				PRINTF(cb,
1409					"Completion with error at %s:\n",
1410					cb->server ? "server" : "client");
1411				PRINTF(cb,
1412					"Failed status %d: wr_id %d\n",
1413					wc.status, (int) wc.wr_id);
1414				return;
1415			}
1416		}
1417	}
1418	microtime(&stop_tv);
1419
1420	if (stop_tv.tv_usec < start_tv.tv_usec) {
1421		stop_tv.tv_usec += 1000000;
1422		stop_tv.tv_sec  -= 1;
1423	}
1424
1425	for (i=0; i < cycle_iters; i++) {
1426		sum_post += post_cycles_stop[i] - post_cycles_start[i];
1427		sum_poll += poll_cycles_stop[i] - poll_cycles_start[i];
1428		sum_last_poll += poll_cycles_stop[i]-last_poll_cycles_start[i];
1429	}
1430	PRINTF(cb,
1431		"delta sec %lu delta usec %lu iter %d size %d cycle_iters %d"
1432		" sum_post %llu sum_poll %llu sum_last_poll %llu\n",
1433		stop_tv.tv_sec - start_tv.tv_sec,
1434		stop_tv.tv_usec - start_tv.tv_usec,
1435		scnt, cb->size, cycle_iters,
1436		(unsigned long long)sum_post, (unsigned long long)sum_poll,
1437		(unsigned long long)sum_last_poll);
1438	kfree(post_cycles_start);
1439	kfree(post_cycles_stop);
1440	kfree(poll_cycles_start);
1441	kfree(poll_cycles_stop);
1442	kfree(last_poll_cycles_start);
1443}
1444
1445static void krping_rlat_test_server(struct krping_cb *cb)
1446{
1447	struct ib_send_wr *bad_wr;
1448	struct ib_wc wc;
1449	int ret;
1450
1451	/* Spin waiting for client's Start STAG/TO/Len */
1452	while (cb->state < RDMA_READ_ADV) {
1453		krping_cq_event_handler(cb->cq, cb);
1454	}
1455
1456	/* Send STAG/TO/Len to client */
1457	krping_format_send(cb, cb->start_dma_addr);
1458	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1459	if (ret) {
1460		PRINTF(cb, "post send error %d\n", ret);
1461		return;
1462	}
1463
1464	/* Spin waiting for send completion */
1465	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 0);
1466	if (ret < 0) {
1467		PRINTF(cb, "poll error %d\n", ret);
1468		return;
1469	}
1470	if (wc.status) {
1471		PRINTF(cb, "send completion error %d\n", wc.status);
1472		return;
1473	}
1474
1475	wait_event_interruptible(cb->sem, cb->state == ERROR);
1476}
1477
1478static void krping_wlat_test_server(struct krping_cb *cb)
1479{
1480	struct ib_send_wr *bad_wr;
1481	struct ib_wc wc;
1482	int ret;
1483
1484	/* Spin waiting for client's Start STAG/TO/Len */
1485	while (cb->state < RDMA_READ_ADV) {
1486		krping_cq_event_handler(cb->cq, cb);
1487	}
1488
1489	/* Send STAG/TO/Len to client */
1490	krping_format_send(cb, cb->start_dma_addr);
1491	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1492	if (ret) {
1493		PRINTF(cb, "post send error %d\n", ret);
1494		return;
1495	}
1496
1497	/* Spin waiting for send completion */
1498	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 0);
1499	if (ret < 0) {
1500		PRINTF(cb, "poll error %d\n", ret);
1501		return;
1502	}
1503	if (wc.status) {
1504		PRINTF(cb, "send completion error %d\n", wc.status);
1505		return;
1506	}
1507
1508	wlat_test(cb);
1509	wait_event_interruptible(cb->sem, cb->state == ERROR);
1510}
1511
1512static void krping_bw_test_server(struct krping_cb *cb)
1513{
1514	struct ib_send_wr *bad_wr;
1515	struct ib_wc wc;
1516	int ret;
1517
1518	/* Spin waiting for client's Start STAG/TO/Len */
1519	while (cb->state < RDMA_READ_ADV) {
1520		krping_cq_event_handler(cb->cq, cb);
1521	}
1522
1523	/* Send STAG/TO/Len to client */
1524	krping_format_send(cb, cb->start_dma_addr);
1525	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1526	if (ret) {
1527		PRINTF(cb, "post send error %d\n", ret);
1528		return;
1529	}
1530
1531	/* Spin waiting for send completion */
1532	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 0);
1533	if (ret < 0) {
1534		PRINTF(cb, "poll error %d\n", ret);
1535		return;
1536	}
1537	if (wc.status) {
1538		PRINTF(cb, "send completion error %d\n", wc.status);
1539		return;
1540	}
1541
1542	if (cb->duplex)
1543		bw_test(cb);
1544	wait_event_interruptible(cb->sem, cb->state == ERROR);
1545}
1546
1547static int fastreg_supported(struct krping_cb *cb)
1548{
1549	struct ib_device *dev = cb->child_cm_id->device;
1550	struct ib_device_attr attr;
1551	int ret;
1552
1553	ret = ib_query_device(dev, &attr);
1554	if (ret) {
1555		PRINTF(cb, "ib_query_device failed ret %d\n", ret);
1556		return 0;
1557	}
1558	if (!(attr.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) {
1559		PRINTF(cb, "Fastreg not supported - device_cap_flags 0x%x\n",
1560		    attr.device_cap_flags);
1561		return 0;
1562	}
1563	DEBUG_LOG(cb, "Fastreg supported - device_cap_flags 0x%x\n",
1564		attr.device_cap_flags);
1565	return 1;
1566}
1567
1568static int krping_bind_server(struct krping_cb *cb)
1569{
1570	union {
1571		struct sockaddr_in v4;
1572		struct sockaddr_in6 v6;
1573	} sin;
1574	int ret;
1575
1576	memset(&sin, 0, sizeof(sin));
1577
1578	switch (cb->addr_type) {
1579	case AF_INET:
1580		sin.v4.sin_len = sizeof sin.v4;
1581		sin.v4.sin_family = AF_INET;
1582		sin.v4.sin_addr = cb->addr.v4;
1583		sin.v4.sin_port = cb->port;
1584		break;
1585	case AF_INET6:
1586		sin.v6.sin6_len = sizeof sin.v6;
1587		sin.v6.sin6_family = AF_INET6;
1588		sin.v6.sin6_addr = cb->addr.v6;
1589		sin.v6.sin6_port = cb->port;
1590		break;
1591	default:
1592		return (-EINVAL);
1593	}
1594
1595	ret = rdma_bind_addr(cb->cm_id, (struct sockaddr *) &sin);
1596	if (ret) {
1597		PRINTF(cb, "rdma_bind_addr error %d\n", ret);
1598		return ret;
1599	}
1600	DEBUG_LOG(cb, "rdma_bind_addr successful\n");
1601
1602	DEBUG_LOG(cb, "rdma_listen\n");
1603	ret = rdma_listen(cb->cm_id, 3);
1604	if (ret) {
1605		PRINTF(cb, "rdma_listen failed: %d\n", ret);
1606		return ret;
1607	}
1608
1609	wait_event_interruptible(cb->sem, cb->state >= CONNECT_REQUEST);
1610	if (cb->state != CONNECT_REQUEST) {
1611		PRINTF(cb, "wait for CONNECT_REQUEST state %d\n",
1612			cb->state);
1613		return -1;
1614	}
1615
1616	if (cb->mem == FASTREG && !fastreg_supported(cb))
1617		return -EINVAL;
1618
1619	return 0;
1620}
1621
1622static void krping_run_server(struct krping_cb *cb)
1623{
1624	struct ib_recv_wr *bad_wr;
1625	int ret;
1626
1627	ret = krping_bind_server(cb);
1628	if (ret)
1629		return;
1630
1631	ret = krping_setup_qp(cb, cb->child_cm_id);
1632	if (ret) {
1633		PRINTF(cb, "setup_qp failed: %d\n", ret);
1634		goto err0;
1635	}
1636
1637	ret = krping_setup_buffers(cb);
1638	if (ret) {
1639		PRINTF(cb, "krping_setup_buffers failed: %d\n", ret);
1640		goto err1;
1641	}
1642
1643	ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
1644	if (ret) {
1645		PRINTF(cb, "ib_post_recv failed: %d\n", ret);
1646		goto err2;
1647	}
1648
1649	ret = krping_accept(cb);
1650	if (ret) {
1651		PRINTF(cb, "connect error %d\n", ret);
1652		goto err2;
1653	}
1654
1655	if (cb->wlat)
1656		krping_wlat_test_server(cb);
1657	else if (cb->rlat)
1658		krping_rlat_test_server(cb);
1659	else if (cb->bw)
1660		krping_bw_test_server(cb);
1661	else
1662		krping_test_server(cb);
1663	rdma_disconnect(cb->child_cm_id);
1664err2:
1665	krping_free_buffers(cb);
1666err1:
1667	krping_free_qp(cb);
1668err0:
1669	rdma_destroy_id(cb->child_cm_id);
1670}
1671
1672static void krping_test_client(struct krping_cb *cb)
1673{
1674	int ping, start, cc, i, ret;
1675	struct ib_send_wr *bad_wr;
1676	unsigned char c;
1677
1678	start = 65;
1679	for (ping = 0; !cb->count || ping < cb->count; ping++) {
1680		cb->state = RDMA_READ_ADV;
1681
1682		/* Put some ascii text in the buffer. */
1683		cc = sprintf(cb->start_buf, "rdma-ping-%d: ", ping);
1684		for (i = cc, c = start; i < cb->size; i++) {
1685			cb->start_buf[i] = c;
1686			c++;
1687			if (c > 122)
1688				c = 65;
1689		}
1690		start++;
1691		if (start > 122)
1692			start = 65;
1693		cb->start_buf[cb->size - 1] = 0;
1694
1695		krping_format_send(cb, cb->start_dma_addr);
1696		if (cb->state == ERROR) {
1697			PRINTF(cb, "krping_format_send failed\n");
1698			break;
1699		}
1700		ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1701		if (ret) {
1702			PRINTF(cb, "post send error %d\n", ret);
1703			break;
1704		}
1705
1706		/* Wait for server to ACK */
1707		wait_event_interruptible(cb->sem, cb->state >= RDMA_WRITE_ADV);
1708		if (cb->state != RDMA_WRITE_ADV) {
1709			PRINTF(cb,
1710			       "wait for RDMA_WRITE_ADV state %d\n",
1711			       cb->state);
1712			break;
1713		}
1714
1715		krping_format_send(cb, cb->rdma_dma_addr);
1716		ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1717		if (ret) {
1718			PRINTF(cb, "post send error %d\n", ret);
1719			break;
1720		}
1721
1722		/* Wait for the server to say the RDMA Write is complete. */
1723		wait_event_interruptible(cb->sem,
1724					 cb->state >= RDMA_WRITE_COMPLETE);
1725		if (cb->state != RDMA_WRITE_COMPLETE) {
1726			PRINTF(cb,
1727			       "wait for RDMA_WRITE_COMPLETE state %d\n",
1728			       cb->state);
1729			break;
1730		}
1731
1732		if (cb->validate)
1733			if (memcmp(cb->start_buf, cb->rdma_buf, cb->size)) {
1734				PRINTF(cb, "data mismatch!\n");
1735				break;
1736			}
1737
1738		if (cb->verbose)
1739			PRINTF(cb, "ping data: %s\n", cb->rdma_buf);
1740#ifdef SLOW_KRPING
1741		wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
1742#endif
1743	}
1744}
1745
1746static void krping_rlat_test_client(struct krping_cb *cb)
1747{
1748	struct ib_send_wr *bad_wr;
1749	struct ib_wc wc;
1750	int ret;
1751
1752	cb->state = RDMA_READ_ADV;
1753
1754	/* Send STAG/TO/Len to client */
1755	krping_format_send(cb, cb->start_dma_addr);
1756	if (cb->state == ERROR) {
1757		PRINTF(cb, "krping_format_send failed\n");
1758		return;
1759	}
1760	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1761	if (ret) {
1762		PRINTF(cb, "post send error %d\n", ret);
1763		return;
1764	}
1765
1766	/* Spin waiting for send completion */
1767	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 0);
1768	if (ret < 0) {
1769		PRINTF(cb, "poll error %d\n", ret);
1770		return;
1771	}
1772	if (wc.status) {
1773		PRINTF(cb, "send completion error %d\n", wc.status);
1774		return;
1775	}
1776
1777	/* Spin waiting for server's Start STAG/TO/Len */
1778	while (cb->state < RDMA_WRITE_ADV) {
1779		krping_cq_event_handler(cb->cq, cb);
1780	}
1781
1782#if 0
1783{
1784	int i;
1785	struct timeval start, stop;
1786	time_t sec;
1787	suseconds_t usec;
1788	unsigned long long elapsed;
1789	struct ib_wc wc;
1790	struct ib_send_wr *bad_wr;
1791	int ne;
1792
1793	cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
1794	cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
1795	cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
1796	cb->rdma_sq_wr.sg_list->length = 0;
1797	cb->rdma_sq_wr.num_sge = 0;
1798
1799	microtime(&start);
1800	for (i=0; i < 100000; i++) {
1801		if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) {
1802			PRINTF(cb, "Couldn't post send\n");
1803			return;
1804		}
1805		do {
1806			ne = ib_poll_cq(cb->cq, 1, &wc);
1807		} while (ne == 0);
1808		if (ne < 0) {
1809			PRINTF(cb, "poll CQ failed %d\n", ne);
1810			return;
1811		}
1812		if (wc.status != IB_WC_SUCCESS) {
1813			PRINTF(cb, "Completion with error at %s:\n",
1814				cb->server ? "server" : "client");
1815			PRINTF(cb, "Failed status %d: wr_id %d\n",
1816				wc.status, (int) wc.wr_id);
1817			return;
1818		}
1819	}
1820	microtime(&stop);
1821
1822	if (stop.tv_usec < start.tv_usec) {
1823		stop.tv_usec += 1000000;
1824		stop.tv_sec  -= 1;
1825	}
1826	sec     = stop.tv_sec - start.tv_sec;
1827	usec    = stop.tv_usec - start.tv_usec;
1828	elapsed = sec * 1000000 + usec;
1829	PRINTF(cb, "0B-write-lat iters 100000 usec %llu\n", elapsed);
1830}
1831#endif
1832
1833	rlat_test(cb);
1834}
1835
1836static void krping_wlat_test_client(struct krping_cb *cb)
1837{
1838	struct ib_send_wr *bad_wr;
1839	struct ib_wc wc;
1840	int ret;
1841
1842	cb->state = RDMA_READ_ADV;
1843
1844	/* Send STAG/TO/Len to client */
1845	krping_format_send(cb, cb->start_dma_addr);
1846	if (cb->state == ERROR) {
1847		PRINTF(cb, "krping_format_send failed\n");
1848		return;
1849	}
1850	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1851	if (ret) {
1852		PRINTF(cb, "post send error %d\n", ret);
1853		return;
1854	}
1855
1856	/* Spin waiting for send completion */
1857	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 0);
1858	if (ret < 0) {
1859		PRINTF(cb, "poll error %d\n", ret);
1860		return;
1861	}
1862	if (wc.status) {
1863		PRINTF(cb, "send completion error %d\n", wc.status);
1864		return;
1865	}
1866
1867	/* Spin waiting for server's Start STAG/TO/Len */
1868	while (cb->state < RDMA_WRITE_ADV) {
1869		krping_cq_event_handler(cb->cq, cb);
1870	}
1871
1872	wlat_test(cb);
1873}
1874
1875static void krping_bw_test_client(struct krping_cb *cb)
1876{
1877	struct ib_send_wr *bad_wr;
1878	struct ib_wc wc;
1879	int ret;
1880
1881	cb->state = RDMA_READ_ADV;
1882
1883	/* Send STAG/TO/Len to client */
1884	krping_format_send(cb, cb->start_dma_addr);
1885	if (cb->state == ERROR) {
1886		PRINTF(cb, "krping_format_send failed\n");
1887		return;
1888	}
1889	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1890	if (ret) {
1891		PRINTF(cb, "post send error %d\n", ret);
1892		return;
1893	}
1894
1895	/* Spin waiting for send completion */
1896	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 0);
1897	if (ret < 0) {
1898		PRINTF(cb, "poll error %d\n", ret);
1899		return;
1900	}
1901	if (wc.status) {
1902		PRINTF(cb, "send completion error %d\n", wc.status);
1903		return;
1904	}
1905
1906	/* Spin waiting for server's Start STAG/TO/Len */
1907	while (cb->state < RDMA_WRITE_ADV) {
1908		krping_cq_event_handler(cb->cq, cb);
1909	}
1910
1911	bw_test(cb);
1912}
1913
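/*
 * Fast-register stress test: repeatedly post FAST_REG_MR + LOCAL_INV pairs
 * with a random length (and matching page-list length), keeping up to
 * txdepth/2 in flight, until the run is interrupted.
 */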
1914static void krping_fr_test(struct krping_cb *cb)
1915{
1916	struct ib_fast_reg_page_list *pl;
1917	struct ib_send_wr fr, inv, *bad;
1918	struct ib_wc wc;
1919	u8 key = 0;
1920	struct ib_mr *mr;
1921	int i;
1922	int ret;
1923	int size = cb->size;
1924	int plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT;
1925	time_t start;
1926	int count = 0;
1927	int scnt = 0;
1928
1929	pl = ib_alloc_fast_reg_page_list(cb->qp->device, plen);
1930	if (IS_ERR(pl)) {
1931		PRINTF(cb, "ib_alloc_fast_reg_page_list failed %ld\n", PTR_ERR(pl));
1932		return;
1933	}
1934
1935	mr = ib_alloc_fast_reg_mr(cb->pd, plen);
1936	if (IS_ERR(mr)) {
1937		PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(mr));
1938		goto err1;
1939	}
1940
1941	for (i=0; i<plen; i++)
1942		pl->page_list[i] = 0xcafebabe | i;
1943
1944	memset(&fr, 0, sizeof fr);
1945	fr.opcode = IB_WR_FAST_REG_MR;
1946	fr.wr.fast_reg.page_shift = PAGE_SHIFT;
1947	fr.wr.fast_reg.length = size;
1948	fr.wr.fast_reg.page_list = pl;
1949	fr.wr.fast_reg.page_list_len = plen;
1950	fr.wr.fast_reg.iova_start = 0;
1951	fr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE;
1952	fr.next = &inv;
1953	memset(&inv, 0, sizeof inv);
1954	inv.opcode = IB_WR_LOCAL_INV;
1955	inv.send_flags = IB_SEND_SIGNALED;
1956
1957	DEBUG_LOG(cb, "fr_test: stag index 0x%x plen %u size %u depth %u\n", mr->rkey >> 8, plen, cb->size, cb->txdepth);
1958	start = time_uptime;
1959	while (1) {
1960		if ((time_uptime - start) >= 9) {
1961			DEBUG_LOG(cb, "fr_test: waiting! count %u latest size %u plen %u\n", count, size, plen);
1962			wait_event_interruptible(cb->sem, cb->state == ERROR);
1963			if (cb->state == ERROR)
1964				break;
1965			start = time_uptime;
1966		}
1967		while (scnt < (cb->txdepth>>1)) {
1968			ib_update_fast_reg_key(mr, ++key);
1969			fr.wr.fast_reg.rkey = mr->rkey;
1970			inv.ex.invalidate_rkey = mr->rkey;
1971			size = arc4random() % cb->size;
1972			if (size == 0)
1973				size = cb->size;
1974			plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT;
1975			fr.wr.fast_reg.length = size;
1976			fr.wr.fast_reg.page_list_len = plen;
1977			ret = ib_post_send(cb->qp, &fr, &bad);
1978			if (ret) {
1979				PRINTF(cb, "ib_post_send failed %d\n", ret);
1980				goto err2;
1981			}
1982			scnt++;
1983		}
1984
1985		do {
1986			ret = ib_poll_cq(cb->cq, 1, &wc);
1987			if (ret < 0) {
1988				PRINTF(cb, "ib_poll_cq failed %d\n", ret);
1989				goto err2;
1990			}
1991			if (ret == 1) {
1992				if (wc.status) {
1993					PRINTF(cb, "completion error %u\n", wc.status);
1994					goto err2;
1995				}
1996				count++;
1997				scnt--;
1998			}
1999			else if (krping_sigpending()) {
2000				PRINTF(cb, "signal!\n");
2001				goto err2;
2002			}
2003		} while (ret == 1);
2004	}
2005err2:
2006#if 0
2007	DEBUG_LOG(cb, "sleeping 1 second\n");
2008	wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
2009#endif
2010	DEBUG_LOG(cb, "draining the cq...\n");
2011	do {
2012		ret = ib_poll_cq(cb->cq, 1, &wc);
2013		if (ret < 0) {
2014			PRINTF(cb, "ib_poll_cq failed %d\n", ret);
2015			break;
2016		}
2017		if (ret == 1) {
2018			if (wc.status) {
2019				PRINTF(cb, "completion error %u opcode %u\n", wc.status, wc.opcode);
2020			}
2021		}
2022	} while (ret == 1);
2023	DEBUG_LOG(cb, "fr_test: done!\n");
2024	ib_dereg_mr(mr);
2025err1:
2026	ib_free_fast_reg_page_list(pl);
2027}
2028
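/*
 * Active-side connect: issue rdma_connect() and wait for the CM event
 * handler to advance the state to CONNECTED (or flag an error).
 */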
2029static int krping_connect_client(struct krping_cb *cb)
2030{
2031	struct rdma_conn_param conn_param;
2032	int ret;
2033
2034	memset(&conn_param, 0, sizeof conn_param);
2035	conn_param.responder_resources = 1;
2036	conn_param.initiator_depth = 1;
2037	conn_param.retry_count = 10;
2038
2039	ret = rdma_connect(cb->cm_id, &conn_param);
2040	if (ret) {
2041		PRINTF(cb, "rdma_connect error %d\n", ret);
2042		return ret;
2043	}
2044
2045	wait_event_interruptible(cb->sem, cb->state >= CONNECTED);
2046	if (cb->state == ERROR) {
2047		PRINTF(cb, "wait for CONNECTED state %d\n", cb->state);
2048		return -1;
2049	}
2050
2051	DEBUG_LOG(cb, "rdma_connect successful\n");
2052	return 0;
2053}
2054
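/*
 * Resolve the server's address and route through the RDMA CM and, when
 * fastreg memory mode was requested, verify the device supports it.
 */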
2055static int krping_bind_client(struct krping_cb *cb)
2056{
2057	union {
2058		struct sockaddr_in v4;
2059		struct sockaddr_in6 v6;
2060	} sin;
2061	int ret;
2062
2063	memset(&sin, 0, sizeof(sin));
2064
2065	switch (cb->addr_type) {
2066	case AF_INET:
2067		sin.v4.sin_len = sizeof sin.v4;
2068		sin.v4.sin_family = AF_INET;
2069		sin.v4.sin_addr = cb->addr.v4;
2070		sin.v4.sin_port = cb->port;
2071		break;
2072	case AF_INET6:
2073		sin.v6.sin6_len = sizeof sin.v6;
2074		sin.v6.sin6_family = AF_INET6;
2075		sin.v6.sin6_addr = cb->addr.v6;
2076		sin.v6.sin6_port = cb->port;
2077		break;
2078	default:
2079		return (-EINVAL);
2080	}
2081
2082	ret = rdma_resolve_addr(cb->cm_id, NULL, (struct sockaddr *) &sin,
2083				2000);
2084	if (ret) {
2085		PRINTF(cb, "rdma_resolve_addr error %d\n", ret);
2086		return ret;
2087	}
2088
2089	wait_event_interruptible(cb->sem, cb->state >= ROUTE_RESOLVED);
2090	if (cb->state != ROUTE_RESOLVED) {
2091		PRINTF(cb,
2092		       "addr/route resolution failed: state %d\n",
2093		       cb->state);
2094		return -EINTR;
2095	}
2096
2097	if (cb->mem == FASTREG && !fastreg_supported(cb))
2098		return -EINVAL;
2099
2100	DEBUG_LOG(cb, "rdma_resolve_addr - rdma_resolve_route successful\n");
2101	return 0;
2102}
2103
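/*
 * Client side of a run: bind/resolve, create the QP and buffers,
 * pre-post one receive, connect, dispatch the selected test (ping,
 * wlat, rlat, bw, or fr), then disconnect and tear down.
 */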
2104static void krping_run_client(struct krping_cb *cb)
2105{
2106	struct ib_recv_wr *bad_wr;
2107	int ret;
2108
2109	ret = krping_bind_client(cb);
2110	if (ret)
2111		return;
2112
2113	ret = krping_setup_qp(cb, cb->cm_id);
2114	if (ret) {
2115		PRINTF(cb, "setup_qp failed: %d\n", ret);
2116		return;
2117	}
2118
2119	ret = krping_setup_buffers(cb);
2120	if (ret) {
2121		PRINTF(cb, "krping_setup_buffers failed: %d\n", ret);
2122		goto err1;
2123	}
2124
2125	ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
2126	if (ret) {
2127		PRINTF(cb, "ib_post_recv failed: %d\n", ret);
2128		goto err2;
2129	}
2130
2131	ret = krping_connect_client(cb);
2132	if (ret) {
2133		PRINTF(cb, "connect error %d\n", ret);
2134		goto err2;
2135	}
2136
2137	if (cb->wlat)
2138		krping_wlat_test_client(cb);
2139	else if (cb->rlat)
2140		krping_rlat_test_client(cb);
2141	else if (cb->bw)
2142		krping_bw_test_client(cb);
2143	else if (cb->frtest)
2144		krping_fr_test(cb);
2145	else
2146		krping_test_client(cb);
2147	rdma_disconnect(cb->cm_id);
2148err2:
2149	krping_free_buffers(cb);
2150err1:
2151	krping_free_qp(cb);
2152}
2153
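/*
 * Map an interface name to its if_index, which is used as the IPv6
 * link-local scope (zone) ID.  Returns 0 if no name was given or the
 * interface does not exist.
 */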
2154static uint16_t
2155krping_get_ipv6_scope_id(char *name)
2156{
2157	struct ifnet *ifp;
2158	uint16_t retval;
2159
2160	if (name == NULL)
2161		return (0);
2162	ifp = ifunit_ref(name);
2163	if (ifp == NULL)
2164		return (0);
2165	retval = ifp->if_index;
2166	if_rele(ifp);
2167	return (retval);
2168}
2169
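/*
 * Entry point for one krping run: parse the caller-supplied option
 * string, validate the option combination, create the RDMA CM ID, and
 * run either the server or the client side to completion.
 *
 * Illustrative command strings (the exact control node this module is
 * wired to is not shown here), assuming the usual krping syntax of
 * comma-separated key=value options:
 *
 *	server,addr=192.168.0.1,port=9999
 *	client,addr=192.168.0.1,port=9999,count=100,verbose
 */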
2170int krping_doit(char *cmd, void *cookie)
2171{
2172	struct krping_cb *cb;
2173	int op;
2174	int ret = 0;
2175	char *optarg;
2176	char *scope;
2177	unsigned long optint;
2178
2179	cb = kzalloc(sizeof(*cb), GFP_KERNEL);
2180	if (!cb)
2181		return -ENOMEM;
2182
2183	mutex_lock(&krping_mutex);
2184	list_add_tail(&cb->list, &krping_cbs);
2185	mutex_unlock(&krping_mutex);
2186
2187	cb->cookie = cookie;
2188	cb->server = -1;
2189	cb->state = IDLE;
2190	cb->size = 64;
2191	cb->txdepth = RPING_SQ_DEPTH;
2192	cb->mem = DMA;
2193	cb->addr_type = AF_INET;
2194	init_waitqueue_head(&cb->sem);
2195
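	/*
	 * Parse the option string; any unknown or malformed option sets
	 * ret and aborts after the loop.
	 */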
2196	while ((op = krping_getopt("krping", &cmd, krping_opts, NULL, &optarg,
2197			      &optint)) != 0) {
2198		switch (op) {
2199		case 'a':
2200			cb->addr_str = optarg;
2201			cb->addr_type = AF_INET;
2202			DEBUG_LOG(cb, "ipv4addr (%s)\n", optarg);
2203			if (inet_pton(AF_INET, optarg, &cb->addr) != 1) {
2204				PRINTF(cb, "bad addr string %s\n",
2205				    optarg);
2206				ret = -EINVAL;
2207			}
2208			break;
2209		case 'A':
2210			cb->addr_str = optarg;
2211			cb->addr_type = AF_INET6;
2212			DEBUG_LOG(cb, "ipv6addr (%s)\n", optarg);
2213			scope = strstr(optarg, "%");
2214			/* extract scope ID, if any */
2215			if (scope != NULL)
2216				*scope++ = 0;
2217			/* extract IPv6 network address */
2218			if (inet_pton(AF_INET6, optarg, &cb->addr) != 1) {
2219				PRINTF(cb, "bad addr string %s\n",
2220				    optarg);
2221				ret = -EINVAL;
2222			} else if (IN6_IS_SCOPE_LINKLOCAL(&cb->addr.v6) ||
2223			    IN6_IS_ADDR_MC_INTFACELOCAL(&cb->addr.v6)) {
2224				uint16_t scope_id = krping_get_ipv6_scope_id(scope);
2225				DEBUG_LOG(cb, "ipv6 scope ID = %d\n", scope_id);
2226				cb->addr.v6.s6_addr[2] = scope_id >> 8;
2227				cb->addr.v6.s6_addr[3] = scope_id & 0xFF;
2228			}
2229			break;
2230		case 'p':
2231			cb->port = htons(optint);
2232			DEBUG_LOG(cb, "port %d\n", (int)optint);
2233			break;
2234		case 'P':
2235			cb->poll = 1;
2236			DEBUG_LOG(cb, "poll\n");
2237			break;
2238		case 's':
2239			cb->server = 1;
2240			DEBUG_LOG(cb, "server\n");
2241			break;
2242		case 'c':
2243			cb->server = 0;
2244			DEBUG_LOG(cb, "client\n");
2245			break;
2246		case 'S':
2247			cb->size = optint;
2248			if ((cb->size < 1) ||
2249			    (cb->size > RPING_BUFSIZE)) {
2250				PRINTF(cb, "Invalid size %d "
2251				       "(valid range is 1 to %d)\n",
2252				       cb->size, RPING_BUFSIZE);
2253				ret = -EINVAL;
2254			} else
2255				DEBUG_LOG(cb, "size %d\n", (int)optint);
2256			break;
2257		case 'C':
2258			cb->count = optint;
2259			if (cb->count < 0) {
2260				PRINTF(cb, "Invalid count %d\n",
2261					cb->count);
2262				ret = -EINVAL;
2263			} else
2264				DEBUG_LOG(cb, "count %d\n", (int) cb->count);
2265			break;
2266		case 'v':
2267			cb->verbose++;
2268			DEBUG_LOG(cb, "verbose\n");
2269			break;
2270		case 'V':
2271			cb->validate++;
2272			DEBUG_LOG(cb, "validate data\n");
2273			break;
2274		case 'l':
2275			cb->wlat++;
2276			break;
2277		case 'L':
2278			cb->rlat++;
2279			break;
2280		case 'B':
2281			cb->bw++;
2282			break;
2283		case 'd':
2284			cb->duplex++;
2285			break;
2286		case 'm':
2287			if (!strncmp(optarg, "dma", 3))
2288				cb->mem = DMA;
2289			else if (!strncmp(optarg, "fastreg", 7))
2290				cb->mem = FASTREG;
2291			else if (!strncmp(optarg, "mw", 2))
2292				cb->mem = MW;
2293			else if (!strncmp(optarg, "mr", 2))
2294				cb->mem = MR;
2295			else {
2296				PRINTF(cb, "unknown mem mode %s.  "
2297					"Must be dma, fastreg, mw, or mr\n",
2298					optarg);
2299				ret = -EINVAL;
2300				break;
2301			}
2302			break;
2303		case 'I':
2304			cb->server_invalidate = 1;
2305			break;
2306		case 'T':
2307			cb->txdepth = optint;
2308			DEBUG_LOG(cb, "txdepth %d\n", (int) cb->txdepth);
2309			break;
2310		case 'Z':
2311			cb->local_dma_lkey = 1;
2312			DEBUG_LOG(cb, "using local dma lkey\n");
2313			break;
2314		case 'R':
2315			cb->read_inv = 1;
2316			DEBUG_LOG(cb, "using read-with-inv\n");
2317			break;
2318		case 'f':
2319			cb->frtest = 1;
2320			DEBUG_LOG(cb, "fast-reg test!\n");
2321			break;
2322		default:
2323			PRINTF(cb, "unknown opt %s\n", optarg);
2324			ret = -EINVAL;
2325			break;
2326		}
2327	}
2328	if (ret)
2329		goto out;
2330
2331	if (cb->server == -1) {
2332		PRINTF(cb, "must be either client or server\n");
2333		ret = -EINVAL;
2334		goto out;
2335	}
2336
2337	if (cb->server && cb->frtest) {
2338		PRINTF(cb, "must be client to run frtest\n");
2339		ret = -EINVAL;
2340		goto out;
2341	}
2342
2343	if ((cb->frtest + cb->bw + cb->rlat + cb->wlat) > 1) {
2344		PRINTF(cb, "Pick only one test: fr, bw, rlat, wlat\n");
2345		ret = -EINVAL;
2346		goto out;
2347	}
2348
2349	if (cb->server_invalidate && cb->mem != FASTREG) {
2350		PRINTF(cb, "server_invalidate only valid with fastreg mem_mode\n");
2351		ret = -EINVAL;
2352		goto out;
2353	}
2354
2355	if (cb->read_inv && cb->mem != FASTREG) {
2356		PRINTF(cb, "read_inv only valid with fastreg mem_mode\n");
2357		ret = -EINVAL;
2358		goto out;
2359	}
2360
2361	if (cb->mem != MR && (cb->wlat || cb->rlat || cb->bw)) {
2362		PRINTF(cb, "wlat, rlat, and bw tests only support mem_mode MR\n");
2363		ret = -EINVAL;
2364		goto out;
2365	}
2366
2367	cb->cm_id = rdma_create_id(krping_cma_event_handler, cb, RDMA_PS_TCP);
2368	if (IS_ERR(cb->cm_id)) {
2369		ret = PTR_ERR(cb->cm_id);
2370		PRINTF(cb, "rdma_create_id error %d\n", ret);
2371		goto out;
2372	}
2373	DEBUG_LOG(cb, "created cm_id %p\n", cb->cm_id);
2374
2375	if (cb->server)
2376		krping_run_server(cb);
2377	else
2378		krping_run_client(cb);
2379
2380	DEBUG_LOG(cb, "destroy cm_id %p\n", cb->cm_id);
2381	rdma_destroy_id(cb->cm_id);
2382out:
2383	mutex_lock(&krping_mutex);
2384	list_del(&cb->list);
2385	mutex_unlock(&krping_mutex);
2386	kfree(cb);
2387	return ret;
2388}
2389
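/*
 * Invoke 'f' on every krping run in the global list while holding
 * krping_mutex.  Runs that have not finished setup (no PD allocated
 * yet) are passed a NULL stats pointer.
 */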
2390void
2391krping_walk_cb_list(void (*f)(struct krping_stats *, void *), void *arg)
2392{
2393	struct krping_cb *cb;
2394
2395	mutex_lock(&krping_mutex);
2396	list_for_each_entry(cb, &krping_cbs, list)
2397	    (*f)(cb->pd ? &cb->stats : NULL, arg);
2398	mutex_unlock(&krping_mutex);
2399}
2400
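/*
 * Module initialization: set up the mutex that protects the global
 * list of running krping threads.
 */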
2401void krping_init(void)
2402{
2403
2404	mutex_init(&krping_mutex);
2405}
2406