1/*
2 * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
3 * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved.
4 *
5 * This software is available to you under a choice of one of two
6 * licenses.  You may choose to be licensed under the terms of the GNU
7 * General Public License (GPL) Version 2, available from the file
8 * COPYING in the main directory of this source tree, or the
9 * OpenIB.org BSD license below:
10 *
11 *     Redistribution and use in source and binary forms, with or
12 *     without modification, are permitted provided that the following
13 *     conditions are met:
14 *
15 *      - Redistributions of source code must retain the above
16 *        copyright notice, this list of conditions and the following
17 *        disclaimer.
18 *
19 *      - Redistributions in binary form must reproduce the above
20 *        copyright notice, this list of conditions and the following
21 *        disclaimer in the documentation and/or other materials
22 *        provided with the distribution.
23 *
24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31 * SOFTWARE.
32 */
33
34#include <sys/cdefs.h>
35__FBSDID("$FreeBSD$");
36
37#include <sys/ctype.h>
38
39#include <sys/param.h>
40#include <sys/condvar.h>
41#include <sys/systm.h>
42#include <sys/kernel.h>
43#include <sys/socket.h>
44#include <sys/endian.h>
45#include <sys/limits.h>
46#include <sys/proc.h>
47#include <sys/signalvar.h>
48
49#include <sys/lock.h>
50#include <sys/mutex.h>
51#include <sys/rwlock.h>
52#include <sys/queue.h>
53#include <sys/taskqueue.h>
54#include <sys/syslog.h>
55#include <netinet/in.h>
56
57#include <vm/vm.h>
58#include <vm/pmap.h>
59
60#include <linux/types.h>
61#include <rdma/rdma_cm.h>
62
63#include "getopt.h"
64#include "krping.h"
65
/* Prefix prepended to krping debug messages. */
#define PFX "krping: "

/* Non-zero enables DEBUG_LOG output. */
static int debug = 0;

/*
 * Print a printf-style debug message only when 'debug' is non-zero.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement.  A bare "if (debug) printf" would silently capture the
 * else branch of any unbraced if/else the macro is used in.
 */
#define DEBUG_LOG(...) do {						\
	if (debug)							\
		printf(__VA_ARGS__);					\
} while (0)
70
71static const struct krping_option krping_opts[] = {
72	{"count", OPT_INT, 'C'},
73	{"size", OPT_INT, 'S'},
74	{"addr", OPT_STRING, 'a'},
75	{"port", OPT_INT, 'p'},
76	{"verbose", OPT_NOPARAM, 'v'},
77	{"validate", OPT_NOPARAM, 'V'},
78	{"server", OPT_NOPARAM, 's'},
79	{"client", OPT_NOPARAM, 'c'},
80	{"dmamr", OPT_NOPARAM, 'D'},
81	{"debug", OPT_NOPARAM, 'd'},
82	{"wlat", OPT_NOPARAM, 'l'},
83	{"rlat", OPT_NOPARAM, 'L'},
84	{"bw", OPT_NOPARAM, 'B'},
85	{"tx-depth", OPT_INT, 't'},
86  	{"poll", OPT_NOPARAM, 'P'},
87  	{"memlimit", OPT_INT, 'm'},
88	{NULL, 0, 0}
89};
90
/*
 * Module-wide krping mutex.
 * NOTE(review): what it protects is not visible in this part of the file
 * (presumably the krping_cbs list below) -- confirm against its users.
 */
struct mtx krping_mutex;

/*
 * List of running krping threads.
 */
struct krping_cb_list krping_cbs;
97
98/*
99 * krping "ping/pong" loop:
100 * 	client sends source rkey/addr/len
 *	server receives source rkey/addr/len
102 *	server rdma reads "ping" data from source
103 * 	server sends "go ahead" on rdma read completion
104 *	client sends sink rkey/addr/len
105 * 	server receives sink rkey/addr/len
106 * 	server rdma writes "pong" data to sink
107 * 	server sends "go ahead" on rdma write completion
108 * 	<repeat loop>
109 */
110
/*
 * Default max buffer size for IO...
 *
 * The value is parenthesized so the macro behaves as a single operand when
 * it is used inside a larger expression (e.g. "x / RPING_BUFSIZE").
 */
#define RPING_BUFSIZE (128 * 1024)
#define RPING_SQ_DEPTH 32
116
117static void krping_wait(struct krping_cb *cb, int state)
118{
119	int rc;
120	mtx_lock(&cb->lock);
121	while (cb->state < state) {
122		rc = msleep(cb, &cb->lock, 0, "krping", 0);
123		if (rc && rc != ERESTART) {
124			cb->state = ERROR;
125			break;
126		}
127	}
128	mtx_unlock(&cb->lock);
129}
130
131static int krping_cma_event_handler(struct rdma_cm_id *cma_id,
132				   struct rdma_cm_event *event)
133{
134	int ret;
135	struct krping_cb *cb = cma_id->context;
136
137	DEBUG_LOG(PFX "cma_event type %d cma_id %p (%s)\n", event->event, cma_id,
138		  (cma_id == cb->cm_id) ? "parent" : "child");
139
140	mtx_lock(&cb->lock);
141	switch (event->event) {
142	case RDMA_CM_EVENT_ADDR_RESOLVED:
143		cb->state = ADDR_RESOLVED;
144		ret = rdma_resolve_route(cma_id, 2000);
145		if (ret) {
146			log(LOG_ERR, "rdma_resolve_route error %d\n",
147			       ret);
148			wakeup(cb);
149		}
150		break;
151
152	case RDMA_CM_EVENT_ROUTE_RESOLVED:
153		cb->state = ROUTE_RESOLVED;
154		wakeup(cb);
155		break;
156
157	case RDMA_CM_EVENT_CONNECT_REQUEST:
158		cb->state = CONNECT_REQUEST;
159		cb->child_cm_id = cma_id;
160		DEBUG_LOG(PFX "child cma %p\n", cb->child_cm_id);
161		wakeup(cb);
162		break;
163
164	case RDMA_CM_EVENT_ESTABLISHED:
165		DEBUG_LOG(PFX "ESTABLISHED\n");
166		if (!cb->server) {
167			cb->state = CONNECTED;
168			wakeup(cb);
169		}
170		break;
171
172	case RDMA_CM_EVENT_ADDR_ERROR:
173	case RDMA_CM_EVENT_ROUTE_ERROR:
174	case RDMA_CM_EVENT_CONNECT_ERROR:
175	case RDMA_CM_EVENT_UNREACHABLE:
176	case RDMA_CM_EVENT_REJECTED:
177		log(LOG_ERR, "cma event %d, error %d\n", event->event,
178		       event->status);
179		cb->state = ERROR;
180		wakeup(cb);
181		break;
182
183	case RDMA_CM_EVENT_DISCONNECTED:
184		DEBUG_LOG(PFX "DISCONNECT EVENT...\n");
185		cb->state = ERROR;
186		wakeup(cb);
187		break;
188
189	case RDMA_CM_EVENT_DEVICE_REMOVAL:
190		DEBUG_LOG(PFX "cma detected device removal!!!!\n");
191		break;
192
193	default:
194		log(LOG_ERR, "oof bad type!\n");
195		wakeup(cb);
196		break;
197	}
198	mtx_unlock(&cb->lock);
199	return 0;
200}
201
202static int server_recv(struct krping_cb *cb, struct ib_wc *wc)
203{
204	if (wc->byte_len != sizeof(cb->recv_buf)) {
205		log(LOG_ERR, "Received bogus data, size %d\n",
206		       wc->byte_len);
207		return -1;
208	}
209
210	cb->remote_rkey = ntohl(cb->recv_buf.rkey);
211	cb->remote_addr = ntohll(cb->recv_buf.buf);
212	cb->remote_len  = ntohl(cb->recv_buf.size);
213	DEBUG_LOG(PFX "Received rkey %x addr %llx len %d from peer\n",
214		  cb->remote_rkey, (unsigned long long)cb->remote_addr,
215		  cb->remote_len);
216
217	if (cb->state <= CONNECTED || cb->state == RDMA_WRITE_COMPLETE)
218		cb->state = RDMA_READ_ADV;
219	else
220		cb->state = RDMA_WRITE_ADV;
221
222	return 0;
223}
224
225static int client_recv(struct krping_cb *cb, struct ib_wc *wc)
226{
227	if (wc->byte_len != sizeof(cb->recv_buf)) {
228		log(LOG_ERR, "Received bogus data, size %d\n",
229		       wc->byte_len);
230		return -1;
231	}
232
233	if (cb->state == RDMA_READ_ADV)
234		cb->state = RDMA_WRITE_ADV;
235	else
236		cb->state = RDMA_WRITE_COMPLETE;
237
238	return 0;
239}
240
241static void krping_cq_event_handler(struct ib_cq *cq, void *ctx)
242{
243	struct krping_cb *cb = ctx;
244	struct ib_wc wc;
245	struct ib_recv_wr *bad_wr;
246	int ret;
247
248	mtx_lock(&cb->lock);
249	KASSERT(cb->cq == cq, ("bad condition"));
250	if (cb->state == ERROR) {
251		log(LOG_ERR,  "cq completion in ERROR state\n");
252		mtx_unlock(&cb->lock);
253		return;
254	}
255	if (!cb->wlat && !cb->rlat && !cb->bw)
256		ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
257	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 1) {
258		if (wc.status) {
259			if (wc.status == IB_WC_WR_FLUSH_ERR) {
260				DEBUG_LOG("cq flushed\n");
261				continue;
262			} else {
263				log(LOG_CRIT, "cq completion failed status %d\n",
264					wc.status);
265				goto error;
266			}
267		}
268
269		switch (wc.opcode) {
270		case IB_WC_SEND:
271			DEBUG_LOG(PFX "send completion\n");
272			cb->stats.send_bytes += cb->send_sgl.length;
273			cb->stats.send_msgs++;
274			break;
275
276		case IB_WC_RDMA_WRITE:
277			DEBUG_LOG(PFX "rdma write completion\n");
278			cb->stats.write_bytes += cb->rdma_sq_wr.sg_list->length;
279			cb->stats.write_msgs++;
280			cb->state = RDMA_WRITE_COMPLETE;
281			wakeup(cb);
282			break;
283
284		case IB_WC_RDMA_READ:
285			DEBUG_LOG(PFX "rdma read completion\n");
286			cb->stats.read_bytes += cb->rdma_sq_wr.sg_list->length;
287			cb->stats.read_msgs++;
288			cb->state = RDMA_READ_COMPLETE;
289			wakeup(cb);
290			break;
291
292		case IB_WC_RECV:
293			DEBUG_LOG(PFX "recv completion\n");
294			cb->stats.recv_bytes += sizeof(cb->recv_buf);
295			cb->stats.recv_msgs++;
296			if (cb->wlat || cb->rlat || cb->bw)
297				ret = server_recv(cb, &wc);
298			else
299				ret = cb->server ? server_recv(cb, &wc) :
300					   client_recv(cb, &wc);
301			if (ret) {
302				log(LOG_ERR, "recv wc error: %d\n", ret);
303				goto error;
304			}
305
306			ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
307			if (ret) {
308				log(LOG_ERR, "post recv error: %d\n",
309				       ret);
310				goto error;
311			}
312			wakeup(cb);
313			break;
314
315		default:
316			log(LOG_ERR, "unknown!!!!! completion\n");
317			goto error;
318		}
319	}
320	if (ret) {
321		log(LOG_ERR, "poll error %d\n", ret);
322		goto error;
323	}
324	mtx_unlock(&cb->lock);
325	return;
326error:
327	cb->state = ERROR;
328	wakeup(cb);
329	mtx_unlock(&cb->lock);
330}
331
332static int krping_accept(struct krping_cb *cb)
333{
334	struct rdma_conn_param conn_param;
335	int ret;
336
337	DEBUG_LOG(PFX "accepting client connection request\n");
338
339	memset(&conn_param, 0, sizeof conn_param);
340	conn_param.responder_resources = 1;
341	conn_param.initiator_depth = 1;
342
343	ret = rdma_accept(cb->child_cm_id, &conn_param);
344	if (ret) {
345		log(LOG_ERR, "rdma_accept error: %d\n", ret);
346		return ret;
347	}
348
349	if (!cb->wlat && !cb->rlat && !cb->bw) {
350		krping_wait(cb, CONNECTED);
351		if (cb->state == ERROR) {
352			log(LOG_ERR,  "wait for CONNECTED state %d\n", cb->state);
353			return -1;
354		}
355	}
356	return 0;
357}
358
359static void krping_setup_wr(struct krping_cb *cb)
360{
361	/* XXX X86 only here... not mapping for dma! */
362	cb->recv_sgl.addr = vtophys(&cb->recv_buf);
363	cb->recv_sgl.length = sizeof cb->recv_buf;
364	if (cb->use_dmamr)
365		cb->recv_sgl.lkey = cb->dma_mr->lkey;
366	else
367		cb->recv_sgl.lkey = cb->recv_mr->lkey;
368	cb->rq_wr.sg_list = &cb->recv_sgl;
369	cb->rq_wr.num_sge = 1;
370
371	cb->send_sgl.addr = vtophys(&cb->send_buf);
372	cb->send_sgl.length = sizeof cb->send_buf;
373	if (cb->use_dmamr)
374		cb->send_sgl.lkey = cb->dma_mr->lkey;
375	else
376		cb->send_sgl.lkey = cb->send_mr->lkey;
377
378	cb->sq_wr.opcode = IB_WR_SEND;
379	cb->sq_wr.send_flags = IB_SEND_SIGNALED;
380	cb->sq_wr.sg_list = &cb->send_sgl;
381	cb->sq_wr.num_sge = 1;
382
383	cb->rdma_addr = vtophys(cb->rdma_buf);
384	cb->rdma_sgl.addr = cb->rdma_addr;
385	if (cb->use_dmamr)
386		cb->rdma_sgl.lkey = cb->dma_mr->lkey;
387	else
388		cb->rdma_sgl.lkey = cb->rdma_mr->lkey;
389	cb->rdma_sq_wr.send_flags = IB_SEND_SIGNALED;
390	cb->rdma_sq_wr.sg_list = &cb->rdma_sgl;
391	cb->rdma_sq_wr.num_sge = 1;
392
393	if (!cb->server || cb->wlat || cb->rlat || cb->bw) {
394		cb->start_addr = vtophys(cb->start_buf);
395	}
396}
397
398static int krping_setup_buffers(struct krping_cb *cb)
399{
400	int ret;
401	struct ib_phys_buf buf;
402	u64 iovbase;
403
404	DEBUG_LOG(PFX "krping_setup_buffers called on cb %p\n", cb);
405
406	if (cb->use_dmamr) {
407		cb->dma_mr = ib_get_dma_mr(cb->pd, IB_ACCESS_LOCAL_WRITE|
408					   IB_ACCESS_REMOTE_READ|
409				           IB_ACCESS_REMOTE_WRITE);
410		if (IS_ERR(cb->dma_mr)) {
411			log(LOG_ERR, "reg_dmamr failed\n");
412			return PTR_ERR(cb->dma_mr);
413		}
414	} else {
415
416		buf.addr = vtophys(&cb->recv_buf);
417		buf.size = sizeof cb->recv_buf;
418		iovbase = vtophys(&cb->recv_buf);
419		cb->recv_mr = ib_reg_phys_mr(cb->pd, &buf, 1,
420					     IB_ACCESS_LOCAL_WRITE,
421					     &iovbase);
422
423		if (IS_ERR(cb->recv_mr)) {
424			log(LOG_ERR, "recv_buf reg_mr failed\n");
425			return PTR_ERR(cb->recv_mr);
426		}
427
428		buf.addr = vtophys(&cb->send_buf);
429		buf.size = sizeof cb->send_buf;
430		iovbase = vtophys(&cb->send_buf);
431		cb->send_mr = ib_reg_phys_mr(cb->pd, &buf, 1,
432					     0, &iovbase);
433
434		if (IS_ERR(cb->send_mr)) {
435			log(LOG_ERR, "send_buf reg_mr failed\n");
436			ib_dereg_mr(cb->recv_mr);
437			return PTR_ERR(cb->send_mr);
438		}
439	}
440
441	/* RNIC adapters have a limit upto which it can register physical memory
442	 * If DMA-MR memory mode is set then normally driver registers maximum
443	 * supported memory. After that if contigmalloc allocates memory beyond the
444	 * specified RNIC limit then Krping may not work.
445	 */
446	if (cb->use_dmamr && cb->memlimit)
447		cb->rdma_buf = contigmalloc(cb->size, M_DEVBUF, M_WAITOK, 0, cb->memlimit,
448					    PAGE_SIZE, 0);
449	else
450		cb->rdma_buf = contigmalloc(cb->size, M_DEVBUF, M_WAITOK, 0, -1UL,
451					    PAGE_SIZE, 0);
452
453	if (!cb->rdma_buf) {
454		log(LOG_ERR, "rdma_buf malloc failed\n");
455		ret = ENOMEM;
456		goto err1;
457	}
458	if (!cb->use_dmamr) {
459
460		buf.addr = vtophys(cb->rdma_buf);
461		buf.size = cb->size;
462		iovbase = vtophys(cb->rdma_buf);
463		cb->rdma_mr = ib_reg_phys_mr(cb->pd, &buf, 1,
464					     IB_ACCESS_REMOTE_READ|
465					     IB_ACCESS_REMOTE_WRITE,
466					     &iovbase);
467
468		if (IS_ERR(cb->rdma_mr)) {
469			log(LOG_ERR, "rdma_buf reg_mr failed\n");
470			ret = PTR_ERR(cb->rdma_mr);
471			goto err2;
472		}
473	}
474
475	if (!cb->server || cb->wlat || cb->rlat || cb->bw) {
476		if (cb->use_dmamr && cb->memlimit)
477			cb->start_buf = contigmalloc(cb->size, M_DEVBUF, M_WAITOK,
478						     0, cb->memlimit, PAGE_SIZE, 0);
479		else
480			cb->start_buf = contigmalloc(cb->size, M_DEVBUF, M_WAITOK,
481						     0, -1UL, PAGE_SIZE, 0);
482		if (!cb->start_buf) {
483			log(LOG_ERR, "start_buf malloc failed\n");
484			ret = ENOMEM;
485			goto err2;
486		}
487		if (!cb->use_dmamr) {
488			unsigned flags = IB_ACCESS_REMOTE_READ;
489
490			if (cb->wlat || cb->rlat || cb->bw)
491				flags |= IB_ACCESS_REMOTE_WRITE;
492			buf.addr = vtophys(cb->start_buf);
493			buf.size = cb->size;
494			iovbase = vtophys(cb->start_buf);
495			cb->start_mr = ib_reg_phys_mr(cb->pd, &buf, 1,
496					     flags,
497					     &iovbase);
498
499			if (IS_ERR(cb->start_mr)) {
500				log(LOG_ERR, "start_buf reg_mr failed\n");
501				ret = PTR_ERR(cb->start_mr);
502				goto err3;
503			}
504		}
505	}
506
507	krping_setup_wr(cb);
508	DEBUG_LOG(PFX "allocated & registered buffers...\n");
509	return 0;
510err3:
511	contigfree(cb->start_buf, cb->size, M_DEVBUF);
512
513	if (!cb->use_dmamr)
514		ib_dereg_mr(cb->rdma_mr);
515err2:
516	contigfree(cb->rdma_buf, cb->size, M_DEVBUF);
517err1:
518	if (cb->use_dmamr)
519		ib_dereg_mr(cb->dma_mr);
520	else {
521		ib_dereg_mr(cb->recv_mr);
522		ib_dereg_mr(cb->send_mr);
523	}
524	return ret;
525}
526
527static void krping_free_buffers(struct krping_cb *cb)
528{
529	DEBUG_LOG(PFX "krping_free_buffers called on cb %p\n", cb);
530
531#if 0
532	dma_unmap_single(cb->pd->device->dma_device,
533			 pci_unmap_addr(cb, recv_mapping),
534			 sizeof(cb->recv_buf), DMA_BIDIRECTIONAL);
535	dma_unmap_single(cb->pd->device->dma_device,
536			 pci_unmap_addr(cb, send_mapping),
537			 sizeof(cb->send_buf), DMA_BIDIRECTIONAL);
538	dma_unmap_single(cb->pd->device->dma_device,
539			 pci_unmap_addr(cb, rdma_mapping),
540			 cb->size, DMA_BIDIRECTIONAL);
541#endif
542	contigfree(cb->rdma_buf, cb->size, M_DEVBUF);
543	if (!cb->server || cb->wlat || cb->rlat || cb->bw) {
544#if 0
545		dma_unmap_single(cb->pd->device->dma_device,
546			 pci_unmap_addr(cb, start_mapping),
547			 cb->size, DMA_BIDIRECTIONAL);
548#endif
549		contigfree(cb->start_buf, cb->size, M_DEVBUF);
550	}
551	if (cb->use_dmamr)
552		ib_dereg_mr(cb->dma_mr);
553	else {
554		ib_dereg_mr(cb->send_mr);
555		ib_dereg_mr(cb->recv_mr);
556		ib_dereg_mr(cb->rdma_mr);
557		if (!cb->server)
558			ib_dereg_mr(cb->start_mr);
559	}
560}
561
562static int krping_create_qp(struct krping_cb *cb)
563{
564	struct ib_qp_init_attr init_attr;
565	int ret;
566
567	memset(&init_attr, 0, sizeof(init_attr));
568	init_attr.cap.max_send_wr = cb->txdepth;
569	init_attr.cap.max_recv_wr = 2;
570	init_attr.cap.max_recv_sge = 1;
571	init_attr.cap.max_send_sge = 1;
572	init_attr.qp_type = IB_QPT_RC;
573	init_attr.send_cq = cb->cq;
574	init_attr.recv_cq = cb->cq;
575
576	if (cb->server) {
577		ret = rdma_create_qp(cb->child_cm_id, cb->pd, &init_attr);
578		if (!ret)
579			cb->qp = cb->child_cm_id->qp;
580	} else {
581		ret = rdma_create_qp(cb->cm_id, cb->pd, &init_attr);
582		if (!ret)
583			cb->qp = cb->cm_id->qp;
584	}
585
586	return ret;
587}
588
589static void krping_free_qp(struct krping_cb *cb)
590{
591	ib_destroy_qp(cb->qp);
592	ib_destroy_cq(cb->cq);
593	ib_dealloc_pd(cb->pd);
594}
595
596static int krping_setup_qp(struct krping_cb *cb, struct rdma_cm_id *cm_id)
597{
598	int ret;
599	cb->pd = ib_alloc_pd(cm_id->device);
600	if (IS_ERR(cb->pd)) {
601		log(LOG_ERR, "ib_alloc_pd failed\n");
602		return PTR_ERR(cb->pd);
603	}
604	DEBUG_LOG(PFX "created pd %p\n", cb->pd);
605
606	cb->cq = ib_create_cq(cm_id->device, krping_cq_event_handler, NULL,
607			      cb, cb->txdepth * 2, 0);
608	if (IS_ERR(cb->cq)) {
609		log(LOG_ERR, "ib_create_cq failed\n");
610		ret = PTR_ERR(cb->cq);
611		goto err1;
612	}
613	DEBUG_LOG(PFX "created cq %p\n", cb->cq);
614
615	if (!cb->wlat && !cb->rlat && !cb->bw) {
616		ret = ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
617		if (ret) {
618			log(LOG_ERR, "ib_create_cq failed\n");
619			goto err2;
620		}
621	}
622
623	ret = krping_create_qp(cb);
624	if (ret) {
625		log(LOG_ERR, "krping_create_qp failed: %d\n", ret);
626		goto err2;
627	}
628	DEBUG_LOG(PFX "created qp %p\n", cb->qp);
629	return 0;
630err2:
631	ib_destroy_cq(cb->cq);
632err1:
633	ib_dealloc_pd(cb->pd);
634	return ret;
635}
636
637static void krping_format_send(struct krping_cb *cb, u64 buf,
638			       struct ib_mr *mr)
639{
640	struct krping_rdma_info *info = &cb->send_buf;
641
642	info->buf = htonll(buf);
643	info->rkey = htonl(mr->rkey);
644	info->size = htonl(cb->size);
645
646	DEBUG_LOG(PFX "RDMA addr %llx rkey %x len %d\n",
647		  (unsigned long long)buf, mr->rkey, cb->size);
648}
649
650static void krping_test_server(struct krping_cb *cb)
651{
652	struct ib_send_wr *bad_wr;
653	int ret;
654
655	while (1) {
656		/* Wait for client's Start STAG/TO/Len */
657		krping_wait(cb, RDMA_READ_ADV);
658		if (cb->state != RDMA_READ_ADV) {
659			DEBUG_LOG(PFX "wait for RDMA_READ_ADV state %d\n",
660				cb->state);
661			break;
662		}
663
664		DEBUG_LOG(PFX "server received sink adv\n");
665
666		/* Issue RDMA Read. */
667		cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ;
668		cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
669		cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
670		cb->rdma_sq_wr.sg_list->length = cb->remote_len;
671
672		ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr);
673		if (ret) {
674			log(LOG_ERR, "post send error %d\n", ret);
675			break;
676		}
677		DEBUG_LOG(PFX "server posted rdma read req \n");
678
679		/* Wait for read completion */
680		krping_wait(cb, RDMA_READ_COMPLETE);
681		if (cb->state != RDMA_READ_COMPLETE) {
682			log(LOG_ERR,
683			       "wait for RDMA_READ_COMPLETE state %d\n",
684			       cb->state);
685			break;
686		}
687		DEBUG_LOG(PFX "server received read complete\n");
688
689		/* Display data in recv buf */
690		if (cb->verbose)
691			DEBUG_LOG("server ping data: %s\n", cb->rdma_buf);
692
693		/* Tell client to continue */
694		ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
695		if (ret) {
696			log(LOG_ERR, "post send error %d\n", ret);
697			break;
698		}
699		DEBUG_LOG(PFX "server posted go ahead\n");
700
701		/* Wait for client's RDMA STAG/TO/Len */
702		krping_wait(cb, RDMA_WRITE_ADV);
703		if (cb->state != RDMA_WRITE_ADV) {
704			log(LOG_ERR,
705			       "wait for RDMA_WRITE_ADV state %d\n",
706			       cb->state);
707			break;
708		}
709		DEBUG_LOG(PFX "server received sink adv\n");
710
711		/* RDMA Write echo data */
712		cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
713		cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
714		cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
715		cb->rdma_sq_wr.sg_list->length = strlen(cb->rdma_buf) + 1;
716		DEBUG_LOG(PFX "rdma write from lkey %x laddr %llx len %d\n",
717			  cb->rdma_sq_wr.sg_list->lkey,
718			  (unsigned long long)cb->rdma_sq_wr.sg_list->addr,
719			  cb->rdma_sq_wr.sg_list->length);
720
721		ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr);
722		if (ret) {
723			log(LOG_ERR, "post send error %d\n", ret);
724			break;
725		}
726
727		/* Wait for completion */
728		krping_wait(cb, RDMA_WRITE_COMPLETE);
729		if (cb->state != RDMA_WRITE_COMPLETE) {
730			log(LOG_ERR,
731			       "wait for RDMA_WRITE_COMPLETE state %d\n",
732			       cb->state);
733			break;
734		}
735		DEBUG_LOG(PFX "server rdma write complete \n");
736
737		cb->state = CONNECTED;
738
739		/* Tell client to begin again */
740		ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
741		if (ret) {
742			log(LOG_ERR, "post send error %d\n", ret);
743			break;
744		}
745		DEBUG_LOG(PFX "server posted go ahead\n");
746	}
747}
748
749static void rlat_test(struct krping_cb *cb)
750{
751	int scnt;
752	int iters = cb->count;
753	struct timeval start_tv, stop_tv;
754	int ret;
755	struct ib_wc wc;
756	struct ib_send_wr *bad_wr;
757	int ne;
758
759	scnt = 0;
760	cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ;
761	cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
762	cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
763	cb->rdma_sq_wr.sg_list->length = cb->size;
764
765	microtime(&start_tv);
766 	if (!cb->poll) {
767 		cb->state = RDMA_READ_ADV;
768 		ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
769 	}
770	while (scnt < iters) {
771
772 		cb->state = RDMA_READ_ADV;
773		ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr);
774		if (ret) {
775			log(LOG_ERR,
776				"Couldn't post send: ret=%d scnt %d\n",
777				ret, scnt);
778			return;
779		}
780
781		do {
782			if (!cb->poll) {
783				krping_wait(cb, RDMA_READ_COMPLETE);
784				if (cb->state == RDMA_READ_COMPLETE) {
785					ne = 1;
786					ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
787				} else {
788					ne = -1;
789				}
790			} else
791				ne = ib_poll_cq(cb->cq, 1, &wc);
792			if (cb->state == ERROR) {
793				log(LOG_ERR,
794				       "state == ERROR...bailing scnt %d\n", scnt);
795				return;
796			}
797		} while (ne == 0);
798
799		if (ne < 0) {
800			log(LOG_ERR, "poll CQ failed %d\n", ne);
801			return;
802		}
803 		if (cb->poll && wc.status != IB_WC_SUCCESS) {
804			log(LOG_ERR, "Completion wth error at %s:\n",
805				cb->server ? "server" : "client");
806			log(LOG_ERR, "Failed status %d: wr_id %d\n",
807				wc.status, (int) wc.wr_id);
808			return;
809		}
810		++scnt;
811	}
812	microtime(&stop_tv);
813
814        if (stop_tv.tv_usec < start_tv.tv_usec) {
815                stop_tv.tv_usec += 1000000;
816                stop_tv.tv_sec  -= 1;
817        }
818
819	log(LOG_ERR, "delta sec %zu delta usec %lu iter %d size %d\n",
820		stop_tv.tv_sec - start_tv.tv_sec,
821		stop_tv.tv_usec - start_tv.tv_usec,
822		scnt, cb->size);
823}
824
825static int alloc_cycle_mem(int cycle_iters,
826				cycles_t **post_cycles_start,
827				cycles_t **post_cycles_stop,
828				cycles_t **poll_cycles_start,
829				cycles_t **poll_cycles_stop,
830				cycles_t **last_poll_cycles_start)
831{
832	*post_cycles_start = malloc(cycle_iters * sizeof(cycles_t), M_DEVBUF, M_WAITOK);
833	if (!*post_cycles_start) {
834		goto fail1;
835	}
836	*post_cycles_stop = malloc(cycle_iters * sizeof(cycles_t), M_DEVBUF, M_WAITOK);
837	if (!*post_cycles_stop) {
838		goto fail2;
839	}
840	*poll_cycles_start = malloc(cycle_iters * sizeof(cycles_t), M_DEVBUF, M_WAITOK);
841	if (!*poll_cycles_start) {
842		goto fail3;
843	}
844	*poll_cycles_stop = malloc(cycle_iters * sizeof(cycles_t), M_DEVBUF, M_WAITOK);
845	if (!*poll_cycles_stop) {
846		goto fail4;
847	}
848	*last_poll_cycles_start = malloc(cycle_iters * sizeof(cycles_t), M_DEVBUF, M_WAITOK);
849	if (!*last_poll_cycles_start) {
850		goto fail5;
851	}
852	return 0;
853fail5:
854	free(*poll_cycles_stop, M_DEVBUF);
855fail4:
856	free(*poll_cycles_start, M_DEVBUF);
857fail3:
858	free(*post_cycles_stop, M_DEVBUF);
859fail2:
860	free(*post_cycles_start, M_DEVBUF);
861fail1:
862	log(LOG_ERR, "%s malloc failed\n", __FUNCTION__);
863	return ENOMEM;
864}
865
866static void free_cycle_mem(cycles_t *post_cycles_start,
867				cycles_t *post_cycles_stop,
868				cycles_t *poll_cycles_start,
869				cycles_t *poll_cycles_stop,
870				cycles_t *last_poll_cycles_start)
871{
872	free(last_poll_cycles_start, M_DEVBUF);
873	free(poll_cycles_stop, M_DEVBUF);
874	free(poll_cycles_start, M_DEVBUF);
875	free(post_cycles_stop, M_DEVBUF);
876	free(post_cycles_start, M_DEVBUF);
877}
878
879static void wlat_test(struct krping_cb *cb)
880{
881	int ccnt, scnt, rcnt;
882	int iters=cb->count;
883	volatile char *poll_buf = (char *) cb->start_buf;
884	char *buf = (char *)cb->rdma_buf;
885	ccnt = 0;
886	scnt = 0;
887	rcnt = 0;
888	struct timeval start_tv, stop_tv;
889	cycles_t *post_cycles_start, *post_cycles_stop;
890	cycles_t *poll_cycles_start, *poll_cycles_stop;
891	cycles_t *last_poll_cycles_start;
892	cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0;
893	int i;
894	int cycle_iters = 1000;
895	int err;
896
897	err = alloc_cycle_mem(cycle_iters, &post_cycles_start, &post_cycles_stop,
898				&poll_cycles_start, &poll_cycles_stop, &last_poll_cycles_start);
899
900	if (err) {
901		log(LOG_ERR, "%s malloc failed\n", __FUNCTION__);
902		return;
903	}
904
905	cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
906	cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
907	cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
908	cb->rdma_sq_wr.sg_list->length = cb->size;
909
910	if (cycle_iters > iters)
911		cycle_iters = iters;
912	microtime(&start_tv);
913	while (scnt < iters || ccnt < iters || rcnt < iters) {
914
915		/* Wait till buffer changes. */
916		if (rcnt < iters && !(scnt < 1 && !cb->server)) {
917			++rcnt;
918			while (*poll_buf != (char)rcnt) {
919				if (cb->state == ERROR) {
920					log(LOG_ERR, "state = ERROR, bailing\n");
921					return;
922				}
923			}
924		}
925
926		if (scnt < iters) {
927			struct ib_send_wr *bad_wr;
928
929			*buf = (char)scnt+1;
930			if (scnt < cycle_iters)
931				post_cycles_start[scnt] = get_cycles();
932			if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) {
933				log(LOG_ERR,  "Couldn't post send: scnt=%d\n",
934					scnt);
935				return;
936			}
937			if (scnt < cycle_iters)
938				post_cycles_stop[scnt] = get_cycles();
939			scnt++;
940		}
941
942		if (ccnt < iters) {
943			struct ib_wc wc;
944			int ne;
945
946			if (ccnt < cycle_iters)
947				poll_cycles_start[ccnt] = get_cycles();
948			do {
949				if (ccnt < cycle_iters)
950					last_poll_cycles_start[ccnt] = get_cycles();
951				ne = ib_poll_cq(cb->cq, 1, &wc);
952			} while (ne == 0);
953			if (ccnt < cycle_iters)
954				poll_cycles_stop[ccnt] = get_cycles();
955			++ccnt;
956
957			if (ne < 0) {
958				log(LOG_ERR, "poll CQ failed %d\n", ne);
959				return;
960			}
961			if (wc.status != IB_WC_SUCCESS) {
962				log(LOG_ERR, "Completion wth error at %s:\n",
963					cb->server ? "server" : "client");
964				log(LOG_ERR, "Failed status %d: wr_id %d\n",
965					wc.status, (int) wc.wr_id);
966				log(LOG_ERR, "scnt=%d, rcnt=%d, ccnt=%d\n",
967					scnt, rcnt, ccnt);
968				return;
969			}
970		}
971	}
972	microtime(&stop_tv);
973
974        if (stop_tv.tv_usec < start_tv.tv_usec) {
975                stop_tv.tv_usec += 1000000;
976                stop_tv.tv_sec  -= 1;
977        }
978
979	for (i=0; i < cycle_iters; i++) {
980		sum_post += post_cycles_stop[i] - post_cycles_start[i];
981		sum_poll += poll_cycles_stop[i] - poll_cycles_start[i];
982		sum_last_poll += poll_cycles_stop[i] - last_poll_cycles_start[i];
983	}
984
985	log(LOG_ERR, "delta sec %zu delta usec %lu iter %d size %d cycle_iters %d sum_post %llu sum_poll %llu sum_last_poll %llu\n",
986		stop_tv.tv_sec - start_tv.tv_sec,
987		stop_tv.tv_usec - start_tv.tv_usec,
988		scnt, cb->size, cycle_iters,
989		(unsigned long long)sum_post, (unsigned long long)sum_poll,
990		(unsigned long long)sum_last_poll);
991
992	free_cycle_mem(post_cycles_start, post_cycles_stop, poll_cycles_start,
993			poll_cycles_stop, last_poll_cycles_start);
994}
995
996static void bw_test(struct krping_cb *cb)
997{
998	int ccnt, scnt, rcnt;
999	int iters=cb->count;
1000	ccnt = 0;
1001	scnt = 0;
1002	rcnt = 0;
1003	struct timeval start_tv, stop_tv;
1004	cycles_t *post_cycles_start, *post_cycles_stop;
1005	cycles_t *poll_cycles_start, *poll_cycles_stop;
1006	cycles_t *last_poll_cycles_start;
1007	cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0;
1008	int i;
1009	int cycle_iters = 1000;
1010	int err;
1011
1012	err = alloc_cycle_mem(cycle_iters, &post_cycles_start, &post_cycles_stop,
1013				&poll_cycles_start, &poll_cycles_stop, &last_poll_cycles_start);
1014
1015	if (err) {
1016		log(LOG_ERR, "%s kmalloc failed\n", __FUNCTION__);
1017		return;
1018	}
1019
1020	cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
1021	cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
1022	cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
1023	cb->rdma_sq_wr.sg_list->length = cb->size;
1024
1025	if (cycle_iters > iters)
1026		cycle_iters = iters;
1027	microtime(&start_tv);
1028	while (scnt < iters || ccnt < iters) {
1029
1030		while (scnt < iters && scnt - ccnt < cb->txdepth) {
1031			struct ib_send_wr *bad_wr;
1032
1033			if (scnt < cycle_iters)
1034				post_cycles_start[scnt] = get_cycles();
1035			if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) {
1036				log(LOG_ERR,  "Couldn't post send: scnt=%d\n",
1037					scnt);
1038				return;
1039			}
1040			if (scnt < cycle_iters)
1041				post_cycles_stop[scnt] = get_cycles();
1042			++scnt;
1043		}
1044
1045		if (ccnt < iters) {
1046			int ne;
1047			struct ib_wc wc;
1048
1049			if (ccnt < cycle_iters)
1050				poll_cycles_start[ccnt] = get_cycles();
1051			do {
1052				if (ccnt < cycle_iters)
1053					last_poll_cycles_start[ccnt] = get_cycles();
1054				ne = ib_poll_cq(cb->cq, 1, &wc);
1055			} while (ne == 0);
1056			if (ccnt < cycle_iters)
1057				poll_cycles_stop[ccnt] = get_cycles();
1058			ccnt += 1;
1059
1060			if (ne < 0) {
1061				log(LOG_ERR, "poll CQ failed %d\n", ne);
1062				return;
1063			}
1064			if (wc.status != IB_WC_SUCCESS) {
1065				log(LOG_ERR, "Completion wth error at %s:\n",
1066					cb->server ? "server" : "client");
1067				log(LOG_ERR, "Failed status %d: wr_id %d\n",
1068					wc.status, (int) wc.wr_id);
1069				return;
1070			}
1071		}
1072	}
1073	microtime(&stop_tv);
1074
1075        if (stop_tv.tv_usec < start_tv.tv_usec) {
1076                stop_tv.tv_usec += 1000000;
1077                stop_tv.tv_sec  -= 1;
1078        }
1079
1080	for (i=0; i < cycle_iters; i++) {
1081		sum_post += post_cycles_stop[i] - post_cycles_start[i];
1082		sum_poll += poll_cycles_stop[i] - poll_cycles_start[i];
1083		sum_last_poll += poll_cycles_stop[i] - last_poll_cycles_start[i];
1084	}
1085
1086	log(LOG_ERR, "delta sec %zu delta usec %lu iter %d size %d cycle_iters %d sum_post %llu sum_poll %llu sum_last_poll %llu\n",
1087		stop_tv.tv_sec - start_tv.tv_sec,
1088		stop_tv.tv_usec - start_tv.tv_usec,
1089		scnt, cb->size, cycle_iters,
1090		(unsigned long long)sum_post, (unsigned long long)sum_poll,
1091		(unsigned long long)sum_last_poll);
1092
1093	free_cycle_mem(post_cycles_start, post_cycles_stop, poll_cycles_start,
1094			poll_cycles_stop, last_poll_cycles_start);
1095}
1096
1097static void krping_rlat_test_server(struct krping_cb *cb)
1098{
1099	struct ib_send_wr *bad_wr;
1100	struct ib_wc wc;
1101	int ret;
1102
1103	/* Spin waiting for client's Start STAG/TO/Len */
1104	while (cb->state < RDMA_READ_ADV) {
1105		krping_cq_event_handler(cb->cq, cb);
1106	}
1107
1108	/* Send STAG/TO/Len to client */
1109	if (cb->dma_mr)
1110		krping_format_send(cb, cb->start_addr, cb->dma_mr);
1111	else
1112		krping_format_send(cb, cb->start_addr, cb->start_mr);
1113	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1114	if (ret) {
1115		log(LOG_ERR, "post send error %d\n", ret);
1116		return;
1117	}
1118
1119	/* Spin waiting for send completion */
1120	while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1121	if (ret < 0) {
1122		log(LOG_ERR, "poll error %d\n", ret);
1123		return;
1124	}
1125	if (wc.status) {
1126		log(LOG_ERR, "send completiong error %d\n", wc.status);
1127		return;
1128	}
1129
1130	krping_wait(cb, ERROR);
1131}
1132
1133static void krping_wlat_test_server(struct krping_cb *cb)
1134{
1135	struct ib_send_wr *bad_wr;
1136	struct ib_wc wc;
1137	int ret;
1138
1139	/* Spin waiting for client's Start STAG/TO/Len */
1140	while (cb->state < RDMA_READ_ADV) {
1141		krping_cq_event_handler(cb->cq, cb);
1142	}
1143
1144	/* Send STAG/TO/Len to client */
1145	if (cb->dma_mr)
1146		krping_format_send(cb, cb->start_addr, cb->dma_mr);
1147	else
1148		krping_format_send(cb, cb->start_addr, cb->start_mr);
1149	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1150	if (ret) {
1151		log(LOG_ERR, "post send error %d\n", ret);
1152		return;
1153	}
1154
1155	/* Spin waiting for send completion */
1156	while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1157	if (ret < 0) {
1158		log(LOG_ERR, "poll error %d\n", ret);
1159		return;
1160	}
1161	if (wc.status) {
1162		log(LOG_ERR, "send completiong error %d\n", wc.status);
1163		return;
1164	}
1165
1166	wlat_test(cb);
1167
1168}
1169
1170static void krping_bw_test_server(struct krping_cb *cb)
1171{
1172	struct ib_send_wr *bad_wr;
1173	struct ib_wc wc;
1174	int ret;
1175
1176	/* Spin waiting for client's Start STAG/TO/Len */
1177	while (cb->state < RDMA_READ_ADV) {
1178		krping_cq_event_handler(cb->cq, cb);
1179	}
1180
1181	/* Send STAG/TO/Len to client */
1182	if (cb->dma_mr)
1183		krping_format_send(cb, cb->start_addr, cb->dma_mr);
1184	else
1185		krping_format_send(cb, cb->start_addr, cb->start_mr);
1186	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1187	if (ret) {
1188		log(LOG_ERR, "post send error %d\n", ret);
1189		return;
1190	}
1191
1192	/* Spin waiting for send completion */
1193	while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1194	if (ret < 0) {
1195		log(LOG_ERR, "poll error %d\n", ret);
1196		return;
1197	}
1198	if (wc.status) {
1199		log(LOG_ERR, "send completiong error %d\n", wc.status);
1200		return;
1201	}
1202
1203	if (cb->duplex)
1204		bw_test(cb);
1205	krping_wait(cb, ERROR);
1206}
1207
1208static int krping_bind_server(struct krping_cb *cb)
1209{
1210	struct sockaddr_in sin;
1211	int ret;
1212
1213	memset(&sin, 0, sizeof(sin));
1214	sin.sin_len = sizeof sin;
1215	sin.sin_family = AF_INET;
1216	sin.sin_addr.s_addr = cb->addr.s_addr;
1217	sin.sin_port = cb->port;
1218
1219	ret = rdma_bind_addr(cb->cm_id, (struct sockaddr *) &sin);
1220	if (ret) {
1221		log(LOG_ERR, "rdma_bind_addr error %d\n", ret);
1222		return ret;
1223	}
1224	DEBUG_LOG(PFX "rdma_bind_addr successful\n");
1225
1226	DEBUG_LOG(PFX "rdma_listen\n");
1227	ret = rdma_listen(cb->cm_id, 3);
1228	if (ret) {
1229		log(LOG_ERR, "rdma_listen failed: %d\n", ret);
1230		return ret;
1231	}
1232
1233	krping_wait(cb, CONNECT_REQUEST);
1234	if (cb->state != CONNECT_REQUEST) {
1235		log(LOG_ERR,  "wait for CONNECT_REQUEST state %d\n",
1236			cb->state);
1237		return -1;
1238	}
1239
1240	return 0;
1241}
1242
1243static void krping_run_server(struct krping_cb *cb)
1244{
1245	struct ib_recv_wr *bad_wr;
1246	int ret;
1247
1248	ret = krping_bind_server(cb);
1249	if (ret)
1250		return;
1251
1252	ret = krping_setup_qp(cb, cb->child_cm_id);
1253	if (ret) {
1254		log(LOG_ERR, "setup_qp failed: %d\n", ret);
1255		return;
1256	}
1257
1258	ret = krping_setup_buffers(cb);
1259	if (ret) {
1260		log(LOG_ERR, "krping_setup_buffers failed: %d\n", ret);
1261		goto err1;
1262	}
1263
1264	ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
1265	if (ret) {
1266		log(LOG_ERR, "ib_post_recv failed: %d\n", ret);
1267		goto err2;
1268	}
1269
1270	ret = krping_accept(cb);
1271	if (ret) {
1272		log(LOG_ERR, "connect error %d\n", ret);
1273		goto err2;
1274	}
1275
1276	if (cb->wlat)
1277		krping_wlat_test_server(cb);
1278	else if (cb->rlat)
1279		krping_rlat_test_server(cb);
1280	else if (cb->bw)
1281		krping_bw_test_server(cb);
1282	else
1283		krping_test_server(cb);
1284
1285	rdma_disconnect(cb->child_cm_id);
1286	rdma_destroy_id(cb->child_cm_id);
1287err2:
1288	krping_free_buffers(cb);
1289err1:
1290	krping_free_qp(cb);
1291}
1292
1293static void krping_test_client(struct krping_cb *cb)
1294{
1295	int ping, start, cc, i, ret;
1296	struct ib_send_wr *bad_wr;
1297	unsigned char c;
1298
1299	start = 65;
1300	for (ping = 0; !cb->count || ping < cb->count; ping++) {
1301		cb->state = RDMA_READ_ADV;
1302
1303		/* Put some ascii text in the buffer. */
1304		cc = sprintf(cb->start_buf, "rdma-ping-%d: ", ping);
1305		for (i = cc, c = start; i < cb->size; i++) {
1306			cb->start_buf[i] = c;
1307			c++;
1308			if (c > 122)
1309				c = 65;
1310		}
1311		start++;
1312		if (start > 122)
1313			start = 65;
1314		cb->start_buf[cb->size - 1] = 0;
1315
1316		if (cb->dma_mr)
1317			krping_format_send(cb, cb->start_addr, cb->dma_mr);
1318		else
1319			krping_format_send(cb, cb->start_addr, cb->start_mr);
1320
1321		ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1322		if (ret) {
1323			log(LOG_ERR, "post send error %d\n", ret);
1324			break;
1325		}
1326
1327		/* Wait for server to ACK */
1328		krping_wait(cb, RDMA_WRITE_ADV);
1329		if (cb->state != RDMA_WRITE_ADV) {
1330			log(LOG_ERR,
1331			       "wait for RDMA_WRITE_ADV state %d\n",
1332			       cb->state);
1333			break;
1334		}
1335
1336		if (cb->dma_mr)
1337			krping_format_send(cb, cb->rdma_addr, cb->dma_mr);
1338		else
1339			krping_format_send(cb, cb->rdma_addr, cb->rdma_mr);
1340
1341		ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1342		if (ret) {
1343			log(LOG_ERR, "post send error %d\n", ret);
1344			break;
1345		}
1346
1347		/* Wait for the server to say the RDMA Write is complete. */
1348		krping_wait(cb, RDMA_WRITE_COMPLETE);
1349		if (cb->state != RDMA_WRITE_COMPLETE) {
1350			log(LOG_ERR,
1351			       "wait for RDMA_WRITE_COMPLETE state %d\n",
1352			       cb->state);
1353			break;
1354		}
1355
1356		if (cb->validate)
1357			if (memcmp(cb->start_buf, cb->rdma_buf, cb->size)) {
1358				log(LOG_ERR, "data mismatch!\n");
1359				break;
1360			}
1361
1362		if (cb->verbose)
1363			DEBUG_LOG("ping data: %s\n", cb->rdma_buf);
1364	}
1365}
1366
1367static void krping_rlat_test_client(struct krping_cb *cb)
1368{
1369	struct ib_send_wr *bad_wr;
1370	struct ib_wc wc;
1371	int ret;
1372
1373	cb->state = RDMA_READ_ADV;
1374
1375	/* Send STAG/TO/Len to client */
1376	if (cb->dma_mr)
1377		krping_format_send(cb, cb->start_addr, cb->dma_mr);
1378	else
1379		krping_format_send(cb, cb->start_addr, cb->rdma_mr);
1380	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1381	if (ret) {
1382		log(LOG_ERR, "post send error %d\n", ret);
1383		return;
1384	}
1385
1386	/* Spin waiting for send completion */
1387	while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1388	if (ret < 0) {
1389		log(LOG_ERR, "poll error %d\n", ret);
1390		return;
1391	}
1392	if (wc.status) {
1393		log(LOG_ERR, "send completion error %d\n", wc.status);
1394		return;
1395	}
1396
1397	/* Spin waiting for server's Start STAG/TO/Len */
1398	while (cb->state < RDMA_WRITE_ADV) {
1399		krping_cq_event_handler(cb->cq, cb);
1400	}
1401
1402#if 0
1403{
1404	int i;
1405	struct timeval start, stop;
1406	time_t sec;
1407	suseconds_t usec;
1408	unsigned long long elapsed;
1409	struct ib_wc wc;
1410	struct ib_send_wr *bad_wr;
1411	int ne;
1412
1413	cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
1414	cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
1415	cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
1416	cb->rdma_sq_wr.sg_list->length = 0;
1417	cb->rdma_sq_wr.num_sge = 0;
1418
1419	microtime(&start);
1420	for (i=0; i < 100000; i++) {
1421		if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) {
1422			log(LOG_ERR,  "Couldn't post send\n");
1423			return;
1424		}
1425		do {
1426			ne = ib_poll_cq(cb->cq, 1, &wc);
1427		} while (ne == 0);
1428		if (ne < 0) {
1429			log(LOG_ERR, "poll CQ failed %d\n", ne);
1430			return;
1431		}
1432		if (wc.status != IB_WC_SUCCESS) {
1433			log(LOG_ERR, "Completion wth error at %s:\n",
1434				cb->server ? "server" : "client");
1435			log(LOG_ERR, "Failed status %d: wr_id %d\n",
1436				wc.status, (int) wc.wr_id);
1437			return;
1438		}
1439	}
1440	microtime(&stop);
1441
1442	if (stop.tv_usec < start.tv_usec) {
1443		stop.tv_usec += 1000000;
1444		stop.tv_sec  -= 1;
1445	}
1446	sec     = stop.tv_sec - start.tv_sec;
1447	usec    = stop.tv_usec - start.tv_usec;
1448	elapsed = sec * 1000000 + usec;
1449	log(LOG_ERR, "0B-write-lat iters 100000 usec %llu\n", elapsed);
1450}
1451#endif
1452
1453	rlat_test(cb);
1454}
1455
1456static void krping_wlat_test_client(struct krping_cb *cb)
1457{
1458	struct ib_send_wr *bad_wr;
1459	struct ib_wc wc;
1460	int ret;
1461
1462	cb->state = RDMA_READ_ADV;
1463
1464	/* Send STAG/TO/Len to client */
1465	if (cb->dma_mr)
1466		krping_format_send(cb, cb->start_addr, cb->dma_mr);
1467	else
1468		krping_format_send(cb, cb->start_addr, cb->start_mr);
1469	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1470	if (ret) {
1471		log(LOG_ERR, "post send error %d\n", ret);
1472		return;
1473	}
1474
1475	/* Spin waiting for send completion */
1476	while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1477	if (ret < 0) {
1478		log(LOG_ERR, "poll error %d\n", ret);
1479		return;
1480	}
1481	if (wc.status) {
1482		log(LOG_ERR, "send completion error %d\n", wc.status);
1483		return;
1484	}
1485
1486	/* Spin waiting for server's Start STAG/TO/Len */
1487	while (cb->state < RDMA_WRITE_ADV) {
1488		krping_cq_event_handler(cb->cq, cb);
1489	}
1490
1491	wlat_test(cb);
1492}
1493
1494static void krping_bw_test_client(struct krping_cb *cb)
1495{
1496	struct ib_send_wr *bad_wr;
1497	struct ib_wc wc;
1498	int ret;
1499
1500	cb->state = RDMA_READ_ADV;
1501
1502	/* Send STAG/TO/Len to client */
1503	if (cb->dma_mr)
1504		krping_format_send(cb, cb->start_addr, cb->dma_mr);
1505	else
1506		krping_format_send(cb, cb->start_addr, cb->start_mr);
1507	ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
1508	if (ret) {
1509		log(LOG_ERR, "post send error %d\n", ret);
1510		return;
1511	}
1512
1513	/* Spin waiting for send completion */
1514	while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
1515	if (ret < 0) {
1516		log(LOG_ERR, "poll error %d\n", ret);
1517		return;
1518	}
1519	if (wc.status) {
1520		log(LOG_ERR, "send completion error %d\n", wc.status);
1521		return;
1522	}
1523
1524	/* Spin waiting for server's Start STAG/TO/Len */
1525	while (cb->state < RDMA_WRITE_ADV) {
1526		krping_cq_event_handler(cb->cq, cb);
1527	}
1528
1529	bw_test(cb);
1530}
1531
1532static int krping_connect_client(struct krping_cb *cb)
1533{
1534	struct rdma_conn_param conn_param;
1535	int ret;
1536
1537	memset(&conn_param, 0, sizeof conn_param);
1538	conn_param.responder_resources = 1;
1539	conn_param.initiator_depth = 1;
1540	conn_param.retry_count = 10;
1541
1542	ret = rdma_connect(cb->cm_id, &conn_param);
1543	if (ret) {
1544		log(LOG_ERR, "rdma_connect error %d\n", ret);
1545		return ret;
1546	}
1547
1548	krping_wait(cb, CONNECTED);
1549	if (cb->state == ERROR) {
1550		log(LOG_ERR,  "wait for CONNECTED state %d\n", cb->state);
1551		return -1;
1552	}
1553
1554	DEBUG_LOG(PFX "rdma_connect successful\n");
1555	return 0;
1556}
1557
1558static int krping_bind_client(struct krping_cb *cb)
1559{
1560	struct sockaddr_in sin;
1561	int ret;
1562
1563	memset(&sin, 0, sizeof(sin));
1564	sin.sin_len = sizeof sin;
1565	sin.sin_family = AF_INET;
1566	sin.sin_addr.s_addr = cb->addr.s_addr;
1567	sin.sin_port = cb->port;
1568
1569	ret = rdma_resolve_addr(cb->cm_id, NULL, (struct sockaddr *) &sin,
1570				2000);
1571	if (ret) {
1572		log(LOG_ERR, "rdma_resolve_addr error %d\n", ret);
1573		return ret;
1574	}
1575
1576	krping_wait(cb, ROUTE_RESOLVED);
1577	if (cb->state != ROUTE_RESOLVED) {
1578		log(LOG_ERR,
1579		       "addr/route resolution did not resolve: state %d\n",
1580		       cb->state);
1581		return EINTR;
1582	}
1583
1584	DEBUG_LOG(PFX "rdma_resolve_addr - rdma_resolve_route successful\n");
1585	return 0;
1586}
1587
1588static void krping_run_client(struct krping_cb *cb)
1589{
1590	struct ib_recv_wr *bad_wr;
1591	int ret;
1592
1593	ret = krping_bind_client(cb);
1594	if (ret)
1595		return;
1596
1597	ret = krping_setup_qp(cb, cb->cm_id);
1598	if (ret) {
1599		log(LOG_ERR, "setup_qp failed: %d\n", ret);
1600		return;
1601	}
1602
1603	ret = krping_setup_buffers(cb);
1604	if (ret) {
1605		log(LOG_ERR, "krping_setup_buffers failed: %d\n", ret);
1606		goto err1;
1607	}
1608
1609	ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
1610	if (ret) {
1611		log(LOG_ERR, "ib_post_recv failed: %d\n", ret);
1612		goto err2;
1613	}
1614
1615	ret = krping_connect_client(cb);
1616	if (ret) {
1617		log(LOG_ERR, "connect error %d\n", ret);
1618		goto err2;
1619	}
1620
1621	if (cb->wlat)
1622		krping_wlat_test_client(cb);
1623	else if (cb->rlat)
1624		krping_rlat_test_client(cb);
1625	else if (cb->bw)
1626		krping_bw_test_client(cb);
1627	else
1628		krping_test_client(cb);
1629	rdma_disconnect(cb->cm_id);
1630err2:
1631	krping_free_buffers(cb);
1632err1:
1633	krping_free_qp(cb);
1634}
1635
1636int krping_doit(char *cmd)
1637{
1638	struct krping_cb *cb;
1639	int op;
1640	int ret = 0;
1641	char *optarg;
1642	unsigned long optint;
1643	debug = 0;
1644
1645	cb = malloc(sizeof(*cb), M_DEVBUF, M_WAITOK);
1646	if (!cb)
1647		return ENOMEM;
1648	bzero(cb, sizeof *cb);
1649
1650	mtx_lock(&krping_mutex);
1651	TAILQ_INSERT_TAIL(&krping_cbs, cb, list);
1652	mtx_unlock(&krping_mutex);
1653
1654	cb->server = -1;
1655	cb->state = IDLE;
1656	cb->size = 64;
1657	cb->txdepth = RPING_SQ_DEPTH;
1658	cb->use_dmamr = 1;
1659	cb->memlimit = 0;
1660	mtx_init(&cb->lock, "krping mtx", NULL, MTX_DUPOK|MTX_DEF);
1661
1662	while ((op = krping_getopt("krping", &cmd, krping_opts, NULL, &optarg,
1663			      &optint)) != 0) {
1664		switch (op) {
1665		case 'a':
1666			cb->addr_str = optarg;
1667			DEBUG_LOG(PFX "ipaddr (%s)\n", optarg);
1668			if (!inet_aton(optarg, &cb->addr)) {
1669				log(LOG_ERR, "bad addr string %s\n", optarg);
1670				ret = EINVAL;
1671			}
1672			break;
1673		case 'D':
1674			cb->use_dmamr = 1;
1675			DEBUG_LOG(PFX "using dma mr\n");
1676			break;
1677		case 'p':
1678			cb->port = htons(optint);
1679			DEBUG_LOG(PFX "port %d\n", (int)optint);
1680			break;
1681		case 'P':
1682			cb->poll = 1;
1683			DEBUG_LOG("server\n");
1684			break;
1685		case 's':
1686			cb->server = 1;
1687			DEBUG_LOG(PFX "server\n");
1688			break;
1689		case 'c':
1690			cb->server = 0;
1691			DEBUG_LOG(PFX "client\n");
1692			break;
1693		case 'S':
1694			cb->size = optint;
1695			if ((cb->size < 1) ||
1696			    (cb->size > RPING_BUFSIZE)) {
1697				log(LOG_ERR, "Invalid size %d "
1698				       "(valid range is 1 to %d)\n",
1699				       cb->size, RPING_BUFSIZE);
1700				ret = EINVAL;
1701			} else
1702				DEBUG_LOG(PFX "size %d\n", (int)optint);
1703			break;
1704		case 'C':
1705			cb->count = optint;
1706			if (cb->count < 0) {
1707				log(LOG_ERR, "Invalid count %d\n",
1708					cb->count);
1709				ret = EINVAL;
1710			} else
1711				DEBUG_LOG(PFX "count %d\n", (int) cb->count);
1712			break;
1713		case 'v':
1714			cb->verbose++;
1715			DEBUG_LOG(PFX "verbose\n");
1716			break;
1717		case 'V':
1718			cb->validate++;
1719			DEBUG_LOG(PFX "validate data\n");
1720			break;
1721		case 'L':
1722			cb->rlat++;
1723			break;
1724		case 'l':
1725			cb->wlat++;
1726			break;
1727		case 'B':
1728			cb->bw++;
1729			break;
1730		case 't':
1731			cb->txdepth = optint;
1732			DEBUG_LOG(PFX "txdepth %d\n", cb->txdepth);
1733			break;
1734		case 'd':
1735			debug++;
1736			break;
1737		case 'm':
1738                        cb->memlimit = optint;
1739                        if (cb->memlimit < 1) {
1740                                log(LOG_ERR, "Invalid memory limit %ju\n",
1741				    cb->memlimit);
1742                                ret = EINVAL;
1743                        } else
1744                                DEBUG_LOG(PFX "memory limit %d\n", (int)optint);
1745                        break;
1746		default:
1747			log(LOG_ERR, "unknown opt %s\n", optarg);
1748			ret = EINVAL;
1749			break;
1750		}
1751	}
1752	if (ret)
1753		goto out;
1754
1755	if (cb->server == -1) {
1756		log(LOG_ERR, "must be either client or server\n");
1757		ret = EINVAL;
1758		goto out;
1759	}
1760	if ((cb->bw + cb->rlat + cb->wlat) > 1) {
1761		log(LOG_ERR, "Pick only one test: bw, rlat, wlat\n");
1762		ret = EINVAL;
1763		goto out;
1764	}
1765
1766
1767	cb->cm_id = rdma_create_id(krping_cma_event_handler, cb, RDMA_PS_TCP);
1768	if (IS_ERR(cb->cm_id)) {
1769		ret = PTR_ERR(cb->cm_id);
1770		log(LOG_ERR, "rdma_create_id error %d\n", ret);
1771		goto out;
1772	}
1773	DEBUG_LOG(PFX "created cm_id %p\n", cb->cm_id);
1774	if (cb->server)
1775		krping_run_server(cb);
1776	else
1777		krping_run_client(cb);
1778	DEBUG_LOG(PFX "destroy cm_id %p\n", cb->cm_id);
1779	rdma_destroy_id(cb->cm_id);
1780out:
1781	mtx_lock(&krping_mutex);
1782	TAILQ_REMOVE(&krping_cbs, cb, list);
1783	mtx_unlock(&krping_mutex);
1784	free(cb, M_DEVBUF);
1785	return ret;
1786}
1787
1788void krping_init(void)
1789{
1790	mtx_init(&krping_mutex, "krping lock", NULL, MTX_DEF);
1791	TAILQ_INIT(&krping_cbs);
1792}
1793