1/*
2 * Copyright (c) 2011-2012 Intel Corporation.  All rights reserved.
3 * Copyright (c) 2014-2015 Mellanox Technologies LTD. All rights reserved.
4 *
5 * This software is available to you under the OpenIB.org BSD license
6 * below:
7 *
8 *     Redistribution and use in source and binary forms, with or
9 *     without modification, are permitted provided that the following
10 *     conditions are met:
11 *
12 *      - Redistributions of source code must retain the above
13 *        copyright notice, this list of conditions and the following
14 *        disclaimer.
15 *
16 *      - Redistributions in binary form must reproduce the above
17 *        copyright notice, this list of conditions and the following
18 *        disclaimer in the documentation and/or other materials
19 *        provided with the distribution.
20 *
21 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
22 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
23 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
24 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
25 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
26 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
27 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
28 * SOFTWARE.
29 */
30
31#include <stdio.h>
32#include <stdlib.h>
33#include <string.h>
34#include <strings.h>
35#include <errno.h>
36#include <getopt.h>
37#include <sys/types.h>
38#include <sys/socket.h>
39#include <sys/time.h>
40#include <sys/wait.h>
41#include <netdb.h>
42#include <fcntl.h>
43#include <unistd.h>
44#include <netinet/tcp.h>
45
46#include <rdma/rdma_cma.h>
47#include <rdma/rsocket.h>
48#include <util/compiler.h>
49#include "common.h"
50
/*
 * One row of the standard test matrix: a transfer size in bytes and the
 * lowest size_option level at which that size is run.
 */
struct test_size_param {
	int size;
	int option;
};

/*
 * Sizes start at 64 bytes and then visit every power of two from 128 bytes
 * to 4 MiB, each followed by the midpoint to the next power of two
 * (3 << n is the same value as (1 << (n + 1)) + (1 << n)).
 * Rows with option 0 are always run; rows with option 1 are skipped unless
 * size_option is at least 1.
 */
static struct test_size_param test_size[] = {
	{ 1 <<  6, 0 },
	{ 1 <<  7, 1 },
	{ 3 <<  6, 1 },
	{ 1 <<  8, 1 },
	{ 3 <<  7, 1 },
	{ 1 <<  9, 1 },
	{ 3 <<  8, 1 },
	{ 1 << 10, 1 },
	{ 3 <<  9, 1 },
	{ 1 << 11, 1 },
	{ 3 << 10, 1 },
	{ 1 << 12, 0 },
	{ 3 << 11, 1 },
	{ 1 << 13, 1 },
	{ 3 << 12, 1 },
	{ 1 << 14, 1 },
	{ 3 << 13, 1 },
	{ 1 << 15, 1 },
	{ 3 << 14, 1 },
	{ 1 << 16, 0 },
	{ 3 << 15, 1 },
	{ 1 << 17, 1 },
	{ 3 << 16, 1 },
	{ 1 << 18, 1 },
	{ 3 << 17, 1 },
	{ 1 << 19, 1 },
	{ 3 << 18, 1 },
	{ 1 << 20, 0 },
	{ 3 << 19, 1 },
	{ 1 << 21, 1 },
	{ 3 << 20, 1 },
	{ 1 << 22, 1 },
	{ 3 << 21, 1 },
};

/* Number of rows in test_size[] */
#define TEST_CNT (sizeof(test_size) / sizeof(test_size[0]))
76
77static int rs, lrs;
78static int use_async;
79static int use_rgai;
80static int verify;
81static int flags = MSG_DONTWAIT;
82static int poll_timeout = 0;
83static int custom;
84static int use_fork;
85static pid_t fork_pid;
86static enum rs_optimization optimization;
87static int size_option;
88static int iterations = 1;
89static int transfer_size = 1000;
90static int transfer_count = 1000;
91static int buffer_size, inline_size = 64;
92static char test_name[10] = "custom";
93static const char *port = "7471";
94static int keepalive;
95static char *dst_addr;
96static char *src_addr;
97static struct timeval start, end;
98static void *buf;
99static struct rdma_addrinfo rai_hints;
100static struct addrinfo ai_hints;
101
102static void show_perf(void)
103{
104	char str[32];
105	float usec;
106	long long bytes;
107
108	usec = (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_usec - start.tv_usec);
109	bytes = (long long) iterations * transfer_count * transfer_size * 2;
110
111	/* name size transfers iterations bytes seconds Gb/sec usec/xfer */
112	printf("%-10s", test_name);
113	size_str(str, sizeof str, transfer_size);
114	printf("%-8s", str);
115	cnt_str(str, sizeof str, transfer_count);
116	printf("%-8s", str);
117	cnt_str(str, sizeof str, iterations);
118	printf("%-8s", str);
119	size_str(str, sizeof str, bytes);
120	printf("%-8s", str);
121	printf("%8.2fs%10.2f%11.2f\n",
122		usec / 1000000., (bytes * 8) / (1000. * usec),
123		(usec / iterations) / (transfer_count * 2));
124}
125
126static void init_latency_test(int size)
127{
128	char sstr[5];
129
130	size_str(sstr, sizeof sstr, size);
131	snprintf(test_name, sizeof test_name, "%s_lat", sstr);
132	transfer_count = 1;
133	transfer_size = size;
134	iterations = size_to_count(transfer_size);
135}
136
137static void init_bandwidth_test(int size)
138{
139	char sstr[5];
140
141	size_str(sstr, sizeof sstr, size);
142	snprintf(test_name, sizeof test_name, "%s_bw", sstr);
143	iterations = 1;
144	transfer_size = size;
145	transfer_count = size_to_count(transfer_size);
146}
147
148static int send_xfer(int size)
149{
150	struct pollfd fds;
151	int offset, ret;
152
153	if (verify)
154		format_buf(buf, size);
155
156	if (use_async) {
157		fds.fd = rs;
158		fds.events = POLLOUT;
159	}
160
161	for (offset = 0; offset < size; ) {
162		if (use_async) {
163			ret = do_poll(&fds, poll_timeout);
164			if (ret)
165				return ret;
166		}
167
168		ret = rs_send(rs, buf + offset, size - offset, flags);
169		if (ret > 0) {
170			offset += ret;
171		} else if (errno != EWOULDBLOCK && errno != EAGAIN) {
172			perror("rsend");
173			return ret;
174		}
175	}
176
177	return 0;
178}
179
180static int recv_xfer(int size)
181{
182	struct pollfd fds;
183	int offset, ret;
184
185	if (use_async) {
186		fds.fd = rs;
187		fds.events = POLLIN;
188	}
189
190	for (offset = 0; offset < size; ) {
191		if (use_async) {
192			ret = do_poll(&fds, poll_timeout);
193			if (ret)
194				return ret;
195		}
196
197		ret = rs_recv(rs, buf + offset, size - offset, flags);
198		if (ret > 0) {
199			offset += ret;
200		} else if (errno != EWOULDBLOCK && errno != EAGAIN) {
201			perror("rrecv");
202			return ret;
203		}
204	}
205
206	if (verify) {
207		ret = verify_buf(buf, size);
208		if (ret)
209			return ret;
210	}
211
212	return 0;
213}
214
215static int sync_test(void)
216{
217	int ret;
218
219	ret = dst_addr ? send_xfer(16) : recv_xfer(16);
220	if (ret)
221		return ret;
222
223	return dst_addr ? recv_xfer(16) : send_xfer(16);
224}
225
226static int run_test(void)
227{
228	int ret, i, t;
229
230	ret = sync_test();
231	if (ret)
232		goto out;
233
234	gettimeofday(&start, NULL);
235	for (i = 0; i < iterations; i++) {
236		for (t = 0; t < transfer_count; t++) {
237			ret = dst_addr ? send_xfer(transfer_size) :
238					 recv_xfer(transfer_size);
239			if (ret)
240				goto out;
241		}
242
243		for (t = 0; t < transfer_count; t++) {
244			ret = dst_addr ? recv_xfer(transfer_size) :
245					 send_xfer(transfer_size);
246			if (ret)
247				goto out;
248		}
249	}
250	gettimeofday(&end, NULL);
251	show_perf();
252	ret = 0;
253
254out:
255	return ret;
256}
257
258static void set_keepalive(int fd)
259{
260	int optval;
261	socklen_t optlen = sizeof(optlen);
262
263	optval = 1;
264	if (rs_setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &optval, optlen)) {
265		perror("rsetsockopt SO_KEEPALIVE");
266		return;
267	}
268
269	optval = keepalive;
270	if (rs_setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &optval, optlen))
271		perror("rsetsockopt TCP_KEEPIDLE");
272
273	if (!(rs_getsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &optval, &optlen)))
274		printf("Keepalive: %s\n", (optval ? "ON" : "OFF"));
275
276	if (!(rs_getsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &optval, &optlen)))
277		printf("  time: %i\n", optval);
278}
279
280static void set_options(int fd)
281{
282	int val;
283
284	if (buffer_size) {
285		rs_setsockopt(fd, SOL_SOCKET, SO_SNDBUF, (void *) &buffer_size,
286			      sizeof buffer_size);
287		rs_setsockopt(fd, SOL_SOCKET, SO_RCVBUF, (void *) &buffer_size,
288			      sizeof buffer_size);
289	} else {
290		val = 1 << 19;
291		rs_setsockopt(fd, SOL_SOCKET, SO_SNDBUF, (void *) &val, sizeof val);
292		rs_setsockopt(fd, SOL_SOCKET, SO_RCVBUF, (void *) &val, sizeof val);
293	}
294
295	val = 1;
296	rs_setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, (void *) &val, sizeof(val));
297
298	if (flags & MSG_DONTWAIT)
299		rs_fcntl(fd, F_SETFL, O_NONBLOCK);
300
301	if (use_rs) {
302		/* Inline size based on experimental data */
303		if (optimization == opt_latency) {
304			rs_setsockopt(fd, SOL_RDMA, RDMA_INLINE, &inline_size,
305				      sizeof inline_size);
306		} else if (optimization == opt_bandwidth) {
307			val = 0;
308			rs_setsockopt(fd, SOL_RDMA, RDMA_INLINE, &val, sizeof val);
309		}
310	}
311
312	if (keepalive)
313		set_keepalive(fd);
314}
315
/*
 * Create the listening socket lrs: resolve the local address (via
 * rdma_getaddrinfo() or getaddrinfo(), depending on use_rgai), create the
 * socket, enable SO_REUSEADDR, bind and listen with a backlog of 1.
 *
 * Returns 0 on success.  On failure the socket, if created, is closed and
 * the failing call's result is returned.  The resolved address list is
 * freed on every path that got past the lookup.
 */
static int server_listen(void)
{
	struct rdma_addrinfo *rai = NULL;
	struct addrinfo *ai;
	int reuse = 1;
	int ret;

	if (use_rgai) {
		rai_hints.ai_flags |= RAI_PASSIVE;
		ret = rdma_getaddrinfo(src_addr, port, &rai_hints, &rai);
	} else {
		ai_hints.ai_flags |= AI_PASSIVE;
		ret = getaddrinfo(src_addr, port, &ai_hints, &ai);
	}
	if (ret) {
		printf("getaddrinfo: %s\n", gai_strerror(ret));
		return ret;
	}

	if (rai)
		lrs = rs_socket(rai->ai_family, SOCK_STREAM, 0);
	else
		lrs = rs_socket(ai->ai_family, SOCK_STREAM, 0);
	if (lrs < 0) {
		perror("rsocket");
		ret = lrs;
		goto out_free;
	}

	ret = rs_setsockopt(lrs, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof reuse);
	if (ret) {
		perror("rsetsockopt SO_REUSEADDR");
		goto out_close;
	}

	if (rai)
		ret = rs_bind(lrs, rai->ai_src_addr, rai->ai_src_len);
	else
		ret = rs_bind(lrs, ai->ai_addr, ai->ai_addrlen);
	if (ret) {
		perror("rbind");
		goto out_close;
	}

	ret = rs_listen(lrs, 1);
	if (ret)
		perror("rlisten");

out_close:
	/* also reached on success; only tear the socket down on failure */
	if (ret)
		rs_close(lrs);
out_free:
	if (rai)
		rdma_freeaddrinfo(rai);
	else
		freeaddrinfo(ai);
	return ret;
}
370
371static int server_connect(void)
372{
373	struct pollfd fds;
374	int ret = 0;
375
376	set_options(lrs);
377	do {
378		if (use_async) {
379			fds.fd = lrs;
380			fds.events = POLLIN;
381
382			ret = do_poll(&fds, poll_timeout);
383			if (ret) {
384				perror("rpoll");
385				return ret;
386			}
387		}
388
389		rs = rs_accept(lrs, NULL, NULL);
390	} while (rs < 0 && (errno == EAGAIN || errno == EWOULDBLOCK));
391	if (rs < 0) {
392		perror("raccept");
393		return rs;
394	}
395
396	if (use_fork)
397		fork_pid = fork();
398	if (!fork_pid)
399		set_options(rs);
400	return ret;
401}
402
/*
 * Connect rs to the server at dst_addr:port.
 *
 * Resolves the destination (and, when src_addr is set, the local bind
 * address) with rdma_getaddrinfo() or getaddrinfo() depending on use_rgai,
 * creates and configures the socket, optionally binds it and applies the
 * resolved RDMA route, then connects.  A connect that reports EINPROGRESS
 * is completed by polling for POLLOUT and checking SO_ERROR.
 *
 * Returns 0 on success and non-zero on failure.  On failure the socket, if
 * created, is closed.  Every address list obtained here is released before
 * returning.
 */
static int client_connect(void)
{
	struct rdma_addrinfo *rai = NULL, *rai_src = NULL;
	struct addrinfo *ai = NULL, *ai_src = NULL;
	struct pollfd fds;
	int ret, err;
	socklen_t len;

	ret = use_rgai ? rdma_getaddrinfo(dst_addr, port, &rai_hints, &rai) :
			 getaddrinfo(dst_addr, port, &ai_hints, &ai);

	if (ret) {
		printf("getaddrinfo: %s\n", gai_strerror(ret));
		return ret;
	}

	if (src_addr) {
		if (use_rgai) {
			rai_hints.ai_flags |= RAI_PASSIVE;
			ret = rdma_getaddrinfo(src_addr, port, &rai_hints, &rai_src);
		} else {
			ai_hints.ai_flags |= AI_PASSIVE;
			ret = getaddrinfo(src_addr, port, &ai_hints, &ai_src);
		}
		if (ret) {
			printf("getaddrinfo src_addr: %s\n", gai_strerror(ret));
			/* the destination list is already allocated: release it */
			goto free;
		}
	}

	rs = rai ? rs_socket(rai->ai_family, SOCK_STREAM, 0) :
		   rs_socket(ai->ai_family, SOCK_STREAM, 0);
	if (rs < 0) {
		perror("rsocket");
		ret = rs;
		goto free;
	}

	set_options(rs);

	if (src_addr) {
		ret = rai ? rs_bind(rs, rai_src->ai_src_addr, rai_src->ai_src_len) :
			    rs_bind(rs, ai_src->ai_addr, ai_src->ai_addrlen);
		if (ret) {
			perror("rbind");
			goto close;
		}
	}

	if (rai && rai->ai_route) {
		ret = rs_setsockopt(rs, SOL_RDMA, RDMA_ROUTE, rai->ai_route,
				    rai->ai_route_len);
		if (ret) {
			perror("rsetsockopt RDMA_ROUTE");
			goto close;
		}
	}

	ret = rai ? rs_connect(rs, rai->ai_dst_addr, rai->ai_dst_len) :
		    rs_connect(rs, ai->ai_addr, ai->ai_addrlen);
	if (ret && (errno != EINPROGRESS)) {
		perror("rconnect");
		goto close;
	}

	if (ret && (errno == EINPROGRESS)) {
		/* non-blocking connect: wait for completion, then fetch its status */
		fds.fd = rs;
		fds.events = POLLOUT;
		ret = do_poll(&fds, poll_timeout);
		if (ret) {
			perror("rpoll");
			goto close;
		}

		len = sizeof err;
		ret = rs_getsockopt(rs, SOL_SOCKET, SO_ERROR, &err, &len);
		if (ret) {
			perror("rgetsockopt SO_ERROR");
			goto close;
		}
		if (err) {
			ret = -1;
			errno = err;
			perror("async rconnect");
		}
	}

close:
	/* also reached on success; only tear the socket down on failure */
	if (ret)
		rs_close(rs);
free:
	if (rai_src)
		rdma_freeaddrinfo(rai_src);
	if (ai_src)
		freeaddrinfo(ai_src);
	if (rai)
		rdma_freeaddrinfo(rai);
	else if (ai)
		freeaddrinfo(ai);
	return ret;
}
498
499static int run(void)
500{
501	int i, ret = 0;
502
503	buf = malloc(!custom ? test_size[TEST_CNT - 1].size : transfer_size);
504	if (!buf) {
505		perror("malloc");
506		return -1;
507	}
508
509	if (!dst_addr) {
510		ret = server_listen();
511		if (ret)
512			goto free;
513	}
514
515	printf("%-10s%-8s%-8s%-8s%-8s%8s %10s%13s\n",
516	       "name", "bytes", "xfers", "iters", "total", "time", "Gb/sec", "usec/xfer");
517	if (!custom) {
518		optimization = opt_latency;
519		ret = dst_addr ? client_connect() : server_connect();
520		if (ret)
521			goto free;
522
523		for (i = 0; i < TEST_CNT && !fork_pid; i++) {
524			if (test_size[i].option > size_option)
525				continue;
526			init_latency_test(test_size[i].size);
527			run_test();
528		}
529		if (fork_pid)
530			waitpid(fork_pid, NULL, 0);
531		else
532			rs_shutdown(rs, SHUT_RDWR);
533		rs_close(rs);
534
535		if (!dst_addr && use_fork && !fork_pid)
536			goto free;
537
538		optimization = opt_bandwidth;
539		ret = dst_addr ? client_connect() : server_connect();
540		if (ret)
541			goto free;
542		for (i = 0; i < TEST_CNT && !fork_pid; i++) {
543			if (test_size[i].option > size_option)
544				continue;
545			init_bandwidth_test(test_size[i].size);
546			run_test();
547		}
548	} else {
549		ret = dst_addr ? client_connect() : server_connect();
550		if (ret)
551			goto free;
552
553		if (!fork_pid)
554			ret = run_test();
555	}
556
557	if (fork_pid)
558		waitpid(fork_pid, NULL, 0);
559	else
560		rs_shutdown(rs, SHUT_RDWR);
561	rs_close(rs);
562free:
563	free(buf);
564	return ret;
565}
566
567static int set_test_opt(const char *arg)
568{
569	if (strlen(arg) == 1) {
570		switch (arg[0]) {
571		case 's':
572			use_rs = 0;
573			break;
574		case 'a':
575			use_async = 1;
576			break;
577		case 'b':
578			flags = (flags & ~MSG_DONTWAIT) | MSG_WAITALL;
579			break;
580		case 'f':
581			use_fork = 1;
582			use_rs = 0;
583			break;
584		case 'n':
585			flags |= MSG_DONTWAIT;
586			break;
587		case 'r':
588			use_rgai = 1;
589			break;
590		case 'v':
591			verify = 1;
592			break;
593		default:
594			return -1;
595		}
596	} else {
597		if (!strncasecmp("socket", arg, 6)) {
598			use_rs = 0;
599		} else if (!strncasecmp("async", arg, 5)) {
600			use_async = 1;
601		} else if (!strncasecmp("block", arg, 5)) {
602			flags = (flags & ~MSG_DONTWAIT) | MSG_WAITALL;
603		} else if (!strncasecmp("nonblock", arg, 8)) {
604			flags |= MSG_DONTWAIT;
605		} else if (!strncasecmp("resolve", arg, 7)) {
606			use_rgai = 1;
607		} else if (!strncasecmp("verify", arg, 6)) {
608			verify = 1;
609		} else if (!strncasecmp("fork", arg, 4)) {
610			use_fork = 1;
611			use_rs = 0;
612		} else {
613			return -1;
614		}
615	}
616	return 0;
617}
618
619int main(int argc, char **argv)
620{
621	int op, ret;
622
623	ai_hints.ai_socktype = SOCK_STREAM;
624	rai_hints.ai_port_space = RDMA_PS_TCP;
625	while ((op = getopt(argc, argv, "s:b:f:B:i:I:C:S:p:k:T:")) != -1) {
626		switch (op) {
627		case 's':
628			dst_addr = optarg;
629			break;
630		case 'b':
631			src_addr = optarg;
632			break;
633		case 'f':
634			if (!strncasecmp("ip", optarg, 2)) {
635				ai_hints.ai_flags = AI_NUMERICHOST;
636			} else if (!strncasecmp("gid", optarg, 3)) {
637				rai_hints.ai_flags = RAI_NUMERICHOST | RAI_FAMILY;
638				rai_hints.ai_family = AF_IB;
639				use_rgai = 1;
640			} else {
641				fprintf(stderr, "Warning: unknown address format\n");
642			}
643			break;
644		case 'B':
645			buffer_size = atoi(optarg);
646			break;
647		case 'i':
648			inline_size = atoi(optarg);
649			break;
650		case 'I':
651			custom = 1;
652			iterations = atoi(optarg);
653			break;
654		case 'C':
655			custom = 1;
656			transfer_count = atoi(optarg);
657			break;
658		case 'S':
659			if (!strncasecmp("all", optarg, 3)) {
660				size_option = 1;
661			} else {
662				custom = 1;
663				transfer_size = atoi(optarg);
664			}
665			break;
666		case 'p':
667			port = optarg;
668			break;
669		case 'k':
670			keepalive = atoi(optarg);
671			break;
672		case 'T':
673			if (!set_test_opt(optarg))
674				break;
675			/* invalid option - fall through */
676			SWITCH_FALLTHROUGH;
677		default:
678			printf("usage: %s\n", argv[0]);
679			printf("\t[-s server_address]\n");
680			printf("\t[-b bind_address]\n");
681			printf("\t[-f address_format]\n");
682			printf("\t    name, ip, ipv6, or gid\n");
683			printf("\t[-B buffer_size]\n");
684			printf("\t[-i inline_size]\n");
685			printf("\t[-I iterations]\n");
686			printf("\t[-C transfer_count]\n");
687			printf("\t[-S transfer_size or all]\n");
688			printf("\t[-p port_number]\n");
689			printf("\t[-k keepalive_time]\n");
690			printf("\t[-T test_option]\n");
691			printf("\t    s|sockets - use standard tcp/ip sockets\n");
692			printf("\t    a|async - asynchronous operation (use poll)\n");
693			printf("\t    b|blocking - use blocking calls\n");
694			printf("\t    f|fork - fork server processing\n");
695			printf("\t    n|nonblocking - use nonblocking calls\n");
696			printf("\t    r|resolve - use rdma cm to resolve address\n");
697			printf("\t    v|verify - verify data\n");
698			exit(1);
699		}
700	}
701
702	if (!(flags & MSG_DONTWAIT))
703		poll_timeout = -1;
704
705	ret = run();
706	return ret;
707}
708