/* mlx5.c revision 326169 */
1/*
2 * Copyright (c) 2012 Mellanox Technologies, Inc.  All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses.  You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 *     Redistribution and use in source and binary forms, with or
11 *     without modification, are permitted provided that the following
12 *     conditions are met:
13 *
14 *      - Redistributions of source code must retain the above
15 *        copyright notice, this list of conditions and the following
16 *        disclaimer.
17 *
18 *      - Redistributions in binary form must reproduce the above
19 *        copyright notice, this list of conditions and the following
20 *        disclaimer in the documentation and/or other materials
21 *        provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 */
32#define _GNU_SOURCE
33#include <config.h>
34
35#include <stdio.h>
36#include <stdlib.h>
37#include <unistd.h>
38#include <errno.h>
39#include <sys/mman.h>
40#include <pthread.h>
41#include <string.h>
42#include <sched.h>
43#include <sys/param.h>
44#include <sys/cpuset.h>
45
46#include "mlx5.h"
47#include "mlx5-abi.h"
48
#ifndef PCI_VENDOR_ID_MELLANOX
#define PCI_VENDOR_ID_MELLANOX			0x15b3
#endif

/*
 * Fallbacks for platforms whose <sys/cpuset.h> does not provide these
 * macros.  NOTE: this file invokes CPU_OR() with two arguments
 * (dst, src), so the no-op fallback must take two parameters as well;
 * a three-parameter form would fail to compile at every use site.
 */
#ifndef CPU_OR
#define CPU_OR(x, y) do {} while (0)
#endif

/* Degenerate fallback: treat any two sets as equal, which keeps the
 * Sandy Bridge stalling workaround disabled. */
#ifndef CPU_EQUAL
#define CPU_EQUAL(x, y) 1
#endif
61
/* Build one PCI ID table entry for vendor v / device d. */
#define HCA(v, d) \
	{ .vendor = PCI_VENDOR_ID_##v,			\
	  .device = d }

/* PCI device IDs (decimal) of every adapter this provider drives;
 * consulted by mlx5_driver_init() when probing sysfs devices. */
static struct {
	unsigned		vendor;
	unsigned		device;
} hca_table[] = {
	HCA(MELLANOX, 4113),	/* MT4113 Connect-IB */
	HCA(MELLANOX, 4114),	/* Connect-IB Virtual Function */
	HCA(MELLANOX, 4115),	/* ConnectX-4 */
	HCA(MELLANOX, 4116),	/* ConnectX-4 Virtual Function */
	HCA(MELLANOX, 4117),	/* ConnectX-4LX */
	HCA(MELLANOX, 4118),	/* ConnectX-4LX Virtual Function */
	HCA(MELLANOX, 4119),	/* ConnectX-5, PCIe 3.0 */
	HCA(MELLANOX, 4120),	/* ConnectX-5 Virtual Function */
	HCA(MELLANOX, 4121),    /* ConnectX-5 Ex */
	HCA(MELLANOX, 4122),	/* ConnectX-5 Ex VF */
	HCA(MELLANOX, 4123),    /* ConnectX-6 */
	HCA(MELLANOX, 4124),	/* ConnectX-6 VF */
	HCA(MELLANOX, 41682),	/* BlueField integrated ConnectX-5 network controller */
	HCA(MELLANOX, 41683),	/* BlueField integrated ConnectX-5 network controller VF */
};
85
/* Debug-category bitmask, loaded from MLX5_DEBUG_MASK in set_debug_mask();
 * zero disables debug output. */
uint32_t mlx5_debug_mask = 0;
/* Loaded from MLX5_FREEZE_ON_ERROR_CQE in set_freeze_on_error();
 * non-zero requests freezing when an error CQE is seen. */
int mlx5_freeze_on_error_cqe;
88
/*
 * Dispatch table for the core verbs operations.  Deliberately not
 * const: mlx5_init_context() patches .poll_cq to mlx5_poll_cq_v1 when
 * the kernel negotiates CQE version 1, before copying the table into
 * each new context.
 */
static struct ibv_context_ops mlx5_ctx_ops = {
	.query_device  = mlx5_query_device,
	.query_port    = mlx5_query_port,
	.alloc_pd      = mlx5_alloc_pd,
	.dealloc_pd    = mlx5_free_pd,
	.reg_mr	       = mlx5_reg_mr,
	.rereg_mr      = mlx5_rereg_mr,
	.dereg_mr      = mlx5_dereg_mr,
	.alloc_mw      = mlx5_alloc_mw,
	.dealloc_mw    = mlx5_dealloc_mw,
	.bind_mw       = mlx5_bind_mw,
	.create_cq     = mlx5_create_cq,
	.poll_cq       = mlx5_poll_cq,
	.req_notify_cq = mlx5_arm_cq,
	.cq_event      = mlx5_cq_event,
	.resize_cq     = mlx5_resize_cq,
	.destroy_cq    = mlx5_destroy_cq,
	.create_srq    = mlx5_create_srq,
	.modify_srq    = mlx5_modify_srq,
	.query_srq     = mlx5_query_srq,
	.destroy_srq   = mlx5_destroy_srq,
	.post_srq_recv = mlx5_post_srq_recv,
	.create_qp     = mlx5_create_qp,
	.query_qp      = mlx5_query_qp,
	.modify_qp     = mlx5_modify_qp,
	.destroy_qp    = mlx5_destroy_qp,
	.post_send     = mlx5_post_send,
	.post_recv     = mlx5_post_recv,
	.create_ah     = mlx5_create_ah,
	.destroy_ah    = mlx5_destroy_ah,
	.attach_mcast  = mlx5_attach_mcast,
	.detach_mcast  = mlx5_detach_mcast
};
122
/*
 * Parse the integer that follows the first ':' in a /proc/cpuinfo
 * style "key\t: value" line.
 *
 * Returns 0 on success with the value stored in *value; returns 1
 * when the line contains no ':' separator.
 */
static int read_number_from_line(const char *line, int *value)
{
	const char *ptr;

	ptr = strchr(line, ':');
	if (!ptr)
		return 1;

	/* strtol skips leading whitespace like atoi, but, unlike atoi,
	 * has no undefined behavior on out-of-range input. */
	*value = (int)strtol(ptr + 1, NULL, 10);
	return 0;
}
136/**
137 * The function looks for the first free user-index in all the
138 * user-index tables. If all are used, returns -1, otherwise
139 * a valid user-index.
140 * In case the reference count of the table is zero, it means the
141 * table is not in use and wasn't allocated yet, therefore the
142 * mlx5_store_uidx allocates the table, and increment the reference
143 * count on the table.
144 */
145static int32_t get_free_uidx(struct mlx5_context *ctx)
146{
147	int32_t tind;
148	int32_t i;
149
150	for (tind = 0; tind < MLX5_UIDX_TABLE_SIZE; tind++) {
151		if (ctx->uidx_table[tind].refcnt < MLX5_UIDX_TABLE_MASK)
152			break;
153	}
154
155	if (tind == MLX5_UIDX_TABLE_SIZE)
156		return -1;
157
158	if (!ctx->uidx_table[tind].refcnt)
159		return tind << MLX5_UIDX_TABLE_SHIFT;
160
161	for (i = 0; i < MLX5_UIDX_TABLE_MASK + 1; i++) {
162		if (!ctx->uidx_table[tind].table[i])
163			break;
164	}
165
166	return (tind << MLX5_UIDX_TABLE_SHIFT) | i;
167}
168
169int32_t mlx5_store_uidx(struct mlx5_context *ctx, void *rsc)
170{
171	int32_t tind;
172	int32_t ret = -1;
173	int32_t uidx;
174
175	pthread_mutex_lock(&ctx->uidx_table_mutex);
176	uidx = get_free_uidx(ctx);
177	if (uidx < 0)
178		goto out;
179
180	tind = uidx >> MLX5_UIDX_TABLE_SHIFT;
181
182	if (!ctx->uidx_table[tind].refcnt) {
183		ctx->uidx_table[tind].table = calloc(MLX5_UIDX_TABLE_MASK + 1,
184						     sizeof(struct mlx5_resource *));
185		if (!ctx->uidx_table[tind].table)
186			goto out;
187	}
188
189	++ctx->uidx_table[tind].refcnt;
190	ctx->uidx_table[tind].table[uidx & MLX5_UIDX_TABLE_MASK] = rsc;
191	ret = uidx;
192
193out:
194	pthread_mutex_unlock(&ctx->uidx_table_mutex);
195	return ret;
196}
197
198void mlx5_clear_uidx(struct mlx5_context *ctx, uint32_t uidx)
199{
200	int tind = uidx >> MLX5_UIDX_TABLE_SHIFT;
201
202	pthread_mutex_lock(&ctx->uidx_table_mutex);
203
204	if (!--ctx->uidx_table[tind].refcnt)
205		free(ctx->uidx_table[tind].table);
206	else
207		ctx->uidx_table[tind].table[uidx & MLX5_UIDX_TABLE_MASK] = NULL;
208
209	pthread_mutex_unlock(&ctx->uidx_table_mutex);
210}
211
212static int mlx5_is_sandy_bridge(int *num_cores)
213{
214	char line[128];
215	FILE *fd;
216	int rc = 0;
217	int cur_cpu_family = -1;
218	int cur_cpu_model = -1;
219
220	fd = fopen("/proc/cpuinfo", "r");
221	if (!fd)
222		return 0;
223
224	*num_cores = 0;
225
226	while (fgets(line, 128, fd)) {
227		int value;
228
229		/* if this is information on new processor */
230		if (!strncmp(line, "processor", 9)) {
231			++*num_cores;
232
233			cur_cpu_family = -1;
234			cur_cpu_model  = -1;
235		} else if (!strncmp(line, "cpu family", 10)) {
236			if ((cur_cpu_family < 0) && (!read_number_from_line(line, &value)))
237				cur_cpu_family = value;
238		} else if (!strncmp(line, "model", 5)) {
239			if ((cur_cpu_model < 0) && (!read_number_from_line(line, &value)))
240				cur_cpu_model = value;
241		}
242
243		/* if this is a Sandy Bridge CPU */
244		if ((cur_cpu_family == 6) &&
245		    (cur_cpu_model == 0x2A || (cur_cpu_model == 0x2D) ))
246			rc = 1;
247	}
248
249	fclose(fd);
250	return rc;
251}
252
253/*
254man cpuset
255
256  This format displays each 32-bit word in hexadecimal (using ASCII characters "0" - "9" and "a" - "f"); words
257  are filled with leading zeros, if required. For masks longer than one word, a comma separator is used between
258  words. Words are displayed in big-endian order, which has the most significant bit first. The hex digits
259  within a word are also in big-endian order.
260
261  The number of 32-bit words displayed is the minimum number needed to display all bits of the bitmask, based on
262  the size of the bitmask.
263
264  Examples of the Mask Format:
265
266     00000001                        # just bit 0 set
267     40000000,00000000,00000000      # just bit 94 set
268     000000ff,00000000               # bits 32-39 set
269     00000000,000E3862               # 1,5,6,11-13,17-19 set
270
271  A mask with bits 0, 1, 2, 4, 8, 16, 32, and 64 set displays as:
272
273     00000001,00000001,00010117
274
275  The first "1" is for bit 64, the second for bit 32, the third for bit 16, the fourth for bit 8, the fifth for
276  bit 4, and the "7" is for bits 2, 1, and 0.
277*/
278static void mlx5_local_cpu_set(struct ibv_device *ibdev, cpuset_t *cpu_set)
279{
280	char *p, buf[1024];
281	char *env_value;
282	uint32_t word;
283	int i, k;
284
285	env_value = getenv("MLX5_LOCAL_CPUS");
286	if (env_value)
287		strncpy(buf, env_value, sizeof(buf));
288	else {
289		char fname[MAXPATHLEN];
290
291		snprintf(fname, MAXPATHLEN, "/sys/class/infiniband/%s",
292			 ibv_get_device_name(ibdev));
293
294		if (ibv_read_sysfs_file(fname, "device/local_cpus", buf, sizeof(buf))) {
295			fprintf(stderr, PFX "Warning: can not get local cpu set: failed to open %s\n", fname);
296			return;
297		}
298	}
299
300	p = strrchr(buf, ',');
301	if (!p)
302		p = buf;
303
304	i = 0;
305	do {
306		if (*p == ',') {
307			*p = 0;
308			p ++;
309		}
310
311		word = strtoul(p, NULL, 16);
312
313		for (k = 0; word; ++k, word >>= 1)
314			if (word & 1)
315				CPU_SET(k+i, cpu_set);
316
317		if (p == buf)
318			break;
319
320		p = strrchr(buf, ',');
321		if (!p)
322			p = buf;
323
324		i += 32;
325	} while (i < CPU_SETSIZE);
326}
327
/*
 * Decide whether to enable CQ-poll stalling (the Sandy Bridge
 * workaround).  Returns 1 (stall) on Sandy Bridge hosts, except when
 * the process' CPU affinity is fully contained in the device's local
 * CPU set, in which case stalling is unnecessary and 0 is returned.
 * Non-Sandy-Bridge hosts always get 0.
 */
static int mlx5_enable_sandy_bridge_fix(struct ibv_device *ibdev)
{
	cpuset_t my_cpus, dev_local_cpus, result_set;
	int stall_enable;
	int ret;
	int num_cores;

	if (!mlx5_is_sandy_bridge(&num_cores))
		return 0;

	/* by default enable stall on sandy bridge arch */
	stall_enable = 1;

	/*
	 * check if app is bound to cpu set that is inside
	 * of device local cpu set. Disable stalling if true
	 */

	/* use static cpu set - up to CPU_SETSIZE (1024) cpus/node */
	CPU_ZERO(&my_cpus);
	CPU_ZERO(&dev_local_cpus);
	CPU_ZERO(&result_set);
	/* FreeBSD cpuset API: affinity of the current process (pid -1). */
	ret = cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, -1,
	    sizeof(my_cpus), &my_cpus);
	if (ret == -1) {
		if (errno == EINVAL)
			fprintf(stderr, PFX "Warning: my cpu set is too small\n");
		else
			fprintf(stderr, PFX "Warning: failed to get my cpu set\n");
		goto out;
	}

	/* get device local cpu set */
	mlx5_local_cpu_set(ibdev, &dev_local_cpus);

	/* check if my cpu set is in dev cpu:
	 * result_set = my_cpus | dev_local_cpus equals dev_local_cpus
	 * exactly when my_cpus is a subset of it. */
	CPU_OR(&result_set, &my_cpus);
	CPU_OR(&result_set, &dev_local_cpus);
	stall_enable = CPU_EQUAL(&result_set, &dev_local_cpus) ? 0 : 1;

out:
	return stall_enable;
}
371
372static void mlx5_read_env(struct ibv_device *ibdev, struct mlx5_context *ctx)
373{
374	char *env_value;
375
376	env_value = getenv("MLX5_STALL_CQ_POLL");
377	if (env_value)
378		/* check if cq stall is enforced by user */
379		ctx->stall_enable = (strcmp(env_value, "0")) ? 1 : 0;
380	else
381		/* autodetect if we need to do cq polling */
382		ctx->stall_enable = mlx5_enable_sandy_bridge_fix(ibdev);
383
384	env_value = getenv("MLX5_STALL_NUM_LOOP");
385	if (env_value)
386		mlx5_stall_num_loop = atoi(env_value);
387
388	env_value = getenv("MLX5_STALL_CQ_POLL_MIN");
389	if (env_value)
390		mlx5_stall_cq_poll_min = atoi(env_value);
391
392	env_value = getenv("MLX5_STALL_CQ_POLL_MAX");
393	if (env_value)
394		mlx5_stall_cq_poll_max = atoi(env_value);
395
396	env_value = getenv("MLX5_STALL_CQ_INC_STEP");
397	if (env_value)
398		mlx5_stall_cq_inc_step = atoi(env_value);
399
400	env_value = getenv("MLX5_STALL_CQ_DEC_STEP");
401	if (env_value)
402		mlx5_stall_cq_dec_step = atoi(env_value);
403
404	ctx->stall_adaptive_enable = 0;
405	ctx->stall_cycles = 0;
406
407	if (mlx5_stall_num_loop < 0) {
408		ctx->stall_adaptive_enable = 1;
409		ctx->stall_cycles = mlx5_stall_cq_poll_min;
410	}
411
412}
413
414static int get_total_uuars(int page_size)
415{
416	int size = MLX5_DEF_TOT_UUARS;
417	int uuars_in_page;
418	char *env;
419
420	env = getenv("MLX5_TOTAL_UUARS");
421	if (env)
422		size = atoi(env);
423
424	if (size < 1)
425		return -EINVAL;
426
427	uuars_in_page = page_size / MLX5_ADAPTER_PAGE_SIZE * MLX5_NUM_NON_FP_BFREGS_PER_UAR;
428	size = max(uuars_in_page, size);
429	size = align(size, MLX5_NUM_NON_FP_BFREGS_PER_UAR);
430	if (size > MLX5_MAX_BFREGS)
431		return -ENOMEM;
432
433	return size;
434}
435
436static void open_debug_file(struct mlx5_context *ctx)
437{
438	char *env;
439
440	env = getenv("MLX5_DEBUG_FILE");
441	if (!env) {
442		ctx->dbg_fp = stderr;
443		return;
444	}
445
446	ctx->dbg_fp = fopen(env, "aw+");
447	if (!ctx->dbg_fp) {
448		fprintf(stderr, "Failed opening debug file %s, using stderr\n", env);
449		ctx->dbg_fp = stderr;
450		return;
451	}
452}
453
454static void close_debug_file(struct mlx5_context *ctx)
455{
456	if (ctx->dbg_fp && ctx->dbg_fp != stderr)
457		fclose(ctx->dbg_fp);
458}
459
460static void set_debug_mask(void)
461{
462	char *env;
463
464	env = getenv("MLX5_DEBUG_MASK");
465	if (env)
466		mlx5_debug_mask = strtol(env, NULL, 0);
467}
468
469static void set_freeze_on_error(void)
470{
471	char *env;
472
473	env = getenv("MLX5_FREEZE_ON_ERROR_CQE");
474	if (env)
475		mlx5_freeze_on_error_cqe = strtol(env, NULL, 0);
476}
477
/* Whether post-send should prefer BlueFlame (default yes); only an
 * explicit MLX5_POST_SEND_PREFER_BF="0" turns it off. */
static int get_always_bf(void)
{
	char *env = getenv("MLX5_POST_SEND_PREFER_BF");

	if (env == NULL)
		return 1;
	return strcmp(env, "0") != 0;
}
488
/* Whether BlueFlame should be disabled entirely (default no); any
 * MLX5_SHUT_UP_BF value other than "0" disables it. */
static int get_shut_up_bf(void)
{
	char *env = getenv("MLX5_SHUT_UP_BF");

	if (env == NULL)
		return 0;
	return strcmp(env, "0") != 0;
}
499
500static int get_num_low_lat_uuars(int tot_uuars)
501{
502	char *env;
503	int num = 4;
504
505	env = getenv("MLX5_NUM_LOW_LAT_UUARS");
506	if (env)
507		num = atoi(env);
508
509	if (num < 0)
510		return -EINVAL;
511
512	num = max(num, tot_uuars - MLX5_MED_BFREGS_TSHOLD);
513	return num;
514}
515
516/* The library allocates an array of uuar contexts. The one in index zero does
517 * not to execersize odd/even policy so it can avoid a lock but it may not use
518 * blue flame. The upper ones, low_lat_uuars can use blue flame with no lock
519 * since they are assigned to one QP only. The rest can use blue flame but since
520 * they are shared they need a lock
521 */
522static int need_uuar_lock(struct mlx5_context *ctx, int uuarn)
523{
524	if (uuarn == 0 || mlx5_single_threaded)
525		return 0;
526
527	if (uuarn >= (ctx->tot_uuars - ctx->low_lat_uuars) * 2)
528		return 0;
529
530	return 1;
531}
532
/* Whether the application declared itself single-threaded via
 * MLX5_SINGLE_THREADED="1" (exact match required). */
static int single_threaded_app(void)
{
	char *env = getenv("MLX5_SINGLE_THREADED");

	return env != NULL && strcmp(env, "1") == 0;
}
544
/*
 * Issue the ALLOC_UCONTEXT command, retrying with progressively
 * shorter (older) request layouts so that a new library keeps working
 * against kernels that reject request lengths they do not recognize.
 * Returns 0 on success, non-zero on failure of all three attempts.
 */
static int mlx5_cmd_get_context(struct mlx5_context *context,
				struct mlx5_alloc_ucontext *req,
				size_t req_len,
				struct mlx5_alloc_ucontext_resp *resp,
				size_t resp_len)
{
	/* First attempt: the full, current request layout. */
	if (!ibv_cmd_get_context(&context->ibv_ctx, &req->ibv_req,
				 req_len, &resp->ibv_resp, resp_len))
		return 0;

	/* The ibv_cmd_get_context fails in older kernels when passing
	 * a request length that the kernel doesn't know.
	 * To avoid breaking compatibility of new libmlx5 and older
	 * kernels, when ibv_cmd_get_context fails with the full
	 * request length, we try once again with the legacy length.
	 * We repeat this process while reducing requested size based
	 * on the feature input size. To avoid this in the future, we
	 * will remove the check in kernel that requires fields unknown
	 * to the kernel to be cleared. This will require that any new
	 * feature that involves extending struct mlx5_alloc_ucontext
	 * will be accompanied by an indication in the form of one or
	 * more fields in struct mlx5_alloc_ucontext_resp. If the
	 * response value can be interpreted as feature not supported
	 * when the returned value is zero, this will suffice to
	 * indicate to the library that the request was ignored by the
	 * kernel, either because it is unaware or because it decided
	 * to do so. If zero is a valid response, we will add a new
	 * field that indicates whether the request was handled.
	 */
	/* Second attempt: drop the lib_caps field (pre-4K-UAR kernels). */
	if (!ibv_cmd_get_context(&context->ibv_ctx, &req->ibv_req,
				 offsetof(struct mlx5_alloc_ucontext, lib_caps),
				 &resp->ibv_resp, resp_len))
		return 0;

	/* Last attempt: drop cqe_version as well (oldest layout). */
	return ibv_cmd_get_context(&context->ibv_ctx, &req->ibv_req,
				   offsetof(struct mlx5_alloc_ucontext,
					    cqe_version),
				   &resp->ibv_resp, resp_len);
}
584
585static int mlx5_map_internal_clock(struct mlx5_device *mdev,
586				   struct ibv_context *ibv_ctx)
587{
588	struct mlx5_context *context = to_mctx(ibv_ctx);
589	void *hca_clock_page;
590	off_t offset = 0;
591
592	set_command(MLX5_MMAP_GET_CORE_CLOCK_CMD, &offset);
593	hca_clock_page = mmap(NULL, mdev->page_size,
594			      PROT_READ, MAP_SHARED, ibv_ctx->cmd_fd,
595			      mdev->page_size * offset);
596
597	if (hca_clock_page == MAP_FAILED) {
598		fprintf(stderr, PFX
599			"Warning: Timestamp available,\n"
600			"but failed to mmap() hca core clock page.\n");
601		return -1;
602	}
603
604	context->hca_core_clock = hca_clock_page +
605		(context->core_clock.offset & (mdev->page_size - 1));
606	return 0;
607}
608
609int mlx5dv_query_device(struct ibv_context *ctx_in,
610			 struct mlx5dv_context *attrs_out)
611{
612	struct mlx5_context *mctx = to_mctx(ctx_in);
613	uint64_t comp_mask_out = 0;
614
615	attrs_out->version   = 0;
616	attrs_out->flags     = 0;
617
618	if (mctx->cqe_version == MLX5_CQE_VERSION_V1)
619		attrs_out->flags |= MLX5DV_CONTEXT_FLAGS_CQE_V1;
620
621	if (mctx->vendor_cap_flags & MLX5_VENDOR_CAP_FLAGS_MPW)
622		attrs_out->flags |= MLX5DV_CONTEXT_FLAGS_MPW;
623
624	if (attrs_out->comp_mask & MLX5DV_CONTEXT_MASK_CQE_COMPRESION) {
625		attrs_out->cqe_comp_caps = mctx->cqe_comp_caps;
626		comp_mask_out |= MLX5DV_CONTEXT_MASK_CQE_COMPRESION;
627	}
628
629	attrs_out->comp_mask = comp_mask_out;
630
631	return 0;
632}
633
634static int mlx5dv_get_qp(struct ibv_qp *qp_in,
635			 struct mlx5dv_qp *qp_out)
636{
637	struct mlx5_qp *mqp = to_mqp(qp_in);
638
639	qp_out->comp_mask = 0;
640	qp_out->dbrec     = mqp->db;
641
642	if (mqp->sq_buf_size)
643		/* IBV_QPT_RAW_PACKET */
644		qp_out->sq.buf = (void *)((uintptr_t)mqp->sq_buf.buf);
645	else
646		qp_out->sq.buf = (void *)((uintptr_t)mqp->buf.buf + mqp->sq.offset);
647	qp_out->sq.wqe_cnt = mqp->sq.wqe_cnt;
648	qp_out->sq.stride  = 1 << mqp->sq.wqe_shift;
649
650	qp_out->rq.buf     = (void *)((uintptr_t)mqp->buf.buf + mqp->rq.offset);
651	qp_out->rq.wqe_cnt = mqp->rq.wqe_cnt;
652	qp_out->rq.stride  = 1 << mqp->rq.wqe_shift;
653
654	qp_out->bf.reg    = mqp->bf->reg;
655
656	if (mqp->bf->uuarn > 0)
657		qp_out->bf.size = mqp->bf->buf_size;
658	else
659		qp_out->bf.size = 0;
660
661	return 0;
662}
663
664static int mlx5dv_get_cq(struct ibv_cq *cq_in,
665			 struct mlx5dv_cq *cq_out)
666{
667	struct mlx5_cq *mcq = to_mcq(cq_in);
668	struct mlx5_context *mctx = to_mctx(cq_in->context);
669
670	cq_out->comp_mask = 0;
671	cq_out->cqn       = mcq->cqn;
672	cq_out->cqe_cnt   = mcq->ibv_cq.cqe + 1;
673	cq_out->cqe_size  = mcq->cqe_sz;
674	cq_out->buf       = mcq->active_buf->buf;
675	cq_out->dbrec     = mcq->dbrec;
676	cq_out->uar	  = mctx->uar;
677
678	mcq->flags	 |= MLX5_CQ_FLAGS_DV_OWNED;
679
680	return 0;
681}
682
683static int mlx5dv_get_rwq(struct ibv_wq *wq_in,
684			  struct mlx5dv_rwq *rwq_out)
685{
686	struct mlx5_rwq *mrwq = to_mrwq(wq_in);
687
688	rwq_out->comp_mask = 0;
689	rwq_out->buf       = mrwq->pbuff;
690	rwq_out->dbrec     = mrwq->recv_db;
691	rwq_out->wqe_cnt   = mrwq->rq.wqe_cnt;
692	rwq_out->stride    = 1 << mrwq->rq.wqe_shift;
693
694	return 0;
695}
696
697static int mlx5dv_get_srq(struct ibv_srq *srq_in,
698			  struct mlx5dv_srq *srq_out)
699{
700	struct mlx5_srq *msrq;
701
702	msrq = container_of(srq_in, struct mlx5_srq, vsrq.srq);
703
704	srq_out->comp_mask = 0;
705	srq_out->buf       = msrq->buf.buf;
706	srq_out->dbrec     = msrq->db;
707	srq_out->stride    = 1 << msrq->wqe_shift;
708	srq_out->head      = msrq->head;
709	srq_out->tail      = msrq->tail;
710
711	return 0;
712}
713
714int mlx5dv_init_obj(struct mlx5dv_obj *obj, uint64_t obj_type)
715{
716	int ret = 0;
717
718	if (obj_type & MLX5DV_OBJ_QP)
719		ret = mlx5dv_get_qp(obj->qp.in, obj->qp.out);
720	if (!ret && (obj_type & MLX5DV_OBJ_CQ))
721		ret = mlx5dv_get_cq(obj->cq.in, obj->cq.out);
722	if (!ret && (obj_type & MLX5DV_OBJ_SRQ))
723		ret = mlx5dv_get_srq(obj->srq.in, obj->srq.out);
724	if (!ret && (obj_type & MLX5DV_OBJ_RWQ))
725		ret = mlx5dv_get_rwq(obj->rwq.in, obj->rwq.out);
726
727	return ret;
728}
729
730static void adjust_uar_info(struct mlx5_device *mdev,
731			    struct mlx5_context *context,
732			    struct mlx5_alloc_ucontext_resp resp)
733{
734	if (!resp.log_uar_size && !resp.num_uars_per_page) {
735		/* old kernel */
736		context->uar_size = mdev->page_size;
737		context->num_uars_per_page = 1;
738		return;
739	}
740
741	context->uar_size = 1 << resp.log_uar_size;
742	context->num_uars_per_page = resp.num_uars_per_page;
743}
744
745static int mlx5_init_context(struct verbs_device *vdev,
746			     struct ibv_context *ctx, int cmd_fd)
747{
748	struct mlx5_context	       *context;
749	struct mlx5_alloc_ucontext	req;
750	struct mlx5_alloc_ucontext_resp resp;
751	int				i;
752	int				page_size;
753	int				tot_uuars;
754	int				low_lat_uuars;
755	int				gross_uuars;
756	int				j;
757	off_t				offset;
758	struct mlx5_device	       *mdev;
759	struct verbs_context	       *v_ctx;
760	struct ibv_port_attr		port_attr;
761	struct ibv_device_attr_ex	device_attr;
762	int				k;
763	int				bfi;
764	int				num_sys_page_map;
765
766	mdev = to_mdev(&vdev->device);
767	v_ctx = verbs_get_ctx(ctx);
768	page_size = mdev->page_size;
769	mlx5_single_threaded = single_threaded_app();
770
771	context = to_mctx(ctx);
772	context->ibv_ctx.cmd_fd = cmd_fd;
773
774	open_debug_file(context);
775	set_debug_mask();
776	set_freeze_on_error();
777	if (gethostname(context->hostname, sizeof(context->hostname)))
778		strcpy(context->hostname, "host_unknown");
779
780	tot_uuars = get_total_uuars(page_size);
781	if (tot_uuars < 0) {
782		errno = -tot_uuars;
783		goto err_free;
784	}
785
786	low_lat_uuars = get_num_low_lat_uuars(tot_uuars);
787	if (low_lat_uuars < 0) {
788		errno = -low_lat_uuars;
789		goto err_free;
790	}
791
792	if (low_lat_uuars > tot_uuars - 1) {
793		errno = ENOMEM;
794		goto err_free;
795	}
796
797	memset(&req, 0, sizeof(req));
798	memset(&resp, 0, sizeof(resp));
799
800	req.total_num_uuars = tot_uuars;
801	req.num_low_latency_uuars = low_lat_uuars;
802	req.cqe_version = MLX5_CQE_VERSION_V1;
803	req.lib_caps |= MLX5_LIB_CAP_4K_UAR;
804
805	if (mlx5_cmd_get_context(context, &req, sizeof(req), &resp,
806				 sizeof(resp)))
807		goto err_free;
808
809	context->max_num_qps		= resp.qp_tab_size;
810	context->bf_reg_size		= resp.bf_reg_size;
811	context->tot_uuars		= resp.tot_uuars;
812	context->low_lat_uuars		= low_lat_uuars;
813	context->cache_line_size	= resp.cache_line_size;
814	context->max_sq_desc_sz = resp.max_sq_desc_sz;
815	context->max_rq_desc_sz = resp.max_rq_desc_sz;
816	context->max_send_wqebb	= resp.max_send_wqebb;
817	context->num_ports	= resp.num_ports;
818	context->max_recv_wr	= resp.max_recv_wr;
819	context->max_srq_recv_wr = resp.max_srq_recv_wr;
820
821	context->cqe_version = resp.cqe_version;
822	if (context->cqe_version) {
823		if (context->cqe_version == MLX5_CQE_VERSION_V1)
824			mlx5_ctx_ops.poll_cq = mlx5_poll_cq_v1;
825		else
826			goto err_free;
827	}
828
829	adjust_uar_info(mdev, context, resp);
830
831	gross_uuars = context->tot_uuars / MLX5_NUM_NON_FP_BFREGS_PER_UAR * NUM_BFREGS_PER_UAR;
832	context->bfs = calloc(gross_uuars, sizeof(*context->bfs));
833	if (!context->bfs) {
834		errno = ENOMEM;
835		goto err_free;
836	}
837
838	context->cmds_supp_uhw = resp.cmds_supp_uhw;
839	context->vendor_cap_flags = 0;
840
841	pthread_mutex_init(&context->qp_table_mutex, NULL);
842	pthread_mutex_init(&context->srq_table_mutex, NULL);
843	pthread_mutex_init(&context->uidx_table_mutex, NULL);
844	for (i = 0; i < MLX5_QP_TABLE_SIZE; ++i)
845		context->qp_table[i].refcnt = 0;
846
847	for (i = 0; i < MLX5_QP_TABLE_SIZE; ++i)
848		context->uidx_table[i].refcnt = 0;
849
850	context->db_list = NULL;
851
852	pthread_mutex_init(&context->db_list_mutex, NULL);
853
854	num_sys_page_map = context->tot_uuars / (context->num_uars_per_page * MLX5_NUM_NON_FP_BFREGS_PER_UAR);
855	for (i = 0; i < num_sys_page_map; ++i) {
856		offset = 0;
857		set_command(MLX5_MMAP_GET_REGULAR_PAGES_CMD, &offset);
858		set_index(i, &offset);
859		context->uar[i] = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED,
860				       cmd_fd, page_size * offset);
861		if (context->uar[i] == MAP_FAILED) {
862			context->uar[i] = NULL;
863			goto err_free_bf;
864		}
865	}
866
867	for (i = 0; i < num_sys_page_map; i++) {
868		for (j = 0; j < context->num_uars_per_page; j++) {
869			for (k = 0; k < NUM_BFREGS_PER_UAR; k++) {
870				bfi = (i * context->num_uars_per_page + j) * NUM_BFREGS_PER_UAR + k;
871				context->bfs[bfi].reg = context->uar[i] + MLX5_ADAPTER_PAGE_SIZE * j +
872							MLX5_BF_OFFSET + k * context->bf_reg_size;
873				context->bfs[bfi].need_lock = need_uuar_lock(context, bfi);
874				mlx5_spinlock_init(&context->bfs[bfi].lock);
875				context->bfs[bfi].offset = 0;
876				if (bfi)
877					context->bfs[bfi].buf_size = context->bf_reg_size / 2;
878				context->bfs[bfi].uuarn = bfi;
879			}
880		}
881	}
882	context->hca_core_clock = NULL;
883	if (resp.response_length + sizeof(resp.ibv_resp) >=
884	    offsetof(struct mlx5_alloc_ucontext_resp, hca_core_clock_offset) +
885	    sizeof(resp.hca_core_clock_offset) &&
886	    resp.comp_mask & MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET) {
887		context->core_clock.offset = resp.hca_core_clock_offset;
888		mlx5_map_internal_clock(mdev, ctx);
889	}
890
891	mlx5_spinlock_init(&context->lock32);
892
893	context->prefer_bf = get_always_bf();
894	context->shut_up_bf = get_shut_up_bf();
895	mlx5_read_env(&vdev->device, context);
896
897	mlx5_spinlock_init(&context->hugetlb_lock);
898	TAILQ_INIT(&context->hugetlb_list);
899
900	context->ibv_ctx.ops = mlx5_ctx_ops;
901
902	verbs_set_ctx_op(v_ctx, create_qp_ex, mlx5_create_qp_ex);
903	verbs_set_ctx_op(v_ctx, open_xrcd, mlx5_open_xrcd);
904	verbs_set_ctx_op(v_ctx, close_xrcd, mlx5_close_xrcd);
905	verbs_set_ctx_op(v_ctx, create_srq_ex, mlx5_create_srq_ex);
906	verbs_set_ctx_op(v_ctx, get_srq_num, mlx5_get_srq_num);
907	verbs_set_ctx_op(v_ctx, query_device_ex, mlx5_query_device_ex);
908	verbs_set_ctx_op(v_ctx, query_rt_values, mlx5_query_rt_values);
909	verbs_set_ctx_op(v_ctx, ibv_create_flow, ibv_cmd_create_flow);
910	verbs_set_ctx_op(v_ctx, ibv_destroy_flow, ibv_cmd_destroy_flow);
911	verbs_set_ctx_op(v_ctx, create_cq_ex, mlx5_create_cq_ex);
912	verbs_set_ctx_op(v_ctx, create_wq, mlx5_create_wq);
913	verbs_set_ctx_op(v_ctx, modify_wq, mlx5_modify_wq);
914	verbs_set_ctx_op(v_ctx, destroy_wq, mlx5_destroy_wq);
915	verbs_set_ctx_op(v_ctx, create_rwq_ind_table, mlx5_create_rwq_ind_table);
916	verbs_set_ctx_op(v_ctx, destroy_rwq_ind_table, mlx5_destroy_rwq_ind_table);
917
918	memset(&device_attr, 0, sizeof(device_attr));
919	if (!mlx5_query_device_ex(ctx, NULL, &device_attr,
920				  sizeof(struct ibv_device_attr_ex))) {
921		context->cached_device_cap_flags =
922			device_attr.orig_attr.device_cap_flags;
923		context->atomic_cap = device_attr.orig_attr.atomic_cap;
924		context->cached_tso_caps = device_attr.tso_caps;
925	}
926
927	for (j = 0; j < min(MLX5_MAX_PORTS_NUM, context->num_ports); ++j) {
928		memset(&port_attr, 0, sizeof(port_attr));
929		if (!mlx5_query_port(ctx, j + 1, &port_attr))
930			context->cached_link_layer[j] = port_attr.link_layer;
931	}
932
933	return 0;
934
935err_free_bf:
936	free(context->bfs);
937
938err_free:
939	for (i = 0; i < MLX5_MAX_UARS; ++i) {
940		if (context->uar[i])
941			munmap(context->uar[i], page_size);
942	}
943	close_debug_file(context);
944	return errno;
945}
946
947static void mlx5_cleanup_context(struct verbs_device *device,
948				 struct ibv_context *ibctx)
949{
950	struct mlx5_context *context = to_mctx(ibctx);
951	int page_size = to_mdev(ibctx->device)->page_size;
952	int i;
953
954	free(context->bfs);
955	for (i = 0; i < MLX5_MAX_UARS; ++i) {
956		if (context->uar[i])
957			munmap(context->uar[i], page_size);
958	}
959	if (context->hca_core_clock)
960		munmap(context->hca_core_clock - context->core_clock.offset,
961		       page_size);
962	close_debug_file(context);
963}
964
/* Hooks libibverbs invokes when a device context is created/destroyed. */
static struct verbs_device_ops mlx5_dev_ops = {
	.init_context = mlx5_init_context,
	.uninit_context = mlx5_cleanup_context,
};
969
970static struct verbs_device *mlx5_driver_init(const char *uverbs_sys_path,
971					     int abi_version)
972{
973	char			value[8];
974	struct mlx5_device     *dev;
975	unsigned		vendor, device;
976	int			i;
977
978	if (ibv_read_sysfs_file(uverbs_sys_path, "device/vendor",
979				value, sizeof value) < 0)
980		return NULL;
981	sscanf(value, "%i", &vendor);
982
983	if (ibv_read_sysfs_file(uverbs_sys_path, "device/device",
984				value, sizeof value) < 0)
985		return NULL;
986	sscanf(value, "%i", &device);
987
988	for (i = 0; i < sizeof hca_table / sizeof hca_table[0]; ++i)
989		if (vendor == hca_table[i].vendor &&
990		    device == hca_table[i].device)
991			goto found;
992
993	return NULL;
994
995found:
996	if (abi_version < MLX5_UVERBS_MIN_ABI_VERSION ||
997	    abi_version > MLX5_UVERBS_MAX_ABI_VERSION) {
998		fprintf(stderr, PFX "Fatal: ABI version %d of %s is not supported "
999			"(min supported %d, max supported %d)\n",
1000			abi_version, uverbs_sys_path,
1001			MLX5_UVERBS_MIN_ABI_VERSION,
1002			MLX5_UVERBS_MAX_ABI_VERSION);
1003		return NULL;
1004	}
1005
1006	dev = calloc(1, sizeof *dev);
1007	if (!dev) {
1008		fprintf(stderr, PFX "Fatal: couldn't allocate device for %s\n",
1009			uverbs_sys_path);
1010		return NULL;
1011	}
1012
1013	dev->page_size   = sysconf(_SC_PAGESIZE);
1014	dev->driver_abi_ver = abi_version;
1015
1016	dev->verbs_dev.ops = &mlx5_dev_ops;
1017	dev->verbs_dev.sz = sizeof(*dev);
1018	dev->verbs_dev.size_of_context = sizeof(struct mlx5_context) -
1019		sizeof(struct ibv_context);
1020
1021	return &dev->verbs_dev;
1022}
1023
/* Shared-library constructor: register this provider with libibverbs
 * at load time so mlx5_driver_init() is called during device probing. */
static __attribute__((constructor)) void mlx5_register_driver(void)
{
	verbs_register_driver("mlx5", mlx5_driver_init);
}
1028