mlx5.c revision 347867
/*
 * Copyright (c) 2012 Mellanox Technologies, Inc.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#define _GNU_SOURCE
#include <config.h>

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <sys/mman.h>
#include <pthread.h>
#include <string.h>
#include <sched.h>
#include <sys/param.h>
#include <sys/cpuset.h>

#include "mlx5.h"
#include "mlx5-abi.h"

#ifndef PCI_VENDOR_ID_MELLANOX
#define PCI_VENDOR_ID_MELLANOX			0x15b3
#endif

#ifndef CPU_OR
#define CPU_OR(x, y) do {} while (0)
#endif

#ifndef CPU_EQUAL
#define CPU_EQUAL(x, y) 1
#endif
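
/*
 * Note: when the platform headers provide neither CPU_OR nor CPU_EQUAL,
 * the fallback stubs above turn the affinity check in
 * mlx5_enable_sandy_bridge_fix() into a no-op that always concludes the
 * process is bound to the device-local CPUs, so CQ stalling stays off.
 */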

#define HCA(v, d) \
	{ .vendor = PCI_VENDOR_ID_##v,			\
	  .device = d }

static struct {
	unsigned		vendor;
	unsigned		device;
} hca_table[] = {
	HCA(MELLANOX, 4113),	/* MT4113 Connect-IB */
	HCA(MELLANOX, 4114),	/* Connect-IB Virtual Function */
	HCA(MELLANOX, 4115),	/* ConnectX-4 */
	HCA(MELLANOX, 4116),	/* ConnectX-4 Virtual Function */
	HCA(MELLANOX, 4117),	/* ConnectX-4LX */
	HCA(MELLANOX, 4118),	/* ConnectX-4LX Virtual Function */
	HCA(MELLANOX, 4119),	/* ConnectX-5, PCIe 3.0 */
	HCA(MELLANOX, 4120),	/* ConnectX-5 Virtual Function */
	HCA(MELLANOX, 4121),	/* ConnectX-5 Ex */
	HCA(MELLANOX, 4122),	/* ConnectX-5 Ex VF */
	HCA(MELLANOX, 4123),	/* ConnectX-6 */
	HCA(MELLANOX, 4124),	/* ConnectX-6 VF */
	HCA(MELLANOX, 4125),	/* ConnectX-6 DX */
	HCA(MELLANOX, 4126),	/* ConnectX family mlx5Gen Virtual Function */
	HCA(MELLANOX, 41682),	/* BlueField integrated ConnectX-5 network controller */
	HCA(MELLANOX, 41683),	/* BlueField integrated ConnectX-5 network controller VF */
};

uint32_t mlx5_debug_mask = 0;
int mlx5_freeze_on_error_cqe;

static struct ibv_context_ops mlx5_ctx_ops = {
	.query_device  = mlx5_query_device,
	.query_port    = mlx5_query_port,
	.alloc_pd      = mlx5_alloc_pd,
	.dealloc_pd    = mlx5_free_pd,
	.reg_mr	       = mlx5_reg_mr,
	.rereg_mr      = mlx5_rereg_mr,
	.dereg_mr      = mlx5_dereg_mr,
	.alloc_mw      = mlx5_alloc_mw,
	.dealloc_mw    = mlx5_dealloc_mw,
	.bind_mw       = mlx5_bind_mw,
	.create_cq     = mlx5_create_cq,
	.poll_cq       = mlx5_poll_cq,
	.req_notify_cq = mlx5_arm_cq,
	.cq_event      = mlx5_cq_event,
	.resize_cq     = mlx5_resize_cq,
	.destroy_cq    = mlx5_destroy_cq,
	.create_srq    = mlx5_create_srq,
	.modify_srq    = mlx5_modify_srq,
	.query_srq     = mlx5_query_srq,
	.destroy_srq   = mlx5_destroy_srq,
	.post_srq_recv = mlx5_post_srq_recv,
	.create_qp     = mlx5_create_qp,
	.query_qp      = mlx5_query_qp,
	.modify_qp     = mlx5_modify_qp,
	.destroy_qp    = mlx5_destroy_qp,
	.post_send     = mlx5_post_send,
	.post_recv     = mlx5_post_recv,
	.create_ah     = mlx5_create_ah,
	.destroy_ah    = mlx5_destroy_ah,
	.attach_mcast  = mlx5_attach_mcast,
	.detach_mcast  = mlx5_detach_mcast
};

static int read_number_from_line(const char *line, int *value)
{
	const char *ptr;

	ptr = strchr(line, ':');
	if (!ptr)
		return 1;

	++ptr;

	*value = atoi(ptr);
	return 0;
}

/**
 * Look for the first free user-index in all the user-index tables.
 * If all are in use, return -1; otherwise return a valid user-index.
 * When a table's reference count is zero, the table is not in use and
 * has not been allocated yet; in that case mlx5_store_uidx() allocates
 * the table and increments its reference count.
 */
static int32_t get_free_uidx(struct mlx5_context *ctx)
{
	int32_t tind;
	int32_t i;

	for (tind = 0; tind < MLX5_UIDX_TABLE_SIZE; tind++) {
		if (ctx->uidx_table[tind].refcnt < MLX5_UIDX_TABLE_MASK)
			break;
	}

	if (tind == MLX5_UIDX_TABLE_SIZE)
		return -1;

	if (!ctx->uidx_table[tind].refcnt)
		return tind << MLX5_UIDX_TABLE_SHIFT;

	for (i = 0; i < MLX5_UIDX_TABLE_MASK + 1; i++) {
		if (!ctx->uidx_table[tind].table[i])
			break;
	}

	return (tind << MLX5_UIDX_TABLE_SHIFT) | i;
}

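/*
 * A user-index is a two-level handle: the high bits (uidx >>
 * MLX5_UIDX_TABLE_SHIFT) pick one of MLX5_UIDX_TABLE_SIZE tables and the
 * low bits (uidx & MLX5_UIDX_TABLE_MASK) pick a slot in that table. As an
 * illustration only, with a table shift of 12 a uidx of 0x1003 would map
 * to slot 3 of table 1; see mlx5.h for the actual constant values.
 */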
int32_t mlx5_store_uidx(struct mlx5_context *ctx, void *rsc)
{
	int32_t tind;
	int32_t ret = -1;
	int32_t uidx;

	pthread_mutex_lock(&ctx->uidx_table_mutex);
	uidx = get_free_uidx(ctx);
	if (uidx < 0)
		goto out;

	tind = uidx >> MLX5_UIDX_TABLE_SHIFT;

	if (!ctx->uidx_table[tind].refcnt) {
		ctx->uidx_table[tind].table = calloc(MLX5_UIDX_TABLE_MASK + 1,
						     sizeof(struct mlx5_resource *));
		if (!ctx->uidx_table[tind].table)
			goto out;
	}

	++ctx->uidx_table[tind].refcnt;
	ctx->uidx_table[tind].table[uidx & MLX5_UIDX_TABLE_MASK] = rsc;
	ret = uidx;

out:
	pthread_mutex_unlock(&ctx->uidx_table_mutex);
	return ret;
}

void mlx5_clear_uidx(struct mlx5_context *ctx, uint32_t uidx)
{
	int tind = uidx >> MLX5_UIDX_TABLE_SHIFT;

	pthread_mutex_lock(&ctx->uidx_table_mutex);

	if (!--ctx->uidx_table[tind].refcnt)
		free(ctx->uidx_table[tind].table);
	else
		ctx->uidx_table[tind].table[uidx & MLX5_UIDX_TABLE_MASK] = NULL;

	pthread_mutex_unlock(&ctx->uidx_table_mutex);
}

static int mlx5_is_sandy_bridge(int *num_cores)
{
	char line[128];
	FILE *fd;
	int rc = 0;
	int cur_cpu_family = -1;
	int cur_cpu_model = -1;

	fd = fopen("/proc/cpuinfo", "r");
	if (!fd)
		return 0;

	*num_cores = 0;

	while (fgets(line, 128, fd)) {
		int value;

		/* if this is information on a new processor */
		if (!strncmp(line, "processor", 9)) {
			++*num_cores;

			cur_cpu_family = -1;
			cur_cpu_model  = -1;
		} else if (!strncmp(line, "cpu family", 10)) {
			if ((cur_cpu_family < 0) && (!read_number_from_line(line, &value)))
				cur_cpu_family = value;
		} else if (!strncmp(line, "model", 5)) {
			if ((cur_cpu_model < 0) && (!read_number_from_line(line, &value)))
				cur_cpu_model = value;
		}

		/* if this is a Sandy Bridge CPU */
		if ((cur_cpu_family == 6) &&
		    (cur_cpu_model == 0x2A || cur_cpu_model == 0x2D))
			rc = 1;
	}

	fclose(fd);
	return rc;
}

/*
man cpuset

  This format displays each 32-bit word in hexadecimal (using ASCII characters "0" - "9" and "a" - "f"); words
  are filled with leading zeros, if required. For masks longer than one word, a comma separator is used between
  words. Words are displayed in big-endian order, which has the most significant bit first. The hex digits
  within a word are also in big-endian order.

  The number of 32-bit words displayed is the minimum number needed to display all bits of the bitmask, based on
  the size of the bitmask.

  Examples of the Mask Format:

     00000001                        # just bit 0 set
     40000000,00000000,00000000      # just bit 94 set
     000000ff,00000000               # bits 32-39 set
     00000000,000E3862               # 1,5,6,11-13,17-19 set

  A mask with bits 0, 1, 2, 4, 8, 16, 32, and 64 set displays as:

     00000001,00000001,00010117

  The first "1" is for bit 64, the second for bit 32, the third for bit 16, the fourth for bit 8, the fifth for
  bit 4, and the "7" is for bits 2, 1, and 0.
*/
static void mlx5_local_cpu_set(struct ibv_device *ibdev, cpuset_t *cpu_set)
{
	char *p, buf[1024];
	char *env_value;
	uint32_t word;
	int i, k;

	env_value = getenv("MLX5_LOCAL_CPUS");
	if (env_value) {
		/* make sure the copied environment string is NUL-terminated */
		strncpy(buf, env_value, sizeof(buf) - 1);
		buf[sizeof(buf) - 1] = '\0';
	} else {
		char fname[MAXPATHLEN];

		snprintf(fname, MAXPATHLEN, "/sys/class/infiniband/%s",
			 ibv_get_device_name(ibdev));

		if (ibv_read_sysfs_file(fname, "device/local_cpus", buf, sizeof(buf))) {
			fprintf(stderr, PFX "Warning: cannot get local cpu set: failed to open %s\n", fname);
			return;
		}
	}

	p = strrchr(buf, ',');
	if (!p)
		p = buf;

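	/*
	 * Walk the mask from its least-significant (rightmost) 32-bit
	 * word to its most-significant one; within each word, bit k
	 * marks CPU k + i, with i advancing by 32 per word.
	 */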
	i = 0;
	do {
		if (*p == ',') {
			*p = 0;
			p++;
		}

		word = strtoul(p, NULL, 16);

		for (k = 0; word; ++k, word >>= 1)
			if (word & 1)
				CPU_SET(k + i, cpu_set);

		if (p == buf)
			break;

		p = strrchr(buf, ',');
		if (!p)
			p = buf;

		i += 32;
	} while (i < CPU_SETSIZE);
}

static int mlx5_enable_sandy_bridge_fix(struct ibv_device *ibdev)
{
	cpuset_t my_cpus, dev_local_cpus, result_set;
	int stall_enable;
	int ret;
	int num_cores;

	if (!mlx5_is_sandy_bridge(&num_cores))
		return 0;

	/* by default enable stall on sandy bridge arch */
	stall_enable = 1;

	/*
	 * Check whether the app is bound to a cpu set that is contained
	 * in the device's local cpu set; disable stalling if it is.
	 */

	/* use static cpu set - up to CPU_SETSIZE (1024) cpus/node */
	CPU_ZERO(&my_cpus);
	CPU_ZERO(&dev_local_cpus);
	CPU_ZERO(&result_set);
	ret = cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, -1,
	    sizeof(my_cpus), &my_cpus);
	if (ret == -1) {
		if (errno == EINVAL)
			fprintf(stderr, PFX "Warning: my cpu set is too small\n");
		else
			fprintf(stderr, PFX "Warning: failed to get my cpu set\n");
		goto out;
	}

	/* get device local cpu set */
	mlx5_local_cpu_set(ibdev, &dev_local_cpus);

	/* check if my cpu set is in dev cpu */
	CPU_OR(&result_set, &my_cpus);
	CPU_OR(&result_set, &dev_local_cpus);
	stall_enable = CPU_EQUAL(&result_set, &dev_local_cpus) ? 0 : 1;

out:
	return stall_enable;
}

static void mlx5_read_env(struct ibv_device *ibdev, struct mlx5_context *ctx)
{
	char *env_value;

	env_value = getenv("MLX5_STALL_CQ_POLL");
	if (env_value)
		/* check if cq stall is enforced by user */
		ctx->stall_enable = (strcmp(env_value, "0")) ? 1 : 0;
	else
		/* autodetect if we need to do cq polling */
		ctx->stall_enable = mlx5_enable_sandy_bridge_fix(ibdev);

	env_value = getenv("MLX5_STALL_NUM_LOOP");
	if (env_value)
		mlx5_stall_num_loop = atoi(env_value);

	env_value = getenv("MLX5_STALL_CQ_POLL_MIN");
	if (env_value)
		mlx5_stall_cq_poll_min = atoi(env_value);

	env_value = getenv("MLX5_STALL_CQ_POLL_MAX");
	if (env_value)
		mlx5_stall_cq_poll_max = atoi(env_value);

	env_value = getenv("MLX5_STALL_CQ_INC_STEP");
	if (env_value)
		mlx5_stall_cq_inc_step = atoi(env_value);

	env_value = getenv("MLX5_STALL_CQ_DEC_STEP");
	if (env_value)
		mlx5_stall_cq_dec_step = atoi(env_value);

	ctx->stall_adaptive_enable = 0;
	ctx->stall_cycles = 0;

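	/*
	 * A negative MLX5_STALL_NUM_LOOP selects adaptive stalling: the
	 * stall time starts at mlx5_stall_cq_poll_min cycles and is then
	 * adjusted by the CQ polling code between the min/max bounds
	 * using the inc/dec steps read above.
	 */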
	if (mlx5_stall_num_loop < 0) {
		ctx->stall_adaptive_enable = 1;
		ctx->stall_cycles = mlx5_stall_cq_poll_min;
	}
}

static int get_total_uuars(int page_size)
{
	int size = MLX5_DEF_TOT_UUARS;
	int uuars_in_page;
	char *env;

	env = getenv("MLX5_TOTAL_UUARS");
	if (env)
		size = atoi(env);

	if (size < 1)
		return -EINVAL;

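	/*
	 * Round the request up to at least one system page's worth of
	 * uuars and to a whole number of UARs. As an illustration only:
	 * with 4KB system and adapter pages and two non-fast-path bfregs
	 * per UAR, uuars_in_page is 2, so a request of 3 is rounded to 4.
	 */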
	uuars_in_page = page_size / MLX5_ADAPTER_PAGE_SIZE * MLX5_NUM_NON_FP_BFREGS_PER_UAR;
	size = max(uuars_in_page, size);
	size = align(size, MLX5_NUM_NON_FP_BFREGS_PER_UAR);
	if (size > MLX5_MAX_BFREGS)
		return -ENOMEM;

	return size;
}

static void open_debug_file(struct mlx5_context *ctx)
{
	char *env;

	env = getenv("MLX5_DEBUG_FILE");
	if (!env) {
		ctx->dbg_fp = stderr;
		return;
	}

	ctx->dbg_fp = fopen(env, "a+");
	if (!ctx->dbg_fp) {
		fprintf(stderr, "Failed opening debug file %s, using stderr\n", env);
		ctx->dbg_fp = stderr;
		return;
	}
}

static void close_debug_file(struct mlx5_context *ctx)
{
	if (ctx->dbg_fp && ctx->dbg_fp != stderr)
		fclose(ctx->dbg_fp);
}

static void set_debug_mask(void)
{
	char *env;

	env = getenv("MLX5_DEBUG_MASK");
	if (env)
		mlx5_debug_mask = strtol(env, NULL, 0);
}

static void set_freeze_on_error(void)
{
	char *env;

	env = getenv("MLX5_FREEZE_ON_ERROR_CQE");
	if (env)
		mlx5_freeze_on_error_cqe = strtol(env, NULL, 0);
}

static int get_always_bf(void)
{
	char *env;

	env = getenv("MLX5_POST_SEND_PREFER_BF");
	if (!env)
		return 1;

	return strcmp(env, "0") ? 1 : 0;
}

static int get_shut_up_bf(void)
{
	char *env;

	env = getenv("MLX5_SHUT_UP_BF");
	if (!env)
		return 0;

	return strcmp(env, "0") ? 1 : 0;
}

static int get_num_low_lat_uuars(int tot_uuars)
{
	char *env;
	int num = 4;

	env = getenv("MLX5_NUM_LOW_LAT_UUARS");
	if (env)
		num = atoi(env);

	if (num < 0)
		return -EINVAL;

	num = max(num, tot_uuars - MLX5_MED_BFREGS_TSHOLD);
	return num;
}

517
518/* The library allocates an array of uuar contexts. The one in index zero does
519 * not to execersize odd/even policy so it can avoid a lock but it may not use
520 * blue flame. The upper ones, low_lat_uuars can use blue flame with no lock
521 * since they are assigned to one QP only. The rest can use blue flame but since
522 * they are shared they need a lock
523 */
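/*
 * Note that uuarn below indexes the bfs[] array, which holds two
 * non-fast-path bfregs per uuar; the factor of two scales the uuar
 * counts into that index space when locating the start of the
 * dedicated low-latency region.
 */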
static int need_uuar_lock(struct mlx5_context *ctx, int uuarn)
{
	if (uuarn == 0 || mlx5_single_threaded)
		return 0;

	if (uuarn >= (ctx->tot_uuars - ctx->low_lat_uuars) * 2)
		return 0;

	return 1;
}

static int single_threaded_app(void)
{
	char *env;

	env = getenv("MLX5_SINGLE_THREADED");
	if (env)
		return strcmp(env, "1") ? 0 : 1;

	return 0;
}

static int mlx5_cmd_get_context(struct mlx5_context *context,
				struct mlx5_alloc_ucontext *req,
				size_t req_len,
				struct mlx5_alloc_ucontext_resp *resp,
				size_t resp_len)
{
	if (!ibv_cmd_get_context(&context->ibv_ctx, &req->ibv_req,
				 req_len, &resp->ibv_resp, resp_len))
		return 0;

	/* ibv_cmd_get_context fails on older kernels when passed a
	 * request length the kernel does not recognize.
	 * To avoid breaking compatibility between new libmlx5 and older
	 * kernels, when ibv_cmd_get_context fails with the full request
	 * length, we try again with a legacy length, repeating the
	 * process with successively smaller request sizes based on where
	 * each feature field was added. To avoid this in the future, the
	 * kernel check that requires unknown fields to be cleared will
	 * be removed. That will require any new feature extending
	 * struct mlx5_alloc_ucontext to be accompanied by an indication
	 * in the form of one or more fields in struct
	 * mlx5_alloc_ucontext_resp. If a returned value of zero can be
	 * interpreted as "feature not supported", that suffices to tell
	 * the library that the request was ignored by the kernel, either
	 * because it is unaware of it or because it decided to ignore
	 * it. If zero is a valid response, a new field will be added to
	 * indicate whether the request was handled.
	 */
	if (!ibv_cmd_get_context(&context->ibv_ctx, &req->ibv_req,
				 offsetof(struct mlx5_alloc_ucontext, lib_caps),
				 &resp->ibv_resp, resp_len))
		return 0;

	return ibv_cmd_get_context(&context->ibv_ctx, &req->ibv_req,
				   offsetof(struct mlx5_alloc_ucontext,
					    cqe_version),
				   &resp->ibv_resp, resp_len);
}

static int mlx5_map_internal_clock(struct mlx5_device *mdev,
				   struct ibv_context *ibv_ctx)
{
	struct mlx5_context *context = to_mctx(ibv_ctx);
	void *hca_clock_page;
	off_t offset = 0;

	set_command(MLX5_MMAP_GET_CORE_CLOCK_CMD, &offset);
	hca_clock_page = mmap(NULL, mdev->page_size,
			      PROT_READ, MAP_SHARED, ibv_ctx->cmd_fd,
			      mdev->page_size * offset);

	if (hca_clock_page == MAP_FAILED) {
		fprintf(stderr, PFX
			"Warning: Timestamp available,\n"
			"but failed to mmap() hca core clock page.\n");
		return -1;
	}

	context->hca_core_clock = hca_clock_page +
		(context->core_clock.offset & (mdev->page_size - 1));
	return 0;
}

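/*
 * Example usage (a minimal sketch; "ctx" is assumed to be an open
 * ibv_context on an mlx5 device, and error handling is omitted):
 *
 *	struct mlx5dv_context dv = {};
 *
 *	dv.comp_mask = MLX5DV_CONTEXT_MASK_CQE_COMPRESION;
 *	if (!mlx5dv_query_device(ctx, &dv) &&
 *	    (dv.comp_mask & MLX5DV_CONTEXT_MASK_CQE_COMPRESION))
 *		printf("CQE compression: max_num %u\n",
 *		       dv.cqe_comp_caps.max_num);
 */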
int mlx5dv_query_device(struct ibv_context *ctx_in,
			struct mlx5dv_context *attrs_out)
{
	struct mlx5_context *mctx = to_mctx(ctx_in);
	uint64_t comp_mask_out = 0;

	attrs_out->version   = 0;
	attrs_out->flags     = 0;

	if (mctx->cqe_version == MLX5_CQE_VERSION_V1)
		attrs_out->flags |= MLX5DV_CONTEXT_FLAGS_CQE_V1;

	if (mctx->vendor_cap_flags & MLX5_VENDOR_CAP_FLAGS_MPW)
		attrs_out->flags |= MLX5DV_CONTEXT_FLAGS_MPW;

	if (attrs_out->comp_mask & MLX5DV_CONTEXT_MASK_CQE_COMPRESION) {
		attrs_out->cqe_comp_caps = mctx->cqe_comp_caps;
		comp_mask_out |= MLX5DV_CONTEXT_MASK_CQE_COMPRESION;
	}

	attrs_out->comp_mask = comp_mask_out;

	return 0;
}

static int mlx5dv_get_qp(struct ibv_qp *qp_in,
			 struct mlx5dv_qp *qp_out)
{
	struct mlx5_qp *mqp = to_mqp(qp_in);

	qp_out->comp_mask = 0;
	qp_out->dbrec     = mqp->db;

	if (mqp->sq_buf_size)
		/* IBV_QPT_RAW_PACKET */
		qp_out->sq.buf = (void *)((uintptr_t)mqp->sq_buf.buf);
	else
		qp_out->sq.buf = (void *)((uintptr_t)mqp->buf.buf + mqp->sq.offset);
	qp_out->sq.wqe_cnt = mqp->sq.wqe_cnt;
	qp_out->sq.stride  = 1 << mqp->sq.wqe_shift;

	qp_out->rq.buf     = (void *)((uintptr_t)mqp->buf.buf + mqp->rq.offset);
	qp_out->rq.wqe_cnt = mqp->rq.wqe_cnt;
	qp_out->rq.stride  = 1 << mqp->rq.wqe_shift;

	qp_out->bf.reg    = mqp->bf->reg;

	if (mqp->bf->uuarn > 0)
		qp_out->bf.size = mqp->bf->buf_size;
	else
		qp_out->bf.size = 0;

	return 0;
}

static int mlx5dv_get_cq(struct ibv_cq *cq_in,
			 struct mlx5dv_cq *cq_out)
{
	struct mlx5_cq *mcq = to_mcq(cq_in);
	struct mlx5_context *mctx = to_mctx(cq_in->context);

	cq_out->comp_mask = 0;
	cq_out->cqn       = mcq->cqn;
	cq_out->cqe_cnt   = mcq->ibv_cq.cqe + 1;
	cq_out->cqe_size  = mcq->cqe_sz;
	cq_out->buf       = mcq->active_buf->buf;
	cq_out->dbrec     = mcq->dbrec;
	cq_out->uar	  = mctx->uar;

	mcq->flags	 |= MLX5_CQ_FLAGS_DV_OWNED;

	return 0;
}

static int mlx5dv_get_rwq(struct ibv_wq *wq_in,
			  struct mlx5dv_rwq *rwq_out)
{
	struct mlx5_rwq *mrwq = to_mrwq(wq_in);

	rwq_out->comp_mask = 0;
	rwq_out->buf       = mrwq->pbuff;
	rwq_out->dbrec     = mrwq->recv_db;
	rwq_out->wqe_cnt   = mrwq->rq.wqe_cnt;
	rwq_out->stride    = 1 << mrwq->rq.wqe_shift;

	return 0;
}

static int mlx5dv_get_srq(struct ibv_srq *srq_in,
			  struct mlx5dv_srq *srq_out)
{
	struct mlx5_srq *msrq;

	msrq = container_of(srq_in, struct mlx5_srq, vsrq.srq);

	srq_out->comp_mask = 0;
	srq_out->buf       = msrq->buf.buf;
	srq_out->dbrec     = msrq->db;
	srq_out->stride    = 1 << msrq->wqe_shift;
	srq_out->head      = msrq->head;
	srq_out->tail      = msrq->tail;

	return 0;
}

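/*
 * Example usage (a minimal sketch; "qp" is assumed to be a QP created on
 * an mlx5 device, and error handling is omitted):
 *
 *	struct mlx5dv_qp dv_qp;
 *	struct mlx5dv_cq dv_cq;
 *	struct mlx5dv_obj obj;
 *
 *	obj.qp.in  = qp;
 *	obj.qp.out = &dv_qp;
 *	obj.cq.in  = qp->send_cq;
 *	obj.cq.out = &dv_cq;
 *	if (!mlx5dv_init_obj(&obj, MLX5DV_OBJ_QP | MLX5DV_OBJ_CQ)) {
 *		// dv_qp.sq.buf, dv_qp.dbrec, dv_cq.buf, etc. now point
 *		// at the raw work and completion queues
 *	}
 */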
int mlx5dv_init_obj(struct mlx5dv_obj *obj, uint64_t obj_type)
{
	int ret = 0;

	if (obj_type & MLX5DV_OBJ_QP)
		ret = mlx5dv_get_qp(obj->qp.in, obj->qp.out);
	if (!ret && (obj_type & MLX5DV_OBJ_CQ))
		ret = mlx5dv_get_cq(obj->cq.in, obj->cq.out);
	if (!ret && (obj_type & MLX5DV_OBJ_SRQ))
		ret = mlx5dv_get_srq(obj->srq.in, obj->srq.out);
	if (!ret && (obj_type & MLX5DV_OBJ_RWQ))
		ret = mlx5dv_get_rwq(obj->rwq.in, obj->rwq.out);

	return ret;
}

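/*
 * Kernels that support 4K UARs report log_uar_size and num_uars_per_page
 * in the response; older kernels leave both at zero, in which case a
 * single UAR covers one full system page.
 */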
static void adjust_uar_info(struct mlx5_device *mdev,
			    struct mlx5_context *context,
			    struct mlx5_alloc_ucontext_resp resp)
{
	if (!resp.log_uar_size && !resp.num_uars_per_page) {
		/* old kernel */
		context->uar_size = mdev->page_size;
		context->num_uars_per_page = 1;
		return;
	}

	context->uar_size = 1 << resp.log_uar_size;
	context->num_uars_per_page = resp.num_uars_per_page;
}

static int mlx5_init_context(struct verbs_device *vdev,
			     struct ibv_context *ctx, int cmd_fd)
{
	struct mlx5_context	       *context;
	struct mlx5_alloc_ucontext	req;
	struct mlx5_alloc_ucontext_resp resp;
	int				i;
	int				page_size;
	int				tot_uuars;
	int				low_lat_uuars;
	int				gross_uuars;
	int				j;
	off_t				offset;
	struct mlx5_device	       *mdev;
	struct verbs_context	       *v_ctx;
	struct ibv_port_attr		port_attr;
	struct ibv_device_attr_ex	device_attr;
	int				k;
	int				bfi;
	int				num_sys_page_map;

	mdev = to_mdev(&vdev->device);
	v_ctx = verbs_get_ctx(ctx);
	page_size = mdev->page_size;
	mlx5_single_threaded = single_threaded_app();

	context = to_mctx(ctx);
	context->ibv_ctx.cmd_fd = cmd_fd;

	open_debug_file(context);
	set_debug_mask();
	set_freeze_on_error();
	if (gethostname(context->hostname, sizeof(context->hostname)))
		strcpy(context->hostname, "host_unknown");

	tot_uuars = get_total_uuars(page_size);
	if (tot_uuars < 0) {
		errno = -tot_uuars;
		goto err_free;
	}

	low_lat_uuars = get_num_low_lat_uuars(tot_uuars);
	if (low_lat_uuars < 0) {
		errno = -low_lat_uuars;
		goto err_free;
	}

	if (low_lat_uuars > tot_uuars - 1) {
		errno = ENOMEM;
		goto err_free;
	}

	memset(&req, 0, sizeof(req));
	memset(&resp, 0, sizeof(resp));

	req.total_num_uuars = tot_uuars;
	req.num_low_latency_uuars = low_lat_uuars;
	req.cqe_version = MLX5_CQE_VERSION_V1;
	req.lib_caps |= MLX5_LIB_CAP_4K_UAR;

	if (mlx5_cmd_get_context(context, &req, sizeof(req), &resp,
				 sizeof(resp)))
		goto err_free;

	context->max_num_qps		= resp.qp_tab_size;
	context->bf_reg_size		= resp.bf_reg_size;
	context->tot_uuars		= resp.tot_uuars;
	context->low_lat_uuars		= low_lat_uuars;
	context->cache_line_size	= resp.cache_line_size;
	context->max_sq_desc_sz = resp.max_sq_desc_sz;
	context->max_rq_desc_sz = resp.max_rq_desc_sz;
	context->max_send_wqebb	= resp.max_send_wqebb;
	context->num_ports	= resp.num_ports;
	context->max_recv_wr	= resp.max_recv_wr;
	context->max_srq_recv_wr = resp.max_srq_recv_wr;

	context->cqe_version = resp.cqe_version;
	if (context->cqe_version) {
		if (context->cqe_version == MLX5_CQE_VERSION_V1)
			mlx5_ctx_ops.poll_cq = mlx5_poll_cq_v1;
		else
			goto err_free;
	}

	adjust_uar_info(mdev, context, resp);

	gross_uuars = context->tot_uuars / MLX5_NUM_NON_FP_BFREGS_PER_UAR * NUM_BFREGS_PER_UAR;
	context->bfs = calloc(gross_uuars, sizeof(*context->bfs));
	if (!context->bfs) {
		errno = ENOMEM;
		goto err_free;
	}

	context->cmds_supp_uhw = resp.cmds_supp_uhw;
	context->vendor_cap_flags = 0;

	pthread_mutex_init(&context->qp_table_mutex, NULL);
	pthread_mutex_init(&context->srq_table_mutex, NULL);
	pthread_mutex_init(&context->uidx_table_mutex, NULL);
	for (i = 0; i < MLX5_QP_TABLE_SIZE; ++i)
		context->qp_table[i].refcnt = 0;

	for (i = 0; i < MLX5_UIDX_TABLE_SIZE; ++i)
		context->uidx_table[i].refcnt = 0;

	context->db_list = NULL;

	pthread_mutex_init(&context->db_list_mutex, NULL);

	num_sys_page_map = context->tot_uuars / (context->num_uars_per_page * MLX5_NUM_NON_FP_BFREGS_PER_UAR);
	for (i = 0; i < num_sys_page_map; ++i) {
		offset = 0;
		set_command(MLX5_MMAP_GET_REGULAR_PAGES_CMD, &offset);
		set_index(i, &offset);
		context->uar[i] = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED,
				       cmd_fd, page_size * offset);
		if (context->uar[i] == MAP_FAILED) {
			context->uar[i] = NULL;
			goto err_free_bf;
		}
	}

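	/*
	 * Carve each mapped system page into UARs and each UAR into
	 * NUM_BFREGS_PER_UAR blue-flame registers: bfs[bfi].reg points
	 * at bfreg k of UAR j within mapped page i.
	 */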
	for (i = 0; i < num_sys_page_map; i++) {
		for (j = 0; j < context->num_uars_per_page; j++) {
			for (k = 0; k < NUM_BFREGS_PER_UAR; k++) {
				bfi = (i * context->num_uars_per_page + j) * NUM_BFREGS_PER_UAR + k;
				context->bfs[bfi].reg = context->uar[i] + MLX5_ADAPTER_PAGE_SIZE * j +
							MLX5_BF_OFFSET + k * context->bf_reg_size;
				context->bfs[bfi].need_lock = need_uuar_lock(context, bfi);
				mlx5_spinlock_init(&context->bfs[bfi].lock);
				context->bfs[bfi].offset = 0;
				if (bfi)
					context->bfs[bfi].buf_size = context->bf_reg_size / 2;
				context->bfs[bfi].uuarn = bfi;
			}
		}
	}
	context->hca_core_clock = NULL;
	if (resp.response_length + sizeof(resp.ibv_resp) >=
	    offsetof(struct mlx5_alloc_ucontext_resp, hca_core_clock_offset) +
	    sizeof(resp.hca_core_clock_offset) &&
	    resp.comp_mask & MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET) {
		context->core_clock.offset = resp.hca_core_clock_offset;
		mlx5_map_internal_clock(mdev, ctx);
	}

	mlx5_spinlock_init(&context->lock32);

	context->prefer_bf = get_always_bf();
	context->shut_up_bf = get_shut_up_bf();
	mlx5_read_env(&vdev->device, context);

	mlx5_spinlock_init(&context->hugetlb_lock);
	TAILQ_INIT(&context->hugetlb_list);

	context->ibv_ctx.ops = mlx5_ctx_ops;

	verbs_set_ctx_op(v_ctx, create_qp_ex, mlx5_create_qp_ex);
	verbs_set_ctx_op(v_ctx, open_xrcd, mlx5_open_xrcd);
	verbs_set_ctx_op(v_ctx, close_xrcd, mlx5_close_xrcd);
	verbs_set_ctx_op(v_ctx, create_srq_ex, mlx5_create_srq_ex);
	verbs_set_ctx_op(v_ctx, get_srq_num, mlx5_get_srq_num);
	verbs_set_ctx_op(v_ctx, query_device_ex, mlx5_query_device_ex);
	verbs_set_ctx_op(v_ctx, query_rt_values, mlx5_query_rt_values);
	verbs_set_ctx_op(v_ctx, ibv_create_flow, ibv_cmd_create_flow);
	verbs_set_ctx_op(v_ctx, ibv_destroy_flow, ibv_cmd_destroy_flow);
	verbs_set_ctx_op(v_ctx, create_cq_ex, mlx5_create_cq_ex);
	verbs_set_ctx_op(v_ctx, create_wq, mlx5_create_wq);
	verbs_set_ctx_op(v_ctx, modify_wq, mlx5_modify_wq);
	verbs_set_ctx_op(v_ctx, destroy_wq, mlx5_destroy_wq);
	verbs_set_ctx_op(v_ctx, create_rwq_ind_table, mlx5_create_rwq_ind_table);
	verbs_set_ctx_op(v_ctx, destroy_rwq_ind_table, mlx5_destroy_rwq_ind_table);

	memset(&device_attr, 0, sizeof(device_attr));
	if (!mlx5_query_device_ex(ctx, NULL, &device_attr,
				  sizeof(struct ibv_device_attr_ex))) {
		context->cached_device_cap_flags =
			device_attr.orig_attr.device_cap_flags;
		context->atomic_cap = device_attr.orig_attr.atomic_cap;
		context->cached_tso_caps = device_attr.tso_caps;
	}

	for (j = 0; j < min(MLX5_MAX_PORTS_NUM, context->num_ports); ++j) {
		memset(&port_attr, 0, sizeof(port_attr));
		if (!mlx5_query_port(ctx, j + 1, &port_attr))
			context->cached_link_layer[j] = port_attr.link_layer;
	}

	return 0;

err_free_bf:
	free(context->bfs);

err_free:
	for (i = 0; i < MLX5_MAX_UARS; ++i) {
		if (context->uar[i])
			munmap(context->uar[i], page_size);
	}
	close_debug_file(context);
	return errno;
}

static void mlx5_cleanup_context(struct verbs_device *device,
				 struct ibv_context *ibctx)
{
	struct mlx5_context *context = to_mctx(ibctx);
	int page_size = to_mdev(ibctx->device)->page_size;
	int i;

	free(context->bfs);
	for (i = 0; i < MLX5_MAX_UARS; ++i) {
		if (context->uar[i])
			munmap(context->uar[i], page_size);
	}
	if (context->hca_core_clock)
		munmap(context->hca_core_clock - context->core_clock.offset,
		       page_size);
	close_debug_file(context);
}

static struct verbs_device_ops mlx5_dev_ops = {
	.init_context = mlx5_init_context,
	.uninit_context = mlx5_cleanup_context,
};

static struct verbs_device *mlx5_driver_init(const char *uverbs_sys_path,
					     int abi_version)
{
	char			value[8];
	struct mlx5_device     *dev;
	unsigned		vendor, device;
	int			i;

	if (ibv_read_sysfs_file(uverbs_sys_path, "device/vendor",
				value, sizeof value) < 0)
		return NULL;
	sscanf(value, "%i", &vendor);

	if (ibv_read_sysfs_file(uverbs_sys_path, "device/device",
				value, sizeof value) < 0)
		return NULL;
	sscanf(value, "%i", &device);

	for (i = 0; i < sizeof hca_table / sizeof hca_table[0]; ++i)
		if (vendor == hca_table[i].vendor &&
		    device == hca_table[i].device)
			goto found;

	return NULL;

found:
	if (abi_version < MLX5_UVERBS_MIN_ABI_VERSION ||
	    abi_version > MLX5_UVERBS_MAX_ABI_VERSION) {
		fprintf(stderr, PFX "Fatal: ABI version %d of %s is not supported "
			"(min supported %d, max supported %d)\n",
			abi_version, uverbs_sys_path,
			MLX5_UVERBS_MIN_ABI_VERSION,
			MLX5_UVERBS_MAX_ABI_VERSION);
		return NULL;
	}

	dev = calloc(1, sizeof *dev);
	if (!dev) {
		fprintf(stderr, PFX "Fatal: couldn't allocate device for %s\n",
			uverbs_sys_path);
		return NULL;
	}

	dev->page_size   = sysconf(_SC_PAGESIZE);
	dev->driver_abi_ver = abi_version;

	dev->verbs_dev.ops = &mlx5_dev_ops;
	dev->verbs_dev.sz = sizeof(*dev);
	dev->verbs_dev.size_of_context = sizeof(struct mlx5_context) -
		sizeof(struct ibv_context);

	return &dev->verbs_dev;
}

static __attribute__((constructor)) void mlx5_register_driver(void)
{
	verbs_register_driver("mlx5", mlx5_driver_init);
}