mlx5_ib_main.c revision 323223
1/*-
2 * Copyright (c) 2013-2015, Mellanox Technologies, Ltd.  All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 *    notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 *    notice, this list of conditions and the following disclaimer in the
11 *    documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16 * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23 * SUCH DAMAGE.
24 *
25 * $FreeBSD: stable/10/sys/dev/mlx5/mlx5_ib/mlx5_ib_main.c 323223 2017-09-06 15:33:23Z hselasky $
26 */
27
28#include <linux/errno.h>
29#include <linux/pci.h>
30#include <linux/dma-mapping.h>
31#include <linux/slab.h>
32#include <linux/io-mapping.h>
33#include <linux/sched.h>
34#include <linux/netdevice.h>
35#include <linux/etherdevice.h>
36#include <net/ipv6.h>
37#include <linux/list.h>
38#include <dev/mlx5/driver.h>
39#include <dev/mlx5/vport.h>
40#include <asm/pgtable.h>
41#include <linux/fs.h>
42#undef inode
43
44#include <rdma/ib_user_verbs.h>
45#include <rdma/ib_smi.h>
46#include <rdma/ib_umem.h>
47#include "user.h"
48#include "mlx5_ib.h"
49
50#include <sys/unistd.h>
51
52#define DRIVER_NAME "mlx5_ib"
53#define DRIVER_VERSION "3.2-rc1"
54#define DRIVER_RELDATE	"May 2016"
55
56#undef MODULE_VERSION
57#include <sys/module.h>
58
59MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>");
60MODULE_DESCRIPTION("Mellanox Connect-IB HCA IB driver");
61MODULE_LICENSE("Dual BSD/GPL");
62MODULE_DEPEND(mlx5ib, mlx5, 1, 1, 1);
63MODULE_DEPEND(mlx5ib, ibcore, 1, 1, 1);
64MODULE_VERSION(mlx5ib, 1);
65
66static int deprecated_prof_sel = 2;
67module_param_named(prof_sel, deprecated_prof_sel, int, 0444);
68MODULE_PARM_DESC(prof_sel, "profile selector. Deprecated here. Moved to module mlx5_core");
69
70enum {
71	MLX5_STANDARD_ATOMIC_SIZE = 0x8,
72};
73
74struct workqueue_struct *mlx5_ib_wq;
75
76static char mlx5_version[] =
77	DRIVER_NAME ": Mellanox Connect-IB InfiniBand driver v"
78	DRIVER_VERSION " (" DRIVER_RELDATE ")\n";
79
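/*
 * Derive the IB atomic capabilities from the HCA atomic capabilities.
 * IB_ATOMIC_HCA is only reported when 8-byte compare-swap and fetch-add
 * are supported and the requestor endianness requirement is met (the
 * 8B endianness mode capability is set, or the host is big-endian).
 */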
80static void get_atomic_caps(struct mlx5_ib_dev *dev,
81			    struct ib_device_attr *props)
82{
83	int tmp;
84	u8 atomic_operations;
85	u8 atomic_size_qp;
86	u8 atomic_req_endianess;
87
88	atomic_operations = MLX5_CAP_ATOMIC(dev->mdev, atomic_operations);
89	atomic_size_qp = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_qp);
90	atomic_req_endianess = MLX5_CAP_ATOMIC(dev->mdev,
91					       atomic_req_8B_endianess_mode) ||
92			       !mlx5_host_is_le();
93
94	tmp = MLX5_ATOMIC_OPS_CMP_SWAP | MLX5_ATOMIC_OPS_FETCH_ADD;
95	if (((atomic_operations & tmp) == tmp)
96	    && (atomic_size_qp & 8)) {
97		if (atomic_req_endianess) {
98			props->atomic_cap = IB_ATOMIC_HCA;
99		} else {
100			props->atomic_cap = IB_ATOMIC_NONE;
101		}
102	} else {
103		props->atomic_cap = IB_ATOMIC_NONE;
104	}
105
106	tmp = MLX5_ATOMIC_OPS_MASKED_CMP_SWAP | MLX5_ATOMIC_OPS_MASKED_FETCH_ADD;
107	if (((atomic_operations & tmp) == tmp)
108	    && (atomic_size_qp & 8)) {
109		if (atomic_req_endianess) {
110			props->masked_atomic_cap = IB_ATOMIC_HCA;
111		} else {
112			props->masked_atomic_cap = IB_ATOMIC_NONE;
113		}
114	} else {
115		props->masked_atomic_cap = IB_ATOMIC_NONE;
116	}
117}
118
119static enum rdma_link_layer
120mlx5_ib_port_link_layer(struct ib_device *device, u8 port_num)
121{
122	struct mlx5_ib_dev *dev = to_mdev(device);
123
124	switch (MLX5_CAP_GEN(dev->mdev, port_type)) {
125	case MLX5_CAP_PORT_TYPE_IB:
126		return IB_LINK_LAYER_INFINIBAND;
127	case MLX5_CAP_PORT_TYPE_ETH:
128		return IB_LINK_LAYER_ETHERNET;
129	default:
130		return IB_LINK_LAYER_UNSPECIFIED;
131	}
132}
133
134static int mlx5_use_mad_ifc(struct mlx5_ib_dev *dev)
135{
136	return !dev->mdev->issi;
137}
138
139enum {
140	MLX5_VPORT_ACCESS_METHOD_MAD,
141	MLX5_VPORT_ACCESS_METHOD_HCA,
142	MLX5_VPORT_ACCESS_METHOD_NIC,
143};
144
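/*
 * Select how port and device attributes are queried: through MAD
 * commands when the device runs at ISSI 0, through the NIC vport
 * commands when the link layer is Ethernet (RoCE), or through the
 * HCA vport commands otherwise (InfiniBand).
 */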
145static int mlx5_get_vport_access_method(struct ib_device *ibdev)
146{
147	if (mlx5_use_mad_ifc(to_mdev(ibdev)))
148		return MLX5_VPORT_ACCESS_METHOD_MAD;
149
150	if (mlx5_ib_port_link_layer(ibdev, 1) ==
151	    IB_LINK_LAYER_ETHERNET)
152		return MLX5_VPORT_ACCESS_METHOD_NIC;
153
154	return MLX5_VPORT_ACCESS_METHOD_HCA;
155}
156
157static int mlx5_query_system_image_guid(struct ib_device *ibdev,
158					__be64 *sys_image_guid)
159{
160	struct mlx5_ib_dev *dev = to_mdev(ibdev);
161	struct mlx5_core_dev *mdev = dev->mdev;
162	u64 tmp;
163	int err;
164
165	switch (mlx5_get_vport_access_method(ibdev)) {
166	case MLX5_VPORT_ACCESS_METHOD_MAD:
167		return mlx5_query_system_image_guid_mad_ifc(ibdev,
168							    sys_image_guid);
169
170	case MLX5_VPORT_ACCESS_METHOD_HCA:
171		err = mlx5_query_hca_vport_system_image_guid(mdev, &tmp);
172		if (!err)
173			*sys_image_guid = cpu_to_be64(tmp);
174		return err;
175
176	case MLX5_VPORT_ACCESS_METHOD_NIC:
177		err = mlx5_query_nic_vport_system_image_guid(mdev, &tmp);
178		if (!err)
179			*sys_image_guid = cpu_to_be64(tmp);
180		return err;
181
182	default:
183		return -EINVAL;
184	}
185}
186
187static int mlx5_query_max_pkeys(struct ib_device *ibdev,
188				u16 *max_pkeys)
189{
190	struct mlx5_ib_dev *dev = to_mdev(ibdev);
191	struct mlx5_core_dev *mdev = dev->mdev;
192
193	switch (mlx5_get_vport_access_method(ibdev)) {
194	case MLX5_VPORT_ACCESS_METHOD_MAD:
195		return mlx5_query_max_pkeys_mad_ifc(ibdev, max_pkeys);
196
197	case MLX5_VPORT_ACCESS_METHOD_HCA:
198	case MLX5_VPORT_ACCESS_METHOD_NIC:
199		*max_pkeys = mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(mdev,
200						pkey_table_size));
201		return 0;
202
203	default:
204		return -EINVAL;
205	}
206}
207
208static int mlx5_query_vendor_id(struct ib_device *ibdev,
209				u32 *vendor_id)
210{
211	struct mlx5_ib_dev *dev = to_mdev(ibdev);
212
213	switch (mlx5_get_vport_access_method(ibdev)) {
214	case MLX5_VPORT_ACCESS_METHOD_MAD:
215		return mlx5_query_vendor_id_mad_ifc(ibdev, vendor_id);
216
217	case MLX5_VPORT_ACCESS_METHOD_HCA:
218	case MLX5_VPORT_ACCESS_METHOD_NIC:
219		return mlx5_core_query_vendor_id(dev->mdev, vendor_id);
220
221	default:
222		return -EINVAL;
223	}
224}
225
226static int mlx5_query_node_guid(struct mlx5_ib_dev *dev,
227				__be64 *node_guid)
228{
229	u64 tmp;
230	int err;
231
232	switch (mlx5_get_vport_access_method(&dev->ib_dev)) {
233	case MLX5_VPORT_ACCESS_METHOD_MAD:
234		return mlx5_query_node_guid_mad_ifc(dev, node_guid);
235
236	case MLX5_VPORT_ACCESS_METHOD_HCA:
237		err = mlx5_query_hca_vport_node_guid(dev->mdev, &tmp);
238		if (!err)
239			*node_guid = cpu_to_be64(tmp);
240		return err;
241
242	case MLX5_VPORT_ACCESS_METHOD_NIC:
243		err = mlx5_query_nic_vport_node_guid(dev->mdev, &tmp);
244		if (!err)
245			*node_guid = cpu_to_be64(tmp);
246		return err;
247
248	default:
249		return -EINVAL;
250	}
251}
252
253struct mlx5_reg_node_desc {
254	u8	desc[64];
255};
256
257static int mlx5_query_node_desc(struct mlx5_ib_dev *dev, char *node_desc)
258{
259	struct mlx5_reg_node_desc in;
260
261	if (mlx5_use_mad_ifc(dev))
262		return mlx5_query_node_desc_mad_ifc(dev, node_desc);
263
264	memset(&in, 0, sizeof(in));
265
266	return mlx5_core_access_reg(dev->mdev, &in, sizeof(in), node_desc,
267				    sizeof(struct mlx5_reg_node_desc),
268				    MLX5_REG_NODE_DESC, 0, 0);
269}
270
271static int mlx5_ib_query_device(struct ib_device *ibdev,
272				struct ib_device_attr *props)
273{
274	struct mlx5_ib_dev *dev = to_mdev(ibdev);
275	struct mlx5_core_dev *mdev = dev->mdev;
276	int max_sq_desc;
277	int max_rq_sg;
278	int max_sq_sg;
279	int err;
280
281
282	memset(props, 0, sizeof(*props));
283
284	err = mlx5_query_system_image_guid(ibdev,
285					   &props->sys_image_guid);
286	if (err)
287		return err;
288
289	err = mlx5_query_max_pkeys(ibdev, &props->max_pkeys);
290	if (err)
291		return err;
292
293	err = mlx5_query_vendor_id(ibdev, &props->vendor_id);
294	if (err)
295		return err;
296
297	props->fw_ver = ((u64)fw_rev_maj(dev->mdev) << 32) |
298		((u64)fw_rev_min(dev->mdev) << 16) |
299		fw_rev_sub(dev->mdev);
300	props->device_cap_flags    = IB_DEVICE_CHANGE_PHY_PORT |
301		IB_DEVICE_PORT_ACTIVE_EVENT		|
302		IB_DEVICE_SYS_IMAGE_GUID		|
303		IB_DEVICE_RC_RNR_NAK_GEN;
304
305	if (MLX5_CAP_GEN(mdev, pkv))
306		props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR;
307	if (MLX5_CAP_GEN(mdev, qkv))
308		props->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR;
309	if (MLX5_CAP_GEN(mdev, apm))
310		props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG;
311	props->device_cap_flags |= IB_DEVICE_LOCAL_DMA_LKEY;
312	if (MLX5_CAP_GEN(mdev, xrc))
313		props->device_cap_flags |= IB_DEVICE_XRC;
314	props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS;
315	if (MLX5_CAP_GEN(mdev, block_lb_mc))
316		props->device_cap_flags |= IB_DEVICE_BLOCK_MULTICAST_LOOPBACK;
317
318	props->vendor_part_id	   = mdev->pdev->device;
319	props->hw_ver		   = mdev->pdev->revision;
320
321	props->max_mr_size	   = ~0ull;
322	props->page_size_cap	   = ~(u32)((1ull << MLX5_CAP_GEN(mdev, log_pg_sz)) - 1);
323	props->max_qp		   = 1 << MLX5_CAP_GEN(mdev, log_max_qp);
324	props->max_qp_wr	   = 1 << MLX5_CAP_GEN(mdev, log_max_qp_sz);
325	max_rq_sg =  MLX5_CAP_GEN(mdev, max_wqe_sz_rq) /
326		     sizeof(struct mlx5_wqe_data_seg);
327	max_sq_desc = min((int)MLX5_CAP_GEN(mdev, max_wqe_sz_sq), 512);
328	max_sq_sg = (max_sq_desc -
329		     sizeof(struct mlx5_wqe_ctrl_seg) -
330		     sizeof(struct mlx5_wqe_raddr_seg)) / sizeof(struct mlx5_wqe_data_seg);
331	props->max_sge = min(max_rq_sg, max_sq_sg);
332	props->max_cq		   = 1 << MLX5_CAP_GEN(mdev, log_max_cq);
333	props->max_cqe = (1 << MLX5_CAP_GEN(mdev, log_max_cq_sz)) - 1;
334	props->max_mr		   = 1 << MLX5_CAP_GEN(mdev, log_max_mkey);
335	props->max_pd		   = 1 << MLX5_CAP_GEN(mdev, log_max_pd);
336	props->max_qp_rd_atom	   = 1 << MLX5_CAP_GEN(mdev, log_max_ra_req_qp);
337	props->max_qp_init_rd_atom = 1 << MLX5_CAP_GEN(mdev, log_max_ra_res_qp);
338	props->max_srq		   = 1 << MLX5_CAP_GEN(mdev, log_max_srq);
339	props->max_srq_wr = (1 << MLX5_CAP_GEN(mdev, log_max_srq_sz)) - 1;
340	props->local_ca_ack_delay  = MLX5_CAP_GEN(mdev, local_ca_ack_delay);
341	props->max_res_rd_atom	   = props->max_qp_rd_atom * props->max_qp;
342	props->max_srq_sge	   = max_rq_sg - 1;
343	props->max_fast_reg_page_list_len = (unsigned int)-1;
344	get_atomic_caps(dev, props);
345	props->max_mcast_grp	   = 1 << MLX5_CAP_GEN(mdev, log_max_mcg);
346	props->max_mcast_qp_attach = MLX5_CAP_GEN(mdev, max_qp_mcg);
347	props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
348					   props->max_mcast_grp;
349	props->max_map_per_fmr = INT_MAX; /* no limit in ConnectIB */
350	props->max_ah		= INT_MAX;
351
352	return 0;
353}
354
355enum mlx5_ib_width {
356	MLX5_IB_WIDTH_1X	= 1 << 0,
357	MLX5_IB_WIDTH_2X	= 1 << 1,
358	MLX5_IB_WIDTH_4X	= 1 << 2,
359	MLX5_IB_WIDTH_8X	= 1 << 3,
360	MLX5_IB_WIDTH_12X	= 1 << 4
361};
362
363static int translate_active_width(struct ib_device *ibdev, u8 active_width,
364				  u8 *ib_width)
365{
366	struct mlx5_ib_dev *dev = to_mdev(ibdev);
367	int err = 0;
368
369	if (active_width & MLX5_IB_WIDTH_1X) {
370		*ib_width = IB_WIDTH_1X;
371	} else if (active_width & MLX5_IB_WIDTH_2X) {
372		mlx5_ib_warn(dev, "active_width %d is not supported by IB spec\n",
373			     (int)active_width);
374		err = -EINVAL;
375	} else if (active_width & MLX5_IB_WIDTH_4X) {
376		*ib_width = IB_WIDTH_4X;
377	} else if (active_width & MLX5_IB_WIDTH_8X) {
378		*ib_width = IB_WIDTH_8X;
379	} else if (active_width & MLX5_IB_WIDTH_12X) {
380		*ib_width = IB_WIDTH_12X;
381	} else {
382		mlx5_ib_dbg(dev, "Invalid active_width %d\n",
383			    (int)active_width);
384		err = -EINVAL;
385	}
386
387	return err;
388}
389
390/*
391 * TODO: Move to IB core
392 */
393enum ib_max_vl_num {
394	__IB_MAX_VL_0		= 1,
395	__IB_MAX_VL_0_1		= 2,
396	__IB_MAX_VL_0_3		= 3,
397	__IB_MAX_VL_0_7		= 4,
398	__IB_MAX_VL_0_14	= 5,
399};
400
401enum mlx5_vl_hw_cap {
402	MLX5_VL_HW_0	= 1,
403	MLX5_VL_HW_0_1	= 2,
404	MLX5_VL_HW_0_2	= 3,
405	MLX5_VL_HW_0_3	= 4,
406	MLX5_VL_HW_0_4	= 5,
407	MLX5_VL_HW_0_5	= 6,
408	MLX5_VL_HW_0_6	= 7,
409	MLX5_VL_HW_0_7	= 8,
410	MLX5_VL_HW_0_14	= 15
411};
412
413static int translate_max_vl_num(struct ib_device *ibdev, u8 vl_hw_cap,
414				u8 *max_vl_num)
415{
416	switch (vl_hw_cap) {
417	case MLX5_VL_HW_0:
418		*max_vl_num = __IB_MAX_VL_0;
419		break;
420	case MLX5_VL_HW_0_1:
421		*max_vl_num = __IB_MAX_VL_0_1;
422		break;
423	case MLX5_VL_HW_0_3:
424		*max_vl_num = __IB_MAX_VL_0_3;
425		break;
426	case MLX5_VL_HW_0_7:
427		*max_vl_num = __IB_MAX_VL_0_7;
428		break;
429	case MLX5_VL_HW_0_14:
430		*max_vl_num = __IB_MAX_VL_0_14;
431		break;
432
433	default:
434		return -EINVAL;
435	}
436
437	return 0;
438}
439
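/*
 * Query InfiniBand port attributes through the HCA vport context and
 * the PTYS, PMTU and PVLC access registers.
 */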
440static int mlx5_query_port_ib(struct ib_device *ibdev, u8 port,
441			      struct ib_port_attr *props)
442{
443	struct mlx5_ib_dev *dev = to_mdev(ibdev);
444	struct mlx5_core_dev *mdev = dev->mdev;
445	u32 *rep;
446	int outlen = MLX5_ST_SZ_BYTES(query_hca_vport_context_out);
447	struct mlx5_ptys_reg *ptys;
448	struct mlx5_pmtu_reg *pmtu;
449	struct mlx5_pvlc_reg pvlc;
450	void *ctx;
451	int err;
452
453	rep = mlx5_vzalloc(outlen);
454	ptys = kzalloc(sizeof(*ptys), GFP_KERNEL);
455	pmtu = kzalloc(sizeof(*pmtu), GFP_KERNEL);
456	if (!rep || !ptys || !pmtu) {
457		err = -ENOMEM;
458		goto out;
459	}
460
461	memset(props, 0, sizeof(*props));
462
463	/* TODO: what if this is a PF with a dual port? */
464	err = mlx5_query_hca_vport_context(mdev, port, 0, rep, outlen);
465	if (err)
466		goto out;
467
468	ctx = MLX5_ADDR_OF(query_hca_vport_context_out, rep, hca_vport_context);
469
470	props->lid		= MLX5_GET(hca_vport_context, ctx, lid);
471	props->lmc		= MLX5_GET(hca_vport_context, ctx, lmc);
472	props->sm_lid		= MLX5_GET(hca_vport_context, ctx, sm_lid);
473	props->sm_sl		= MLX5_GET(hca_vport_context, ctx, sm_sl);
474	props->state		= MLX5_GET(hca_vport_context, ctx, vport_state);
475	props->phys_state	= MLX5_GET(hca_vport_context, ctx,
476					port_physical_state);
477	props->port_cap_flags	= MLX5_GET(hca_vport_context, ctx, cap_mask1);
478	props->gid_tbl_len	= mlx5_get_gid_table_len(MLX5_CAP_GEN(mdev, gid_table_size));
479	props->max_msg_sz	= 1 << MLX5_CAP_GEN(mdev, log_max_msg);
480	props->pkey_tbl_len	= mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(mdev, pkey_table_size));
481	props->bad_pkey_cntr	= MLX5_GET(hca_vport_context, ctx,
482					      pkey_violation_counter);
483	props->qkey_viol_cntr	= MLX5_GET(hca_vport_context, ctx,
484					      qkey_violation_counter);
485	props->subnet_timeout	= MLX5_GET(hca_vport_context, ctx,
486					      subnet_timeout);
487	props->init_type_reply	= MLX5_GET(hca_vport_context, ctx,
488					   init_type_reply);
489
490	ptys->proto_mask |= MLX5_PTYS_IB;
491	ptys->local_port = port;
492	err = mlx5_core_access_ptys(mdev, ptys, 0);
493	if (err)
494		goto out;
495
496	err = translate_active_width(ibdev, ptys->ib_link_width_oper,
497				     &props->active_width);
498	if (err)
499		goto out;
500
501	props->active_speed	= (u8)ptys->ib_proto_oper;
502
503	pmtu->local_port = port;
504	err = mlx5_core_access_pmtu(mdev, pmtu, 0);
505	if (err)
506		goto out;
507
508	props->max_mtu		= pmtu->max_mtu;
509	props->active_mtu	= pmtu->oper_mtu;
510
511	memset(&pvlc, 0, sizeof(pvlc));
512	pvlc.local_port = port;
513	err = mlx5_core_access_pvlc(mdev, &pvlc, 0);
514	if (err)
515		goto out;
516
517	err = translate_max_vl_num(ibdev, pvlc.vl_hw_cap,
518				   &props->max_vl_num);
519out:
520	kvfree(rep);
521	kfree(ptys);
522	kfree(pmtu);
523	return err;
524}
525
526int mlx5_ib_query_port(struct ib_device *ibdev, u8 port,
527		       struct ib_port_attr *props)
528{
529	switch (mlx5_get_vport_access_method(ibdev)) {
530	case MLX5_VPORT_ACCESS_METHOD_MAD:
531		return mlx5_query_port_mad_ifc(ibdev, port, props);
532
533	case MLX5_VPORT_ACCESS_METHOD_HCA:
534		return mlx5_query_port_ib(ibdev, port, props);
535
536	case MLX5_VPORT_ACCESS_METHOD_NIC:
537		return mlx5_query_port_roce(ibdev, port, props);
538
539	default:
540		return -EINVAL;
541	}
542}
543
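/*
 * Build the modified EUI-64 interface identifier from the 48-bit
 * Ethernet MAC address of the given network interface.
 */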
544static inline int
545mlx5_addrconf_ifid_eui48(u8 *eui, struct net_device *dev)
546{
547	if (dev->if_addrlen != ETH_ALEN)
548		return -1;
549	memcpy(eui, IF_LLADDR(dev), 3);
550	memcpy(eui + 5, IF_LLADDR(dev) + 3, 3);
551
552	/* NOTE: The scope ID is added by the GID to IP conversion */
553
554	eui[3] = 0xFF;
555	eui[4] = 0xFE;
556	eui[0] ^= 2;
557	return 0;
558}
559
560static void
561mlx5_make_default_gid(struct net_device *dev, union ib_gid *gid)
562{
563	gid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL);
564	mlx5_addrconf_ifid_eui48(&gid->raw[8], dev);
565}
566
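/*
 * Convert an IPv4 or IPv6 socket address into the RoCE GID format;
 * IPv4 addresses are stored as IPv4-mapped IPv6 addresses.
 */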
567static inline int
568mlx5_ip2gid(const struct sockaddr *addr, union ib_gid *gid)
569{
570	switch (addr->sa_family) {
571	case AF_INET:
572		ipv6_addr_set_v4mapped(((const struct sockaddr_in *)addr)->sin_addr.s_addr,
573		    (struct in6_addr *)gid->raw);
574		break;
575	case AF_INET6:
576		memcpy(gid->raw, &((const struct sockaddr_in6 *)addr)->sin6_addr, 16);
577		/* clear SCOPE ID */
578		gid->raw[2] = 0;
579		gid->raw[3] = 0;
580		break;
581	default:
582		return -EINVAL;
583	}
584	return 0;
585}
586
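/*
 * Kernel thread which periodically scans the network interfaces and
 * their addresses, synchronizes the RoCE GID table of the port, and
 * dispatches IB_EVENT_GID_CHANGE whenever an entry changes.
 */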
587static void
588mlx5_ib_roce_port_update(void *arg)
589{
590	struct mlx5_ib_port *port = (struct mlx5_ib_port *)arg;
591	struct mlx5_ib_dev *dev = port->dev;
592	struct mlx5_core_dev *mdev = dev->mdev;
593	struct net_device *xdev[MLX5_IB_GID_MAX];
594	struct net_device *idev;
595	struct net_device *ndev;
596	struct ifaddr *ifa;
597	union ib_gid gid_temp;
598
599	while (port->port_gone == 0) {
600		int update = 0;
601		int gid_index = 0;
602		int j;
603		int error;
604
605		ndev = mlx5_get_protocol_dev(mdev, MLX5_INTERFACE_PROTOCOL_ETH);
606		if (ndev == NULL) {
607			pause("W", hz);
608			continue;
609		}
610
611		CURVNET_SET_QUIET(ndev->if_vnet);
612
613		memset(&gid_temp, 0, sizeof(gid_temp));
614		mlx5_make_default_gid(ndev, &gid_temp);
615		if (bcmp(&gid_temp, &port->gid_table[gid_index], sizeof(gid_temp))) {
616			port->gid_table[gid_index] = gid_temp;
617			update = 1;
618		}
619		xdev[gid_index] = ndev;
620		gid_index++;
621
622		IFNET_RLOCK();
623		TAILQ_FOREACH(idev, &V_ifnet, if_link) {
624			if (idev == ndev)
625				break;
626		}
627		if (idev != NULL) {
628		    TAILQ_FOREACH(idev, &V_ifnet, if_link) {
629			if (idev != ndev) {
630				if (idev->if_type != IFT_L2VLAN)
631					continue;
632				if (ndev != rdma_vlan_dev_real_dev(idev))
633					continue;
634			}
635			/* clone address information for IPv4 and IPv6 */
636			IF_ADDR_RLOCK(idev);
637			TAILQ_FOREACH(ifa, &idev->if_addrhead, ifa_link) {
638				if (ifa->ifa_addr == NULL ||
639				    (ifa->ifa_addr->sa_family != AF_INET &&
640				     ifa->ifa_addr->sa_family != AF_INET6) ||
641				    gid_index >= MLX5_IB_GID_MAX)
642					continue;
643				memset(&gid_temp, 0, sizeof(gid_temp));
644				mlx5_ip2gid(ifa->ifa_addr, &gid_temp);
645				/* check for existing entry */
646				for (j = 0; j != gid_index; j++) {
647					if (bcmp(&gid_temp, &port->gid_table[j], sizeof(gid_temp)) == 0)
648						break;
649				}
650				/* check if new entry must be added */
651				if (j == gid_index) {
652					if (bcmp(&gid_temp, &port->gid_table[gid_index], sizeof(gid_temp))) {
653						port->gid_table[gid_index] = gid_temp;
654						update = 1;
655					}
656					xdev[gid_index] = idev;
657					gid_index++;
658				}
659			}
660			IF_ADDR_RUNLOCK(idev);
661		    }
662		}
663		IFNET_RUNLOCK();
664		CURVNET_RESTORE();
665
666		if (update != 0 &&
667		    mlx5_ib_port_link_layer(&dev->ib_dev, 1) == IB_LINK_LAYER_ETHERNET) {
668			struct ib_event event = {
669			    .device = &dev->ib_dev,
670			    .element.port_num = port->port_num + 1,
671			    .event = IB_EVENT_GID_CHANGE,
672			};
673
674			/* add new entries, if any */
675			for (j = 0; j != gid_index; j++) {
676				error = modify_gid_roce(&dev->ib_dev, port->port_num, j,
677				    port->gid_table + j, xdev[j]);
678				if (error != 0)
679					printf("mlx5_ib: Failed to update ROCE GID table: %d\n", error);
680			}
681			memset(&gid_temp, 0, sizeof(gid_temp));
682
683			/* clear old entries, if any */
684			for (; j != MLX5_IB_GID_MAX; j++) {
685				if (bcmp(&gid_temp, port->gid_table + j, sizeof(gid_temp)) == 0)
686					continue;
687				port->gid_table[j] = gid_temp;
688				(void) modify_gid_roce(&dev->ib_dev, port->port_num, j,
689				    port->gid_table + j, ndev);
690			}
691
692			/* make sure ibcore gets updated */
693			ib_dispatch_event(&event);
694		}
695		pause("W", hz);
696	}
697	do {
698		struct ib_event event = {
699			.device = &dev->ib_dev,
700			.element.port_num = port->port_num + 1,
701			.event = IB_EVENT_GID_CHANGE,
702		};
703		/* make sure ibcore gets updated */
704		ib_dispatch_event(&event);
705
706		/* wait a bit */
707		pause("W", hz);
708	} while (0);
709	port->port_gone = 2;
710	kthread_exit();
711}
712
713static int mlx5_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
714			     union ib_gid *gid)
715{
716	struct mlx5_ib_dev *dev = to_mdev(ibdev);
717	struct mlx5_core_dev *mdev = dev->mdev;
718
719	switch (mlx5_get_vport_access_method(ibdev)) {
720	case MLX5_VPORT_ACCESS_METHOD_MAD:
721		return mlx5_query_gids_mad_ifc(ibdev, port, index, gid);
722
723	case MLX5_VPORT_ACCESS_METHOD_HCA:
724		return mlx5_query_hca_vport_gid(mdev, port, 0, index, gid);
725
726	case MLX5_VPORT_ACCESS_METHOD_NIC:
727		if (port == 0 || port > MLX5_CAP_GEN(mdev, num_ports) ||
728		    index < 0 || index >= MLX5_IB_GID_MAX ||
729		    dev->port[port - 1].port_gone != 0)
730			memset(gid, 0, sizeof(*gid));
731		else
732			*gid = dev->port[port - 1].gid_table[index];
733		return 0;
734
735	default:
736		return -EINVAL;
737	}
738}
739
740static int mlx5_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
741			      u16 *pkey)
742{
743	struct mlx5_ib_dev *dev = to_mdev(ibdev);
744	struct mlx5_core_dev *mdev = dev->mdev;
745
746	switch (mlx5_get_vport_access_method(ibdev)) {
747	case MLX5_VPORT_ACCESS_METHOD_MAD:
748		return mlx5_query_pkey_mad_ifc(ibdev, port, index, pkey);
749
750	case MLX5_VPORT_ACCESS_METHOD_HCA:
751	case MLX5_VPORT_ACCESS_METHOD_NIC:
752		return mlx5_query_hca_vport_pkey(mdev, 0, port, 0, index,
753						 pkey);
754
755	default:
756		return -EINVAL;
757	}
758}
759
760static int mlx5_ib_modify_device(struct ib_device *ibdev, int mask,
761				 struct ib_device_modify *props)
762{
763	struct mlx5_ib_dev *dev = to_mdev(ibdev);
764	struct mlx5_reg_node_desc in;
765	struct mlx5_reg_node_desc out;
766	int err;
767
768	if (mask & ~IB_DEVICE_MODIFY_NODE_DESC)
769		return -EOPNOTSUPP;
770
771	if (!(mask & IB_DEVICE_MODIFY_NODE_DESC))
772		return 0;
773
774	/*
775	 * If possible, pass node desc to FW, so it can generate
776	 * a 144 trap.  If cmd fails, just ignore.
777	 */
778	memcpy(&in, props->node_desc, 64);
779	err = mlx5_core_access_reg(dev->mdev, &in, sizeof(in), &out,
780				   sizeof(out), MLX5_REG_NODE_DESC, 0, 1);
781	if (err)
782		return err;
783
784	memcpy(ibdev->node_desc, props->node_desc, 64);
785
786	return err;
787}
788
789static int mlx5_ib_modify_port(struct ib_device *ibdev, u8 port, int mask,
790			       struct ib_port_modify *props)
791{
792	u8 is_eth = (mlx5_ib_port_link_layer(ibdev, port) ==
793		     IB_LINK_LAYER_ETHERNET);
794	struct mlx5_ib_dev *dev = to_mdev(ibdev);
795	struct ib_port_attr attr;
796	u32 tmp;
797	int err;
798
799	/* return OK if this is RoCE. CM calls ib_modify_port() regardless
800	 * of whether port link layer is ETH or IB. For ETH ports, qkey
801	 * violations and port capabilities are not valid.
802	 */
803	if (is_eth)
804		return 0;
805
806	mutex_lock(&dev->cap_mask_mutex);
807
808	err = mlx5_ib_query_port(ibdev, port, &attr);
809	if (err)
810		goto out;
811
812	tmp = (attr.port_cap_flags | props->set_port_cap_mask) &
813		~props->clr_port_cap_mask;
814
815	err = mlx5_set_port_caps(dev->mdev, port, tmp);
816
817out:
818	mutex_unlock(&dev->cap_mask_mutex);
819	return err;
820}
821
822enum mlx5_cap_flags {
823	MLX5_CAP_COMPACT_AV = 1 << 0,
824};
825
826static void set_mlx5_flags(u32 *flags, struct mlx5_core_dev *dev)
827{
828	*flags |= MLX5_CAP_GEN(dev, compact_address_vector) ?
829		  MLX5_CAP_COMPACT_AV : 0;
830}
831
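/*
 * Allocate a user context: validate the ABI request, allocate the
 * requested UARs and the per-context bookkeeping, and return the
 * response (UAR count, hardware limits, flags) to user space.
 */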
832static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
833						  struct ib_udata *udata)
834{
835	struct mlx5_ib_dev *dev = to_mdev(ibdev);
836	struct mlx5_ib_alloc_ucontext_req_v2 req;
837	struct mlx5_ib_alloc_ucontext_resp resp;
838	struct mlx5_ib_ucontext *context;
839	struct mlx5_uuar_info *uuari;
840	struct mlx5_uar *uars;
841	int gross_uuars;
842	int num_uars;
843	int ver;
844	int uuarn;
845	int err;
846	int i;
847	size_t reqlen;
848
849	if (!dev->ib_active)
850		return ERR_PTR(-EAGAIN);
851
852	memset(&req, 0, sizeof(req));
853	memset(&resp, 0, sizeof(resp));
854
855	reqlen = udata->inlen - sizeof(struct ib_uverbs_cmd_hdr);
856	if (reqlen == sizeof(struct mlx5_ib_alloc_ucontext_req))
857		ver = 0;
858	else if (reqlen == sizeof(struct mlx5_ib_alloc_ucontext_req_v2))
859		ver = 2;
860	else {
861		mlx5_ib_err(dev, "request malformed, reqlen: %ld\n", (long)reqlen);
862		return ERR_PTR(-EINVAL);
863	}
864
865	err = ib_copy_from_udata(&req, udata, reqlen);
866	if (err) {
867		mlx5_ib_err(dev, "copy failed\n");
868		return ERR_PTR(err);
869	}
870
871	if (req.reserved) {
872		mlx5_ib_err(dev, "request corrupted\n");
873		return ERR_PTR(-EINVAL);
874	}
875
876	if (req.total_num_uuars == 0 || req.total_num_uuars > MLX5_MAX_UUARS) {
877		mlx5_ib_warn(dev, "wrong num_uuars: %d\n", req.total_num_uuars);
878		return ERR_PTR(-ENOMEM);
879	}
880
881	req.total_num_uuars = ALIGN(req.total_num_uuars,
882				    MLX5_NON_FP_BF_REGS_PER_PAGE);
883	if (req.num_low_latency_uuars > req.total_num_uuars - 1) {
884		mlx5_ib_warn(dev, "wrong num_low_latency_uuars: %d ( > %d)\n",
885			     req.num_low_latency_uuars, req.total_num_uuars);
886		return ERR_PTR(-EINVAL);
887	}
888
889	num_uars = req.total_num_uuars / MLX5_NON_FP_BF_REGS_PER_PAGE;
890	gross_uuars = num_uars * MLX5_BF_REGS_PER_PAGE;
891	resp.qp_tab_size = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp);
892	if (mlx5_core_is_pf(dev->mdev) && MLX5_CAP_GEN(dev->mdev, bf))
893		resp.bf_reg_size = 1 << MLX5_CAP_GEN(dev->mdev, log_bf_reg_size);
894	resp.cache_line_size = L1_CACHE_BYTES;
895	resp.max_sq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq);
896	resp.max_rq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_rq);
897	resp.max_send_wqebb = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz);
898	resp.max_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz);
899	resp.max_srq_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_srq_sz);
900	set_mlx5_flags(&resp.flags, dev->mdev);
901
902	if (offsetof(struct mlx5_ib_alloc_ucontext_resp, max_desc_sz_sq_dc) < udata->outlen)
903		resp.max_desc_sz_sq_dc = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq_dc);
904
905	if (offsetof(struct mlx5_ib_alloc_ucontext_resp, atomic_arg_sizes_dc) < udata->outlen)
906		resp.atomic_arg_sizes_dc = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_dc);
907
908	context = kzalloc(sizeof(*context), GFP_KERNEL);
909	if (!context)
910		return ERR_PTR(-ENOMEM);
911
912	uuari = &context->uuari;
913	mutex_init(&uuari->lock);
914	uars = kcalloc(num_uars, sizeof(*uars), GFP_KERNEL);
915	if (!uars) {
916		err = -ENOMEM;
917		goto out_ctx;
918	}
919
920	uuari->bitmap = kcalloc(BITS_TO_LONGS(gross_uuars),
921				sizeof(*uuari->bitmap),
922				GFP_KERNEL);
923	if (!uuari->bitmap) {
924		err = -ENOMEM;
925		goto out_uar_ctx;
926	}
927	/*
928	 * clear all fast path uuars
929	 */
930	for (i = 0; i < gross_uuars; i++) {
931		uuarn = i & 3;
932		if (uuarn == 2 || uuarn == 3)
933			set_bit(i, uuari->bitmap);
934	}
935
936	uuari->count = kcalloc(gross_uuars, sizeof(*uuari->count), GFP_KERNEL);
937	if (!uuari->count) {
938		err = -ENOMEM;
939		goto out_bitmap;
940	}
941
942	for (i = 0; i < num_uars; i++) {
943		err = mlx5_cmd_alloc_uar(dev->mdev, &uars[i].index);
944		if (err) {
945			mlx5_ib_err(dev, "uar alloc failed at %d\n", i);
946			goto out_uars;
947		}
948	}
949	for (i = 0; i < MLX5_IB_MAX_CTX_DYNAMIC_UARS; i++)
950		context->dynamic_wc_uar_index[i] = MLX5_IB_INVALID_UAR_INDEX;
951
952	INIT_LIST_HEAD(&context->db_page_list);
953	mutex_init(&context->db_page_mutex);
954
955	resp.tot_uuars = req.total_num_uuars;
956	resp.num_ports = MLX5_CAP_GEN(dev->mdev, num_ports);
957	err = ib_copy_to_udata(udata, &resp,
958			       min_t(size_t, udata->outlen, sizeof(resp)));
959	if (err)
960		goto out_uars;
961
962	uuari->ver = ver;
963	uuari->num_low_latency_uuars = req.num_low_latency_uuars;
964	uuari->uars = uars;
965	uuari->num_uars = num_uars;
966
967	if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) ==
968	    IB_LINK_LAYER_ETHERNET) {
969		err = mlx5_alloc_transport_domain(dev->mdev, &context->tdn);
970		if (err)
971			goto out_uars;
972	}
973
974	return &context->ibucontext;
975
976out_uars:
977	for (i--; i >= 0; i--)
978		mlx5_cmd_free_uar(dev->mdev, uars[i].index);
979	kfree(uuari->count);
980
981out_bitmap:
982	kfree(uuari->bitmap);
983
984out_uar_ctx:
985	kfree(uars);
986
987out_ctx:
988	kfree(context);
989	return ERR_PTR(err);
990}
991
992static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
993{
994	struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
995	struct mlx5_ib_dev *dev = to_mdev(ibcontext->device);
996	struct mlx5_uuar_info *uuari = &context->uuari;
997	int i;
998
999	if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) ==
1000	    IB_LINK_LAYER_ETHERNET)
1001		mlx5_dealloc_transport_domain(dev->mdev, context->tdn);
1002
1003	for (i = 0; i < uuari->num_uars; i++) {
1004		if (mlx5_cmd_free_uar(dev->mdev, uuari->uars[i].index))
1005			mlx5_ib_warn(dev, "failed to free UAR 0x%x\n", uuari->uars[i].index);
1006	}
1007	for (i = 0; i < MLX5_IB_MAX_CTX_DYNAMIC_UARS; i++) {
1008		if (context->dynamic_wc_uar_index[i] != MLX5_IB_INVALID_UAR_INDEX)
1009			mlx5_cmd_free_uar(dev->mdev, context->dynamic_wc_uar_index[i]);
1010	}
1011
1012	kfree(uuari->count);
1013	kfree(uuari->bitmap);
1014	kfree(uuari->uars);
1015	kfree(context);
1016
1017	return 0;
1018}
1019
1020static phys_addr_t uar_index2pfn(struct mlx5_ib_dev *dev, int index)
1021{
1022	return (pci_resource_start(dev->mdev->pdev, 0) >> PAGE_SHIFT) + index;
1023}
1024
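/*
 * The mmap page offset encodes a command in the bits above
 * MLX5_IB_MMAP_CMD_SHIFT and an argument (the UAR index) in the bits
 * below it, i.e. vm_pgoff == (command << MLX5_IB_MMAP_CMD_SHIFT) | index.
 */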
1025static int get_command(unsigned long offset)
1026{
1027	return (offset >> MLX5_IB_MMAP_CMD_SHIFT) & MLX5_IB_MMAP_CMD_MASK;
1028}
1029
1030static int get_arg(unsigned long offset)
1031{
1032	return offset & ((1 << MLX5_IB_MMAP_CMD_SHIFT) - 1);
1033}
1034
1035static int get_index(unsigned long offset)
1036{
1037	return get_arg(offset);
1038}
1039
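/*
 * Map a single UAR page into user space with the requested caching
 * attribute (write-combining or non-cached).
 */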
1040static int uar_mmap(struct vm_area_struct *vma, pgprot_t prot, bool is_wc,
1041		    struct mlx5_uuar_info *uuari, struct mlx5_ib_dev *dev,
1042		    struct mlx5_ib_ucontext *context)
1043{
1044	unsigned long idx;
1045	phys_addr_t pfn;
1046
1047	if (vma->vm_end - vma->vm_start != PAGE_SIZE) {
1048		mlx5_ib_warn(dev, "wrong size, expected PAGE_SIZE(%ld) got %ld\n",
1049			     (long)PAGE_SIZE, (long)(vma->vm_end - vma->vm_start));
1050		return -EINVAL;
1051	}
1052
1053	idx = get_index(vma->vm_pgoff);
1054	if (idx >= uuari->num_uars) {
1055		mlx5_ib_warn(dev, "wrong offset, idx:%ld num_uars:%d\n",
1056			     idx, uuari->num_uars);
1057		return -EINVAL;
1058	}
1059
1060	pfn = uar_index2pfn(dev, uuari->uars[idx].index);
1061	mlx5_ib_dbg(dev, "uar idx 0x%lx, pfn 0x%llx\n", idx,
1062		    (unsigned long long)pfn);
1063
1064	vma->vm_page_prot = prot;
1065	if (io_remap_pfn_range(vma, vma->vm_start, pfn,
1066			       PAGE_SIZE, vma->vm_page_prot)) {
1067		mlx5_ib_err(dev, "io remap failed\n");
1068		return -EAGAIN;
1069	}
1070
1071	mlx5_ib_dbg(dev, "mapped %s at 0x%lx, PA 0x%llx\n", is_wc ? "WC" : "NC",
1072		    (long)vma->vm_start, (unsigned long long)pfn << PAGE_SHIFT);
1073
1074	return 0;
1075}
1076
1077static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma)
1078{
1079	struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
1080	struct mlx5_ib_dev *dev = to_mdev(ibcontext->device);
1081	struct mlx5_uuar_info *uuari = &context->uuari;
1082	unsigned long command;
1083
1084	command = get_command(vma->vm_pgoff);
1085	switch (command) {
1086	case MLX5_IB_MMAP_REGULAR_PAGE:
1087		return uar_mmap(vma, pgprot_writecombine(vma->vm_page_prot),
1088				true,
1089				uuari, dev, context);
1090
1091		break;
1092
1093	case MLX5_IB_MMAP_WC_PAGE:
1094		return uar_mmap(vma, pgprot_writecombine(vma->vm_page_prot),
1095				true, uuari, dev, context);
1096		break;
1097
1098	case MLX5_IB_MMAP_NC_PAGE:
1099		return uar_mmap(vma, pgprot_noncached(vma->vm_page_prot),
1100				false, uuari, dev, context);
1101		break;
1102
1103	default:
1104		return -EINVAL;
1105	}
1106
1107	return 0;
1108}
1109
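/*
 * Create a physical-address memory key spanning the whole address
 * space; it is used as the pa_lkey of protection domains allocated
 * from within the kernel.
 */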
1110static int alloc_pa_mkey(struct mlx5_ib_dev *dev, u32 *key, u32 pdn)
1111{
1112	struct mlx5_create_mkey_mbox_in *in;
1113	struct mlx5_mkey_seg *seg;
1114	struct mlx5_core_mr mr;
1115	int err;
1116
1117	in = kzalloc(sizeof(*in), GFP_KERNEL);
1118	if (!in)
1119		return -ENOMEM;
1120
1121	seg = &in->seg;
1122	seg->flags = MLX5_PERM_LOCAL_READ | MLX5_ACCESS_MODE_PA;
1123	seg->flags_pd = cpu_to_be32(pdn | MLX5_MKEY_LEN64);
1124	seg->qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
1125	seg->start_addr = 0;
1126
1127	err = mlx5_core_create_mkey(dev->mdev, &mr, in, sizeof(*in),
1128				    NULL, NULL, NULL);
1129	if (err) {
1130		mlx5_ib_warn(dev, "failed to create mkey, %d\n", err);
1131		goto err_in;
1132	}
1133
1134	kfree(in);
1135	*key = mr.key;
1136
1137	return 0;
1138
1139err_in:
1140	kfree(in);
1141
1142	return err;
1143}
1144
1145static void free_pa_mkey(struct mlx5_ib_dev *dev, u32 key)
1146{
1147	struct mlx5_core_mr mr;
1148	int err;
1149
1150	memset(&mr, 0, sizeof(mr));
1151	mr.key = key;
1152	err = mlx5_core_destroy_mkey(dev->mdev, &mr);
1153	if (err)
1154		mlx5_ib_warn(dev, "failed to destroy mkey 0x%x\n", key);
1155}
1156
1157static struct ib_pd *mlx5_ib_alloc_pd(struct ib_device *ibdev,
1158				      struct ib_ucontext *context,
1159				      struct ib_udata *udata)
1160{
1161	struct mlx5_ib_dev *dev = to_mdev(ibdev);
1162	struct mlx5_ib_alloc_pd_resp resp;
1163	struct mlx5_ib_pd *pd;
1164	int err;
1165
1166	pd = kmalloc(sizeof(*pd), GFP_KERNEL);
1167	if (!pd)
1168		return ERR_PTR(-ENOMEM);
1169
1170	err = mlx5_core_alloc_pd(to_mdev(ibdev)->mdev, &pd->pdn);
1171	if (err) {
1172		mlx5_ib_warn(dev, "pd alloc failed\n");
1173		kfree(pd);
1174		return ERR_PTR(err);
1175	}
1176
1177	if (context) {
1178		resp.pdn = pd->pdn;
1179		if (ib_copy_to_udata(udata, &resp, sizeof(resp))) {
1180			mlx5_ib_err(dev, "copy failed\n");
1181			mlx5_core_dealloc_pd(to_mdev(ibdev)->mdev, pd->pdn);
1182			kfree(pd);
1183			return ERR_PTR(-EFAULT);
1184		}
1185	} else {
1186		err = alloc_pa_mkey(to_mdev(ibdev), &pd->pa_lkey, pd->pdn);
1187		if (err) {
1188			mlx5_ib_err(dev, "alloc mkey failed\n");
1189			mlx5_core_dealloc_pd(to_mdev(ibdev)->mdev, pd->pdn);
1190			kfree(pd);
1191			return ERR_PTR(err);
1192		}
1193	}
1194
1195	return &pd->ibpd;
1196}
1197
1198static int mlx5_ib_dealloc_pd(struct ib_pd *pd)
1199{
1200	struct mlx5_ib_dev *mdev = to_mdev(pd->device);
1201	struct mlx5_ib_pd *mpd = to_mpd(pd);
1202
1203	if (!pd->uobject)
1204		free_pa_mkey(mdev, mpd->pa_lkey);
1205
1206	mlx5_core_dealloc_pd(mdev->mdev, mpd->pdn);
1207	kfree(mpd);
1208
1209	return 0;
1210}
1211
1212static int mlx5_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
1213{
1214	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
1215	int err;
1216
1217	if (ibqp->qp_type == IB_QPT_RAW_PACKET)
1218		err = -EOPNOTSUPP;
1219	else
1220		err = mlx5_core_attach_mcg(dev->mdev, gid, ibqp->qp_num);
1221	if (err)
1222		mlx5_ib_warn(dev, "failed attaching QPN 0x%x, MGID %pI6\n",
1223			     ibqp->qp_num, gid->raw);
1224
1225	return err;
1226}
1227
1228static int mlx5_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
1229{
1230	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
1231	int err;
1232
1233	if (ibqp->qp_type == IB_QPT_RAW_PACKET)
1234		err = -EOPNOTSUPP;
1235	else
1236		err = mlx5_core_detach_mcg(dev->mdev, gid, ibqp->qp_num);
1237	if (err)
1238		mlx5_ib_warn(dev, "failed detaching QPN 0x%x, MGID %pI6\n",
1239			     ibqp->qp_num, gid->raw);
1240
1241	return err;
1242}
1243
1244static int init_node_data(struct mlx5_ib_dev *dev)
1245{
1246	int err;
1247
1248	err = mlx5_query_node_desc(dev, dev->ib_dev.node_desc);
1249	if (err)
1250		return err;
1251
1252	return mlx5_query_node_guid(dev, &dev->ib_dev.node_guid);
1253}
1254
1255static ssize_t show_fw_pages(struct device *device, struct device_attribute *attr,
1256			     char *buf)
1257{
1258	struct mlx5_ib_dev *dev =
1259		container_of(device, struct mlx5_ib_dev, ib_dev.dev);
1260
1261	return sprintf(buf, "%lld\n", (long long)dev->mdev->priv.fw_pages);
1262}
1263
1264static ssize_t show_reg_pages(struct device *device,
1265			      struct device_attribute *attr, char *buf)
1266{
1267	struct mlx5_ib_dev *dev =
1268		container_of(device, struct mlx5_ib_dev, ib_dev.dev);
1269
1270	return sprintf(buf, "%d\n", atomic_read(&dev->mdev->priv.reg_pages));
1271}
1272
1273static ssize_t show_hca(struct device *device, struct device_attribute *attr,
1274			char *buf)
1275{
1276	struct mlx5_ib_dev *dev =
1277		container_of(device, struct mlx5_ib_dev, ib_dev.dev);
1278	return sprintf(buf, "MT%d\n", dev->mdev->pdev->device);
1279}
1280
1281static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr,
1282			   char *buf)
1283{
1284	struct mlx5_ib_dev *dev =
1285		container_of(device, struct mlx5_ib_dev, ib_dev.dev);
1286	return sprintf(buf, "%d.%d.%04d\n", fw_rev_maj(dev->mdev),
1287		       fw_rev_min(dev->mdev), fw_rev_sub(dev->mdev));
1288}
1289
1290static ssize_t show_rev(struct device *device, struct device_attribute *attr,
1291			char *buf)
1292{
1293	struct mlx5_ib_dev *dev =
1294		container_of(device, struct mlx5_ib_dev, ib_dev.dev);
1295	return sprintf(buf, "%x\n", (unsigned)dev->mdev->pdev->revision);
1296}
1297
1298static ssize_t show_board(struct device *device, struct device_attribute *attr,
1299			  char *buf)
1300{
1301	struct mlx5_ib_dev *dev =
1302		container_of(device, struct mlx5_ib_dev, ib_dev.dev);
1303	return sprintf(buf, "%.*s\n", MLX5_BOARD_ID_LEN,
1304		       dev->mdev->board_id);
1305}
1306
1307static DEVICE_ATTR(hw_rev,   S_IRUGO, show_rev,    NULL);
1308static DEVICE_ATTR(fw_ver,   S_IRUGO, show_fw_ver, NULL);
1309static DEVICE_ATTR(hca_type, S_IRUGO, show_hca,    NULL);
1310static DEVICE_ATTR(board_id, S_IRUGO, show_board,  NULL);
1311static DEVICE_ATTR(fw_pages, S_IRUGO, show_fw_pages, NULL);
1312static DEVICE_ATTR(reg_pages, S_IRUGO, show_reg_pages, NULL);
1313
1314static struct device_attribute *mlx5_class_attributes[] = {
1315	&dev_attr_hw_rev,
1316	&dev_attr_fw_ver,
1317	&dev_attr_hca_type,
1318	&dev_attr_board_id,
1319	&dev_attr_fw_pages,
1320	&dev_attr_reg_pages,
1321};
1322
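/*
 * On a fatal device error, walk all QPs with outstanding work and
 * schedule completion on their send and receive CQs so that waiters
 * are woken up during the reset flow.
 */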
1323static void mlx5_ib_handle_internal_error(struct mlx5_ib_dev *ibdev)
1324{
1325	struct mlx5_ib_qp *mqp;
1326	struct mlx5_ib_cq *send_mcq, *recv_mcq;
1327	struct mlx5_core_cq *mcq;
1328	struct list_head cq_armed_list;
1329	unsigned long flags_qp;
1330	unsigned long flags_cq;
1331	unsigned long flags;
1332
1333	mlx5_ib_warn(ibdev, " started\n");
1334	INIT_LIST_HEAD(&cq_armed_list);
1335
1336	/* Go over the qp list residing on this ibdev; sync with create/destroy qp. */
1337	spin_lock_irqsave(&ibdev->reset_flow_resource_lock, flags);
1338	list_for_each_entry(mqp, &ibdev->qp_list, qps_list) {
1339		spin_lock_irqsave(&mqp->sq.lock, flags_qp);
1340		if (mqp->sq.tail != mqp->sq.head) {
1341			send_mcq = to_mcq(mqp->ibqp.send_cq);
1342			spin_lock_irqsave(&send_mcq->lock, flags_cq);
1343			if (send_mcq->mcq.comp &&
1344			    mqp->ibqp.send_cq->comp_handler) {
1345				if (!send_mcq->mcq.reset_notify_added) {
1346					send_mcq->mcq.reset_notify_added = 1;
1347					list_add_tail(&send_mcq->mcq.reset_notify,
1348						      &cq_armed_list);
1349				}
1350			}
1351			spin_unlock_irqrestore(&send_mcq->lock, flags_cq);
1352		}
1353		spin_unlock_irqrestore(&mqp->sq.lock, flags_qp);
1354		spin_lock_irqsave(&mqp->rq.lock, flags_qp);
1355		/* no handling is needed for SRQ */
1356		if (!mqp->ibqp.srq) {
1357			if (mqp->rq.tail != mqp->rq.head) {
1358				recv_mcq = to_mcq(mqp->ibqp.recv_cq);
1359				spin_lock_irqsave(&recv_mcq->lock, flags_cq);
1360				if (recv_mcq->mcq.comp &&
1361				    mqp->ibqp.recv_cq->comp_handler) {
1362					if (!recv_mcq->mcq.reset_notify_added) {
1363						recv_mcq->mcq.reset_notify_added = 1;
1364						list_add_tail(&recv_mcq->mcq.reset_notify,
1365							      &cq_armed_list);
1366					}
1367				}
1368				spin_unlock_irqrestore(&recv_mcq->lock,
1369						       flags_cq);
1370			}
1371		}
1372		spin_unlock_irqrestore(&mqp->rq.lock, flags_qp);
1373	}
1374	/* At this point all in-flight post-send work has been flushed by the
1375	 * lock/unlock sequence above; now arm all involved CQs.
1376	 */
1377	list_for_each_entry(mcq, &cq_armed_list, reset_notify) {
1378		mcq->comp(mcq);
1379	}
1380	spin_unlock_irqrestore(&ibdev->reset_flow_resource_lock, flags);
1381	mlx5_ib_warn(ibdev, " ended\n");
1382	return;
1383}
1384
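/*
 * Translate mlx5 core device events into IB events and dispatch them
 * to the IB core.
 */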
1385static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context,
1386			  enum mlx5_dev_event event, unsigned long param)
1387{
1388	struct mlx5_ib_dev *ibdev = (struct mlx5_ib_dev *)context;
1389	struct ib_event ibev;
1390
1391	u8 port = 0;
1392
1393	switch (event) {
1394	case MLX5_DEV_EVENT_SYS_ERROR:
1395		ibdev->ib_active = false;
1396		ibev.event = IB_EVENT_DEVICE_FATAL;
1397		mlx5_ib_handle_internal_error(ibdev);
1398		break;
1399
1400	case MLX5_DEV_EVENT_PORT_UP:
1401		ibev.event = IB_EVENT_PORT_ACTIVE;
1402		port = (u8)param;
1403		break;
1404
1405	case MLX5_DEV_EVENT_PORT_DOWN:
1406	case MLX5_DEV_EVENT_PORT_INITIALIZED:
1407		ibev.event = IB_EVENT_PORT_ERR;
1408		port = (u8)param;
1409		break;
1410
1411	case MLX5_DEV_EVENT_LID_CHANGE:
1412		ibev.event = IB_EVENT_LID_CHANGE;
1413		port = (u8)param;
1414		break;
1415
1416	case MLX5_DEV_EVENT_PKEY_CHANGE:
1417		ibev.event = IB_EVENT_PKEY_CHANGE;
1418		port = (u8)param;
1419		break;
1420
1421	case MLX5_DEV_EVENT_GUID_CHANGE:
1422		ibev.event = IB_EVENT_GID_CHANGE;
1423		port = (u8)param;
1424		break;
1425
1426	case MLX5_DEV_EVENT_CLIENT_REREG:
1427		ibev.event = IB_EVENT_CLIENT_REREGISTER;
1428		port = (u8)param;
1429		break;
1430
1431	default:
1432		break;
1433	}
1434
1435	ibev.device	      = &ibdev->ib_dev;
1436	ibev.element.port_num = port;
1437
1438	if ((event != MLX5_DEV_EVENT_SYS_ERROR) &&
1439	    (port < 1 || port > ibdev->num_ports)) {
1440		mlx5_ib_warn(ibdev, "warning: event on port %d\n", port);
1441		return;
1442	}
1443
1444	if (ibdev->ib_active)
1445		ib_dispatch_event(&ibev);
1446}
1447
1448static void get_ext_port_caps(struct mlx5_ib_dev *dev)
1449{
1450	int port;
1451
1452	for (port = 1; port <= MLX5_CAP_GEN(dev->mdev, num_ports); port++)
1453		mlx5_query_ext_port_caps(dev, port);
1454}
1455
1456static void config_atomic_responder(struct mlx5_ib_dev *dev,
1457				    struct ib_device_attr *props)
1458{
1459	enum ib_atomic_cap cap = props->atomic_cap;
1460
1461#if 0
1462	if (cap == IB_ATOMIC_HCA ||
1463	    cap == IB_ATOMIC_GLOB)
1464#endif
1465		dev->enable_atomic_resp = 1;
1466
1467	dev->atomic_cap = cap;
1468}
1469
1470enum mlx5_addr_align {
1471	MLX5_ADDR_ALIGN_0	= 0,
1472	MLX5_ADDR_ALIGN_64	= 64,
1473	MLX5_ADDR_ALIGN_128	= 128,
1474};
1475
1476static int get_port_caps(struct mlx5_ib_dev *dev)
1477{
1478	struct ib_device_attr *dprops = NULL;
1479	struct ib_port_attr *pprops = NULL;
1480	int err = -ENOMEM;
1481	int port;
1482
1483	pprops = kmalloc(sizeof(*pprops), GFP_KERNEL);
1484	if (!pprops)
1485		goto out;
1486
1487	dprops = kmalloc(sizeof(*dprops), GFP_KERNEL);
1488	if (!dprops)
1489		goto out;
1490
1491	err = mlx5_ib_query_device(&dev->ib_dev, dprops);
1492	if (err) {
1493		mlx5_ib_warn(dev, "query_device failed %d\n", err);
1494		goto out;
1495	}
1496	config_atomic_responder(dev, dprops);
1497
1498	for (port = 1; port <= MLX5_CAP_GEN(dev->mdev, num_ports); port++) {
1499		err = mlx5_ib_query_port(&dev->ib_dev, port, pprops);
1500		if (err) {
1501			mlx5_ib_warn(dev, "query_port %d failed %d\n",
1502				     port, err);
1503			break;
1504		}
1505		dev->mdev->port_caps[port - 1].pkey_table_len = dprops->max_pkeys;
1506		dev->mdev->port_caps[port - 1].gid_table_len = pprops->gid_tbl_len;
1507		mlx5_ib_dbg(dev, "pkey_table_len %d, gid_table_len %d\n",
1508			    dprops->max_pkeys, pprops->gid_tbl_len);
1509	}
1510
1511out:
1512	kfree(pprops);
1513	kfree(dprops);
1514
1515	return err;
1516}
1517
1518static void destroy_umrc_res(struct mlx5_ib_dev *dev)
1519{
1520	int err;
1521
1522	err = mlx5_mr_cache_cleanup(dev);
1523	if (err)
1524		mlx5_ib_warn(dev, "mr cache cleanup failed\n");
1525
1526	ib_dereg_mr(dev->umrc.mr);
1527	ib_dealloc_pd(dev->umrc.pd);
1528}
1529
1530enum {
1531	MAX_UMR_WR = 128,
1532};
1533
1534static int create_umr_res(struct mlx5_ib_dev *dev)
1535{
1536	struct ib_pd *pd;
1537	struct ib_mr *mr;
1538	int ret;
1539
1540	pd = ib_alloc_pd(&dev->ib_dev);
1541	if (IS_ERR(pd)) {
1542		mlx5_ib_dbg(dev, "Couldn't create PD for sync UMR QP\n");
1543		ret = PTR_ERR(pd);
1544		goto error_0;
1545	}
1546
1547	mr = ib_get_dma_mr(pd,  IB_ACCESS_LOCAL_WRITE);
1548	if (IS_ERR(mr)) {
1549		mlx5_ib_dbg(dev, "Couldn't create DMA MR for sync UMR QP\n");
1550		ret = PTR_ERR(mr);
1551		goto error_1;
1552	}
1553
1554	dev->umrc.mr = mr;
1555	dev->umrc.pd = pd;
1556
1557	ret = mlx5_mr_cache_init(dev);
1558	if (ret) {
1559		mlx5_ib_warn(dev, "mr cache init failed %d\n", ret);
1560		goto error_4;
1561	}
1562
1563	return 0;
1564
1565error_4:
1566	ib_dereg_mr(mr);
1567error_1:
1568	ib_dealloc_pd(pd);
1569error_0:
1570	return ret;
1571}
1572
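/*
 * Create the device resources (PD, CQ, XRC domains and SRQs) that the
 * driver keeps around for internal use, e.g. for XRC support.
 */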
1573static int create_dev_resources(struct mlx5_ib_resources *devr)
1574{
1575	struct ib_srq_init_attr attr;
1576	struct mlx5_ib_dev *dev;
1577	int ret = 0;
1578
1579	dev = container_of(devr, struct mlx5_ib_dev, devr);
1580
1581	devr->p0 = mlx5_ib_alloc_pd(&dev->ib_dev, NULL, NULL);
1582	if (IS_ERR(devr->p0)) {
1583		ret = PTR_ERR(devr->p0);
1584		goto error0;
1585	}
1586	devr->p0->device  = &dev->ib_dev;
1587	devr->p0->uobject = NULL;
1588	atomic_set(&devr->p0->usecnt, 0);
1589
1590	devr->c0 = mlx5_ib_create_cq(&dev->ib_dev, 1, 0, NULL, NULL);
1591	if (IS_ERR(devr->c0)) {
1592		ret = PTR_ERR(devr->c0);
1593		goto error1;
1594	}
1595	devr->c0->device        = &dev->ib_dev;
1596	devr->c0->uobject       = NULL;
1597	devr->c0->comp_handler  = NULL;
1598	devr->c0->event_handler = NULL;
1599	devr->c0->cq_context    = NULL;
1600	atomic_set(&devr->c0->usecnt, 0);
1601
1602	devr->x0 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL, NULL);
1603	if (IS_ERR(devr->x0)) {
1604		ret = PTR_ERR(devr->x0);
1605		goto error2;
1606	}
1607	devr->x0->device = &dev->ib_dev;
1608	devr->x0->inode = NULL;
1609	atomic_set(&devr->x0->usecnt, 0);
1610	mutex_init(&devr->x0->tgt_qp_mutex);
1611	INIT_LIST_HEAD(&devr->x0->tgt_qp_list);
1612
1613	devr->x1 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL, NULL);
1614	if (IS_ERR(devr->x1)) {
1615		ret = PTR_ERR(devr->x1);
1616		goto error3;
1617	}
1618	devr->x1->device = &dev->ib_dev;
1619	devr->x1->inode = NULL;
1620	atomic_set(&devr->x1->usecnt, 0);
1621	mutex_init(&devr->x1->tgt_qp_mutex);
1622	INIT_LIST_HEAD(&devr->x1->tgt_qp_list);
1623
1624	memset(&attr, 0, sizeof(attr));
1625	attr.attr.max_sge = 1;
1626	attr.attr.max_wr = 1;
1627	attr.srq_type = IB_SRQT_XRC;
1628	attr.ext.xrc.cq = devr->c0;
1629	attr.ext.xrc.xrcd = devr->x0;
1630
1631	devr->s0 = mlx5_ib_create_srq(devr->p0, &attr, NULL);
1632	if (IS_ERR(devr->s0)) {
1633		ret = PTR_ERR(devr->s0);
1634		goto error4;
1635	}
1636	devr->s0->device	= &dev->ib_dev;
1637	devr->s0->pd		= devr->p0;
1638	devr->s0->uobject       = NULL;
1639	devr->s0->event_handler = NULL;
1640	devr->s0->srq_context   = NULL;
1641	devr->s0->srq_type      = IB_SRQT_XRC;
1642	devr->s0->ext.xrc.xrcd  = devr->x0;
1643	devr->s0->ext.xrc.cq	= devr->c0;
1644	atomic_inc(&devr->s0->ext.xrc.xrcd->usecnt);
1645	atomic_inc(&devr->s0->ext.xrc.cq->usecnt);
1646	atomic_inc(&devr->p0->usecnt);
1647	atomic_set(&devr->s0->usecnt, 0);
1648
1649	memset(&attr, 0, sizeof(attr));
1650	attr.attr.max_sge = 1;
1651	attr.attr.max_wr = 1;
1652	attr.srq_type = IB_SRQT_BASIC;
1653	devr->s1 = mlx5_ib_create_srq(devr->p0, &attr, NULL);
1654	if (IS_ERR(devr->s1)) {
1655		ret = PTR_ERR(devr->s1);
1656		goto error5;
1657	}
1658	devr->s1->device	= &dev->ib_dev;
1659	devr->s1->pd		= devr->p0;
1660	devr->s1->uobject       = NULL;
1661	devr->s1->event_handler = NULL;
1662	devr->s1->srq_context   = NULL;
1663	devr->s1->srq_type      = IB_SRQT_BASIC;
1664	devr->s1->ext.xrc.cq	= devr->c0;
1665	atomic_inc(&devr->p0->usecnt);
1666	atomic_set(&devr->s1->usecnt, 0);
1667
1668	return 0;
1669
1670error5:
1671	mlx5_ib_destroy_srq(devr->s0);
1672error4:
1673	mlx5_ib_dealloc_xrcd(devr->x1);
1674error3:
1675	mlx5_ib_dealloc_xrcd(devr->x0);
1676error2:
1677	mlx5_ib_destroy_cq(devr->c0);
1678error1:
1679	mlx5_ib_dealloc_pd(devr->p0);
1680error0:
1681	return ret;
1682}
1683
1684static void destroy_dev_resources(struct mlx5_ib_resources *devr)
1685{
1686	mlx5_ib_destroy_srq(devr->s1);
1687	mlx5_ib_destroy_srq(devr->s0);
1688	mlx5_ib_dealloc_xrcd(devr->x0);
1689	mlx5_ib_dealloc_xrcd(devr->x1);
1690	mlx5_ib_destroy_cq(devr->c0);
1691	mlx5_ib_dealloc_pd(devr->p0);
1692}
1693
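/*
 * Allocate and DMA-map a per-port trace buffer and point the firmware
 * DC CNAK tracer at it.
 */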
1694static void enable_dc_tracer(struct mlx5_ib_dev *dev)
1695{
1696	struct device *device = dev->ib_dev.dma_device;
1697	struct mlx5_dc_tracer *dct = &dev->dctr;
1698	int order;
1699	void *tmp;
1700	int size;
1701	int err;
1702
1703	size = MLX5_CAP_GEN(dev->mdev, num_ports) * 4096;
1704	if (size <= PAGE_SIZE)
1705		order = 0;
1706	else
1707		order = 1;
1708
1709	dct->pg = alloc_pages(GFP_KERNEL, order);
1710	if (!dct->pg) {
1711		mlx5_ib_err(dev, "failed to allocate pages (order %d)\n", order);
1712		return;
1713	}
1714
1715	tmp = page_address(dct->pg);
1716	memset(tmp, 0xff, size);
1717
1718	dct->size = size;
1719	dct->order = order;
1720	dct->dma = dma_map_page(device, dct->pg, 0, size, DMA_FROM_DEVICE);
1721	if (dma_mapping_error(device, dct->dma)) {
1722		mlx5_ib_err(dev, "dma mapping error\n");
1723		goto map_err;
1724	}
1725
1726	err = mlx5_core_set_dc_cnak_trace(dev->mdev, 1, dct->dma);
1727	if (err) {
1728		mlx5_ib_warn(dev, "failed to enable DC tracer\n");
1729		goto cmd_err;
1730	}
1731
1732	return;
1733
1734cmd_err:
1735	dma_unmap_page(device, dct->dma, size, DMA_FROM_DEVICE);
1736map_err:
1737	__free_pages(dct->pg, dct->order);
1738	dct->pg = NULL;
1739}
1740
1741static void disable_dc_tracer(struct mlx5_ib_dev *dev)
1742{
1743	struct device *device = dev->ib_dev.dma_device;
1744	struct mlx5_dc_tracer *dct = &dev->dctr;
1745	int err;
1746
1747	if (!dct->pg)
1748		return;
1749
1750	err = mlx5_core_set_dc_cnak_trace(dev->mdev, 0, dct->dma);
1751	if (err) {
1752		mlx5_ib_warn(dev, "failed to disable DC tracer\n");
1753		return;
1754	}
1755
1756	dma_unmap_page(device, dct->dma, dct->size, DMA_FROM_DEVICE);
1757	__free_pages(dct->pg, dct->order);
1758	dct->pg = NULL;
1759}
1760
1761enum {
1762	MLX5_DC_CNAK_SIZE		= 128,
1763	MLX5_NUM_BUF_IN_PAGE		= PAGE_SIZE / MLX5_DC_CNAK_SIZE,
1764	MLX5_CNAK_TX_CQ_SIGNAL_FACTOR	= 128,
1765	MLX5_DC_CNAK_SL			= 0,
1766	MLX5_DC_CNAK_VL			= 0,
1767};
1768
1769static int init_dc_improvements(struct mlx5_ib_dev *dev)
1770{
1771	if (!mlx5_core_is_pf(dev->mdev))
1772		return 0;
1773
1774	if (!(MLX5_CAP_GEN(dev->mdev, dc_cnak_trace)))
1775		return 0;
1776
1777	enable_dc_tracer(dev);
1778
1779	return 0;
1780}
1781
1782static void cleanup_dc_improvements(struct mlx5_ib_dev *dev)
1783{
1784
1785	disable_dc_tracer(dev);
1786}
1787
1788static void mlx5_ib_dealloc_q_port_counter(struct mlx5_ib_dev *dev, u8 port_num)
1789{
1790	mlx5_vport_dealloc_q_counter(dev->mdev,
1791				     MLX5_INTERFACE_PROTOCOL_IB,
1792				     dev->port[port_num].q_cnt_id);
1793	dev->port[port_num].q_cnt_id = 0;
1794}
1795
1796static void mlx5_ib_dealloc_q_counters(struct mlx5_ib_dev *dev)
1797{
1798	unsigned int i;
1799
1800	for (i = 0; i < dev->num_ports; i++)
1801		mlx5_ib_dealloc_q_port_counter(dev, i);
1802}
1803
1804static int mlx5_ib_alloc_q_counters(struct mlx5_ib_dev *dev)
1805{
1806	int i;
1807	int ret;
1808
1809	for (i = 0; i < dev->num_ports; i++) {
1810		ret = mlx5_vport_alloc_q_counter(dev->mdev,
1811						 MLX5_INTERFACE_PROTOCOL_IB,
1812						 &dev->port[i].q_cnt_id);
1813		if (ret) {
1814			mlx5_ib_warn(dev,
1815				     "couldn't allocate queue counter for port %d\n",
1816				     i + 1);
1817			goto dealloc_counters;
1818		}
1819	}
1820
1821	return 0;
1822
1823dealloc_counters:
1824	while (--i >= 0)
1825		mlx5_ib_dealloc_q_port_counter(dev, i);
1826
1827	return ret;
1828}
1829
1830struct port_attribute {
1831	struct attribute attr;
1832	ssize_t (*show)(struct mlx5_ib_port *,
1833			struct port_attribute *, char *buf);
1834	ssize_t (*store)(struct mlx5_ib_port *,
1835			 struct port_attribute *,
1836			 const char *buf, size_t count);
1837};
1838
1839struct port_counter_attribute {
1840	struct port_attribute	attr;
1841	size_t			offset;
1842};
1843
1844static ssize_t port_attr_show(struct kobject *kobj,
1845			      struct attribute *attr, char *buf)
1846{
1847	struct port_attribute *port_attr =
1848		container_of(attr, struct port_attribute, attr);
1849	struct mlx5_ib_port_sysfs_group *p =
1850		container_of(kobj, struct mlx5_ib_port_sysfs_group,
1851			     kobj);
1852	struct mlx5_ib_port *mibport = container_of(p, struct mlx5_ib_port,
1853						    group);
1854
1855	if (!port_attr->show)
1856		return -EIO;
1857
1858	return port_attr->show(mibport, port_attr, buf);
1859}
1860
1861static ssize_t show_port_counter(struct mlx5_ib_port *p,
1862				 struct port_attribute *port_attr,
1863				 char *buf)
1864{
1865	int outlen = MLX5_ST_SZ_BYTES(query_q_counter_out);
1866	struct port_counter_attribute *counter_attr =
1867		container_of(port_attr, struct port_counter_attribute, attr);
1868	void *out;
1869	int ret;
1870
1871	out = mlx5_vzalloc(outlen);
1872	if (!out)
1873		return -ENOMEM;
1874
1875	ret = mlx5_vport_query_q_counter(p->dev->mdev,
1876					 p->q_cnt_id, 0,
1877					 out, outlen);
1878	if (ret)
1879		goto free;
1880
1881	ret = sprintf(buf, "%d\n",
1882		      be32_to_cpu(*(__be32 *)(out + counter_attr->offset)));
1883
1884free:
1885	kfree(out);
1886	return ret;
1887}
1888
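/*
 * Per-port queue counters exposed through sysfs; each attribute reads
 * one 32-bit field from the QUERY_Q_COUNTER command output.
 */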
1889#define PORT_COUNTER_ATTR(_name)					\
1890struct port_counter_attribute port_counter_attr_##_name = {		\
1891	.attr  = __ATTR(_name, S_IRUGO, show_port_counter, NULL),	\
1892	.offset = MLX5_BYTE_OFF(query_q_counter_out, _name)		\
1893}
1894
1895static PORT_COUNTER_ATTR(rx_write_requests);
1896static PORT_COUNTER_ATTR(rx_read_requests);
1897static PORT_COUNTER_ATTR(rx_atomic_requests);
1898static PORT_COUNTER_ATTR(rx_dct_connect);
1899static PORT_COUNTER_ATTR(out_of_buffer);
1900static PORT_COUNTER_ATTR(out_of_sequence);
1901static PORT_COUNTER_ATTR(duplicate_request);
1902static PORT_COUNTER_ATTR(rnr_nak_retry_err);
1903static PORT_COUNTER_ATTR(packet_seq_err);
1904static PORT_COUNTER_ATTR(implied_nak_seq_err);
1905static PORT_COUNTER_ATTR(local_ack_timeout_err);
1906
1907static struct attribute *counter_attrs[] = {
1908	&port_counter_attr_rx_write_requests.attr.attr,
1909	&port_counter_attr_rx_read_requests.attr.attr,
1910	&port_counter_attr_rx_atomic_requests.attr.attr,
1911	&port_counter_attr_rx_dct_connect.attr.attr,
1912	&port_counter_attr_out_of_buffer.attr.attr,
1913	&port_counter_attr_out_of_sequence.attr.attr,
1914	&port_counter_attr_duplicate_request.attr.attr,
1915	&port_counter_attr_rnr_nak_retry_err.attr.attr,
1916	&port_counter_attr_packet_seq_err.attr.attr,
1917	&port_counter_attr_implied_nak_seq_err.attr.attr,
1918	&port_counter_attr_local_ack_timeout_err.attr.attr,
1919	NULL
1920};
1921
1922static struct attribute_group port_counters_group = {
1923	.name  = "counters",
1924	.attrs  = counter_attrs
1925};
1926
1927static const struct sysfs_ops port_sysfs_ops = {
1928	.show = port_attr_show
1929};
1930
1931static struct kobj_type port_type = {
1932	.sysfs_ops     = &port_sysfs_ops,
1933};
1934
1935static int add_port_attrs(struct mlx5_ib_dev *dev,
1936			  struct kobject *parent,
1937			  struct mlx5_ib_port_sysfs_group *port,
1938			  u8 port_num)
1939{
1940	int ret;
1941
1942	ret = kobject_init_and_add(&port->kobj, &port_type,
1943				   parent,
1944				   "%d", port_num);
1945	if (ret)
1946		return ret;
1947
1948	if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt) &&
1949	    MLX5_CAP_GEN(dev->mdev, retransmission_q_counters)) {
1950		ret = sysfs_create_group(&port->kobj, &port_counters_group);
1951		if (ret)
1952			goto put_kobj;
1953	}
1954
1955	port->enabled = true;
1956	return ret;
1957
1958put_kobj:
1959	kobject_put(&port->kobj);
1960	return ret;
1961}
1962
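/*
 * Undo add_port_attrs() for the first num_ports ports, then drop the
 * "mlx5_ports" parent kobject.
 */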
1963static void destroy_ports_attrs(struct mlx5_ib_dev *dev,
1964				unsigned int num_ports)
1965{
1966	unsigned int i;
1967
1968	for (i = 0; i < num_ports; i++) {
1969		struct mlx5_ib_port_sysfs_group *port =
1970			&dev->port[i].group;
1971
1972		if (!port->enabled)
1973			continue;
1974
1975		if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt) &&
1976		    MLX5_CAP_GEN(dev->mdev, retransmission_q_counters))
1977			sysfs_remove_group(&port->kobj,
1978					   &port_counters_group);
1979		kobject_put(&port->kobj);
1980		port->enabled = false;
1981	}
1982
1983	if (dev->ports_parent) {
1984		kobject_put(dev->ports_parent);
1985		dev->ports_parent = NULL;
1986	}
1987}
1988
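/*
 * Create the "mlx5_ports" directory under the ib_dev sysfs node and add
 * one numbered sub-directory per port, unwinding on failure.
 */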
1989static int create_port_attrs(struct mlx5_ib_dev *dev)
1990{
1991	int ret = 0;
1992	unsigned int i = 0;
1993	struct device *device = &dev->ib_dev.dev;
1994
1995	dev->ports_parent = kobject_create_and_add("mlx5_ports",
1996						   &device->kobj);
1997	if (!dev->ports_parent)
1998		return -ENOMEM;
1999
2000	for (i = 0; i < dev->num_ports; i++) {
2001		ret = add_port_attrs(dev,
2002				     dev->ports_parent,
2003				     &dev->port[i].group,
2004				     i + 1);
2005
2006		if (ret)
2007			goto _destroy_ports_attrs;
2008	}
2009
2010	return 0;
2011
2012_destroy_ports_attrs:
2013	destroy_ports_attrs(dev, i);
2014	return ret;
2015}
2016
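/*
 * mlx5_core "add" callback: allocate and populate the ib_device, wire up
 * the verbs entry points and register the device with the IB core.
 * Returns the new mlx5_ib_dev, or NULL on failure after unwinding in
 * roughly reverse order of initialization.
 */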
2017static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
2018{
2019	struct mlx5_ib_dev *dev;
2020	int err;
2021	int i;
2022
2023	printk_once(KERN_INFO "%s", mlx5_version);
2024
2025	dev = (struct mlx5_ib_dev *)ib_alloc_device(sizeof(*dev));
2026	if (!dev)
2027		return NULL;
2028
2029	dev->mdev = mdev;
2030
2031	dev->port = kcalloc(MLX5_CAP_GEN(mdev, num_ports), sizeof(*dev->port),
2032			     GFP_KERNEL);
2033	if (!dev->port)
2034		goto err_dealloc;
2035
2036	for (i = 0; i < MLX5_CAP_GEN(mdev, num_ports); i++) {
2037		dev->port[i].dev = dev;
2038		dev->port[i].port_num = i;
2039		dev->port[i].port_gone = 0;
2040		memset(dev->port[i].gid_table, 0, sizeof(dev->port[i].gid_table));
2041	}
2042
2043	err = get_port_caps(dev);
2044	if (err)
2045		goto err_free_port;
2046
2047	if (mlx5_use_mad_ifc(dev))
2048		get_ext_port_caps(dev);
2049
2050	if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) ==
2051	    IB_LINK_LAYER_ETHERNET) {
2052		if (MLX5_CAP_GEN(mdev, roce)) {
2053			err = mlx5_nic_vport_enable_roce(mdev);
2054			if (err)
2055				goto err_free_port;
2056		} else {
2057			goto err_free_port;
2058		}
2059	}
2060
2061	MLX5_INIT_DOORBELL_LOCK(&dev->uar_lock);
2062
2063	strlcpy(dev->ib_dev.name, "mlx5_%d", IB_DEVICE_NAME_MAX);
2064	dev->ib_dev.owner		= THIS_MODULE;
2065	dev->ib_dev.node_type		= RDMA_NODE_IB_CA;
2066	dev->ib_dev.local_dma_lkey	= mdev->special_contexts.resd_lkey;
2067	dev->num_ports		= MLX5_CAP_GEN(mdev, num_ports);
2068	dev->ib_dev.phys_port_cnt     = dev->num_ports;
2069	dev->ib_dev.num_comp_vectors    =
2070		dev->mdev->priv.eq_table.num_comp_vectors;
2071	dev->ib_dev.dma_device	= &mdev->pdev->dev;
2072
2073	dev->ib_dev.uverbs_abi_ver	= MLX5_IB_UVERBS_ABI_VERSION;
2074	dev->ib_dev.uverbs_cmd_mask	=
2075		(1ull << IB_USER_VERBS_CMD_GET_CONTEXT)		|
2076		(1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)	|
2077		(1ull << IB_USER_VERBS_CMD_QUERY_PORT)		|
2078		(1ull << IB_USER_VERBS_CMD_ALLOC_PD)		|
2079		(1ull << IB_USER_VERBS_CMD_DEALLOC_PD)		|
2080		(1ull << IB_USER_VERBS_CMD_REG_MR)		|
2081		(1ull << IB_USER_VERBS_CMD_DEREG_MR)		|
2082		(1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL)	|
2083		(1ull << IB_USER_VERBS_CMD_CREATE_CQ)		|
2084		(1ull << IB_USER_VERBS_CMD_RESIZE_CQ)		|
2085		(1ull << IB_USER_VERBS_CMD_DESTROY_CQ)		|
2086		(1ull << IB_USER_VERBS_CMD_CREATE_QP)		|
2087		(1ull << IB_USER_VERBS_CMD_MODIFY_QP)		|
2088		(1ull << IB_USER_VERBS_CMD_QUERY_QP)		|
2089		(1ull << IB_USER_VERBS_CMD_DESTROY_QP)		|
2090		(1ull << IB_USER_VERBS_CMD_ATTACH_MCAST)	|
2091		(1ull << IB_USER_VERBS_CMD_DETACH_MCAST)	|
2092		(1ull << IB_USER_VERBS_CMD_CREATE_SRQ)		|
2093		(1ull << IB_USER_VERBS_CMD_MODIFY_SRQ)		|
2094		(1ull << IB_USER_VERBS_CMD_QUERY_SRQ)		|
2095		(1ull << IB_USER_VERBS_CMD_DESTROY_SRQ)		|
2096		(1ull << IB_USER_VERBS_CMD_CREATE_XSRQ)		|
2097		(1ull << IB_USER_VERBS_CMD_OPEN_QP);
2098
2099	dev->ib_dev.query_device	= mlx5_ib_query_device;
2100	dev->ib_dev.query_port		= mlx5_ib_query_port;
2101	dev->ib_dev.get_link_layer	= mlx5_ib_port_link_layer;
2102	dev->ib_dev.query_gid		= mlx5_ib_query_gid;
2103	dev->ib_dev.query_pkey		= mlx5_ib_query_pkey;
2104	dev->ib_dev.modify_device	= mlx5_ib_modify_device;
2105	dev->ib_dev.modify_port		= mlx5_ib_modify_port;
2106	dev->ib_dev.alloc_ucontext	= mlx5_ib_alloc_ucontext;
2107	dev->ib_dev.dealloc_ucontext	= mlx5_ib_dealloc_ucontext;
2108	dev->ib_dev.mmap		= mlx5_ib_mmap;
2109	dev->ib_dev.alloc_pd		= mlx5_ib_alloc_pd;
2110	dev->ib_dev.dealloc_pd		= mlx5_ib_dealloc_pd;
2111	dev->ib_dev.create_ah		= mlx5_ib_create_ah;
2112	dev->ib_dev.query_ah		= mlx5_ib_query_ah;
2113	dev->ib_dev.destroy_ah		= mlx5_ib_destroy_ah;
2114	dev->ib_dev.create_srq		= mlx5_ib_create_srq;
2115	dev->ib_dev.modify_srq		= mlx5_ib_modify_srq;
2116	dev->ib_dev.query_srq		= mlx5_ib_query_srq;
2117	dev->ib_dev.destroy_srq		= mlx5_ib_destroy_srq;
2118	dev->ib_dev.post_srq_recv	= mlx5_ib_post_srq_recv;
2119	dev->ib_dev.create_qp		= mlx5_ib_create_qp;
2120	dev->ib_dev.modify_qp		= mlx5_ib_modify_qp;
2121	dev->ib_dev.query_qp		= mlx5_ib_query_qp;
2122	dev->ib_dev.destroy_qp		= mlx5_ib_destroy_qp;
2123	dev->ib_dev.post_send		= mlx5_ib_post_send;
2124	dev->ib_dev.post_recv		= mlx5_ib_post_recv;
2125	dev->ib_dev.create_cq		= mlx5_ib_create_cq;
2126	dev->ib_dev.modify_cq		= mlx5_ib_modify_cq;
2127	dev->ib_dev.resize_cq		= mlx5_ib_resize_cq;
2128	dev->ib_dev.destroy_cq		= mlx5_ib_destroy_cq;
2129	dev->ib_dev.poll_cq		= mlx5_ib_poll_cq;
2130	dev->ib_dev.req_notify_cq	= mlx5_ib_arm_cq;
2131	dev->ib_dev.get_dma_mr		= mlx5_ib_get_dma_mr;
2132	dev->ib_dev.reg_user_mr		= mlx5_ib_reg_user_mr;
2133	dev->ib_dev.reg_phys_mr		= mlx5_ib_reg_phys_mr;
2134	dev->ib_dev.dereg_mr		= mlx5_ib_dereg_mr;
2135	dev->ib_dev.attach_mcast	= mlx5_ib_mcg_attach;
2136	dev->ib_dev.detach_mcast	= mlx5_ib_mcg_detach;
2137	dev->ib_dev.process_mad		= mlx5_ib_process_mad;
2138	dev->ib_dev.alloc_fast_reg_mr	= mlx5_ib_alloc_fast_reg_mr;
2139	dev->ib_dev.alloc_fast_reg_page_list = mlx5_ib_alloc_fast_reg_page_list;
2140	dev->ib_dev.free_fast_reg_page_list  = mlx5_ib_free_fast_reg_page_list;
2141
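	/* Expose the XRCD verbs only when the firmware reports XRC support. */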
2142	if (MLX5_CAP_GEN(mdev, xrc)) {
2143		dev->ib_dev.alloc_xrcd = mlx5_ib_alloc_xrcd;
2144		dev->ib_dev.dealloc_xrcd = mlx5_ib_dealloc_xrcd;
2145		dev->ib_dev.uverbs_cmd_mask |=
2146			(1ull << IB_USER_VERBS_CMD_OPEN_XRCD) |
2147			(1ull << IB_USER_VERBS_CMD_CLOSE_XRCD);
2148	}
2149
2150	err = init_node_data(dev);
2151	if (err)
2152		goto err_disable_roce;
2153
2154	mutex_init(&dev->cap_mask_mutex);
2155	INIT_LIST_HEAD(&dev->qp_list);
2156	spin_lock_init(&dev->reset_flow_resource_lock);
2157
2158	err = create_dev_resources(&dev->devr);
2159	if (err)
2160		goto err_disable_roce;
2161
2163	err = mlx5_ib_alloc_q_counters(dev);
2164	if (err)
2165		goto err_odp;
2166
2167	err = ib_register_device(&dev->ib_dev, NULL);
2168	if (err)
2169		goto err_q_cnt;
2170
2171	err = create_umr_res(dev);
2172	if (err)
2173		goto err_dev;
2174
2175	if (MLX5_CAP_GEN(dev->mdev, port_type) ==
2176	    MLX5_CAP_PORT_TYPE_IB) {
2177		if (init_dc_improvements(dev))
2178			mlx5_ib_dbg(dev, "init_dc_improvements - continuing\n");
2179	}
2180
2181	err = create_port_attrs(dev);
2182	if (err)
2183		goto err_dc;
2184
2185	for (i = 0; i < ARRAY_SIZE(mlx5_class_attributes); i++) {
2186		err = device_create_file(&dev->ib_dev.dev,
2187					 mlx5_class_attributes[i]);
2188		if (err)
2189			goto err_port_attrs;
2190	}
2191
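	/*
	 * Start one kernel thread per port running mlx5_ib_roce_port_update().
	 * mlx5_ib_remove() signals these threads through the port_gone flag
	 * and waits for them to acknowledge before tearing the device down.
	 */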
2192	if (1) {
2193		struct thread *rl_thread = NULL;
2194		struct proc *rl_proc = NULL;
2195
2196		for (i = 0; i < MLX5_CAP_GEN(mdev, num_ports); i++) {
2197			(void) kproc_kthread_add(mlx5_ib_roce_port_update, dev->port + i, &rl_proc, &rl_thread,
2198			    RFHIGHPID, 0, "mlx5-ib-roce-port", "mlx5-ib-roce_port-%d", i);
2199		}
2200	}
2201
2202	dev->ib_active = true;
2203
2204	return dev;
2205
2206err_port_attrs:
2207	destroy_ports_attrs(dev, dev->num_ports);
2208
2209err_dc:
2210	if (MLX5_CAP_GEN(dev->mdev, port_type) ==
2211	    MLX5_CAP_PORT_TYPE_IB)
2212		cleanup_dc_improvements(dev);
2213	destroy_umrc_res(dev);
2214
2215err_dev:
2216	ib_unregister_device(&dev->ib_dev);
2217
2218err_q_cnt:
2219	mlx5_ib_dealloc_q_counters(dev);
2220
2221err_odp:
2222	destroy_dev_resources(&dev->devr);
2223
2224err_disable_roce:
2225	if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) ==
2226	    IB_LINK_LAYER_ETHERNET && MLX5_CAP_GEN(mdev, roce))
2227		mlx5_nic_vport_disable_roce(mdev);
2228err_free_port:
2229	kfree(dev->port);
2230
2231err_dealloc:
2232	ib_dealloc_device((struct ib_device *)dev);
2233
2234	return NULL;
2235}
2236
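/*
 * mlx5_core "remove" callback: stop the per-port RoCE update threads via
 * the port_gone handshake, then release everything mlx5_ib_add() set up.
 */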
2237static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context)
2238{
2239	struct mlx5_ib_dev *dev = context;
2240	int i;
2241
2242	for (i = 0; i < MLX5_CAP_GEN(mdev, num_ports); i++) {
2243		dev->port[i].port_gone = 1;
2244		while (dev->port[i].port_gone != 2)
2245			pause("W", hz);
2246	}
2247
2248	for (i = 0; i < ARRAY_SIZE(mlx5_class_attributes); i++) {
2249		device_remove_file(&dev->ib_dev.dev,
2250		    mlx5_class_attributes[i]);
2251	}
2252
2253	destroy_ports_attrs(dev, dev->num_ports);
2254	if (MLX5_CAP_GEN(dev->mdev, port_type) ==
2255	    MLX5_CAP_PORT_TYPE_IB)
2256		cleanup_dc_improvements(dev);
2257	mlx5_ib_dealloc_q_counters(dev);
2258	ib_unregister_device(&dev->ib_dev);
2259	destroy_umrc_res(dev);
2260	destroy_dev_resources(&dev->devr);
2261
2262	if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) ==
2263	    IB_LINK_LAYER_ETHERNET && MLX5_CAP_GEN(mdev, roce))
2264		mlx5_nic_vport_disable_roce(mdev);
2265
2266	kfree(dev->port);
2267	ib_dealloc_device(&dev->ib_dev);
2268}
2269
2270static struct mlx5_interface mlx5_ib_interface = {
2271	.add            = mlx5_ib_add,
2272	.remove         = mlx5_ib_remove,
2273	.event          = mlx5_ib_event,
2274	.protocol	= MLX5_INTERFACE_PROTOCOL_IB,
2275};
2276
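/*
 * Module load: register the IB interface with mlx5_core and create the
 * driver-wide work queue.
 */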
2277static int __init mlx5_ib_init(void)
2278{
2279	int err;
2280
2281	if (deprecated_prof_sel != 2)
2282		printf("mlx5_ib: WARN: prof_sel is deprecated for mlx5_ib, set it for mlx5_core\n");
2283
2284	err = mlx5_register_interface(&mlx5_ib_interface);
2285	if (err)
2286		goto clean_odp;
2287
2288	mlx5_ib_wq = create_singlethread_workqueue("mlx5_ib_wq");
2289	if (!mlx5_ib_wq) {
2290		printf("mlx5_ib: ERR: %s: failed to create mlx5_ib_wq\n", __func__);
		err = -ENOMEM;	/* fail the load instead of returning success with a NULL wq */
2291		goto err_unreg;
2292	}
2293
2294	return err;
2295
2296err_unreg:
2297	mlx5_unregister_interface(&mlx5_ib_interface);
2298
2299clean_odp:
2300	return err;
2301}
2302
2303static void __exit mlx5_ib_cleanup(void)
2304{
2305	destroy_workqueue(mlx5_ib_wq);
2306	mlx5_unregister_interface(&mlx5_ib_interface);
2307}
2308
2309module_init_order(mlx5_ib_init, SI_ORDER_THIRD);
2310module_exit_order(mlx5_ib_cleanup, SI_ORDER_THIRD);
2311