1/*-
2 * Copyright (c) 2013-2021, Mellanox Technologies, Ltd.  All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 *    notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 *    notice, this list of conditions and the following disclaimer in the
11 *    documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16 * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23 * SUCH DAMAGE.
24 */
25
26#include "opt_rss.h"
27#include "opt_ratelimit.h"
28
29#include <linux/module.h>
30#include <linux/errno.h>
31#include <linux/pci.h>
32#include <linux/dma-mapping.h>
33#include <linux/slab.h>
34#if defined(CONFIG_X86)
35#include <asm/pat.h>
36#endif
37#include <linux/sched.h>
38#include <linux/delay.h>
39#include <linux/fs.h>
40#undef inode
41#include <rdma/ib_user_verbs.h>
42#include <rdma/ib_addr.h>
43#include <rdma/ib_cache.h>
44#include <dev/mlx5/port.h>
45#include <dev/mlx5/vport.h>
46#include <linux/list.h>
47#include <rdma/ib_smi.h>
48#include <rdma/ib_umem.h>
49#include <rdma/uverbs_ioctl.h>
50#include <linux/in.h>
51#include <linux/etherdevice.h>
52#include <dev/mlx5/fs.h>
53#include <dev/mlx5/mlx5_ib/mlx5_ib.h>
54
55MODULE_DESCRIPTION("Mellanox Connect-IB HCA IB driver");
56MODULE_LICENSE("Dual BSD/GPL");
57MODULE_DEPEND(mlx5ib, linuxkpi, 1, 1, 1);
58MODULE_DEPEND(mlx5ib, mlx5, 1, 1, 1);
59MODULE_DEPEND(mlx5ib, ibcore, 1, 1, 1);
60MODULE_VERSION(mlx5ib, 1);
61
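/*
 * atomic_size_qp in the atomic caps appears to be a bitmask of supported
 * operand sizes; the 1 << 3 bit below stands for 8-byte operands, the only
 * size checked in get_atomic_caps().
 */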
62enum {
63	MLX5_ATOMIC_SIZE_QP_8BYTES = 1 << 3,
64};
65
66static enum rdma_link_layer
67mlx5_port_type_cap_to_rdma_ll(int port_type_cap)
68{
69	switch (port_type_cap) {
70	case MLX5_CAP_PORT_TYPE_IB:
71		return IB_LINK_LAYER_INFINIBAND;
72	case MLX5_CAP_PORT_TYPE_ETH:
73		return IB_LINK_LAYER_ETHERNET;
74	default:
75		return IB_LINK_LAYER_UNSPECIFIED;
76	}
77}
78
79static enum rdma_link_layer
80mlx5_ib_port_link_layer(struct ib_device *device, u8 port_num)
81{
82	struct mlx5_ib_dev *dev = to_mdev(device);
83	int port_type_cap = MLX5_CAP_GEN(dev->mdev, port_type);
84
85	return mlx5_port_type_cap_to_rdma_ll(port_type_cap);
86}
87
88static bool mlx5_netdev_match(if_t ndev,
89			      struct mlx5_core_dev *mdev,
90			      const char *dname)
91{
92	return if_gettype(ndev) == IFT_ETHER &&
93	  if_getdname(ndev) != NULL &&
94	  strcmp(if_getdname(ndev), dname) == 0 &&
95	  if_getsoftc(ndev) != NULL &&
96	  *(struct mlx5_core_dev **)if_getsoftc(ndev) == mdev;
97}
98
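/*
 * Netdev notifier: track the mlx5en ifnet ("mce") that belongs to this
 * core device across register/unregister, and translate its link up/down
 * transitions into IB_EVENT_PORT_ACTIVE/IB_EVENT_PORT_ERR on port 1 while
 * the IB device is active.
 */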
99static int mlx5_netdev_event(struct notifier_block *this,
100			     unsigned long event, void *ptr)
101{
102	if_t ndev = netdev_notifier_info_to_ifp(ptr);
103	struct mlx5_ib_dev *ibdev = container_of(this, struct mlx5_ib_dev,
104						 roce.nb);
105
106	switch (event) {
107	case NETDEV_REGISTER:
108	case NETDEV_UNREGISTER:
109		write_lock(&ibdev->roce.netdev_lock);
110		/* check if network interface belongs to mlx5en */
111		if (mlx5_netdev_match(ndev, ibdev->mdev, "mce"))
112			ibdev->roce.netdev = (event == NETDEV_UNREGISTER) ?
113					     NULL : ndev;
114		write_unlock(&ibdev->roce.netdev_lock);
115		break;
116
117	case NETDEV_UP:
118	case NETDEV_DOWN: {
119		if_t upper = NULL;
120
121		if ((upper == ndev || (!upper && ndev == ibdev->roce.netdev))
122		    && ibdev->ib_active) {
123			struct ib_event ibev = {0};
124
125			ibev.device = &ibdev->ib_dev;
126			ibev.event = (event == NETDEV_UP) ?
127				     IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR;
128			ibev.element.port_num = 1;
129			ib_dispatch_event(&ibev);
130		}
131		break;
132	}
133
134	default:
135		break;
136	}
137
138	return NOTIFY_DONE;
139}
140
141static if_t mlx5_ib_get_netdev(struct ib_device *device,
142					     u8 port_num)
143{
144	struct mlx5_ib_dev *ibdev = to_mdev(device);
145	if_t ndev;
146
147	/* Ensure ndev does not disappear before we invoke if_ref(). */
149	read_lock(&ibdev->roce.netdev_lock);
150	ndev = ibdev->roce.netdev;
151	if (ndev)
152		if_ref(ndev);
153	read_unlock(&ibdev->roce.netdev_lock);
154
155	return ndev;
156}
157
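/*
 * Map the PTYS eth_proto_oper bit reported by firmware to an equivalent
 * (IB width, IB speed) pair, e.g. 100GBASE_CR4 is reported as 4X lanes of
 * EDR (4 x 25 Gb/s).  Unknown protocols fall back to 4X QDR and -EINVAL.
 */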
158static int translate_eth_proto_oper(u32 eth_proto_oper, u8 *active_speed,
159				    u8 *active_width)
160{
161	switch (eth_proto_oper) {
162	case MLX5E_PROT_MASK(MLX5E_1000BASE_CX_SGMII):
163	case MLX5E_PROT_MASK(MLX5E_1000BASE_KX):
164	case MLX5E_PROT_MASK(MLX5E_100BASE_TX):
165	case MLX5E_PROT_MASK(MLX5E_1000BASE_T):
166		*active_width = IB_WIDTH_1X;
167		*active_speed = IB_SPEED_SDR;
168		break;
169	case MLX5E_PROT_MASK(MLX5E_10GBASE_T):
170	case MLX5E_PROT_MASK(MLX5E_10GBASE_CX4):
171	case MLX5E_PROT_MASK(MLX5E_10GBASE_KX4):
172	case MLX5E_PROT_MASK(MLX5E_10GBASE_KR):
173	case MLX5E_PROT_MASK(MLX5E_10GBASE_CR):
174	case MLX5E_PROT_MASK(MLX5E_10GBASE_SR):
175	case MLX5E_PROT_MASK(MLX5E_10GBASE_ER_LR):
176		*active_width = IB_WIDTH_1X;
177		*active_speed = IB_SPEED_QDR;
178		break;
179	case MLX5E_PROT_MASK(MLX5E_25GBASE_CR):
180	case MLX5E_PROT_MASK(MLX5E_25GBASE_KR):
181	case MLX5E_PROT_MASK(MLX5E_25GBASE_SR):
182		*active_width = IB_WIDTH_1X;
183		*active_speed = IB_SPEED_EDR;
184		break;
185	case MLX5E_PROT_MASK(MLX5E_40GBASE_CR4):
186	case MLX5E_PROT_MASK(MLX5E_40GBASE_KR4):
187	case MLX5E_PROT_MASK(MLX5E_40GBASE_SR4):
188	case MLX5E_PROT_MASK(MLX5E_40GBASE_LR4_ER4):
189		*active_width = IB_WIDTH_4X;
190		*active_speed = IB_SPEED_QDR;
191		break;
192	case MLX5E_PROT_MASK(MLX5E_50GBASE_CR2):
193	case MLX5E_PROT_MASK(MLX5E_50GBASE_KR2):
194	case MLX5E_PROT_MASK(MLX5E_50GBASE_KR4):
195	case MLX5E_PROT_MASK(MLX5E_50GBASE_SR2):
196		*active_width = IB_WIDTH_1X;
197		*active_speed = IB_SPEED_HDR;
198		break;
199	case MLX5E_PROT_MASK(MLX5E_56GBASE_R4):
200		*active_width = IB_WIDTH_4X;
201		*active_speed = IB_SPEED_FDR;
202		break;
203	case MLX5E_PROT_MASK(MLX5E_100GBASE_CR4):
204	case MLX5E_PROT_MASK(MLX5E_100GBASE_SR4):
205	case MLX5E_PROT_MASK(MLX5E_100GBASE_KR4):
206	case MLX5E_PROT_MASK(MLX5E_100GBASE_LR4):
207		*active_width = IB_WIDTH_4X;
208		*active_speed = IB_SPEED_EDR;
209		break;
210	default:
211		*active_width = IB_WIDTH_4X;
212		*active_speed = IB_SPEED_QDR;
213		return -EINVAL;
214	}
215
216	return 0;
217}
218
219static int translate_eth_ext_proto_oper(u32 eth_proto_oper, u8 *active_speed,
220					u8 *active_width)
221{
222	switch (eth_proto_oper) {
223	case MLX5E_PROT_MASK(MLX5E_SGMII_100M):
224	case MLX5E_PROT_MASK(MLX5E_1000BASE_X_SGMII):
225		*active_width = IB_WIDTH_1X;
226		*active_speed = IB_SPEED_SDR;
227		break;
228	case MLX5E_PROT_MASK(MLX5E_5GBASE_R):
229		*active_width = IB_WIDTH_1X;
230		*active_speed = IB_SPEED_DDR;
231		break;
232	case MLX5E_PROT_MASK(MLX5E_10GBASE_XFI_XAUI_1):
233		*active_width = IB_WIDTH_1X;
234		*active_speed = IB_SPEED_QDR;
235		break;
236	case MLX5E_PROT_MASK(MLX5E_40GBASE_XLAUI_4_XLPPI_4):
237		*active_width = IB_WIDTH_4X;
238		*active_speed = IB_SPEED_QDR;
239		break;
240	case MLX5E_PROT_MASK(MLX5E_25GAUI_1_25GBASE_CR_KR):
241		*active_width = IB_WIDTH_1X;
242		*active_speed = IB_SPEED_EDR;
243		break;
244	case MLX5E_PROT_MASK(MLX5E_50GAUI_2_LAUI_2_50GBASE_CR2_KR2):
245		*active_width = IB_WIDTH_2X;
246		*active_speed = IB_SPEED_EDR;
247		break;
248	case MLX5E_PROT_MASK(MLX5E_50GAUI_1_LAUI_1_50GBASE_CR_KR):
249		*active_width = IB_WIDTH_1X;
250		*active_speed = IB_SPEED_HDR;
251		break;
252	case MLX5E_PROT_MASK(MLX5E_CAUI_4_100GBASE_CR4_KR4):
253		*active_width = IB_WIDTH_4X;
254		*active_speed = IB_SPEED_EDR;
255		break;
256	case MLX5E_PROT_MASK(MLX5E_100GAUI_2_100GBASE_CR2_KR2):
257		*active_width = IB_WIDTH_2X;
258		*active_speed = IB_SPEED_HDR;
259		break;
260	case MLX5E_PROT_MASK(MLX5E_100GAUI_1_100GBASE_CR_KR):
261		*active_width = IB_WIDTH_1X;
262		*active_speed = IB_SPEED_NDR;
263		break;
264	case MLX5E_PROT_MASK(MLX5E_200GAUI_4_200GBASE_CR4_KR4):
265		*active_width = IB_WIDTH_4X;
266		*active_speed = IB_SPEED_HDR;
267		break;
268	case MLX5E_PROT_MASK(MLX5E_200GAUI_2_200GBASE_CR2_KR2):
269		*active_width = IB_WIDTH_2X;
270		*active_speed = IB_SPEED_NDR;
271		break;
272	case MLX5E_PROT_MASK(MLX5E_400GAUI_4_400GBASE_CR4_KR4):
273		*active_width = IB_WIDTH_4X;
274		*active_speed = IB_SPEED_NDR;
275		break;
276	default:
277		*active_width = IB_WIDTH_4X;
278		*active_speed = IB_SPEED_QDR;
279		return -EINVAL;
280	}
281
282	return 0;
283}
284
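/*
 * RoCE port query: speed/width come from the PTYS register, the qkey
 * violation counter from the NIC vport, and state/MTU from the attached
 * net device (if any); props stays zeroed when an early error occurs.
 */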
285static int mlx5_query_port_roce(struct ib_device *device, u8 port_num,
286				struct ib_port_attr *props)
287{
288	struct mlx5_ib_dev *dev = to_mdev(device);
289	u32 out[MLX5_ST_SZ_DW(ptys_reg)] = {};
290	if_t ndev;
291	enum ib_mtu ndev_ib_mtu;
292	u16 qkey_viol_cntr;
293	u32 eth_prot_oper;
294	bool ext;
295	int err;
296
297	memset(props, 0, sizeof(*props));
298
299	/* Possible bad flows are checked before filling out props, so that in
300	 * case of an error props will still be zeroed out.
301	 */
302	err = mlx5_query_port_ptys(dev->mdev, out, sizeof(out), MLX5_PTYS_EN,
303	    port_num);
304	if (err)
305		return err;
306
307	ext = MLX5_CAP_PCAM_FEATURE(dev->mdev, ptys_extended_ethernet);
308	eth_prot_oper = MLX5_GET_ETH_PROTO(ptys_reg, out, ext, eth_proto_oper);
309
310	if (ext)
311		translate_eth_ext_proto_oper(eth_prot_oper, &props->active_speed,
312		    &props->active_width);
313	else
314		translate_eth_proto_oper(eth_prot_oper, &props->active_speed,
315		    &props->active_width);
316
317	props->port_cap_flags  |= IB_PORT_CM_SUP;
318	props->port_cap_flags  |= IB_PORT_IP_BASED_GIDS;
319
320	props->gid_tbl_len      = MLX5_CAP_ROCE(dev->mdev,
321						roce_address_table_size);
322	props->max_mtu          = IB_MTU_4096;
323	props->max_msg_sz       = 1 << MLX5_CAP_GEN(dev->mdev, log_max_msg);
324	props->pkey_tbl_len     = 1;
325	props->state            = IB_PORT_DOWN;
326	props->phys_state       = IB_PORT_PHYS_STATE_DISABLED;
327
328	mlx5_query_nic_vport_qkey_viol_cntr(dev->mdev, &qkey_viol_cntr);
329	props->qkey_viol_cntr = qkey_viol_cntr;
330
331	ndev = mlx5_ib_get_netdev(device, port_num);
332	if (!ndev)
333		return 0;
334
335	if (if_getdrvflags(ndev) & IFF_DRV_RUNNING &&
336	    if_getlinkstate(ndev) == LINK_STATE_UP) {
337		props->state      = IB_PORT_ACTIVE;
338		props->phys_state = IB_PORT_PHYS_STATE_LINK_UP;
339	}
340
341	ndev_ib_mtu = iboe_get_mtu(if_getmtu(ndev));
342
343	if_rele(ndev);
344
345	props->active_mtu	= min(props->max_mtu, ndev_ib_mtu);
346	return 0;
347}
348
349static void ib_gid_to_mlx5_roce_addr(const union ib_gid *gid,
350				     const struct ib_gid_attr *attr,
351				     void *mlx5_addr)
352{
353#define MLX5_SET_RA(p, f, v) MLX5_SET(roce_addr_layout, p, f, v)
354	char *mlx5_addr_l3_addr	= MLX5_ADDR_OF(roce_addr_layout, mlx5_addr,
355					       source_l3_address);
356	void *mlx5_addr_mac	= MLX5_ADDR_OF(roce_addr_layout, mlx5_addr,
357					       source_mac_47_32);
358	u16 vlan_id;
359
360	if (!gid)
361		return;
362	ether_addr_copy(mlx5_addr_mac, if_getlladdr(attr->ndev));
363
364	vlan_id = rdma_vlan_dev_vlan_id(attr->ndev);
365	if (vlan_id != 0xffff) {
366		MLX5_SET_RA(mlx5_addr, vlan_valid, 1);
367		MLX5_SET_RA(mlx5_addr, vlan_id, vlan_id);
368	}
369
370	switch (attr->gid_type) {
371	case IB_GID_TYPE_IB:
372		MLX5_SET_RA(mlx5_addr, roce_version, MLX5_ROCE_VERSION_1);
373		break;
374	case IB_GID_TYPE_ROCE_UDP_ENCAP:
375		MLX5_SET_RA(mlx5_addr, roce_version, MLX5_ROCE_VERSION_2);
376		break;
377
378	default:
379		WARN_ON(true);
380	}
381
382	if (attr->gid_type != IB_GID_TYPE_IB) {
383		if (ipv6_addr_v4mapped((void *)gid))
384			MLX5_SET_RA(mlx5_addr, roce_l3_type,
385				    MLX5_ROCE_L3_TYPE_IPV4);
386		else
387			MLX5_SET_RA(mlx5_addr, roce_l3_type,
388				    MLX5_ROCE_L3_TYPE_IPV6);
389	}
390
391	if ((attr->gid_type == IB_GID_TYPE_IB) ||
392	    !ipv6_addr_v4mapped((void *)gid))
393		memcpy(mlx5_addr_l3_addr, gid, sizeof(*gid));
394	else
395		memcpy(&mlx5_addr_l3_addr[12], &gid->raw[12], 4);
396}
397
398static int set_roce_addr(struct ib_device *device, u8 port_num,
399			 unsigned int index,
400			 const union ib_gid *gid,
401			 const struct ib_gid_attr *attr)
402{
403	struct mlx5_ib_dev *dev = to_mdev(device);
404	u32  in[MLX5_ST_SZ_DW(set_roce_address_in)]  = {0};
405	u32 out[MLX5_ST_SZ_DW(set_roce_address_out)] = {0};
406	void *in_addr = MLX5_ADDR_OF(set_roce_address_in, in, roce_address);
407	enum rdma_link_layer ll = mlx5_ib_port_link_layer(device, port_num);
408
409	if (ll != IB_LINK_LAYER_ETHERNET)
410		return -EINVAL;
411
412	ib_gid_to_mlx5_roce_addr(gid, attr, in_addr);
413
414	MLX5_SET(set_roce_address_in, in, roce_address_index, index);
415	MLX5_SET(set_roce_address_in, in, opcode, MLX5_CMD_OP_SET_ROCE_ADDRESS);
416	return mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, sizeof(out));
417}
418
419static int mlx5_ib_add_gid(struct ib_device *device, u8 port_num,
420			   unsigned int index, const union ib_gid *gid,
421			   const struct ib_gid_attr *attr,
422			   __always_unused void **context)
423{
424	return set_roce_addr(device, port_num, index, gid, attr);
425}
426
427static int mlx5_ib_del_gid(struct ib_device *device, u8 port_num,
428			   unsigned int index, __always_unused void **context)
429{
430	return set_roce_addr(device, port_num, index, NULL, NULL);
431}
432
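/*
 * For RoCE v2 GIDs return the device's minimum UDP source port from the
 * RoCE caps (in network byte order); return 0 for any other GID type or
 * when the cached GID cannot be resolved.
 */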
433__be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev, u8 port_num,
434			       int index)
435{
436	struct ib_gid_attr attr;
437	union ib_gid gid;
438
439	if (ib_get_cached_gid(&dev->ib_dev, port_num, index, &gid, &attr))
440		return 0;
441
442	if (!attr.ndev)
443		return 0;
444
445	if_rele(attr.ndev);
446
447	if (attr.gid_type != IB_GID_TYPE_ROCE_UDP_ENCAP)
448		return 0;
449
450	return cpu_to_be16(MLX5_CAP_ROCE(dev->mdev, r_roce_min_src_udp_port));
451}
452
453int mlx5_get_roce_gid_type(struct mlx5_ib_dev *dev, u8 port_num,
454			   int index, enum ib_gid_type *gid_type)
455{
456	struct ib_gid_attr attr;
457	union ib_gid gid;
458	int ret;
459
460	ret = ib_get_cached_gid(&dev->ib_dev, port_num, index, &gid, &attr);
461	if (ret)
462		return ret;
463
464	if (!attr.ndev)
465		return -ENODEV;
466
467	if_rele(attr.ndev);
468
469	*gid_type = attr.gid_type;
470
471	return 0;
472}
473
474static int mlx5_use_mad_ifc(struct mlx5_ib_dev *dev)
475{
476	if (MLX5_CAP_GEN(dev->mdev, port_type) == MLX5_CAP_PORT_TYPE_IB)
477		return !MLX5_CAP_GEN(dev->mdev, ib_virt);
478	return 0;
479}
480
481enum {
482	MLX5_VPORT_ACCESS_METHOD_MAD,
483	MLX5_VPORT_ACCESS_METHOD_HCA,
484	MLX5_VPORT_ACCESS_METHOD_NIC,
485};
486
487static int mlx5_get_vport_access_method(struct ib_device *ibdev)
488{
489	if (mlx5_use_mad_ifc(to_mdev(ibdev)))
490		return MLX5_VPORT_ACCESS_METHOD_MAD;
491
492	if (mlx5_ib_port_link_layer(ibdev, 1) ==
493	    IB_LINK_LAYER_ETHERNET)
494		return MLX5_VPORT_ACCESS_METHOD_NIC;
495
496	return MLX5_VPORT_ACCESS_METHOD_HCA;
497}
498
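/*
 * Advertise IB_ATOMIC_HCA only when firmware supports both 8-byte
 * compare-swap and fetch-add and can respond in host endianness;
 * otherwise report IB_ATOMIC_NONE.
 */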
499static void get_atomic_caps(struct mlx5_ib_dev *dev,
500			    struct ib_device_attr *props)
501{
502	u8 tmp;
503	u8 atomic_operations = MLX5_CAP_ATOMIC(dev->mdev, atomic_operations);
504	u8 atomic_size_qp = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_qp);
505	u8 atomic_req_8B_endianness_mode =
506		MLX5_CAP_ATOMIC(dev->mdev, atomic_req_8B_endianess_mode);
507
508	/* Check if HW supports 8 byte standard atomic operations and is
509	 * capable of responding in host endianness.
510	 */
511	tmp = MLX5_ATOMIC_OPS_CMP_SWAP | MLX5_ATOMIC_OPS_FETCH_ADD;
512	if (((atomic_operations & tmp) == tmp) &&
513	    (atomic_size_qp & MLX5_ATOMIC_SIZE_QP_8BYTES) &&
514	    (atomic_req_8B_endianness_mode)) {
515		props->atomic_cap = IB_ATOMIC_HCA;
516	} else {
517		props->atomic_cap = IB_ATOMIC_NONE;
518	}
519}
520
521static int mlx5_query_system_image_guid(struct ib_device *ibdev,
522					__be64 *sys_image_guid)
523{
524	struct mlx5_ib_dev *dev = to_mdev(ibdev);
525	struct mlx5_core_dev *mdev = dev->mdev;
526	u64 tmp;
527	int err;
528
529	switch (mlx5_get_vport_access_method(ibdev)) {
530	case MLX5_VPORT_ACCESS_METHOD_MAD:
531		return mlx5_query_mad_ifc_system_image_guid(ibdev,
532							    sys_image_guid);
533
534	case MLX5_VPORT_ACCESS_METHOD_HCA:
535		err = mlx5_query_hca_vport_system_image_guid(mdev, &tmp);
536		break;
537
538	case MLX5_VPORT_ACCESS_METHOD_NIC:
539		err = mlx5_query_nic_vport_system_image_guid(mdev, &tmp);
540		break;
541
542	default:
543		return -EINVAL;
544	}
545
546	if (!err)
547		*sys_image_guid = cpu_to_be64(tmp);
548
549	return err;
551}
552
553static int mlx5_query_max_pkeys(struct ib_device *ibdev,
554				u16 *max_pkeys)
555{
556	struct mlx5_ib_dev *dev = to_mdev(ibdev);
557	struct mlx5_core_dev *mdev = dev->mdev;
558
559	switch (mlx5_get_vport_access_method(ibdev)) {
560	case MLX5_VPORT_ACCESS_METHOD_MAD:
561		return mlx5_query_mad_ifc_max_pkeys(ibdev, max_pkeys);
562
563	case MLX5_VPORT_ACCESS_METHOD_HCA:
564	case MLX5_VPORT_ACCESS_METHOD_NIC:
565		*max_pkeys = mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(mdev,
566						pkey_table_size));
567		return 0;
568
569	default:
570		return -EINVAL;
571	}
572}
573
574static int mlx5_query_vendor_id(struct ib_device *ibdev,
575				u32 *vendor_id)
576{
577	struct mlx5_ib_dev *dev = to_mdev(ibdev);
578
579	switch (mlx5_get_vport_access_method(ibdev)) {
580	case MLX5_VPORT_ACCESS_METHOD_MAD:
581		return mlx5_query_mad_ifc_vendor_id(ibdev, vendor_id);
582
583	case MLX5_VPORT_ACCESS_METHOD_HCA:
584	case MLX5_VPORT_ACCESS_METHOD_NIC:
585		return mlx5_core_query_vendor_id(dev->mdev, vendor_id);
586
587	default:
588		return -EINVAL;
589	}
590}
591
592static int mlx5_query_node_guid(struct mlx5_ib_dev *dev,
593				__be64 *node_guid)
594{
595	u64 tmp;
596	int err;
597
598	switch (mlx5_get_vport_access_method(&dev->ib_dev)) {
599	case MLX5_VPORT_ACCESS_METHOD_MAD:
600		return mlx5_query_mad_ifc_node_guid(dev, node_guid);
601
602	case MLX5_VPORT_ACCESS_METHOD_HCA:
603		err = mlx5_query_hca_vport_node_guid(dev->mdev, &tmp);
604		break;
605
606	case MLX5_VPORT_ACCESS_METHOD_NIC:
607		err = mlx5_query_nic_vport_node_guid(dev->mdev, &tmp);
608		break;
609
610	default:
611		return -EINVAL;
612	}
613
614	if (!err)
615		*node_guid = cpu_to_be64(tmp);
616
617	return err;
618}
619
620struct mlx5_reg_node_desc {
621	u8	desc[IB_DEVICE_NODE_DESC_MAX];
622};
623
624static int mlx5_query_node_desc(struct mlx5_ib_dev *dev, char *node_desc)
625{
626	struct mlx5_reg_node_desc in;
627
628	if (mlx5_use_mad_ifc(dev))
629		return mlx5_query_mad_ifc_node_desc(dev, node_desc);
630
631	memset(&in, 0, sizeof(in));
632
633	return mlx5_core_access_reg(dev->mdev, &in, sizeof(in), node_desc,
634				    sizeof(struct mlx5_reg_node_desc),
635				    MLX5_REG_NODE_DESC, 0, 0);
636}
637
638static int mlx5_ib_query_device(struct ib_device *ibdev,
639				struct ib_device_attr *props,
640				struct ib_udata *uhw)
641{
642	struct mlx5_ib_dev *dev = to_mdev(ibdev);
643	struct mlx5_core_dev *mdev = dev->mdev;
644	int err = -ENOMEM;
645	int max_sq_desc;
646	int max_rq_sg;
647	int max_sq_sg;
648	u64 min_page_size = 1ull << MLX5_CAP_GEN(mdev, log_pg_sz);
649	struct mlx5_ib_query_device_resp resp = {};
650	size_t resp_len;
651	u64 max_tso;
652
653	resp_len = sizeof(resp.comp_mask) + sizeof(resp.response_length);
654	if (uhw->outlen && uhw->outlen < resp_len)
655		return -EINVAL;
656	else
657		resp.response_length = resp_len;
658
659	if (uhw->inlen && !ib_is_udata_cleared(uhw, 0, uhw->inlen))
660		return -EINVAL;
661
662	memset(props, 0, sizeof(*props));
663	err = mlx5_query_system_image_guid(ibdev,
664					   &props->sys_image_guid);
665	if (err)
666		return err;
667
668	err = mlx5_query_max_pkeys(ibdev, &props->max_pkeys);
669	if (err)
670		return err;
671
672	err = mlx5_query_vendor_id(ibdev, &props->vendor_id);
673	if (err)
674		return err;
675
676	props->fw_ver = ((u64)fw_rev_maj(dev->mdev) << 32) |
677		((u32)fw_rev_min(dev->mdev) << 16) |
678		fw_rev_sub(dev->mdev);
679	props->device_cap_flags    = IB_DEVICE_CHANGE_PHY_PORT |
680		IB_DEVICE_PORT_ACTIVE_EVENT		|
681		IB_DEVICE_SYS_IMAGE_GUID		|
682		IB_DEVICE_RC_RNR_NAK_GEN		|
683		IB_DEVICE_KNOWSEPOCH;
684
685	if (MLX5_CAP_GEN(mdev, pkv))
686		props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR;
687	if (MLX5_CAP_GEN(mdev, qkv))
688		props->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR;
689	if (MLX5_CAP_GEN(mdev, apm))
690		props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG;
691	if (MLX5_CAP_GEN(mdev, xrc))
692		props->device_cap_flags |= IB_DEVICE_XRC;
693	if (MLX5_CAP_GEN(mdev, imaicl)) {
694		props->device_cap_flags |= IB_DEVICE_MEM_WINDOW |
695					   IB_DEVICE_MEM_WINDOW_TYPE_2B;
696		props->max_mw = 1 << MLX5_CAP_GEN(mdev, log_max_mkey);
697		/* We support 'Gappy' memory registration too */
698		props->device_cap_flags |= IB_DEVICE_SG_GAPS_REG;
699	}
700	props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS;
701	if (MLX5_CAP_GEN(mdev, sho)) {
702		props->device_cap_flags |= IB_DEVICE_SIGNATURE_HANDOVER;
703		/* At this stage no support for signature handover */
704		props->sig_prot_cap = IB_PROT_T10DIF_TYPE_1 |
705				      IB_PROT_T10DIF_TYPE_2 |
706				      IB_PROT_T10DIF_TYPE_3;
707		props->sig_guard_cap = IB_GUARD_T10DIF_CRC |
708				       IB_GUARD_T10DIF_CSUM;
709	}
710	if (MLX5_CAP_GEN(mdev, block_lb_mc))
711		props->device_cap_flags |= IB_DEVICE_BLOCK_MULTICAST_LOOPBACK;
712
713	if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads)) {
714		if (MLX5_CAP_ETH(mdev, csum_cap))
715			props->device_cap_flags |= IB_DEVICE_RAW_IP_CSUM;
716
717		if (field_avail(typeof(resp), tso_caps, uhw->outlen)) {
718			max_tso = MLX5_CAP_ETH(mdev, max_lso_cap);
719			if (max_tso) {
720				resp.tso_caps.max_tso = 1 << max_tso;
721				resp.tso_caps.supported_qpts |=
722					1 << IB_QPT_RAW_PACKET;
723				resp.response_length += sizeof(resp.tso_caps);
724			}
725		}
726
727		if (field_avail(typeof(resp), rss_caps, uhw->outlen)) {
728			resp.rss_caps.rx_hash_function =
729						MLX5_RX_HASH_FUNC_TOEPLITZ;
730			resp.rss_caps.rx_hash_fields_mask =
731						MLX5_RX_HASH_SRC_IPV4 |
732						MLX5_RX_HASH_DST_IPV4 |
733						MLX5_RX_HASH_SRC_IPV6 |
734						MLX5_RX_HASH_DST_IPV6 |
735						MLX5_RX_HASH_SRC_PORT_TCP |
736						MLX5_RX_HASH_DST_PORT_TCP |
737						MLX5_RX_HASH_SRC_PORT_UDP |
738						MLX5_RX_HASH_DST_PORT_UDP;
739			resp.response_length += sizeof(resp.rss_caps);
740		}
741	} else {
742		if (field_avail(typeof(resp), tso_caps, uhw->outlen))
743			resp.response_length += sizeof(resp.tso_caps);
744		if (field_avail(typeof(resp), rss_caps, uhw->outlen))
745			resp.response_length += sizeof(resp.rss_caps);
746	}
747
748	if (MLX5_CAP_GEN(mdev, ipoib_ipoib_offloads)) {
749		props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM;
750		props->device_cap_flags |= IB_DEVICE_UD_TSO;
751	}
752
753	if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) &&
754	    MLX5_CAP_ETH(dev->mdev, scatter_fcs))
755		props->device_cap_flags |= IB_DEVICE_RAW_SCATTER_FCS;
756
757	if (mlx5_get_flow_namespace(dev->mdev, MLX5_FLOW_NAMESPACE_BYPASS))
758		props->device_cap_flags |= IB_DEVICE_MANAGED_FLOW_STEERING;
759
760	props->vendor_part_id	   = mdev->pdev->device;
761	props->hw_ver		   = mdev->pdev->revision;
762
763	props->max_mr_size	   = ~0ull;
764	props->page_size_cap	   = ~(min_page_size - 1);
765	props->max_qp		   = 1 << MLX5_CAP_GEN(mdev, log_max_qp);
766	props->max_qp_wr	   = 1 << MLX5_CAP_GEN(mdev, log_max_qp_sz);
767	max_rq_sg =  MLX5_CAP_GEN(mdev, max_wqe_sz_rq) /
768		     sizeof(struct mlx5_wqe_data_seg);
769	max_sq_desc = min_t(int, MLX5_CAP_GEN(mdev, max_wqe_sz_sq), 512);
770	max_sq_sg = (max_sq_desc - sizeof(struct mlx5_wqe_ctrl_seg) -
771		     sizeof(struct mlx5_wqe_raddr_seg)) /
772		sizeof(struct mlx5_wqe_data_seg);
773	props->max_sge = min(max_rq_sg, max_sq_sg);
774	props->max_sge_rd	   = MLX5_MAX_SGE_RD;
775	props->max_cq		   = 1 << MLX5_CAP_GEN(mdev, log_max_cq);
776	props->max_cqe = (1 << MLX5_CAP_GEN(mdev, log_max_cq_sz)) - 1;
777	props->max_mr		   = 1 << MLX5_CAP_GEN(mdev, log_max_mkey);
778	props->max_pd		   = 1 << MLX5_CAP_GEN(mdev, log_max_pd);
779	props->max_qp_rd_atom	   = 1 << MLX5_CAP_GEN(mdev, log_max_ra_req_qp);
780	props->max_qp_init_rd_atom = 1 << MLX5_CAP_GEN(mdev, log_max_ra_res_qp);
781	props->max_srq		   = 1 << MLX5_CAP_GEN(mdev, log_max_srq);
782	props->max_srq_wr = (1 << MLX5_CAP_GEN(mdev, log_max_srq_sz)) - 1;
783	props->local_ca_ack_delay  = MLX5_CAP_GEN(mdev, local_ca_ack_delay);
784	props->max_res_rd_atom	   = props->max_qp_rd_atom * props->max_qp;
785	props->max_srq_sge	   = max_rq_sg - 1;
786	props->max_fast_reg_page_list_len =
787		1 << MLX5_CAP_GEN(mdev, log_max_klm_list_size);
788	get_atomic_caps(dev, props);
789	props->masked_atomic_cap   = IB_ATOMIC_NONE;
790	props->max_mcast_grp	   = 1 << MLX5_CAP_GEN(mdev, log_max_mcg);
791	props->max_mcast_qp_attach = MLX5_CAP_GEN(mdev, max_qp_mcg);
792	props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
793					   props->max_mcast_grp;
794	props->max_map_per_fmr = INT_MAX; /* no limit in ConnectIB */
795	props->hca_core_clock = MLX5_CAP_GEN(mdev, device_frequency_khz);
796	props->timestamp_mask = 0x7FFFFFFFFFFFFFFFULL;
797
798#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
799	if (MLX5_CAP_GEN(mdev, pg))
800		props->device_cap_flags |= IB_DEVICE_ON_DEMAND_PAGING;
801	props->odp_caps = dev->odp_caps;
802#endif
803
804	if (MLX5_CAP_GEN(mdev, cd))
805		props->device_cap_flags |= IB_DEVICE_CROSS_CHANNEL;
806
807	if (!mlx5_core_is_pf(mdev))
808		props->device_cap_flags |= IB_DEVICE_VIRTUAL_FUNCTION;
809
810	if (mlx5_ib_port_link_layer(ibdev, 1) ==
811	    IB_LINK_LAYER_ETHERNET) {
812		props->rss_caps.max_rwq_indirection_tables =
813			1 << MLX5_CAP_GEN(dev->mdev, log_max_rqt);
814		props->rss_caps.max_rwq_indirection_table_size =
815			1 << MLX5_CAP_GEN(dev->mdev, log_max_rqt_size);
816		props->rss_caps.supported_qpts = 1 << IB_QPT_RAW_PACKET;
817		props->max_wq_type_rq =
818			1 << MLX5_CAP_GEN(dev->mdev, log_max_rq);
819	}
820
821	if (uhw->outlen) {
822		err = ib_copy_to_udata(uhw, &resp, resp.response_length);
823
824		if (err)
825			return err;
826	}
827
828	return 0;
829}
830
831enum mlx5_ib_width {
832	MLX5_IB_WIDTH_1X	= 1 << 0,
833	MLX5_IB_WIDTH_2X	= 1 << 1,
834	MLX5_IB_WIDTH_4X	= 1 << 2,
835	MLX5_IB_WIDTH_8X	= 1 << 3,
836	MLX5_IB_WIDTH_12X	= 1 << 4
837};
838
839static int translate_active_width(struct ib_device *ibdev, u8 active_width,
840				  u8 *ib_width)
841{
842	struct mlx5_ib_dev *dev = to_mdev(ibdev);
843	int err = 0;
844
845	if (active_width & MLX5_IB_WIDTH_1X) {
846		*ib_width = IB_WIDTH_1X;
847	} else if (active_width & MLX5_IB_WIDTH_2X) {
848		*ib_width = IB_WIDTH_2X;
849	} else if (active_width & MLX5_IB_WIDTH_4X) {
850		*ib_width = IB_WIDTH_4X;
851	} else if (active_width & MLX5_IB_WIDTH_8X) {
852		*ib_width = IB_WIDTH_8X;
853	} else if (active_width & MLX5_IB_WIDTH_12X) {
854		*ib_width = IB_WIDTH_12X;
855	} else {
856		mlx5_ib_dbg(dev, "Invalid active_width %d\n",
857			    (int)active_width);
858		err = -EINVAL;
859	}
860
861	return err;
862}
863
864enum ib_max_vl_num {
865	__IB_MAX_VL_0		= 1,
866	__IB_MAX_VL_0_1		= 2,
867	__IB_MAX_VL_0_3		= 3,
868	__IB_MAX_VL_0_7		= 4,
869	__IB_MAX_VL_0_14	= 5,
870};
871
872enum mlx5_vl_hw_cap {
873	MLX5_VL_HW_0	= 1,
874	MLX5_VL_HW_0_1	= 2,
875	MLX5_VL_HW_0_2	= 3,
876	MLX5_VL_HW_0_3	= 4,
877	MLX5_VL_HW_0_4	= 5,
878	MLX5_VL_HW_0_5	= 6,
879	MLX5_VL_HW_0_6	= 7,
880	MLX5_VL_HW_0_7	= 8,
881	MLX5_VL_HW_0_14	= 15
882};
883
884static int translate_max_vl_num(struct ib_device *ibdev, u8 vl_hw_cap,
885				u8 *max_vl_num)
886{
887	switch (vl_hw_cap) {
888	case MLX5_VL_HW_0:
889		*max_vl_num = __IB_MAX_VL_0;
890		break;
891	case MLX5_VL_HW_0_1:
892		*max_vl_num = __IB_MAX_VL_0_1;
893		break;
894	case MLX5_VL_HW_0_3:
895		*max_vl_num = __IB_MAX_VL_0_3;
896		break;
897	case MLX5_VL_HW_0_7:
898		*max_vl_num = __IB_MAX_VL_0_7;
899		break;
900	case MLX5_VL_HW_0_14:
901		*max_vl_num = __IB_MAX_VL_0_14;
902		break;
903
904	default:
905		return -EINVAL;
906	}
907
908	return 0;
909}
910
911static int mlx5_query_hca_port(struct ib_device *ibdev, u8 port,
912			       struct ib_port_attr *props)
913{
914	struct mlx5_ib_dev *dev = to_mdev(ibdev);
915	struct mlx5_core_dev *mdev = dev->mdev;
916	u32 *rep;
917	int replen = MLX5_ST_SZ_BYTES(query_hca_vport_context_out);
918	struct mlx5_ptys_reg *ptys;
919	struct mlx5_pmtu_reg *pmtu;
920	struct mlx5_pvlc_reg pvlc;
921	void *ctx;
922	int err;
923
924	rep = mlx5_vzalloc(replen);
925	ptys = kzalloc(sizeof(*ptys), GFP_KERNEL);
926	pmtu = kzalloc(sizeof(*pmtu), GFP_KERNEL);
927	if (!rep || !ptys || !pmtu) {
928		err = -ENOMEM;
929		goto out;
930	}
931
932	memset(props, 0, sizeof(*props));
933
934	err = mlx5_query_hca_vport_context(mdev, port, 0, rep, replen);
935	if (err)
936		goto out;
937
938	ctx = MLX5_ADDR_OF(query_hca_vport_context_out, rep, hca_vport_context);
939
940	props->lid		= MLX5_GET(hca_vport_context, ctx, lid);
941	props->lmc		= MLX5_GET(hca_vport_context, ctx, lmc);
942	props->sm_lid		= MLX5_GET(hca_vport_context, ctx, sm_lid);
943	props->sm_sl		= MLX5_GET(hca_vport_context, ctx, sm_sl);
944	props->state		= MLX5_GET(hca_vport_context, ctx, vport_state);
945	props->phys_state	= MLX5_GET(hca_vport_context, ctx,
946					port_physical_state);
947	props->port_cap_flags	= MLX5_GET(hca_vport_context, ctx, cap_mask1);
948	props->gid_tbl_len	= mlx5_get_gid_table_len(MLX5_CAP_GEN(mdev, gid_table_size));
949	props->max_msg_sz	= 1 << MLX5_CAP_GEN(mdev, log_max_msg);
950	props->pkey_tbl_len	= mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(mdev, pkey_table_size));
951	props->bad_pkey_cntr	= MLX5_GET(hca_vport_context, ctx,
952					pkey_violation_counter);
953	props->qkey_viol_cntr	= MLX5_GET(hca_vport_context, ctx,
954					qkey_violation_counter);
955	props->subnet_timeout	= MLX5_GET(hca_vport_context, ctx,
956					subnet_timeout);
957	props->init_type_reply	= MLX5_GET(hca_vport_context, ctx,
958					init_type_reply);
959	props->grh_required	= MLX5_GET(hca_vport_context, ctx, grh_required);
960
961	ptys->proto_mask |= MLX5_PTYS_IB;
962	ptys->local_port = port;
963	err = mlx5_core_access_ptys(mdev, ptys, 0);
964	if (err)
965		goto out;
966
967	err = translate_active_width(ibdev, ptys->ib_link_width_oper,
968				     &props->active_width);
969	if (err)
970		goto out;
971
972	props->active_speed	= (u8)ptys->ib_proto_oper;
973
974	pmtu->local_port = port;
975	err = mlx5_core_access_pmtu(mdev, pmtu, 0);
976	if (err)
977		goto out;
978
979	props->max_mtu		= pmtu->max_mtu;
980	props->active_mtu	= pmtu->oper_mtu;
981
982	memset(&pvlc, 0, sizeof(pvlc));
983	pvlc.local_port = port;
984	err = mlx5_core_access_pvlc(mdev, &pvlc, 0);
985	if (err)
986		goto out;
987
988	err = translate_max_vl_num(ibdev, pvlc.vl_hw_cap,
989				   &props->max_vl_num);
990out:
991	kvfree(rep);
992	kfree(ptys);
993	kfree(pmtu);
994	return err;
995}
996
997int mlx5_ib_query_port(struct ib_device *ibdev, u8 port,
998		       struct ib_port_attr *props)
999{
1000	switch (mlx5_get_vport_access_method(ibdev)) {
1001	case MLX5_VPORT_ACCESS_METHOD_MAD:
1002		return mlx5_query_mad_ifc_port(ibdev, port, props);
1003
1004	case MLX5_VPORT_ACCESS_METHOD_HCA:
1005		return mlx5_query_hca_port(ibdev, port, props);
1006
1007	case MLX5_VPORT_ACCESS_METHOD_NIC:
1008		return mlx5_query_port_roce(ibdev, port, props);
1009
1010	default:
1011		return -EINVAL;
1012	}
1013}
1014
1015static int mlx5_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
1016			     union ib_gid *gid)
1017{
1018	struct mlx5_ib_dev *dev = to_mdev(ibdev);
1019	struct mlx5_core_dev *mdev = dev->mdev;
1020
1021	switch (mlx5_get_vport_access_method(ibdev)) {
1022	case MLX5_VPORT_ACCESS_METHOD_MAD:
1023		return mlx5_query_mad_ifc_gids(ibdev, port, index, gid);
1024
1025	case MLX5_VPORT_ACCESS_METHOD_HCA:
1026		return mlx5_query_hca_vport_gid(mdev, port, 0, index, gid);
1027
1028	default:
1029		return -EINVAL;
1030	}
1032}
1033
1034static int mlx5_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
1035			      u16 *pkey)
1036{
1037	struct mlx5_ib_dev *dev = to_mdev(ibdev);
1038	struct mlx5_core_dev *mdev = dev->mdev;
1039
1040	switch (mlx5_get_vport_access_method(ibdev)) {
1041	case MLX5_VPORT_ACCESS_METHOD_MAD:
1042		return mlx5_query_mad_ifc_pkey(ibdev, port, index, pkey);
1043
1044	case MLX5_VPORT_ACCESS_METHOD_HCA:
1045	case MLX5_VPORT_ACCESS_METHOD_NIC:
1046		return mlx5_query_hca_vport_pkey(mdev, 0, port,  0, index,
1047						 pkey);
1048	default:
1049		return -EINVAL;
1050	}
1051}
1052
1053static int mlx5_ib_modify_device(struct ib_device *ibdev, int mask,
1054				 struct ib_device_modify *props)
1055{
1056	struct mlx5_ib_dev *dev = to_mdev(ibdev);
1057	struct mlx5_reg_node_desc in;
1058	struct mlx5_reg_node_desc out;
1059	int err;
1060
1061	if (mask & ~IB_DEVICE_MODIFY_NODE_DESC)
1062		return -EOPNOTSUPP;
1063
1064	if (!(mask & IB_DEVICE_MODIFY_NODE_DESC))
1065		return 0;
1066
1067	/*
1068	 * If possible, pass the node desc to FW so that it can generate
1069	 * an SM trap 144.  If the command fails, return the error.
1070	 */
1071	memcpy(&in, props->node_desc, IB_DEVICE_NODE_DESC_MAX);
1072	err = mlx5_core_access_reg(dev->mdev, &in, sizeof(in), &out,
1073				   sizeof(out), MLX5_REG_NODE_DESC, 0, 1);
1074	if (err)
1075		return err;
1076
1077	memcpy(ibdev->node_desc, props->node_desc, IB_DEVICE_NODE_DESC_MAX);
1078
1079	return err;
1080}
1081
1082static int mlx5_ib_modify_port(struct ib_device *ibdev, u8 port, int mask,
1083			       struct ib_port_modify *props)
1084{
1085	struct mlx5_ib_dev *dev = to_mdev(ibdev);
1086	struct ib_port_attr attr;
1087	u32 tmp;
1088	int err;
1089
1090	/*
1091	 * CM layer calls ib_modify_port() regardless of the link
1092	 * layer. For Ethernet ports, qkey violation and Port
1093	 * capabilities are meaningless.
1094	 */
1095	if (mlx5_ib_port_link_layer(ibdev, port) == IB_LINK_LAYER_ETHERNET)
1096		return 0;
1097
1098	mutex_lock(&dev->cap_mask_mutex);
1099
1100	err = mlx5_ib_query_port(ibdev, port, &attr);
1101	if (err)
1102		goto out;
1103
1104	tmp = (attr.port_cap_flags | props->set_port_cap_mask) &
1105		~props->clr_port_cap_mask;
1106
1107	err = mlx5_set_port_caps(dev->mdev, port, tmp);
1108
1109out:
1110	mutex_unlock(&dev->cap_mask_mutex);
1111	return err;
1112}
1113
1114static void print_lib_caps(struct mlx5_ib_dev *dev, u64 caps)
1115{
1116	mlx5_ib_dbg(dev, "MLX5_LIB_CAP_4K_UAR = %s\n",
1117		    caps & MLX5_LIB_CAP_4K_UAR ? "y" : "n");
1118}
1119
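/*
 * Blue-flame register (bfreg) accounting: the user's requested count is
 * rounded up to whole system pages of UARs, and a pool of dynamic bfregs
 * is appended after the static ones.  As a hypothetical illustration only
 * (the real constants come from the driver headers and the uar_4k
 * capability): with bfregs_per_sys_page == 32, a request for 100 bfregs
 * is rounded up to ALIGN(100, 32) == 128, i.e. 4 static system pages, and
 * the dynamic bfregs are likewise rounded to a multiple of 32.
 */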
1120static u16 calc_dynamic_bfregs(int uars_per_sys_page)
1121{
1122	/* A large system page size without 4K UAR support might limit the dynamic size */
1123	if (uars_per_sys_page == 1  && PAGE_SIZE > 4096)
1124		return MLX5_MIN_DYN_BFREGS;
1125
1126	return MLX5_MAX_DYN_BFREGS;
1127}
1128
1129static int calc_total_bfregs(struct mlx5_ib_dev *dev, bool lib_uar_4k,
1130			     struct mlx5_ib_alloc_ucontext_req_v2 *req,
1131			     struct mlx5_bfreg_info *bfregi)
1132{
1133	int uars_per_sys_page;
1134	int bfregs_per_sys_page;
1135	int ref_bfregs = req->total_num_bfregs;
1136
1137	if (req->total_num_bfregs == 0)
1138		return -EINVAL;
1139
1140	BUILD_BUG_ON(MLX5_MAX_BFREGS % MLX5_NON_FP_BFREGS_IN_PAGE);
1141	BUILD_BUG_ON(MLX5_MAX_BFREGS < MLX5_NON_FP_BFREGS_IN_PAGE);
1142
1143	if (req->total_num_bfregs > MLX5_MAX_BFREGS)
1144		return -ENOMEM;
1145
1146	uars_per_sys_page = get_uars_per_sys_page(dev, lib_uar_4k);
1147	bfregs_per_sys_page = uars_per_sys_page * MLX5_NON_FP_BFREGS_PER_UAR;
1148	/* This rounds the static allocation requested by the user up to whole system pages */
1149	req->total_num_bfregs = ALIGN(req->total_num_bfregs, bfregs_per_sys_page);
1150	if (req->num_low_latency_bfregs > req->total_num_bfregs - 1)
1151		return -EINVAL;
1152
1153	bfregi->num_static_sys_pages = req->total_num_bfregs / bfregs_per_sys_page;
1154	bfregi->num_dyn_bfregs = ALIGN(calc_dynamic_bfregs(uars_per_sys_page), bfregs_per_sys_page);
1155	bfregi->total_num_bfregs = req->total_num_bfregs + bfregi->num_dyn_bfregs;
1156	bfregi->num_sys_pages = bfregi->total_num_bfregs / bfregs_per_sys_page;
1157
1158	mlx5_ib_dbg(dev, "uar_4k: fw support %s, lib support %s, user requested %d bfregs, allocated %d, total bfregs %d, using %d sys pages\n",
1159		    MLX5_CAP_GEN(dev->mdev, uar_4k) ? "yes" : "no",
1160		    lib_uar_4k ? "yes" : "no", ref_bfregs,
1161		    req->total_num_bfregs, bfregi->total_num_bfregs,
1162		    bfregi->num_sys_pages);
1163
1164	return 0;
1165}
1166
1167static int allocate_uars(struct mlx5_ib_dev *dev, struct mlx5_ib_ucontext *context)
1168{
1169	struct mlx5_bfreg_info *bfregi;
1170	int err;
1171	int i;
1172
1173	bfregi = &context->bfregi;
1174	for (i = 0; i < bfregi->num_static_sys_pages; i++) {
1175		err = mlx5_cmd_alloc_uar(dev->mdev, &bfregi->sys_pages[i]);
1176		if (err)
1177			goto error;
1178
1179		mlx5_ib_dbg(dev, "allocated uar %d\n", bfregi->sys_pages[i]);
1180	}
1181
1182	for (i = bfregi->num_static_sys_pages; i < bfregi->num_sys_pages; i++)
1183		bfregi->sys_pages[i] = MLX5_IB_INVALID_UAR_INDEX;
1184
1185	return 0;
1186
1187error:
1188	for (--i; i >= 0; i--)
1189		if (mlx5_cmd_free_uar(dev->mdev, bfregi->sys_pages[i]))
1190			mlx5_ib_warn(dev, "failed to free uar %d\n", i);
1191
1192	return err;
1193}
1194
1195static void deallocate_uars(struct mlx5_ib_dev *dev,
1196			    struct mlx5_ib_ucontext *context)
1197{
1198	struct mlx5_bfreg_info *bfregi;
1199	int i;
1200
1201	bfregi = &context->bfregi;
1202	for (i = 0; i < bfregi->num_sys_pages; i++)
1203		if (i < bfregi->num_static_sys_pages ||
1204		    bfregi->sys_pages[i] != MLX5_IB_INVALID_UAR_INDEX)
1205			mlx5_cmd_free_uar(dev->mdev, bfregi->sys_pages[i]);
1206}
1207
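/*
 * Transport domain allocation also drives the local loopback setting on
 * Ethernet ports: when a second user transport domain appears, local
 * loopback is enabled, and it is disabled again once the count drops
 * below two (see the dealloc path below).
 */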
1208static int mlx5_ib_alloc_transport_domain(struct mlx5_ib_dev *dev, u32 *tdn,
1209					  u16 uid)
1210{
1211	int err;
1212
1213	if (!MLX5_CAP_GEN(dev->mdev, log_max_transport_domain))
1214		return 0;
1215
1216	err = mlx5_alloc_transport_domain(dev->mdev, tdn, uid);
1217	if (err)
1218		return err;
1219
1220	if ((MLX5_CAP_GEN(dev->mdev, port_type) != MLX5_CAP_PORT_TYPE_ETH) ||
1221	    (!MLX5_CAP_GEN(dev->mdev, disable_local_lb_uc) &&
1222	     !MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc)))
1223		return 0;
1224
1225	mutex_lock(&dev->lb_mutex);
1226	dev->user_td++;
1227
1228	if (dev->user_td == 2)
1229		err = mlx5_nic_vport_update_local_lb(dev->mdev, true);
1230
1231	mutex_unlock(&dev->lb_mutex);
1232
1233	if (err != 0)
1234		mlx5_dealloc_transport_domain(dev->mdev, *tdn, uid);
1235	return err;
1236}
1237
1238static void mlx5_ib_dealloc_transport_domain(struct mlx5_ib_dev *dev, u32 tdn,
1239					     u16 uid)
1240{
1241	if (!MLX5_CAP_GEN(dev->mdev, log_max_transport_domain))
1242		return;
1243
1244	mlx5_dealloc_transport_domain(dev->mdev, tdn, uid);
1245
1246	if ((MLX5_CAP_GEN(dev->mdev, port_type) != MLX5_CAP_PORT_TYPE_ETH) ||
1247	    (!MLX5_CAP_GEN(dev->mdev, disable_local_lb_uc) &&
1248	     !MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc)))
1249		return;
1250
1251	mutex_lock(&dev->lb_mutex);
1252	dev->user_td--;
1253
1254	if (dev->user_td < 2)
1255		mlx5_nic_vport_update_local_lb(dev->mdev, false);
1256
1257	mutex_unlock(&dev->lb_mutex);
1258}
1259
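/*
 * ucontext allocation: validate the v0/v2 request, size the bfreg/UAR
 * pools (unless the library asked for fully dynamic UARs), optionally
 * create a DEVX uid and a transport domain, and then fill the response
 * incrementally, bounded by udata->outlen so older libraries only see
 * the fields they know about.
 */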
1260static int mlx5_ib_alloc_ucontext(struct ib_ucontext *uctx,
1261				  struct ib_udata *udata)
1262{
1263	struct ib_device *ibdev = uctx->device;
1264	struct mlx5_ib_dev *dev = to_mdev(ibdev);
1265	struct mlx5_ib_alloc_ucontext_req_v2 req = {};
1266	struct mlx5_ib_alloc_ucontext_resp resp = {};
1267	struct mlx5_ib_ucontext *context = to_mucontext(uctx);
1268	struct mlx5_bfreg_info *bfregi;
1269	int ver;
1270	int err;
1271	size_t min_req_v2 = offsetof(struct mlx5_ib_alloc_ucontext_req_v2,
1272				     max_cqe_version);
1273	bool lib_uar_4k;
1274	bool lib_uar_dyn;
1275
1276	if (!dev->ib_active)
1277		return -EAGAIN;
1278
1279	if (udata->inlen == sizeof(struct mlx5_ib_alloc_ucontext_req))
1280		ver = 0;
1281	else if (udata->inlen >= min_req_v2)
1282		ver = 2;
1283	else
1284		return -EINVAL;
1285
1286	err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req)));
1287	if (err)
1288		return err;
1289
1290	if (req.flags & ~MLX5_IB_ALLOC_UCTX_DEVX)
1291		return -EOPNOTSUPP;
1292
1293	if (req.comp_mask || req.reserved0 || req.reserved1 || req.reserved2)
1294		return -EOPNOTSUPP;
1295
1296	req.total_num_bfregs = ALIGN(req.total_num_bfregs,
1297				    MLX5_NON_FP_BFREGS_PER_UAR);
1298	if (req.num_low_latency_bfregs > req.total_num_bfregs - 1)
1299		return -EINVAL;
1300
1301	resp.qp_tab_size = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp);
1302	if (mlx5_core_is_pf(dev->mdev) && MLX5_CAP_GEN(dev->mdev, bf))
1303		resp.bf_reg_size = 1 << MLX5_CAP_GEN(dev->mdev, log_bf_reg_size);
1304	resp.cache_line_size = cache_line_size();
1305	resp.max_sq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq);
1306	resp.max_rq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_rq);
1307	resp.max_send_wqebb = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz);
1308	resp.max_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz);
1309	resp.max_srq_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_srq_sz);
1310	resp.cqe_version = min_t(__u8,
1311				 (__u8)MLX5_CAP_GEN(dev->mdev, cqe_version),
1312				 req.max_cqe_version);
1313	resp.log_uar_size = MLX5_CAP_GEN(dev->mdev, uar_4k) ?
1314				MLX5_ADAPTER_PAGE_SHIFT : PAGE_SHIFT;
1315	resp.num_uars_per_page = MLX5_CAP_GEN(dev->mdev, uar_4k) ?
1316					MLX5_CAP_GEN(dev->mdev, num_of_uars_per_page) : 1;
1317	resp.response_length = min(offsetof(typeof(resp), response_length) +
1318				   sizeof(resp.response_length), udata->outlen);
1319
1320	lib_uar_4k = req.lib_caps & MLX5_LIB_CAP_4K_UAR;
1321	lib_uar_dyn = req.lib_caps & MLX5_LIB_CAP_DYN_UAR;
1322	bfregi = &context->bfregi;
1323
1324	if (lib_uar_dyn) {
1325		bfregi->lib_uar_dyn = lib_uar_dyn;
1326		goto uar_done;
1327	}
1328
1329	/* updates req->total_num_bfregs */
1330	err = calc_total_bfregs(dev, lib_uar_4k, &req, bfregi);
1331	if (err)
1332		goto out_ctx;
1333
1334	mutex_init(&bfregi->lock);
1335	bfregi->lib_uar_4k = lib_uar_4k;
1336	bfregi->count = kcalloc(bfregi->total_num_bfregs, sizeof(*bfregi->count),
1337				GFP_KERNEL);
1338	if (!bfregi->count) {
1339		err = -ENOMEM;
1340		goto out_ctx;
1341	}
1342
1343	bfregi->sys_pages = kcalloc(bfregi->num_sys_pages,
1344				    sizeof(*bfregi->sys_pages),
1345				    GFP_KERNEL);
1346	if (!bfregi->sys_pages) {
1347		err = -ENOMEM;
1348		goto out_count;
1349	}
1350
1351	err = allocate_uars(dev, context);
1352	if (err)
1353		goto out_sys_pages;
1354
1355uar_done:
1356	if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX) {
1357		err = mlx5_ib_devx_create(dev, true);
1358		if (err < 0)
1359			goto out_uars;
1360		context->devx_uid = err;
1361	}
1362
1363	err = mlx5_ib_alloc_transport_domain(dev, &context->tdn,
1364					     context->devx_uid);
1365	if (err)
1366		goto out_devx;
1367
1368	INIT_LIST_HEAD(&context->db_page_list);
1369	mutex_init(&context->db_page_mutex);
1370
1371	resp.tot_bfregs = lib_uar_dyn ? 0 : req.total_num_bfregs;
1372	resp.num_ports = MLX5_CAP_GEN(dev->mdev, num_ports);
1373
1374	if (field_avail(typeof(resp), cqe_version, udata->outlen))
1375		resp.response_length += sizeof(resp.cqe_version);
1376
1377	if (field_avail(typeof(resp), cmds_supp_uhw, udata->outlen)) {
1378		resp.cmds_supp_uhw |= MLX5_USER_CMDS_SUPP_UHW_QUERY_DEVICE |
1379				      MLX5_USER_CMDS_SUPP_UHW_CREATE_AH;
1380		resp.response_length += sizeof(resp.cmds_supp_uhw);
1381	}
1382
1383	/*
1384	 * We don't want to expose information from the PCI BAR that is located
1385	 * beyond the first 4096 bytes, so if the arch only supports larger
1386	 * pages, pretend we don't support reading the HCA's core clock.  This
1387	 * is also enforced by the mmap handler.
1388	 */
1389	if (offsetofend(typeof(resp), hca_core_clock_offset) <= udata->outlen) {
1390		if (PAGE_SIZE <= 4096) {
1391			resp.comp_mask |=
1392				MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET;
1393			resp.hca_core_clock_offset =
1394				offsetof(struct mlx5_init_seg, internal_timer_h) % PAGE_SIZE;
1395		}
1396		resp.response_length += sizeof(resp.hca_core_clock_offset);
1397	}
1398
1399	if (offsetofend(typeof(resp), log_uar_size) <= udata->outlen)
1400		resp.response_length += sizeof(resp.log_uar_size);
1401
1402	if (offsetofend(typeof(resp), num_uars_per_page) <= udata->outlen)
1403		resp.response_length += sizeof(resp.num_uars_per_page);
1404
1405	if (offsetofend(typeof(resp), num_dyn_bfregs) <= udata->outlen) {
1406		resp.num_dyn_bfregs = bfregi->num_dyn_bfregs;
1407		resp.response_length += sizeof(resp.num_dyn_bfregs);
1408	}
1409
1410	err = ib_copy_to_udata(udata, &resp, resp.response_length);
1411	if (err)
1412		goto out_mdev;
1413
1414	bfregi->ver = ver;
1415	bfregi->num_low_latency_bfregs = req.num_low_latency_bfregs;
1416	context->cqe_version = resp.cqe_version;
1417	context->lib_caps = req.lib_caps;
1418	print_lib_caps(dev, context->lib_caps);
1419
1420	return 0;
1421
1422out_mdev:
1423	mlx5_ib_dealloc_transport_domain(dev, context->tdn, context->devx_uid);
1424out_devx:
1425	if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX)
1426		mlx5_ib_devx_destroy(dev, context->devx_uid);
1427
1428out_uars:
1429	deallocate_uars(dev, context);
1430
1431out_sys_pages:
1432	kfree(bfregi->sys_pages);
1433
1434out_count:
1435	kfree(bfregi->count);
1436
1437out_ctx:
1438	return err;
1439}
1440
1441static void mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
1442{
1443	struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
1444	struct mlx5_ib_dev *dev = to_mdev(ibcontext->device);
1445	struct mlx5_bfreg_info *bfregi;
1446
1447	bfregi = &context->bfregi;
1448	mlx5_ib_dealloc_transport_domain(dev, context->tdn, context->devx_uid);
1449
1450	if (context->devx_uid)
1451		mlx5_ib_devx_destroy(dev, context->devx_uid);
1452
1453	deallocate_uars(dev, context);
1454	kfree(bfregi->sys_pages);
1455	kfree(bfregi->count);
1456}
1457
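/*
 * Convert a UAR index into a physical page frame number inside BAR 0,
 * accounting for several 4K UARs sharing one system page when the
 * uar_4k capability is set.
 */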
1458static phys_addr_t uar_index2pfn(struct mlx5_ib_dev *dev,
1459				 int uar_idx)
1460{
1461	int fw_uars_per_page;
1462
1463	fw_uars_per_page = MLX5_CAP_GEN(dev->mdev, uar_4k) ? MLX5_UARS_IN_PAGE : 1;
1464
1465	return (pci_resource_start(dev->mdev->pdev, 0) >> PAGE_SHIFT) + uar_idx / fw_uars_per_page;
1466}
1467
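/*
 * The mmap offset (vm_pgoff) encodes a command in the bits at and above
 * MLX5_IB_MMAP_CMD_SHIFT and an argument/index in the bits below it;
 * get_extended_index() additionally folds in bits 16..23 so indices can
 * exceed 255.  For illustration only, assuming MLX5_IB_MMAP_CMD_SHIFT is
 * 8 and MLX5_IB_MMAP_CMD_MASK is 0xff: a pgoff of 0x012345 decodes to
 * command 0x23 and extended index 0x0145.
 */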
1468static int get_command(unsigned long offset)
1469{
1470	return (offset >> MLX5_IB_MMAP_CMD_SHIFT) & MLX5_IB_MMAP_CMD_MASK;
1471}
1472
1473static int get_arg(unsigned long offset)
1474{
1475	return offset & ((1 << MLX5_IB_MMAP_CMD_SHIFT) - 1);
1476}
1477
1478static int get_index(unsigned long offset)
1479{
1480	return get_arg(offset);
1481}
1482
1483/* The index resides in an extra byte to enable index values larger than 255 */
1484static int get_extended_index(unsigned long offset)
1485{
1486	return get_arg(offset) | ((offset >> 16) & 0xff) << 8;
1487}
1488
1490static void mlx5_ib_disassociate_ucontext(struct ib_ucontext *ibcontext)
1491{
1492}
1493
1494static inline char *mmap_cmd2str(enum mlx5_ib_mmap_cmd cmd)
1495{
1496	switch (cmd) {
1497	case MLX5_IB_MMAP_WC_PAGE:
1498		return "WC";
1499	case MLX5_IB_MMAP_REGULAR_PAGE:
1500		return "best effort WC";
1501	case MLX5_IB_MMAP_NC_PAGE:
1502		return "NC";
1503	default:
1504		return NULL;
1505	}
1506}
1507
1508static int mlx5_ib_mmap_clock_info_page(struct mlx5_ib_dev *dev,
1509					struct vm_area_struct *vma,
1510					struct mlx5_ib_ucontext *context)
1511{
1512	if ((vma->vm_end - vma->vm_start != PAGE_SIZE) ||
1513	    !(vma->vm_flags & VM_SHARED))
1514		return -EINVAL;
1515
1516	if (get_index(vma->vm_pgoff) != MLX5_IB_CLOCK_INFO_V1)
1517		return -EOPNOTSUPP;
1518
1519	if (vma->vm_flags & (VM_WRITE | VM_EXEC))
1520		return -EPERM;
1521
1522	return -EOPNOTSUPP;
1523}
1524
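/*
 * Release callback for rdma_user_mmap entries: dynamically allocated
 * WC/NC UARs are returned to firmware when the last mapping goes away.
 */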
1525static void mlx5_ib_mmap_free(struct rdma_user_mmap_entry *entry)
1526{
1527	struct mlx5_user_mmap_entry *mentry = to_mmmap(entry);
1528	struct mlx5_ib_dev *dev = to_mdev(entry->ucontext->device);
1529
1530	switch (mentry->mmap_flag) {
1531	case MLX5_IB_MMAP_TYPE_UAR_WC:
1532	case MLX5_IB_MMAP_TYPE_UAR_NC:
1533		mlx5_cmd_free_uar(dev->mdev, mentry->page_idx);
1534		kfree(mentry);
1535		break;
1536	default:
1537		WARN_ON(true);
1538	}
1539}
1540
1541static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd,
1542		    struct vm_area_struct *vma,
1543		    struct mlx5_ib_ucontext *context)
1544{
1545	struct mlx5_bfreg_info *bfregi = &context->bfregi;
1546	int err;
1547	unsigned long idx;
1548	phys_addr_t pfn;
1549	pgprot_t prot;
1550	u32 bfreg_dyn_idx = 0;
1551	u32 uar_index;
1552	int dyn_uar = (cmd == MLX5_IB_MMAP_ALLOC_WC);
1553	int max_valid_idx = dyn_uar ? bfregi->num_sys_pages :
1554				bfregi->num_static_sys_pages;
1555
1556	if (bfregi->lib_uar_dyn)
1557		return -EINVAL;
1558
1559	if (vma->vm_end - vma->vm_start != PAGE_SIZE)
1560		return -EINVAL;
1561
1562	if (dyn_uar)
1563		idx = get_extended_index(vma->vm_pgoff) + bfregi->num_static_sys_pages;
1564	else
1565		idx = get_index(vma->vm_pgoff);
1566
1567	if (idx >= max_valid_idx) {
1568		mlx5_ib_warn(dev, "invalid uar index %lu, max=%d\n",
1569			     idx, max_valid_idx);
1570		return -EINVAL;
1571	}
1572
1573	switch (cmd) {
1574	case MLX5_IB_MMAP_WC_PAGE:
1575	case MLX5_IB_MMAP_ALLOC_WC:
1576	case MLX5_IB_MMAP_REGULAR_PAGE:
1577		/* For MLX5_IB_MMAP_REGULAR_PAGE, make a best effort to get WC */
1578		prot = pgprot_writecombine(vma->vm_page_prot);
1579		break;
1580	case MLX5_IB_MMAP_NC_PAGE:
1581		prot = pgprot_noncached(vma->vm_page_prot);
1582		break;
1583	default:
1584		return -EINVAL;
1585	}
1586
1587	if (dyn_uar) {
1588		int uars_per_page;
1589
1590		uars_per_page = get_uars_per_sys_page(dev, bfregi->lib_uar_4k);
1591		bfreg_dyn_idx = idx * (uars_per_page * MLX5_NON_FP_BFREGS_PER_UAR);
1592		if (bfreg_dyn_idx >= bfregi->total_num_bfregs) {
1593			mlx5_ib_warn(dev, "invalid bfreg_dyn_idx %u, max=%u\n",
1594				     bfreg_dyn_idx, bfregi->total_num_bfregs);
1595			return -EINVAL;
1596		}
1597
1598		mutex_lock(&bfregi->lock);
1599		/* Fail if the UAR is already allocated; the first bfreg index of
1600		 * each page holds the page's count.
1601		 */
1602		if (bfregi->count[bfreg_dyn_idx]) {
1603			mlx5_ib_warn(dev, "wrong offset, idx %lu is busy, bfregn=%u\n", idx, bfreg_dyn_idx);
1604			mutex_unlock(&bfregi->lock);
1605			return -EINVAL;
1606		}
1607
1608		bfregi->count[bfreg_dyn_idx]++;
1609		mutex_unlock(&bfregi->lock);
1610
1611		err = mlx5_cmd_alloc_uar(dev->mdev, &uar_index);
1612		if (err) {
1613			mlx5_ib_warn(dev, "UAR alloc failed\n");
1614			goto free_bfreg;
1615		}
1616	} else {
1617		uar_index = bfregi->sys_pages[idx];
1618	}
1619
1620	pfn = uar_index2pfn(dev, uar_index);
1621	mlx5_ib_dbg(dev, "uar idx 0x%lx, pfn %pa\n", idx, &pfn);
1622
1623	err = rdma_user_mmap_io(&context->ibucontext, vma, pfn, PAGE_SIZE,
1624				prot, NULL);
1625	if (err) {
1626		mlx5_ib_err(dev,
1627			    "rdma_user_mmap_io failed with error=%d, mmap_cmd=%s\n",
1628			    err, mmap_cmd2str(cmd));
1629		goto err;
1630	}
1631
1632	if (dyn_uar)
1633		bfregi->sys_pages[idx] = uar_index;
1634	return 0;
1635
1636err:
1637	if (!dyn_uar)
1638		return err;
1639
1640	mlx5_cmd_free_uar(dev->mdev, idx);
1641
1642free_bfreg:
1643	mlx5_ib_free_bfreg(dev, bfregi, bfreg_dyn_idx);
1644
1645	return err;
1646}
1647
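/*
 * Rebuild the rdma_user_mmap_entry key from vm_pgoff: the command is
 * placed in bits 16 and above and the extended index in the low 16 bits;
 * the result is used to look up the mmap entry for this VMA.
 */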
1648static unsigned long mlx5_vma_to_pgoff(struct vm_area_struct *vma)
1649{
1650	unsigned long idx;
1651	u8 command;
1652
1653	command = get_command(vma->vm_pgoff);
1654	idx = get_extended_index(vma->vm_pgoff);
1655
1656	return (command << 16 | idx);
1657}
1658
1659static int mlx5_ib_mmap_offset(struct mlx5_ib_dev *dev,
1660			       struct vm_area_struct *vma,
1661			       struct ib_ucontext *ucontext)
1662{
1663	struct mlx5_user_mmap_entry *mentry;
1664	struct rdma_user_mmap_entry *entry;
1665	unsigned long pgoff;
1666	pgprot_t prot;
1667	phys_addr_t pfn;
1668	int ret;
1669
1670	pgoff = mlx5_vma_to_pgoff(vma);
1671	entry = rdma_user_mmap_entry_get_pgoff(ucontext, pgoff);
1672	if (!entry)
1673		return -EINVAL;
1674
1675	mentry = to_mmmap(entry);
1676	pfn = (mentry->address >> PAGE_SHIFT);
1677	if (mentry->mmap_flag == MLX5_IB_MMAP_TYPE_VAR ||
1678	    mentry->mmap_flag == MLX5_IB_MMAP_TYPE_UAR_NC)
1679		prot = pgprot_noncached(vma->vm_page_prot);
1680	else
1681		prot = pgprot_writecombine(vma->vm_page_prot);
1682	ret = rdma_user_mmap_io(ucontext, vma, pfn,
1683				entry->npages * PAGE_SIZE,
1684				prot,
1685				entry);
1686	rdma_user_mmap_entry_put(&mentry->rdma_entry);
1687	return ret;
1688}
1689
1690static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma)
1691{
1692	struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
1693	struct mlx5_ib_dev *dev = to_mdev(ibcontext->device);
1694	unsigned long command;
1695	phys_addr_t pfn;
1696
1697	command = get_command(vma->vm_pgoff);
1698	switch (command) {
1699	case MLX5_IB_MMAP_WC_PAGE:
1700	case MLX5_IB_MMAP_ALLOC_WC:
1701		if (!dev->wc_support)
1702			return -EPERM;
1703		/* FALLTHROUGH */
1704	case MLX5_IB_MMAP_NC_PAGE:
1705	case MLX5_IB_MMAP_REGULAR_PAGE:
1706		return uar_mmap(dev, command, vma, context);
1707
1708	case MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES:
1709		return -ENOSYS;
1710
1711	case MLX5_IB_MMAP_CORE_CLOCK:
1712		if (vma->vm_end - vma->vm_start != PAGE_SIZE)
1713			return -EINVAL;
1714
1715		if (vma->vm_flags & VM_WRITE)
1716			return -EPERM;
1717
1718		/* Don't expose information to user-space that it shouldn't have */
1719		if (PAGE_SIZE > 4096)
1720			return -EOPNOTSUPP;
1721
1722		pfn = (dev->mdev->iseg_base +
1723		       offsetof(struct mlx5_init_seg, internal_timer_h)) >>
1724			PAGE_SHIFT;
1725		return rdma_user_mmap_io(&context->ibucontext, vma, pfn,
1726					 PAGE_SIZE,
1727					 pgprot_noncached(vma->vm_page_prot),
1728					 NULL);
1729	case MLX5_IB_MMAP_CLOCK_INFO:
1730		return mlx5_ib_mmap_clock_info_page(dev, vma, context);
1731
1732	default:
1733		return mlx5_ib_mmap_offset(dev, vma, ibcontext);
1734	}
1735
1736	return 0;
1737}
1738
1739static int mlx5_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
1740{
1741	struct mlx5_ib_pd *pd = to_mpd(ibpd);
1742	struct ib_device *ibdev = ibpd->device;
1743	struct mlx5_ib_alloc_pd_resp resp;
1744	int err;
1745	struct mlx5_ib_ucontext *context = rdma_udata_to_drv_context(
1746		udata, struct mlx5_ib_ucontext, ibucontext);
1747	u16 uid = context ? context->devx_uid : 0;
1748
1749	err = mlx5_core_alloc_pd(to_mdev(ibdev)->mdev, &pd->pdn, uid);
1750	if (err)
1751		return (err);
1752
1753	pd->uid = uid;
1754	if (udata) {
1755		resp.pdn = pd->pdn;
1756		if (ib_copy_to_udata(udata, &resp, sizeof(resp))) {
1757			mlx5_core_dealloc_pd(to_mdev(ibdev)->mdev, pd->pdn, uid);
1758			return -EFAULT;
1759		}
1760	}
1761
1762	return 0;
1763}
1764
1765static void mlx5_ib_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata)
1766{
1767	struct mlx5_ib_dev *mdev = to_mdev(pd->device);
1768	struct mlx5_ib_pd *mpd = to_mpd(pd);
1769
1770	mlx5_core_dealloc_pd(mdev->mdev, mpd->pdn, mpd->uid);
1771}
1772
1773enum {
1774	MATCH_CRITERIA_ENABLE_OUTER_BIT,
1775	MATCH_CRITERIA_ENABLE_MISC_BIT,
1776	MATCH_CRITERIA_ENABLE_INNER_BIT
1777};
1778
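/*
 * A header class (outer, misc, inner) participates in matching only if
 * its criteria mask contains at least one nonzero byte; build the
 * match_criteria_enable bitmap accordingly.
 */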
1779#define HEADER_IS_ZERO(match_criteria, headers)			           \
1780	!(memchr_inv(MLX5_ADDR_OF(fte_match_param, match_criteria, headers), \
1781		    0, MLX5_FLD_SZ_BYTES(fte_match_param, headers)))       \
1782
1783static u8 get_match_criteria_enable(u32 *match_criteria)
1784{
1785	u8 match_criteria_enable;
1786
1787	match_criteria_enable =
1788		(!HEADER_IS_ZERO(match_criteria, outer_headers)) <<
1789		MATCH_CRITERIA_ENABLE_OUTER_BIT;
1790	match_criteria_enable |=
1791		(!HEADER_IS_ZERO(match_criteria, misc_parameters)) <<
1792		MATCH_CRITERIA_ENABLE_MISC_BIT;
1793	match_criteria_enable |=
1794		(!HEADER_IS_ZERO(match_criteria, inner_headers)) <<
1795		MATCH_CRITERIA_ENABLE_INNER_BIT;
1796
1797	return match_criteria_enable;
1798}
1799
1800static void set_proto(void *outer_c, void *outer_v, u8 mask, u8 val)
1801{
1802	MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_protocol, mask);
1803	MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_protocol, val);
1804}
1805
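/*
 * set_tos() programs both ip_ecn (from the full TOS value) and ip_dscp
 * (the TOS shifted down past the two ECN bits) from the single TOS
 * mask/value pair supplied by the flow spec.
 */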
1806static void set_tos(void *outer_c, void *outer_v, u8 mask, u8 val)
1807{
1808	MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_ecn, mask);
1809	MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_ecn, val);
1810	MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_dscp, mask >> 2);
1811	MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_dscp, val >> 2);
1812}
1813
1814#define LAST_ETH_FIELD vlan_tag
1815#define LAST_IB_FIELD sl
1816#define LAST_IPV4_FIELD tos
1817#define LAST_IPV6_FIELD traffic_class
1818#define LAST_TCP_UDP_FIELD src_port
1819
1820/* 'field' is the last field of 'filter' that the driver supports; everything after it must be zero */
1821#define FIELDS_NOT_SUPPORTED(filter, field)\
1822	memchr_inv((void *)&filter.field  +\
1823		   sizeof(filter.field), 0,\
1824		   sizeof(filter) -\
1825		   offsetof(typeof(filter), field) -\
1826		   sizeof(filter.field))
1827
1828static int parse_flow_attr(u32 *match_c, u32 *match_v,
1829			   const union ib_flow_spec *ib_spec)
1830{
1831	void *outer_headers_c = MLX5_ADDR_OF(fte_match_param, match_c,
1832					     outer_headers);
1833	void *outer_headers_v = MLX5_ADDR_OF(fte_match_param, match_v,
1834					     outer_headers);
1835	void *misc_params_c = MLX5_ADDR_OF(fte_match_param, match_c,
1836					   misc_parameters);
1837	void *misc_params_v = MLX5_ADDR_OF(fte_match_param, match_v,
1838					   misc_parameters);
1839
1840	switch (ib_spec->type) {
1841	case IB_FLOW_SPEC_ETH:
1842		if (FIELDS_NOT_SUPPORTED(ib_spec->eth.mask, LAST_ETH_FIELD))
1843			return -ENOTSUPP;
1844
1845		ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c,
1846					     dmac_47_16),
1847				ib_spec->eth.mask.dst_mac);
1848		ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v,
1849					     dmac_47_16),
1850				ib_spec->eth.val.dst_mac);
1851
1852		ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c,
1853					     smac_47_16),
1854				ib_spec->eth.mask.src_mac);
1855		ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v,
1856					     smac_47_16),
1857				ib_spec->eth.val.src_mac);
1858
1859		if (ib_spec->eth.mask.vlan_tag) {
1860			MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c,
1861				 cvlan_tag, 1);
1862			MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v,
1863				 cvlan_tag, 1);
1864
1865			MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c,
1866				 first_vid, ntohs(ib_spec->eth.mask.vlan_tag));
1867			MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v,
1868				 first_vid, ntohs(ib_spec->eth.val.vlan_tag));
1869
1870			MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c,
1871				 first_cfi,
1872				 ntohs(ib_spec->eth.mask.vlan_tag) >> 12);
1873			MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v,
1874				 first_cfi,
1875				 ntohs(ib_spec->eth.val.vlan_tag) >> 12);
1876
1877			MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c,
1878				 first_prio,
1879				 ntohs(ib_spec->eth.mask.vlan_tag) >> 13);
1880			MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v,
1881				 first_prio,
1882				 ntohs(ib_spec->eth.val.vlan_tag) >> 13);
1883		}
1884		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c,
1885			 ethertype, ntohs(ib_spec->eth.mask.ether_type));
1886		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v,
1887			 ethertype, ntohs(ib_spec->eth.val.ether_type));
1888		break;
1889	case IB_FLOW_SPEC_IPV4:
1890		if (FIELDS_NOT_SUPPORTED(ib_spec->ipv4.mask, LAST_IPV4_FIELD))
1891			return -ENOTSUPP;
1892
1893		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c,
1894			 ethertype, 0xffff);
1895		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v,
1896			 ethertype, ETH_P_IP);
1897
1898		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c,
1899				    src_ipv4_src_ipv6.ipv4_layout.ipv4),
1900		       &ib_spec->ipv4.mask.src_ip,
1901		       sizeof(ib_spec->ipv4.mask.src_ip));
1902		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v,
1903				    src_ipv4_src_ipv6.ipv4_layout.ipv4),
1904		       &ib_spec->ipv4.val.src_ip,
1905		       sizeof(ib_spec->ipv4.val.src_ip));
1906		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c,
1907				    dst_ipv4_dst_ipv6.ipv4_layout.ipv4),
1908		       &ib_spec->ipv4.mask.dst_ip,
1909		       sizeof(ib_spec->ipv4.mask.dst_ip));
1910		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v,
1911				    dst_ipv4_dst_ipv6.ipv4_layout.ipv4),
1912		       &ib_spec->ipv4.val.dst_ip,
1913		       sizeof(ib_spec->ipv4.val.dst_ip));
1914
1915		set_tos(outer_headers_c, outer_headers_v,
1916			ib_spec->ipv4.mask.tos, ib_spec->ipv4.val.tos);
1917
1918		set_proto(outer_headers_c, outer_headers_v,
1919			  ib_spec->ipv4.mask.proto, ib_spec->ipv4.val.proto);
1920		break;
1921	case IB_FLOW_SPEC_IPV6:
1922		if (FIELDS_NOT_SUPPORTED(ib_spec->ipv6.mask, LAST_IPV6_FIELD))
1923			return -ENOTSUPP;
1924
1925		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c,
1926			 ethertype, 0xffff);
1927		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v,
1928			 ethertype, ETH_P_IPV6);
1929
1930		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c,
1931				    src_ipv4_src_ipv6.ipv6_layout.ipv6),
1932		       &ib_spec->ipv6.mask.src_ip,
1933		       sizeof(ib_spec->ipv6.mask.src_ip));
1934		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v,
1935				    src_ipv4_src_ipv6.ipv6_layout.ipv6),
1936		       &ib_spec->ipv6.val.src_ip,
1937		       sizeof(ib_spec->ipv6.val.src_ip));
1938		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c,
1939				    dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
1940		       &ib_spec->ipv6.mask.dst_ip,
1941		       sizeof(ib_spec->ipv6.mask.dst_ip));
1942		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v,
1943				    dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
1944		       &ib_spec->ipv6.val.dst_ip,
1945		       sizeof(ib_spec->ipv6.val.dst_ip));
1946
1947		set_tos(outer_headers_c, outer_headers_v,
1948			ib_spec->ipv6.mask.traffic_class,
1949			ib_spec->ipv6.val.traffic_class);
1950
1951		set_proto(outer_headers_c, outer_headers_v,
1952			  ib_spec->ipv6.mask.next_hdr,
1953			  ib_spec->ipv6.val.next_hdr);
1954
1955		MLX5_SET(fte_match_set_misc, misc_params_c,
1956			 outer_ipv6_flow_label,
1957			 ntohl(ib_spec->ipv6.mask.flow_label));
1958		MLX5_SET(fte_match_set_misc, misc_params_v,
1959			 outer_ipv6_flow_label,
1960			 ntohl(ib_spec->ipv6.val.flow_label));
1961		break;
1962	case IB_FLOW_SPEC_TCP:
1963		if (FIELDS_NOT_SUPPORTED(ib_spec->tcp_udp.mask,
1964					 LAST_TCP_UDP_FIELD))
1965			return -ENOTSUPP;
1966
1967		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, ip_protocol,
1968			 0xff);
1969		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, ip_protocol,
1970			 IPPROTO_TCP);
1971
1972		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, tcp_sport,
1973			 ntohs(ib_spec->tcp_udp.mask.src_port));
1974		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, tcp_sport,
1975			 ntohs(ib_spec->tcp_udp.val.src_port));
1976
1977		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, tcp_dport,
1978			 ntohs(ib_spec->tcp_udp.mask.dst_port));
1979		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, tcp_dport,
1980			 ntohs(ib_spec->tcp_udp.val.dst_port));
1981		break;
1982	case IB_FLOW_SPEC_UDP:
1983		if (FIELDS_NOT_SUPPORTED(ib_spec->tcp_udp.mask,
1984					 LAST_TCP_UDP_FIELD))
1985			return -ENOTSUPP;
1986
1987		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, ip_protocol,
1988			 0xff);
1989		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, ip_protocol,
1990			 IPPROTO_UDP);
1991
1992		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, udp_sport,
1993			 ntohs(ib_spec->tcp_udp.mask.src_port));
1994		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, udp_sport,
1995			 ntohs(ib_spec->tcp_udp.val.src_port));
1996
1997		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, udp_dport,
1998			 ntohs(ib_spec->tcp_udp.mask.dst_port));
1999		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, udp_dport,
2000			 ntohs(ib_spec->tcp_udp.val.dst_port));
2001		break;
2002	default:
2003		return -EINVAL;
2004	}
2005
2006	return 0;
2007}
2008
2009/* A flow that could catch both multicast and unicast packets must not be
2010 * placed in the multicast flow steering table, since such a rule could
2011 * steal other multicast packets.
2012 */
2013static bool flow_is_multicast_only(struct ib_flow_attr *ib_attr)
2014{
2015	struct ib_flow_spec_eth *eth_spec;
2016
2017	if (ib_attr->type != IB_FLOW_ATTR_NORMAL ||
2018	    ib_attr->size < sizeof(struct ib_flow_attr) +
2019	    sizeof(struct ib_flow_spec_eth) ||
2020	    ib_attr->num_of_specs < 1)
2021		return false;
2022
2023	eth_spec = (struct ib_flow_spec_eth *)(ib_attr + 1);
2024	if (eth_spec->type != IB_FLOW_SPEC_ETH ||
2025	    eth_spec->size != sizeof(*eth_spec))
2026		return false;
2027
2028	return is_multicast_ether_addr(eth_spec->mask.dst_mac) &&
2029	       is_multicast_ether_addr(eth_spec->val.dst_mac);
2030}
2031
2032static bool is_valid_attr(const struct ib_flow_attr *flow_attr)
2033{
2034	union ib_flow_spec *ib_spec = (union ib_flow_spec *)(flow_attr + 1);
2035	bool has_ipv4_spec = false;
2036	bool eth_type_ipv4 = true;
2037	unsigned int spec_index;
2038
2039	/* Validate that ethertype is correct */
2040	for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) {
2041		if (ib_spec->type == IB_FLOW_SPEC_ETH &&
2042		    ib_spec->eth.mask.ether_type) {
2043			if (!((ib_spec->eth.mask.ether_type == htons(0xffff)) &&
2044			      ib_spec->eth.val.ether_type == htons(ETH_P_IP)))
2045				eth_type_ipv4 = false;
2046		} else if (ib_spec->type == IB_FLOW_SPEC_IPV4) {
2047			has_ipv4_spec = true;
2048		}
2049		ib_spec = (void *)ib_spec + ib_spec->size;
2050	}
2051	return !has_ipv4_spec || eth_type_ipv4;
2052}
2053
2054static void put_flow_table(struct mlx5_ib_dev *dev,
2055			   struct mlx5_ib_flow_prio *prio, bool ft_added)
2056{
2057	prio->refcount -= !!ft_added;
2058	if (!prio->refcount) {
2059		mlx5_destroy_flow_table(prio->flow_table);
2060		prio->flow_table = NULL;
2061	}
2062}
2063
2064static int mlx5_ib_destroy_flow(struct ib_flow *flow_id)
2065{
2066	struct mlx5_ib_dev *dev = to_mdev(flow_id->qp->device);
2067	struct mlx5_ib_flow_handler *handler = container_of(flow_id,
2068							  struct mlx5_ib_flow_handler,
2069							  ibflow);
2070	struct mlx5_ib_flow_handler *iter, *tmp;
2071
2072	mutex_lock(&dev->flow_db.lock);
2073
2074	list_for_each_entry_safe(iter, tmp, &handler->list, list) {
2075		mlx5_del_flow_rule(&iter->rule);
2076		put_flow_table(dev, iter->prio, true);
2077		list_del(&iter->list);
2078		kfree(iter);
2079	}
2080
2081	mlx5_del_flow_rule(&handler->rule);
2082	put_flow_table(dev, handler->prio, true);
2083	mutex_unlock(&dev->flow_db.lock);
2084
2085	kfree(handler);
2086
2087	return 0;
2088}
2089
2090static int ib_prio_to_core_prio(unsigned int priority, bool dont_trap)
2091{
2092	priority *= 2;
2093	if (!dont_trap)
2094		priority++;
2095	return priority;
2096}
2097
2098enum flow_table_type {
2099	MLX5_IB_FT_RX,
2100	MLX5_IB_FT_TX
2101};
2102
2103#define MLX5_FS_MAX_TYPES	 10
2104#define MLX5_FS_MAX_ENTRIES	 32000UL
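/*
 * Select, and create on first use, the flow table backing the given flow
 * attribute: the bypass namespace for normal rules, the leftovers namespace
 * for the default multicast/unicast rules, and the sniffer RX/TX namespaces
 * for sniffer rules.
 */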
2105static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev,
2106						struct ib_flow_attr *flow_attr,
2107						enum flow_table_type ft_type)
2108{
2109	bool dont_trap = flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP;
2110	struct mlx5_flow_namespace *ns = NULL;
2111	struct mlx5_ib_flow_prio *prio;
2112	struct mlx5_flow_table *ft;
2113	int num_entries;
2114	int num_groups;
2115	int priority;
2116	int err = 0;
2117
2118	if (flow_attr->type == IB_FLOW_ATTR_NORMAL) {
2119		if (flow_is_multicast_only(flow_attr) &&
2120		    !dont_trap)
2121			priority = MLX5_IB_FLOW_MCAST_PRIO;
2122		else
2123			priority = ib_prio_to_core_prio(flow_attr->priority,
2124							dont_trap);
2125		ns = mlx5_get_flow_namespace(dev->mdev,
2126					     MLX5_FLOW_NAMESPACE_BYPASS);
2127		num_entries = MLX5_FS_MAX_ENTRIES;
2128		num_groups = MLX5_FS_MAX_TYPES;
2129		prio = &dev->flow_db.prios[priority];
2130	} else if (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT ||
2131		   flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT) {
2132		ns = mlx5_get_flow_namespace(dev->mdev,
2133					     MLX5_FLOW_NAMESPACE_LEFTOVERS);
2134		build_leftovers_ft_param("bypass", &priority,
2135					 &num_entries,
2136					 &num_groups);
2137		prio = &dev->flow_db.prios[MLX5_IB_FLOW_LEFTOVERS_PRIO];
2138	} else if (flow_attr->type == IB_FLOW_ATTR_SNIFFER) {
2139		if (!MLX5_CAP_FLOWTABLE(dev->mdev,
2140					allow_sniffer_and_nic_rx_shared_tir))
2141			return ERR_PTR(-ENOTSUPP);
2142
2143		ns = mlx5_get_flow_namespace(dev->mdev, ft_type == MLX5_IB_FT_RX ?
2144					     MLX5_FLOW_NAMESPACE_SNIFFER_RX :
2145					     MLX5_FLOW_NAMESPACE_SNIFFER_TX);
2146
2147		prio = &dev->flow_db.sniffer[ft_type];
2148		priority = 0;
2149		num_entries = 1;
2150		num_groups = 1;
2151	}
2152
2153	if (!ns)
2154		return ERR_PTR(-ENOTSUPP);
2155
2156	ft = prio->flow_table;
2157	if (!ft) {
2158		ft = mlx5_create_auto_grouped_flow_table(ns, priority, "bypass",
2159							 num_entries,
2160							 num_groups,
2161							 0);
2162
2163		if (!IS_ERR(ft)) {
2164			prio->refcount = 0;
2165			prio->flow_table = ft;
2166		} else {
2167			err = PTR_ERR(ft);
2168		}
2169	}
2170
2171	return err ? ERR_PTR(err) : prio;
2172}
2173
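/*
 * Build the mlx5 match specification from the attribute's spec list and add
 * a single flow rule to the priority's flow table. On success the returned
 * handler holds a reference on the flow priority.
 */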
2174static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev,
2175						     struct mlx5_ib_flow_prio *ft_prio,
2176						     const struct ib_flow_attr *flow_attr,
2177						     struct mlx5_flow_destination *dst)
2178{
2179	struct mlx5_flow_table	*ft = ft_prio->flow_table;
2180	struct mlx5_ib_flow_handler *handler;
2181	struct mlx5_flow_spec *spec;
2182	const void *ib_flow = (const void *)flow_attr + sizeof(*flow_attr);
2183	unsigned int spec_index;
2184	struct mlx5_flow_act flow_act = {
2185		.actions = MLX5_FLOW_ACT_ACTIONS_FLOW_TAG,
2186		.flow_tag = MLX5_FS_DEFAULT_FLOW_TAG,
2187	};
2188	u32 action;
2189	int err = 0;
2190
2191	if (!is_valid_attr(flow_attr))
2192		return ERR_PTR(-EINVAL);
2193
2194	spec = mlx5_vzalloc(sizeof(*spec));
2195	handler = kzalloc(sizeof(*handler), GFP_KERNEL);
2196	if (!handler || !spec) {
2197		err = -ENOMEM;
2198		goto free;
2199	}
2200
2201	INIT_LIST_HEAD(&handler->list);
2202
2203	for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) {
2204		err = parse_flow_attr(spec->match_criteria,
2205				      spec->match_value, ib_flow);
2206		if (err < 0)
2207			goto free;
2208
2209		ib_flow += ((union ib_flow_spec *)ib_flow)->size;
2210	}
2211
2212	spec->match_criteria_enable = get_match_criteria_enable(spec->match_criteria);
2213	action = dst ? MLX5_FLOW_RULE_FWD_ACTION_DEST : 0;
2214	handler->rule = mlx5_add_flow_rule(ft, spec->match_criteria_enable,
2215					   spec->match_criteria,
2216					   spec->match_value,
2217					   action,
2218					   &flow_act,
2219					   dst);
2220
2221	if (IS_ERR(handler->rule)) {
2222		err = PTR_ERR(handler->rule);
2223		goto free;
2224	}
2225
2226	ft_prio->refcount++;
2227	handler->prio = ft_prio;
2228
2229	ft_prio->flow_table = ft;
2230free:
2231	if (err)
2232		kfree(handler);
2233	kvfree(spec);
2234	return err ? ERR_PTR(err) : handler;
2235}
2236
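/*
 * A don't-trap rule is installed as a pair: one rule without a destination
 * and one forwarding to the given destination, chained on the handler list
 * so both are removed together.
 */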
2237static struct mlx5_ib_flow_handler *create_dont_trap_rule(struct mlx5_ib_dev *dev,
2238							  struct mlx5_ib_flow_prio *ft_prio,
2239							  struct ib_flow_attr *flow_attr,
2240							  struct mlx5_flow_destination *dst)
2241{
2242	struct mlx5_ib_flow_handler *handler_dst = NULL;
2243	struct mlx5_ib_flow_handler *handler = NULL;
2244
2245	handler = create_flow_rule(dev, ft_prio, flow_attr, NULL);
2246	if (!IS_ERR(handler)) {
2247		handler_dst = create_flow_rule(dev, ft_prio,
2248					       flow_attr, dst);
2249		if (IS_ERR(handler_dst)) {
2250			mlx5_del_flow_rule(&handler->rule);
2251			ft_prio->refcount--;
2252			kfree(handler);
2253			handler = handler_dst;
2254		} else {
2255			list_add(&handler_dst->list, &handler->list);
2256		}
2257	}
2258
2259	return handler;
2260}

2261enum {
2262	LEFTOVERS_MC,
2263	LEFTOVERS_UC,
2264};
2265
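/*
 * Install the leftovers rules: a multicast catch-all and, for
 * IB_FLOW_ATTR_ALL_DEFAULT, an additional unicast catch-all chained on the
 * same handler.
 */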
2266static struct mlx5_ib_flow_handler *create_leftovers_rule(struct mlx5_ib_dev *dev,
2267							  struct mlx5_ib_flow_prio *ft_prio,
2268							  struct ib_flow_attr *flow_attr,
2269							  struct mlx5_flow_destination *dst)
2270{
2271	struct mlx5_ib_flow_handler *handler_ucast = NULL;
2272	struct mlx5_ib_flow_handler *handler = NULL;
2273
2274	static struct {
2275		struct ib_flow_attr	flow_attr;
2276		struct ib_flow_spec_eth eth_flow;
2277	} leftovers_specs[] = {
2278		[LEFTOVERS_MC] = {
2279			.flow_attr = {
2280				.num_of_specs = 1,
2281				.size = sizeof(leftovers_specs[0])
2282			},
2283			.eth_flow = {
2284				.type = IB_FLOW_SPEC_ETH,
2285				.size = sizeof(struct ib_flow_spec_eth),
2286				.mask = {.dst_mac = {0x1} },
2287				.val =  {.dst_mac = {0x1} }
2288			}
2289		},
2290		[LEFTOVERS_UC] = {
2291			.flow_attr = {
2292				.num_of_specs = 1,
2293				.size = sizeof(leftovers_specs[0])
2294			},
2295			.eth_flow = {
2296				.type = IB_FLOW_SPEC_ETH,
2297				.size = sizeof(struct ib_flow_spec_eth),
2298				.mask = {.dst_mac = {0x1} },
2299				.val = {.dst_mac = {} }
2300			}
2301		}
2302	};
2303
2304	handler = create_flow_rule(dev, ft_prio,
2305				   &leftovers_specs[LEFTOVERS_MC].flow_attr,
2306				   dst);
2307	if (!IS_ERR(handler) &&
2308	    flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT) {
2309		handler_ucast = create_flow_rule(dev, ft_prio,
2310						 &leftovers_specs[LEFTOVERS_UC].flow_attr,
2311						 dst);
2312		if (IS_ERR(handler_ucast)) {
2313			mlx5_del_flow_rule(&handler->rule);
2314			ft_prio->refcount--;
2315			kfree(handler);
2316			handler = handler_ucast;
2317		} else {
2318			list_add(&handler_ucast->list, &handler->list);
2319		}
2320	}
2321
2322	return handler;
2323}
2324
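/*
 * Sniffer rules are created as an RX/TX pair; the TX handler is chained to
 * the RX handler so a single destroy removes both.
 */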
2325static struct mlx5_ib_flow_handler *create_sniffer_rule(struct mlx5_ib_dev *dev,
2326							struct mlx5_ib_flow_prio *ft_rx,
2327							struct mlx5_ib_flow_prio *ft_tx,
2328							struct mlx5_flow_destination *dst)
2329{
2330	struct mlx5_ib_flow_handler *handler_rx;
2331	struct mlx5_ib_flow_handler *handler_tx;
2332	int err;
2333	static const struct ib_flow_attr flow_attr  = {
2334		.num_of_specs = 0,
2335		.type = IB_FLOW_ATTR_SNIFFER,
2336		.size = sizeof(flow_attr)
2337	};
2338
2339	handler_rx = create_flow_rule(dev, ft_rx, &flow_attr, dst);
2340	if (IS_ERR(handler_rx)) {
2341		err = PTR_ERR(handler_rx);
2342		goto err;
2343	}
2344
2345	handler_tx = create_flow_rule(dev, ft_tx, &flow_attr, dst);
2346	if (IS_ERR(handler_tx)) {
2347		err = PTR_ERR(handler_tx);
2348		goto err_tx;
2349	}
2350
2351	list_add(&handler_tx->list, &handler_rx->list);
2352
2353	return handler_rx;
2354
2355err_tx:
2356	mlx5_del_flow_rule(&handler_rx->rule);
2357	ft_rx->refcount--;
2358	kfree(handler_rx);
2359err:
2360	return ERR_PTR(err);
2361}
2362
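/*
 * uverbs flow creation entry point: validate the optional user command
 * buffer, look up the flow table for the attribute type and dispatch to the
 * normal, leftovers or sniffer rule constructors with the QP's TIR as the
 * destination.
 */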
2363static struct ib_flow *mlx5_ib_create_flow(struct ib_qp *qp,
2364					   struct ib_flow_attr *flow_attr,
2365					   int domain,
2366					   struct ib_udata *udata)
2367{
2368	struct mlx5_ib_dev *dev = to_mdev(qp->device);
2369	struct mlx5_ib_qp *mqp = to_mqp(qp);
2370	struct mlx5_ib_flow_handler *handler = NULL;
2371	struct mlx5_flow_destination *dst = NULL;
2372	struct mlx5_ib_flow_prio *ft_prio_tx = NULL;
2373	struct mlx5_ib_flow_prio *ft_prio;
2374	struct mlx5_ib_create_flow *ucmd = NULL, ucmd_hdr;
2375	size_t min_ucmd_sz, required_ucmd_sz;
2376	int err;
2377
2378	if (udata && udata->inlen) {
2379		min_ucmd_sz = offsetofend(struct mlx5_ib_create_flow, reserved);
2380		if (udata->inlen < min_ucmd_sz)
2381			return ERR_PTR(-EOPNOTSUPP);
2382
2383		err = ib_copy_from_udata(&ucmd_hdr, udata, min_ucmd_sz);
2384		if (err)
2385			return ERR_PTR(err);
2386
2387		/* Currently only a single counters data block is supported. */
2388		if (ucmd_hdr.ncounters_data > 1)
2389			return ERR_PTR(-EINVAL);
2390
2391		required_ucmd_sz = min_ucmd_sz +
2392			sizeof(struct mlx5_ib_flow_counters_data) *
2393			ucmd_hdr.ncounters_data;
2394		if (udata->inlen > required_ucmd_sz &&
2395		    !ib_is_udata_cleared(udata, required_ucmd_sz,
2396					 udata->inlen - required_ucmd_sz))
2397			return ERR_PTR(-EOPNOTSUPP);
2398
2399		ucmd = kzalloc(required_ucmd_sz, GFP_KERNEL);
2400		if (!ucmd)
2401			return ERR_PTR(-ENOMEM);
2402
2403		err = ib_copy_from_udata(ucmd, udata, required_ucmd_sz);
2404		if (err)
2405			goto free_ucmd;
2406	}
2407
2408	if (flow_attr->priority > MLX5_IB_FLOW_LAST_PRIO) {
2409		err = -ENOMEM;
2410		goto free_ucmd;
2411	}
2412
2413	if (flow_attr->flags & ~IB_FLOW_ATTR_FLAGS_DONT_TRAP) {
2414		err = -EINVAL;
2415		goto free_ucmd;
2416	}
2417
2418	dst = kzalloc(sizeof(*dst), GFP_KERNEL);
2419	if (!dst) {
2420		err = -ENOMEM;
2421		goto free_ucmd;
2422	}
2423
2424	mutex_lock(&dev->flow_db.lock);
2425
2426	ft_prio = get_flow_table(dev, flow_attr, MLX5_IB_FT_RX);
2427	if (IS_ERR(ft_prio)) {
2428		err = PTR_ERR(ft_prio);
2429		goto unlock;
2430	}
2431	if (flow_attr->type == IB_FLOW_ATTR_SNIFFER) {
2432		ft_prio_tx = get_flow_table(dev, flow_attr, MLX5_IB_FT_TX);
2433		if (IS_ERR(ft_prio_tx)) {
2434			err = PTR_ERR(ft_prio_tx);
2435			ft_prio_tx = NULL;
2436			goto destroy_ft;
2437		}
2438	}
2439
2440	dst->type = MLX5_FLOW_DESTINATION_TYPE_TIR;
2441	if (mqp->flags & MLX5_IB_QP_RSS)
2442		dst->tir_num = mqp->rss_qp.tirn;
2443	else
2444		dst->tir_num = mqp->raw_packet_qp.rq.tirn;
2445
2446	switch (flow_attr->type) {
2447	case IB_FLOW_ATTR_NORMAL:
2448		if (mqp->flags & IB_QP_CREATE_SOURCE_QPN) {
2449			err = -EOPNOTSUPP;
2450			goto destroy_ft;
2451		}
2452		if (flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP) {
2453			handler = create_dont_trap_rule(dev, ft_prio, flow_attr, dst);
2454		} else {
2455			handler = create_flow_rule(dev, ft_prio, flow_attr, dst);
2456		}
2457		break;
2458	case IB_FLOW_ATTR_ALL_DEFAULT:
2459	case IB_FLOW_ATTR_MC_DEFAULT:
2460		handler = create_leftovers_rule(dev, ft_prio, flow_attr, dst);
2461		break;
2462	case IB_FLOW_ATTR_SNIFFER:
2463		handler = create_sniffer_rule(dev, ft_prio, ft_prio_tx, dst);
2464		break;
2465	default:
2466		err = -EINVAL;
2467		goto destroy_ft;
2468	}
2469
2470	if (IS_ERR(handler)) {
2471		err = PTR_ERR(handler);
2472		handler = NULL;
2473		goto destroy_ft;
2474	}
2475
2476	mutex_unlock(&dev->flow_db.lock);
2477	kfree(dst);
2478	kfree(ucmd);
2479
2480	return &handler->ibflow;
2481
2482destroy_ft:
2483	put_flow_table(dev, ft_prio, false);
2484	if (ft_prio_tx)
2485		put_flow_table(dev, ft_prio_tx, false);
2486unlock:
2487	mutex_unlock(&dev->flow_db.lock);
2488	kfree(dst);
2489free_ucmd:
2490	kfree(ucmd);
2491	return ERR_PTR(err);
2492}
2493
2494static int mlx5_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
2495{
2496	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
2497	int err;
2498
2499	err = mlx5_core_attach_mcg(dev->mdev, gid, ibqp->qp_num);
2500	if (err)
2501		mlx5_ib_warn(dev, "failed attaching QPN 0x%x, MGID %pI6\n",
2502			     ibqp->qp_num, gid->raw);
2503
2504	return err;
2505}
2506
2507static int mlx5_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
2508{
2509	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
2510	int err;
2511
2512	err = mlx5_core_detach_mcg(dev->mdev, gid, ibqp->qp_num);
2513	if (err)
2514		mlx5_ib_warn(dev, "failed detaching QPN 0x%x, MGID %pI6\n",
2515			     ibqp->qp_num, gid->raw);
2516
2517	return err;
2518}
2519
2520static int init_node_data(struct mlx5_ib_dev *dev)
2521{
2522	int err;
2523
2524	err = mlx5_query_node_desc(dev, dev->ib_dev.node_desc);
2525	if (err)
2526		return err;
2527
2528	return mlx5_query_node_guid(dev, &dev->ib_dev.node_guid);
2529}
2530
2531static ssize_t show_fw_pages(struct device *device, struct device_attribute *attr,
2532			     char *buf)
2533{
2534	struct mlx5_ib_dev *dev =
2535		container_of(device, struct mlx5_ib_dev, ib_dev.dev);
2536
2537	return sprintf(buf, "%lld\n", (long long)dev->mdev->priv.fw_pages);
2538}
2539
2540static ssize_t show_reg_pages(struct device *device,
2541			      struct device_attribute *attr, char *buf)
2542{
2543	struct mlx5_ib_dev *dev =
2544		container_of(device, struct mlx5_ib_dev, ib_dev.dev);
2545
2546	return sprintf(buf, "%d\n", atomic_read(&dev->mdev->priv.reg_pages));
2547}
2548
2549static ssize_t show_hca(struct device *device, struct device_attribute *attr,
2550			char *buf)
2551{
2552	struct mlx5_ib_dev *dev =
2553		container_of(device, struct mlx5_ib_dev, ib_dev.dev);
2554	return sprintf(buf, "MT%d\n", dev->mdev->pdev->device);
2555}
2556
2557static ssize_t show_rev(struct device *device, struct device_attribute *attr,
2558			char *buf)
2559{
2560	struct mlx5_ib_dev *dev =
2561		container_of(device, struct mlx5_ib_dev, ib_dev.dev);
2562	return sprintf(buf, "%x\n", dev->mdev->pdev->revision);
2563}
2564
2565static ssize_t show_board(struct device *device, struct device_attribute *attr,
2566			  char *buf)
2567{
2568	struct mlx5_ib_dev *dev =
2569		container_of(device, struct mlx5_ib_dev, ib_dev.dev);
2570	return sprintf(buf, "%.*s\n", MLX5_BOARD_ID_LEN,
2571		       dev->mdev->board_id);
2572}
2573
2574static DEVICE_ATTR(hw_rev,   S_IRUGO, show_rev,    NULL);
2575static DEVICE_ATTR(hca_type, S_IRUGO, show_hca,    NULL);
2576static DEVICE_ATTR(board_id, S_IRUGO, show_board,  NULL);
2577static DEVICE_ATTR(fw_pages, S_IRUGO, show_fw_pages, NULL);
2578static DEVICE_ATTR(reg_pages, S_IRUGO, show_reg_pages, NULL);
2579
2580static struct device_attribute *mlx5_class_attributes[] = {
2581	&dev_attr_hw_rev,
2582	&dev_attr_hca_type,
2583	&dev_attr_board_id,
2584	&dev_attr_fw_pages,
2585	&dev_attr_reg_pages,
2586};
2587
2588static void pkey_change_handler(struct work_struct *work)
2589{
2590	struct mlx5_ib_port_resources *ports =
2591		container_of(work, struct mlx5_ib_port_resources,
2592			     pkey_change_work);
2593
2594	mutex_lock(&ports->devr->mutex);
2595	mlx5_ib_gsi_pkey_change(ports->gsi);
2596	mutex_unlock(&ports->devr->mutex);
2597}
2598
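/*
 * On a fatal device error, walk every QP that still has posted work and
 * invoke the completion handlers of its send/receive CQs so that consumers
 * waiting on those CQs are not left hanging.
 */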
2599static void mlx5_ib_handle_internal_error(struct mlx5_ib_dev *ibdev)
2600{
2601	struct mlx5_ib_qp *mqp;
2602	struct mlx5_ib_cq *send_mcq, *recv_mcq;
2603	struct mlx5_core_cq *mcq;
2604	struct list_head cq_armed_list;
2605	unsigned long flags_qp;
2606	unsigned long flags_cq;
2607	unsigned long flags;
2608
2609	INIT_LIST_HEAD(&cq_armed_list);
2610
2611	/* Walk the QP list of this ibdev, synchronized against QP create/destroy. */
2612	spin_lock_irqsave(&ibdev->reset_flow_resource_lock, flags);
2613	list_for_each_entry(mqp, &ibdev->qp_list, qps_list) {
2614		spin_lock_irqsave(&mqp->sq.lock, flags_qp);
2615		if (mqp->sq.tail != mqp->sq.head) {
2616			send_mcq = to_mcq(mqp->ibqp.send_cq);
2617			spin_lock_irqsave(&send_mcq->lock, flags_cq);
2618			if (send_mcq->mcq.comp &&
2619			    mqp->ibqp.send_cq->comp_handler) {
2620				if (!send_mcq->mcq.reset_notify_added) {
2621					send_mcq->mcq.reset_notify_added = 1;
2622					list_add_tail(&send_mcq->mcq.reset_notify,
2623						      &cq_armed_list);
2624				}
2625			}
2626			spin_unlock_irqrestore(&send_mcq->lock, flags_cq);
2627		}
2628		spin_unlock_irqrestore(&mqp->sq.lock, flags_qp);
2629		spin_lock_irqsave(&mqp->rq.lock, flags_qp);
2630		/* no handling is needed for SRQ */
2631		if (!mqp->ibqp.srq) {
2632			if (mqp->rq.tail != mqp->rq.head) {
2633				recv_mcq = to_mcq(mqp->ibqp.recv_cq);
2634				spin_lock_irqsave(&recv_mcq->lock, flags_cq);
2635				if (recv_mcq->mcq.comp &&
2636				    mqp->ibqp.recv_cq->comp_handler) {
2637					if (!recv_mcq->mcq.reset_notify_added) {
2638						recv_mcq->mcq.reset_notify_added = 1;
2639						list_add_tail(&recv_mcq->mcq.reset_notify,
2640							      &cq_armed_list);
2641					}
2642				}
2643				spin_unlock_irqrestore(&recv_mcq->lock,
2644						       flags_cq);
2645			}
2646		}
2647		spin_unlock_irqrestore(&mqp->rq.lock, flags_qp);
2648	}
2649	/* At this point all in-flight posted sends have been flushed by the
2650	 * lock/unlock sequence above. Now arm all involved CQs.
2651	 */
2652	list_for_each_entry(mcq, &cq_armed_list, reset_notify) {
2653		mcq->comp(mcq, NULL);
2654	}
2655	spin_unlock_irqrestore(&ibdev->reset_flow_resource_lock, flags);
2656}
2657
2658static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context,
2659			  enum mlx5_dev_event event, unsigned long param)
2660{
2661	struct mlx5_ib_dev *ibdev = (struct mlx5_ib_dev *)context;
2662	struct ib_event ibev;
2663	bool fatal = false;
2664	u8 port = (u8)param;
2665
2666	switch (event) {
2667	case MLX5_DEV_EVENT_SYS_ERROR:
2668		ibev.event = IB_EVENT_DEVICE_FATAL;
2669		mlx5_ib_handle_internal_error(ibdev);
2670		fatal = true;
2671		break;
2672
2673	case MLX5_DEV_EVENT_PORT_UP:
2674	case MLX5_DEV_EVENT_PORT_DOWN:
2675	case MLX5_DEV_EVENT_PORT_INITIALIZED:
2676		/* In RoCE, port up/down events are handled in
2677		 * mlx5_netdev_event().
2678		 */
2679		if (mlx5_ib_port_link_layer(&ibdev->ib_dev, port) ==
2680			IB_LINK_LAYER_ETHERNET)
2681			return;
2682
2683		ibev.event = (event == MLX5_DEV_EVENT_PORT_UP) ?
2684			     IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR;
2685		break;
2686
2687	case MLX5_DEV_EVENT_LID_CHANGE:
2688		ibev.event = IB_EVENT_LID_CHANGE;
2689		break;
2690
2691	case MLX5_DEV_EVENT_PKEY_CHANGE:
2692		ibev.event = IB_EVENT_PKEY_CHANGE;
2693
2694		schedule_work(&ibdev->devr.ports[port - 1].pkey_change_work);
2695		break;
2696
2697	case MLX5_DEV_EVENT_GUID_CHANGE:
2698		ibev.event = IB_EVENT_GID_CHANGE;
2699		break;
2700
2701	case MLX5_DEV_EVENT_CLIENT_REREG:
2702		ibev.event = IB_EVENT_CLIENT_REREGISTER;
2703		break;
2704
2705	default:
2706		/* unsupported event */
2707		return;
2708	}
2709
2710	ibev.device	      = &ibdev->ib_dev;
2711	ibev.element.port_num = port;
2712
2713	if (!rdma_is_port_valid(&ibdev->ib_dev, port)) {
2714		mlx5_ib_warn(ibdev, "warning: event(%d) on port %d\n", event, port);
2715		return;
2716	}
2717
2718	if (ibdev->ib_active)
2719		ib_dispatch_event(&ibev);
2720
2721	if (fatal)
2722		ibdev->ib_active = false;
2723}
2724
2725static void get_ext_port_caps(struct mlx5_ib_dev *dev)
2726{
2727	int port;
2728
2729	for (port = 1; port <= MLX5_CAP_GEN(dev->mdev, num_ports); port++)
2730		mlx5_query_ext_port_caps(dev, port);
2731}
2732
2733static int get_port_caps(struct mlx5_ib_dev *dev)
2734{
2735	struct ib_device_attr *dprops = NULL;
2736	struct ib_port_attr *pprops = NULL;
2737	int err = -ENOMEM;
2738	int port;
2739	struct ib_udata uhw = {.inlen = 0, .outlen = 0};
2740
2741	pprops = kmalloc(sizeof(*pprops), GFP_KERNEL);
2742	if (!pprops)
2743		goto out;
2744
2745	dprops = kmalloc(sizeof(*dprops), GFP_KERNEL);
2746	if (!dprops)
2747		goto out;
2748
2749	err = mlx5_ib_query_device(&dev->ib_dev, dprops, &uhw);
2750	if (err) {
2751		mlx5_ib_warn(dev, "query_device failed %d\n", err);
2752		goto out;
2753	}
2754
2755	for (port = 1; port <= MLX5_CAP_GEN(dev->mdev, num_ports); port++) {
2756		err = mlx5_ib_query_port(&dev->ib_dev, port, pprops);
2757		if (err) {
2758			mlx5_ib_warn(dev, "query_port %d failed %d\n",
2759				     port, err);
2760			break;
2761		}
2762		dev->mdev->port_caps[port - 1].pkey_table_len =
2763						dprops->max_pkeys;
2764		dev->mdev->port_caps[port - 1].gid_table_len =
2765						pprops->gid_tbl_len;
2766		mlx5_ib_dbg(dev, "pkey_table_len %d, gid_table_len %d\n",
2767			    dprops->max_pkeys, pprops->gid_tbl_len);
2768	}
2769
2770out:
2771	kfree(pprops);
2772	kfree(dprops);
2773
2774	return err;
2775}
2776
2777static void destroy_umrc_res(struct mlx5_ib_dev *dev)
2778{
2779	int err;
2780
2781	err = mlx5_mr_cache_cleanup(dev);
2782	if (err)
2783		mlx5_ib_warn(dev, "mr cache cleanup failed\n");
2784
2785	if (dev->umrc.qp)
2786		mlx5_ib_destroy_qp(dev->umrc.qp, NULL);
2787	if (dev->umrc.cq)
2788		ib_free_cq(dev->umrc.cq);
2789	if (dev->umrc.pd)
2790		ib_dealloc_pd(dev->umrc.pd);
2791}
2792
2793enum {
2794	MAX_UMR_WR = 128,
2795};
2796
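/*
 * Create the resources used for UMR operations: a dedicated PD, CQ and a
 * MLX5_IB_QPT_REG_UMR QP moved through INIT/RTR/RTS, followed by the MR
 * cache.
 */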
2797static int create_umr_res(struct mlx5_ib_dev *dev)
2798{
2799	struct ib_qp_init_attr *init_attr = NULL;
2800	struct ib_qp_attr *attr = NULL;
2801	struct ib_pd *pd;
2802	struct ib_cq *cq;
2803	struct ib_qp *qp;
2804	int ret;
2805
2806	attr = kzalloc(sizeof(*attr), GFP_KERNEL);
2807	init_attr = kzalloc(sizeof(*init_attr), GFP_KERNEL);
2808	if (!attr || !init_attr) {
2809		ret = -ENOMEM;
2810		goto error_0;
2811	}
2812
2813	pd = ib_alloc_pd(&dev->ib_dev, 0);
2814	if (IS_ERR(pd)) {
2815		mlx5_ib_dbg(dev, "Couldn't create PD for sync UMR QP\n");
2816		ret = PTR_ERR(pd);
2817		goto error_0;
2818	}
2819
2820	cq = ib_alloc_cq(&dev->ib_dev, NULL, 128, 0, IB_POLL_SOFTIRQ);
2821	if (IS_ERR(cq)) {
2822		mlx5_ib_dbg(dev, "Couldn't create CQ for sync UMR QP\n");
2823		ret = PTR_ERR(cq);
2824		goto error_2;
2825	}
2826
2827	init_attr->send_cq = cq;
2828	init_attr->recv_cq = cq;
2829	init_attr->sq_sig_type = IB_SIGNAL_ALL_WR;
2830	init_attr->cap.max_send_wr = MAX_UMR_WR;
2831	init_attr->cap.max_send_sge = 1;
2832	init_attr->qp_type = MLX5_IB_QPT_REG_UMR;
2833	init_attr->port_num = 1;
2834	qp = mlx5_ib_create_qp(pd, init_attr, NULL);
2835	if (IS_ERR(qp)) {
2836		mlx5_ib_dbg(dev, "Couldn't create sync UMR QP\n");
2837		ret = PTR_ERR(qp);
2838		goto error_3;
2839	}
2840	qp->device     = &dev->ib_dev;
2841	qp->real_qp    = qp;
2842	qp->uobject    = NULL;
2843	qp->qp_type    = MLX5_IB_QPT_REG_UMR;
2844
2845	attr->qp_state = IB_QPS_INIT;
2846	attr->port_num = 1;
2847	ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_PKEY_INDEX |
2848				IB_QP_PORT, NULL);
2849	if (ret) {
2850		mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n");
2851		goto error_4;
2852	}
2853
2854	memset(attr, 0, sizeof(*attr));
2855	attr->qp_state = IB_QPS_RTR;
2856	attr->path_mtu = IB_MTU_256;
2857
2858	ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL);
2859	if (ret) {
2860		mlx5_ib_dbg(dev, "Couldn't modify umr QP to rtr\n");
2861		goto error_4;
2862	}
2863
2864	memset(attr, 0, sizeof(*attr));
2865	attr->qp_state = IB_QPS_RTS;
2866	ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL);
2867	if (ret) {
2868		mlx5_ib_dbg(dev, "Couldn't modify umr QP to rts\n");
2869		goto error_4;
2870	}
2871
2872	dev->umrc.qp = qp;
2873	dev->umrc.cq = cq;
2874	dev->umrc.pd = pd;
2875
2876	sema_init(&dev->umrc.sem, MAX_UMR_WR);
2877	ret = mlx5_mr_cache_init(dev);
2878	if (ret) {
2879		mlx5_ib_warn(dev, "mr cache init failed %d\n", ret);
2880		goto error_4;
2881	}
2882
2883	kfree(attr);
2884	kfree(init_attr);
2885
2886	return 0;
2887
2888error_4:
2889	mlx5_ib_destroy_qp(qp, NULL);
2890	dev->umrc.qp = NULL;
2891
2892error_3:
2893	ib_free_cq(cq);
2894	dev->umrc.cq = NULL;
2895
2896error_2:
2897	ib_dealloc_pd(pd);
2898	dev->umrc.pd = NULL;
2899
2900error_0:
2901	kfree(attr);
2902	kfree(init_attr);
2903	return ret;
2904}
2905
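/*
 * Create the device-global verbs resources used internally by the driver:
 * a PD, a CQ, two XRC domains and the XRC/basic SRQs, plus the per-port
 * P_Key change work items.
 */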
2906static int create_dev_resources(struct mlx5_ib_resources *devr)
2907{
2908	struct ib_srq_init_attr attr;
2909	struct mlx5_ib_dev *dev;
2910	struct ib_device *ibdev;
2911	struct ib_cq_init_attr cq_attr = {.cqe = 1};
2912	int port;
2913	int ret = 0;
2914
2915	dev = container_of(devr, struct mlx5_ib_dev, devr);
2916	ibdev = &dev->ib_dev;
2917
2918	mutex_init(&devr->mutex);
2919
2920	devr->p0 = rdma_zalloc_drv_obj(ibdev, ib_pd);
2921	if (!devr->p0)
2922		return -ENOMEM;
2923
2924	devr->p0->device  = ibdev;
2925	devr->p0->uobject = NULL;
2926	atomic_set(&devr->p0->usecnt, 0);
2927
2928	ret = mlx5_ib_alloc_pd(devr->p0, NULL);
2929	if (ret)
2930		goto error0;
2931
2932	devr->c0 = rdma_zalloc_drv_obj(ibdev, ib_cq);
2933	if (!devr->c0) {
2934		ret = -ENOMEM;
2935		goto error1;
2936	}
2937
2938	devr->c0->device = &dev->ib_dev;
2939	atomic_set(&devr->c0->usecnt, 0);
2940
2941	ret = mlx5_ib_create_cq(devr->c0, &cq_attr, NULL);
2942	if (ret)
2943		goto err_create_cq;
2944
2945	devr->x0 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL);
2946	if (IS_ERR(devr->x0)) {
2947		ret = PTR_ERR(devr->x0);
2948		goto error2;
2949	}
2950	devr->x0->device = &dev->ib_dev;
2951	devr->x0->inode = NULL;
2952	atomic_set(&devr->x0->usecnt, 0);
2953	mutex_init(&devr->x0->tgt_qp_mutex);
2954	INIT_LIST_HEAD(&devr->x0->tgt_qp_list);
2955
2956	devr->x1 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL);
2957	if (IS_ERR(devr->x1)) {
2958		ret = PTR_ERR(devr->x1);
2959		goto error3;
2960	}
2961	devr->x1->device = &dev->ib_dev;
2962	devr->x1->inode = NULL;
2963	atomic_set(&devr->x1->usecnt, 0);
2964	mutex_init(&devr->x1->tgt_qp_mutex);
2965	INIT_LIST_HEAD(&devr->x1->tgt_qp_list);
2966
2967	memset(&attr, 0, sizeof(attr));
2968	attr.attr.max_sge = 1;
2969	attr.attr.max_wr = 1;
2970	attr.srq_type = IB_SRQT_XRC;
2971	attr.ext.cq = devr->c0;
2972	attr.ext.xrc.xrcd = devr->x0;
2973
2974	devr->s0 = rdma_zalloc_drv_obj(ibdev, ib_srq);
2975	if (!devr->s0) {
2976		ret = -ENOMEM;
2977		goto error4;
2978	}
2979
2980	devr->s0->device	= &dev->ib_dev;
2981	devr->s0->pd		= devr->p0;
2982	devr->s0->srq_type      = IB_SRQT_XRC;
2983	devr->s0->ext.xrc.xrcd	= devr->x0;
2984	devr->s0->ext.cq	= devr->c0;
2985	ret = mlx5_ib_create_srq(devr->s0, &attr, NULL);
2986	if (ret)
2987		goto err_create;
2988
2989	atomic_inc(&devr->s0->ext.xrc.xrcd->usecnt);
2990	atomic_inc(&devr->s0->ext.cq->usecnt);
2991	atomic_inc(&devr->p0->usecnt);
2992	atomic_set(&devr->s0->usecnt, 0);
2993
2994	memset(&attr, 0, sizeof(attr));
2995	attr.attr.max_sge = 1;
2996	attr.attr.max_wr = 1;
2997	attr.srq_type = IB_SRQT_BASIC;
2998	devr->s1 = rdma_zalloc_drv_obj(ibdev, ib_srq);
2999	if (!devr->s1) {
3000		ret = -ENOMEM;
3001		goto error5;
3002	}
3003
3004	devr->s1->device	= &dev->ib_dev;
3005	devr->s1->pd		= devr->p0;
3006	devr->s1->srq_type      = IB_SRQT_BASIC;
3007	devr->s1->ext.cq	= devr->c0;
3008
3009	ret = mlx5_ib_create_srq(devr->s1, &attr, NULL);
3010	if (ret)
3011		goto error6;
3012
3013	atomic_inc(&devr->p0->usecnt);
3014	atomic_set(&devr->s1->usecnt, 0);
3015
3016	for (port = 0; port < ARRAY_SIZE(devr->ports); ++port) {
3017		INIT_WORK(&devr->ports[port].pkey_change_work,
3018			  pkey_change_handler);
3019		devr->ports[port].devr = devr;
3020	}
3021
3022	return 0;
3023
3024error6:
3025	kfree(devr->s1);
3026error5:
3027	mlx5_ib_destroy_srq(devr->s0, NULL);
3028err_create:
3029	kfree(devr->s0);
3030error4:
3031	mlx5_ib_dealloc_xrcd(devr->x1, NULL);
3032error3:
3033	mlx5_ib_dealloc_xrcd(devr->x0, NULL);
3034error2:
3035	mlx5_ib_destroy_cq(devr->c0, NULL);
3036err_create_cq:
3037	kfree(devr->c0);
3038error1:
3039	mlx5_ib_dealloc_pd(devr->p0, NULL);
3040error0:
3041	kfree(devr->p0);
3042	return ret;
3043}
3044
3045static void destroy_dev_resources(struct mlx5_ib_resources *devr)
3046{
3047	int port;
3048
3049	mlx5_ib_destroy_srq(devr->s1, NULL);
3050	kfree(devr->s1);
3051	mlx5_ib_destroy_srq(devr->s0, NULL);
3052	kfree(devr->s0);
3053	mlx5_ib_dealloc_xrcd(devr->x0, NULL);
3054	mlx5_ib_dealloc_xrcd(devr->x1, NULL);
3055	mlx5_ib_destroy_cq(devr->c0, NULL);
3056	kfree(devr->c0);
3057	mlx5_ib_dealloc_pd(devr->p0, NULL);
3058	kfree(devr->p0);
3059
3060	/* Make sure no P_Key change work items are still executing */
3061	for (port = 0; port < ARRAY_SIZE(devr->ports); ++port)
3062		cancel_work_sync(&devr->ports[port].pkey_change_work);
3063}
3064
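/*
 * Report the core port capabilities: plain IB on an InfiniBand link layer;
 * otherwise RoCE v1/v2 flags according to the ROCE capabilities, with both
 * IPv4 and IPv6 L3 support required.
 */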
3065static u32 get_core_cap_flags(struct ib_device *ibdev)
3066{
3067	struct mlx5_ib_dev *dev = to_mdev(ibdev);
3068	enum rdma_link_layer ll = mlx5_ib_port_link_layer(ibdev, 1);
3069	u8 l3_type_cap = MLX5_CAP_ROCE(dev->mdev, l3_type);
3070	u8 roce_version_cap = MLX5_CAP_ROCE(dev->mdev, roce_version);
3071	u32 ret = 0;
3072
3073	if (ll == IB_LINK_LAYER_INFINIBAND)
3074		return RDMA_CORE_PORT_IBA_IB;
3075
3076	if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV4_CAP))
3077		return 0;
3078
3079	if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV6_CAP))
3080		return 0;
3081
3082	if (roce_version_cap & MLX5_ROCE_VERSION_1_CAP)
3083		ret |= RDMA_CORE_PORT_IBA_ROCE;
3084
3085	if (roce_version_cap & MLX5_ROCE_VERSION_2_CAP)
3086		ret |= RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP;
3087
3088	return ret;
3089}
3090
3091static int mlx5_port_immutable(struct ib_device *ibdev, u8 port_num,
3092			       struct ib_port_immutable *immutable)
3093{
3094	struct ib_port_attr attr;
3095	struct mlx5_ib_dev *dev = to_mdev(ibdev);
3096	enum rdma_link_layer ll = mlx5_ib_port_link_layer(ibdev, port_num);
3097	int err;
3098
3099	err = mlx5_ib_query_port(ibdev, port_num, &attr);
3100	if (err)
3101		return err;
3102
3103	immutable->pkey_tbl_len = attr.pkey_tbl_len;
3104	immutable->gid_tbl_len = attr.gid_tbl_len;
3105	immutable->core_cap_flags = get_core_cap_flags(ibdev);
3106	if ((ll == IB_LINK_LAYER_INFINIBAND) || MLX5_CAP_GEN(dev->mdev, roce))
3107		immutable->max_mad_size = IB_MGMT_MAD_SIZE;
3108
3109	return 0;
3110}
3111
3112static void get_dev_fw_str(struct ib_device *ibdev, char *str,
3113			   size_t str_len)
3114{
3115	struct mlx5_ib_dev *dev =
3116		container_of(ibdev, struct mlx5_ib_dev, ib_dev);
3117	snprintf(str, str_len, "%d.%d.%04d", fw_rev_maj(dev->mdev),
3118		       fw_rev_min(dev->mdev), fw_rev_sub(dev->mdev));
3119}
3120
3121static int mlx5_roce_lag_init(struct mlx5_ib_dev *dev)
3122{
3123	return 0;
3124}
3125
3126static void mlx5_roce_lag_cleanup(struct mlx5_ib_dev *dev)
3127{
3128}
3129
3130static void mlx5_remove_roce_notifier(struct mlx5_ib_dev *dev)
3131{
3132	if (dev->roce.nb.notifier_call) {
3133		unregister_netdevice_notifier(&dev->roce.nb);
3134		dev->roce.nb.notifier_call = NULL;
3135	}
3136}
3137
3138static int
3139mlx5_enable_roce_if_cb(if_t ifp, void *arg)
3140{
3141	struct mlx5_ib_dev *dev = arg;
3142
3143	/* check if network interface belongs to mlx5en */
3144	if (!mlx5_netdev_match(ifp, dev->mdev, "mce"))
3145		return (0);
3146
3147	write_lock(&dev->roce.netdev_lock);
3148	dev->roce.netdev = ifp;
3149	write_unlock(&dev->roce.netdev_lock);
3150
3151	return (0);
3152}
3153
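/*
 * Enable RoCE: locate an already attached mlx5en interface belonging to this
 * core device, register the netdev event notifier and, when the capability
 * is present, enable RoCE on the NIC vport.
 */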
3154static int mlx5_enable_roce(struct mlx5_ib_dev *dev)
3155{
3156	struct epoch_tracker et;
3157	VNET_ITERATOR_DECL(vnet_iter);
3158	int err;
3159
3160	/* Check if mlx5en net device already exists */
3161	VNET_LIST_RLOCK();
3162	NET_EPOCH_ENTER(et);
3163	VNET_FOREACH(vnet_iter) {
3164		CURVNET_SET_QUIET(vnet_iter);
3165		if_foreach(mlx5_enable_roce_if_cb, dev);
3166		CURVNET_RESTORE();
3167	}
3168	NET_EPOCH_EXIT(et);
3169	VNET_LIST_RUNLOCK();
3170
3171	dev->roce.nb.notifier_call = mlx5_netdev_event;
3172	err = register_netdevice_notifier(&dev->roce.nb);
3173	if (err) {
3174		dev->roce.nb.notifier_call = NULL;
3175		return err;
3176	}
3177
3178	if (MLX5_CAP_GEN(dev->mdev, roce)) {
3179		err = mlx5_nic_vport_enable_roce(dev->mdev);
3180		if (err)
3181			goto err_unregister_netdevice_notifier;
3182	}
3183
3184	err = mlx5_roce_lag_init(dev);
3185	if (err)
3186		goto err_disable_roce;
3187
3188	return 0;
3189
3190err_disable_roce:
3191	if (MLX5_CAP_GEN(dev->mdev, roce))
3192		mlx5_nic_vport_disable_roce(dev->mdev);
3193
3194err_unregister_netdevice_notifier:
3195	mlx5_remove_roce_notifier(dev);
3196	return err;
3197}
3198
3199static void mlx5_disable_roce(struct mlx5_ib_dev *dev)
3200{
3201	mlx5_roce_lag_cleanup(dev);
3202	if (MLX5_CAP_GEN(dev->mdev, roce))
3203		mlx5_nic_vport_disable_roce(dev->mdev);
3204}
3205
3206static void mlx5_ib_dealloc_q_port_counter(struct mlx5_ib_dev *dev, u8 port_num)
3207{
3208	mlx5_vport_dealloc_q_counter(dev->mdev,
3209				     MLX5_INTERFACE_PROTOCOL_IB,
3210				     dev->port[port_num].q_cnt_id);
3211	dev->port[port_num].q_cnt_id = 0;
3212}
3213
3214static void mlx5_ib_dealloc_q_counters(struct mlx5_ib_dev *dev)
3215{
3216	unsigned int i;
3217
3218	for (i = 0; i < dev->num_ports; i++)
3219		mlx5_ib_dealloc_q_port_counter(dev, i);
3220}
3221
3222static int mlx5_ib_alloc_q_counters(struct mlx5_ib_dev *dev)
3223{
3224	int i;
3225	int ret;
3226
3227	for (i = 0; i < dev->num_ports; i++) {
3228		ret = mlx5_vport_alloc_q_counter(dev->mdev,
3229						 MLX5_INTERFACE_PROTOCOL_IB,
3230						 &dev->port[i].q_cnt_id);
3231		if (ret) {
3232			mlx5_ib_warn(dev,
3233				     "couldn't allocate queue counter for port %d, err %d\n",
3234				     i + 1, ret);
3235			goto dealloc_counters;
3236		}
3237	}
3238
3239	return 0;
3240
3241dealloc_counters:
3242	while (--i >= 0)
3243		mlx5_ib_dealloc_q_port_counter(dev, i);
3244
3245	return ret;
3246}
3247
3248static const char * const names[] = {
3249	"rx_write_requests",
3250	"rx_read_requests",
3251	"rx_atomic_requests",
3252	"out_of_buffer",
3253	"out_of_sequence",
3254	"duplicate_request",
3255	"rnr_nak_retry_err",
3256	"packet_seq_err",
3257	"implied_nak_seq_err",
3258	"local_ack_timeout_err",
3259};
3260
3261static const size_t stats_offsets[] = {
3262	MLX5_BYTE_OFF(query_q_counter_out, rx_write_requests),
3263	MLX5_BYTE_OFF(query_q_counter_out, rx_read_requests),
3264	MLX5_BYTE_OFF(query_q_counter_out, rx_atomic_requests),
3265	MLX5_BYTE_OFF(query_q_counter_out, out_of_buffer),
3266	MLX5_BYTE_OFF(query_q_counter_out, out_of_sequence),
3267	MLX5_BYTE_OFF(query_q_counter_out, duplicate_request),
3268	MLX5_BYTE_OFF(query_q_counter_out, rnr_nak_retry_err),
3269	MLX5_BYTE_OFF(query_q_counter_out, packet_seq_err),
3270	MLX5_BYTE_OFF(query_q_counter_out, implied_nak_seq_err),
3271	MLX5_BYTE_OFF(query_q_counter_out, local_ack_timeout_err),
3272};
3273
3274static struct rdma_hw_stats *mlx5_ib_alloc_hw_stats(struct ib_device *ibdev,
3275						    u8 port_num)
3276{
3277	BUILD_BUG_ON(ARRAY_SIZE(names) != ARRAY_SIZE(stats_offsets));
3278
3279	/* We support only per-port stats */
3280	if (port_num == 0)
3281		return NULL;
3282
3283	return rdma_alloc_hw_stats_struct(names, ARRAY_SIZE(names),
3284					  RDMA_HW_STATS_DEFAULT_LIFESPAN);
3285}
3286
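/*
 * Query the port's Q counter and translate the firmware output into the
 * rdma_hw_stats array using the offsets table above.
 */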
3287static int mlx5_ib_get_hw_stats(struct ib_device *ibdev,
3288				struct rdma_hw_stats *stats,
3289				u8 port, int index)
3290{
3291	struct mlx5_ib_dev *dev = to_mdev(ibdev);
3292	int outlen = MLX5_ST_SZ_BYTES(query_q_counter_out);
3293	void *out;
3294	__be32 val;
3295	int ret;
3296	int i;
3297
3298	if (!port || !stats)
3299		return -ENOSYS;
3300
3301	out = mlx5_vzalloc(outlen);
3302	if (!out)
3303		return -ENOMEM;
3304
3305	ret = mlx5_vport_query_q_counter(dev->mdev,
3306					dev->port[port - 1].q_cnt_id, 0,
3307					out, outlen);
3308	if (ret)
3309		goto free;
3310
3311	for (i = 0; i < ARRAY_SIZE(names); i++) {
3312		val = *(__be32 *)(out + stats_offsets[i]);
3313		stats->value[i] = (u64)be32_to_cpu(val);
3314	}
3315free:
3316	kvfree(out);
3317	return ARRAY_SIZE(names);
3318}
3319
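/*
 * Allocate the driver-owned doorbell/blue-flame registers: a regular bfreg,
 * a fast-path bfreg and a write-combining bfreg, releasing earlier ones on
 * failure.
 */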
3320static int mlx5_ib_stage_bfreg_init(struct mlx5_ib_dev *dev)
3321{
3322	int err;
3323
3324	err = mlx5_alloc_bfreg(dev->mdev, &dev->bfreg, false, false);
3325	if (err)
3326		return err;
3327
3328	err = mlx5_alloc_bfreg(dev->mdev, &dev->fp_bfreg, false, true);
3329	if (err) {
3330		mlx5_free_bfreg(dev->mdev, &dev->bfreg);
3331		return err;
3332	}
3333
3334	err = mlx5_alloc_bfreg(dev->mdev, &dev->wc_bfreg, true, false);
3335	if (err) {
3336		mlx5_free_bfreg(dev->mdev, &dev->fp_bfreg);
3337		mlx5_free_bfreg(dev->mdev, &dev->bfreg);
3338	}
3339
3340	return err;
3341}
3342
3343static void mlx5_ib_stage_bfreg_cleanup(struct mlx5_ib_dev *dev)
3344{
3345	mlx5_free_bfreg(dev->mdev, &dev->wc_bfreg);
3346	mlx5_free_bfreg(dev->mdev, &dev->fp_bfreg);
3347	mlx5_free_bfreg(dev->mdev, &dev->bfreg);
3348}
3349
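/*
 * mlx5 core "add" callback: allocate and populate the ib_device, enable RoCE
 * when the port type is Ethernet, create the device resources, Q counters,
 * bfregs and UMR resources, then register the device and its sysfs
 * attributes and congestion control entries.
 */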
3350static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
3351{
3352	struct mlx5_ib_dev *dev;
3353	enum rdma_link_layer ll;
3354	int port_type_cap;
3355	int err;
3356	int i;
3357
3358	port_type_cap = MLX5_CAP_GEN(mdev, port_type);
3359	ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap);
3360
3361	dev = (struct mlx5_ib_dev *)ib_alloc_device(sizeof(*dev));
3362	if (!dev)
3363		return NULL;
3364
3365	dev->mdev = mdev;
3366
3367	dev->port = kcalloc(MLX5_CAP_GEN(mdev, num_ports), sizeof(*dev->port),
3368			    GFP_KERNEL);
3369	if (!dev->port)
3370		goto err_dealloc;
3371
3372	rwlock_init(&dev->roce.netdev_lock);
3373	err = get_port_caps(dev);
3374	if (err)
3375		goto err_free_port;
3376
3377	if (mlx5_use_mad_ifc(dev))
3378		get_ext_port_caps(dev);
3379
3380	MLX5_INIT_DOORBELL_LOCK(&dev->uar_lock);
3381
3382	mutex_init(&dev->lb_mutex);
3383
3384	INIT_IB_DEVICE_OPS(&dev->ib_dev.ops, mlx5, MLX5);
3385	snprintf(dev->ib_dev.name, IB_DEVICE_NAME_MAX, "mlx5_%d", device_get_unit(mdev->pdev->dev.bsddev));
3386	dev->ib_dev.owner		= THIS_MODULE;
3387	dev->ib_dev.node_type		= RDMA_NODE_IB_CA;
3388	dev->ib_dev.local_dma_lkey	= 0 /* not supported for now */;
3389	dev->num_ports		= MLX5_CAP_GEN(mdev, num_ports);
3390	dev->ib_dev.phys_port_cnt     = dev->num_ports;
3391	dev->ib_dev.num_comp_vectors    =
3392		dev->mdev->priv.eq_table.num_comp_vectors;
3393	dev->ib_dev.dma_device	= &mdev->pdev->dev;
3394
3395	dev->ib_dev.uverbs_abi_ver	= MLX5_IB_UVERBS_ABI_VERSION;
3396	dev->ib_dev.uverbs_cmd_mask	=
3397		(1ull << IB_USER_VERBS_CMD_GET_CONTEXT)		|
3398		(1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)	|
3399		(1ull << IB_USER_VERBS_CMD_QUERY_PORT)		|
3400		(1ull << IB_USER_VERBS_CMD_ALLOC_PD)		|
3401		(1ull << IB_USER_VERBS_CMD_DEALLOC_PD)		|
3402		(1ull << IB_USER_VERBS_CMD_CREATE_AH)		|
3403		(1ull << IB_USER_VERBS_CMD_DESTROY_AH)		|
3404		(1ull << IB_USER_VERBS_CMD_REG_MR)		|
3405		(1ull << IB_USER_VERBS_CMD_REREG_MR)		|
3406		(1ull << IB_USER_VERBS_CMD_DEREG_MR)		|
3407		(1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL)	|
3408		(1ull << IB_USER_VERBS_CMD_CREATE_CQ)		|
3409		(1ull << IB_USER_VERBS_CMD_RESIZE_CQ)		|
3410		(1ull << IB_USER_VERBS_CMD_DESTROY_CQ)		|
3411		(1ull << IB_USER_VERBS_CMD_CREATE_QP)		|
3412		(1ull << IB_USER_VERBS_CMD_MODIFY_QP)		|
3413		(1ull << IB_USER_VERBS_CMD_QUERY_QP)		|
3414		(1ull << IB_USER_VERBS_CMD_DESTROY_QP)		|
3415		(1ull << IB_USER_VERBS_CMD_ATTACH_MCAST)	|
3416		(1ull << IB_USER_VERBS_CMD_DETACH_MCAST)	|
3417		(1ull << IB_USER_VERBS_CMD_CREATE_SRQ)		|
3418		(1ull << IB_USER_VERBS_CMD_MODIFY_SRQ)		|
3419		(1ull << IB_USER_VERBS_CMD_QUERY_SRQ)		|
3420		(1ull << IB_USER_VERBS_CMD_DESTROY_SRQ)		|
3421		(1ull << IB_USER_VERBS_CMD_CREATE_XSRQ)		|
3422		(1ull << IB_USER_VERBS_CMD_OPEN_QP);
3423	dev->ib_dev.uverbs_ex_cmd_mask =
3424		(1ull << IB_USER_VERBS_EX_CMD_QUERY_DEVICE)	|
3425		(1ull << IB_USER_VERBS_EX_CMD_CREATE_CQ)	|
3426		(1ull << IB_USER_VERBS_EX_CMD_CREATE_QP);
3427
3428	dev->ib_dev.query_device	= mlx5_ib_query_device;
3429	dev->ib_dev.query_port		= mlx5_ib_query_port;
3430	dev->ib_dev.get_link_layer	= mlx5_ib_port_link_layer;
3431	if (ll == IB_LINK_LAYER_ETHERNET)
3432		dev->ib_dev.get_netdev	= mlx5_ib_get_netdev;
3433	dev->ib_dev.query_gid		= mlx5_ib_query_gid;
3434	dev->ib_dev.add_gid		= mlx5_ib_add_gid;
3435	dev->ib_dev.del_gid		= mlx5_ib_del_gid;
3436	dev->ib_dev.query_pkey		= mlx5_ib_query_pkey;
3437	dev->ib_dev.modify_device	= mlx5_ib_modify_device;
3438	dev->ib_dev.modify_port		= mlx5_ib_modify_port;
3439	dev->ib_dev.alloc_ucontext	= mlx5_ib_alloc_ucontext;
3440	dev->ib_dev.dealloc_ucontext	= mlx5_ib_dealloc_ucontext;
3441	dev->ib_dev.mmap		= mlx5_ib_mmap;
3442	dev->ib_dev.mmap_free		= mlx5_ib_mmap_free;
3443	dev->ib_dev.alloc_pd		= mlx5_ib_alloc_pd;
3444	dev->ib_dev.dealloc_pd		= mlx5_ib_dealloc_pd;
3445	dev->ib_dev.create_ah		= mlx5_ib_create_ah;
3446	dev->ib_dev.query_ah		= mlx5_ib_query_ah;
3447	dev->ib_dev.destroy_ah		= mlx5_ib_destroy_ah;
3448	dev->ib_dev.create_srq		= mlx5_ib_create_srq;
3449	dev->ib_dev.modify_srq		= mlx5_ib_modify_srq;
3450	dev->ib_dev.query_srq		= mlx5_ib_query_srq;
3451	dev->ib_dev.destroy_srq		= mlx5_ib_destroy_srq;
3452	dev->ib_dev.post_srq_recv	= mlx5_ib_post_srq_recv;
3453	dev->ib_dev.create_qp		= mlx5_ib_create_qp;
3454	dev->ib_dev.modify_qp		= mlx5_ib_modify_qp;
3455	dev->ib_dev.query_qp		= mlx5_ib_query_qp;
3456	dev->ib_dev.destroy_qp		= mlx5_ib_destroy_qp;
3457	dev->ib_dev.post_send		= mlx5_ib_post_send;
3458	dev->ib_dev.post_recv		= mlx5_ib_post_recv;
3459	dev->ib_dev.create_cq		= mlx5_ib_create_cq;
3460	dev->ib_dev.modify_cq		= mlx5_ib_modify_cq;
3461	dev->ib_dev.resize_cq		= mlx5_ib_resize_cq;
3462	dev->ib_dev.destroy_cq		= mlx5_ib_destroy_cq;
3463	dev->ib_dev.poll_cq		= mlx5_ib_poll_cq;
3464	dev->ib_dev.req_notify_cq	= mlx5_ib_arm_cq;
3465	dev->ib_dev.get_dma_mr		= mlx5_ib_get_dma_mr;
3466	dev->ib_dev.reg_user_mr		= mlx5_ib_reg_user_mr;
3467	dev->ib_dev.rereg_user_mr	= mlx5_ib_rereg_user_mr;
3468	dev->ib_dev.dereg_mr		= mlx5_ib_dereg_mr;
3469	dev->ib_dev.attach_mcast	= mlx5_ib_mcg_attach;
3470	dev->ib_dev.detach_mcast	= mlx5_ib_mcg_detach;
3471	dev->ib_dev.process_mad		= mlx5_ib_process_mad;
3472	dev->ib_dev.alloc_mr		= mlx5_ib_alloc_mr;
3473	dev->ib_dev.map_mr_sg		= mlx5_ib_map_mr_sg;
3474	dev->ib_dev.check_mr_status	= mlx5_ib_check_mr_status;
3475	dev->ib_dev.get_port_immutable  = mlx5_port_immutable;
3476	dev->ib_dev.get_dev_fw_str      = get_dev_fw_str;
3477	if (mlx5_core_is_pf(mdev)) {
3478		dev->ib_dev.get_vf_config	= mlx5_ib_get_vf_config;
3479		dev->ib_dev.set_vf_link_state	= mlx5_ib_set_vf_link_state;
3480		dev->ib_dev.get_vf_stats	= mlx5_ib_get_vf_stats;
3481		dev->ib_dev.set_vf_guid		= mlx5_ib_set_vf_guid;
3482	}
3483
3484	dev->ib_dev.disassociate_ucontext = mlx5_ib_disassociate_ucontext;
3485
3486	mlx5_ib_internal_fill_odp_caps(dev);
3487
3488	if (MLX5_CAP_GEN(mdev, imaicl)) {
3489		dev->ib_dev.alloc_mw		= mlx5_ib_alloc_mw;
3490		dev->ib_dev.dealloc_mw		= mlx5_ib_dealloc_mw;
3491		dev->ib_dev.uverbs_cmd_mask |=
3492			(1ull << IB_USER_VERBS_CMD_ALLOC_MW)	|
3493			(1ull << IB_USER_VERBS_CMD_DEALLOC_MW);
3494	}
3495
3496	if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt) &&
3497	    MLX5_CAP_GEN(dev->mdev, retransmission_q_counters)) {
3498		dev->ib_dev.get_hw_stats	= mlx5_ib_get_hw_stats;
3499		dev->ib_dev.alloc_hw_stats	= mlx5_ib_alloc_hw_stats;
3500	}
3501
3502	if (MLX5_CAP_GEN(mdev, xrc)) {
3503		dev->ib_dev.alloc_xrcd = mlx5_ib_alloc_xrcd;
3504		dev->ib_dev.dealloc_xrcd = mlx5_ib_dealloc_xrcd;
3505		dev->ib_dev.uverbs_cmd_mask |=
3506			(1ull << IB_USER_VERBS_CMD_OPEN_XRCD) |
3507			(1ull << IB_USER_VERBS_CMD_CLOSE_XRCD);
3508	}
3509
3510	if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) ==
3511	    IB_LINK_LAYER_ETHERNET) {
3512		dev->ib_dev.create_flow	= mlx5_ib_create_flow;
3513		dev->ib_dev.destroy_flow = mlx5_ib_destroy_flow;
3514		dev->ib_dev.create_wq	 = mlx5_ib_create_wq;
3515		dev->ib_dev.modify_wq	 = mlx5_ib_modify_wq;
3516		dev->ib_dev.destroy_wq	 = mlx5_ib_destroy_wq;
3517		dev->ib_dev.create_rwq_ind_table = mlx5_ib_create_rwq_ind_table;
3518		dev->ib_dev.destroy_rwq_ind_table = mlx5_ib_destroy_rwq_ind_table;
3519		dev->ib_dev.uverbs_ex_cmd_mask |=
3520			(1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW) |
3521			(1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW) |
3522			(1ull << IB_USER_VERBS_EX_CMD_CREATE_WQ) |
3523			(1ull << IB_USER_VERBS_EX_CMD_MODIFY_WQ) |
3524			(1ull << IB_USER_VERBS_EX_CMD_DESTROY_WQ) |
3525			(1ull << IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL) |
3526			(1ull << IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL);
3527	}
3528	err = init_node_data(dev);
3529	if (err)
3530		goto err_free_port;
3531
3532	mutex_init(&dev->flow_db.lock);
3533	mutex_init(&dev->cap_mask_mutex);
3534	INIT_LIST_HEAD(&dev->qp_list);
3535	spin_lock_init(&dev->reset_flow_resource_lock);
3536
3537	if (ll == IB_LINK_LAYER_ETHERNET) {
3538		err = mlx5_enable_roce(dev);
3539		if (err)
3540			goto err_free_port;
3541	}
3542
3543	err = create_dev_resources(&dev->devr);
3544	if (err)
3545		goto err_disable_roce;
3546
3547	err = mlx5_ib_odp_init_one(dev);
3548	if (err)
3549		goto err_rsrc;
3550
3551	err = mlx5_ib_alloc_q_counters(dev);
3552	if (err)
3553		goto err_odp;
3554
3555	err = mlx5_ib_stage_bfreg_init(dev);
3556	if (err)
3557		goto err_q_cnt;
3558
3559	err = ib_register_device(&dev->ib_dev, NULL);
3560	if (err)
3561		goto err_bfreg;
3562
3563	err = create_umr_res(dev);
3564	if (err)
3565		goto err_dev;
3566
3567	for (i = 0; i < ARRAY_SIZE(mlx5_class_attributes); i++) {
3568		err = device_create_file(&dev->ib_dev.dev,
3569					 mlx5_class_attributes[i]);
3570		if (err)
3571			goto err_umrc;
3572	}
3573
3574	err = mlx5_ib_init_congestion(dev);
3575	if (err)
3576		goto err_umrc;
3577
3578	dev->ib_active = true;
3579
3580	return dev;
3581
3582err_umrc:
3583	destroy_umrc_res(dev);
3584
3585err_dev:
3586	ib_unregister_device(&dev->ib_dev);
3587
3588err_bfreg:
3589	mlx5_ib_stage_bfreg_cleanup(dev);
3590
3591err_q_cnt:
3592	mlx5_ib_dealloc_q_counters(dev);
3593
3594err_odp:
3595	mlx5_ib_odp_remove_one(dev);
3596
3597err_rsrc:
3598	destroy_dev_resources(&dev->devr);
3599
3600err_disable_roce:
3601	if (ll == IB_LINK_LAYER_ETHERNET) {
3602		mlx5_disable_roce(dev);
3603		mlx5_remove_roce_notifier(dev);
3604	}
3605
3606err_free_port:
3607	kfree(dev->port);
3608
3609err_dealloc:
3610	ib_dealloc_device((struct ib_device *)dev);
3611
3612	return NULL;
3613}
3614
3615static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context)
3616{
3617	struct mlx5_ib_dev *dev = context;
3618	enum rdma_link_layer ll = mlx5_ib_port_link_layer(&dev->ib_dev, 1);
3619
3620	mlx5_ib_cleanup_congestion(dev);
3621	mlx5_remove_roce_notifier(dev);
3622	ib_unregister_device(&dev->ib_dev);
3623	mlx5_ib_stage_bfreg_cleanup(dev);
3624	mlx5_ib_dealloc_q_counters(dev);
3625	destroy_umrc_res(dev);
3626	mlx5_ib_odp_remove_one(dev);
3627	destroy_dev_resources(&dev->devr);
3628	if (ll == IB_LINK_LAYER_ETHERNET)
3629		mlx5_disable_roce(dev);
3630	kfree(dev->port);
3631	ib_dealloc_device(&dev->ib_dev);
3632}
3633
3634static struct mlx5_interface mlx5_ib_interface = {
3635	.add            = mlx5_ib_add,
3636	.remove         = mlx5_ib_remove,
3637	.event          = mlx5_ib_event,
3638	.protocol	= MLX5_INTERFACE_PROTOCOL_IB,
3639};
3640
3641static int __init mlx5_ib_init(void)
3642{
3643	int err;
3644
3645	err = mlx5_ib_odp_init();
3646	if (err)
3647		return err;
3648
3649	err = mlx5_register_interface(&mlx5_ib_interface);
3650	if (err)
3651		goto clean_odp;
3652
3653	return err;
3654
3655clean_odp:
3656	mlx5_ib_odp_cleanup();
3657	return err;
3658}
3659
3660static void __exit mlx5_ib_cleanup(void)
3661{
3662	mlx5_unregister_interface(&mlx5_ib_interface);
3663	mlx5_ib_odp_cleanup();
3664}
3665
3666module_init_order(mlx5_ib_init, SI_ORDER_SEVENTH);
3667module_exit_order(mlx5_ib_cleanup, SI_ORDER_SEVENTH);
3668